├── test ├── __init__.py ├── test_procs │ ├── __init__.py │ ├── data │ │ ├── csv_no_header.csv │ │ ├── csv_with_header.csv │ │ ├── tweet_entity_removal.json │ │ └── twitter_raw_mock.json │ ├── wrapped_process_cmd.py │ ├── wrapped_process_json.py │ ├── test_html_remover.py │ ├── wrapped_process.py │ ├── test_twitter_monitor.py │ ├── test_pattern_remover.py │ ├── test_transformer.py │ ├── test_http_monitor.py │ ├── test_dateexpander.py │ ├── test_tweet_entity_removal.py │ ├── test_tweet_extractor.py │ ├── test_csv_converter.py │ ├── test_blacklist_filter.py │ └── test_entity_extractor.py ├── test_config.py ├── test_protocol_compliance.py ├── test_connections.py └── test_service │ └── test_http_service.py ├── MANIFEST.in ├── bin ├── es-managerd.sh ├── es-write ├── es-cleantweet └── es-read ├── eslib ├── Monitor.py ├── service │ ├── __init__.py │ ├── DummyService.py │ ├── PipelineService.py │ └── RemotingService.py ├── Generator.py ├── debug.py ├── procs │ ├── Transformer.py │ ├── Throttle.py │ ├── CLIReader.py │ ├── FileWriter.py │ ├── TwitterFollowerGetter.py │ ├── TweetEntityRemover.py │ ├── __init__.py │ ├── DateExpander.py │ ├── Timer.py │ ├── TwitterUserGetter.py │ ├── KafkaWriter.py │ ├── SmtpMailer.py │ ├── HtmlRemover.py │ ├── TweetExtractor.py │ ├── Neo4jReader.py │ ├── CsvConverter.py │ ├── RabbitmqWriter.py │ ├── PatternRemover.py │ ├── Neo4jWriter.py │ ├── KafkaMonitor.py │ ├── TcpWriter.py │ ├── RabbitmqMonitor.py │ └── FileReader.py ├── Socket.py ├── Configurable.py ├── text.py ├── TerminalInfo.py ├── prog.py ├── Terminal.py ├── esdoc.py ├── time.py ├── __init__.py ├── Connector.py └── web.py ├── examples ├── service_run_dir │ └── config │ │ ├── credentials.yaml │ │ ├── services.yaml │ │ ├── logging-console.yaml │ │ └── logging.yaml ├── resources │ └── tweet.json ├── entity_extractor.py └── remoting │ ├── RemotingClient.py │ └── DummyRemotingService.py ├── DEVHELP.txt ├── .gitignore ├── setup.py └── PROTOCOLS.md /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/test_procs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | #include LICENSE 2 | include README.md 3 | include PROTOCOLS.md 4 | include examples/* 5 | recursive-exclude test * 6 | -------------------------------------------------------------------------------- /test/test_procs/data/csv_no_header.csv: -------------------------------------------------------------------------------- 1 | 1,"hans terje","bakke","htb" 2 | 2,"eivind","elseth","eee" 3 | 3,"ole-kristian","villabø","okv" 4 | -------------------------------------------------------------------------------- /test/test_procs/data/csv_with_header.csv: -------------------------------------------------------------------------------- 1 | "id","name","last name","initials" 2 | 1,"hans terje","bakke","htb" 3 | 2,"eivind","elseth","eee" 4 | 3,"ole-kristian","villabø","okv" 5 | -------------------------------------------------------------------------------- /bin/es-managerd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Note: es-run-service must be in path 4 | # Note: ESLIB_SERVICE_DIR must be set, or -d option must be used 5 | exec ./es-service $@ -c 
manager managerd -e localhost:5000 --start 6 | -------------------------------------------------------------------------------- /eslib/Monitor.py: -------------------------------------------------------------------------------- 1 | from .Generator import Generator 2 | 3 | class Monitor(Generator): 4 | def __init__(self, **kwargs): 5 | super(Monitor, self).__init__(**kwargs) 6 | 7 | self.keepalive = True # A monitor never stops, unless told to 8 | -------------------------------------------------------------------------------- /test/test_procs/wrapped_process_cmd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys, codecs 5 | 6 | 7 | print "INNER/STARTING" 8 | 9 | print "INNER/" + u" ".join([codecs.decode(x, "UTF-8") for x in sys.argv[1:]]) 10 | 11 | print "INNER/EXITING" 12 | -------------------------------------------------------------------------------- /examples/service_run_dir/config/credentials.yaml: -------------------------------------------------------------------------------- 1 | rabbitmq: 2 | username : xxxx 3 | password : xxxx 4 | 5 | twitter: 6 | consumer_key : xxxxxxxxxxxxxxxxxxxxx 7 | consumer_secret : xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 8 | access_token : xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 9 | access_token_secret : xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 10 | -------------------------------------------------------------------------------- /DEVHELP.txt: -------------------------------------------------------------------------------- 1 | See this guide for preparations, setting up accounts and a ~/.pypirc file: 2 | 3 | http://peterdowns.com/posts/first-time-with-pypi.html 4 | 5 | To install the package from the source tree: 6 | 7 | pip install -e . 
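(The -e flag installs the package in "editable"/development mode, so changes in the source tree take effect without reinstalling.)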
8 | 9 | To install from PyPI: 10 | 11 | pip install elasticsearch-eslib 12 | 13 | To upload package to PyPI test (pypitest) or live (pypi): 14 | 15 | python setup.py register -r pypi 16 | python setup.py sdist upload -r pypi 17 | 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.swp 3 | 4 | 5 | # Folders 6 | tmp/ 7 | HTBTEST/ 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Packages 13 | *.egg 14 | *.egg-info 15 | dist 16 | build 17 | eggs 18 | parts 19 | #bin 20 | var 21 | sdist 22 | develop-eggs 23 | .installed.cfg 24 | lib 25 | lib64 26 | __pycache__ 27 | 28 | # Installer logs 29 | pip-log.txt 30 | 31 | # Unit test / coverage reports 32 | .coverage 33 | .tox 34 | nosetests.xml 35 | 36 | # Translations 37 | *.mo 38 | 39 | # Mr Developer 40 | .mr.developer.cfg 41 | .project 42 | .pydevproject 43 | .idea 44 | -------------------------------------------------------------------------------- /examples/service_run_dir/config/services.yaml: -------------------------------------------------------------------------------- 1 | # Manager 2 | 3 | manager: 4 | name : "manager" 5 | management_endpoint : "localhost:5000" 6 | 7 | elasticsearch_hosts : ["localhost:9200"] 8 | elasticsearch_index : "management" 9 | dynamic_port_ranges : [["localhost", 5010, 5019]] 10 | 11 | # Dummy 12 | 13 | dummy: 14 | manager_endpoint : "localhost:5000" 15 | #management_endpoint : "localhost:5008" 16 | management_endpoint : "localhost" 17 | 18 | name : "dummy" 19 | frequency : 3 20 | lifespan : 120 21 | -------------------------------------------------------------------------------- /test/test_procs/wrapped_process_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys, select, json 5 | 6 | def send(s): 7 | print json.dumps({"inner": s}) 8 | 9 | 10 | try: 11 | while True: 12 | r,w,e = select.select([sys.stdin],[],[],0) 13 | if r: 14 | line = sys.stdin.readline() 15 | if line: 16 | dd = json.loads(line) 17 | s = dd.get("outer") 18 | if s: 19 | send("echo: %s" % s) 20 | else: 21 | send("stdin was hung up") 22 | break 23 | except KeyboardInterrupt: 24 | send("interrupted") 25 | send("finished") 26 | -------------------------------------------------------------------------------- /examples/service_run_dir/config/logging-console.yaml: -------------------------------------------------------------------------------- 1 | version : 1 2 | disable_existing_loggers: False 3 | formatters: 4 | categories: 5 | format: "%(firstName) -15s %(serviceName) -15s %(className) -20s %(instanceName) -20s %(levelname) -10s %(message)s" 6 | rich: 7 | format: "%(asctime)s %(name) -30s %(className) -20s %(lineno) 5d %(funcName) -20s %(levelname) -10s %(message)s" 8 | compact: 9 | format: "%(name) -30s %(levelname) -10s %(message)s" 10 | 11 | handlers: 12 | console: 13 | class : logging.StreamHandler 14 | formatter : rich 15 | level : TRACE 16 | stream : ext://sys.stdout 17 | loggers: 18 | "": 19 | handlers : [console] 20 | level : WARNING 21 | servicelog: 22 | level : DEBUG 23 | proclog: 24 | level : DEBUG 25 | doclog: 26 | level : WARNING 27 | -------------------------------------------------------------------------------- /eslib/service/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.service 5 | ~~~~~ 6 
| 7 | Base classes for wrapping document processing processors into processing graphs/pipelines and control them. 8 | """ 9 | 10 | from .. import esdoc 11 | 12 | 13 | from .Service import Service, status 14 | from .HttpService import HttpService 15 | from .PipelineService import PipelineService 16 | from .ServiceManager import ServiceManager 17 | from .ServiceLauncher import ServiceLauncher 18 | from .DummyService import DummyService 19 | from .Client import Client 20 | from .RemotingService import RemotingService 21 | 22 | 23 | __all__ = ( 24 | "Service", 25 | "HttpService", 26 | "PipelineService", 27 | "ServiceManager", 28 | "ServiceLauncher", 29 | "DummyService", 30 | "Client", 31 | "RemotingService" 32 | ) 33 | -------------------------------------------------------------------------------- /test/test_procs/test_html_remover.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from eslib.procs import HtmlRemover 5 | 6 | class TestHtmlRemover(unittest.TestCase): 7 | 8 | def test_str(self): 9 | dirty = 'Lady & Landstrykeren' 10 | 11 | p = HtmlRemover() 12 | cleaned = p._clean(dirty) 13 | print "D=", dirty 14 | print "C=", cleaned 15 | 16 | self.assertTrue(cleaned == "Lady & Landstrykeren") 17 | 18 | def test_unicode(self): 19 | dirty = u'Lady & Landstrykeren' 20 | 21 | p = HtmlRemover() 22 | cleaned = p._clean(dirty) 23 | print "D=", dirty 24 | print "C=", cleaned 25 | 26 | self.assertTrue(cleaned == u"Lady & Landstrykeren") 27 | 28 | def main(): 29 | unittest.main() 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /eslib/Generator.py: -------------------------------------------------------------------------------- 1 | from .Processor import Processor 2 | 3 | class Generator(Processor): 4 | def __init__(self, **kwargs): 5 | super(Generator, self).__init__(**kwargs) 6 | self.is_generator = True 7 | 8 | # These methods could/should be implemented by inheriting classes: 9 | 10 | # on_open(self) # from Processor 11 | # on_close(self) # from Processor 12 | 13 | # on_startup(self) 14 | # on_shutdown(self) 15 | # on_abort(self) # from Processor 16 | # on_tick(self) 17 | # on_suspend(self) 18 | # on_resume(self) 19 | 20 | # If on_tick finishes on its own without external stop call, call self.stop() from there when done. 21 | 22 | @property 23 | def end_tick_reason(self): 24 | "If 'aborted', 'stopping' or not 'running'. 'suspended' is not a reason to leave the tick; handle this yourself." 25 | return self.aborted or self.stopping or self.restarting or not self.running 26 | -------------------------------------------------------------------------------- /eslib/debug.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.debug 5 | ~~~~~~~~~~~ 6 | 7 | Module containing functions useful for debugging. 
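Example: byte_size_string(123456789) returns "117.7 MB"; get_memory_used() reports this process's peak resident memory (ru_maxrss), on POSIX systems only.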
8 | """ 9 | import os 10 | 11 | 12 | __all__ = ("byte_size_string", "get_memory_used") 13 | 14 | 15 | if os.name == 'posix': 16 | import resource 17 | 18 | 19 | def byte_size_string(bytes, decimals=1): 20 | kB = bytes / 1024.0 21 | MB = kB / 1024.0 22 | GB = MB / 1024.0 23 | s = None 24 | if GB > 1.0: s = "%.*f GB" % (decimals, GB) 25 | elif MB > 1.0: s = "%.*f MB" % (decimals, MB) 26 | elif kB > 1.0: s = "%.*f kB" % (decimals, kB) 27 | else: s = "%s B" % bytes 28 | return s 29 | 30 | 31 | def get_memory_used(): 32 | """Get current memory usage by this process. Supposedly in KB.""" 33 | if os.name == 'posix': 34 | return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 35 | else: 36 | 0 # Don't want to risk an exception here.. 37 | #raise NotImplementedError 38 | -------------------------------------------------------------------------------- /examples/resources/tweet.json: -------------------------------------------------------------------------------- 1 | { "_timestamp": "2014-10-14T14:26:29Z", "_source": { "id": "522030691567931393", "geo": { "coordinates": [ 40.757023, -74.001698 ], "type": "Point" }, "lang": "en", "entities": { "urls": [ { "indices": [ 70, 92 ], "url": "http://instagram.com/p/uIt8BfP5Qp/" } ], "hashtags": [], "user_mentions": [ { "name": "Stella Chuu", "screen_name": "StellaChuuuuu", "indices": [ 20, 34 ], "id": "285369968" } ] }, "truncated": false, "text": "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv", "created_at": "2014-10-14T14:26:30Z", "source": "Instagram", "place": { "country_code": "US", "country": "United States" }, "user": { "name": "JJ Dillon", "id": "35273719", "lang": "en", "description": "i love beautiful women. like to party & have fun. very cool, calm, laid back person. i love video games, anime, movies, xbox 360, comic books, pop culture", "created_at": "2009-04-25T18:20:07Z", "profile_image_url": "http://pbs.twimg.com/profile_images/506599782908178432/c6pyAlfv_normal.jpeg", "screen_name": "JJDillon430", "location": "New York", "geo_enabled": true, "protected": false } }, "_id": "522030691567931393" } 2 | -------------------------------------------------------------------------------- /examples/entity_extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from eslib.procs import FileReader, FileWriter, EntityExtractor 6 | 7 | def listener(document): 8 | print document["_source"]["extracted"] 9 | 10 | entities = [ 11 | { 12 | "category": "location", 13 | "name": "place", 14 | "match": [ 15 | { "type": "exact", "pattern": "Convention" } 16 | #{ "type": "iprange", "value": "81.27.32.186/16" } 17 | ] 18 | }, 19 | { 20 | "category": "agent", 21 | "name": "user", 22 | "match": [ 23 | { "type": "exact", "pattern": "Jacob" } 24 | #{ "type": "iprange", "value": "81.27.32.186/16" } 25 | ] 26 | }, 27 | { 28 | "category": "agent", 29 | "name": "user", 30 | "match": [ 31 | { "type": "exact", "pattern": "stellachuuuuu" } 32 | #{ "type": "iprange", "value": "81.27.32.186/16" } 33 | ] 34 | } 35 | ] 36 | 37 | 38 | r = FileReader(filename = "resources/tweet.json") 39 | p = EntityExtractor(fields=["text"], target="extracted", entities=entities) 40 | w = FileWriter() 41 | 42 | p.subscribe(r) 43 | w.subscribe(p, "entities") 44 | 45 | r.start() 46 | w.wait() # Will finish once the reader is finished. 
47 | -------------------------------------------------------------------------------- /test/test_procs/wrapped_process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import time, sys, signal 5 | from select import select 6 | 7 | #region Signal handling 8 | 9 | def _handler_SIGINT(signal, frame): 10 | print "INNER/RECEIVED SIGINT -- ignoring" 11 | 12 | def _handler_SIGTERM(signal, frame): 13 | global running 14 | print "INNER/RECEIVED SIGTERM -- terminating" 15 | running = False 16 | 17 | def _handler_SIGHUP(signal, frame): 18 | print "INNER/RECEIVED SIGHUP -- ignoring" 19 | 20 | signal.signal(signal.SIGINT , _handler_SIGINT ) 21 | signal.signal(signal.SIGTERM, _handler_SIGTERM) 22 | signal.signal(signal.SIGHUP , _handler_SIGHUP ) 23 | 24 | #endregion Signal handling 25 | 26 | running = True 27 | 28 | print "INNER/STARTING" 29 | 30 | while running: 31 | r,w,e = select([sys.stdin],[],[],0) 32 | if r: 33 | line = sys.stdin.readline() 34 | line = line.strip() 35 | if line: 36 | print "INNER/ECHO:", line 37 | if line == "*HANGUP*": 38 | print "INNER/HANGING UP ON *HANGUP* REQUEST" 39 | running = False 40 | elif line == "*RAISE*": 41 | raise Exception("INNER/RAISED EXCEPTION UPON *RAISE* REQUEST") 42 | else: 43 | print "INNER/STDIN WAS HUNG UP -- GOOD BYE" 44 | running = False 45 | 46 | print "INNER/EXITING" 47 | -------------------------------------------------------------------------------- /test/test_config.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from eslib import Config 3 | 4 | class TestConfig(unittest.TestCase): 5 | 6 | def test_access(self): 7 | config = Config() 8 | config.set_default(a="A", b="B") 9 | 10 | print config["a"] 11 | print config.a 12 | 13 | self.assertEqual("A", config["a"]) 14 | self.assertEqual("A", config.a) 15 | 16 | def test_assignment(self): 17 | config = Config() 18 | #config.set_default(a="A", b="B") 19 | 20 | config["a"] = "A" 21 | config.a = "B" 22 | 23 | print config["a"] 24 | print config.a 25 | 26 | self.assertEqual("B", config["a"]) 27 | self.assertEqual("B", config.a) 28 | 29 | 30 | def test_defaults_and_overrides(self): 31 | config = Config() 32 | config.set_default(a="A", b="B", x="X") 33 | 34 | config.set(a="D", b=None) 35 | 36 | print config["a"] 37 | print config.a 38 | self.assertEqual("D", config.a) 39 | 40 | config.a = "C" 41 | print config.a 42 | self.assertEqual("C", config.a) 43 | 44 | print config["b"] 45 | print config.b 46 | self.assertEqual(None, config.b) 47 | 48 | print config.x 49 | self.assertEqual("X", config.x) 50 | 51 | def main(): 52 | unittest.main() 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /bin/es-write: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from eslib.procs import ElasticsearchWriter, FileReader 6 | import eslib.prog 7 | import eslib.time 8 | import argparse, sys 9 | 10 | 11 | def main(): 12 | help_i = "Which index to write documents to." 13 | help_t = "Which type to set on document (overrides incoming type)." 
14 | 15 | parser = argparse.ArgumentParser(usage="\n %(prog)s -i index [-t type] [-f field] [-l limit] [more options]") 16 | parser._actions[0].help = argparse.SUPPRESS 17 | parser.add_argument("-i", "--index" , help=help_i, required=True) 18 | parser.add_argument("-t", "--type" , help=help_t) 19 | parser.add_argument( "--host" , help="Elasticsearch host, format 'host:port' or just 'host'.", default=None) 20 | #parser.add_argument( "--debug" , action="store_true") 21 | parser.add_argument( "--name" , help="Process name.", default=None) 22 | 23 | if len(sys.argv) == 1: 24 | parser.print_usage() 25 | sys.exit(0) 26 | 27 | args = parser.parse_args() 28 | 29 | # Set up and run this processor 30 | w = ElasticsearchWriter( 31 | name = args.name or eslib.prog.progname(), 32 | hosts = [args.host] if args.host else [], 33 | index = args.index, 34 | doctype = args.type, 35 | batchsize = 1000, 36 | batchtime = 60.0 37 | ) 38 | 39 | # if args.debug: w.debuglevel = 0 40 | 41 | r = FileReader() 42 | w.subscribe(r) 43 | r.start() 44 | w.wait() 45 | 46 | 47 | if __name__ == "__main__": main() 48 | -------------------------------------------------------------------------------- /eslib/procs/Transformer.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | 5 | class Transformer(Processor): 6 | """ 7 | Convert input to output protocol. 8 | Returns a LIST of zero or more documents converted to the output protocol. 9 | 10 | The following parameters are not part of the processors 'config' object, and can and must be set only upon 11 | instantiation: 12 | 13 | input_protocol = None 14 | output_protocol = None 15 | func = None # Mandatory! Must be a function returning a list (or generator) of zero or more 16 | documents complying with the output protocol. Function signature must be 17 | func(proc, doc), where proc is this transformer processor, so you can address it 18 | in your function. 19 | """ 20 | def __init__(self, func=None, input_protocol=None, output_protocol=None, **kwargs): 21 | super(Transformer, self).__init__(**kwargs) 22 | self.create_connector(self._incoming, "input", input_protocol) 23 | self._output = self.create_socket("output", output_protocol) 24 | 25 | self._func = func 26 | 27 | def _incoming(self, incoming): 28 | try: 29 | ll = self._func(self, incoming) 30 | if ll: 31 | for outgoing in ll: 32 | if outgoing: 33 | self._output.send(outgoing) 34 | except Exception as e: 35 | self.doclog.exception("Error in protocol converter function call.") 36 | -------------------------------------------------------------------------------- /eslib/Socket.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .Terminal import Terminal 4 | 5 | 6 | class Socket(Terminal): 7 | "Output terminal in a Processor. Writes data to one or more subscribing connectors of matching protocol." 8 | 9 | def __init__(self, name, protocol=None, mimic=None): 10 | super(Socket, self).__init__(name, protocol) 11 | self.type = Socket 12 | self.callbacks = [] # List of methods for external callbacks 13 | self.mimic = mimic 14 | 15 | def send(self, document): 16 | "Send data to all subscribing connectors and callbacks." 
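# Note: the connection list is snapshotted before iteration, and each registered callback is invoked as callback(owner, document) after the connectors have received the document.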
17 | 18 | # Send data to all accepting connectors 19 | subscribers = self.connections[:] 20 | for subscriber in subscribers: 21 | if subscriber.accepting: 22 | subscriber.receive(document) 23 | # Finally, notify all subscribing callbacks 24 | for callback in self.callbacks: 25 | callback(self.owner, document) 26 | 27 | @property 28 | def has_output(self): 29 | if self.connections or self.callbacks: 30 | return True 31 | return False 32 | 33 | def _find_mimic_proto(self, visited=None): 34 | if not visited: 35 | visited = [] 36 | if self.mimic and self.mimic.connections and not self in visited: 37 | visited.append(self) 38 | connected_socket = self.mimic.connections[0] 39 | return connected_socket._find_mimic_proto(visited) 40 | return self.protocol 41 | 42 | @property 43 | def mimiced_protocol(self): 44 | return self._find_mimic_proto() 45 | -------------------------------------------------------------------------------- /test/test_procs/test_twitter_monitor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import unittest, json 5 | from eslib.procs import TwitterMonitor 6 | 7 | class TestTwitterMonitor(unittest.TestCase): 8 | 9 | def test_simple(self): 10 | 11 | # Load test data 12 | self_dir, _ = os.path.split(__file__) 13 | f = open(os.path.join(self_dir, "data/twitter_raw_mock.json")) 14 | j = json.load(f) 15 | f.close() 16 | 17 | m = TwitterMonitor() 18 | raw, tweet = m._decode(j) 19 | 20 | # Test tweet 21 | self.assertTrue(tweet["_id"] == "520149420122578944") 22 | self.assertTrue(tweet["_source"]["source"] == u"Twitter for BlackBerry®") 23 | self.assertTrue(tweet["_source"]["text"] == u'These clowns must hope that we never cum under attack from any force-r we capable of protecting ourselves?') 24 | self.assertTrue(str(tweet["_source"]["created_at"]) == "2014-10-09 09:51:00.328000") 25 | self.assertTrue("geo" in tweet["_source"]) 26 | self.assertTrue(tweet["_source"]["lang"] == "en") 27 | self.assertTrue(tweet["_source"]["place"]["country"] == "South Africa") 28 | self.assertFalse("in_reply_to" in tweet["_source"]) 29 | # User 30 | self.assertTrue(tweet["_source"]["user"]["id"] == "2196916282") 31 | self.assertTrue(tweet["_source"]["user"]["lang"] == "en") 32 | self.assertTrue(tweet["_source"]["user"]["name"] == "mark fester") 33 | self.assertFalse("description" in tweet["_source"]["user"]) 34 | self.assertTrue(str(tweet["_source"]["user"]["created_at"]) == "2013-11-26 14:21:35") 35 | 36 | # Entities 37 | # // TODO 38 | 39 | def main(): 40 | unittest.main() 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /test/test_procs/data/tweet_entity_removal.json: -------------------------------------------------------------------------------- 1 | { 2 | "_timestamp": "2014-10-14T14:26:29Z", 3 | "_source": { 4 | "id": "522030691567931393", 5 | "geo": { 6 | "coordinates": [ 7 | 40.757023, 8 | -74.001698 9 | ], 10 | "type": "Point" 11 | }, 12 | "lang": "en", 13 | "entities": { 14 | "urls": [ 15 | { 16 | "indices": [ 17 | 70, 18 | 92 19 | ], 20 | "url": "http://instagram.com/p/uIt8BfP5Qp/" 21 | } 22 | ], 23 | "hashtags": [], 24 | "user_mentions": [ 25 | { 26 | "name": "Stella Chuu", 27 | "screen_name": "StellaChuuuuu", 28 | "indices": [ 29 | 20, 30 | 34 31 | ], 32 | "id": "285369968" 33 | } 34 | ] 35 | }, 36 | "truncated": false, 37 | "text": "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center 
http://t.co/x6BUjNY0jv", 38 | "created_at": "2014-10-14T14:26:30Z", 39 | "source": "Instagram", 40 | "place": { 41 | "country_code": "US", 42 | "country": "United States" 43 | }, 44 | "user": { 45 | "name": "JJ Dillon", 46 | "id": "35273719", 47 | "lang": "en", 48 | "description": "i love beautiful women. like to party & have fun. very cool, calm, laid back person. i love video games, anime, movies, xbox 360, comic books, pop culture", 49 | "created_at": "2009-04-25T18:20:07Z", 50 | "profile_image_url": "http://pbs.twimg.com/profile_images/506599782908178432/c6pyAlfv_normal.jpeg", 51 | "screen_name": "JJDillon430", 52 | "location": "New York", 53 | "geo_enabled": true, 54 | "protected": false 55 | } 56 | }, 57 | "_id": "522030691567931393" 58 | } 59 | -------------------------------------------------------------------------------- /eslib/Configurable.py: -------------------------------------------------------------------------------- 1 | class Config(object): 2 | def __init__(self, **config): 3 | super(Config, self).__init__() 4 | if config is not None: 5 | self.__dict__ = config 6 | self.defaults = {} 7 | 8 | def set_default(self, **kwargs): 9 | for key,val in kwargs.iteritems(): 10 | self.defaults[key] = val 11 | # if not key in self.__dict__: 12 | # self.__dict__[key] = val 13 | 14 | def __getattr__(self, key): 15 | if key in self.__dict__: 16 | return self.__dict__.__getattr__(key) 17 | elif key in self.defaults: 18 | return self.defaults[key] 19 | else: 20 | raise AttributeError("'%s' has no attribute '%s'" % (self.__class__.__name__, key)) 21 | 22 | def __getitem__(self, key): 23 | if key in self.__dict__: 24 | return self.__dict__[key] 25 | elif key in self.defaults: 26 | return self.defaults[key] 27 | else: 28 | raise AttributeError("'%s' has no attribute '%s'" % (self.__class__.__name__, key)) 29 | 30 | def __setitem__(self, key, value): 31 | self.__dict__[key] = value 32 | 33 | def set(self, ignore_none=False, **kwargs): 34 | "ignore_none means that fields with value None are not set." 35 | for key,val in kwargs.iteritems(): 36 | if ignore_none and val is None: 37 | continue 38 | self.__dict__[key] = val 39 | 40 | def get_default_attributes(self): 41 | return self.defaults 42 | 43 | def get_user_attributes(self): 44 | return {key: val for key, val in self.__dict__.iteritems() if key not in self.defaults} 45 | 46 | class Configurable(object): 47 | def __init__(self, **kwargs): 48 | super(Configurable, self).__init__() 49 | self.config = Config(**kwargs) 50 | -------------------------------------------------------------------------------- /eslib/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.text 5 | ~~~~~~~~~~ 6 | 7 | Module containing operations on text strings. 8 | """ 9 | 10 | 11 | __all__ = ("remove_parts", "remove_html") 12 | 13 | 14 | import re 15 | from HTMLParser import HTMLParser 16 | 17 | import sys 18 | 19 | def remove_parts(text, sections): 20 | """ 21 | Remove sections from text. Sections is a list of tuples with (start,end) 22 | coordinates to clip from the text string. 
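Example: remove_parts("0123456789", [(2, 4), (6, 8)]) returns "014589".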
23 | """ 24 | 25 | if not sections: return text 26 | 27 | c = sorted(sections) 28 | s = [] 29 | s.append(text[:c[0][0]]) 30 | for i in range(1, len(c)): 31 | s.append(text[c[i-1][1]:c[i][0]]) 32 | s.append(text[c[-1][1]:]) 33 | cleaned = "".join(s) 34 | return cleaned 35 | 36 | #region remove_html 37 | 38 | class _MLStripper(HTMLParser): 39 | def __init__(self): 40 | self.reset() 41 | self.fed = [] 42 | self.strict = False 43 | def handle_data(self, d): 44 | self.fed.append(d) 45 | def get_data(self): 46 | return ''.join(self.fed) 47 | 48 | 49 | _regex_whitespace = re.compile(r'\s+', re.UNICODE) 50 | _regex_scripts = re.compile(r""".*?""", re.MULTILINE|re.DOTALL|re.UNICODE) 51 | _regex_style = re.compile(r"""(.*?)""", re.MULTILINE|re.DOTALL|re.UNICODE) 52 | 53 | def remove_html(text): 54 | if not text or not type(text) in [str, unicode]: 55 | return text 56 | 57 | text = re.sub(_regex_scripts, " ", text) 58 | text = re.sub(_regex_style , " ", text) 59 | stripper = _MLStripper() 60 | cleaned = stripper.unescape(text) 61 | stripper.feed(cleaned) 62 | cleaned = stripper.get_data() 63 | cleaned = re.sub(_regex_whitespace, " ", cleaned) 64 | return cleaned 65 | 66 | #endregion remove_html 67 | 68 | -------------------------------------------------------------------------------- /eslib/procs/Throttle.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | import time 5 | 6 | 7 | class Throttle(Processor): 8 | """ 9 | Only pass through documents that satisfy a whitelist of terms or where certain terms do not occur in a combination 10 | with blacklisted terms. 11 | 12 | Connectors: 13 | input (esdoc) : Incoming document in 'esdoc' dict format. 14 | Sockets: 15 | output (esdoc) : Documents that passed the blacklist filtering, arrived on 'input' connector. 16 | 17 | Config: 18 | delay = 1.0 : Time to delay document throughput, in seconds (float). 19 | drop = False : Drop items we don't have time for instead of buffering up. 
20 | """ 21 | 22 | def __init__(self, **kwargs): 23 | super(Throttle, self).__init__(**kwargs) 24 | 25 | m = self.create_connector(self._incoming, "input", None, "Incoming document.") 26 | self.output = self.create_socket("output" , None, "Outgoing document.", mimic=m) 27 | 28 | self.config.set_default( 29 | delay = 1.0, 30 | drop = True 31 | ) 32 | 33 | self._last_write_ts = 0 34 | 35 | def on_open(self): 36 | self._last_write_ts = 0 37 | 38 | def _incoming(self, doc): 39 | if self.output.has_output: 40 | if self.config.drop: 41 | now_ts = time.time() 42 | if now_ts - self._last_write_ts > self.config.delay: # Otherwise just ignore the incoming doc 43 | self._last_write_ts = now_ts 44 | self.output.send(doc) 45 | #print "QUEUE=", self.connectors["input"].queue.qsize() 46 | else: 47 | time.sleep(self.config.delay) 48 | self.output.send(doc) 49 | #print "QUEUE=", self.connectors["input"].queue.qsize() 50 | -------------------------------------------------------------------------------- /examples/service_run_dir/config/logging.yaml: -------------------------------------------------------------------------------- 1 | version : 1 2 | disable_existing_loggers: False 3 | formatters: 4 | brief: 5 | format: "%(firstName) -20s %(serviceName) -20s %(className) -20s %(instanceName) -20s %(levelname) -10s %(message)s" 6 | individual: 7 | format: "%(asctime)s %(className) -20s %(instanceName) -20s %(levelname) -10s %(message)s" 8 | root: 9 | format: "%(asctime)s %(name) -50s %(levelname) -10s %(message)s" 10 | 11 | handlers: 12 | console: 13 | class : logging.StreamHandler 14 | formatter : brief 15 | level : INFO 16 | #filters : [allow_foo] 17 | stream : ext://sys.stdout 18 | file_root: 19 | class : logging.handlers.RotatingFileHandler 20 | formatter : root 21 | filename : root.log 22 | maxBytes : 1024 23 | backupCount : 3 24 | file_service: 25 | class : logging.handlers.RotatingFileHandler 26 | formatter : individual 27 | filename : service.log 28 | maxBytes : 1024 29 | backupCount : 3 30 | file_proc: 31 | class : logging.handlers.RotatingFileHandler 32 | formatter : individual 33 | filename : proc.log 34 | maxBytes : 1024 35 | backupCount : 3 36 | file_doc: 37 | class : logging.handlers.RotatingFileHandler 38 | formatter : individual 39 | filename : doc.log 40 | maxBytes : 1024 41 | backupCount : 3 42 | loggers: 43 | "": 44 | handlers : [file_root] 45 | level : DEBUG 46 | servicelog: 47 | handlers : [console, file_service] 48 | level : DEBUG 49 | propagate : false 50 | proclog: 51 | handlers : [console, file_proc] 52 | level : DEBUG 53 | propagate : false 54 | doclog: 55 | handlers : [file_doc] 56 | level : DEBUG 57 | propagate : false 58 | 59 | # servicelog.SERVICE.INSTANCE 60 | 61 | doclog.myservice.myinstance: 62 | level: DEBUG 63 | -------------------------------------------------------------------------------- /eslib/TerminalInfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .Connector import Connector 4 | from .Socket import Socket 5 | 6 | 7 | class TerminalInfo(object): 8 | 9 | def __init__(self, terminal=None, include_connections=True): 10 | if terminal: 11 | self.type = terminal.type # t.__class__.__name__ 12 | owner_name = "orphan" 13 | if terminal.owner: owner_name = terminal.owner.name or "???" 14 | if terminal.owner: owner_name = terminal.owner.name or "???" 
15 | self.owner = owner_name 16 | self.name = terminal.name 17 | self.protocol = terminal.protocol 18 | self.description = terminal.description 19 | connections = terminal.get_connections() 20 | self.count = len(connections) 21 | self.connections = [] 22 | if include_connections: 23 | for c in terminal.get_connections(): 24 | self.connections.append(TerminalInfo(c, False)) 25 | 26 | def DUMP(self, follow_connections=True, verbose=False, indent=0): 27 | spacing = " " 28 | spc = spacing * indent 29 | type_indicator = "?" 30 | mimic_str = "" 31 | if self.type is Socket: 32 | type_indicator = "+" 33 | if self.mimic: 34 | mimic_str = " (mimic=%s)" % self.mimic.name 35 | elif self.type is Connector: 36 | type_indicator = "-" 37 | 38 | print "%s%c%s.%s(%s) (conns=%d)%s" % (spc, type_indicator, self.owner, self.name, self.protocol, self.count, mimic_str) 39 | if verbose and self.description: 40 | print "\"%s%s%s\"" % (spc, spc, self.description) 41 | 42 | if follow_connections and self.connections: 43 | subindent = 0 44 | if verbose: 45 | print "%sConnections:" % spc 46 | subindent += 1 47 | for c in self.connections: 48 | c.DUMP(False, verbose, subindent+1) 49 | 50 | -------------------------------------------------------------------------------- /test/test_procs/test_pattern_remover.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from eslib.procs import PatternRemover 5 | from eslib import esdoc 6 | 7 | class TestPatternRemover(unittest.TestCase): 8 | 9 | def test_str(self): 10 | dirty = u"Oh my fucking god…" 11 | 12 | p = PatternRemover(patterns=["my", u"\S+…"]) 13 | p.on_open() # Force generation of internal regexes 14 | cleaned = p._clean(dirty) 15 | print "D=", dirty 16 | print "C=", cleaned 17 | 18 | self.assertTrue(cleaned == "Oh fucking") 19 | 20 | def test_field(self): 21 | dirty_text = u"Oh my fucking god…" 22 | 23 | dirty = { 24 | "_id": "somedoc", 25 | "_source": { 26 | "text": dirty_text 27 | } 28 | } 29 | 30 | p = PatternRemover(patterns=["my", u"\S+…"], target_field="cleaned") 31 | p.on_open() # Force generation of internal regexes 32 | cleaned = p._clean(dirty) 33 | print "D=", esdoc.getfield(cleaned, "_source.text") 34 | print "C=", esdoc.getfield(cleaned, "_source.cleaned") 35 | 36 | self.assertTrue(esdoc.getfield(cleaned, "_source.text" ) == dirty_text) 37 | self.assertTrue(esdoc.getfield(cleaned, "_source.cleaned") == "Oh fucking") 38 | 39 | def test_field_map(self): 40 | dirty = { 41 | "_id": "somedoc", 42 | "_source": { 43 | "A": "This was A", 44 | "b": { "B": "This was B"} 45 | } 46 | } 47 | 48 | p = PatternRemover(pattern="was", field_map={"A": "cleaned.cleaned_A", "b.B": "cleaned.cleaned_B"}) 49 | p.on_open() # Force generation of internal regexes 50 | cleaned = p._clean(dirty) 51 | 52 | self.assertTrue(esdoc.getfield(cleaned, "_source.cleaned.cleaned_A") == "This A") 53 | self.assertTrue(esdoc.getfield(cleaned, "_source.cleaned.cleaned_B") == "This B") 54 | 55 | def main(): 56 | unittest.main() 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /eslib/procs/CLIReader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Eivind Eidheim Elseth' 2 | import time 3 | import subprocess 4 | 5 | from ..Monitor import Monitor 6 | import logging 7 | 8 | class CLIReader(Monitor): 9 | """ 10 | The CLIReader is a Generator that will periodically call a command 
line utility 11 | 12 | Sockets: 13 | stdout (str) (default) : Output from the command line utility's stdout 14 | stderr (str) : Output from the command line utility's stderr 15 | 16 | Config: 17 | cmd = None : The command to run 18 | interval = 10 : The waiting period in seconds between each time the command is run 19 | 20 | """ 21 | 22 | def __init__(self, **kwargs): 23 | super(CLIReader, self).__init__(**kwargs) 24 | self._stdout = self.create_socket("stdout", "str", "The output to stdout from the command line utility", is_default=True) 25 | self._stderr = self.create_socket("stderr", "str", "The output to stderr from the command line utility") 26 | self.config.set_default( 27 | interval = 10 28 | ) 29 | self.last_get = None 30 | 31 | def on_tick(self): 32 | if not self.last_get or (time.time() - self.last_get > self.config.interval): 33 | # Since the next call may crash, at least mark the last attempt as now, 34 | # so we don't try again on every tick, but wait for the next interval. 35 | self.last_get = time.time() 36 | 37 | p = subprocess.Popen(self.config.cmd, shell=False, stdout=subprocess.PIPE) 38 | p.wait() 39 | (output, err) = p.communicate() 40 | if output: 41 | if self.doclog.isEnabledFor(logging.TRACE): 42 | self.doclog.trace("Output doc: %s" % str(output)) 43 | self._stdout.send(output) 44 | if err: 45 | self.log.error("Received message from subprocess on stderr: %s" % str(err)) 46 | self._stderr.send(err) 47 | 48 | self.last_get = time.time() 49 | -------------------------------------------------------------------------------- /eslib/procs/FileWriter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | # TODO: Verify encoding working, especially when writing to stdout 4 | 5 | from ..Processor import Processor 6 | import sys 7 | from ..esdoc import tojson 8 | 9 | 10 | class FileWriter(Processor): 11 | """ 12 | Write incoming documents to specified file or standard output. 13 | Documents of dict type are written as json documents, per line. Other types are written directly with 14 | their string representation. 15 | 16 | Connectors: 17 | input (*) : Incoming documents to write to file as string or json objects per line. 18 | 19 | Config: 20 | filename = None : If not set then 'stdout' is assumed. 21 | append = False : Whether to append to existing file, rather than overwrite. 
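Example (illustrative filename): FileWriter() writes to stdout, while FileWriter(filename="out.json", append=True) appends to out.json.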
22 | """ 23 | def __init__(self, **kwargs): 24 | super(FileWriter, self).__init__(**kwargs) 25 | self.create_connector(self._incoming, "input", None, "Incoming documents to write to file as string or JSON objects per line.") 26 | 27 | self.config.set_default( 28 | filename = None, 29 | append = False 30 | ) 31 | 32 | self._file = None 33 | 34 | def on_open(self): 35 | 36 | if self._file: 37 | self.log.error("on_open() attempted when _file exists -- should not be possible.") 38 | return 39 | 40 | if not self.config.filename: 41 | # Assuming stdout 42 | self._file = sys.stdout 43 | else: 44 | # May raise exception: 45 | self._file = open(self.config.filename, "a" if self.config.append else "w") 46 | 47 | def on_close(self): 48 | if self._file and self._file != sys.stdout: 49 | self._file.close() 50 | self._file = None 51 | 52 | def _incoming(self, document): 53 | if document: 54 | if type(document) is dict: 55 | print >> self._file, tojson(document) 56 | else: 57 | print >> self._file, document 58 | self._file.flush() 59 | -------------------------------------------------------------------------------- /eslib/prog.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.prog 5 | ~~~~~~~~~~ 6 | 7 | Helper functions for running as an executable program. 8 | """ 9 | 10 | 11 | __all__ = ( "progname", "initlogs") 12 | 13 | import os, sys, logging.config, yaml 14 | 15 | 16 | def progname(): 17 | return os.path.basename(sys.argv[0]) 18 | 19 | def initlogs(config_file=None): 20 | # if config_file: 21 | # config_file = os.path.join(os.getcwd(), config_file) 22 | # else: 23 | # location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) 24 | # config_file = os.path.join(location, 'logging.yml') 25 | # 26 | # config = yaml.load(open(config_file)) # TODO: YAML files are in UTF-8... if terminal is something else, make sure we convert correctly 27 | # logging.config.dictConfig(config=config) 28 | 29 | if config_file: 30 | config_file = os.path.join(os.getcwd(), config_file) 31 | config = yaml.load(open(config_file)) # TODO: YAML files are in UTF-8... 
if terminal is something else, make sure we convert correctly 32 | logging.config.dictConfig(config=config) 33 | else: 34 | console = logging.StreamHandler() 35 | console.setLevel(logging.TRACE) 36 | LOG_FORMAT = '%(firstName) -20s %(levelname) -10s %(className) -20s %(instanceName) -20s %(funcName) -25s %(lineno) -5d: %(message)s' 37 | console.setFormatter(logging.Formatter(LOG_FORMAT)) 38 | 39 | servicelog = logging.getLogger("servicelog") 40 | servicelog.setLevel(logging.TRACE) 41 | servicelog.propagate = False 42 | servicelog.addHandler(console) 43 | 44 | proclog = logging.getLogger("proclog") 45 | proclog.setLevel(logging.TRACE) 46 | proclog.propagate = False 47 | proclog.addHandler(console) 48 | 49 | doclog = logging.getLogger("doclog") 50 | doclog.setLevel(logging.TRACE) 51 | doclog.propagate = False 52 | doclog.addHandler(console) 53 | 54 | rootlog = logging.getLogger() 55 | rootlog.setLevel(logging.WARNING) 56 | rootlog.addHandler(console) 57 | -------------------------------------------------------------------------------- /test/test_procs/data/twitter_raw_mock.json: -------------------------------------------------------------------------------- 1 | { 2 | "id_str": "520149420122578944", 3 | "text": "These clowns must hope that we never cum under attack from any force-r we capable of protecting ourselves?", 4 | "truncated": false, 5 | "lang": "en", 6 | "created_at": "Thu Oct 09 09:51:00 +0000 2014", 7 | "timestamp_ms": "1412848260328", 8 | "source": "Twitter for BlackBerry®", 9 | 10 | "in_reply_to_user_id_str": null, 11 | "in_reply_to_screen_name": null, 12 | "in_reply_to_status_id_str": null, 13 | 14 | "geo": { 15 | "coordinates": [ 16 | -34.07079, 17 | 18.57407 18 | ], 19 | "type": "Point" 20 | }, 21 | 22 | "place": { 23 | "country": "South Africa", 24 | "country_code": "ZA" 25 | }, 26 | 27 | "entities": { 28 | "urls": [ 29 | { 30 | "display_url": "eraliquida.com/?p=1010", 31 | "expanded_url": "http://www.eraliquida.com/?p=1010", 32 | "indices": [ 33 | 7, 34 | 29 35 | ], 36 | "url": "http://t.co/2OdUzFv0Ev" 37 | } 38 | ], 39 | "hashtags": [ 40 | { 41 | "text": "偽2ch騒動", 42 | "indices": [ 43 | 100, 44 | 107 45 | ] 46 | }, 47 | { 48 | "text": "偽2ch問題", 49 | "indices": [ 50 | 108, 51 | 115 52 | ] 53 | } 54 | ], 55 | "user_mentions": [ 56 | { 57 | "name": "اقوى العروض وارخصها", 58 | "screen_name": "rt_ld", 59 | "id_str": "2649736855", 60 | "indices": [ 61 | 0, 62 | 6 63 | ], 64 | "id": 2649736855 65 | } 66 | ] 67 | }, 68 | 69 | "user": { 70 | "id_str": "2196916282", 71 | "screen_name": "Mark_50598", 72 | "name": "mark fester", 73 | "lang": "en", 74 | "description": null, 75 | "created_at": "Tue Nov 26 14:21:35 +0000 2013", 76 | "location": "", 77 | "profile_image_url": "http://abs.twimg.com/sticky/default_profile_images/default_profile_1_normal.png", 78 | "protected": false, 79 | "geo_enabled": true 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | from glob import glob 6 | 7 | # PREREQUISITES: 8 | # yum install -y libxml2-devel libxslt-devel 9 | 10 | try: 11 | from setuptools import setup 12 | except ImportError: 13 | from distutils.core import setup 14 | 15 | if sys.argv[-1] == 'publish': 16 | os.system('python setup.py sdist upload') 17 | sys.exit() 18 | 19 | requires = [ 20 | 'elasticsearch', 21 | 'lxml', 22 | 'oauthlib', 23 | 'python-daemon==2.0.6', # For services, version 
2.1 is fucked (change user stuff) 24 | 'argparse', 25 | 'psutil', 'setproctitle', 26 | 'pika', 'pyrabbit', # for Rabbitmq 27 | 'pykafka', # For Kafka 28 | 'HTMLParser', 29 | 'requests>=2', # version >=2 needed by TwitterAPI 30 | 'TwitterAPI', 31 | 'PyYAML', # for prog logging init stuff 32 | 'feedparser', # for rss 33 | 'python-dateutil', 34 | # 'mock' # for testing 35 | 'beautifulsoup4', 36 | 'textblob', 'justext' # for web.py 37 | ] 38 | 39 | 40 | setup( 41 | name='eslib', 42 | version='0.0.14', 43 | description='Document processing framework and utility for Elasticsearch (or whatever).', 44 | #long_description=open("README.md").read(), 45 | author='Hans Terje Bakke', 46 | author_email='hans.terje.bakke@comperio.no', 47 | url='https://github.com/comperiosearch/elasticsearch-eslib', 48 | keywords="document processing docproc", 49 | packages=['eslib', 'eslib.procs', 'eslib.service'], 50 | # package_data={'': ['LICENSE', 'README.md', 'PROTOCOLS.md']}, 51 | scripts=glob('bin/*'), 52 | include_package_data=True, 53 | # TODO: examples in package data 54 | install_requires=requires, 55 | license='Apache 2.0', 56 | zip_safe=False, 57 | 58 | classifiers=( 59 | 'Development Status :: 5 - Production/Stable', 60 | 'Intended Audience :: Developers', 61 | 'Natural Language :: English', 62 | 'License :: OSI Approved :: Apache Software License', 63 | 'Programming Language :: Python', 64 | 'Programming Language :: Python :: 2', 65 | 'Programming Language :: Python :: 2.7' 66 | ) 67 | ) 68 | -------------------------------------------------------------------------------- /test/test_procs/test_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from eslib.procs import Transformer 5 | 6 | class TestProtocolConverter(unittest.TestCase): 7 | 8 | def test_func_one_lambda(self): 9 | 10 | csv2list = lambda proc, doc: [",".join(doc)] 11 | 12 | p = Transformer(func=csv2list, input_protocol="list", output_protocol="csv") 13 | 14 | output = [] 15 | p.add_callback(lambda proc, doc: output.append(doc)) 16 | 17 | p.start() 18 | p.put(["a","b","c","d"]) 19 | p.stop() 20 | p.wait() 21 | 22 | print "output=", output[0] 23 | 24 | self.assertEqual(output[0], "a,b,c,d") 25 | 26 | 27 | def yieldfunc(self, proc, doc): 28 | yield doc.lower() 29 | yield doc.upper() 30 | 31 | def test_func_multi_yield(self): 32 | 33 | p = Transformer(func=self.yieldfunc, input_protocol="str", output_protocol="str") 34 | 35 | output = [] 36 | p.add_callback(lambda proc, doc: output.append(doc)) 37 | 38 | p.start() 39 | p.put("a") 40 | p.put("b") 41 | p.put("c") 42 | p.stop() 43 | p.wait() 44 | 45 | joined = ",".join(output) 46 | print "output=", joined 47 | 48 | self.assertEqual(joined, "a,A,b,B,c,C") 49 | 50 | 51 | def edge2ids(self, proc, doc): 52 | if doc["type"] == "author": 53 | yield doc["from"] 54 | else: 55 | yield doc["from"] 56 | yield doc["to"] 57 | 58 | def test_graph_edge_convertion(self): 59 | p = Transformer(func=self.edge2ids, input_protocol="str", output_protocol="str") 60 | 61 | output = [] 62 | p.add_callback(lambda proc, doc: output.append(doc)) 63 | 64 | p.start() 65 | p.put({"type": "author" , "from": "1", "to": "1"}) 66 | p.put({"type": "mention", "from": "2", "to": "3"}) 67 | p.put({"type": "quote" , "from": "4", "to": "1"}) 68 | p.stop() 69 | p.wait() 70 | 71 | joined = ",".join(output) 72 | print "output=", joined 73 | 74 | self.assertEqual(joined, "1,2,3,4,1") 75 | 76 | 77 | 78 | def main(): 79 | unittest.main() 80 | 81 | 
if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /test/test_procs/test_http_monitor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from eslib.procs import HttpMonitor 5 | import requests 6 | 7 | import eslib.prog 8 | eslib.prog.initlogs() 9 | 10 | class TestHttpMonitor(unittest.TestCase): 11 | 12 | def test_get(self): 13 | self.hooked_msg = None 14 | output = [] 15 | 16 | p = HttpMonitor(hook=self._hook) # localhost:4000 by default 17 | p.add_callback(lambda proc, doc: output.append(doc)) 18 | 19 | print "Starting server." 20 | p.start() 21 | 22 | print "Sending request" 23 | res = requests.get("http://localhost:4000/ppp?arg=aaa") 24 | print "RES=", res, res.content 25 | 26 | 27 | print "Stopping server" 28 | p.stop() 29 | p.wait() 30 | print "Server finished." 31 | 32 | self.assertEquals(self.hooked_msg, "GET_/ppp?arg=aaa") 33 | self.assertEquals(output[0], "ppp?arg=aaa") 34 | 35 | def test_post(self): 36 | self.hooked_msg = None 37 | output = [] 38 | 39 | p = HttpMonitor(hook=self._hook) # localhost:4000 by default 40 | p.add_callback(lambda proc, doc: output.append(doc)) 41 | 42 | print "Starting server." 43 | p.start() 44 | 45 | print "Sending request (text)" 46 | res = requests.post("http://localhost:4000/ppp?arg=aaa", data="some data", headers={'content-type': 'text/text'}) 47 | print "RES=", res, res.content 48 | print "Sending request (json)" 49 | res = requests.post("http://localhost:4000/ppp?arg=aaa", data="[1, 2, 3]", headers={'content-type': 'application/json'}) 50 | print "RES=", res, res.content 51 | 52 | print "Stopping server" 53 | p.stop() 54 | p.wait() 55 | print "Server finished." 56 | 57 | self.assertEquals(self.hooked_msg, "POST_/ppp?arg=aaa") 58 | self.assertEquals(output[0], "some data") 59 | self.assertEquals(output[1], [1, 2, 3]) 60 | 61 | def _hook(self, request_handler, verb, path, data, format="application/json"): 62 | print "Hook called: ", verb, path, data 63 | self.hooked_msg = "%s_%s" % (verb, path) 64 | 65 | 66 | def main(): 67 | unittest.main() 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /examples/remoting/RemotingClient.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # NOTE: 4 | # Example usage of the currently (as of writing) experimental RemotingService, 5 | # talking to the DummyRemotingService. 6 | 7 | # Import and set up some simple logging 8 | from eslib.service.Client import Client 9 | import logging, time 10 | # for handler in logging.root.handlers[:]: 11 | # logging.root.removeHandler(handler) 12 | logging.getLogger("requests").setLevel(logging.WARNING) 13 | format='%(name)10s %(levelname)8s %(message)s' 14 | logging.basicConfig(format=format, level=logging.INFO) 15 | 16 | # One way of creating the client, by asking the service manager for a service named "remoting". 17 | # (We call ourself the "Hooker" client, hooking onto the dummy service. It is just a name..) 18 | client = Client("Hooker", manager="localhost:5000", service="remoting") 19 | 20 | # Another way is to address the service directly: 21 | # client = Client("Hooker", address="localhost:5001") 22 | 23 | # We can ask it for status... whether it is "DEAD", "idle", "processing", "stopping", etc. 
24 | print "STATUS =", client.status() 25 | 26 | # We can ask to see detailed stats 27 | print "STATS =", client.stats() 28 | 29 | # We can ask to see what knowledge it has of the metadata from the common service metadata repository 30 | print "META =", client.meta() 31 | 32 | # We can list all available HTTP routes 33 | print "HELP =" 34 | for item in client.help()["routes"]: 35 | print " %-6s %s" % tuple(item.split(" ")) 36 | 37 | # We can start and stop the service (the processing part, not run and shut down the service process itself): 38 | # print "START=", client.start() 39 | # print "STATUS=", client.status() 40 | # print "STOP=", client.stop() 41 | # print "STATUS=", client.status() 42 | # time.sleep(2) 43 | # print "STATUS=", client.status() 44 | 45 | # TODO: We might want to be able to send stop(wait=True, timeout=10) 46 | #print "START=", client.start() # NOTE: Will get error back if already started... 47 | 48 | # This is how we send data to the service for further processing 49 | print "PUT=", client.put("yo", "input") 50 | 51 | # This is how we ask for a portion (here batch size = 2) of data queued for output in service. 52 | resultGenerator = list(client.fetch("output", 2)) 53 | print "FETCH", list(resultGenerator) 54 | -------------------------------------------------------------------------------- /bin/es-cleantweet: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from eslib.procs import FileReader, FileWriter, TweetEntityRemover, PatternRemover, HtmlRemover 6 | import eslib.prog 7 | import argparse 8 | 9 | 10 | def main(): 11 | desc = "Perform a chain of cleaning operations on tweets:\n" + \ 12 | " Remove entities (URLs, mentions)" + \ 13 | " Remove retweet prefix and ellipses suffix" + \ 14 | " Unescape HTML encoding" 15 | help_t = "Write cleaned text to this field instead of overwriting input field." 16 | help_f = "Field to clean. Defaults to 'text'." 
17 | 18 | parser = argparse.ArgumentParser(usage="\n %(prog)s -f field [-t target]", description=desc) 19 | parser._actions[0].help = argparse.SUPPRESS 20 | parser.add_argument("-f", "--field", default="text", help=help_f) 21 | parser.add_argument("-t", "--target", required=False, help=help_t) 22 | parser.add_argument( "--name" , help="Process name.", default=None) 23 | 24 | args = parser.parse_args() 25 | 26 | source = args.field 27 | target = args.target or args.field 28 | 29 | # Set up and run the pipeline 30 | entity_remover = TweetEntityRemover( 31 | name = "TER",#args.name or eslib.prog.progname(), 32 | source_field = source, 33 | target_field = target, 34 | remove_url = True, 35 | remove_mention = True) 36 | pattern_remover = PatternRemover( 37 | name = "PR",#args.name or eslib.prog.progname(), 38 | patterns = ["^RT @.+: ", u"\S+\u2026$"], # Retweet prefix, ellipsis suffix 39 | source_field = target, 40 | target_field = target 41 | ) 42 | unescaper = HtmlRemover(name="HR") 43 | 44 | r = FileReader() # Read from stdin 45 | w = FileWriter() # Write to stdout 46 | entity_remover.subscribe(r) 47 | pattern_remover.subscribe(entity_remover)#, socket_name="output", connector_name="input") 48 | unescaper.subscribe(pattern_remover)#, socket_name="output", connector_name="input",) 49 | w.subscribe(unescaper)#, socket_name="output") 50 | 51 | r.start() # Will cause cascading starts of each processor in the pipeline 52 | w.wait() # Wait for everything to finish writing 53 | 54 | 55 | if __name__ == "__main__": main() 56 | -------------------------------------------------------------------------------- /test/test_procs/test_dateexpander.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Mats Julian Olsen' 2 | 3 | import unittest 4 | 5 | from eslib import time 6 | from eslib.procs import DateExpander 7 | 8 | ok_date = '2014-10-14T14:26:30+01:00' 9 | ok_date_no_tz = '2014-10-14T14:26:30' 10 | wrong_date = 2013 11 | wrong_date2 = '-120-13-142T25:61:61+30:00' 12 | 13 | ok_date_fields = { 14 | 'year': 2014, 'month': 10, 'day': 14, 15 | 'hour': 13, 'minute': 26, 'second': 30, 16 | 'week': 42, 'weekday': 2 17 | } 18 | 19 | dict_wo_source = {'i': {'am': {'a': 'dict'}}} 20 | dict_w_source = {'_source': dict_wo_source} 21 | dict_wo_sourcefield = {'_source': dict_wo_source} 22 | dict_w_sourcefield = {'_source': {'created_at': dict_wo_source}} 23 | dict_w_ok_date = {'_source': {'created_at': ok_date, "date_fields": ok_date_fields}} 24 | dict_wo_ok_date = {'_source': {'created_at': wrong_date}} 25 | dict_wo_ok_date2 = {'_source': {'created_at': wrong_date2}} 26 | 27 | 28 | class TestDateMagic(unittest.TestCase): 29 | 30 | def test_all(self): 31 | date = time.utcdate(ok_date) 32 | dd = time.date_dict(date) 33 | print dd 34 | self.assertEqual(dd, ok_date_fields) 35 | 36 | 37 | class TestDateFields(unittest.TestCase): 38 | 39 | def setUp(self): 40 | self.expander = DateExpander() 41 | 42 | def test_missing_source_section(self): 43 | # if the dict doesn't have source it should be returned 44 | doc = self.expander._process(dict_wo_source) 45 | print doc 46 | self.assertDictEqual(doc, dict_wo_source) 47 | 48 | def test_missing_source_field(self): 49 | # if the dict has source, but no source_field, it should be returned 50 | doc = self.expander._process(dict_wo_sourcefield) 51 | print doc 52 | self.assertDictEqual(doc, dict_wo_sourcefield) 53 | 54 | def test_invalid_date(self): 55 | # if the date is invalid, the same doc should be returned 56 | doc = 
self.expander._process(dict_wo_ok_date) 57 | print doc 58 | self.assertDictEqual(doc, dict_wo_ok_date) 59 | 60 | def test_valid_date(self): 61 | doc = self.expander._process(dict_w_ok_date) 62 | print doc 63 | self.assertIn('date_fields', doc["_source"]) 64 | 65 | doc = self.expander._process(dict_w_ok_date) 66 | print doc 67 | self.assertEqual(doc, dict_w_ok_date) 68 | -------------------------------------------------------------------------------- /test/test_procs/test_tweet_entity_removal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | import unittest, json 5 | from eslib.procs import TweetEntityRemover 6 | from eslib import esdoc 7 | 8 | class TestTweetEntityRemoval(unittest.TestCase): 9 | 10 | def test_all(self): 11 | 12 | # Load test data 13 | self_dir, _ = os.path.split(__file__) 14 | f = open(os.path.join(self_dir, "data/tweet_entity_removal.json")) 15 | doc = json.load(f) 16 | f.close() 17 | 18 | p_none = TweetEntityRemover(remove_urls=False, remove_mentions=False) 19 | p_url = TweetEntityRemover(remove_urls=True , remove_mentions=False) 20 | p_mention = TweetEntityRemover(remove_urls=False, remove_mentions=True) 21 | p_both = TweetEntityRemover(remove_urls=True , remove_mentions=True, target_field="cleaned") 22 | 23 | cleaned_none = p_none ._clean(doc) 24 | cleaned_url = p_url ._clean(doc) 25 | cleaned_mention = p_mention._clean(doc) 26 | cleaned_both = p_both ._clean(doc) 27 | 28 | self.assertTrue(esdoc.getfield(cleaned_none , "_source.text") == "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv") 29 | self.assertTrue(esdoc.getfield(cleaned_url , "_source.text") == "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center") 30 | self.assertTrue(esdoc.getfield(cleaned_mention, "_source.text") == "Me & the lovely @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv") 31 | # Original text should be untouched, and cleaned gone to separate field: 32 | self.assertTrue(esdoc.getfield(cleaned_both , "_source.text") == "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv") 33 | self.assertTrue(esdoc.getfield(cleaned_both , "_source.cleaned") == "Me & the lovely @ Jacob K Javits Convention Center") 34 | 35 | # Verify that minimal cloning works: 36 | self.assertFalse(esdoc.getfield(doc, "_source") == esdoc.getfield(cleaned_url, "_source" ), "Expected _source old!=new") 37 | self.assertTrue (esdoc.getfield(doc, "_source.entities") == esdoc.getfield(cleaned_url, "_source.entities"), "Expected _source old==new") 38 | 39 | def main(): 40 | unittest.main() 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /eslib/procs/TwitterFollowerGetter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mats' 2 | from ..Generator import Generator 3 | from .twitter import Twitter 4 | 5 | class TwitterFollowerGetter(Generator): 6 | """ 7 | This generator takes as input the ids of twitter users, and then goes 8 | ahead and retrieves the followers or friends of this user, 9 | and outputs the ids. 10 | 11 | # TODO: Document argument 'twitter' and how to configure this. 'outgoing' 12 | 13 | Connectors: 14 | ids (str) : Incoming IDs to get data for. 15 | Sockets: 16 | ids (str) : IDs of related nodes. 
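        edge (graph-edge) : Relation edges (type from config 'reltype') between the incoming user and each related user.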
17 | 18 | Config: 19 | outgoing = True : # TODO: Document this 20 | """ 21 | def __init__(self, twitter=None, **kwargs): 22 | super(TwitterFollowerGetter, self).__init__(**kwargs) 23 | self.twitter = twitter 24 | self.create_connector(self._incoming, "ids", "str") 25 | self._output_id = self.create_socket("ids", "str", "IDs of related nodes.") 26 | self._output_edge = self.create_socket("edge", "graph-edge") 27 | self.config.set_default(outgoing=True, reltype="follows") 28 | 29 | 30 | def on_open(self): 31 | if self.twitter is None: 32 | self.twitter = Twitter( 33 | consumer_key=self.config.consumer_key, 34 | consumer_secret=self.config.consumer_secret, 35 | access_token=self.config.access_token, 36 | access_token_secret=self.config.access_token_secret 37 | ) 38 | 39 | def _incoming(self, document): 40 | try: 41 | id_ = int(document) 42 | except ValueError: 43 | self.doclog.exception("Could not parse id: %s to int" % str(document)) 44 | else: 45 | related = self.twitter.get_follows(uid=str(id_), outgoing=self.config.outgoing) 46 | self._send(id_, related) 47 | 48 | def _send(self, origin, related): 49 | for id_ in related: 50 | edge = {"from": None, "type": self.config.reltype, "to": None} 51 | self._output_id.send(id_) 52 | if self.config.outgoing: 53 | edge["from"] = origin 54 | edge["to"] = id_ 55 | else: 56 | edge["from"] = id_ 57 | edge["to"] = origin 58 | 59 | if all(edge.itervalues()): 60 | self.doclog.trace("Sending edge %s to Neo4j" % str(edge)) 61 | self._output_edge.send(edge) 62 | else: 63 | self.doclog.error("Edge had None-fields: %s" % str(edge)) -------------------------------------------------------------------------------- /eslib/procs/TweetEntityRemover.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | from eslib.text import remove_parts 5 | from .. import esdoc 6 | 7 | class TweetEntityRemover(Processor): 8 | """ 9 | Remove URLs and/or mentioned users from the tweet text. 10 | 11 | Protocols: 12 | 13 | esdoc.tweet: 14 | 15 | # TODO 16 | 17 | Connectors: 18 | input (esdoc.tweet) : Tweet 19 | Sockets: 20 | output (esdoc.tweet) : Tweet (possibly extended with a cleaned field) 21 | 22 | Config: 23 | source_field = "text" : Part of twitter dev credentials. 24 | target_field = None : Defaults to 'source_field', replacing the input field. 
25 | remove_urls = True 26 | remove_mentions = False 27 | """ 28 | 29 | 30 | def __init__(self, **kwargs): 31 | super(TweetEntityRemover, self).__init__(**kwargs) 32 | 33 | self.create_connector(self._incoming, "input", "esdoc.tweet", "Incoming tweet.") 34 | self.output = self.create_socket("output" , "esdoc.tweet" , "Outgoing, cleaned, tweet.") 35 | 36 | self.config.set_default( 37 | source_field = "text", 38 | target_field = None, 39 | remove_urls = True, 40 | remove_mentions = False 41 | ) 42 | 43 | def _clean(self, doc): 44 | 45 | source = doc.get("_source") 46 | if not source: 47 | return doc 48 | 49 | text = esdoc.getfield(source, self.config.source_field) 50 | 51 | coords = [] 52 | entities = source.get("entities") 53 | if self.config.remove_urls: 54 | x = esdoc.getfield(entities, "urls", []) 55 | coords += [l["indices"] for l in x] 56 | if self.config.remove_mentions: 57 | x = esdoc.getfield(entities, "user_mentions", []) 58 | coords += [l["indices"] for l in x] 59 | cleaned = None 60 | if not text: 61 | cleaned = text 62 | else: 63 | # The removal from coords most often leaves two spaces, so remove them, too, and strip border spaces. 64 | cleaned = remove_parts(text, coords).replace(" ", " ").strip() 65 | 66 | return esdoc.shallowputfield(doc, "_source." + (self.config.target_field or self.config.source_field), cleaned) 67 | 68 | def _incoming(self, doc): 69 | if not self.output.has_output: 70 | return # No point then... 71 | cleaned_doc = self._clean(doc) 72 | self.output.send(cleaned_doc) 73 | -------------------------------------------------------------------------------- /test/test_procs/test_tweet_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import unittest, json 5 | from eslib.procs import TwitterMonitor, TweetExtractor 6 | 7 | class TestTwitterMonitor(unittest.TestCase): 8 | 9 | def test_simple(self): 10 | 11 | # Load test data 12 | self_dir, _ = os.path.split(__file__) 13 | f = open(os.path.join(self_dir, "data/twitter_raw_mock.json")) 14 | j = json.load(f) 15 | f.close() 16 | 17 | m = TwitterMonitor() 18 | raw, tweet_mon = m._decode(j) 19 | 20 | x = TweetExtractor() 21 | tweet, users, links = x._extract(tweet_mon) 22 | 23 | # Test links 24 | self.assertTrue(len(links) == 1) 25 | self.assertTrue(links[0]["what"] == "twitter") 26 | self.assertTrue(links[0]["who"] == "2196916282") 27 | self.assertTrue(links[0]["url"] == "http://www.eraliquida.com/?p=1010") 28 | 29 | # Test users 30 | self.assertTrue(len(users) == 2) 31 | self.assertTrue(users[0]["from"] == "2196916282") 32 | self.assertTrue(users[1]["from"] == "2196916282") 33 | self.assertTrue(users[0]["to"] == "2196916282") 34 | self.assertTrue(users[1]["to"] == "2649736855") 35 | self.assertTrue(users[0]["type"] == "author") 36 | self.assertTrue(users[1]["type"] == "mention") 37 | 38 | # Test tweet 39 | self.assertTrue(tweet["_id"] == "520149420122578944") 40 | self.assertTrue(tweet["_source"]["source"] == u"Twitter for BlackBerry®") 41 | self.assertTrue(tweet["_source"]["text"] == u'These clowns must hope that we never cum under attack from any force-r we capable of protecting ourselves?') 42 | self.assertTrue(str(tweet["_source"]["created_at"]) == "2014-10-09 09:51:00.328000") 43 | self.assertTrue("geo" in tweet["_source"]) 44 | self.assertTrue(tweet["_source"]["lang"] == "en") 45 | self.assertTrue(tweet["_source"]["place"]["country"] == "South Africa") 46 | self.assertFalse("in_reply_to" in tweet["_source"]) 47 | # User 
48 | self.assertTrue(tweet["_source"]["user"]["id"] == "2196916282") 49 | self.assertTrue(tweet["_source"]["user"]["lang"] == "en") 50 | self.assertTrue(tweet["_source"]["user"]["name"] == "mark fester") 51 | self.assertFalse("description" in tweet["_source"]["user"]) 52 | self.assertTrue(str(tweet["_source"]["user"]["created_at"]) == "2013-11-26 14:21:35") 53 | 54 | # Entities 55 | # // TODO 56 | 57 | def main(): 58 | unittest.main() 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /eslib/Terminal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | class TerminalProtocolException(Exception): 5 | def __init__(self, socket, connector): 6 | msg = "Socket: %s.%s(%s), Connector: %s.%s(%s)" % (socket.owner.name, socket.name, socket.protocol, connector.owner.name, connector.name, connector.protocol) 7 | super(Exception, self).__init__(self, msg) 8 | 9 | 10 | class Terminal(object): 11 | "Common abstract base class for connectors and sockets." 12 | 13 | ANY_PROTOCOL = "any" 14 | 15 | def __init__(self, name, protocol): 16 | self.type = None # type: Either 'Socket' or 'Connector' 17 | self.owner = None # Processor: 18 | self.name = "" # str: Name of terminal 19 | self.protocol = "" # str: Name of object format expected as input/output on this terminal 20 | self.description = "" # str: Text describing purpose and property of this terminal 21 | 22 | self.connections = [] 23 | 24 | self.name = name or "unnamed" 25 | self.protocol = protocol or Terminal.ANY_PROTOCOL 26 | 27 | def __str__(self): 28 | return "%s|%s" % (self.name, self.protocol) 29 | 30 | def attach(self, terminal): 31 | self.connections.append(terminal) 32 | 33 | def detach(self, terminal): 34 | if terminal in self.connections: 35 | self.connections.remove(terminal) 36 | 37 | def get_connections(self, owner=None, terminal_name=None): 38 | "Return all connections if owner is missing. Ignore terminal_name is owner is missing." 39 | connections = [] 40 | for c in self.connections[:]: 41 | if not owner or (c.owner == owner and (not terminal_name or c.name == terminal_name)): 42 | connections.append(c) 43 | return connections 44 | 45 | @staticmethod 46 | def protocol_compliance(socket, connector): 47 | if connector.protocol == Terminal.ANY_PROTOCOL or socket.protocol == Terminal.ANY_PROTOCOL: 48 | return True 49 | # In case the socket is set to mimic the protocol of one of its connectors, we check for that 50 | # instead of the directly registered protocol. 51 | ss = socket.protocol.split(".") 52 | sm = socket.mimiced_protocol.split(".") 53 | cc = connector.protocol.split(".") 54 | # print "SS=", ss[:len(cc)] 55 | # print "SM=", sm[:len(cc)] 56 | # print "CC=", cc[:len(cc)] 57 | # print "%s == %s" % (sm[:len(cc)], cc[:len(cc)]) 58 | return (ss[:len(cc)] == cc[:len(cc)]) or (sm[:len(cc)] == cc[:len(cc)]) 59 | -------------------------------------------------------------------------------- /eslib/procs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.procs 5 | ~~~~~ 6 | 7 | Document processing processors. 
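
Processors are typically chained with subscribe() and driven from the first reader.
A minimal sketch (see bin/es-cleantweet for a complete example):

    r = FileReader()   # reads stdin by default
    w = FileWriter()   # writes stdout by default
    w.subscribe(r)     # connect w's default connector to r's default socket
    r.start()          # cascades starts through the pipeline
    w.wait()           # wait for the writer to finish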
8 | """ 9 | 10 | 11 | from .ElasticsearchReader import ElasticsearchReader 12 | from .ElasticsearchWriter import ElasticsearchWriter 13 | from .FileReader import FileReader 14 | from .FileWriter import FileWriter 15 | from .TcpWriter import TcpWriter 16 | from .RabbitmqMonitor import RabbitmqMonitor 17 | from .RabbitmqWriter import RabbitmqWriter 18 | from .KafkaMonitor import KafkaMonitor 19 | from .KafkaWriter import KafkaWriter 20 | from .HttpMonitor import HttpMonitor 21 | from .CsvConverter import CsvConverter 22 | from .WebGetter import WebGetter 23 | from .Neo4jWriter import Neo4jWriter 24 | from .Neo4jReader import Neo4jReader 25 | from .TwitterMonitor import TwitterMonitor 26 | from .TwitterUserGetter import TwitterUserGetter 27 | from .TwitterFollowerGetter import TwitterFollowerGetter 28 | from .TweetEntityRemover import TweetEntityRemover 29 | from .TweetExtractor import TweetExtractor 30 | from .PatternRemover import PatternRemover 31 | from .HtmlRemover import HtmlRemover 32 | from .BlacklistFilter import BlacklistFilter 33 | from .Throttle import Throttle 34 | from .Transformer import Transformer 35 | from .EntityExtractor import EntityExtractor 36 | from .ProcessWrapper import ProcessWrapper 37 | from .CLIReader import CLIReader 38 | from .RssMonitor import RssMonitor 39 | from .Timer import Timer 40 | from .DateExpander import DateExpander 41 | from .SmtpMailer import SmtpMailer 42 | from .FourChanMonitor import FourChanMonitor 43 | 44 | __all__ = ( 45 | "ElasticsearchReader", 46 | "ElasticsearchWriter", 47 | "FileReader", 48 | "FileWriter", 49 | "TcpWriter", 50 | "RabbitmqMonitor", 51 | "RabbitmqWriter", 52 | "KafkaMonitor", 53 | "KafkaWriter", 54 | "HttpMonitor", 55 | "CsvConverter", 56 | "WebGetter", 57 | "Neo4jWriter", 58 | "Neo4jReader", 59 | "TwitterMonitor", 60 | "TwitterUserGetter", 61 | "TwitterFollowerGetter", 62 | "TweetEntityRemover", 63 | "TweetExtractor", 64 | "PatternRemover", 65 | "HtmlRemover", 66 | "BlacklistFilter", 67 | "Throttle", 68 | "Transformer", 69 | "EntityExtractor", 70 | "ProcessWrapper", 71 | "CLIReader", 72 | "RssMonitor", 73 | "Timer", 74 | "DateExpander", 75 | "SmtpMailer", 76 | "FourChanMonitor" 77 | ) 78 | -------------------------------------------------------------------------------- /eslib/procs/DateExpander.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Mats Julian Olsen' 2 | 3 | from ..Processor import Processor 4 | from .. import esdoc 5 | from .. import time 6 | 7 | 8 | class DateExpander(Processor): 9 | """ 10 | This processor will use a date field in an esdoc as a basis for constructing 11 | an object with 12 | 13 | year 14 | month (1 through 12) 15 | day (1 through 31) 16 | hour (0 through 23) 17 | minute (0 through 59) 18 | second (0 through 59) 19 | weekday (1 through 7) 20 | week (1 through 53) 21 | 22 | Connectors: 23 | input (esdoc) : Incoming. 24 | Sockets: 25 | output (esdoc) : Outgoing, with configured date field expanded. 26 | 27 | Config: 28 | source_field = "created_at" : Field which date value to expand. 29 | target_field = "date_fields" : Target field for the expanded object. 
30 | """ 31 | def __init__(self, **kwargs): 32 | super(DateExpander, self).__init__(**kwargs) 33 | self._input = self.create_connector(self._incoming, 'input', 'esdoc', "Incoming.") 34 | self._output = self.create_socket('output', 'esdoc', "Outgoing, with configured date field expanded.") 35 | 36 | self.config.set_default( 37 | source_field='created_at', 38 | target_field='date_fields' 39 | ) 40 | 41 | def _incoming(self, doc): 42 | if self._output.has_output: 43 | self._output.send(self._process(doc)) 44 | 45 | def _process(self, doc): 46 | value = esdoc.getfield(doc, "_source." + self.config.source_field) 47 | if value is None: 48 | self.doclog.warning( 49 | "Document '%s' is missing field or value in '%s'." 50 | % (doc.get("_id"), self.config.source_field)) 51 | return doc 52 | 53 | date = time.utcdate(value) 54 | if date is None: 55 | self.doclog.warning( 56 | "Document '%s' has non-date value in field '%s'." 57 | % (doc.get("_id"), self.config.source_field)) 58 | return doc 59 | 60 | date_dict = time.date_dict(date) 61 | if date_dict is None: 62 | # This should not be possible, therefore logging to proclog 63 | self.log.error("Date field extraction failed for date: %s" % date) 64 | return doc 65 | 66 | # Create a new document (if necessary) with just the minimum cloning necessary, 67 | # leaving references to the rest. 68 | return esdoc.shallowputfield(doc, '_source.' + self.config.target_field, date_dict) 69 | -------------------------------------------------------------------------------- /eslib/procs/Timer.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from eslib import Monitor 4 | import time 5 | 6 | class Timer(Monitor): 7 | """ 8 | Send a command on an output socket at configured interval. 9 | The configured 'actions' is a list of vectors of (initial_offset, interval, document). 10 | The time units are in seconds ('float'). The 'document' is *whatever* you want on to output, 11 | typically a string or a dict. type. 12 | 13 | Note that if you have very short intervals, you might want to adjust the run loop delay 'sleep' (not in 'config'). 14 | (It defaults to 0.5 seconds for this processor.) 15 | 16 | Sockets: 17 | output (*) : Output occurring at configured intervals. From the 'document' part of the configured action. 18 | 19 | Config: 20 | actions = [] : Time to delay document throughput, in seconds (float). 21 | """ 22 | def __init__(self, **kwargs): 23 | super(Timer, self).__init__(**kwargs) 24 | self._output = self.create_socket("output", None, "Output occurring at configured intervals. From the 'document' part of the configured action.") 25 | 26 | # (Override) Let ticks last half a second here by default... there's generally no rush, unless intervals are very short: 27 | self.sleep = 0.5 28 | 29 | self.config.set_default(actions=[]) # A list of tuples of (initial_offset, interval, document) 30 | 31 | self._actions = [] 32 | 33 | def on_open(self): 34 | now = time.time() 35 | self._actions = [] 36 | if self._actions is not None: 37 | if not hasattr(self._actions, '__iter__'): 38 | msg = "'config.actions' is not iterable." 39 | self.log.critical(msg) 40 | raise ValueError(msg) 41 | for a in self.config.actions: 42 | # Validate tuple format 43 | if not type(a) in [list, tuple] or not len(a) == 3 or not type(a[0]) in [int, float] or not type(a[1] in [int, float]): 44 | msg = "An element in 'config.actions' is not of expected format and/or type '(initial_offset, interval, document)'." 
45 | self.log.error(msg) 46 | #raise ValueError(msg) # Maye not critical enough to raise exception, just skip the wrong one. 47 | self._actions.append([now + a[0], a[1], a[2]]) 48 | 49 | def on_tick(self): 50 | now = time.time() 51 | for a in self._actions: 52 | if now >= a[0]: 53 | # Next time for this one is... 54 | a[0] = now + a[1] 55 | # Then send the action/document 56 | self._output.send(a[2]) 57 | -------------------------------------------------------------------------------- /eslib/service/DummyService.py: -------------------------------------------------------------------------------- 1 | from . import HttpService, PipelineService 2 | from ..procs import Timer, Transformer 3 | from .. import esdoc 4 | import time 5 | 6 | class DummyService(HttpService, PipelineService): 7 | """ 8 | Common static config: 9 | name 10 | manager_endpoint 11 | management_endpoint 12 | 13 | Static config: 14 | timer_frequency = 3 15 | lifespan = 0 # 0=infinite 16 | 17 | Runtime config: 18 | dummy.variable 19 | """ 20 | 21 | VARIABLE_CONFIG_PATH = "dummy.variable" 22 | 23 | metadata_keys = [VARIABLE_CONFIG_PATH] 24 | 25 | def __init__(self, **kwargs): 26 | super(DummyService, self).__init__(**kwargs) 27 | 28 | self.config.set_default( 29 | timer_frequency = 3, 30 | lifespan = 0 31 | ) 32 | 33 | self._logger = None 34 | self._variable = "initial" 35 | 36 | def on_configure(self, credentials, config, global_config): 37 | self.config.set( 38 | manager_endpoint = global_config.get("manager_host"), 39 | management_endpoint = config.get("management_endpoint"), 40 | 41 | timer_frequency = config["frequency"], 42 | lifespan = config["lifespan"] 43 | ) 44 | 45 | def on_setup(self): 46 | # Set up procs 47 | timer = Timer( 48 | service = self, 49 | name = "timer", 50 | actions = [(self.config.timer_frequency, self.config.timer_frequency, "ping")]) 51 | self._logger = Transformer( 52 | service = self, 53 | name = "logger", 54 | func = self._logfunc) 55 | 56 | procs = [timer, self._logger] 57 | 58 | # Link them 59 | self.link(*procs) 60 | 61 | # Register them for debug dumping 62 | self.register_procs(*procs) 63 | 64 | return True 65 | 66 | #region Service overrides 67 | 68 | def on_metadata(self, metadata): 69 | print "***METADATA", metadata 70 | self._variable = self.get_meta_section(metadata, self.VARIABLE_CONFIG_PATH) 71 | print "VAR=", self._variable 72 | self.head.restart(start=False) 73 | return True 74 | 75 | #endregion Service overrides 76 | 77 | def _logfunc(self, proc, doc): 78 | if self.config.lifespan and time.time() - self.stat_processing_started > self.config.lifespan: 79 | self.log.status("Life has come to an end; stopping.") 80 | self.processing_stop() 81 | return 82 | self.log.debug("DEBUG message.") 83 | self.log.warning("Service log entry, variable='%s'" % self._variable) 84 | self._logger.log.warning("Processor log entry, variable='%s'" % self._variable) 85 | self._logger.doclog.warning("Document log entry, variable='%s'" % self._variable) 86 | yield doc 87 | -------------------------------------------------------------------------------- /eslib/esdoc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.esdoc 5 | ~~~~~~~~~~ 6 | 7 | Module containing operations on "Elasticsearch type" documents (really just a dict). 
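
A minimal sketch of the helpers below:

    doc  = createdoc({"user": {"name": "htb"}}, index="myindex", doctype="tweet", id="1")
    getfield(doc, "_source.user.name")                       # -> "htb"
    doc2 = shallowputfield(doc, "_source.user.lang", "en")   # clones only the path it touches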
8 | """ 9 | 10 | 11 | __all__ = ("tojson", "createdoc", "getfield", "putfield") 12 | 13 | 14 | from datetime import datetime 15 | from .time import date2iso 16 | import json 17 | 18 | def _json_serializer_isodate(obj): 19 | """Default JSON serializer.""" 20 | s = None 21 | if isinstance(obj, datetime): 22 | if obj.utcoffset() is not None: 23 | obj = obj - obj.utcoffset() 24 | obj = obj.replace(tzinfo=None) 25 | s = date2iso(obj) 26 | return s 27 | 28 | def tojson(doc): 29 | return json.dumps(doc, default=_json_serializer_isodate) 30 | 31 | 32 | def getfield(doc, fieldpath, default=None): 33 | "Get value for 'fieldpath' if it exits and is not None, otherwise return the default." 34 | if doc is None or fieldpath is None: 35 | return default 36 | if fieldpath == "": 37 | return doc 38 | fp = fieldpath.split(".") 39 | d = doc 40 | for f in fp[:-1]: 41 | if not d or not f in d or not isinstance(d[f], dict): 42 | return default 43 | d = d[f] 44 | if d is None: 45 | return default 46 | v = d.get(fp[-1]) 47 | return default if v is None else v 48 | 49 | 50 | def putfield(doc, fieldpath, value): 51 | "Add or update 'fieldpath' with 'value'." 52 | if doc is None or fieldpath is None: 53 | return 54 | fp = fieldpath.split(".") 55 | d = doc 56 | for i, f in enumerate(fp[:-1]): 57 | if f in d: 58 | d = d[f] 59 | if not isinstance(d, dict): 60 | raise AttributeError("Node at '%s' is not a dict." % ".".join(fp[:i+1])) 61 | else: 62 | dd = {} 63 | d[f] = dd 64 | d = dd 65 | d[fp[-1]] = value # OBS: This also overwrites a node if this is was a node 66 | 67 | def shallowputfield(doc, fieldpath, value): 68 | "Clone as little as needed of 'doc' and add the field from 'fieldpath'. Returns the new cloned doc" 69 | if not doc or not fieldpath: return 70 | fp = fieldpath.split(".") 71 | doc_clone = doc.copy() # Shallow clone 72 | d = doc 73 | d_clone = doc_clone 74 | for i, f in enumerate(fp[:-1]): 75 | if f in d: 76 | d = d[f] 77 | if not type(d) is dict: 78 | raise Exception("Node at '%s' is not a dict." % ".".join(fp[:i+1])) 79 | d_clone[f] = d.copy() # Create shallow clone of the next level 80 | d_clone = d_clone[f] 81 | else: 82 | dd = {} # Create a new node 83 | d_clone.update({f:dd}) 84 | d_clone = dd 85 | d_clone[fp[-1]] = value # OBS: This also overwrites a node if this is was a node 86 | 87 | return doc_clone 88 | 89 | def createdoc(source, index=None, doctype=None, id=None): 90 | doc = {"_source": source} 91 | if index: doc['_index'] = index 92 | if type : doc['_type' ] = doctype 93 | if id : doc['_id' ] = id 94 | return doc 95 | 96 | 97 | -------------------------------------------------------------------------------- /eslib/procs/TwitterUserGetter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mats' 2 | 3 | from ..Generator import Generator 4 | from .twitter import Twitter 5 | 6 | import time 7 | 8 | 9 | class TwitterUserGetter(Generator): 10 | """ 11 | Receives uids on its connector and sends twitter user objects 12 | to its socket. 13 | 14 | # TODO: Document argument 'twitter' and how to configure this. 15 | 16 | Connectors: 17 | ids (str) : Incoming IDs to get data for. 18 | Sockets: 19 | user (graph-user) : Twitter users. 20 | 21 | Config: 22 | batchsize = 100 : How many users to gather up before making a call to Twitter. 23 | batchtime = 7.0 : How many seconds to wait before we send a batch if it is not full. 
24 | """ 25 | 26 | def __init__(self, twitter=None, **kwargs): 27 | super(TwitterUserGetter, self).__init__(**kwargs) 28 | self.create_connector(self._incoming, "ids", "str") 29 | self._output = self.create_socket("user", "graph-user", "Twitter users.") 30 | self._queue = [] 31 | self.last_call = time.time() 32 | self.twitter = twitter 33 | self.config.set_default( 34 | batchsize=100, 35 | batchtime=7 36 | ) 37 | 38 | def on_open(self): 39 | """ Instantiate twitter class. """ 40 | if self.twitter is None: 41 | self.twitter = Twitter( 42 | consumer_key=self.config.consumer_key, 43 | consumer_secret=self.config.consumer_secret, 44 | access_token=self.config.access_token, 45 | access_token_secret=self.config.access_token_secret 46 | ) 47 | 48 | def _incoming(self, doc): 49 | """ 50 | Put str(doc) into the queue. 51 | 52 | :param doc: the id of a twitter user 53 | """ 54 | try: 55 | id_ = int(doc) 56 | except ValueError: 57 | self.doclog.exception("Could not parse id: %s to int" % doc) 58 | else: 59 | self._queue.append(str(id_)) 60 | 61 | def on_tick(self): 62 | """ 63 | Commit items in queue if queue exceeds batchsize or it's been long 64 | since last commit. 65 | """ 66 | if ((len(self._queue) >= self.config.batchsize) or 67 | (time.time() - self.last_call > self.config.batchtime and self._queue)): 68 | self.get() 69 | 70 | def on_shutdown(self): 71 | """ Get rid of rest of queue before shutting down. """ 72 | self.log.info("Processing remaining items in queue.") 73 | while self._queue: 74 | self.get() 75 | 76 | def get(self): 77 | """ 78 | Gets users from twitter and outputs to a socket. 79 | """ 80 | num = len(self._queue) 81 | self.log.debug("Getting %i users from Twitter" % num) 82 | resp = self.twitter.get_users(uids=self._queue[:num]) 83 | self._queue = self._queue[num:] 84 | for raw_user in resp: 85 | try: 86 | user = self.twitter.raw_to_dict(raw_user) 87 | except TypeError as type_error: 88 | self.log.exception(type_error) 89 | else: 90 | self._output.send(user) 91 | -------------------------------------------------------------------------------- /eslib/service/PipelineService.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from .Service import Service 4 | import time 5 | 6 | class PipelineService(Service): 7 | def __init__(self, **kwargs): 8 | super(PipelineService, self).__init__(**kwargs) 9 | 10 | self.head = None 11 | self.tail = None 12 | 13 | def _log_finished(self, proc): 14 | self.log.status("Processing finished.") 15 | self._processing = False # This will shortcut further evaluation of whether we are processing 16 | self.stat_processing_ended = time.time() 17 | 18 | def _log_aborted(self, proc): 19 | self.log.status("Processing finished after abort.") 20 | self._processing_aborted = True # This will shortcut further evaluation of whether we are aborted 21 | self.stat_processing_ended = time.time() 22 | 23 | def link(self, *processors): 24 | "Link processors together and assign head and tail." 25 | prev = None 26 | for proc in processors: 27 | if prev: 28 | proc.subscribe(prev) 29 | prev = proc 30 | self.head = processors[0] 31 | self.tail = processors[-1] 32 | 33 | #region Service overrides 34 | 35 | def is_processing(self): 36 | "Evaluate whether processing is in progress." 37 | return self.tail.running 38 | 39 | def is_aborted(self): 40 | "Evaluate whether processing is in progress." 
41 | return self.head.aborted 42 | 43 | def is_suspended(self): 44 | "Evaluate whether processing is suspended." 45 | return self.head.suspended 46 | 47 | def on_processing_start(self): 48 | if not self._log_finished in self.tail.event_stopped: 49 | self.tail.event_stopped.append(self._log_finished) 50 | if not self._log_aborted in self.tail.event_aborted: 51 | self.tail.event_aborted.append(self._log_aborted) 52 | 53 | self.head.start() 54 | return True 55 | 56 | def on_restart(self): 57 | # if not self.head.running: 58 | # self.head.start() 59 | # else: 60 | # return True 61 | return True # Well, not really, but still.. it didn't fail either. 62 | 63 | def on_processing_stop(self): 64 | "This method should block until the process is fully stopped." 65 | self.head.stop() 66 | self.tail.wait() 67 | return True 68 | 69 | def on_processing_abort(self): 70 | self.head.abort() 71 | self.tail.wait() 72 | return True 73 | 74 | def on_processing_suspend(self): 75 | self.head.suspend() 76 | return True 77 | 78 | def on_processing_resume(self): 79 | self.head.resume() 80 | return True 81 | 82 | # TODO 83 | def on_update(self, config): 84 | # Auto-start on update 85 | if not self.head.running: 86 | self.head.start() 87 | else: 88 | return True 89 | 90 | def on_count(self): 91 | # It is probably better to count what has been handled by the tail, than what the head received or generaterd, so: 92 | return self.tail.count 93 | 94 | def on_count_total(self): 95 | return self.head.total 96 | 97 | #endregion Service overrides 98 | -------------------------------------------------------------------------------- /eslib/procs/KafkaWriter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | # NOTE: Using sync producer. Should change to async if performance sucks. 4 | 5 | from ..Processor import Processor 6 | from ..esdoc import tojson 7 | from pykafka import KafkaClient 8 | import zlib 9 | 10 | 11 | class KafkaWriter(Processor): 12 | """ 13 | Write data to Kafka. 14 | Writes data with type 'str', 'unicode', 'int', or 'float'. Lists and dicts are written as 'json'. 15 | Other types are cast to 'str'. 16 | The 'type' registered with the metadata is then either 'str', 'unicode', 'int', 'float' or 'json'. 17 | 18 | Connectors: 19 | input (*) : Document to write to configured Kafka topic. 20 | 21 | Config: 22 | hosts = ["localhost:9292"] : List of Kafka hosts. 23 | topic = "default_topic" : 24 | compression = False : Whether to compress the data sent to Kafka. 25 | """ 26 | 27 | def __init__(self, **kwargs): 28 | super(KafkaWriter, self).__init__(**kwargs) 29 | 30 | self._connector = self.create_connector(self._incoming, "input", None, "Document to write to configured RabbitMQ.") 31 | 32 | self.config.set_default( 33 | hosts = ["localhost:9092"], 34 | topic = "default_topic", 35 | compression = False 36 | ) 37 | 38 | self._client = None 39 | self._producer = None 40 | 41 | def on_open(self): 42 | self.count = 0 43 | self._client = KafkaClient(",".join(self.config.hosts)) 44 | topic = self._client.topics[self.config.topic] 45 | self._producer = topic.get_sync_producer(min_queued_messages=1) 46 | self.log.info("Connected to Kafka topic '%s'." % self.config.topic) 47 | 48 | def on_close(self): 49 | if self._client: 50 | self._producer.stop() 51 | self.log.info("Kafka producer stopped.") 52 | # Can't find any way to close the connection or ask it to release resources, so I try a 'del'. 
53 | #del self._client 54 | self._client = None 55 | self.log.debug("Connection to Kafka deleted.") 56 | 57 | def _incoming(self, document): 58 | if document == None: 59 | return 60 | 61 | data = document 62 | msg_type = None 63 | if isinstance(document, basestring): 64 | msg_type = type(document).__name__ 65 | elif isinstance(document, (int, long, float)): 66 | msg_type = type(document).__name__ 67 | elif isinstance(document, (list, dict)): 68 | data = document 69 | msg_type = "json" 70 | else: 71 | data = str(document) 72 | msg_type = "str" #type(document).__name__ 73 | self.doclog.warning("Writing document of unsupported type '%s' as type 'str'." % type(document).__name__) 74 | 75 | kafka_data = None 76 | try: 77 | kafka_data = tojson({"type": msg_type, "data": data}) 78 | except TypeError as e: 79 | self.doclog.error("JSON serialization failed: %s" % e.message) 80 | return 81 | 82 | if self.config.compression: 83 | kafka_data = zlib.compress(kafka_data) 84 | 85 | self._producer.produce(kafka_data) 86 | self.count += 1 87 | -------------------------------------------------------------------------------- /test/test_procs/test_csv_converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from eslib.procs import FileReader, FileWriter, CsvConverter 4 | 5 | 6 | res = [] 7 | 8 | class TestCsvConverter(unittest.TestCase): 9 | 10 | def _setup(self, filename): 11 | 12 | r = FileReader(raw_lines=True) 13 | r.config.filename = filename 14 | 15 | c = CsvConverter() 16 | 17 | c.config.index = "myindex" 18 | c.config.type_field = "initials" 19 | c.config.id_field = "id" 20 | 21 | w = FileWriter() # Write to stdout 22 | 23 | r.attach(c.attach(w)) 24 | 25 | output = [] 26 | c.add_callback(lambda proc, doc: output.append(doc)) 27 | 28 | return (r, c, w, output) 29 | 30 | def _verify(self, output): 31 | self.assertTrue(len(output) == 3, "Expected 3 results.") 32 | self.assertTrue(output[1]["_type"] == "eee") 33 | self.assertTrue(output[1]["_index"] == "myindex") 34 | self.assertTrue(output[1]["_id"] == "2") 35 | self.assertTrue(len(output[1]["_source"]) == 2) 36 | 37 | 38 | def test_read(self): 39 | r = FileReader(raw_lines=True) 40 | self_dir, _ = os.path.split(__file__) 41 | r.config.filename = os.path.join(self_dir, "data/csv_with_header.csv") 42 | w = FileWriter() # Write to stdout 43 | w.subscribe(r) 44 | r.start() 45 | 46 | def test_first_line_is_columns(self): 47 | self_dir, _ = os.path.split(__file__) 48 | r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_with_header.csv")) 49 | r.start() 50 | w.wait() 51 | 52 | self._verify(output) 53 | 54 | def test_no_header_line(self): 55 | self_dir, _ = os.path.split(__file__) 56 | r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_no_header.csv")) 57 | c.config.columns = ["id", "name", "last name", "initials"] 58 | r.start() 59 | w.wait() 60 | 61 | self._verify(output) 62 | 63 | def test_skip_header_line(self): 64 | self_dir, _ = os.path.split(__file__) 65 | r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_with_header.csv")) 66 | c.config.columns = ["id", "name", "last name", "initials"] 67 | c.config.skip_first_line = True 68 | r.start() 69 | w.wait() 70 | 71 | self._verify(output) 72 | 73 | # def test_fewer_fields(self): 74 | # self_dir, _ = os.path.split(__file__) 75 | # 76 | # r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_no_header.csv")) 77 | # c.config.id_field = "_id" 78 | # c.config.type_field = "_type" 79 | # 
c.config.columns = ["_id", None, "last name", "initials"] 80 | # r.start() 81 | # w.wait() 82 | # 83 | # self.assertTrue(len(output) == 3, "Expected 3 results.") 84 | # self.assertTrue(output[1]["_type"] == None) 85 | # self.assertTrue(output[1]["_index"] == "myindex") 86 | # self.assertTrue(output[1]["_id"] == "2") 87 | # keys = output[1]["_source"].keys() 88 | # self.assertTrue(len(keys) == 2) 89 | # self.assertTrue("last name" in keys and "initials" in keys, "Expected 'last name' and 'initials' as result fields.") 90 | 91 | def main(): 92 | unittest.main() 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /eslib/service/RemotingService.py: -------------------------------------------------------------------------------- 1 | from . import HttpService 2 | from .. import Processor 3 | import Queue 4 | 5 | 6 | # NOTE: THIS IS YET EXPERIMENTAL (htb, 2016-03-21) 7 | 8 | 9 | class RemotingService(HttpService): 10 | 11 | def __init__(self, **kwargs): 12 | super(RemotingService, self).__init__(**kwargs) 13 | 14 | # Add routes to functions 15 | self.add_route(self._mgmt_fetch, "GET" , "/fetch", ["?socket", "?limit"]) 16 | self.add_route(self._mgmt_put , "PUT|POST", "/put" , ["?connector"]) 17 | 18 | self._queues = {} 19 | self._put_proc = None 20 | 21 | # NOTE: In on_setup, where you create the fetch proc, set config var congestion_limit 22 | 23 | def setup_put(self, proc): 24 | self.log.info("Registering put Processor '%s'." % proc.name) 25 | self._put_proc = proc 26 | 27 | def setup_fetch(self, proc, socket_names=None): 28 | self.log.info("Creating fetch buffers for Processor '%s'." % proc.name) 29 | if isinstance(socket_names, basestring): 30 | socket_names = [socket_names] 31 | for socket_name in proc.sockets: 32 | if not socket_names or socket_name in socket_names: 33 | self._register_callback(proc, socket_name) 34 | 35 | def _register_callback(self, proc, socket_name): 36 | def callback(proc, doc): 37 | queue = self._queues[socket_name] 38 | queue.put(doc) 39 | pass 40 | self._queues[socket_name] = Queue.Queue() 41 | proc.add_callback(callback, socket_name) 42 | 43 | def _put(self, doc, connector_name): 44 | if self._put_proc: 45 | self._put_proc.put(doc, connector_name) 46 | 47 | def _fetch(self, socket_name=None, limit=0): 48 | docs = [] 49 | if socket_name and socket_name in self._queues: 50 | queue = self._queues[socket_name] 51 | elif len(self._queues) > 0: 52 | # TODO: Get default socket instead, or error 53 | queue = self._queues.keys()[0] 54 | else: 55 | return ([], -1) # TODO: Or rather an error 56 | 57 | ##print "LIMIT=", limit 58 | while not queue.empty() and (limit == 0 or len(docs) < limit): 59 | ##print "LEN(DOCS)=%d" % len(docs) 60 | doc = queue.get_nowait() 61 | queue.task_done() 62 | if doc: 63 | docs.append(doc) 64 | return (docs, queue.qsize()) 65 | 66 | #region Extra service interface methods 67 | 68 | def _mgmt_fetch(self, request_handler, payload, **kwargs): 69 | socket_name = kwargs.get("socket") 70 | limit = kwargs.get("limit") or 0 # 0 = unlimited 71 | limit = int(limit) 72 | ##print "=== KWARGS:", kwargs 73 | ##print "=== LIMIT:", limit 74 | (docs, qsize) = self._fetch(socket_name, limit) 75 | return {"documents": docs, "status": self.status, "queued": qsize} 76 | 77 | def _mgmt_put(self, request_handler, payload, **kwargs): 78 | connector_name = kwargs.get("connector") 79 | doc = payload 80 | self._put(doc, connector_name) 81 | 82 | #endregion Extra service interface methods 83 | 
84 | def on_stats(self, stats): 85 | super(RemotingService, self).on_stats(stats) 86 | stats["queued"] = {k:q.qsize() for k,q in self._queues.iteritems()} 87 | -------------------------------------------------------------------------------- /eslib/procs/SmtpMailer.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | import smtplib, getpass, platform 5 | from email.mime.text import MIMEText 6 | from eslib.esdoc import tojson 7 | 8 | 9 | class SmtpMailer(Processor): 10 | """ 11 | Send incoming document as content to recipients. 12 | Sends mail outgoing on port 25 unless a username/password is specified, in which case 13 | it uses TLS on port 587. 14 | Sender defaults to current executing user if not specified. 15 | 16 | Connectors: 17 | input (*) : Incoming documents to send. Non-string documents are converted to JSON. 18 | 19 | Config: 20 | smtp_server = "localhost" 21 | username = None 22 | password = None 23 | sender = None 24 | from_name = None : Name to be added to sender into the From field, becomes: '"from_name" ' 25 | recipients = [] : List of recipient email addresses (no mail or brackets or other fuzz). 26 | subject = None 27 | """ 28 | def __init__(self, **kwargs): 29 | super(SmtpMailer, self).__init__(**kwargs) 30 | 31 | self.create_connector(self._incoming, "input", "str", "Email content string.") 32 | 33 | self.config.set_default( 34 | smtp_server = "localhost", 35 | username = None, 36 | password = None, 37 | sender = None, 38 | from_name = None, 39 | recipients = None, 40 | subject = None, 41 | ) 42 | 43 | def on_open(self): 44 | self.count = 0 45 | 46 | def _incoming(self, doc): 47 | if not doc or not self.config.recipients or not self.config.sender: 48 | return 49 | 50 | # Convert non-string documents to JSON 51 | content = doc 52 | if not isinstance(doc, basestring): 53 | content = tojson(doc) 54 | 55 | try: 56 | self._mail_text( 57 | self.config.smtp_server, 58 | self.config.recipients, 59 | self.config.subject, 60 | self.config.sender, 61 | self.config.from_name, 62 | content, 63 | self.config.username, 64 | self.config.password) 65 | self.count += 1 66 | except Exception as e: 67 | self.log.exception("Failed to send email.") 68 | 69 | 70 | def _mail_text(self, smtp_server, recipients, subject, sender=None, from_name=None, content=None, username=None, password=None): 71 | msg = MIMEText(content, "plain", "utf-8") 72 | 73 | if not sender: 74 | sender = "@".join((getpass.getuser(), platform.node())) 75 | 76 | message_from = sender if not from_name else '"%s" <%s>' % (from_name, sender) 77 | 78 | msg['Subject'] = subject 79 | msg['From'] = message_from 80 | msg['To'] = ", ".join(recipients) 81 | 82 | s = None 83 | if username or password: 84 | s = smtplib.SMTP(smtp_server, 587) 85 | s.ehlo() 86 | s.starttls() 87 | s.ehlo() 88 | s.login(username, password) 89 | else: 90 | s = smtplib.SMTP(smtp_server or "localhost") 91 | 92 | s.sendmail(sender, recipients, msg.as_string()) 93 | s.quit() 94 | 95 | -------------------------------------------------------------------------------- /eslib/time.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.time 5 | ~~~~~~~~~~ 6 | 7 | Module containing time/date helpers. 
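
A couple of illustrative calls:

    utcdate("2014-10-14T14:26:30+01:00")     # -> datetime(2014, 10, 14, 13, 26, 30), naive UTC
    date_dict(utcdate("2014-10-14T14:26:30+01:00"))["week"]   # -> 42
    date2iso(datetime.datetime(2014, 3, 10, 23, 32, 47))      # -> "2014-03-10T23:32:47Z"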
8 | """ 9 | 10 | 11 | __all__ = ("duration_string", "date2iso", "ago2date") 12 | 13 | 14 | import re, datetime, dateutil, dateutil.parser 15 | 16 | 17 | def duration_string(timediff): 18 | """ 19 | :type timediff: datetime.timedelta 20 | :rtype str: 21 | """ 22 | secs = timediff.seconds 23 | days = timediff.days 24 | s = secs % 60 25 | m = (secs / 60) % 60 26 | h = (secs / 60 / 60) % 24 27 | return "%d:%02d:%02d" % (days*24+h, m, s) 28 | 29 | 30 | def date2iso(dateobj): 31 | """ 32 | Convert datetime object to ISO 8601 string with UTC, e.g. '2014-03-10T23:32:47Z' 33 | :type dateobj: datetime.datetime 34 | :rtype str 35 | """ 36 | return dateobj.strftime("%Y-%m-%dT%H:%M:%SZ") # Screw the %.f ... 37 | 38 | def iso2date(isostr): 39 | """ 40 | Convert ISO 8601 string in UTC, e.g. '2014-03-10T23.32:47Z' to datetime object. 41 | :type isostr: datetime.datetime 42 | :rtype datetime.datetime 43 | """ 44 | if isostr is None: 45 | return None 46 | if "." in isostr: 47 | return datetime.datetime.strptime(isostr, "%Y-%m-%dT%H:%M:%S.%fZ") 48 | else: 49 | return datetime.datetime.strptime(isostr, "%Y-%m-%dT%H:%M:%SZ") 50 | 51 | def utcdate(obj): 52 | "Convert string or datetime object to a datetime object in UTC." 53 | dt = None 54 | if type(obj) is datetime.datetime: 55 | dt = obj 56 | try: 57 | dt = dateutil.parser.parse(obj) 58 | except: 59 | pass 60 | if dt: 61 | # Convert to UTC time and get rid of the offset 62 | utcoffs = dt.utcoffset() 63 | if utcoffs: 64 | dt = dt - utcoffs 65 | dt = dt.replace(tzinfo=None) #dateutil.tz.tzutc()) 66 | return dt 67 | 68 | def date_dict(date): 69 | return { 70 | "year": date.year, "month": date.month, "day": date.day, 71 | "hour": date.hour, "minute": date.minute, "second": date.second, 72 | "weekday": date.isoweekday(), "week": date.isocalendar()[1] 73 | } 74 | 75 | 76 | _agoRegex = re.compile("^(?P\d+)\s*(?P\w+)( ago)?$") 77 | 78 | def ago2date(ago, from_date_utc=None): 79 | """ 80 | Convert 'ago' style time specification string to a datetime object. 81 | Units are s=second, m=minute, h=hour, d=day, w=week, M=month, y=year 82 | :param str ago : "Time ago" as a string. 83 | :param datetime.datetime from_date_utc : Relative time to use instead of 'now'. In UTC. 84 | :rtype datetime.timedelta : Time difference. 
85 | """ 86 | m = _agoRegex.match(ago) 87 | if not m: 88 | raise SyntaxError("Illegal 'ago' string: %s" % ago) 89 | number = int(m.group("number")) 90 | unit = m.group("unit") 91 | delta = None 92 | if unit == "s" or unit.startswith("sec") : delta = datetime.timedelta(seconds= number) 93 | elif unit == "m" or unit.startswith("min") : delta = datetime.timedelta(minutes= number) 94 | elif unit == "h" or unit.startswith("hour") : delta = datetime.timedelta(hours= number) 95 | elif unit == "d" or unit.startswith("day") : delta = datetime.timedelta(days= number) 96 | elif unit == "w" or unit.startswith("week") : delta = datetime.timedelta(weeks= number) 97 | elif unit == "M" or unit.startswith("month"): delta = datetime.timedelta(days= number*30) 98 | elif unit == "y" or unit.startswith("year") : delta = datetime.timedelta(days= number*365) 99 | else: 100 | raise SyntaxError("Illegal unit for 'ago' string in: %s" % ago) 101 | return (from_date_utc or datetime.datetime.utcnow()) - delta; 102 | -------------------------------------------------------------------------------- /PROTOCOLS.md: -------------------------------------------------------------------------------- 1 | # Protocols 2 | 3 | This document describes the common protocols for document exchange between terminals (connectors and sockets). 4 | 5 | The name of the protocol is meant as a hint, although keeping track of a common set of protocols would be good. 6 | 7 | ## esdoc 8 | 9 | ### esdoc (general) 10 | 11 | Used by 12 | 13 | ElasticsearchReader.output (socket) 14 | ElasticsearchWriter.input (connector) 15 | ElasticsearchWriter.output (socket) 16 | CsvConverter.output (socket) 17 | HtmlRemover.input (connector) 18 | HtmlRemover.output (soclet) 19 | PatternRemover.input (connector) 20 | PatternRemover.output (socket) 21 | 22 | Format 23 | 24 | _index str 25 | _type str 26 | _id str 27 | _version int 28 | _timestamp str 29 | _source dict # Dict of { field : value } 30 | 31 | 32 | All fields are optional, depending on the case 33 | 34 | ### esdoc.webpage 35 | 36 | Used by 37 | 38 | WebGetter.output (socket) 39 | 40 | Format 41 | 42 | _id str # Using the URL as ID 43 | _type str # "webpage" 44 | _timestamp datetime # When the content was fetched 45 | _source dict of ... 46 | domain str 47 | requested_by list # Of of dicts of format [ what : [ who, ...] }, ... ] 48 | content str 49 | content_type str 50 | encoding str 51 | date datetime # Web page publishing date as reported by HTTP header 52 | 53 | ### esdoc.4chan 54 | 55 | Used by 56 | 57 | FourChanMonitor.esdoc 58 | 59 | Format 60 | 61 | _id int # Post number at 4chan 62 | _type str # "4chan" 63 | _source 64 | id int # Post number at 4chan 65 | board str # Board id 66 | thread int # Thread id 67 | timestamp int # Time of posting 68 | author str # Name of author, most commonly "Anonymous" 69 | comment str # Text comment 70 | filename str # Filename, with extension 71 | response_to int # Post number this post is a response to. 0 if original posting (i.e. not a response) 72 | 73 | 74 | ## urlrequest 75 | 76 | Used by 77 | 78 | WebGetter.input (connector) 79 | 80 | Format 81 | 82 | url str # 83 | what str # Source requesting the url, e.g. "twitter_mon" 84 | who str # Who requested it, e.g. some user id from the source 85 | 86 | ## csv 87 | 88 | Used by 89 | 90 | CsvConverter.input (connector) 91 | 92 | Format 93 | 94 | ```csv 95 | "field","field,"field","..." 
96 | ``` 97 | 98 | 99 | ## graph-edge 100 | The graph-edge protocol is simply a dictionary with three mandatory keys, 101 | that together represents an edge. 102 | 103 | Used by 104 | Neo4jWriter.edge (connector) 105 | 106 | Format 107 | 108 | from str # The property-id of the source node 109 | type str # The type of the edge. ("follows", "author", "mention", "quote") 110 | to str # The property-id of the receiving node 111 | 112 | Note that all fields are mandatory. 113 | 114 | ## graph-user 115 | 116 | The graph-user protocol is a dictionary holding properties. 117 | 118 | Used by 119 | 120 | Neo4jWriter.user (connector) 121 | TwitterUserGetter.user (socket) 122 | 123 | Format 124 | 125 | id str 126 | location str #Optional 127 | description str #Optional 128 | screen_name str #Optional 129 | lang str #Optional 130 | name str #Optional 131 | created_at date.isoformat()#Optional 132 | -------------------------------------------------------------------------------- /bin/es-read: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from eslib.procs import ElasticsearchReader, FileWriter 6 | import eslib.prog 7 | import eslib.time 8 | import argparse, sys, time 9 | 10 | 11 | def main(): 12 | help_i = "Which index to return documents from." 13 | help_t = "Which type of document to return." 14 | help_l = "The maximum number of documents to return. Will by default return all documents." 15 | help_s = "Returns all documents added after SINCE. Specified in the 'ago' format (1d, 3w, 1y, etc)." 16 | help_b = "Returns all documents added after BEFORE. Specified in the 'ago' format (1d, 3w, 1y, etc)." 17 | help_tf = "The field that contains the relavant date information. Default 'timefield' to slice on is '_timestamp'." 18 | help_fi = "Format for filter is, by example: 'category:politicians,party:democrats'." 
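# Illustrative: the --filter string is split below into a dict of field -> value,
# e.g. "category:politicians,party:democrats" -> {"category": "politicians", "party": "democrats"},
# which is then passed to ElasticsearchReader as 'filters'.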
19 | 20 | parser = argparse.ArgumentParser(usage="\n %(prog)s -i index [-t type] [-f field] [-l limit] [more options]") 21 | parser._actions[0].help = argparse.SUPPRESS 22 | parser.add_argument("-i", "--index" , help=help_i, required=True) 23 | parser.add_argument("-t", "--type" , help=help_t) 24 | parser.add_argument("-l", "--limit" , help=help_l, default=0, type=int) 25 | parser.add_argument("-s", "--since" , help=help_s) 26 | parser.add_argument("-b", "--before" , help=help_b) 27 | parser.add_argument( "--host" , help="Elasticsearch host, format 'host:port' or just 'host'.", default=None) 28 | parser.add_argument( "--timefield", help=help_tf, default="_timestamp") 29 | parser.add_argument( "--filter" , help=help_fi) 30 | parser.add_argument("-v", "--verbose" , action="store_true") 31 | #parser.add_argument( "--debug" , action="store_true") 32 | parser.add_argument( "--name" , help="Process name.", default=None) 33 | 34 | if len(sys.argv) == 1: 35 | parser.print_usage() 36 | sys.exit(0) 37 | 38 | args = parser.parse_args() 39 | 40 | # Time validation conversion and checks 41 | before = None 42 | since = None 43 | if args.before: 44 | try: 45 | before = eslib.time.ago2date(args.before) 46 | except: 47 | print >> sys.stderr, "Illegal 'ago' time format to 'before' argument, '%s'" % args.before 48 | sys.exit(-1) 49 | if args.since: 50 | try: 51 | since = eslib.time.ago2date(args.since) 52 | except: 53 | print >> sys.stderr, "Illegal 'ago' time format to 'since' argument, '%s'" % args.since 54 | sys.exit(-1) 55 | 56 | # Parse filter string 57 | filters = {} 58 | if args.filter: 59 | parts = [{part[0]:part[1]} for part in [filter.split(":") for filter in args.filter.split(",")]] 60 | for part in parts: 61 | filters.update(part) 62 | 63 | # Set up and run this processor 64 | r = ElasticsearchReader( 65 | name = args.name or eslib.prog.progname(), 66 | hosts = [args.host] if args.host else [], 67 | index = args.index, 68 | doctype = args.type, 69 | limit = args.limit, 70 | filters = filters, 71 | since = since, 72 | before = before, 73 | timefield = args.timefield 74 | ) 75 | 76 | # if args.debug: r.debuglevel = 0 77 | 78 | verbose_tick_delay = 3.0 79 | 80 | w = FileWriter() 81 | w.subscribe(r) 82 | r.start() 83 | if args.verbose: 84 | # Verbose wait loop 85 | last_tick = time.time() 86 | while r.running: 87 | time.sleep(0.1) 88 | now = time.time() 89 | if (now - last_tick > verbose_tick_delay) or not r.running: 90 | print >> sys.stderr, "Read %d/%d" % (r.count, r.total) 91 | last_tick = now 92 | print >> sys.stderr, "Reading finished; waiting for writer to finish." 93 | w.wait() 94 | 95 | 96 | if __name__ == "__main__": main() 97 | -------------------------------------------------------------------------------- /examples/remoting/DummyRemotingService.py: -------------------------------------------------------------------------------- 1 | # NOTE: 2 | # 3 | # REMOTING SERVICE IS YET EXPERIMENTAL (as of when this was written) 4 | # 5 | # This is an example of how to create a service based on the RemotingService. 6 | # See also RemotingClient.py for example of how to call it remotely. 
7 | # 8 | # SETUP: 9 | # 10 | # Copy the file to your service "source" directory, and add to the package __init__.py file: 11 | # 12 | # from .DummyRemotingService import DummyRemotingService 13 | # __all__ = ( 14 | # "DummyRemotingService" 15 | # ) 16 | # 17 | # In the service "config" directory, configure it like 18 | # 19 | # remoting: 20 | # type : "DummyRemotingService" 21 | # frequency : 3 22 | # lifespan : 120 23 | 24 | from eslib.service import RemotingService, PipelineService 25 | from eslib.procs import Timer 26 | from eslib import Processor 27 | 28 | 29 | # COMMENT to the below connectors and sockets: 30 | # The "command" socket and connector are set to default, so that we can easily create a 31 | # service based on the pipeline service. Then all pipleline processors are linked so that 32 | # start/stop events etc are easily propagated the way we want. The downside to this 33 | # approach is that the socket and connector we want to use from the client will have to 34 | # be names, as they are not the default ones. 35 | # (Here, by client example: client.fetch("output"), and client.put("input").) 36 | 37 | 38 | class FetchProc(Processor): 39 | def __init__(self, **kwargs): 40 | super(FetchProc, self).__init__(**kwargs) 41 | self.create_connector(self._incoming, "input") 42 | self.command = self.create_socket("command", is_default=True) # To link easily as pipeline 43 | self.output = self.create_socket("output") 44 | self.num = 0 45 | 46 | def on_open(self): 47 | self.num = 0 48 | 49 | def _incoming(self, doc): 50 | # For each incoming tick, generate one output doc: 51 | self.num += 1 52 | print "SEDNING TO QUEUE:", self.num 53 | self.output.send(self.num) 54 | 55 | class PutProc(Processor): 56 | def __init__(self, **kwargs): 57 | super(PutProc, self).__init__(**kwargs) 58 | self.create_connector(self._command, "command", is_default=True) # To link easily as pipeline 59 | self.create_connector(self._incoming, "input") 60 | 61 | def _command(self, doc): 62 | pass # Down the drain; this is simply for linking 63 | 64 | def _incoming(self, doc): 65 | print("INCOMING DOC:", doc) 66 | 67 | 68 | class DummyRemotingService(RemotingService, PipelineService): 69 | 70 | def __init__(self, **kwargs): 71 | super(DummyRemotingService, self).__init__(**kwargs) 72 | 73 | self.config.set_default( 74 | timer_frequency = 3, 75 | lifespan = 0 76 | ) 77 | 78 | def on_configure(self, credentials, config, global_config): 79 | self.config.set( 80 | manager_endpoint = global_config.get("manager_host"), 81 | management_endpoint = config.get("management_endpoint"), 82 | 83 | timer_frequency = config["frequency"], 84 | lifespan = config["lifespan"] 85 | ) 86 | 87 | def on_setup(self): 88 | timer = Timer( 89 | service = self, 90 | name = "timer", 91 | actions = [(self.config.timer_frequency, self.config.timer_frequency, "ping")] 92 | ) 93 | fetchProc = FetchProc( 94 | service = self, 95 | name = "fetchProc", 96 | ) 97 | putProc = PutProc( 98 | service = self, 99 | name = "putProc" 100 | ) 101 | 102 | procs = [timer, fetchProc, putProc] 103 | self.link(*procs) 104 | 105 | self.setup_put(putProc) 106 | self.setup_fetch(fetchProc, "output") 107 | 108 | return True 109 | -------------------------------------------------------------------------------- /eslib/procs/HtmlRemover.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | from .. 
import esdoc 5 | from eslib.text import remove_html 6 | 7 | class HtmlRemover(Processor): 8 | """ 9 | Remove HTML tags and unescape HTML escapings. 10 | 11 | Connectors: 12 | input (esdoc) (default) : Incoming document in 'esdoc' dict format. 13 | str (str) : Incoming document of type 'str' or 'unicode'. 14 | Sockets: 15 | output (esdoc) (default) : Output of documents that arrived on 'input' connector. 16 | str (str) : Output of documents that arrived on 'str' connector. 17 | 18 | Config: 19 | source_field = "text" : Part of twitter dev credentials. 20 | target_field = None : Defaults to 'source_field', replacing the input field. 21 | field_map = {} : A dict of fields to use as { source : target }. 22 | If specified, this *replaces* the source_field and target_field pair! 23 | strip = True : Remove boundary spaces and double spaces, commonly left after a removal. 24 | """ 25 | 26 | def __init__(self, **kwargs): 27 | super(HtmlRemover, self).__init__(**kwargs) 28 | 29 | m = self.create_connector(self._incoming_esdoc, "input", "esdoc", "Incoming 'esdoc'.", is_default=True) 30 | self.create_connector(self._incoming_str , "str" , "str" , "Incoming document of type 'str' or 'unicode'.") 31 | self.output_esdoc = self.create_socket("output" , "esdoc" , "Outgoing, cleaned, 'esdoc'.", is_default=True, mimic=m) 32 | self.output_str = self.create_socket("str" , "str" , "Outgoing, cleaned, 'str'.") 33 | 34 | self.config.set_default( 35 | source_field = "text", 36 | target_field = None, 37 | field_map = {}, 38 | strip = True 39 | ) 40 | 41 | self._regexes = [] 42 | self._field_map = {} 43 | 44 | def on_open(self): 45 | # Create field map 46 | self._field_map = self.config.field_map or {} 47 | if not self._field_map: 48 | if not self.config.source_field: 49 | raise ValueError("Neither field_map nor source_field is configured.") 50 | self._field_map[self.config.source_field] = (self.config.target_field or self.config.source_field) 51 | 52 | 53 | def _clean_text(self, text): 54 | text = remove_html(text) 55 | if self.config.strip: 56 | text = text.strip().replace(" ", " ") 57 | return text 58 | 59 | def _clean(self, doc): 60 | 61 | if not doc: 62 | return doc 63 | 64 | # This makes this method work also for 'str' and 'unicode' type documents; not only for the expected 'esdoc' protocol (a 'dict'). 65 | if type(doc) in [str, unicode]: 66 | cleaned = self._clean_text(doc) 67 | return cleaned 68 | elif not type(doc) is dict: 69 | self.doclog.debug("Unsupported document type '%s'." % type(doc)) 70 | return doc 71 | 72 | source = doc.get("_source") 73 | if not source: 74 | return doc # Missing source section; don't do anything 75 | 76 | for source_field, target_field in self._field_map.iteritems(): 77 | text = esdoc.getfield(source, source_field) 78 | if text and type(text) in [str, unicode]: 79 | cleaned = self._clean_text(text) 80 | if cleaned != text: 81 | # Note: This may lead to a few strictly unnecessary shallow clonings... 82 | doc = esdoc.shallowputfield(doc, "_source." 
+ target_field, cleaned) 83 | return doc 84 | 85 | def _incoming_esdoc(self, doc): 86 | if self.output_esdoc.has_output: 87 | self.output_esdoc.send(self._clean(doc)) 88 | 89 | def _incoming_str(self, doc): 90 | if self.output_str.has_output: 91 | self.output_str.send(self._clean(doc)) 92 | -------------------------------------------------------------------------------- /eslib/procs/TweetExtractor.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | 5 | 6 | class TweetExtractor(Processor): 7 | """ 8 | Extract properties from a tweet to different sockets: 'user' and 'link'. 9 | 10 | Protocols: 11 | 12 | esdoc.tweet: 13 | 14 | # TODO 15 | 16 | graph-edge: 17 | 18 | from str : User ID. 19 | type str : Relation, one of "author", "mention", "quote". 20 | to str : User ID. 21 | 22 | urlrequest: 23 | 24 | url str 25 | what str : e.g. "twitter_mon" 26 | who str : e.g. some user id 27 | 28 | Sockets: 29 | tweet (esdoc.tweet) (default) : Tweet 30 | text (str) : Only the text from the tweet. 31 | link (urlrequest) : Link from the tweet, for potential follow-up. 32 | user (graph-edge) : Info about author, mentioned or retweeted users from the tweet. 33 | 34 | Config: 35 | drop_retweets = True : Do not report tweets from retweets if set. User relation "quote" will still be reported. 36 | """ 37 | 38 | RELATION_AUTHOR = "author" 39 | RELATION_RETWEET = "quote" 40 | RELATION_MENTION = "mention" 41 | 42 | 43 | def __init__(self, **kwargs): 44 | super(TweetExtractor, self).__init__(**kwargs) 45 | 46 | self.create_connector(self._incoming, "tweet", "esdoc.tweet", "Tweet."); 47 | 48 | self.output_tweet = self.create_socket("tweet" , "esdoc.tweet" , "Tweet.", is_default=True) 49 | self.output_text = self.create_socket("text" , "str" , "Only the text from the tweet.") 50 | self.output_link = self.create_socket("link" , "urlrequest" , "Link from the tweet, for potential follow-up.") 51 | self.output_user = self.create_socket("user" , "graph-edge" , "Info about author, mentioned or retweeted users from the tweet.") 52 | 53 | self.config.set_default( 54 | drop_retweets = True 55 | ) 56 | 57 | def _incoming(self, doc): 58 | 59 | if not doc or not type(doc) is dict or not self.has_output: 60 | return 61 | 62 | tweet, users, links = self._extract(doc) 63 | if tweet: 64 | self.output_tweet.send(tweet) 65 | self.output_text.send(tweet["_source"]["text"]) 66 | for user in users: 67 | self.output_user.send(user) 68 | for link in links: 69 | self.output_link.send(link) 70 | 71 | def _extract(self, tweet): 72 | "Return a tuple of (tweet, users, links)." 
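        # A note on the return shape (values below are illustrative): each entry in 'users'
        # is a 'graph-edge' dict, e.g. {"from": "123", "type": "mention", "to": "456"},
        # and each entry in 'links' is a 'urlrequest' dict,
        # e.g. {"url": "http://t.co/xyz", "what": "twitter", "who": "123"}.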
73 | 74 | users = [] 75 | links = [] 76 | 77 | source = tweet["_source"] # Always present 78 | 79 | # Add author to 'users' list 80 | user_id = source["user"]["id"] # Always present 81 | users.append({"from": user_id, "type": self.RELATION_AUTHOR, "to": user_id}) 82 | 83 | # Retweets 84 | retweet_user_id = source.get("retweet_user_id") 85 | if retweet_user_id: 86 | # Find out who has been retweeted: 87 | # Add retweet to 'users' list 88 | users.append({"from": user_id, "type": self.RELATION_RETWEET, "to": retweet_user_id}) 89 | if self.config.drop_retweets: 90 | return (None, users, links) 91 | 92 | # URLs and mentions from entities 93 | entities = source.get("entities") 94 | if entities: 95 | # Get URLs 96 | urls = entities.get("urls") 97 | if urls: 98 | for url in urls: 99 | # Add to "links" list: 100 | links.append({ 101 | "url" : url["url"], 102 | "what": "twitter", # TODO: Maybe use self.name instead? 103 | "who" : user_id 104 | }) 105 | # Get user mentions 106 | user_mentions = entities.get("user_mentions") 107 | if user_mentions: 108 | for m in user_mentions: 109 | # Add relation to 'users' list: 110 | users.append({"from": user_id, "type": self.RELATION_MENTION, "to": m["id"]}) 111 | 112 | return (tweet, users, links) 113 | -------------------------------------------------------------------------------- /eslib/procs/Neo4jReader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mats' 2 | 3 | from ..Generator import Generator 4 | from .neo4j import Neo4j 5 | 6 | from itertools import izip 7 | import time, logging 8 | 9 | class Neo4jReader(Generator): 10 | """ 11 | The purpose of this processor is to ask Neo4j if a node with a given 12 | user id has it's full set of properties. 13 | 14 | It takes an id and determines whether or not it has its properties set. 15 | If it lacks properties, it will be outputted by the 'ids' socket. 16 | 17 | Connectors: 18 | id (str) : Incoming IDs to check. 19 | Sockets: 20 | ids (str) : Outputs IDs that lack properties. 21 | 22 | Config: 23 | batchsize = 20 : How many IDs to gather up before making a call to Neo4j. 24 | batchtime = 5.0 : How many seconds to wait before we send a batch if it is not full. 25 | host = localhost: The host we should connect to 26 | port = 7474 : The default neo4j port 27 | 28 | """ 29 | 30 | def __init__(self, **kwargs): 31 | super(Neo4jReader, self).__init__(**kwargs) 32 | self.create_connector(self._incoming_id, "id", "str", "Incoming IDs to check.") 33 | self._missing = self.create_socket("missing", "str", "Outputs IDs that lack properties.") 34 | #self._missing = self.create_socket("output", "???", "Outputs data retrived, one document per ID.") 35 | 36 | self.config.set_default( 37 | batchsize = 20, 38 | batchtime = 5.0, 39 | host = "localhost", 40 | port = 7474 41 | ) 42 | 43 | self._neo4j = None 44 | 45 | self._queue = [] 46 | self._last_get = time.time() 47 | self._has_properties = set([]) 48 | 49 | #TODO: Could place this in Neo4jBase 50 | def on_open(self): 51 | """ 52 | Instantiates both a neo4j-instance and a twitter-instance. 53 | 54 | Raises: 55 | - ConnectionError if neo4j can't contact its server 56 | - Exception if twitter can't authenticate properly 57 | """ 58 | 59 | # TODO: Need logging, request timeout and exception handling down there: 60 | self.log.debug("Connecting to Neo4j.") 61 | self._neo4j = Neo4j(host=self.config.host, port=self.config.port) 62 | self.log.status("Connected to Neo4j on %s:%d." 
% (self.config.host, self.config.port)) 63 | 64 | def _incoming_id(self, id_): 65 | """ 66 | Takes an incoming id, gets the correct query string from self.neo4j, 67 | before appending the query to self._queue 68 | """ 69 | if id_ not in self._has_properties: 70 | query = self._neo4j.get_node_query_if_properties(id_) 71 | self._queue.append((id_, query)) 72 | 73 | def on_tick(self): 74 | """ 75 | Commit items in queue if queue exceeds batchsize or it's been long 76 | since last commit. 77 | """ 78 | if ((len(self._queue) >= self.config.batchsize) or 79 | (time.time() - self._last_get > self.config.batchtime and self._queue)): 80 | self._get() 81 | 82 | def on_shutdown(self): 83 | """ Get rid of rest of queue before shutting down. """ 84 | while self._queue: 85 | self._get() 86 | 87 | def _get(self): 88 | num_elem = len(self._queue) 89 | if num_elem > self.config.batchsize: 90 | num_elem = self.config.batchsize 91 | 92 | ids, queries = [list(t) 93 | for t in 94 | izip(*self._queue[:num_elem])] 95 | rq = self._neo4j._build_rq(queries) 96 | resp = self._neo4j.commit(rq) 97 | self.log.debug("Asking neo4j for %i users." % num_elem) 98 | self._queue = self._queue[num_elem:] 99 | self._last_get = time.time() 100 | self._write_uids(ids, resp) 101 | 102 | def _write_uids(self, ids, resp): 103 | """ 104 | Outputs the ids of the nodes in the resp-object to a socket. 105 | 106 | Args: 107 | ids: The ids that corresponds to a query 108 | resp: a requests-module response object with neo4j-nodes in 'graph'- 109 | format. 110 | """ 111 | for uid, result in izip(ids, resp.json()["results"]): 112 | if not result["data"]: 113 | self._missing.send(uid) 114 | if self.doclog.isEnabledFor(logging.TRACE): 115 | self.doclog.trace("uid %s does not have properties" % uid) 116 | else: 117 | self._has_properties.add(uid) 118 | -------------------------------------------------------------------------------- /eslib/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib 5 | ~~~~~ 6 | 7 | Document processing library for Elasticsearch. 
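
A rough usage sketch (illustrative only; the configuration kwargs and the sample
document are assumptions, see the tests under test/ and the scripts under bin/
for complete, working examples):

    from eslib.procs import HtmlRemover, FileWriter

    cleaner = HtmlRemover(source_field="text")
    writer = FileWriter()
    writer.subscribe(cleaner)   # writer's connector attaches to cleaner's default socket
    cleaner.start()
    cleaner.put({"_source": {"text": "<p>Hello&nbsp;world</p>"}})
    cleaner.stop()
    cleaner.wait()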
8 | """ 9 | 10 | __version__ = "0.0.1" 11 | __author__ = "Hans Terje Bakke" 12 | 13 | 14 | from .Terminal import TerminalProtocolException, Terminal 15 | from .TerminalInfo import TerminalInfo 16 | from .Connector import Connector 17 | from .Socket import Socket 18 | from .Processor import Processor 19 | from .Generator import Generator 20 | from .Monitor import Monitor 21 | from .Configurable import Configurable, Config 22 | 23 | 24 | __all__ = ( 25 | "TerminalProtocolException", 26 | "Terminal", 27 | "TerminalInfo", 28 | "Connector", 29 | "Socket", 30 | "Processor", 31 | "Generator", 32 | "Monitor", 33 | "Configurable", 34 | "Config", 35 | 36 | "unique" 37 | ) 38 | 39 | #region Core stuff 40 | 41 | def unique(seq, idfun=None): 42 | # order preserving 43 | if idfun is None: 44 | def idfun(x): return x 45 | seen = {} 46 | result = [] 47 | for item in seq: 48 | marker = idfun(item) 49 | if marker in seen: continue 50 | seen[marker] = 1 51 | result.append(item) 52 | return result 53 | 54 | #endregion 55 | 56 | 57 | #region Encoding of stdin/stdout 58 | 59 | import sys, codecs 60 | 61 | # Fix stdin and stdout encoding issues 62 | _encoding_stdin = sys.stdin.encoding or "UTF-8" 63 | _encoding_stdout = sys.stdout.encoding or _encoding_stdin 64 | #sys.stdin = codecs.getreader(_encoding_stdin)(sys.stdin) 65 | sys.stdout = codecs.getwriter(_encoding_stdout)(sys.stdout) 66 | 67 | #endregion Encoding of stdin/stdout 68 | 69 | #region Logging stuff 70 | 71 | import logging 72 | import logging.config 73 | 74 | class _ExtendedLogger(logging.getLoggerClass()): 75 | def makeRecord(self, name, level, fn, lno, msg, args, exc_info, func=None, extra=None): 76 | rec = logging.LogRecord(name, level, fn, lno, msg, args, exc_info, func) 77 | 78 | rec.serviceName = self.serviceName if hasattr(self, 'serviceName') else None 79 | rec.className = self.className if hasattr(self, 'className') else None 80 | rec.instanceName = self.instanceName if hasattr(self, 'instanceName') else None 81 | 82 | rec.firstName = name.split(".")[0] 83 | rec.lastName = name.split(".")[-1] 84 | rec.names = name.split(".") 85 | 86 | return rec 87 | 88 | logging.setLoggerClass(_ExtendedLogger) 89 | 90 | 91 | def _log_status(self, message, *args, **kws): 92 | if self.isEnabledFor(logging.STATUS): 93 | self._log(logging.STATUS, message, args, **kws) 94 | 95 | def _log_verbose(self, message, *args, **kws): 96 | if self.isEnabledFor(logging.VERBOSE): 97 | self._log(logging.VERBOSE, message, args, **kws) 98 | 99 | def _log_trace(self, message, *args, **kws): 100 | if self.isEnabledFor(logging.TRACE): 101 | self._log(logging.TRACE, message, args, **kws) 102 | 103 | def _log_debug_n(self, n, message, *args, **kws): 104 | candidate = logging.DEBUG - n 105 | loglevel = min(max(candidate, logging.TRACE+1), logging.DEBUG) 106 | if self.isEnabledFor(loglevel): 107 | self._log(loglevel, message, args, **kws) 108 | 109 | logging.STATUS = 25 110 | logging.VERBOSE = 15 111 | logging.TRACE = 1 112 | 113 | logging.addLevelName(logging.STATUS , "STATUS") 114 | logging.addLevelName(logging.VERBOSE, "VERBOSE") 115 | logging.addLevelName(logging.TRACE , "TRACE") 116 | for n in range(1,9): 117 | logging.addLevelName(logging.DEBUG -n, "DEBUG-%s" % n) 118 | 119 | logging.Logger.status = _log_status 120 | logging.Logger.verbose = _log_verbose 121 | logging.Logger.trace = _log_trace 122 | logging.Logger.debugn = _log_debug_n 123 | 124 | #endregion Logging stuff 125 | 126 | #region Config stuff 127 | 128 | import os, yaml 129 | from . 
import esdoc 130 | 131 | def get_credentials(path=None, service_dir=None, credentials_file=None): 132 | service_dir = service_dir or os.environ.get("ESLIB_SERVICE_DIR") 133 | if not service_dir: 134 | raise ValueError("Neither service_dir given nor ESLIB_SERVICE_DIR set.") 135 | dir = os.path.join(service_dir, "config") 136 | 137 | file_path = None 138 | if not credentials_file: 139 | credentials_file = "credentials.yaml" 140 | 141 | if os.path.basename(credentials_file) == credentials_file: 142 | # Pick from dir 143 | file_path = os.path.join(dir, credentials_file) 144 | else: 145 | # Use absolute path 146 | file_path = os.path.expanduser(credentials_file) 147 | 148 | # Load credentials file 149 | with open(file_path, "r") as f: 150 | credentials = yaml.load(f) 151 | 152 | if not path: 153 | return credentials 154 | else: 155 | return esdoc.getfield(credentials, path) 156 | 157 | #endregion 158 | -------------------------------------------------------------------------------- /eslib/procs/CsvConverter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | import csv, codecs 4 | from ..Processor import Processor 5 | 6 | class CsvConverter(Processor): 7 | """ 8 | Convert csv input to Elasticsearch document format. 9 | Field names can be explicitly entered or derived from the first line of input, 10 | assuming that is the first line contains column names. When explicitly specified, only those columns entered 11 | will be used, the others will be ignored. When derived, all columns are used. 12 | 13 | NOTE: Fields, including column headers, must not have any spacing between delimiters and quotes. 14 | 15 | NOTE: Fields that are mapped to meta fields ('_id', '_index', '_type') will not be part of the '_source'. 16 | 17 | Connectors: 18 | input (csv) : Document in 'csv' format. First document is optionally column list. 19 | Sockets: 20 | output (esdoc) : Documents converted from 'csv' to 'esdoc' format. 21 | 22 | Config: 23 | index = None : Override '_index' meta field with this value. 24 | doctype = None : Override '_type' meta field with this value. 25 | columns = None : List of columns to pick from the CSV input. Use None for columns to ignore. 26 | skip_first_line = False : Skip first line of the input. (Typically column headers you don't want. 27 | delimiter = "," : CSV column delimiter character. 28 | 29 | id_field = "_id" : Name of field to map to meta field '_id'. 30 | index_field = "_index" : Name of field to map to meta field '_index'. 31 | type_field = "_type" : Name of field to map to meta field '_type'. 32 | """ 33 | 34 | def __init__(self, **kwargs): 35 | super(CsvConverter, self).__init__(**kwargs) 36 | self.create_connector(self._incoming, "input", "csv", "Document in 'csv' format. First document is optionally column list.") 37 | self.output = self.create_socket("output", "esdoc", "Documents converted from 'csv' to 'esdoc' format.") 38 | 39 | self.config.set_default( 40 | index = None, 41 | doctype = None, 42 | columns = None, 43 | skip_first_line = False, 44 | delimiter = ",", 45 | 46 | id_field = "_id", 47 | index_field = "_index", 48 | type_field = "_type" 49 | ) 50 | 51 | self._columns = [] 52 | self._first_line_processed = False 53 | 54 | 55 | def on_open(self): 56 | # Sanity check: 57 | if self.config.skip_first_line and not self.config.columns: 58 | raise Exception("Nothing specified in 'columns' and 'skip_first_line' set. 
Unable to determine fields to include, then.") 59 | 60 | self._first_line_processed = False 61 | self._columns = self.config.columns or [] 62 | 63 | def _incoming(self, line): 64 | # Check if we should skip first line or use it as column definitions (columns) 65 | if not self._first_line_processed: 66 | self._first_line_processed = True 67 | if self.config.skip_first_line: 68 | return 69 | if not self._columns: 70 | # No skipping first line ordered and no field list. Now assume first line to be column headings 71 | for csvrow in csv.reader([line], delimiter=self.config.delimiter): 72 | self._columns = csvrow 73 | return 74 | 75 | # Pick the only line. Since csv does not support unicode, we do this little encoding massage: 76 | raw_line = codecs.encode(line, "UTF-8") 77 | raw_csvrow = csv.reader([raw_line], delimiter=self.config.delimiter).next() 78 | csvrow = [codecs.decode(x, "UTF-8") for x in raw_csvrow] 79 | 80 | if not len(self._columns) == len(csvrow): 81 | self.doclog.warning("Column count does not match number of fields. Aborting. Row =\n%s" % csvrow) 82 | self.abort() # NOTE: We might want to continue processing, or we might not... 83 | 84 | doc = {} 85 | id = None 86 | index = None 87 | doctype = None 88 | for i in range(len(self._columns)): 89 | if not self._columns[i]: 90 | continue # Skip non-specified fields 91 | elif self._columns[i] == self.config.id_field: 92 | id = csvrow[i] 93 | elif self._columns[i] == self.config.index_field: # Override index 94 | index = csvrow[i] 95 | elif self._columns[i] == self.config.type_field: # Override doctype 96 | doctype = csvrow[i] 97 | else: 98 | doc.update({self._columns[i]: csvrow[i]}) 99 | 100 | # Convert to Elasticsearch type document 101 | esdoc = {"_index":self.config.index or index, "_type":self.config.doctype or doctype, "_id":id, "_source":doc} 102 | 103 | self.output.send(esdoc) 104 | -------------------------------------------------------------------------------- /eslib/Connector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from .Terminal import Terminal 6 | import Queue 7 | import threading 8 | import time 9 | 10 | 11 | class Connector(Terminal): 12 | 13 | def __init__(self, name, protocol=None, method=None): 14 | self.sleep = 0.1 #0.001 # Check for data in incoming queue this often (then burst through as much as possible) 15 | 16 | super(Connector, self).__init__(name, protocol) 17 | self.type = Connector 18 | self.queue = Queue.Queue() 19 | self.method = method 20 | 21 | # Execution control status 22 | self._thread = None 23 | self.accepting = False 24 | self.stopping = False 25 | self.running = False 26 | self.suspended = False 27 | self.aborted = False 28 | 29 | #region Queue management 30 | 31 | def _clear(self): 32 | "Clear the queue." 33 | while not self.queue.empty(): 34 | self.queue.get_nowait() 35 | self.queue.task_done() 36 | 37 | @property 38 | def pending(self): 39 | "Report number of pending items in queue." 40 | return self.queue.qsize() 41 | 42 | def _process(self): 43 | "Grab item from queue and call the pre-registered method on it." 44 | if not self.queue.empty(): 45 | document = self.queue.get_nowait() 46 | self.queue.task_done() 47 | if document: 48 | if self.method: 49 | try: 50 | self.method(document) 51 | except Exception as e: 52 | msg = "Unhandled exception in processor '%s' func '%s' while processing a document." 
% (self.owner.name, self.method.__name__) 53 | self.owner.doclog.exception(msg) 54 | self.owner.log.exception(msg) 55 | 56 | def receive(self, document): 57 | "Put document on the incoming queue for this connector. Called by sockets." 58 | if self.accepting: 59 | self.queue.put(document) # Infinite queue, so it should never block 60 | 61 | #endregion Queue management 62 | 63 | #region Operation management 64 | 65 | def _run(self): 66 | while self.running: 67 | if self.sleep: 68 | time.sleep(self.sleep) 69 | if not self.running: 70 | break 71 | if self.stopping and (self.suspended or self.queue.empty()): 72 | # Notify owner that we are finished stopping 73 | self.owner.production_stopped() 74 | # Now we can finally stop 75 | self.stopping = False 76 | self.running = False 77 | elif not self.suspended: 78 | while self.running and not self.suspended and not self.queue.empty(): 79 | self._process() 80 | 81 | # Clean out the queue (in case we just aborted) 82 | self._clear() 83 | self.stopping = False # In case we were stopping while aborted 84 | 85 | # Note: The reason for the split of run() and accept_incoming(): 86 | # The entire system should first be accepting data before the individual 87 | # components start processing. When processing, a document is passed on 88 | # through sockets to listening connectors. If those connectors are not yet 89 | # accepting new items on their queues, incoming items will be dropped (i.e. 90 | # not put on the queue, and we would potentially lose the first items 91 | # during start-up. 92 | 93 | def run(self): 94 | "Should be called after all connectors in the system accept incoming data." 95 | if self.running: 96 | raise Exception("Connector is already running.") 97 | if not self.accepting: 98 | raise Exception("Connector is not accepting input before call to run(). Call accept_incoming() on all connectors in the system first.") 99 | 100 | self.aborted = False 101 | self.stopping = False 102 | self.suspended = False 103 | self.running = True 104 | 105 | self._thread = threading.Thread(target=self._run) 106 | self._thread.start() 107 | 108 | def accept_incoming(self): 109 | "Should be called for all connectors in the system before processes start running and processing!" 110 | if self.stopping: 111 | raise Exception("Connector is stopping. Refusing to accept new incoming again until fully stopped.") 112 | self.accepting = True 113 | 114 | def stop(self): 115 | self.accepting = False 116 | self.stopping = True # We must wait for items in the queue to be processed before we finally stop running 117 | if self._thread and self._thread.isAlive(): 118 | try: 119 | self._thread.join() # NOTE: Are we sure we want to wait for this ?? 120 | except: 121 | pass # Ignore 122 | self._thread = None 123 | 124 | def abort(self): 125 | self.aborted = True 126 | self.accepting = False 127 | self.running = False # Run loop will stop immediately 128 | 129 | def suspend(self): 130 | self.suspended = True 131 | 132 | def resume(self): 133 | self.suspended = False 134 | 135 | #endregion Operation management 136 | -------------------------------------------------------------------------------- /eslib/procs/RabbitmqWriter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | from .RabbitmqBase import RabbitmqBase 5 | from ..esdoc import tojson 6 | import time 7 | 8 | 9 | class RabbitmqWriter(Processor, RabbitmqBase): 10 | """ 11 | Write data to RabbitMQ. 
12 | Writes data with type 'str', 'unicode', 'int', or 'float'. Lists and dicts are written as 'json'. 13 | Other types are cast to 'str'. 14 | The 'type' registered with the metadata is then either 'str', 'unicode', 'int', 'float' or 'json'. 15 | 16 | Connectors: 17 | input (*) : Document to write to configured RabbitMQ. 18 | 19 | Config: 20 | host = localhost : 21 | port = 5672 : 22 | admin_port = 15672 : 23 | username = guest : 24 | password = guest : 25 | virtual_host = None : 26 | exchange = None : If specified, data is written to this 'exchange', and also 27 | persisted on a durable queue '_shared'. Clients can 28 | ask to listen to the exchange on this queue ('consumable' 29 | behaviour, the default), or to listen to a live stream on an 30 | exclusive queue that is a copy of all data meant only for that 31 | listener. Clients connected to the shared queue will consume data 32 | from it, thus splitting workload (intended) or competing for the 33 | same data (unintended). 34 | queue = "default" : Not used if 'exchange' is specified. 35 | persisting = True : When this is on, the exchange will store data in a queue until it 36 | is consumed by a consuming monitor. Otherwise, data will only be 37 | queued if there is a listener. 38 | max_reconnects = 3 : 39 | reconnect_timeout = 3 : 40 | max_queue_size = 100000 : If the output queue exceeds this number, this processor is considered congested. 41 | """ 42 | 43 | MAX_CONNECTOR_QUEUE_SIZE = 10000 44 | CHECK_QUEUE_INTERVAL = 5 # 5 seconds; how often to check whether the message queue is "congested" 45 | 46 | _is_reader = False # This is a writer 47 | 48 | def __init__(self, **kwargs): 49 | super(RabbitmqWriter, self).__init__(**kwargs) 50 | 51 | self._connector = self.create_connector(self._incoming, "input", None, "Document to write to configured RabbitMQ.") 52 | 53 | self.config.set_default( 54 | persisting = True, 55 | max_queue_size = 100000 56 | ) 57 | 58 | self._last_check_queue_time = 0 59 | self._last_known_queue_size = 0 60 | 61 | 62 | def on_open(self): 63 | self._last_check_queue_time = 0 64 | self._last_known_queue_size = 0 65 | 66 | self.count = 0 67 | self._open_connection() 68 | self.log.info("Connected to RabbitMQ.") 69 | 70 | def on_close(self): 71 | if self._close_connection(): 72 | self.log.info("Connection to RabbitMQ closed.") 73 | 74 | def _incoming(self, document): 75 | if document == None: 76 | return 77 | 78 | data = None 79 | msg_type = None 80 | if isinstance(document, basestring): 81 | data = document 82 | msg_type = type(document).__name__ 83 | elif isinstance(document, (int, long, float)): 84 | data = str(document) 85 | msg_type = type(document).__name__ 86 | elif isinstance(document, (list, dict)): 87 | try: 88 | data = tojson(document) 89 | except TypeError as e: 90 | self.doclog.error("JSON serialization failed: %s" % e.message) 91 | return 92 | msg_type = "json" 93 | else: 94 | data = str(document) 95 | msg_type = "str" #type(document).__name__ 96 | self.doclog.warning("Writing document of unsupported type '%s' as type 'str'." 
% type(document).__name__) 97 | 98 | if self._publish(msg_type, data): 99 | self.count += 1 100 | 101 | def is_congested(self): 102 | if super(RabbitmqWriter, self).is_congested(): 103 | return True 104 | if self._connector.queue.qsize() > self.MAX_CONNECTOR_QUEUE_SIZE: 105 | return True 106 | elif not self.config.exchange or self.config.persisting: 107 | if self.config.max_queue_size: 108 | now = time.time() 109 | if now - self._last_check_queue_time > self.CHECK_QUEUE_INTERVAL: 110 | try: 111 | self._last_known_queue_size = self.get_queue_size() 112 | except Exception as e: 113 | self.log.warning("Failed to get queue size for queue '%s': %s" % (self._queue_name, e)) 114 | self._last_check_queue_time = now 115 | 116 | if self._last_known_queue_size > self.config.max_queue_size: 117 | return True 118 | 119 | return False 120 | -------------------------------------------------------------------------------- /test/test_protocol_compliance.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from eslib import Processor, Terminal, Connector, Socket 3 | 4 | class TestProtocolCompliance(unittest.TestCase): 5 | 6 | # TEST mimic / passthrough protocols 7 | 8 | def test_protocol_equal(self): 9 | s = Socket("sock_a", "proto_a") 10 | c = Connector("conn_a", "proto_a") 11 | self.assertTrue(Terminal.protocol_compliance(s, c)) 12 | 13 | def test_protocol_not_equal(self): 14 | s = Socket("sock_a", "proto_b") 15 | c = Connector("conn_a", "proto_a") 16 | self.assertFalse(Terminal.protocol_compliance(s, c)) 17 | 18 | def test_protocol_general_accepts_special(self): 19 | s = Socket("sock_a", "general.special") 20 | c = Connector("conn_a", "general") 21 | self.assertTrue(Terminal.protocol_compliance(s, c)) 22 | 23 | def test_protocol_special_too_strict_for_general(self): 24 | s = Socket("sock_a", "general") 25 | c = Connector("conn_a", "general.special") 26 | self.assertFalse(Terminal.protocol_compliance(s, c)) 27 | 28 | def test_protocol_any_any(self): 29 | s = Socket("sock_a", None) 30 | c = Connector("conn_a", None) 31 | self.assertTrue(Terminal.protocol_compliance(s, c)) 32 | 33 | def test_protocol_any_sock(self): 34 | s = Socket("sock_a", None) 35 | c = Connector("conn_a", "x") 36 | self.assertTrue(Terminal.protocol_compliance(s, c)) 37 | 38 | def test_protocol_any_conn(self): 39 | s = Socket("sock_a", "x") 40 | c = Connector("conn_a", None) 41 | self.assertTrue(Terminal.protocol_compliance(s, c)) 42 | 43 | def test_protocol_mimic(self): 44 | a_s = Socket ("sock_a", "esdoc.tweet") 45 | b_c = Connector("conn_b", "esdoc") 46 | b_s = Socket ("sock_b", "esdoc", mimic=b_c) # Should end up mimicing 'esdoc.tweet' from a_s if connected 47 | c_c = Connector("conn_c", "esdoc.tweet") 48 | 49 | # Only unidirectional attachment needed for this test 50 | b_c.attach(a_s) 51 | 52 | print "b_s proto =", b_s.protocol 53 | print "b_s mimiced proto =", b_s.mimiced_protocol 54 | comply = Terminal.protocol_compliance(b_s, c_c) 55 | print "compiance=", comply 56 | 57 | self.assertTrue(b_s.mimiced_protocol == "esdoc.tweet") 58 | 59 | self.assertTrue(Terminal.protocol_compliance(a_s, b_c)) 60 | self.assertTrue(Terminal.protocol_compliance(b_s, c_c)) 61 | 62 | def test_protocol_mimic_no_connection(self): 63 | a_s = Socket ("sock_a", "esdoc.tweet") 64 | b_c = Connector("conn_b", "esdoc") 65 | b_s = Socket ("sock_b", "esdoc", mimic=b_c) # Should end up mimicing 'esdoc.tweet' from a_s if connected 66 | c_c = Connector("conn_c", "esdoc.tweet") 67 | 68 | print "b_s proto =", 
b_s.protocol 69 | print "b_s mimiced proto =", b_s.mimiced_protocol 70 | comply = Terminal.protocol_compliance(b_s, c_c) 71 | print "compiance=", comply 72 | 73 | self.assertTrue(b_s.mimiced_protocol == "esdoc") 74 | 75 | self.assertTrue(Terminal.protocol_compliance(a_s, b_c)) 76 | self.assertFalse(Terminal.protocol_compliance(b_s, c_c)) 77 | 78 | def test_protocol_mimic_sequence(self): 79 | a_s = Socket ("sock_a", "esdoc.tweet") 80 | 81 | b_c = Connector("conn_b", "esdoc") 82 | b_s = Socket ("sock_b", "esdoc", mimic=b_c) 83 | 84 | c_c = Connector("conn_c", "esdoc.tweet") 85 | c_s = Socket ("sock_b", "esdoc", mimic=c_c) 86 | 87 | print "NOT ATTACHED:" 88 | print "b_s proto =", b_s.protocol 89 | print "c_s proto =", b_s.protocol 90 | print "b_s mimiced proto =", c_s.mimiced_protocol 91 | print "c_s mimiced proto =", c_s.mimiced_protocol 92 | 93 | self.assertTrue(c_s.mimiced_protocol == "esdoc") 94 | 95 | # Only unidirectional attachments needed for this test 96 | b_c.attach(a_s) 97 | c_c.attach(b_s) 98 | 99 | print "\nATTACHED:" 100 | print "b_s proto =", b_s.protocol 101 | print "c_s proto =", c_s.protocol 102 | print "b_s mimiced proto =", b_s.mimiced_protocol 103 | print "c_s mimiced proto =", c_s.mimiced_protocol 104 | 105 | self.assertTrue(c_s.mimiced_protocol == "esdoc.tweet") 106 | 107 | def test_protocol_mimic_circular(self): 108 | a_s = Socket ("sock_a", "esdoc.tweet") 109 | 110 | b_c = Connector("conn_b", "esdoc") 111 | b_s = Socket ("sock_b", "esdoc", mimic=b_c) 112 | 113 | c_c = Connector("conn_c", "esdoc.tweet") 114 | c_s = Socket ("sock_b", "esdoc", mimic=c_c) 115 | 116 | # Only unidirectional attachments needed for this test 117 | b_c.attach(c_s) # Making it circular 118 | c_c.attach(b_s) 119 | 120 | print "\nATTACHED:" 121 | print "b_s proto =", b_s.protocol 122 | print "c_s proto =", c_s.protocol 123 | print "b_s mimiced proto =", b_s.mimiced_protocol 124 | print "c_s mimiced proto =", c_s.mimiced_protocol 125 | 126 | self.assertTrue(b_s.mimiced_protocol == "esdoc") 127 | 128 | # And most important, it does not enter an infinite loop and finally gets here.. 129 | 130 | def main(): 131 | unittest.main() 132 | 133 | if __name__ == "__main__": 134 | main() 135 | -------------------------------------------------------------------------------- /eslib/procs/PatternRemover.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | from .. import esdoc 5 | import re 6 | 7 | class PatternRemover(Processor): 8 | """ 9 | Remove text using a regex pattern. 10 | 11 | Connectors: 12 | input (esdoc) (default) : Incoming document in 'esdoc' dict format. 13 | str (str) : Incoming document of type 'str' or 'unicode'. 14 | Sockets: 15 | output (esdoc) (default) : Output of documents that arrived on 'input' connector. 16 | str (str) : Output of documents that arrived on 'str' connector. 17 | 18 | Config: 19 | source_field = "text" : Part of twitter dev credentials. 20 | target_field = None : Defaults to 'source_field', replacing the input field. 21 | field_map = {} : A dict of fields to use as { source : target }. 22 | If specified, this *replaces* the source_field and target_field pair! 23 | pattern = None : Pattern to apply. (All 'patterns' are also applied, if specified.) 24 | patterns = [] : List of patterns to apply. ('pattern' will be applied first, if it exists.) 25 | regex_options = DOTALL|IGNORECASE|MULTILINE|UNICODE 26 | : Options for *all* regex patterns. 
27 | strip = True : Remove boundary spaces and double spaces, commonly left after a removal. 28 | """ 29 | 30 | def __init__(self, **kwargs): 31 | super(PatternRemover, self).__init__(**kwargs) 32 | 33 | m = self.create_connector(self._incoming_esdoc, "input", "esdoc", "Incoming 'esdoc'.", is_default=True) 34 | self.create_connector(self._incoming_str , "str" , "str" , "Incoming document of type 'str' or 'unicode'.") 35 | self.output_esdoc = self.create_socket("output" , "esdoc" , "Outgoing, cleaned, 'esdoc'.", is_default=True, mimic=m) 36 | self.output_str = self.create_socket("str" , "str" , "Outgoing, cleaned, 'str'.") 37 | 38 | self.config.set_default( 39 | source_field = "text", 40 | target_field = None, 41 | field_map = {}, 42 | pattern = None, 43 | patterns = [], 44 | regex_options = re.DOTALL|re.IGNORECASE|re.MULTILINE|re.UNICODE, 45 | strip = True 46 | ) 47 | 48 | self._regexes = [] 49 | self._field_map = {} 50 | 51 | def on_open(self): 52 | """ 53 | :raises ValueError, if failed to parse a pattern as regex 54 | """ 55 | 56 | # Create list of regexes 57 | patterns = [] 58 | if self.config.pattern: 59 | patterns = [self.config.pattern] 60 | if self.config.patterns: 61 | patterns.extend(self.config.patterns) 62 | self._regexes = [] 63 | for pattern in patterns: 64 | try: 65 | regex = re.compile(r"(%s)" % pattern, self.config.regex_options) 66 | self._regexes.append(regex) 67 | except Exception as e: 68 | raise ValueError("Error parsing pattern: %s\nPattern was: %s" % (e.message, pattern)) 69 | 70 | # Create field map 71 | self._field_map = self.config.field_map or {} 72 | if not self._field_map: 73 | if not self.config.source_field: 74 | raise ValueError("Neither field_map nor source_field is configured.") 75 | self._field_map[self.config.source_field] = (self.config.target_field or self.config.source_field) 76 | 77 | 78 | def _clean_text(self, text): 79 | for regex in self._regexes: 80 | text = regex.sub("", text) 81 | if self.config.strip: 82 | text = text.strip().replace(" ", " ") 83 | return text 84 | 85 | def _clean(self, doc): 86 | 87 | if not doc or not self._regexes: 88 | return doc 89 | 90 | # This makes this method work also for 'str' and 'unicode' type documents; not only for the expected 'esdoc' protocol (a 'dict'). 91 | if type(doc) in [str, unicode]: 92 | cleaned = self._clean_text(doc) 93 | return cleaned 94 | elif not type(doc) is dict: 95 | self.doclog.debug("Unsupported document type '%s'." % type(doc)) 96 | return doc 97 | 98 | source = doc.get("_source") 99 | if not source: 100 | return doc # Missing source section; don't do anything 101 | 102 | for source_field, target_field in self._field_map.iteritems(): 103 | text = esdoc.getfield(source, source_field) 104 | if text and type(text) in [str, unicode]: 105 | cleaned = self._clean_text(text) 106 | if cleaned != text: 107 | # Note: This may lead to a few strictly unnecessary shallow clonings... 108 | doc = esdoc.shallowputfield(doc, "_source." 
+ target_field, cleaned) 109 | return doc 110 | 111 | def _incoming_esdoc(self, doc): 112 | if self.output_esdoc.has_output: 113 | self.output_esdoc.send(self._clean(doc)) 114 | 115 | def _incoming_str(self, doc): 116 | if self.output_str.has_output: 117 | self.output_str.send(self._clean(doc)) 118 | -------------------------------------------------------------------------------- /test/test_procs/test_blacklist_filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from eslib.procs import BlacklistFilter 5 | 6 | class TestBlacklistFilter_str(unittest.TestCase): 7 | 8 | def test_str_nohit(self): 9 | s = "I am marvellous" 10 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 11 | p.on_open() 12 | check = p._check(s) 13 | 14 | print "str_nohit (exp:True)=", check 15 | self.assertTrue(check) 16 | 17 | def test_str_hit_but_not_blacklisted(self): 18 | s = "I like girls." 19 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 20 | print "filters=", p._filters 21 | p.on_open() 22 | check = p._check(s) 23 | 24 | print "str_hit_but_not_blacklisted (exp:True)=", check 25 | self.assertTrue(check) 26 | 27 | def test_str_hit_and_blacklisted(self): 28 | s = "I like young girls." 29 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 30 | print "filters=", p._filters 31 | p.on_open() 32 | check = p._check(s) 33 | 34 | print "str_hit_and_blacklisted (exp:False)=", check # Should have hit "young" from blacklist 35 | self.assertFalse(check) 36 | 37 | def test_str_global_whitelist_override(self): 38 | s = "We only like girls. Young girls are always welcome!" 39 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}], whitelist=["young girls"]) 40 | p.on_open() 41 | check = p._check(s) 42 | 43 | print "str_global_whitelist_override (exp:True)=", check 44 | # Should have hit "young" from blacklist, but "young girls" from whitelist should override it 45 | self.assertTrue(check) 46 | 47 | 48 | def test_brooklyn(self): 49 | s = "Brooklyn Nets trounce short-handed Oklahoma City Thunder 116-85 http://t.co/qJZPBEJRCT" 50 | p = BlacklistFilter(filters=[{"tokens": ["nets"], "blacklist": ["brooklyn"]}]) 51 | p.on_open() 52 | check = p._check(s) 53 | 54 | print "check (expect False)=", check 55 | self.assertFalse(check) 56 | 57 | 58 | class TestBlacklistFilter_esdoc(unittest.TestCase): 59 | 60 | # check == True means the document was NOT filtered out, i.e. it PASSED the filter 61 | 62 | def test_str_nohit(self): 63 | s = "I am marvellous" 64 | doc = {"_source": {"field1": s}} 65 | p = BlacklistFilter( 66 | field="field1", 67 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 68 | p.on_open() 69 | check = p._check(doc) 70 | 71 | print "str_nohit (exp:False)=", check 72 | self.assertTrue(check) 73 | 74 | def test_str_hit_but_not_blacklisted(self): 75 | s = "I like girls." 76 | doc = {"_source": {"field1": s}} 77 | p = BlacklistFilter( 78 | fields=["field1", "field2"], 79 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 80 | print "filters=", p._filters 81 | p.on_open() 82 | check = p._check(doc) 83 | 84 | print "str_hit_but_not_blacklisted (exp:False)=", check 85 | self.assertTrue(check) 86 | 87 | def test_str_hit_and_blacklisted(self): 88 | s1 = "I like young girls." 89 | s2 = "I am a boy." 
90 | doc = {"_source": {"field1": s1, "field2": s2}} 91 | p = BlacklistFilter( 92 | fields=["field1", "field2"], 93 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 94 | print "filters=", p._filters 95 | p.on_open() 96 | check = p._check(doc) 97 | 98 | print "str_hit_and_blacklisted (exp:False)=", check # Should have hit "young" from blacklist 99 | self.assertFalse(check) 100 | 101 | def test_str_global_whitelist_override(self): 102 | s1 = "We only like girls. Young girls are always welcome!" 103 | s2 = "I like young boys." 104 | doc = {"_source": {"field1": s1, "field2": s2}} 105 | p = BlacklistFilter( 106 | fields=["field1", "field2"], 107 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}], 108 | whitelist=["young girls"]) 109 | p.on_open() 110 | check = p._check(doc) 111 | 112 | print "str_global_whitelist_override (exp:True)=", check 113 | # Should have hit "young" from blacklist, but "young girls" from whitelist should override it 114 | self.assertTrue(check) 115 | 116 | def test_str_global_whitelist_override_not_hitting(self): 117 | s1 = "We only like girls. Young girls are always welcome!" 118 | s2 = "I like young boys." 119 | doc = {"_source": {"field1": s1, "field2": s2}} 120 | p = BlacklistFilter( 121 | fields=["field2"], 122 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}], 123 | whitelist=["young girls"]) 124 | p.on_open() 125 | check = p._check(doc) 126 | 127 | print "str_global_whitelist_override_not_hitting (exp:False)=", check 128 | # Should have hit "young" from blacklist; "young girls" from whitelist does not apply to field2, so we should not override here 129 | self.assertFalse(check) 130 | 131 | def main(): 132 | unittest.main() 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /eslib/procs/Neo4jWriter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mats' 2 | 3 | from itertools import izip 4 | import time, logging 5 | 6 | from ..Generator import Generator 7 | from .neo4j import Neo4j 8 | 9 | 10 | class Neo4jWriter(Generator): 11 | """ 12 | This is a pipeline step which primary function is to push an edge 13 | between the author of a tweet to all the people mentioned in the tweet. 14 | 15 | Connectors: 16 | edge (graph-edge) : Edge object to write. 17 | user (graph-user) : User object to write. 18 | 19 | Config: 20 | batchsize = 20 : How many IDs to gather up before making a call to Neo4j. 21 | batchtime = 5.0 : How many seconds to wait before we send a batch if it is not full. 22 | host = localhost: The host we should connect to 23 | port = 7474 : The default neo4j port 24 | 25 | """ 26 | 27 | def __init__(self, **kwargs): 28 | super(Neo4jWriter, self).__init__(**kwargs) 29 | self.create_connector(self._incoming_edge, "edge", "graph-edge") 30 | self.create_connector(self._incoming_user, "user", "graph-user") 31 | 32 | self.config.set_default( 33 | batchsize = 20, 34 | batchtime = 5, 35 | host = "localhost", 36 | port = 7474 37 | ) 38 | 39 | self._neo4j = None 40 | 41 | # This could be better 42 | self._edge_queue = [] 43 | self._last_edge_commit = time.time() 44 | self._user_queue = [] 45 | self._last_user_commit = time.time() 46 | 47 | def on_open(self): 48 | """ 49 | Instantiates both a neo4j-instance and a twitter-instance. 
50 | 51 | Raises: 52 | - ConnectionError if neo4j can't contact its server 53 | - Exception if twitter can't authenticate properly 54 | 55 | """ 56 | 57 | # TODO: Need logging, request timeout and exception handling down there: 58 | self.log.debug("Connecting to Neo4j.") 59 | self._neo4j = Neo4j(host=self.config.host, port=self.config.port) 60 | self.log.status("Connected to Neo4j on %s:%s." % (self.config.host, self.config.port)) 61 | 62 | def _incoming_edge(self, document): 63 | """ 64 | Takes an edge and puts it's correct query in the queue. 65 | 66 | Args: 67 | document: A dict with "from", "to" and "type" as fields. 68 | 69 | The ambition is that this Processor should never go down no matter 70 | what happens to a document in this method. 71 | 72 | """ 73 | try: 74 | from_id = document["from"] 75 | to_id = document["to"] 76 | edge_type = document["type"] 77 | except KeyError: 78 | self.doclog.exception("Unable to parse document: %s" % str(document)) 79 | else: 80 | query = self._neo4j.get_edge_query(from_id, edge_type, to_id) 81 | self._edge_queue.append(query) 82 | 83 | def _incoming_user(self, document): 84 | if self.doclog.isEnabledFor(logging.TRACE): 85 | self.doclog.trace("Incoming user '%s' ('%s')." % (document["screen_name"], document["id"])) 86 | query, params = self._neo4j.get_node_merge_query(document) 87 | self._user_queue.append((query, params)) 88 | 89 | def on_tick(self): 90 | """ 91 | Commit items in queue if queue exceeds batchsize or it's been long 92 | since last commit. 93 | 94 | """ 95 | now = time.time() 96 | if ((len(self._edge_queue) >= self.config.batchsize) or 97 | (now - self._last_edge_commit >= self.config.batchtime and 98 | self._edge_queue)): 99 | self._edge_send() 100 | 101 | if ((len(self._user_queue) >= self.config.batchsize) or 102 | ((now - self._last_user_commit >= self.config.batchtime) and 103 | self._user_queue)): 104 | self._user_send() 105 | 106 | def on_shutdown(self): 107 | """ Clear out the rest of the items in the queue """ 108 | self.log.info("Processing remaining edge queue.") 109 | while self._edge_queue: 110 | self._edge_send() 111 | self.log.info("Processing remaining user queue.") 112 | while self._user_queue: 113 | self._user_send() 114 | 115 | def _edge_send(self): 116 | num_edges = len(self._edge_queue) 117 | if num_edges > self.config.batchsize: 118 | num_edges = self.config.batchsize 119 | 120 | rq = self._neo4j._build_rq(self._edge_queue[:num_edges]) 121 | self._neo4j.commit(rq) 122 | self.log.debug("Committed %i edges." 
% num_edges)
123 |         self._edge_queue = self._edge_queue[num_edges:]
124 |         self._last_edge_commit = time.time()
125 | 
126 |     def _user_send(self):
127 |         num_users = len(self._user_queue)
128 |         if num_users > self.config.batchsize:
129 |             num_users = self.config.batchsize
130 | 
131 |         users, params = [list(t)
132 |                          for t in
133 |                          izip(*self._user_queue[:num_users])]
134 | 
135 |         rq = self._neo4j._build_rq(users, params)
136 |         self._neo4j.commit(rq)
137 |         self.log.debug("Committed %i users." % num_users)
138 |         self._user_queue = self._user_queue[num_users:]
139 |         self._last_user_commit = time.time()
140 | 
--------------------------------------------------------------------------------
/eslib/procs/KafkaMonitor.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 | 
3 | from ..Monitor import Monitor
4 | from pykafka import KafkaClient
5 | import json, time
6 | import logging
7 | import zlib
8 | 
9 | 
10 | class KafkaMonitor(Monitor):
11 |     """
12 |     Monitor a Kafka topic.
13 |     Assumes data of type 'str', 'unicode', 'int', 'float' or 'json' from Kafka.
14 |     Incoming documents are attempted deserialized into these types. Unknown types are passed as 'str'.
15 | 
16 |     Sockets:
17 |         output    (*)    : Document received on the monitored topic.
18 | 
19 |     Config:
20 |         hosts            = ["localhost:9092"]   : List of Kafka hosts.
21 |         zookeeper_hosts  = ["localhost:2181"]   : For balanced consumption via zookeeper.
22 |         topic            = "default_topic"      : Kafka topic to monitor.
23 |         consumer_group   = "default_group"      : Balanced consumer group.
24 |         compression      = False                : Whether to decompress the data read from Kafka.
25 |     """
26 | 
27 |     CONGESTION_SLEEP_TIME = 10.0
28 |     WORK_TIME = 5.0
29 | 
30 |     def __init__(self, **kwargs):
31 |         super(KafkaMonitor, self).__init__(**kwargs)
32 | 
33 |         self.output = self.create_socket("output", None, "Document received on the monitored topic.")
34 | 
35 |         self.config.set_default(
36 |             hosts            = ["localhost:9092"],
37 |             zookeeper_hosts  = ["localhost:2181"],
38 |             topic            = "default_topic",
39 |             consumer_group   = "default_group",
40 |             compression      = False
41 |         )
42 | 
43 |         self._client = None
44 |         self._consumer = None
45 | 
46 |     #region Processor stuff
47 | 
48 |     def on_open(self):
49 |         self.count = 0
50 |         self._client = KafkaClient(",".join(self.config.hosts))
51 |         topic = self._client.topics[self.config.topic]
52 |         self._consumer = topic.get_balanced_consumer(
53 |             auto_commit_enable = True,
54 |             consumer_group     = self.config.consumer_group,
55 |             zookeeper_connect  = ",".join(self.config.zookeeper_hosts)
56 |         )
57 | 
58 |         self.log.info("Connected to Kafka topic '%s', balanced via zookeeper." % self.config.topic)
59 | 
60 |     def on_close(self):
61 |         if self._client:
62 |             self._consumer.stop()
63 |             #del self._consumer
64 |             self.log.info("Kafka consumer stopped.")
65 |             # Can't find any way to close the connection or ask it to release resources, so I try a 'del'.
66 |             #del self._client
67 |             self._client = None
68 |             self.log.debug("Connection to Kafka deleted.")
69 | 
70 |     #endregion Processor stuff
71 | 
72 |     #region Generator stuff
73 | 
74 |     def on_startup(self):
75 |         self.count = 0
76 | 
77 |     def on_tick(self):
78 | 
79 |         congested = self.congestion()
80 |         if congested:
81 |             self.log.debug("Congestion in dependent processor '%s'; sleeping %d seconds." % (congested.name, self.CONGESTION_SLEEP_TIME))
82 |             self.congestion_sleep(self.CONGESTION_SLEEP_TIME)
83 |         else:
84 |             # Read as much as we can for WORK_TIME seconds, then return to the controlling
85 |             # loop. This way this processor should hang a maximum of WORK_TIME seconds
86 |             # before accepting control commands.
87 |             start_time = time.time()
88 |             while True:
89 |                 if self.end_tick_reason:
90 |                     return
91 |                 if time.time() - start_time > self.WORK_TIME:
92 |                     self.log.debug("Work time exceeded %s seconds. Returning to control loop." % self.WORK_TIME)
93 |                     return
94 |                 try:
95 |                     kafka_message = self._consumer.consume(block=False)
96 |                 except Exception as e:
97 |                     self.log.error("Error consuming Kafka. Aborting. [%s]" % e.__class__.__name__)
98 |                     self.abort()
99 |                     return
100 |                 if kafka_message is None:
101 |                     return
102 | 
103 |                 self.count += 1
104 | 
105 |                 if not self.output.has_output:  # Don't bother with further message processing, in this case.
106 |                     return
107 | 
108 |                 document = self._decode_message(kafka_message.value)
109 |                 if document is not None:
110 |                     self.output.send(document)
111 | 
112 |     def _decode_message(self, kafka_data):
113 | 
114 |         # print "INCOMING KAFKA DATA: [%s]" % kafka_data
115 | 
116 |         if not kafka_data:
117 |             return None
118 | 
119 |         if self.config.compression:
120 |             kafka_data = zlib.decompress(kafka_data)
121 | 
122 |         msg_type = None
123 |         document = None
124 |         try:
125 |             jj = json.loads(kafka_data)
126 |             # kafka_data = tojson({"type": msg_type, "data": data})
127 |         except (TypeError, ValueError) as e:
128 |             self.doclog.warning("JSON deserialization failed: %s" % e.message)
129 |             return None
130 |         msg_type = jj.get("type")
131 |         document = jj.get("data")
132 |         if not msg_type or document is None:
133 |             return None
134 | 
135 |         if self.log.isEnabledFor(logging.TRACE):
136 |             self.log.trace("Received message of type '%s', Kafka payload size = %d." % (msg_type, len(kafka_data)))
137 |         return document
138 | 
139 |     #endregion Generator stuff
140 | 
--------------------------------------------------------------------------------
/eslib/procs/TcpWriter.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 | 
3 | from ..Generator import Generator
4 | from ..esdoc import tojson
5 | import socket
6 | from select import select
7 | 
8 | 
9 | class TcpWriter(Generator):
10 |     """
11 |     Write incoming documents to a TCP port.
12 |     Documents of type 'str' and 'unicode' are written as-is. Other types are written as JSON.
13 | 
14 |     NOTE: This processor operates as a Generator, but is considered to be passive; hence keepalive defaults to False.
15 | 
16 |     Connectors:
17 |         input    (*)    : Incoming documents to write to a TCP socket.
18 | 
19 |     Config:
20 |         hostname       = ""      : Defaults to any address the machine happens to have. Use "localhost" to enforce local only.
21 |         port           = 4000    :
22 |         reuse_address  = False   : Whether to allow reusing an existing TCP address/port.
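
    A minimal usage sketch (illustrative; 'producer' stands for any upstream processor
    with a compatible default socket):

        writer = TcpWriter(hostname="localhost", port=4000)
        writer.subscribe(producer)   # read documents from the producer's default socket
        writer.start()               # accept TCP clients and write incoming documents to them
        ...
        writer.stop()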
23 | """ 24 | def __init__(self, **kwargs): 25 | super(TcpWriter, self).__init__(**kwargs) 26 | self.create_connector(self._incoming, "input", None, "Incoming documents to write to a TCP socket.") 27 | 28 | self.keepalive = False # Passive of nature, hence this default 29 | 30 | self.config.set_default( 31 | hostname = "", 32 | port = 4000, 33 | reuse_address = False 34 | ) 35 | 36 | self._connections = [] # List of (socket, address) pairs 37 | self._socket = None 38 | 39 | def on_open(self): 40 | self._socket = None 41 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 42 | if self.config.reuse_address: 43 | sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 44 | address = (self.config.hostname #or socket.gethostname() 45 | , self.config.port) 46 | try: 47 | sock.bind(address) 48 | #sock.setblocking(0) 49 | sock.listen(0) # No backlog limit 50 | self.log.info("Listening for connections on %s:%d." % address) 51 | except socket.error as e: 52 | self.log.critical("Listener failed to bind to %s:%d. (errno=%d, message=%s)" % (self.config.hostname, self.config.port, e.errno, e.args[1])) 53 | raise e 54 | 55 | self._connections = [] 56 | self._socket = sock 57 | 58 | self.total = 0 59 | 60 | def on_close(self): 61 | if self._connections: 62 | for c in self._connections: 63 | s, a = c 64 | s.close() 65 | self._connections = [] 66 | if self._socket: 67 | self._socket.close() 68 | self._socket = None 69 | self.log.info("Listener closed.") 70 | 71 | @staticmethod 72 | def _get_conn(connections, sock): 73 | for c in connections: 74 | if c[0] == sock: 75 | return c 76 | return None 77 | 78 | def on_tick(self): 79 | if not self.running or self.stopping: 80 | return 81 | 82 | r, w, e = select([self._socket], [], [self._socket], 0) # Non-blocking 83 | if e: 84 | self.log.warning("Error on server socket -- now what?") 85 | if r: 86 | # We have one or more new connections pending. Get one and return to run loop. 87 | c = self._socket.accept() 88 | s, a = c 89 | self.log.info("New connection from %s:%d." % a) 90 | self._connections.append(c) 91 | 92 | # Check for dead connections 93 | connections = self._connections[:] 94 | sockets = [s for s,a in connections] 95 | r, w, e = select(sockets, [], sockets, 0) 96 | if e: 97 | self.log.warning("Error on connected socket -- now what?") 98 | for s in r: 99 | # This socket is intended for write only, but since there is now data, 100 | # we read a bit just to work down the input buffer. If it is empty, getting 101 | # here means the connection has been closed on the other end, and we can remove it. 102 | data = s.recv(1024) 103 | if not data: 104 | s.close() 105 | c = self._get_conn(connections, s) 106 | if c and c in self._connections: 107 | self.log.info("Connection closed by client %s:%d." % c[1]) 108 | self._connections.remove(c) 109 | else: 110 | self.log.info("Unknown connection closed by client.") 111 | 112 | def _send(self, data): 113 | connections = self._connections[:] 114 | for c in connections: 115 | s, a = c 116 | try: 117 | s.sendall((data + "\n").encode("utf8")) 118 | #s.flush() 119 | except socket.error as e: 120 | if e.errno == socket.errno.EPIPE: # Broken pipe 121 | self.log.info("Connection closed by client %s:%d. (Broken pipe)" % a) 122 | else: 123 | self.log.error("Unhandled error writing to socket from %s:%d. Disconnecting. 
(errno=%d, message=%s)" % 124 | (a[0], a[1], e.errno, e.args[1])) 125 | self._connections.remove(c) 126 | 127 | def _incoming(self, document): 128 | if document: 129 | data = document 130 | if not type(document) in [str, unicode]: 131 | data = tojson(document) 132 | self._send(data) 133 | 134 | self.count += 1 135 | self.total += 1 136 | -------------------------------------------------------------------------------- /test/test_procs/test_entity_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Hans Terje Bakke' 4 | 5 | import unittest 6 | from eslib.procs.EntityExtractor import EntityExtractor 7 | from eslib import esdoc 8 | from eslib import unique 9 | 10 | 11 | class TestEntityExtractor(unittest.TestCase): 12 | entities = \ 13 | [ 14 | { 15 | "category": "webpage", 16 | "name": "nrk", 17 | "match": [ 18 | { "type": "exact", "pattern": "nrk.no" }, 19 | #{ "type": "iprange", "value": "160.68.205.231/16" } 20 | ] 21 | }, 22 | { 23 | "category": "targets", 24 | "name": "comperio", 25 | "match": [ 26 | { "type": "exact", "pattern": u"hans terje bøkke", "weight": 0.8 }, 27 | { "type": "exact", "pattern": "10.0.0.100", "weight": 0.5 }, 28 | { "type": "exact", "pattern": "comperio" } 29 | ] 30 | }, 31 | { 32 | "category": "targets", 33 | "name": "IBM", 34 | "match": [ 35 | { "type": "exact", "pattern": "ibm" } 36 | ] 37 | }, 38 | { 39 | "category": "creditcards", 40 | "name": "creditcard", # The name should become the credit card number 41 | "match": [ { "type": "creditcard" } ] 42 | }, 43 | { 44 | "category": "emails", 45 | "name": "email", # The name should become the email address 46 | "match": [ { "type": "email" } ] 47 | }, 48 | ] 49 | 50 | def test_defaults(self): 51 | ex = EntityExtractor() 52 | ex.on_open() 53 | 54 | self.assertEqual(ex.config.fields, []) 55 | self.assertEqual(ex.config.target, "entities") 56 | self.assertEqual(ex.config.entities, []) 57 | 58 | def test_extract_str(self): 59 | ex = EntityExtractor() 60 | ex.config.entities = self.entities 61 | ex.on_open() 62 | 63 | s = u"As mentioned ø on nrk.no, Hans Terje Bøkke works for Comperio. His PC has IP address 10.0.0.100. " + \ 64 | "He never uses his credit card: 1234.5678.9876.5432. You can contact him on " + \ 65 | "hans.terje.bakke@gmail.com. But balle.klorin@wesenlund.no will not work for IBM." 66 | 67 | extracted = ex._extract(None, s) 68 | elist = list(extracted) 69 | 70 | for e in elist: 71 | print e 72 | 73 | self.assertEqual(len(elist), 8) 74 | 75 | 76 | def _verify(self, entities): 77 | webpages = unique([x["name"] for x in entities["webpage"]]) 78 | targets = unique([x["name"] for x in entities["targets"]]) 79 | emails = unique([x["name"] for x in entities["emails"]]) 80 | creditcards = unique([x["name"] for x in entities["creditcards"]]) 81 | 82 | print "WEBPAGE:", webpages 83 | print "TARGETS:", targets 84 | print "EMAILS :", emails 85 | print "CREDITC:", creditcards 86 | 87 | self.assertEqual(['nrk'], webpages) 88 | self.assertEqual(['comperio', 'IBM'], targets) 89 | self.assertEqual(['hans.terje.bakke@gmail.com', 'balle.klorin@wesenlund.no'], emails) 90 | self.assertEqual(['1234.5678.9876.5432'], creditcards) 91 | 92 | def test_merge(self): 93 | ex = EntityExtractor() 94 | ex.config.entities = self.entities 95 | ex.on_open() 96 | 97 | s = "As mentioned on nrk.no, Hans Terje Bakke works for Comperio. His PC has IP address 10.0.0.100. " + \ 98 | "He never uses his credit card: 1234.5678.9876.5432. 
You can contact him on " + \ 99 | "hans.terje.bakke@gmail.com. But balle.klorin@wesenlund.no will not work for IBM." 100 | 101 | extracted = ex._extract(None, s) 102 | entities = ex._merge(extracted) 103 | 104 | self._verify(entities) 105 | 106 | def test_doc_through(self): 107 | 108 | ex = EntityExtractor() 109 | ex.config.entities = self.entities 110 | 111 | doc = {"_id": "123", "_source": { 112 | "field1": "As mentioned on nrk.no, Hans Terje Bakke works for Comperio.", 113 | "field2": "He never uses his credit card: 1234.5678.9876.5432.", 114 | "field3": "You can contact him on hans.terje.bakke@gmail.com.", 115 | "subsection" : { 116 | "subfield": "But balle.klorin@wesenlund.no will not work for IBM." 117 | }, 118 | "entities": { "old" : "stuff" } 119 | }} 120 | 121 | ex.config.fields = ["field1", "field2", "field3", "subsection.subfield"] 122 | 123 | output = [] 124 | ex.add_callback(lambda proc, doc: output.append(doc)) 125 | ex.start() 126 | ex.put(doc) 127 | ex.stop() 128 | ex.wait() 129 | 130 | #print output[0] 131 | 132 | new_doc = output[0] 133 | entities = new_doc["_source"]["entities"] 134 | 135 | self._verify(entities) 136 | 137 | # Check that old and new doc are not the same 138 | self.assertFalse(doc is new_doc) 139 | 140 | # Check that the previous entities still exist in the new document 141 | old = esdoc.getfield(new_doc, "_source.entities.old") 142 | self.assertEqual(old, "stuff") 143 | 144 | # Check that the new entities do not exist in the original document 145 | self.assertTrue(esdoc.getfield(doc, "_source.entities.webpage") is None) 146 | self.assertTrue(esdoc.getfield(new_doc, "_source.entities.webpage") is not None) 147 | -------------------------------------------------------------------------------- /test/test_connections.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from eslib import Processor 3 | 4 | class Connections(object): 5 | 6 | def create_processors(self): 7 | self.a = Processor(name="processor_a") 8 | self.b = Processor(name="processor_b") 9 | self.c = Processor(name="processor_c") 10 | self.d = Processor(name="processor_d") 11 | 12 | def create_terminals(self): 13 | self.a.create_connector(None, "input") # Protocol anything 14 | self.a.create_socket("output", "proto_doc") 15 | self.b.create_connector(None, "input", "proto_doc") 16 | self.b.create_socket("output_doc", "proto_doc") 17 | self.b.create_socket("output_str", "proto_str") 18 | self.c.create_connector(None, "input_doc", "proto_doc") 19 | self.c.create_connector(None, "input_str", "proto_str") 20 | self.c.create_socket("output_doc", "proto_doc") 21 | self.c.create_socket("output_ext", "proto_doc.extended") 22 | self.c.create_socket("output_anything") 23 | self.d.create_connector(None, "input_anything") 24 | self.d.create_connector(None, "input_doc", "proto_doc") 25 | self.d.create_connector(None, "input_ext", "proto_doc.extended") 26 | 27 | def connect_terminals(self): 28 | self.b.subscribe(self.a) # Ok call, only one socket and connector 29 | self.c.subscribe(self.b, "output_doc", "input_doc") # Ok 30 | self.c.subscribe(self.a, connector_name="input_doc") # Ok, a's only socket name can be omitted 31 | self.d.subscribe(self.c, "output_doc", "input_anything") # Ok, any input accepted 32 | self.d.subscribe(self.c, "output_ext", "input_ext") # Ok, protocol exact match 33 | 34 | 35 | class TestConnections(unittest.TestCase, Connections): 36 | 37 | def test_create_processors(self): 38 | self.create_processors() 39 | 40 | 
self.assertIsNotNone(self.a, "Processor a None") 41 | self.assertIsNotNone(self.b, "Processor b None") 42 | self.assertIsNotNone(self.c, "Processor c None") 43 | self.assertIsNotNone(self.d, "Processor d None") 44 | 45 | def test_create_terminals(self): 46 | self.create_processors() 47 | self.create_terminals() 48 | 49 | self.assertTrue(len(self.a.connectors) == 1, "Expected 1 connector for a") 50 | self.assertTrue(len(self.b.connectors) == 1, "Expected 1 connector for b") 51 | self.assertTrue(len(self.c.connectors) == 2, "Expected 2 connectors for c") 52 | self.assertTrue(len(self.d.connectors) == 3, "Expected 3 connectors for d") 53 | 54 | self.assertTrue(len(self.a.sockets) == 1, "Expected 1 socket for a") 55 | self.assertTrue(len(self.b.sockets) == 2, "Expected 2 sockets for b") 56 | self.assertTrue(len(self.c.sockets) == 3, "Expected 3 sockets for c") 57 | self.assertTrue(len(self.d.sockets) == 0, "Expected 0 sockets for d") 58 | 59 | 60 | def test_connect(self): 61 | self.create_processors() 62 | self.create_terminals() 63 | self.connect_terminals() 64 | 65 | # Cannot decide socket, should fail: 66 | self.assertRaises(Exception, self.c.subscribe, (self.b,)) 67 | # Ok for socket, but still cannot decide which one of C's connectors: 68 | self.assertRaises(Exception, self.c.subscribe, (self.b, "output_doc")) 69 | # Protocol error: 70 | self.assertRaises(Exception, self.c.subscribe, (self.b, "output_doc", "input_str")) 71 | # Should fail on protocol error: 72 | self.assertRaises(Exception, self.d.subscribe, (self.c, "output_anything", "input_doc")) 73 | # Protocol error: 74 | self.assertRaises(Exception, self.d.subscribe, (self.c, "output_ext", "input_doc")) 75 | # Protocol error, connector more specific than socket: 76 | self.assertRaises(Exception, self.d.subscribe, (self.c, "output_doc", "input_ext")) 77 | 78 | # Do a quick check to see if expected number of connections are now ok 79 | self.assertTrue(len(self.a.sockets["output"].connections) == 2) # b and c 80 | self.assertTrue(len(self.b.connectors["input"].connections) == 1) # b 81 | self.assertTrue(len(self.b.sockets["output_doc"].connections) == 1) # c 82 | self.assertTrue(len(self.c.connectors["input_doc"].connections) == 2) # a and b 83 | self.assertTrue(len(self.c.sockets["output_doc"].connections) == 1) # d 84 | self.assertTrue(len(self.c.sockets["output_ext"].connections) == 1) # d 85 | self.assertTrue(len(self.d.connectors["input_anything"].connections) == 1) # c 86 | self.assertTrue(len(self.d.connectors["input_ext"].connections) == 1) # c 87 | 88 | 89 | def test_connect2(self): 90 | self.create_processors() 91 | self.create_terminals() 92 | self.connect_terminals() 93 | 94 | self.b.unsubscribe() # unsubscribes all input connectors 95 | self.assertTrue(len(self.a.sockets["output"].connections) == 1) # only c left 96 | self.assertTrue(len(self.b.connectors["input"].connections) == 0) 97 | 98 | self.c.unsubscribe(self.a) 99 | self.c.unsubscribe(self.a, connector_name="input_doc") 100 | self.assertTrue(len(self.a.sockets["output"].connections) == 0) 101 | self.assertTrue(len(self.b.sockets["output_doc"].connections) == 1) # c remains 102 | self.assertTrue(len(self.c.connectors["input_doc"].connections) == 1) # only b left 103 | 104 | self.c.unsubscribe(connector_name="input_doc") 105 | self.assertTrue(len(self.b.sockets["output_doc"].connections) == 0) # c now also gone 106 | 107 | self.c.detach(self.d) # Should detach all connections to d 108 | self.assertTrue(len(self.c.sockets["output_doc"].connections) == 0) 109 | 
self.assertTrue(len(self.c.sockets["output_ext"].connections) == 0) 110 | self.assertTrue(len(self.d.connectors["input_anything"].connections) == 0) 111 | self.assertTrue(len(self.d.connectors["input_ext"].connections) == 0) 112 | 113 | 114 | def main(): 115 | unittest.main() 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /test/test_service/test_http_service.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ENDPOINT = "localhost:4000" 4 | 5 | import unittest 6 | from eslib.service import Service, HttpService, status 7 | from eslib.procs import Timer, Transformer 8 | import requests, time, threading 9 | 10 | import eslib.prog 11 | eslib.prog.initlogs() 12 | 13 | class TestService(Service): 14 | def __init__(self, **kwargs): 15 | super(TestService, self).__init__(**kwargs) 16 | 17 | self.ending = False 18 | self.requires_metadata = False 19 | 20 | def on_setup(self): 21 | self._timer = Timer(service=self, actions=[(3, 3, "ping")]) 22 | self._pc = Transformer(service=self, func=self._func) 23 | self._pc.subscribe(self._timer) 24 | 25 | self.register_procs(self._timer, self._pc) 26 | 27 | return True 28 | 29 | def _func(self, proc, doc): 30 | print doc 31 | if self.ending: 32 | print "FUNC STOP" 33 | self._timer.stop() 34 | 35 | def is_processing(self): 36 | return self._pc.running 37 | 38 | def is_aborted(self): 39 | return self._pc.aborted 40 | 41 | def is_suspended(self): 42 | return self._pc.suspended 43 | 44 | # on_start_processing (should be ran async) 45 | def on_processing_start(self): 46 | self._timer.start() 47 | time.sleep(1) # Simulate that it takes some time 48 | return True 49 | 50 | def on_processing_stop(self): 51 | time.sleep(1) # Simulate that it takes some time 52 | self._timer.stop() 53 | self._pc.wait() 54 | return True 55 | 56 | # on_abort_processing 57 | def on_processing_abort(self): 58 | self._timer.abort() 59 | self._pc.stop() 60 | return True 61 | 62 | 63 | # TODO: on_update_metadata 64 | 65 | 66 | class HttpTestService(HttpService, TestService): 67 | 68 | def __init__(self, **kwargs): 69 | super(HttpTestService, self).__init__(**kwargs) 70 | 71 | # Add management routes to functions 72 | self.add_route(self._test1, "GET", "test1/{id}/{?mode}", ["mode"]) 73 | 74 | def _test1(self, request_handler, payload, **kwargs): 75 | parameters = kwargs 76 | print "TEST1:", parameters 77 | return {"echo": parameters} 78 | 79 | class TestTestService(unittest.TestCase): 80 | 81 | def test_run_shutdown(self): 82 | p = TestService()#mgmt_endpoint=ENDPOINT) # localhost:4444 by default 83 | p.ending = False 84 | 85 | print "Starting service" 86 | print "Asserting '%s' (not started)" % status.DOWN 87 | self.assertEqual(p.status, status.DOWN) 88 | 89 | p.run() 90 | # This does not require config, thus going straight from 'down' to 'idle' 91 | print "Asserting '%s'" % status.IDLE 92 | self.assertEqual(p.status, status.IDLE) 93 | 94 | print "Shutting down" 95 | p.shutdown(wait=True) 96 | print "Asserting '%s' (shut down)" % status.DOWN 97 | self.assertEqual(p.status, status.DOWN) 98 | 99 | def test_lifecycle(self): 100 | p = TestService()#mgmt_endpoint=ENDPOINT) # localhost:4444 by default 101 | p.ending = False 102 | 103 | print "Starting service" 104 | print "Asserting '%s' (not started)" % status.DOWN 105 | self.assertEqual(status.DOWN, p.status) 106 | 107 | p.run() 108 | # This does not require config, thus going straight 
from 'down' to 'idle' 109 | print "Asserting '%s'" % status.IDLE 110 | self.assertEqual(status.IDLE, p.status) 111 | 112 | print "Starting processing" 113 | p.processing_start() 114 | print "Asserting '%s'" % status.PROCESSING 115 | self.assertEqual(status.PROCESSING, p.status) 116 | 117 | time.sleep(1) 118 | print "Stopping processing" 119 | p.processing_stop() 120 | time.sleep(0.1) 121 | print "Asserting '%s'" % status.STOPPING 122 | self.assertEqual(status.STOPPING, p.status) 123 | 124 | print "Waiting for processing to stop" 125 | p.processing_wait() 126 | print "Asserting '%s' (stopped)" % status.IDLE 127 | self.assertEqual(status.IDLE, p.status) 128 | 129 | print "Starting processing" 130 | p.processing_start() 131 | print "Asserting '%s'" % status.PROCESSING 132 | self.assertEqual(status.PROCESSING, p.status) 133 | 134 | time.sleep(1) 135 | print "Aborting processing" 136 | p.processing_abort() 137 | print "Asserting '%s'" % status.ABORTED 138 | self.assertEqual(status.ABORTED, p.status) 139 | 140 | print "Starting processing" 141 | p.processing_start() 142 | print "Asserting '%s'" % status.PROCESSING 143 | self.assertEqual(status.PROCESSING, p.status) 144 | 145 | print "Shutting down" 146 | p.shutdown() 147 | #threading.Thread(target=lambda : p.shutdown()).start() 148 | time.sleep(0.1) 149 | print "Asserting '%s'" % status.CLOSING 150 | self.assertEqual(status.CLOSING, p.status) 151 | 152 | print "Waiting for shutdown" 153 | p.wait() 154 | print "Asserting '%s' (shut down)" % status.DOWN 155 | self.assertEqual(status.DOWN, p.status) 156 | 157 | def test_lifecycle_ending_service(self): 158 | p = TestService()#mgmt_endpoint=ENDPOINT) # localhost:4444 by default 159 | p.ending = True 160 | 161 | print "Starting service" 162 | print "Asserting '%s' (not started)" % status.DOWN 163 | self.assertEqual(status.DOWN, p.status) 164 | 165 | p.run() 166 | # This does not require config, thus going straight from 'down' to 'idle' 167 | print "Asserting '%s'" % status.IDLE 168 | self.assertEqual(status.IDLE, p.status) 169 | 170 | print "Starting processing (take 1)" 171 | p.processing_start() 172 | print "Asserting '%s'" % status.PROCESSING 173 | self.assertEqual(status.PROCESSING, p.status) 174 | 175 | print "Waiting for processing to finish (take 1)" 176 | p.processing_wait() 177 | print "Asserting '%s' (stopped)" % status.IDLE 178 | self.assertEqual(status.IDLE, p.status) 179 | 180 | print "Starting processing (take 2)" 181 | p.processing_start() 182 | print "Asserting '%s'" % status.PROCESSING 183 | self.assertEqual(status.PROCESSING, p.status) 184 | 185 | print "Waiting for processing to finish (take 2)" 186 | p.processing_wait() 187 | print "Asserting '%s' (stopped)" % status.IDLE 188 | self.assertEqual(status.IDLE, p.status) 189 | 190 | print "Shutting down (waiting)" 191 | p.shutdown(wait=True) 192 | print "Asserting '%s' (shut down)" % status.DOWN 193 | self.assertEqual(status.DOWN, p.status) 194 | 195 | def main(): 196 | unittest.main() 197 | 198 | if __name__ == "__main__": 199 | main() 200 | -------------------------------------------------------------------------------- /eslib/web.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.web 5 | ~~~~~~~~~~ 6 | 7 | Module containing operations against web servers and on web content. 
8 | """ 9 | 10 | 11 | __all__ = ("WebGetter", "detect_language", "remove_boilerplate") 12 | 13 | 14 | import requests 15 | import eslib 16 | from collections import Counter 17 | from textblob import TextBlob 18 | import justext 19 | from datetime import datetime, timedelta 20 | from email.utils import parsedate_tz, mktime_tz 21 | 22 | class WebGetter(object): 23 | def __init__(self, max_size=-1, content_types=None): 24 | self.content_types = content_types or ["text/plain", "text/html", "text/xml", "application/xml"] 25 | self.max_size = 1024*1024 # 1 MB 26 | if max_size > 0: self.max_size = max_size 27 | 28 | def get(self, url): 29 | # Fetch web page 30 | try: 31 | res = requests.get(url, verify=False) 32 | res.raise_for_status 33 | except: 34 | msg = "URL failed: %s" % url 35 | raise IOError(msg) 36 | if not res.ok: 37 | msg = "URL not ok, status_code=%s for URL: %s" % (res.status_code, url) 38 | raise IOError(msg) 39 | 40 | # Verify allowed content type 41 | content_type = (res.headers.get("content-type") or "").split(";")[0] 42 | if not content_type in self.content_types: 43 | msg = "Skipping web page with content type '%s', URL: %s" % (content_type, url) 44 | raise ValueError(msg) 45 | 46 | # Size check with reported content size 47 | if self.max_size > 0: 48 | size = int(res.headers.get("content-length") or -1) 49 | if size > 0 and size > self.max_size: 50 | msg = "Skipping too large web page (%s), URL: %s" % (eslib.debug.byteSizeString(size, 2), url) 51 | raise ValueError(msg) 52 | 53 | # Find timestamp 54 | date_str = res.headers.get("date") 55 | if not date_str: 56 | timestamp = datetime.utcnow() 57 | else: 58 | t = mktime_tz(parsedate_tz(date_str)) 59 | timestamp = datetime(1970, 1, 1) + timedelta(seconds=t) 60 | 61 | # Extract vitals from web result 62 | id = url # res.url 63 | encoding = res.encoding 64 | content = res.text 65 | 66 | # Repeat size check with actual content size 67 | if self.max_size > 0: 68 | size = len(content) 69 | if size > self.max_size: 70 | msg = "Skipping too large web page (%s), URL: %s" % (eslib.debug.byteSizeString(size, 2), url) 71 | raise ValueError(msg) 72 | 73 | body = {"content": content, "content_type": content_type, "encoding": encoding, "date": timestamp} 74 | return body 75 | 76 | #region Language detection 77 | 78 | def detect_language(text, chunk_size=250, max_chunks=5): 79 | """ 80 | Detects language of the passed text. Returns majority detection on multiple chunks in order to avoid 81 | misclassification on text with boilerplate text of another language in the beginning of the string. 82 | 83 | Uses Google Translate REST API through the TextBlob library. 84 | 85 | :param text: str 86 | :param chunk_size: int Number of characters in each detection chunk. 87 | :param max_chunks: int Maximum number of chunks to run detection on. 88 | :return: str Google Translate language code. 
89 | """ 90 | n_chunks = int(max(min(len(text) / chunk_size, max_chunks), 1)) 91 | detections = [] 92 | 93 | for c in xrange(n_chunks): 94 | l = c * chunk_size 95 | u = max((c + 1) * chunk_size, len(text)) 96 | 97 | chunk = text[l:u] 98 | detections.append(TextBlob(chunk).detect_language()) 99 | 100 | counts = Counter(detections) 101 | 102 | return counts.most_common(n=1)[0][0] 103 | 104 | #endregion Language detection 105 | 106 | # #region Boilerplate removal 107 | 108 | # Map of correspondences between Google Translate and internal JusText 109 | # language codes 110 | GTRANS_JUSTEXT_LANG_MAP = { 111 | u'af': u'Afrikaans', 112 | u'sq': u'Albanian', 113 | u'ar': u'Arabic', 114 | u'az': u'Azerbaijani', 115 | u'eu': u'Basque', 116 | u'be': u'Belarusian', 117 | u'bg': u'Bulgarian', 118 | u'ca': u'Catalan', 119 | u'hr': u'Croatian', 120 | u'cz': u'Czech', 121 | u'da': u'Danish', 122 | u'nl': u'Dutch', 123 | u'en': u'English', 124 | u'eo': u'Esperanto', 125 | u'et': u'Estonian', 126 | u'fi': u'Finnish', 127 | u'fr': u'French', 128 | u'gl': u'Galician', 129 | u'ka': u'Georgian', 130 | u'de': u'German', 131 | u'el': u'Greek', 132 | u'gu': u'Gujarati', 133 | u'ht': u'Haitian', 134 | u'iw': u'Hebrew', 135 | u'hi': u'Hindi', 136 | u'hu': u'Hungarian', 137 | u'is': u'Icelandic', 138 | u'id': u'Indonesian', 139 | u'ga': u'Irish', 140 | u'it': u'Italian', 141 | u'kn': u'Kannada', 142 | u'ko': u'Korean', 143 | u'la': u'Latin', 144 | u'lv': u'Latvian', 145 | u'lt': u'Lithuanian', 146 | u'mk': u'Macedonian', 147 | u'ms': u'Malay', 148 | u'mt': u'Maltese', 149 | u'no': u'Norwegian_Bokmal', 150 | u'fa': u'Persian', 151 | u'pl': u'Polish', 152 | u'pt': u'Portuguese', 153 | u'ro': u'Romanian', 154 | u'ru': u'Russian', 155 | u'sr': u'Serbian', 156 | u'sk': u'Slovak', 157 | u'sl': u'Slovenian', 158 | u'es': u'Spanish', 159 | u'sw': u'Swahili', 160 | u'sv': u'Swedish', 161 | u'tl': u'Tagalog', 162 | u'ta': u'Tamil', 163 | u'te': u'Telugu', 164 | u'tr': u'Turkish', 165 | u'uk': u'Ukrainian', 166 | u'ur': u'Urdu', 167 | u'vi': u'Vietnamese', 168 | u'cy': u'Welsh'} 169 | 170 | def remove_boilerplate(page_str, lang, relaxed=False): 171 | """ 172 | Removes boilerplate from HTML documents. 173 | 174 | Uses JusText library. 175 | 176 | NOTE: quality dependent on correct language detection. 177 | 178 | :param page_str: str HTML page source. 179 | :param lang: str Google Translate language code. 180 | :param relaxed: boolean If True the span between the first and last good/near-good boilerplate match 181 | is returned. Short and bad segments in between are kept. 182 | :return: list List of non-boilerplate segments/paragraphs. 183 | """ 184 | if lang not in GTRANS_JUSTEXT_LANG_MAP: 185 | #raise AttributeError("Can not remove boilerplate for language code lang='%s'." 
% lang) 186 | return [] 187 | 188 | jt_lang = GTRANS_JUSTEXT_LANG_MAP[lang] 189 | 190 | paragraphs = justext.justext(page_str, justext.get_stoplist(jt_lang)) 191 | 192 | if relaxed: 193 | good_indexes = [paragraphs.index(p) for p in paragraphs if p.class_type in ['near-good', 'good']] 194 | 195 | if len(good_indexes) == 0: 196 | return [] 197 | 198 | return [paragraph.text for paragraph in paragraphs[min(good_indexes):max(good_indexes) + 1]] 199 | else: 200 | return [paragraph.text for paragraph in paragraphs if paragraph.class_type in ['near-good', 'good', 'short']] 201 | 202 | #endregion Boilerplate removal 203 | -------------------------------------------------------------------------------- /eslib/procs/RabbitmqMonitor.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Monitor import Monitor 4 | from .RabbitmqBase import RabbitmqBase 5 | import pika 6 | import json, time 7 | 8 | class RabbitmqMonitor(Monitor, RabbitmqBase): 9 | """ 10 | Monitor a queue in RabbitMQ. 11 | Assumes data of type 'str', 'unicode', 'int', 'float' or 'json' from RabbitMQ. 12 | Incoming documents are deserialized into these types where possible. Unknown types are passed on as 'str'. 13 | 14 | Sockets: 15 | output (*) : Document received on monitored queue. 16 | 17 | Config: 18 | host = localhost : 19 | port = 5672 : 20 | admin_port = 15672 : 21 | username = guest : 22 | password = guest : 23 | virtual_host = None : 24 | exchange = None : 25 | queue = "default" : Not used if 'exchange' is specified. 26 | consuming = True : Consume from the queue, rather than listen on an 27 | exclusive queue that is deleted on disconnect. 28 | Non-consuming behaviour only works with an 'exchange'. 29 | max_reconnects = 3 : 30 | reconnect_timeout = 3 : 31 | """ 32 | 33 | CALC_TOTAL = True # Whether to check on the RabbitMQ queue at intervals and calculate a total 34 | # from the current count and what remains in the queue. It thus becomes a moving 35 | # target for ETA calculations.
36 | CALC_TOTAL_INTERVAL = 10.0 # seconds 37 | 38 | _is_reader = True 39 | 40 | def __init__(self, **kwargs): 41 | super(RabbitmqMonitor, self).__init__(**kwargs) 42 | 43 | self.output = self.create_socket("output", None, "Document received on monitored queue.") 44 | 45 | self.config.set_default( 46 | max_reconnects = 3, 47 | reconnect_timeout = 3 48 | ) 49 | 50 | self._reconnecting = 0 51 | self._last_calc_total = 0 52 | 53 | #region Processor stuff 54 | 55 | def on_open(self): 56 | self._open_connection() 57 | self.log.info("Connected to RabbitMQ.") 58 | 59 | def on_close(self): 60 | self._calc_total() 61 | if self._close_connection(): 62 | self.log.info("Connection to RabbitMQ closed.") 63 | 64 | #endregion Processor stuff 65 | 66 | #region Generator stuff 67 | 68 | def _start_consuming(self): 69 | self._consumer_tag = self._channel.basic_consume(self._callback, queue=self._queue_name, no_ack=True) 70 | 71 | def _stop_consuming(self): 72 | if self._channel: 73 | self._channel.basic_cancel(self._consumer_tag) 74 | 75 | def on_startup(self): 76 | if self.CALC_TOTAL: 77 | self.total = 0 # We will collect this from message queue, otherwise it should be set to None 78 | self._last_calc_total = 0 79 | self.count = 0 80 | self._start_consuming() 81 | 82 | def on_shutdown(self): 83 | self._stop_consuming() 84 | 85 | def on_abort(self): 86 | self._stop_consuming() 87 | 88 | def on_suspend(self): 89 | self._stop_consuming() 90 | 91 | def on_resume(self): 92 | self._start_consuming() 93 | 94 | def on_tick(self): 95 | if self._reconnecting > 0: 96 | self._reconnecting -= 1 97 | # Try to reconnect 98 | ok = False 99 | try: 100 | self._close_connection() 101 | self._open_connection() 102 | self.log.info("Successfully reconnected to RabbitMQ.") 103 | self.reconnecting = 0 # No longer attempting reconnects 104 | self._start_consuming() 105 | except pika.exceptions.AMQPConnectionError as e: 106 | if self._reconnecting > 0: 107 | timeout = self.config.reconnect_timeout 108 | self.log.warning("Reconnect to RabbitMQ failed. Waiting %d seconds." % timeout) 109 | time.sleep(timeout) 110 | else: 111 | self.log.critical("Missing connection to RabbitMQ. Max retries exceeded. Aborting.") 112 | self.abort() # We give up and abort 113 | return 114 | 115 | try: 116 | self._calc_total() 117 | congested = self.congestion() 118 | if congested: 119 | self.log.debug("Congestion in dependent processor '%s'; sleeping 10 seconds." % congested.name) 120 | self.congestion_sleep(10.0) 121 | else: 122 | self._channel.connection.process_data_events() 123 | except Exception as e: 124 | if self._reconnecting >= 0: 125 | self.log.info("No open connection to RabbitMQ. Trying to reconnect.") 126 | self._reconnecting = self.config.max_reconnects # Number of reconnect attempts; will start reconnecting on next tick 127 | 128 | def _calc_total(self): 129 | """ 130 | Calculate total number of messages. 131 | That is the sum of what is processed so far, and what remains in the queue. 
132 | """ 133 | if not self.CALC_TOTAL: 134 | return 135 | 136 | now = time.time() 137 | if now - self._last_calc_total > self.CALC_TOTAL_INTERVAL: 138 | try: 139 | self.total = self.get_queue_size() + self.count 140 | except Exception as e: 141 | self.log.warning("Failed to get queue size for queue '%s': %s" % (self._queue_name, e)) 142 | self._last_calc_total = now 143 | 144 | def _callback(self, channel, method, properties, body): 145 | #print "*** RabbitmqMonitor received:" 146 | #print "*** Properties:", properties 147 | #print "*** Body: ", body 148 | 149 | self.count += 1 150 | 151 | if not self.output.has_output: # Don't bother deserializing, etc, in this case 152 | return 153 | 154 | try: 155 | msg_type = properties.type 156 | document = None 157 | if msg_type == "json": 158 | try: 159 | document = json.loads(body) 160 | except (TypeError, ValueError) as e: 161 | self.doclog.warning(e.message) 162 | return 163 | elif msg_type in ["str", "unicode"]: 164 | document = body 165 | elif msg_type == "int": 166 | document = int(str(body)) 167 | elif msg_type == "float": 168 | document = float(str(body)) 169 | elif body: 170 | self.doclog.debug("Received document of type='%s'; converting to str.", msg_type) 171 | document = str(body) 172 | 173 | if document is not None: 174 | self.output.send(document) 175 | else: 176 | self.doclog.warning("Received empty document from RabbitMQ.") 177 | except Exception as e: 178 | self.log.error("An exception occurred inside the callback: %s" % e.message) 179 | 180 | #endregion Generator stuff 181 | -------------------------------------------------------------------------------- /eslib/procs/FileReader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Generator import Generator 4 | from select import select 5 | import codecs 6 | import sys, os, os.path, errno 7 | import json 8 | 9 | 10 | # TODO: Windows does not support file descriptors in select() 11 | # Alternative method to _read_as_much_as_possible() needed for Windows. 12 | 13 | 14 | class FileReader(Generator): 15 | """ 16 | Read documents from specified files or standard input. 17 | Reads the entire file as one document, or one document per line, according to config. 18 | 19 | Previous behaviour, removed: 20 | Documents starting with '{' are considered JSON documents and converted to 'dict', unless otherwise configured. 21 | All are now considered JSON documents and converted to 'dict', unless 'raw_lines' is set in the config. 22 | 23 | Sockets: 24 | output (*) : Documents read. Either entire file as one, or per line. Either raw string or dict. 25 | 26 | Config: 27 | filename = None : Appended to 'filenames', for convenience. 28 | filenames = None : If not set then 'stdin' is assumed. Can take a list of files. 29 | document_per_file = False : Read each file as one string to be treated as one document. 30 | raw_lines = False : Setting this to True treats each line as a string instead of JSON. 31 | strip_line = True : Whether to remove leading and trailing spaces on a line. 32 | skip_blank_line = True : Whether to skip empty lines (after stripping). 33 | skip_comment_line = True : Whether to skip comment lines. 34 | comment_prefix = "#" : Lines beginning with this string are considered comment lines if 35 | 'skip_comment_line' is True. 36 | """ 37 | 38 | def __init__(self, **kwargs): 39 | super(FileReader, self).__init__(**kwargs) 40 | self.output = self.create_socket("output", None, "Documents read. Either entire file as one, or per line.
Either raw string or dict.") 41 | 42 | self.config.set_default( 43 | filename = None, 44 | filenames = [], 45 | document_per_file = False, 46 | raw_lines = False, 47 | strip_line = True, 48 | skip_blank_line = True, 49 | skip_comment_line = True, 50 | comment_prefix = "#", 51 | ) 52 | self._filenames = [] 53 | self._file = None 54 | self._filename_index = 0 55 | 56 | def on_open(self): 57 | 58 | if self._file: 59 | self.log.error("on_open() attempted when _file exists -- should not be possible.") 60 | return 61 | 62 | # Create a more usable filenames array 63 | self._filenames = [] 64 | if self.config.filename: 65 | self._filenames.append(self.config.filename) 66 | if not self.config.filenames: 67 | if not self.config.filename: 68 | self._filenames.append(None) # stdin will be expected 69 | elif type(self.config.filenames) in [str, unicode]: 70 | self._filenames.append(self.config.filenames) 71 | else: 72 | self._filenames.extend(self.config.filenames) 73 | 74 | # Verify that files exists and that we can read them upon starting 75 | for filename in self._filenames: 76 | if filename: 77 | if not os.path.isfile(filename): 78 | e = IOError("File not found: %s" % filename) 79 | e.filename = filename 80 | e.errno = errno.ENOENT # No such file or directory 81 | raise e 82 | elif not os.access(filename, os.R_OK): 83 | e = IOError("Failed to read file: %s" % filename) 84 | e.filename = filename 85 | e.errno = errno.EACCES # Permission denied 86 | raise e 87 | 88 | def _close_file(self): 89 | if self._file and self._file != sys.stdin: 90 | self._file.close() 91 | self._file = None 92 | 93 | def on_close(self): 94 | # If we have an open file, this is our last chance to close it 95 | self._close_file() 96 | 97 | def _handle_data(self, incoming): 98 | data = incoming 99 | if data == None: 100 | return 101 | if self.config.strip_line: 102 | data = data.strip() 103 | if self.config.skip_comment_line and data.startswith(self.config.comment_prefix): 104 | return 105 | if self.config.skip_blank_line and not data: 106 | return 107 | if not self.config.raw_lines:# and data.startswith("{"): 108 | # NOTE: May raise ValueError: 109 | data = json.loads(data) 110 | self.output.send(data) 111 | 112 | 113 | def _read_as_much_as_possible(self): 114 | while True: 115 | # Read as much as we can 116 | r,w,e = select([self._file], [], [self._file], 0) 117 | if e: 118 | pass 119 | # Hm... this happens on every normal file... 120 | #self._close_file() 121 | #break 122 | if r: 123 | line = self._file.readline() 124 | line = codecs.decode(line, self._file.encoding or "UTF-8", "replace") 125 | 126 | if line: 127 | self._handle_data(line) 128 | # In case we should leave the loop while there is still input available: 129 | if self.end_tick_reason or self.suspend: 130 | break 131 | if not line: 132 | # We've reached the end of input 133 | self._close_file() 134 | break 135 | else: 136 | break 137 | 138 | # Candidate for Windows: 139 | def _read_as_much_as_possible_Windows(self): 140 | for line in self._file: 141 | line = codecs.decode(line, self._file.encoding or "UTF-8", "replace") 142 | self._handle_data(line) 143 | # In case we should leave the loop while there is still input available: 144 | if self.end_tick_reason or self.suspend: 145 | return 146 | self._close_file() 147 | 148 | def on_tick(self): 149 | 150 | if self._file: 151 | # We were working on a file... 
keep reading 152 | if self.config.document_per_file: 153 | all = self._file.read() 154 | self._handle_data(all) 155 | self._close_file() 156 | else: 157 | self._read_as_much_as_possible() 158 | elif self._filename_index >= len(self._filenames): 159 | # We're done! 160 | self.stop() 161 | return 162 | else: 163 | filename = self._filenames[self._filename_index] 164 | if not filename: 165 | self.log.debug("Starting read from stdin.") 166 | self._file = sys.stdin 167 | else: 168 | self.log.debug("Opening file '%s'." % filename) 169 | self._file = open(filename, "r" if self.config.document_per_file else "rt") 170 | self._filename_index += 1 171 | # Return from tick and reenter later with a file to process 172 | return 173 | --------------------------------------------------------------------------------
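
Usage note (not part of the repository sources above): the processors in this section follow the same wiring pattern as the connection tests — construct the processors, attach a consumer to a producer with subscribe(), then start/stop/wait. Below is a minimal, hypothetical sketch of streaming JSON documents from a file to connected TCP clients with FileReader and TcpWriter. The file name "documents.json" is made up for illustration, and the constructor keyword arguments are assumed to map onto the config keys documented in the class docstrings, as they do elsewhere in this codebase.

    from eslib.procs import FileReader, TcpWriter

    # Hypothetical input file; with the FileReader defaults each line is parsed as one JSON document.
    reader = FileReader(filenames=["documents.json"])
    # TcpWriter listens on localhost:4000 and serializes non-string documents to JSON before sending.
    writer = TcpWriter(hostname="localhost", port=4000)

    # Only one socket ('output') and one connector ('input') exist, so names can be omitted.
    writer.subscribe(reader)

    writer.start()   # open the listening socket so clients can connect
    reader.start()   # start reading; documents flow to the writer as they are read
    reader.wait()    # FileReader stops itself once all files are exhausted
    writer.stop()    # close client connections and the listening socket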