├── test
│   ├── __init__.py
│   ├── test_procs
│   │   ├── __init__.py
│   │   ├── data
│   │   │   ├── csv_no_header.csv
│   │   │   ├── csv_with_header.csv
│   │   │   ├── tweet_entity_removal.json
│   │   │   └── twitter_raw_mock.json
│   │   ├── wrapped_process_cmd.py
│   │   ├── wrapped_process_json.py
│   │   ├── test_html_remover.py
│   │   ├── wrapped_process.py
│   │   ├── test_twitter_monitor.py
│   │   ├── test_pattern_remover.py
│   │   ├── test_transformer.py
│   │   ├── test_http_monitor.py
│   │   ├── test_dateexpander.py
│   │   ├── test_tweet_entity_removal.py
│   │   ├── test_tweet_extractor.py
│   │   ├── test_csv_converter.py
│   │   ├── test_blacklist_filter.py
│   │   └── test_entity_extractor.py
│   ├── test_config.py
│   ├── test_protocol_compliance.py
│   ├── test_connections.py
│   └── test_service
│       └── test_http_service.py
├── MANIFEST.in
├── bin
│   ├── es-managerd.sh
│   ├── es-write
│   ├── es-cleantweet
│   └── es-read
├── eslib
│   ├── Monitor.py
│   ├── service
│   │   ├── __init__.py
│   │   ├── DummyService.py
│   │   ├── PipelineService.py
│   │   └── RemotingService.py
│   ├── Generator.py
│   ├── debug.py
│   ├── procs
│   │   ├── Transformer.py
│   │   ├── Throttle.py
│   │   ├── CLIReader.py
│   │   ├── FileWriter.py
│   │   ├── TwitterFollowerGetter.py
│   │   ├── TweetEntityRemover.py
│   │   ├── __init__.py
│   │   ├── DateExpander.py
│   │   ├── Timer.py
│   │   ├── TwitterUserGetter.py
│   │   ├── KafkaWriter.py
│   │   ├── SmtpMailer.py
│   │   ├── HtmlRemover.py
│   │   ├── TweetExtractor.py
│   │   ├── Neo4jReader.py
│   │   ├── CsvConverter.py
│   │   ├── RabbitmqWriter.py
│   │   ├── PatternRemover.py
│   │   ├── Neo4jWriter.py
│   │   ├── KafkaMonitor.py
│   │   ├── TcpWriter.py
│   │   ├── RabbitmqMonitor.py
│   │   └── FileReader.py
│   ├── Socket.py
│   ├── Configurable.py
│   ├── text.py
│   ├── TerminalInfo.py
│   ├── prog.py
│   ├── Terminal.py
│   ├── esdoc.py
│   ├── time.py
│   ├── __init__.py
│   ├── Connector.py
│   └── web.py
├── examples
│   ├── service_run_dir
│   │   └── config
│   │       ├── credentials.yaml
│   │       ├── services.yaml
│   │       ├── logging-console.yaml
│   │       └── logging.yaml
│   ├── resources
│   │   └── tweet.json
│   ├── entity_extractor.py
│   └── remoting
│       ├── RemotingClient.py
│       └── DummyRemotingService.py
├── DEVHELP.txt
├── .gitignore
├── setup.py
└── PROTOCOLS.md
/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/test_procs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | #include LICENSE
2 | include README.md
3 | include PROTOCOLS.md
4 | include examples/*
5 | recursive-exclude test *
6 |
--------------------------------------------------------------------------------
/test/test_procs/data/csv_no_header.csv:
--------------------------------------------------------------------------------
1 | 1,"hans terje","bakke","htb"
2 | 2,"eivind","elseth","eee"
3 | 3,"ole-kristian","villabø","okv"
4 |
--------------------------------------------------------------------------------
/test/test_procs/data/csv_with_header.csv:
--------------------------------------------------------------------------------
1 | "id","name","last name","initials"
2 | 1,"hans terje","bakke","htb"
3 | 2,"eivind","elseth","eee"
4 | 3,"ole-kristian","villabø","okv"
5 |
--------------------------------------------------------------------------------
/bin/es-managerd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Note: es-run-service must be in path
4 | # Note: ESLIB_SERVICE_DIR must be set, or -d option must be used
5 | exec ./es-service "$@" -c manager managerd -e localhost:5000 --start
6 |
--------------------------------------------------------------------------------
/eslib/Monitor.py:
--------------------------------------------------------------------------------
1 | from .Generator import Generator
2 |
3 | class Monitor(Generator):
4 | def __init__(self, **kwargs):
5 | super(Monitor, self).__init__(**kwargs)
6 |
7 | self.keepalive = True # A monitor never stops, unless told to
8 |
--------------------------------------------------------------------------------
/test/test_procs/wrapped_process_cmd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys, codecs
5 |
6 |
7 | print "INNER/STARTING"
8 |
9 | print "INNER/" + u" ".join([codecs.decode(x, "UTF-8") for x in sys.argv[1:]])
10 |
11 | print "INNER/EXITING"
12 |
--------------------------------------------------------------------------------
/examples/service_run_dir/config/credentials.yaml:
--------------------------------------------------------------------------------
1 | rabbitmq:
2 | username : xxxx
3 | password : xxxx
4 |
5 | twitter:
6 | consumer_key : xxxxxxxxxxxxxxxxxxxxx
7 | consumer_secret : xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
8 | access_token : xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
9 | access_token_secret : xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
10 |
--------------------------------------------------------------------------------
/DEVHELP.txt:
--------------------------------------------------------------------------------
1 | See this guide for preparations, setting up accounts and a ~/.pypirc file:
2 |
3 | http://peterdowns.com/posts/first-time-with-pypi.html
4 |
5 | To install the package from the source tree:
6 |
7 | pip install -e .
8 |
9 | To install from PyPI:
10 |
11 | pip install elasticsearch-eslib
12 |
13 | To upload package to PyPI test (pypitest) or live (pypi):
14 |
15 | python setup.py register -r pypi
16 | python setup.py sdist upload -r pypi
17 |
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | *.swp
3 |
4 |
5 | # Folders
6 | tmp/
7 | HTBTEST/
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Packages
13 | *.egg
14 | *.egg-info
15 | dist
16 | build
17 | eggs
18 | parts
19 | #bin
20 | var
21 | sdist
22 | develop-eggs
23 | .installed.cfg
24 | lib
25 | lib64
26 | __pycache__
27 |
28 | # Installer logs
29 | pip-log.txt
30 |
31 | # Unit test / coverage reports
32 | .coverage
33 | .tox
34 | nosetests.xml
35 |
36 | # Translations
37 | *.mo
38 |
39 | # Mr Developer
40 | .mr.developer.cfg
41 | .project
42 | .pydevproject
43 | .idea
44 |
--------------------------------------------------------------------------------
/examples/service_run_dir/config/services.yaml:
--------------------------------------------------------------------------------
1 | # Manager
2 |
3 | manager:
4 | name : "manager"
5 | management_endpoint : "localhost:5000"
6 |
7 | elasticsearch_hosts : ["localhost:9200"]
8 | elasticsearch_index : "management"
9 | dynamic_port_ranges : [["localhost", 5010, 5019]]
10 |
11 | # Dummy
12 |
13 | dummy:
14 | manager_endpoint : "localhost:5000"
15 | #management_endpoint : "localhost:5008"
16 | management_endpoint : "localhost"
17 |
18 | name : "dummy"
19 | frequency : 3
20 | lifespan : 120
21 |
--------------------------------------------------------------------------------
/test/test_procs/wrapped_process_json.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys, select, json
5 |
6 | def send(s):
7 | print json.dumps({"inner": s})
8 |
9 |
10 | try:
11 | while True:
12 | r,w,e = select.select([sys.stdin],[],[],0)
13 | if r:
14 | line = sys.stdin.readline()
15 | if line:
16 | dd = json.loads(line)
17 | s = dd.get("outer")
18 | if s:
19 | send("echo: %s" % s)
20 | else:
21 | send("stdin was hung up")
22 | break
23 | except KeyboardInterrupt:
24 | send("interrupted")
25 | send("finished")
26 |
--------------------------------------------------------------------------------
/examples/service_run_dir/config/logging-console.yaml:
--------------------------------------------------------------------------------
1 | version : 1
2 | disable_existing_loggers: False
3 | formatters:
4 | categories:
5 | format: "%(firstName) -15s %(serviceName) -15s %(className) -20s %(instanceName) -20s %(levelname) -10s %(message)s"
6 | rich:
7 | format: "%(asctime)s %(name) -30s %(className) -20s %(lineno) 5d %(funcName) -20s %(levelname) -10s %(message)s"
8 | compact:
9 | format: "%(name) -30s %(levelname) -10s %(message)s"
10 |
11 | handlers:
12 | console:
13 | class : logging.StreamHandler
14 | formatter : rich
15 | level : TRACE
16 | stream : ext://sys.stdout
17 | loggers:
18 | "":
19 | handlers : [console]
20 | level : WARNING
21 | servicelog:
22 | level : DEBUG
23 | proclog:
24 | level : DEBUG
25 | doclog:
26 | level : WARNING
27 |
--------------------------------------------------------------------------------
/eslib/service/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | eslib.service
5 | ~~~~~
6 |
7 | Base classes for wrapping document processing processors into processing graphs/pipelines and controlling them.
8 | """
9 |
10 | from .. import esdoc
11 |
12 |
13 | from .Service import Service, status
14 | from .HttpService import HttpService
15 | from .PipelineService import PipelineService
16 | from .ServiceManager import ServiceManager
17 | from .ServiceLauncher import ServiceLauncher
18 | from .DummyService import DummyService
19 | from .Client import Client
20 | from .RemotingService import RemotingService
21 |
22 |
23 | __all__ = (
24 | "Service",
25 | "HttpService",
26 | "PipelineService",
27 | "ServiceManager",
28 | "ServiceLauncher",
29 | "DummyService",
30 | "Client",
31 | "RemotingService"
32 | )
33 |
--------------------------------------------------------------------------------
/test/test_procs/test_html_remover.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 | from eslib.procs import HtmlRemover
5 |
6 | class TestHtmlRemover(unittest.TestCase):
7 |
8 | def test_str(self):
9 | dirty = 'Lady &amp; Landstrykeren'
10 |
11 | p = HtmlRemover()
12 | cleaned = p._clean(dirty)
13 | print "D=", dirty
14 | print "C=", cleaned
15 |
16 | self.assertTrue(cleaned == "Lady & Landstrykeren")
17 |
18 | def test_unicode(self):
19 | dirty = u'Lady &amp; Landstrykeren'
20 |
21 | p = HtmlRemover()
22 | cleaned = p._clean(dirty)
23 | print "D=", dirty
24 | print "C=", cleaned
25 |
26 | self.assertTrue(cleaned == u"Lady & Landstrykeren")
27 |
28 | def main():
29 | unittest.main()
30 |
31 | if __name__ == "__main__":
32 | main()
33 |
--------------------------------------------------------------------------------
/eslib/Generator.py:
--------------------------------------------------------------------------------
1 | from .Processor import Processor
2 |
3 | class Generator(Processor):
4 | def __init__(self, **kwargs):
5 | super(Generator, self).__init__(**kwargs)
6 | self.is_generator = True
7 |
8 | # These methods could/should be implemented by inheriting classes:
9 |
10 | # on_open(self) # from Processor
11 | # on_close(self) # from Processor
12 |
13 | # on_startup(self)
14 | # on_shutdown(self)
15 | # on_abort(self) # from Processor
16 | # on_tick(self)
17 | # on_suspend(self)
18 | # on_resume(self)
19 |
20 | # If on_tick finishes on its own without external stop call, call self.stop() from there when done.
21 |
22 | @property
23 | def end_tick_reason(self):
24 | "If 'aborted', 'stopping' or not 'running'. 'suspended' is not a reason to leave the tick; handle this yourself."
25 | return self.aborted or self.stopping or self.restarting or not self.running
26 |
--------------------------------------------------------------------------------
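A minimal sketch (not a file in this repository) of a Generator subclass that implements on_tick and calls
self.stop() when it finishes on its own, as the comments above describe. The CountingGenerator class, its
"output" socket and the count limit are hypothetical illustrations.

    from eslib.Generator import Generator

    class CountingGenerator(Generator):
        def __init__(self, **kwargs):
            super(CountingGenerator, self).__init__(**kwargs)
            self._output = self.create_socket("output", "int", "Counter values.")
            self._count = 0

        def on_startup(self):
            self._count = 0

        def on_tick(self):
            if self.end_tick_reason:
                return
            self._output.send(self._count)
            self._count += 1
            if self._count >= 3:
                self.stop()  # Finished on our own, so stop ourselves (see comment in Generator.py).

--------------------------------------------------------------------------------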
/eslib/debug.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | eslib.debug
5 | ~~~~~~~~~~~
6 |
7 | Module containing functions useful for debugging.
8 | """
9 | import os
10 |
11 |
12 | __all__ = ("byte_size_string", "get_memory_used")
13 |
14 |
15 | if os.name == 'posix':
16 | import resource
17 |
18 |
19 | def byte_size_string(bytes, decimals=1):
20 | kB = bytes / 1024.0
21 | MB = kB / 1024.0
22 | GB = MB / 1024.0
23 | s = None
24 | if GB > 1.0: s = "%.*f GB" % (decimals, GB)
25 | elif MB > 1.0: s = "%.*f MB" % (decimals, MB)
26 | elif kB > 1.0: s = "%.*f kB" % (decimals, kB)
27 | else: s = "%s B" % bytes
28 | return s
29 |
30 |
31 | def get_memory_used():
32 | """Get current memory usage by this process. Supposedly in KB."""
33 | if os.name == 'posix':
34 | return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
35 | else:
36 | return 0 # Don't want to risk an exception here..
37 | #raise NotImplementedError
38 |
--------------------------------------------------------------------------------
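A small usage sketch (not in the repository) of byte_size_string; the byte counts are arbitrary examples:

    from eslib.debug import byte_size_string

    print byte_size_string(1536)               # "1.5 kB"
    print byte_size_string(3 * 1024 * 1024)    # "3.0 MB"
    print byte_size_string(512)                # "512 B"

--------------------------------------------------------------------------------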
/examples/resources/tweet.json:
--------------------------------------------------------------------------------
1 | { "_timestamp": "2014-10-14T14:26:29Z", "_source": { "id": "522030691567931393", "geo": { "coordinates": [ 40.757023, -74.001698 ], "type": "Point" }, "lang": "en", "entities": { "urls": [ { "indices": [ 70, 92 ], "url": "http://instagram.com/p/uIt8BfP5Qp/" } ], "hashtags": [], "user_mentions": [ { "name": "Stella Chuu", "screen_name": "StellaChuuuuu", "indices": [ 20, 34 ], "id": "285369968" } ] }, "truncated": false, "text": "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv", "created_at": "2014-10-14T14:26:30Z", "source": "Instagram", "place": { "country_code": "US", "country": "United States" }, "user": { "name": "JJ Dillon", "id": "35273719", "lang": "en", "description": "i love beautiful women. like to party & have fun. very cool, calm, laid back person. i love video games, anime, movies, xbox 360, comic books, pop culture", "created_at": "2009-04-25T18:20:07Z", "profile_image_url": "http://pbs.twimg.com/profile_images/506599782908178432/c6pyAlfv_normal.jpeg", "screen_name": "JJDillon430", "location": "New York", "geo_enabled": true, "protected": false } }, "_id": "522030691567931393" }
2 |
--------------------------------------------------------------------------------
/examples/entity_extractor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | from eslib.procs import FileReader, FileWriter, EntityExtractor
6 |
7 | def listener(document):
8 | print document["_source"]["extracted"]
9 |
10 | entities = [
11 | {
12 | "category": "location",
13 | "name": "place",
14 | "match": [
15 | { "type": "exact", "pattern": "Convention" }
16 | #{ "type": "iprange", "value": "81.27.32.186/16" }
17 | ]
18 | },
19 | {
20 | "category": "agent",
21 | "name": "user",
22 | "match": [
23 | { "type": "exact", "pattern": "Jacob" }
24 | #{ "type": "iprange", "value": "81.27.32.186/16" }
25 | ]
26 | },
27 | {
28 | "category": "agent",
29 | "name": "user",
30 | "match": [
31 | { "type": "exact", "pattern": "stellachuuuuu" }
32 | #{ "type": "iprange", "value": "81.27.32.186/16" }
33 | ]
34 | }
35 | ]
36 |
37 |
38 | r = FileReader(filename = "resources/tweet.json")
39 | p = EntityExtractor(fields=["text"], target="extracted", entities=entities)
40 | w = FileWriter()
41 |
42 | p.subscribe(r)
43 | w.subscribe(p, "entities")
44 |
45 | r.start()
46 | w.wait() # Will finish once the reader is finished.
47 |
--------------------------------------------------------------------------------
/test/test_procs/wrapped_process.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import time, sys, signal
5 | from select import select
6 |
7 | #region Signal handling
8 |
9 | def _handler_SIGINT(signal, frame):
10 | print "INNER/RECEIVED SIGINT -- ignoring"
11 |
12 | def _handler_SIGTERM(signal, frame):
13 | global running
14 | print "INNER/RECEIVED SIGTERM -- terminating"
15 | running = False
16 |
17 | def _handler_SIGHUP(signal, frame):
18 | print "INNER/RECEIVED SIGHUP -- ignoring"
19 |
20 | signal.signal(signal.SIGINT , _handler_SIGINT )
21 | signal.signal(signal.SIGTERM, _handler_SIGTERM)
22 | signal.signal(signal.SIGHUP , _handler_SIGHUP )
23 |
24 | #endregion Signal handling
25 |
26 | running = True
27 |
28 | print "INNER/STARTING"
29 |
30 | while running:
31 | r,w,e = select([sys.stdin],[],[],0)
32 | if r:
33 | line = sys.stdin.readline()
34 | line = line.strip()
35 | if line:
36 | print "INNER/ECHO:", line
37 | if line == "*HANGUP*":
38 | print "INNER/HANGING UP ON *HANGUP* REQUEST"
39 | running = False
40 | elif line == "*RAISE*":
41 | raise Exception("INNER/RAISED EXCEPTION UPON *RAISE* REQUEST")
42 | else:
43 | print "INNER/STDIN WAS HUNG UP -- GOOD BYE"
44 | running = False
45 |
46 | print "INNER/EXITING"
47 |
--------------------------------------------------------------------------------
/test/test_config.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from eslib import Config
3 |
4 | class TestConfig(unittest.TestCase):
5 |
6 | def test_access(self):
7 | config = Config()
8 | config.set_default(a="A", b="B")
9 |
10 | print config["a"]
11 | print config.a
12 |
13 | self.assertEqual("A", config["a"])
14 | self.assertEqual("A", config.a)
15 |
16 | def test_assignment(self):
17 | config = Config()
18 | #config.set_default(a="A", b="B")
19 |
20 | config["a"] = "A"
21 | config.a = "B"
22 |
23 | print config["a"]
24 | print config.a
25 |
26 | self.assertEqual("B", config["a"])
27 | self.assertEqual("B", config.a)
28 |
29 |
30 | def test_defaults_and_overrides(self):
31 | config = Config()
32 | config.set_default(a="A", b="B", x="X")
33 |
34 | config.set(a="D", b=None)
35 |
36 | print config["a"]
37 | print config.a
38 | self.assertEqual("D", config.a)
39 |
40 | config.a = "C"
41 | print config.a
42 | self.assertEqual("C", config.a)
43 |
44 | print config["b"]
45 | print config.b
46 | self.assertEqual(None, config.b)
47 |
48 | print config.x
49 | self.assertEqual("X", config.x)
50 |
51 | def main():
52 | unittest.main()
53 |
54 | if __name__ == "__main__":
55 | main()
56 |
--------------------------------------------------------------------------------
/bin/es-write:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | from eslib.procs import ElasticsearchWriter, FileReader
6 | import eslib.prog
7 | import eslib.time
8 | import argparse, sys
9 |
10 |
11 | def main():
12 | help_i = "Which index to write documents to."
13 | help_t = "Which type to set on document (overrides incoming type)."
14 |
15 | parser = argparse.ArgumentParser(usage="\n %(prog)s -i index [-t type] [--host host] [--name name]")
16 | parser._actions[0].help = argparse.SUPPRESS
17 | parser.add_argument("-i", "--index" , help=help_i, required=True)
18 | parser.add_argument("-t", "--type" , help=help_t)
19 | parser.add_argument( "--host" , help="Elasticsearch host, format 'host:port' or just 'host'.", default=None)
20 | #parser.add_argument( "--debug" , action="store_true")
21 | parser.add_argument( "--name" , help="Process name.", default=None)
22 |
23 | if len(sys.argv) == 1:
24 | parser.print_usage()
25 | sys.exit(0)
26 |
27 | args = parser.parse_args()
28 |
29 | # Set up and run this processor
30 | w = ElasticsearchWriter(
31 | name = args.name or eslib.prog.progname(),
32 | hosts = [args.host] if args.host else [],
33 | index = args.index,
34 | doctype = args.type,
35 | batchsize = 1000,
36 | batchtime = 60.0
37 | )
38 |
39 | # if args.debug: w.debuglevel = 0
40 |
41 | r = FileReader()
42 | w.subscribe(r)
43 | r.start()
44 | w.wait()
45 |
46 |
47 | if __name__ == "__main__": main()
48 |
--------------------------------------------------------------------------------
/eslib/procs/Transformer.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Processor import Processor
4 |
5 | class Transformer(Processor):
6 | """
7 | Convert input to output protocol.
8 | Returns a LIST of zero or more documents converted to the output protocol.
9 |
10 | The following parameters are not part of the processors 'config' object, and can and must be set only upon
11 | instantiation:
12 |
13 | input_protocol = None
14 | output_protocol = None
15 | func = None # Mandatory! Must be a function returning a list (or generator) of zero or more
16 | documents complying with the output protocol. Function signature must be
17 | func(proc, doc), where proc is this transformer processor, so you can address it
18 | in your function.
19 | """
20 | def __init__(self, func=None, input_protocol=None, output_protocol=None, **kwargs):
21 | super(Transformer, self).__init__(**kwargs)
22 | self.create_connector(self._incoming, "input", input_protocol)
23 | self._output = self.create_socket("output", output_protocol)
24 |
25 | self._func = func
26 |
27 | def _incoming(self, incoming):
28 | try:
29 | ll = self._func(self, incoming)
30 | if ll:
31 | for outgoing in ll:
32 | if outgoing:
33 | self._output.send(outgoing)
34 | except Exception as e:
35 | self.doclog.exception("Error in protocol converter function call.")
36 |
--------------------------------------------------------------------------------
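A minimal usage sketch (not in the repository) of wiring a Transformer with a func(proc, doc) callable,
as the docstring above describes; the upper_func function and the protocol names are hypothetical:

    import sys
    from eslib.procs import Transformer

    # func must accept (proc, doc) and return/yield zero or more output documents.
    def upper_func(proc, doc):
        yield doc.upper()

    t = Transformer(func=upper_func, input_protocol="str", output_protocol="str")
    t.add_callback(lambda proc, doc: sys.stdout.write(doc + "\n"))

    t.start()
    t.put("hello")   # prints "HELLO" via the callback
    t.stop()
    t.wait()

--------------------------------------------------------------------------------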
/eslib/Socket.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .Terminal import Terminal
4 |
5 |
6 | class Socket(Terminal):
7 | "Output terminal in a Processor. Writes data to one or more subscribing connectors of matching protocol."
8 |
9 | def __init__(self, name, protocol=None, mimic=None):
10 | super(Socket, self).__init__(name, protocol)
11 | self.type = Socket
12 | self.callbacks = [] # List of methods for external callbacks
13 | self.mimic = mimic
14 |
15 | def send(self, document):
16 | "Send data to all subscribing connectors and callbacks."
17 |
18 | # Send data to all accepting connectors
19 | subscribers = self.connections[:]
20 | for subscriber in subscribers:
21 | if subscriber.accepting:
22 | subscriber.receive(document)
23 | # Finally, notify all subscribing callbacks
24 | for callback in self.callbacks:
25 | callback(self.owner, document)
26 |
27 | @property
28 | def has_output(self):
29 | if self.connections or self.callbacks:
30 | return True
31 | return False
32 |
33 | def _find_mimic_proto(self, visited=None):
34 | if not visited:
35 | visited = []
36 | if self.mimic and self.mimic.connections and not self in visited:
37 | visited.append(self)
38 | connected_socket = self.mimic.connections[0]
39 | return connected_socket._find_mimic_proto(visited)
40 | return self.protocol
41 |
42 | @property
43 | def mimiced_protocol(self):
44 | return self._find_mimic_proto()
45 |
--------------------------------------------------------------------------------
/test/test_procs/test_twitter_monitor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import unittest, json
5 | from eslib.procs import TwitterMonitor
6 |
7 | class TestTwitterMonitor(unittest.TestCase):
8 |
9 | def test_simple(self):
10 |
11 | # Load test data
12 | self_dir, _ = os.path.split(__file__)
13 | f = open(os.path.join(self_dir, "data/twitter_raw_mock.json"))
14 | j = json.load(f)
15 | f.close()
16 |
17 | m = TwitterMonitor()
18 | raw, tweet = m._decode(j)
19 |
20 | # Test tweet
21 | self.assertTrue(tweet["_id"] == "520149420122578944")
22 | self.assertTrue(tweet["_source"]["source"] == u"Twitter for BlackBerry®")
23 | self.assertTrue(tweet["_source"]["text"] == u'These clowns must hope that we never cum under attack from any force-r we capable of protecting ourselves?')
24 | self.assertTrue(str(tweet["_source"]["created_at"]) == "2014-10-09 09:51:00.328000")
25 | self.assertTrue("geo" in tweet["_source"])
26 | self.assertTrue(tweet["_source"]["lang"] == "en")
27 | self.assertTrue(tweet["_source"]["place"]["country"] == "South Africa")
28 | self.assertFalse("in_reply_to" in tweet["_source"])
29 | # User
30 | self.assertTrue(tweet["_source"]["user"]["id"] == "2196916282")
31 | self.assertTrue(tweet["_source"]["user"]["lang"] == "en")
32 | self.assertTrue(tweet["_source"]["user"]["name"] == "mark fester")
33 | self.assertFalse("description" in tweet["_source"]["user"])
34 | self.assertTrue(str(tweet["_source"]["user"]["created_at"]) == "2013-11-26 14:21:35")
35 |
36 | # Entities
37 | # // TODO
38 |
39 | def main():
40 | unittest.main()
41 |
42 | if __name__ == "__main__":
43 | main()
44 |
--------------------------------------------------------------------------------
/test/test_procs/data/tweet_entity_removal.json:
--------------------------------------------------------------------------------
1 | {
2 | "_timestamp": "2014-10-14T14:26:29Z",
3 | "_source": {
4 | "id": "522030691567931393",
5 | "geo": {
6 | "coordinates": [
7 | 40.757023,
8 | -74.001698
9 | ],
10 | "type": "Point"
11 | },
12 | "lang": "en",
13 | "entities": {
14 | "urls": [
15 | {
16 | "indices": [
17 | 70,
18 | 92
19 | ],
20 | "url": "http://instagram.com/p/uIt8BfP5Qp/"
21 | }
22 | ],
23 | "hashtags": [],
24 | "user_mentions": [
25 | {
26 | "name": "Stella Chuu",
27 | "screen_name": "StellaChuuuuu",
28 | "indices": [
29 | 20,
30 | 34
31 | ],
32 | "id": "285369968"
33 | }
34 | ]
35 | },
36 | "truncated": false,
37 | "text": "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv",
38 | "created_at": "2014-10-14T14:26:30Z",
39 | "source": "Instagram",
40 | "place": {
41 | "country_code": "US",
42 | "country": "United States"
43 | },
44 | "user": {
45 | "name": "JJ Dillon",
46 | "id": "35273719",
47 | "lang": "en",
48 | "description": "i love beautiful women. like to party & have fun. very cool, calm, laid back person. i love video games, anime, movies, xbox 360, comic books, pop culture",
49 | "created_at": "2009-04-25T18:20:07Z",
50 | "profile_image_url": "http://pbs.twimg.com/profile_images/506599782908178432/c6pyAlfv_normal.jpeg",
51 | "screen_name": "JJDillon430",
52 | "location": "New York",
53 | "geo_enabled": true,
54 | "protected": false
55 | }
56 | },
57 | "_id": "522030691567931393"
58 | }
59 |
--------------------------------------------------------------------------------
/eslib/Configurable.py:
--------------------------------------------------------------------------------
1 | class Config(object):
2 | def __init__(self, **config):
3 | super(Config, self).__init__()
4 | if config is not None:
5 | self.__dict__ = config
6 | self.defaults = {}
7 |
8 | def set_default(self, **kwargs):
9 | for key,val in kwargs.iteritems():
10 | self.defaults[key] = val
11 | # if not key in self.__dict__:
12 | # self.__dict__[key] = val
13 |
14 | def __getattr__(self, key):
15 | if key in self.__dict__:
16 | return self.__dict__[key]
17 | elif key in self.defaults:
18 | return self.defaults[key]
19 | else:
20 | raise AttributeError("'%s' has no attribute '%s'" % (self.__class__.__name__, key))
21 |
22 | def __getitem__(self, key):
23 | if key in self.__dict__:
24 | return self.__dict__[key]
25 | elif key in self.defaults:
26 | return self.defaults[key]
27 | else:
28 | raise AttributeError("'%s' has no attribute '%s'" % (self.__class__.__name__, key))
29 |
30 | def __setitem__(self, key, value):
31 | self.__dict__[key] = value
32 |
33 | def set(self, ignore_none=False, **kwargs):
34 | "ignore_none means that fields with value None are not set."
35 | for key,val in kwargs.iteritems():
36 | if ignore_none and val is None:
37 | continue
38 | self.__dict__[key] = val
39 |
40 | def get_default_attributes(self):
41 | return self.defaults
42 |
43 | def get_user_attributes(self):
44 | return {key: val for key, val in self.__dict__.iteritems() if key not in self.defaults}
45 |
46 | class Configurable(object):
47 | def __init__(self, **kwargs):
48 | super(Configurable, self).__init__()
49 | self.config = Config(**kwargs)
50 |
--------------------------------------------------------------------------------
/eslib/text.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | eslib.text
5 | ~~~~~~~~~~
6 |
7 | Module containing operations on text strings.
8 | """
9 |
10 |
11 | __all__ = ("remove_parts", "remove_html")
12 |
13 |
14 | import re
15 | from HTMLParser import HTMLParser
16 |
17 | import sys
18 |
19 | def remove_parts(text, sections):
20 | """
21 | Remove sections from text. Sections is a list of tuples with (start,end)
22 | coordinates to clip from the text string.
23 | """
24 |
25 | if not sections: return text
26 |
27 | c = sorted(sections)
28 | s = []
29 | s.append(text[:c[0][0]])
30 | for i in range(1, len(c)):
31 | s.append(text[c[i-1][1]:c[i][0]])
32 | s.append(text[c[-1][1]:])
33 | cleaned = "".join(s)
34 | return cleaned
35 |
36 | #region remove_html
37 |
38 | class _MLStripper(HTMLParser):
39 | def __init__(self):
40 | self.reset()
41 | self.fed = []
42 | self.strict = False
43 | def handle_data(self, d):
44 | self.fed.append(d)
45 | def get_data(self):
46 | return ''.join(self.fed)
47 |
48 |
49 | _regex_whitespace = re.compile(r'\s+', re.UNICODE)
50 | _regex_scripts = re.compile(r"""<script[^>]*>.*?</script>""", re.MULTILINE|re.DOTALL|re.UNICODE)
51 | _regex_style = re.compile(r"""<style[^>]*>.*?</style>""", re.MULTILINE|re.DOTALL|re.UNICODE)
52 |
53 | def remove_html(text):
54 | if not text or not type(text) in [str, unicode]:
55 | return text
56 |
57 | text = re.sub(_regex_scripts, " ", text)
58 | text = re.sub(_regex_style , " ", text)
59 | stripper = _MLStripper()
60 | cleaned = stripper.unescape(text)
61 | stripper.feed(cleaned)
62 | cleaned = stripper.get_data()
63 | cleaned = re.sub(_regex_whitespace, " ", cleaned)
64 | return cleaned
65 |
66 | #endregion remove_html
67 |
68 |
--------------------------------------------------------------------------------
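A small sketch (not in the repository) of the two helpers; the input strings are arbitrary examples:

    from eslib.text import remove_parts, remove_html

    text = "Me @stellachuuuuu at the center"
    print remove_parts(text, [(3, 18)])                    # "Me at the center"
    print remove_html("Lady &amp; <b>Landstrykeren</b>")   # "Lady & Landstrykeren"

--------------------------------------------------------------------------------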
/eslib/procs/Throttle.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Processor import Processor
4 | import time
5 |
6 |
7 | class Throttle(Processor):
8 | """
9 | Throttle the document stream: documents arriving on 'input' are passed on to 'output' at a limited
10 | rate, either delayed or dropped if they arrive too fast; see the 'drop' config option.
11 |
12 | Connectors:
13 | input (esdoc) : Incoming document in 'esdoc' dict format.
14 | Sockets:
15 | output (esdoc) : Documents that passed the throttle, arrived on 'input' connector.
16 |
17 | Config:
18 | delay = 1.0 : Time to delay document throughput, in seconds (float).
19 | drop = True : Drop documents we don't have time for instead of buffering them up.
20 | """
21 |
22 | def __init__(self, **kwargs):
23 | super(Throttle, self).__init__(**kwargs)
24 |
25 | m = self.create_connector(self._incoming, "input", None, "Incoming document.")
26 | self.output = self.create_socket("output" , None, "Outgoing document.", mimic=m)
27 |
28 | self.config.set_default(
29 | delay = 1.0,
30 | drop = True
31 | )
32 |
33 | self._last_write_ts = 0
34 |
35 | def on_open(self):
36 | self._last_write_ts = 0
37 |
38 | def _incoming(self, doc):
39 | if self.output.has_output:
40 | if self.config.drop:
41 | now_ts = time.time()
42 | if now_ts - self._last_write_ts > self.config.delay: # Otherwise just ignore the incoming doc
43 | self._last_write_ts = now_ts
44 | self.output.send(doc)
45 | #print "QUEUE=", self.connectors["input"].queue.qsize()
46 | else:
47 | time.sleep(self.config.delay)
48 | self.output.send(doc)
49 | #print "QUEUE=", self.connectors["input"].queue.qsize()
50 |
--------------------------------------------------------------------------------
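A minimal pipeline sketch (not in the repository) that throttles documents between a reader and a writer;
the input filename is hypothetical, and config values are passed as keyword arguments as in the other
examples in this repository:

    from eslib.procs import FileReader, FileWriter, Throttle

    r = FileReader(filename="resources/tweet.json")   # hypothetical input file
    t = Throttle(delay=2.0, drop=True)                 # at most one document per 2 seconds; drop the rest
    w = FileWriter()                                   # no filename => stdout

    t.subscribe(r)
    w.subscribe(t)

    r.start()
    w.wait()

--------------------------------------------------------------------------------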
/examples/service_run_dir/config/logging.yaml:
--------------------------------------------------------------------------------
1 | version : 1
2 | disable_existing_loggers: False
3 | formatters:
4 | brief:
5 | format: "%(firstName) -20s %(serviceName) -20s %(className) -20s %(instanceName) -20s %(levelname) -10s %(message)s"
6 | individual:
7 | format: "%(asctime)s %(className) -20s %(instanceName) -20s %(levelname) -10s %(message)s"
8 | root:
9 | format: "%(asctime)s %(name) -50s %(levelname) -10s %(message)s"
10 |
11 | handlers:
12 | console:
13 | class : logging.StreamHandler
14 | formatter : brief
15 | level : INFO
16 | #filters : [allow_foo]
17 | stream : ext://sys.stdout
18 | file_root:
19 | class : logging.handlers.RotatingFileHandler
20 | formatter : root
21 | filename : root.log
22 | maxBytes : 1024
23 | backupCount : 3
24 | file_service:
25 | class : logging.handlers.RotatingFileHandler
26 | formatter : individual
27 | filename : service.log
28 | maxBytes : 1024
29 | backupCount : 3
30 | file_proc:
31 | class : logging.handlers.RotatingFileHandler
32 | formatter : individual
33 | filename : proc.log
34 | maxBytes : 1024
35 | backupCount : 3
36 | file_doc:
37 | class : logging.handlers.RotatingFileHandler
38 | formatter : individual
39 | filename : doc.log
40 | maxBytes : 1024
41 | backupCount : 3
42 | loggers:
43 | "":
44 | handlers : [file_root]
45 | level : DEBUG
46 | servicelog:
47 | handlers : [console, file_service]
48 | level : DEBUG
49 | propagate : false
50 | proclog:
51 | handlers : [console, file_proc]
52 | level : DEBUG
53 | propagate : false
54 | doclog:
55 | handlers : [file_doc]
56 | level : DEBUG
57 | propagate : false
58 |
59 | # servicelog.SERVICE.INSTANCE
60 |
61 | doclog.myservice.myinstance:
62 | level: DEBUG
63 |
--------------------------------------------------------------------------------
/eslib/TerminalInfo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .Connector import Connector
4 | from .Socket import Socket
5 |
6 |
7 | class TerminalInfo(object):
8 |
9 | def __init__(self, terminal=None, include_connections=True):
10 | if terminal:
11 | self.type = terminal.type # t.__class__.__name__
12 | owner_name = "orphan"
13 | if terminal.owner: owner_name = terminal.owner.name or "???"
14 | self.mimic = getattr(terminal, "mimic", None)  # Used by DUMP below for sockets
15 | self.owner = owner_name
16 | self.name = terminal.name
17 | self.protocol = terminal.protocol
18 | self.description = terminal.description
19 | connections = terminal.get_connections()
20 | self.count = len(connections)
21 | self.connections = []
22 | if include_connections:
23 | for c in terminal.get_connections():
24 | self.connections.append(TerminalInfo(c, False))
25 |
26 | def DUMP(self, follow_connections=True, verbose=False, indent=0):
27 | spacing = " "
28 | spc = spacing * indent
29 | type_indicator = "?"
30 | mimic_str = ""
31 | if self.type is Socket:
32 | type_indicator = "+"
33 | if self.mimic:
34 | mimic_str = " (mimic=%s)" % self.mimic.name
35 | elif self.type is Connector:
36 | type_indicator = "-"
37 |
38 | print "%s%c%s.%s(%s) (conns=%d)%s" % (spc, type_indicator, self.owner, self.name, self.protocol, self.count, mimic_str)
39 | if verbose and self.description:
40 | print "\"%s%s%s\"" % (spc, spc, self.description)
41 |
42 | if follow_connections and self.connections:
43 | subindent = 0
44 | if verbose:
45 | print "%sConnections:" % spc
46 | subindent += 1
47 | for c in self.connections:
48 | c.DUMP(False, verbose, subindent+1)
49 |
50 |
--------------------------------------------------------------------------------
/test/test_procs/test_pattern_remover.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 | from eslib.procs import PatternRemover
5 | from eslib import esdoc
6 |
7 | class TestPatternRemover(unittest.TestCase):
8 |
9 | def test_str(self):
10 | dirty = u"Oh my fucking god…"
11 |
12 | p = PatternRemover(patterns=["my", u"\S+…"])
13 | p.on_open() # Force generation of internal regexes
14 | cleaned = p._clean(dirty)
15 | print "D=", dirty
16 | print "C=", cleaned
17 |
18 | self.assertTrue(cleaned == "Oh fucking")
19 |
20 | def test_field(self):
21 | dirty_text = u"Oh my fucking god…"
22 |
23 | dirty = {
24 | "_id": "somedoc",
25 | "_source": {
26 | "text": dirty_text
27 | }
28 | }
29 |
30 | p = PatternRemover(patterns=["my", u"\S+…"], target_field="cleaned")
31 | p.on_open() # Force generation of internal regexes
32 | cleaned = p._clean(dirty)
33 | print "D=", esdoc.getfield(cleaned, "_source.text")
34 | print "C=", esdoc.getfield(cleaned, "_source.cleaned")
35 |
36 | self.assertTrue(esdoc.getfield(cleaned, "_source.text" ) == dirty_text)
37 | self.assertTrue(esdoc.getfield(cleaned, "_source.cleaned") == "Oh fucking")
38 |
39 | def test_field_map(self):
40 | dirty = {
41 | "_id": "somedoc",
42 | "_source": {
43 | "A": "This was A",
44 | "b": { "B": "This was B"}
45 | }
46 | }
47 |
48 | p = PatternRemover(pattern="was", field_map={"A": "cleaned.cleaned_A", "b.B": "cleaned.cleaned_B"})
49 | p.on_open() # Force generation of internal regexes
50 | cleaned = p._clean(dirty)
51 |
52 | self.assertTrue(esdoc.getfield(cleaned, "_source.cleaned.cleaned_A") == "This A")
53 | self.assertTrue(esdoc.getfield(cleaned, "_source.cleaned.cleaned_B") == "This B")
54 |
55 | def main():
56 | unittest.main()
57 |
58 | if __name__ == "__main__":
59 | main()
60 |
--------------------------------------------------------------------------------
/eslib/procs/CLIReader.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Eivind Eidheim Elseth'
2 | import time
3 | import subprocess
4 |
5 | from ..Monitor import Monitor
6 | import logging
7 |
8 | class CLIReader(Monitor):
9 | """
10 | The CLIReader is a Generator that will periodically call a command line utility
11 |
12 | Sockets:
13 | stdout (str) (default) : Output from the command line utility's stdout
14 | stderr (str) : Output from the command line utility's stderr
15 |
16 | Config:
17 | cmd = None : The command to run
18 | interval = 10 : The waiting period in seconds between each time the command is run
19 |
20 | """
21 |
22 | def __init__(self, **kwargs):
23 | super(CLIReader, self).__init__(**kwargs)
24 | self._stdout = self.create_socket("stdout", "str", "The output to stdout from the command line utility", is_default=True)
25 | self._stderr = self.create_socket("stderr", "str", "The output to stderr from the command line utility")
26 | self.config.set_default(
27 | interval = 10
28 | )
29 | self.last_get = None
30 |
31 | def on_tick(self):
32 | if not self.last_get or (time.time() - self.last_get > self.config.interval):
33 | # Since the next call may crash, at least mark the last attempt as now,
34 | # so we don't try again on every tick, but wait for the next interval.
35 | self.last_get = time.time()
36 |
37 | # Capture stderr as well; communicate() waits for the process to exit, so a separate wait() is not needed.
38 | p = subprocess.Popen(self.config.cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
39 | (output, err) = p.communicate()
40 | if output:
41 | if self.doclog.isEnabledFor(logging.TRACE):
42 | self.doclog.trace("Output doc: %s" % str(output))
43 | self._stdout.send(output)
44 | if err:
45 | self.log.error("Received message from subprocess on stderr: %s" % str(err))
46 | self._stderr.send(err)
47 |
48 | self.last_get = time.time()
49 |
--------------------------------------------------------------------------------
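A minimal usage sketch (not in the repository); the command and interval are hypothetical:

    from eslib.procs import CLIReader, FileWriter

    r = CLIReader(cmd=["df", "-h"], interval=30)   # run 'df -h' every 30 seconds
    w = FileWriter()                               # no filename => stdout

    w.subscribe(r)    # subscribes to the default "stdout" socket
    r.start()
    # ... later, to shut down:
    # r.stop(); w.wait()

--------------------------------------------------------------------------------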
/eslib/procs/FileWriter.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | # TODO: Verify encoding working, especially when writing to stdout
4 |
5 | from ..Processor import Processor
6 | import sys
7 | from ..esdoc import tojson
8 |
9 |
10 | class FileWriter(Processor):
11 | """
12 | Write incoming documents to specified file or standard output.
13 | Documents of dict type are written as json documents, per line. Other types are written directly with
14 | their string representation.
15 |
16 | Connectors:
17 | input (*) : Incoming documents to write to file as string or json objects per line.
18 |
19 | Config:
20 | filename = None : If not set then 'stdout' is assumed.
21 | append = False : Whether to append to existing file, rather than overwrite.
22 | """
23 | def __init__(self, **kwargs):
24 | super(FileWriter, self).__init__(**kwargs)
25 | self.create_connector(self._incoming, "input", None, "Incoming documents to write to file as string or JSON objects per line.")
26 |
27 | self.config.set_default(
28 | filename = None,
29 | append = False
30 | )
31 |
32 | self._file = None
33 |
34 | def on_open(self):
35 |
36 | if self._file:
37 | self.log.error("on_open() attempted when _file exists -- should not be possible.")
38 | return
39 |
40 | if not self.config.filename:
41 | # Assuming stdout
42 | self._file = sys.stdout
43 | else:
44 | # May raise exception:
45 | self._file = open(self.config.filename, "a" if self.config.append else "w")
46 |
47 | def on_close(self):
48 | if self._file and self._file != sys.stdout:
49 | self._file.close()
50 | self._file = None
51 |
52 | def _incoming(self, document):
53 | if document:
54 | if type(document) is dict:
55 | print >> self._file, tojson(document)
56 | else:
57 | print >> self._file, document
58 | self._file.flush()
59 |
--------------------------------------------------------------------------------
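A small sketch (not in the repository) of appending documents to a file; the filename and documents are
hypothetical, and put() is assumed to feed the single "input" connector as in the tests:

    from eslib.procs import FileWriter

    w = FileWriter(filename="out.ndjson", append=True)    # hypothetical path
    w.start()
    w.put({"_id": "1", "_source": {"text": "hello"}})     # dicts are written as one JSON object per line
    w.put("plain text line")                               # other types are written as their string representation
    w.stop()
    w.wait()

--------------------------------------------------------------------------------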
/eslib/prog.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | eslib.prog
5 | ~~~~~~~~~~
6 |
7 | Helper functions for running as an executable program.
8 | """
9 |
10 |
11 | __all__ = ( "progname", "initlogs")
12 |
13 | import os, sys, logging.config, yaml
14 |
15 |
16 | def progname():
17 | return os.path.basename(sys.argv[0])
18 |
19 | def initlogs(config_file=None):
20 | # if config_file:
21 | # config_file = os.path.join(os.getcwd(), config_file)
22 | # else:
23 | # location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
24 | # config_file = os.path.join(location, 'logging.yml')
25 | #
26 | # config = yaml.load(open(config_file)) # TODO: YAML files are in UTF-8... if terminal is something else, make sure we convert correctly
27 | # logging.config.dictConfig(config=config)
28 |
29 | if config_file:
30 | config_file = os.path.join(os.getcwd(), config_file)
31 | config = yaml.load(open(config_file)) # TODO: YAML files are in UTF-8... if terminal is something else, make sure we convert correctly
32 | logging.config.dictConfig(config=config)
33 | else:
34 | console = logging.StreamHandler()
35 | console.setLevel(logging.TRACE)
36 | LOG_FORMAT = '%(firstName) -20s %(levelname) -10s %(className) -20s %(instanceName) -20s %(funcName) -25s %(lineno) -5d: %(message)s'
37 | console.setFormatter(logging.Formatter(LOG_FORMAT))
38 |
39 | servicelog = logging.getLogger("servicelog")
40 | servicelog.setLevel(logging.TRACE)
41 | servicelog.propagate = False
42 | servicelog.addHandler(console)
43 |
44 | proclog = logging.getLogger("proclog")
45 | proclog.setLevel(logging.TRACE)
46 | proclog.propagate = False
47 | proclog.addHandler(console)
48 |
49 | doclog = logging.getLogger("doclog")
50 | doclog.setLevel(logging.TRACE)
51 | doclog.propagate = False
52 | doclog.addHandler(console)
53 |
54 | rootlog = logging.getLogger()
55 | rootlog.setLevel(logging.WARNING)
56 | rootlog.addHandler(console)
57 |
--------------------------------------------------------------------------------
/test/test_procs/data/twitter_raw_mock.json:
--------------------------------------------------------------------------------
1 | {
2 | "id_str": "520149420122578944",
3 | "text": "These clowns must hope that we never cum under attack from any force-r we capable of protecting ourselves?",
4 | "truncated": false,
5 | "lang": "en",
6 | "created_at": "Thu Oct 09 09:51:00 +0000 2014",
7 | "timestamp_ms": "1412848260328",
8 | "source": "Twitter for BlackBerry®",
9 |
10 | "in_reply_to_user_id_str": null,
11 | "in_reply_to_screen_name": null,
12 | "in_reply_to_status_id_str": null,
13 |
14 | "geo": {
15 | "coordinates": [
16 | -34.07079,
17 | 18.57407
18 | ],
19 | "type": "Point"
20 | },
21 |
22 | "place": {
23 | "country": "South Africa",
24 | "country_code": "ZA"
25 | },
26 |
27 | "entities": {
28 | "urls": [
29 | {
30 | "display_url": "eraliquida.com/?p=1010",
31 | "expanded_url": "http://www.eraliquida.com/?p=1010",
32 | "indices": [
33 | 7,
34 | 29
35 | ],
36 | "url": "http://t.co/2OdUzFv0Ev"
37 | }
38 | ],
39 | "hashtags": [
40 | {
41 | "text": "偽2ch騒動",
42 | "indices": [
43 | 100,
44 | 107
45 | ]
46 | },
47 | {
48 | "text": "偽2ch問題",
49 | "indices": [
50 | 108,
51 | 115
52 | ]
53 | }
54 | ],
55 | "user_mentions": [
56 | {
57 | "name": "اقوى العروض وارخصها",
58 | "screen_name": "rt_ld",
59 | "id_str": "2649736855",
60 | "indices": [
61 | 0,
62 | 6
63 | ],
64 | "id": 2649736855
65 | }
66 | ]
67 | },
68 |
69 | "user": {
70 | "id_str": "2196916282",
71 | "screen_name": "Mark_50598",
72 | "name": "mark fester",
73 | "lang": "en",
74 | "description": null,
75 | "created_at": "Tue Nov 26 14:21:35 +0000 2013",
76 | "location": "",
77 | "profile_image_url": "http://abs.twimg.com/sticky/default_profile_images/default_profile_1_normal.png",
78 | "protected": false,
79 | "geo_enabled": true
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import sys
5 | from glob import glob
6 |
7 | # PREREQUISITES:
8 | # yum install -y libxml2-devel libxslt-devel
9 |
10 | try:
11 | from setuptools import setup
12 | except ImportError:
13 | from distutils.core import setup
14 |
15 | if sys.argv[-1] == 'publish':
16 | os.system('python setup.py sdist upload')
17 | sys.exit()
18 |
19 | requires = [
20 | 'elasticsearch',
21 | 'lxml',
22 | 'oauthlib',
23 | 'python-daemon==2.0.6', # For services, version 2.1 is fucked (change user stuff)
24 | 'argparse',
25 | 'psutil', 'setproctitle',
26 | 'pika', 'pyrabbit', # for Rabbitmq
27 | 'pykafka', # For Kafka
28 | 'HTMLParser',
29 | 'requests>=2', # version >=2 needed by TwitterAPI
30 | 'TwitterAPI',
31 | 'PyYAML', # for prog logging init stuff
32 | 'feedparser', # for rss
33 | 'python-dateutil',
34 | # 'mock' # for testing
35 | 'beautifulsoup4',
36 | 'textblob', 'justext' # for web.py
37 | ]
38 |
39 |
40 | setup(
41 | name='eslib',
42 | version='0.0.14',
43 | description='Document processing framework and utility for Elasticsearch (or whatever).',
44 | #long_description=open("README.md").read(),
45 | author='Hans Terje Bakke',
46 | author_email='hans.terje.bakke@comperio.no',
47 | url='https://github.com/comperiosearch/elasticsearch-eslib',
48 | keywords="document processing docproc",
49 | packages=['eslib', 'eslib.procs', 'eslib.service'],
50 | # package_data={'': ['LICENSE', 'README.md', 'PROTOCOLS.md']},
51 | scripts=glob('bin/*'),
52 | include_package_data=True,
53 | # TODO: examples in package data
54 | install_requires=requires,
55 | license='Apache 2.0',
56 | zip_safe=False,
57 |
58 | classifiers=(
59 | 'Development Status :: 5 - Production/Stable',
60 | 'Intended Audience :: Developers',
61 | 'Natural Language :: English',
62 | 'License :: OSI Approved :: Apache Software License',
63 | 'Programming Language :: Python',
64 | 'Programming Language :: Python :: 2',
65 | 'Programming Language :: Python :: 2.7'
66 | )
67 | )
68 |
--------------------------------------------------------------------------------
/test/test_procs/test_transformer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 | from eslib.procs import Transformer
5 |
6 | class TestProtocolConverter(unittest.TestCase):
7 |
8 | def test_func_one_lambda(self):
9 |
10 | csv2list = lambda proc, doc: [",".join(doc)]
11 |
12 | p = Transformer(func=csv2list, input_protocol="list", output_protocol="csv")
13 |
14 | output = []
15 | p.add_callback(lambda proc, doc: output.append(doc))
16 |
17 | p.start()
18 | p.put(["a","b","c","d"])
19 | p.stop()
20 | p.wait()
21 |
22 | print "output=", output[0]
23 |
24 | self.assertEqual(output[0], "a,b,c,d")
25 |
26 |
27 | def yieldfunc(self, proc, doc):
28 | yield doc.lower()
29 | yield doc.upper()
30 |
31 | def test_func_multi_yield(self):
32 |
33 | p = Transformer(func=self.yieldfunc, input_protocol="str", output_protocol="str")
34 |
35 | output = []
36 | p.add_callback(lambda proc, doc: output.append(doc))
37 |
38 | p.start()
39 | p.put("a")
40 | p.put("b")
41 | p.put("c")
42 | p.stop()
43 | p.wait()
44 |
45 | joined = ",".join(output)
46 | print "output=", joined
47 |
48 | self.assertEqual(joined, "a,A,b,B,c,C")
49 |
50 |
51 | def edge2ids(self, proc, doc):
52 | if doc["type"] == "author":
53 | yield doc["from"]
54 | else:
55 | yield doc["from"]
56 | yield doc["to"]
57 |
58 | def test_graph_edge_convertion(self):
59 | p = Transformer(func=self.edge2ids, input_protocol="str", output_protocol="str")
60 |
61 | output = []
62 | p.add_callback(lambda proc, doc: output.append(doc))
63 |
64 | p.start()
65 | p.put({"type": "author" , "from": "1", "to": "1"})
66 | p.put({"type": "mention", "from": "2", "to": "3"})
67 | p.put({"type": "quote" , "from": "4", "to": "1"})
68 | p.stop()
69 | p.wait()
70 |
71 | joined = ",".join(output)
72 | print "output=", joined
73 |
74 | self.assertEqual(joined, "1,2,3,4,1")
75 |
76 |
77 |
78 | def main():
79 | unittest.main()
80 |
81 | if __name__ == "__main__":
82 | main()
83 |
--------------------------------------------------------------------------------
/test/test_procs/test_http_monitor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 | from eslib.procs import HttpMonitor
5 | import requests
6 |
7 | import eslib.prog
8 | eslib.prog.initlogs()
9 |
10 | class TestHttpMonitor(unittest.TestCase):
11 |
12 | def test_get(self):
13 | self.hooked_msg = None
14 | output = []
15 |
16 | p = HttpMonitor(hook=self._hook) # localhost:4000 by default
17 | p.add_callback(lambda proc, doc: output.append(doc))
18 |
19 | print "Starting server."
20 | p.start()
21 |
22 | print "Sending request"
23 | res = requests.get("http://localhost:4000/ppp?arg=aaa")
24 | print "RES=", res, res.content
25 |
26 |
27 | print "Stopping server"
28 | p.stop()
29 | p.wait()
30 | print "Server finished."
31 |
32 | self.assertEquals(self.hooked_msg, "GET_/ppp?arg=aaa")
33 | self.assertEquals(output[0], "ppp?arg=aaa")
34 |
35 | def test_post(self):
36 | self.hooked_msg = None
37 | output = []
38 |
39 | p = HttpMonitor(hook=self._hook) # localhost:4000 by default
40 | p.add_callback(lambda proc, doc: output.append(doc))
41 |
42 | print "Starting server."
43 | p.start()
44 |
45 | print "Sending request (text)"
46 | res = requests.post("http://localhost:4000/ppp?arg=aaa", data="some data", headers={'content-type': 'text/text'})
47 | print "RES=", res, res.content
48 | print "Sending request (json)"
49 | res = requests.post("http://localhost:4000/ppp?arg=aaa", data="[1, 2, 3]", headers={'content-type': 'application/json'})
50 | print "RES=", res, res.content
51 |
52 | print "Stopping server"
53 | p.stop()
54 | p.wait()
55 | print "Server finished."
56 |
57 | self.assertEquals(self.hooked_msg, "POST_/ppp?arg=aaa")
58 | self.assertEquals(output[0], "some data")
59 | self.assertEquals(output[1], [1, 2, 3])
60 |
61 | def _hook(self, request_handler, verb, path, data, format="application/json"):
62 | print "Hook called: ", verb, path, data
63 | self.hooked_msg = "%s_%s" % (verb, path)
64 |
65 |
66 | def main():
67 | unittest.main()
68 |
69 | if __name__ == "__main__":
70 | main()
71 |
--------------------------------------------------------------------------------
/examples/remoting/RemotingClient.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # NOTE:
4 | # Example usage of the currently (as of writing) experimental RemotingService,
5 | # talking to the DummyRemotingService.
6 |
7 | # Import and set up some simple logging
8 | from eslib.service.Client import Client
9 | import logging, time
10 | # for handler in logging.root.handlers[:]:
11 | # logging.root.removeHandler(handler)
12 | logging.getLogger("requests").setLevel(logging.WARNING)
13 | format='%(name)10s %(levelname)8s %(message)s'
14 | logging.basicConfig(format=format, level=logging.INFO)
15 |
16 | # One way of creating the client, by asking the service manager for a service named "remoting".
17 | # (We call ourself the "Hooker" client, hooking onto the dummy service. It is just a name..)
18 | client = Client("Hooker", manager="localhost:5000", service="remoting")
19 |
20 | # Another way is to address the service directly:
21 | # client = Client("Hooker", address="localhost:5001")
22 |
23 | # We can ask it for status... whether it is "DEAD", "idle", "processing", "stopping", etc.
24 | print "STATUS =", client.status()
25 |
26 | # We can ask to see detailed stats
27 | print "STATS =", client.stats()
28 |
29 | # We can ask to see what knowledge it has of the metadata from the common service metadata repository
30 | print "META =", client.meta()
31 |
32 | # We can list all available HTTP routes
33 | print "HELP ="
34 | for item in client.help()["routes"]:
35 | print " %-6s %s" % tuple(item.split(" "))
36 |
37 | # We can start and stop the service (the processing part, not run and shut down the service process itself):
38 | # print "START=", client.start()
39 | # print "STATUS=", client.status()
40 | # print "STOP=", client.stop()
41 | # print "STATUS=", client.status()
42 | # time.sleep(2)
43 | # print "STATUS=", client.status()
44 |
45 | # TODO: We might want to be able to send stop(wait=True, timeout=10)
46 | #print "START=", client.start() # NOTE: Will get error back if already started...
47 |
48 | # This is how we send data to the service for further processing
49 | print "PUT=", client.put("yo", "input")
50 |
51 | # This is how we ask for a portion (here batch size = 2) of data queued for output in service.
52 | resultGenerator = client.fetch("output", 2)
53 | print "FETCH", list(resultGenerator)
54 |
--------------------------------------------------------------------------------
/bin/es-cleantweet:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | from eslib.procs import FileReader, FileWriter, TweetEntityRemover, PatternRemover, HtmlRemover
6 | import eslib.prog
7 | import argparse
8 |
9 |
10 | def main():
11 | desc = "Perform a chain of cleaning operations on tweets:\n" + \
12 | " Remove entities (URLs, mentions)" + \
13 | " Remove retweet prefix and ellipses suffix" + \
14 | " Unescape HTML encoding"
15 | help_t = "Write cleaned text to this field instead of overwriting input field."
16 | help_f = "Field to clean. Defaults to 'text'."
17 |
18 | parser = argparse.ArgumentParser(usage="\n %(prog)s -f field [-t target]", description=desc)
19 | parser._actions[0].help = argparse.SUPPRESS
20 | parser.add_argument("-f", "--field", default="text", help=help_f)
21 | parser.add_argument("-t", "--target", required=False, help=help_t)
22 | parser.add_argument( "--name" , help="Process name.", default=None)
23 |
24 | args = parser.parse_args()
25 |
26 | source = args.field
27 | target = args.target or args.field
28 |
29 | # Set up and run the pipeline
30 | entity_remover = TweetEntityRemover(
31 | name = "TER",#args.name or eslib.prog.progname(),
32 | source_field = source,
33 | target_field = target,
34 | remove_url = True,
35 | remove_mention = True)
36 | pattern_remover = PatternRemover(
37 | name = "PR",#args.name or eslib.prog.progname(),
38 | patterns = ["^RT @.+: ", u"\S+\u2026$"], # Retweet prefix, ellipsis suffix
39 | source_field = target,
40 | target_field = target
41 | )
42 | unescaper = HtmlRemover(name="HR")
43 |
44 | r = FileReader() # Read from stdin
45 | w = FileWriter() # Write to stdout
46 | entity_remover.subscribe(r)
47 | pattern_remover.subscribe(entity_remover)#, socket_name="output", connector_name="input")
48 | unescaper.subscribe(pattern_remover)#, socket_name="output", connector_name="input",)
49 | w.subscribe(unescaper)#, socket_name="output")
50 |
51 | r.start() # Will cause cascading starts of each processor in the pipeline
52 | w.wait() # Wait for everything to finish writing
53 |
54 |
55 | if __name__ == "__main__": main()
56 |
--------------------------------------------------------------------------------
/test/test_procs/test_dateexpander.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Mats Julian Olsen'
2 |
3 | import unittest
4 |
5 | from eslib import time
6 | from eslib.procs import DateExpander
7 |
8 | ok_date = '2014-10-14T14:26:30+01:00'
9 | ok_date_no_tz = '2014-10-14T14:26:30'
10 | wrong_date = 2013
11 | wrong_date2 = '-120-13-142T25:61:61+30:00'
12 |
13 | ok_date_fields = {
14 | 'year': 2014, 'month': 10, 'day': 14,
15 | 'hour': 13, 'minute': 26, 'second': 30,
16 | 'week': 42, 'weekday': 2
17 | }
18 |
19 | dict_wo_source = {'i': {'am': {'a': 'dict'}}}
20 | dict_w_source = {'_source': dict_wo_source}
21 | dict_wo_sourcefield = {'_source': dict_wo_source}
22 | dict_w_sourcefield = {'_source': {'created_at': dict_wo_source}}
23 | dict_w_ok_date = {'_source': {'created_at': ok_date, "date_fields": ok_date_fields}}
24 | dict_wo_ok_date = {'_source': {'created_at': wrong_date}}
25 | dict_wo_ok_date2 = {'_source': {'created_at': wrong_date2}}
26 |
27 |
28 | class TestDateMagic(unittest.TestCase):
29 |
30 | def test_all(self):
31 | date = time.utcdate(ok_date)
32 | dd = time.date_dict(date)
33 | print dd
34 | self.assertEqual(dd, ok_date_fields)
35 |
36 |
37 | class TestDateFields(unittest.TestCase):
38 |
39 | def setUp(self):
40 | self.expander = DateExpander()
41 |
42 | def test_missing_source_section(self):
43 | # if the dict doesn't have source it should be returned
44 | doc = self.expander._process(dict_wo_source)
45 | print doc
46 | self.assertDictEqual(doc, dict_wo_source)
47 |
48 | def test_missing_source_field(self):
49 | # if the dict has source, but no source_field, it should be returned
50 | doc = self.expander._process(dict_wo_sourcefield)
51 | print doc
52 | self.assertDictEqual(doc, dict_wo_sourcefield)
53 |
54 | def test_invalid_date(self):
55 | # if the date is invalid, the same doc should be returned
56 | doc = self.expander._process(dict_wo_ok_date)
57 | print doc
58 | self.assertDictEqual(doc, dict_wo_ok_date)
59 |
60 | def test_valid_date(self):
61 | doc = self.expander._process(dict_w_ok_date)
62 | print doc
63 | self.assertIn('date_fields', doc["_source"])
64 |
65 | doc = self.expander._process(dict_w_ok_date)
66 | print doc
67 | self.assertEqual(doc, dict_w_ok_date)
68 |
--------------------------------------------------------------------------------
/test/test_procs/test_tweet_entity_removal.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 |
4 | import unittest, json
5 | from eslib.procs import TweetEntityRemover
6 | from eslib import esdoc
7 |
8 | class TestTweetEntityRemoval(unittest.TestCase):
9 |
10 | def test_all(self):
11 |
12 | # Load test data
13 | self_dir, _ = os.path.split(__file__)
14 | f = open(os.path.join(self_dir, "data/tweet_entity_removal.json"))
15 | doc = json.load(f)
16 | f.close()
17 |
18 | p_none = TweetEntityRemover(remove_urls=False, remove_mentions=False)
19 | p_url = TweetEntityRemover(remove_urls=True , remove_mentions=False)
20 | p_mention = TweetEntityRemover(remove_urls=False, remove_mentions=True)
21 | p_both = TweetEntityRemover(remove_urls=True , remove_mentions=True, target_field="cleaned")
22 |
23 | cleaned_none = p_none ._clean(doc)
24 | cleaned_url = p_url ._clean(doc)
25 | cleaned_mention = p_mention._clean(doc)
26 | cleaned_both = p_both ._clean(doc)
27 |
28 | self.assertTrue(esdoc.getfield(cleaned_none , "_source.text") == "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv")
29 | self.assertTrue(esdoc.getfield(cleaned_url , "_source.text") == "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center")
30 | self.assertTrue(esdoc.getfield(cleaned_mention, "_source.text") == "Me & the lovely @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv")
31 | # Original text should be untouched, and cleaned gone to separate field:
32 | self.assertTrue(esdoc.getfield(cleaned_both , "_source.text") == "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv")
33 | self.assertTrue(esdoc.getfield(cleaned_both , "_source.cleaned") == "Me & the lovely @ Jacob K Javits Convention Center")
34 |
35 | # Verify that minimal cloning works:
36 | self.assertFalse(esdoc.getfield(doc, "_source") == esdoc.getfield(cleaned_url, "_source" ), "Expected _source old!=new")
37 | self.assertTrue (esdoc.getfield(doc, "_source.entities") == esdoc.getfield(cleaned_url, "_source.entities"), "Expected _source old==new")
38 |
39 | def main():
40 | unittest.main()
41 |
42 | if __name__ == "__main__":
43 | main()
44 |
--------------------------------------------------------------------------------
/eslib/procs/TwitterFollowerGetter.py:
--------------------------------------------------------------------------------
1 | __author__ = 'mats'
2 | from ..Generator import Generator
3 | from .twitter import Twitter
4 |
5 | class TwitterFollowerGetter(Generator):
6 | """
7 | This generator takes as input the ids of twitter users, and then goes
8 | ahead and retrieves the followers or friends of this user,
9 | and outputs the ids.
10 |
11 | # TODO: Document argument 'twitter' and how to configure this. 'outgoing'
12 |
13 | Connectors:
14 | ids (str) : Incoming IDs to get data for.
15 | Sockets:
16 | ids (str) : IDs of related nodes.
17 |
18 | Config:
19 | outgoing = True : # TODO: Document this
20 | """
21 | def __init__(self, twitter=None, **kwargs):
22 | super(TwitterFollowerGetter, self).__init__(**kwargs)
23 | self.twitter = twitter
24 | self.create_connector(self._incoming, "ids", "str")
25 | self._output_id = self.create_socket("ids", "str", "IDs of related nodes.")
26 | self._output_edge = self.create_socket("edge", "graph-edge")
27 | self.config.set_default(outgoing=True, reltype="follows")
28 |
29 |
30 | def on_open(self):
31 | if self.twitter is None:
32 | self.twitter = Twitter(
33 | consumer_key=self.config.consumer_key,
34 | consumer_secret=self.config.consumer_secret,
35 | access_token=self.config.access_token,
36 | access_token_secret=self.config.access_token_secret
37 | )
38 |
39 | def _incoming(self, document):
40 | try:
41 | id_ = int(document)
42 | except ValueError:
43 | self.doclog.exception("Could not parse id: %s to int" % str(document))
44 | else:
45 | related = self.twitter.get_follows(uid=str(id_), outgoing=self.config.outgoing)
46 | self._send(id_, related)
47 |
48 | def _send(self, origin, related):
49 | for id_ in related:
50 | edge = {"from": None, "type": self.config.reltype, "to": None}
51 | self._output_id.send(id_)
52 | if self.config.outgoing:
53 | edge["from"] = origin
54 | edge["to"] = id_
55 | else:
56 | edge["from"] = id_
57 | edge["to"] = origin
58 |
59 | if all(edge.itervalues()):
60 | self.doclog.trace("Sending edge %s to Neo4j" % str(edge))
61 | self._output_edge.send(edge)
62 | else:
63 | self.doclog.error("Edge had None-fields: %s" % str(edge))
--------------------------------------------------------------------------------
/eslib/procs/TweetEntityRemover.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Processor import Processor
4 | from eslib.text import remove_parts
5 | from .. import esdoc
6 |
7 | class TweetEntityRemover(Processor):
8 | """
9 | Remove URLs and/or mentioned users from the tweet text.
10 |
11 | Protocols:
12 |
13 | esdoc.tweet:
14 |
15 | # TODO
16 |
17 | Connectors:
18 | input (esdoc.tweet) : Tweet
19 | Sockets:
20 | output (esdoc.tweet) : Tweet (possibly extended with a cleaned field)
21 |
22 | Config:
23 |         source_field = "text" : Field containing the tweet text to clean.
24 | target_field = None : Defaults to 'source_field', replacing the input field.
25 | remove_urls = True
26 | remove_mentions = False
27 | """
28 |
29 |
30 | def __init__(self, **kwargs):
31 | super(TweetEntityRemover, self).__init__(**kwargs)
32 |
33 | self.create_connector(self._incoming, "input", "esdoc.tweet", "Incoming tweet.")
34 | self.output = self.create_socket("output" , "esdoc.tweet" , "Outgoing, cleaned, tweet.")
35 |
36 | self.config.set_default(
37 | source_field = "text",
38 | target_field = None,
39 | remove_urls = True,
40 | remove_mentions = False
41 | )
42 |
43 | def _clean(self, doc):
44 |
45 | source = doc.get("_source")
46 | if not source:
47 | return doc
48 |
49 | text = esdoc.getfield(source, self.config.source_field)
50 |
51 | coords = []
52 | entities = source.get("entities")
53 | if self.config.remove_urls:
54 | x = esdoc.getfield(entities, "urls", [])
55 | coords += [l["indices"] for l in x]
56 | if self.config.remove_mentions:
57 | x = esdoc.getfield(entities, "user_mentions", [])
58 | coords += [l["indices"] for l in x]
59 | cleaned = None
60 | if not text:
61 | cleaned = text
62 | else:
63 | # The removal from coords most often leaves two spaces, so remove them, too, and strip border spaces.
64 | cleaned = remove_parts(text, coords).replace(" ", " ").strip()
65 |
66 | return esdoc.shallowputfield(doc, "_source." + (self.config.target_field or self.config.source_field), cleaned)
67 |
68 | def _incoming(self, doc):
69 | if not self.output.has_output:
70 | return # No point then...
71 | cleaned_doc = self._clean(doc)
72 | self.output.send(cleaned_doc)
73 |
--------------------------------------------------------------------------------
/test/test_procs/test_tweet_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import unittest, json
5 | from eslib.procs import TwitterMonitor, TweetExtractor
6 |
7 | class TestTweetExtractor(unittest.TestCase):
8 |
9 | def test_simple(self):
10 |
11 | # Load test data
12 | self_dir, _ = os.path.split(__file__)
13 | f = open(os.path.join(self_dir, "data/twitter_raw_mock.json"))
14 | j = json.load(f)
15 | f.close()
16 |
17 | m = TwitterMonitor()
18 | raw, tweet_mon = m._decode(j)
19 |
20 | x = TweetExtractor()
21 | tweet, users, links = x._extract(tweet_mon)
22 |
23 | # Test links
24 | self.assertTrue(len(links) == 1)
25 | self.assertTrue(links[0]["what"] == "twitter")
26 | self.assertTrue(links[0]["who"] == "2196916282")
27 | self.assertTrue(links[0]["url"] == "http://www.eraliquida.com/?p=1010")
28 |
29 | # Test users
30 | self.assertTrue(len(users) == 2)
31 | self.assertTrue(users[0]["from"] == "2196916282")
32 | self.assertTrue(users[1]["from"] == "2196916282")
33 | self.assertTrue(users[0]["to"] == "2196916282")
34 | self.assertTrue(users[1]["to"] == "2649736855")
35 | self.assertTrue(users[0]["type"] == "author")
36 | self.assertTrue(users[1]["type"] == "mention")
37 |
38 | # Test tweet
39 | self.assertTrue(tweet["_id"] == "520149420122578944")
40 | self.assertTrue(tweet["_source"]["source"] == u"Twitter for BlackBerry®")
41 | self.assertTrue(tweet["_source"]["text"] == u'These clowns must hope that we never cum under attack from any force-r we capable of protecting ourselves?')
42 | self.assertTrue(str(tweet["_source"]["created_at"]) == "2014-10-09 09:51:00.328000")
43 | self.assertTrue("geo" in tweet["_source"])
44 | self.assertTrue(tweet["_source"]["lang"] == "en")
45 | self.assertTrue(tweet["_source"]["place"]["country"] == "South Africa")
46 | self.assertFalse("in_reply_to" in tweet["_source"])
47 | # User
48 | self.assertTrue(tweet["_source"]["user"]["id"] == "2196916282")
49 | self.assertTrue(tweet["_source"]["user"]["lang"] == "en")
50 | self.assertTrue(tweet["_source"]["user"]["name"] == "mark fester")
51 | self.assertFalse("description" in tweet["_source"]["user"])
52 | self.assertTrue(str(tweet["_source"]["user"]["created_at"]) == "2013-11-26 14:21:35")
53 |
54 | # Entities
55 | # // TODO
56 |
57 | def main():
58 | unittest.main()
59 |
60 | if __name__ == "__main__":
61 | main()
62 |
--------------------------------------------------------------------------------
/eslib/Terminal.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | class TerminalProtocolException(Exception):
5 | def __init__(self, socket, connector):
6 | msg = "Socket: %s.%s(%s), Connector: %s.%s(%s)" % (socket.owner.name, socket.name, socket.protocol, connector.owner.name, connector.name, connector.protocol)
7 |         super(TerminalProtocolException, self).__init__(msg)
8 |
9 |
10 | class Terminal(object):
11 | "Common abstract base class for connectors and sockets."
12 |
13 | ANY_PROTOCOL = "any"
14 |
15 | def __init__(self, name, protocol):
16 | self.type = None # type: Either 'Socket' or 'Connector'
17 | self.owner = None # Processor:
18 | self.name = "" # str: Name of terminal
19 | self.protocol = "" # str: Name of object format expected as input/output on this terminal
20 | self.description = "" # str: Text describing purpose and property of this terminal
21 |
22 | self.connections = []
23 |
24 | self.name = name or "unnamed"
25 | self.protocol = protocol or Terminal.ANY_PROTOCOL
26 |
27 | def __str__(self):
28 | return "%s|%s" % (self.name, self.protocol)
29 |
30 | def attach(self, terminal):
31 | self.connections.append(terminal)
32 |
33 | def detach(self, terminal):
34 | if terminal in self.connections:
35 | self.connections.remove(terminal)
36 |
37 | def get_connections(self, owner=None, terminal_name=None):
38 |         "Return all connections if owner is missing. Ignore terminal_name if owner is missing."
39 | connections = []
40 | for c in self.connections[:]:
41 | if not owner or (c.owner == owner and (not terminal_name or c.name == terminal_name)):
42 | connections.append(c)
43 | return connections
44 |
45 | @staticmethod
46 | def protocol_compliance(socket, connector):
47 | if connector.protocol == Terminal.ANY_PROTOCOL or socket.protocol == Terminal.ANY_PROTOCOL:
48 | return True
49 | # In case the socket is set to mimic the protocol of one of its connectors, we check for that
50 | # instead of the directly registered protocol.
51 | ss = socket.protocol.split(".")
52 | sm = socket.mimiced_protocol.split(".")
53 | cc = connector.protocol.split(".")
54 | # print "SS=", ss[:len(cc)]
55 | # print "SM=", sm[:len(cc)]
56 | # print "CC=", cc[:len(cc)]
57 | # print "%s == %s" % (sm[:len(cc)], cc[:len(cc)])
58 | return (ss[:len(cc)] == cc[:len(cc)]) or (sm[:len(cc)] == cc[:len(cc)])
59 |
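# Illustrative example of the prefix matching above (not part of the original code):
#   socket.protocol    = "esdoc.tweet"
#   connector.protocol = "esdoc"          -> compliant (connector protocol is a prefix of the socket's)
#   connector.protocol = "esdoc.webpage"  -> not compliant (unless the socket mimics that protocol)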
--------------------------------------------------------------------------------
/eslib/procs/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | eslib.procs
5 | ~~~~~
6 |
7 | Document processing processors.
8 | """
9 |
10 |
11 | from .ElasticsearchReader import ElasticsearchReader
12 | from .ElasticsearchWriter import ElasticsearchWriter
13 | from .FileReader import FileReader
14 | from .FileWriter import FileWriter
15 | from .TcpWriter import TcpWriter
16 | from .RabbitmqMonitor import RabbitmqMonitor
17 | from .RabbitmqWriter import RabbitmqWriter
18 | from .KafkaMonitor import KafkaMonitor
19 | from .KafkaWriter import KafkaWriter
20 | from .HttpMonitor import HttpMonitor
21 | from .CsvConverter import CsvConverter
22 | from .WebGetter import WebGetter
23 | from .Neo4jWriter import Neo4jWriter
24 | from .Neo4jReader import Neo4jReader
25 | from .TwitterMonitor import TwitterMonitor
26 | from .TwitterUserGetter import TwitterUserGetter
27 | from .TwitterFollowerGetter import TwitterFollowerGetter
28 | from .TweetEntityRemover import TweetEntityRemover
29 | from .TweetExtractor import TweetExtractor
30 | from .PatternRemover import PatternRemover
31 | from .HtmlRemover import HtmlRemover
32 | from .BlacklistFilter import BlacklistFilter
33 | from .Throttle import Throttle
34 | from .Transformer import Transformer
35 | from .EntityExtractor import EntityExtractor
36 | from .ProcessWrapper import ProcessWrapper
37 | from .CLIReader import CLIReader
38 | from .RssMonitor import RssMonitor
39 | from .Timer import Timer
40 | from .DateExpander import DateExpander
41 | from .SmtpMailer import SmtpMailer
42 | from .FourChanMonitor import FourChanMonitor
43 |
44 | __all__ = (
45 | "ElasticsearchReader",
46 | "ElasticsearchWriter",
47 | "FileReader",
48 | "FileWriter",
49 | "TcpWriter",
50 | "RabbitmqMonitor",
51 | "RabbitmqWriter",
52 | "KafkaMonitor",
53 | "KafkaWriter",
54 | "HttpMonitor",
55 | "CsvConverter",
56 | "WebGetter",
57 | "Neo4jWriter",
58 | "Neo4jReader",
59 | "TwitterMonitor",
60 | "TwitterUserGetter",
61 | "TwitterFollowerGetter",
62 | "TweetEntityRemover",
63 | "TweetExtractor",
64 | "PatternRemover",
65 | "HtmlRemover",
66 | "BlacklistFilter",
67 | "Throttle",
68 | "Transformer",
69 | "EntityExtractor",
70 | "ProcessWrapper",
71 | "CLIReader",
72 | "RssMonitor",
73 | "Timer",
74 | "DateExpander",
75 | "SmtpMailer",
76 | "FourChanMonitor"
77 | )
78 |
--------------------------------------------------------------------------------
/eslib/procs/DateExpander.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Mats Julian Olsen'
2 |
3 | from ..Processor import Processor
4 | from .. import esdoc
5 | from .. import time
6 |
7 |
8 | class DateExpander(Processor):
9 | """
10 | This processor will use a date field in an esdoc as a basis for constructing
11 | an object with
12 |
13 | year
14 | month (1 through 12)
15 | day (1 through 31)
16 | hour (0 through 23)
17 | minute (0 through 59)
18 | second (0 through 59)
19 | weekday (1 through 7)
20 | week (1 through 53)
21 |
22 | Connectors:
23 | input (esdoc) : Incoming.
24 | Sockets:
25 | output (esdoc) : Outgoing, with configured date field expanded.
26 |
27 | Config:
28 | source_field = "created_at" : Field which date value to expand.
29 | target_field = "date_fields" : Target field for the expanded object.
30 | """
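    # Illustrative example (values from this library's own test data): a document with
    #   _source.created_at = "2014-10-14T14:26:30+01:00"
    # would, with the default config, get the UTC-normalized expansion
    #   _source.date_fields = {"year": 2014, "month": 10, "day": 14, "hour": 13,
    #                          "minute": 26, "second": 30, "weekday": 2, "week": 42}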
31 | def __init__(self, **kwargs):
32 | super(DateExpander, self).__init__(**kwargs)
33 | self._input = self.create_connector(self._incoming, 'input', 'esdoc', "Incoming.")
34 | self._output = self.create_socket('output', 'esdoc', "Outgoing, with configured date field expanded.")
35 |
36 | self.config.set_default(
37 | source_field='created_at',
38 | target_field='date_fields'
39 | )
40 |
41 | def _incoming(self, doc):
42 | if self._output.has_output:
43 | self._output.send(self._process(doc))
44 |
45 | def _process(self, doc):
46 | value = esdoc.getfield(doc, "_source." + self.config.source_field)
47 | if value is None:
48 | self.doclog.warning(
49 | "Document '%s' is missing field or value in '%s'."
50 | % (doc.get("_id"), self.config.source_field))
51 | return doc
52 |
53 | date = time.utcdate(value)
54 | if date is None:
55 | self.doclog.warning(
56 | "Document '%s' has non-date value in field '%s'."
57 | % (doc.get("_id"), self.config.source_field))
58 | return doc
59 |
60 | date_dict = time.date_dict(date)
61 | if date_dict is None:
62 | # This should not be possible, therefore logging to proclog
63 | self.log.error("Date field extraction failed for date: %s" % date)
64 | return doc
65 |
66 | # Create a new document (if necessary) with just the minimum cloning necessary,
67 | # leaving references to the rest.
68 | return esdoc.shallowputfield(doc, '_source.' + self.config.target_field, date_dict)
69 |
--------------------------------------------------------------------------------
/eslib/procs/Timer.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from eslib import Monitor
4 | import time
5 |
6 | class Timer(Monitor):
7 | """
8 | Send a command on an output socket at configured interval.
9 | The configured 'actions' is a list of vectors of (initial_offset, interval, document).
10 |     The time units are in seconds ('float'). The 'document' is *whatever* you want sent to the output,
11 |     typically a string or a dict.
12 |
13 | Note that if you have very short intervals, you might want to adjust the run loop delay 'sleep' (not in 'config').
14 | (It defaults to 0.5 seconds for this processor.)
15 |
16 | Sockets:
17 | output (*) : Output occurring at configured intervals. From the 'document' part of the configured action.
18 |
19 | Config:
20 |         actions = [] : List of (initial_offset, interval, document) tuples describing what to send, and when.
21 | """
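    # Illustrative config (values made up): send "ping" every 5 seconds starting 1 second
    # after open, and a dict once a minute:
    #   Timer(actions=[(1, 5, "ping"), (0, 60, {"command": "flush"})])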
22 | def __init__(self, **kwargs):
23 | super(Timer, self).__init__(**kwargs)
24 | self._output = self.create_socket("output", None, "Output occurring at configured intervals. From the 'document' part of the configured action.")
25 |
26 | # (Override) Let ticks last half a second here by default... there's generally no rush, unless intervals are very short:
27 | self.sleep = 0.5
28 |
29 | self.config.set_default(actions=[]) # A list of tuples of (initial_offset, interval, document)
30 |
31 | self._actions = []
32 |
33 | def on_open(self):
34 | now = time.time()
35 | self._actions = []
36 |         if self.config.actions is not None:
37 |             if not hasattr(self.config.actions, '__iter__'):
38 | msg = "'config.actions' is not iterable."
39 | self.log.critical(msg)
40 | raise ValueError(msg)
41 | for a in self.config.actions:
42 | # Validate tuple format
43 |             if not type(a) in [list, tuple] or not len(a) == 3 or not type(a[0]) in [int, float] or not type(a[1]) in [int, float]:
44 |                 msg = "An element in 'config.actions' is not of expected format and/or type '(initial_offset, interval, document)'."
45 |                 self.log.error(msg)
46 |                 continue # Not critical enough to raise an exception; just skip the wrong one.
47 |             self._actions.append([now + a[0], a[1], a[2]])
48 |
49 | def on_tick(self):
50 | now = time.time()
51 | for a in self._actions:
52 | if now >= a[0]:
53 | # Next time for this one is...
54 | a[0] = now + a[1]
55 | # Then send the action/document
56 | self._output.send(a[2])
57 |
--------------------------------------------------------------------------------
/eslib/service/DummyService.py:
--------------------------------------------------------------------------------
1 | from . import HttpService, PipelineService
2 | from ..procs import Timer, Transformer
3 | from .. import esdoc
4 | import time
5 |
6 | class DummyService(HttpService, PipelineService):
7 | """
8 | Common static config:
9 | name
10 | manager_endpoint
11 | management_endpoint
12 |
13 | Static config:
14 | timer_frequency = 3
15 | lifespan = 0 # 0=infinite
16 |
17 | Runtime config:
18 | dummy.variable
19 | """
20 |
21 | VARIABLE_CONFIG_PATH = "dummy.variable"
22 |
23 | metadata_keys = [VARIABLE_CONFIG_PATH]
24 |
25 | def __init__(self, **kwargs):
26 | super(DummyService, self).__init__(**kwargs)
27 |
28 | self.config.set_default(
29 | timer_frequency = 3,
30 | lifespan = 0
31 | )
32 |
33 | self._logger = None
34 | self._variable = "initial"
35 |
36 | def on_configure(self, credentials, config, global_config):
37 | self.config.set(
38 | manager_endpoint = global_config.get("manager_host"),
39 | management_endpoint = config.get("management_endpoint"),
40 |
41 | timer_frequency = config["frequency"],
42 | lifespan = config["lifespan"]
43 | )
44 |
45 | def on_setup(self):
46 | # Set up procs
47 | timer = Timer(
48 | service = self,
49 | name = "timer",
50 | actions = [(self.config.timer_frequency, self.config.timer_frequency, "ping")])
51 | self._logger = Transformer(
52 | service = self,
53 | name = "logger",
54 | func = self._logfunc)
55 |
56 | procs = [timer, self._logger]
57 |
58 | # Link them
59 | self.link(*procs)
60 |
61 | # Register them for debug dumping
62 | self.register_procs(*procs)
63 |
64 | return True
65 |
66 | #region Service overrides
67 |
68 | def on_metadata(self, metadata):
69 | print "***METADATA", metadata
70 | self._variable = self.get_meta_section(metadata, self.VARIABLE_CONFIG_PATH)
71 | print "VAR=", self._variable
72 | self.head.restart(start=False)
73 | return True
74 |
75 | #endregion Service overrides
76 |
77 | def _logfunc(self, proc, doc):
78 | if self.config.lifespan and time.time() - self.stat_processing_started > self.config.lifespan:
79 | self.log.status("Life has come to an end; stopping.")
80 | self.processing_stop()
81 | return
82 | self.log.debug("DEBUG message.")
83 | self.log.warning("Service log entry, variable='%s'" % self._variable)
84 | self._logger.log.warning("Processor log entry, variable='%s'" % self._variable)
85 | self._logger.doclog.warning("Document log entry, variable='%s'" % self._variable)
86 | yield doc
87 |
--------------------------------------------------------------------------------
/eslib/esdoc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | eslib.esdoc
5 | ~~~~~~~~~~
6 |
7 | Module containing operations on "Elasticsearch type" documents (really just a dict).
8 | """
9 |
10 |
11 | __all__ = ("tojson", "createdoc", "getfield", "putfield", "shallowputfield")
12 |
13 |
14 | from datetime import datetime
15 | from .time import date2iso
16 | import json
17 |
18 | def _json_serializer_isodate(obj):
19 | """Default JSON serializer."""
20 | s = None
21 | if isinstance(obj, datetime):
22 | if obj.utcoffset() is not None:
23 | obj = obj - obj.utcoffset()
24 | obj = obj.replace(tzinfo=None)
25 | s = date2iso(obj)
26 | return s
27 |
28 | def tojson(doc):
29 | return json.dumps(doc, default=_json_serializer_isodate)
30 |
31 |
32 | def getfield(doc, fieldpath, default=None):
33 |     "Get value for 'fieldpath' if it exists and is not None, otherwise return the default."
34 | if doc is None or fieldpath is None:
35 | return default
36 | if fieldpath == "":
37 | return doc
38 | fp = fieldpath.split(".")
39 | d = doc
40 | for f in fp[:-1]:
41 | if not d or not f in d or not isinstance(d[f], dict):
42 | return default
43 | d = d[f]
44 | if d is None:
45 | return default
46 | v = d.get(fp[-1])
47 | return default if v is None else v
48 |
49 |
50 | def putfield(doc, fieldpath, value):
51 | "Add or update 'fieldpath' with 'value'."
52 | if doc is None or fieldpath is None:
53 | return
54 | fp = fieldpath.split(".")
55 | d = doc
56 | for i, f in enumerate(fp[:-1]):
57 | if f in d:
58 | d = d[f]
59 | if not isinstance(d, dict):
60 | raise AttributeError("Node at '%s' is not a dict." % ".".join(fp[:i+1]))
61 | else:
62 | dd = {}
63 | d[f] = dd
64 | d = dd
65 |     d[fp[-1]] = value # OBS: This also overwrites a node if this was a node
66 |
67 | def shallowputfield(doc, fieldpath, value):
68 | "Clone as little as needed of 'doc' and add the field from 'fieldpath'. Returns the new cloned doc"
69 | if not doc or not fieldpath: return
70 | fp = fieldpath.split(".")
71 | doc_clone = doc.copy() # Shallow clone
72 | d = doc
73 | d_clone = doc_clone
74 | for i, f in enumerate(fp[:-1]):
75 | if f in d:
76 | d = d[f]
77 | if not type(d) is dict:
78 | raise Exception("Node at '%s' is not a dict." % ".".join(fp[:i+1]))
79 | d_clone[f] = d.copy() # Create shallow clone of the next level
80 | d_clone = d_clone[f]
81 | else:
82 | dd = {} # Create a new node
83 | d_clone.update({f:dd})
84 | d_clone = dd
85 |     d_clone[fp[-1]] = value # OBS: This also overwrites a node if this was a node
86 |
87 | return doc_clone
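# Illustrative example of the minimal cloning above (not part of the original code):
#   original = {"_source": {"text": "hi", "entities": {"urls": []}}}
#   clone = shallowputfield(original, "_source.cleaned", "hi there")
#   clone["_source"]["entities"] is original["_source"]["entities"]  # True: siblings stay shared
#   "cleaned" in original["_source"]                                 # False: original untouched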
88 |
89 | def createdoc(source, index=None, doctype=None, id=None):
90 | doc = {"_source": source}
91 | if index: doc['_index'] = index
92 |     if doctype: doc['_type' ] = doctype
93 | if id : doc['_id' ] = id
94 | return doc
95 |
96 |
97 |
--------------------------------------------------------------------------------
/eslib/procs/TwitterUserGetter.py:
--------------------------------------------------------------------------------
1 | __author__ = 'mats'
2 |
3 | from ..Generator import Generator
4 | from .twitter import Twitter
5 |
6 | import time
7 |
8 |
9 | class TwitterUserGetter(Generator):
10 | """
11 | Receives uids on its connector and sends twitter user objects
12 | to its socket.
13 |
14 | # TODO: Document argument 'twitter' and how to configure this.
15 |
16 | Connectors:
17 | ids (str) : Incoming IDs to get data for.
18 | Sockets:
19 | user (graph-user) : Twitter users.
20 |
21 | Config:
22 | batchsize = 100 : How many users to gather up before making a call to Twitter.
23 | batchtime = 7.0 : How many seconds to wait before we send a batch if it is not full.
24 | """
25 |
26 | def __init__(self, twitter=None, **kwargs):
27 | super(TwitterUserGetter, self).__init__(**kwargs)
28 | self.create_connector(self._incoming, "ids", "str")
29 | self._output = self.create_socket("user", "graph-user", "Twitter users.")
30 | self._queue = []
31 | self.last_call = time.time()
32 | self.twitter = twitter
33 | self.config.set_default(
34 | batchsize=100,
35 | batchtime=7
36 | )
37 |
38 | def on_open(self):
39 | """ Instantiate twitter class. """
40 | if self.twitter is None:
41 | self.twitter = Twitter(
42 | consumer_key=self.config.consumer_key,
43 | consumer_secret=self.config.consumer_secret,
44 | access_token=self.config.access_token,
45 | access_token_secret=self.config.access_token_secret
46 | )
47 |
48 | def _incoming(self, doc):
49 | """
50 | Put str(doc) into the queue.
51 |
52 | :param doc: the id of a twitter user
53 | """
54 | try:
55 | id_ = int(doc)
56 | except ValueError:
57 | self.doclog.exception("Could not parse id: %s to int" % doc)
58 | else:
59 | self._queue.append(str(id_))
60 |
61 | def on_tick(self):
62 | """
63 | Commit items in queue if queue exceeds batchsize or it's been long
64 | since last commit.
65 | """
66 | if ((len(self._queue) >= self.config.batchsize) or
67 | (time.time() - self.last_call > self.config.batchtime and self._queue)):
68 | self.get()
69 |
70 | def on_shutdown(self):
71 | """ Get rid of rest of queue before shutting down. """
72 | self.log.info("Processing remaining items in queue.")
73 | while self._queue:
74 | self.get()
75 |
76 | def get(self):
77 | """
78 | Gets users from twitter and outputs to a socket.
79 | """
80 | num = len(self._queue)
81 | self.log.debug("Getting %i users from Twitter" % num)
82 | resp = self.twitter.get_users(uids=self._queue[:num])
83 | self._queue = self._queue[num:]
84 | for raw_user in resp:
85 | try:
86 | user = self.twitter.raw_to_dict(raw_user)
87 | except TypeError as type_error:
88 | self.log.exception(type_error)
89 | else:
90 | self._output.send(user)
91 |
--------------------------------------------------------------------------------
/eslib/service/PipelineService.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from .Service import Service
4 | import time
5 |
6 | class PipelineService(Service):
7 | def __init__(self, **kwargs):
8 | super(PipelineService, self).__init__(**kwargs)
9 |
10 | self.head = None
11 | self.tail = None
12 |
13 | def _log_finished(self, proc):
14 | self.log.status("Processing finished.")
15 | self._processing = False # This will shortcut further evaluation of whether we are processing
16 | self.stat_processing_ended = time.time()
17 |
18 | def _log_aborted(self, proc):
19 | self.log.status("Processing finished after abort.")
20 | self._processing_aborted = True # This will shortcut further evaluation of whether we are aborted
21 | self.stat_processing_ended = time.time()
22 |
23 | def link(self, *processors):
24 | "Link processors together and assign head and tail."
25 | prev = None
26 | for proc in processors:
27 | if prev:
28 | proc.subscribe(prev)
29 | prev = proc
30 | self.head = processors[0]
31 | self.tail = processors[-1]
32 |
33 | #region Service overrides
34 |
35 | def is_processing(self):
36 | "Evaluate whether processing is in progress."
37 | return self.tail.running
38 |
39 | def is_aborted(self):
40 |         "Evaluate whether processing has been aborted."
41 | return self.head.aborted
42 |
43 | def is_suspended(self):
44 | "Evaluate whether processing is suspended."
45 | return self.head.suspended
46 |
47 | def on_processing_start(self):
48 | if not self._log_finished in self.tail.event_stopped:
49 | self.tail.event_stopped.append(self._log_finished)
50 | if not self._log_aborted in self.tail.event_aborted:
51 | self.tail.event_aborted.append(self._log_aborted)
52 |
53 | self.head.start()
54 | return True
55 |
56 | def on_restart(self):
57 | # if not self.head.running:
58 | # self.head.start()
59 | # else:
60 | # return True
61 | return True # Well, not really, but still.. it didn't fail either.
62 |
63 | def on_processing_stop(self):
64 | "This method should block until the process is fully stopped."
65 | self.head.stop()
66 | self.tail.wait()
67 | return True
68 |
69 | def on_processing_abort(self):
70 | self.head.abort()
71 | self.tail.wait()
72 | return True
73 |
74 | def on_processing_suspend(self):
75 | self.head.suspend()
76 | return True
77 |
78 | def on_processing_resume(self):
79 | self.head.resume()
80 | return True
81 |
82 | # TODO
83 | def on_update(self, config):
84 | # Auto-start on update
85 | if not self.head.running:
86 | self.head.start()
87 | else:
88 | return True
89 |
90 | def on_count(self):
91 |         # It is probably better to count what has been handled by the tail, than what the head received or generated, so:
92 | return self.tail.count
93 |
94 | def on_count_total(self):
95 | return self.head.total
96 |
97 | #endregion Service overrides
98 |
--------------------------------------------------------------------------------
/eslib/procs/KafkaWriter.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | # NOTE: Using sync producer. Should change to async if performance sucks.
4 |
5 | from ..Processor import Processor
6 | from ..esdoc import tojson
7 | from pykafka import KafkaClient
8 | import zlib
9 |
10 |
11 | class KafkaWriter(Processor):
12 | """
13 | Write data to Kafka.
14 | Writes data with type 'str', 'unicode', 'int', or 'float'. Lists and dicts are written as 'json'.
15 | Other types are cast to 'str'.
16 | The 'type' registered with the metadata is then either 'str', 'unicode', 'int', 'float' or 'json'.
17 |
18 | Connectors:
19 | input (*) : Document to write to configured Kafka topic.
20 |
21 | Config:
22 |         hosts = ["localhost:9092"] : List of Kafka hosts.
23 | topic = "default_topic" :
24 | compression = False : Whether to compress the data sent to Kafka.
25 | """
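    # Illustrative example of the envelope written to the topic (values made up):
    #   an incoming dict {"user": "htb"} is serialized as {"type": "json", "data": {"user": "htb"}}
    #   an incoming string "hello" is serialized as {"type": "str", "data": "hello"}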
26 |
27 | def __init__(self, **kwargs):
28 | super(KafkaWriter, self).__init__(**kwargs)
29 |
30 |         self._connector = self.create_connector(self._incoming, "input", None, "Document to write to configured Kafka topic.")
31 |
32 | self.config.set_default(
33 | hosts = ["localhost:9092"],
34 | topic = "default_topic",
35 | compression = False
36 | )
37 |
38 | self._client = None
39 | self._producer = None
40 |
41 | def on_open(self):
42 | self.count = 0
43 | self._client = KafkaClient(",".join(self.config.hosts))
44 | topic = self._client.topics[self.config.topic]
45 | self._producer = topic.get_sync_producer(min_queued_messages=1)
46 | self.log.info("Connected to Kafka topic '%s'." % self.config.topic)
47 |
48 | def on_close(self):
49 | if self._client:
50 | self._producer.stop()
51 | self.log.info("Kafka producer stopped.")
52 | # Can't find any way to close the connection or ask it to release resources, so I try a 'del'.
53 | #del self._client
54 | self._client = None
55 | self.log.debug("Connection to Kafka deleted.")
56 |
57 | def _incoming(self, document):
58 |         if document is None:
59 | return
60 |
61 | data = document
62 | msg_type = None
63 | if isinstance(document, basestring):
64 | msg_type = type(document).__name__
65 | elif isinstance(document, (int, long, float)):
66 | msg_type = type(document).__name__
67 | elif isinstance(document, (list, dict)):
68 | data = document
69 | msg_type = "json"
70 | else:
71 | data = str(document)
72 | msg_type = "str" #type(document).__name__
73 | self.doclog.warning("Writing document of unsupported type '%s' as type 'str'." % type(document).__name__)
74 |
75 | kafka_data = None
76 | try:
77 | kafka_data = tojson({"type": msg_type, "data": data})
78 | except TypeError as e:
79 | self.doclog.error("JSON serialization failed: %s" % e.message)
80 | return
81 |
82 | if self.config.compression:
83 | kafka_data = zlib.compress(kafka_data)
84 |
85 | self._producer.produce(kafka_data)
86 | self.count += 1
87 |
--------------------------------------------------------------------------------
/test/test_procs/test_csv_converter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from eslib.procs import FileReader, FileWriter, CsvConverter
4 |
5 |
6 | res = []
7 |
8 | class TestCsvConverter(unittest.TestCase):
9 |
10 | def _setup(self, filename):
11 |
12 | r = FileReader(raw_lines=True)
13 | r.config.filename = filename
14 |
15 | c = CsvConverter()
16 |
17 | c.config.index = "myindex"
18 | c.config.type_field = "initials"
19 | c.config.id_field = "id"
20 |
21 | w = FileWriter() # Write to stdout
22 |
23 | r.attach(c.attach(w))
24 |
25 | output = []
26 | c.add_callback(lambda proc, doc: output.append(doc))
27 |
28 | return (r, c, w, output)
29 |
30 | def _verify(self, output):
31 | self.assertTrue(len(output) == 3, "Expected 3 results.")
32 | self.assertTrue(output[1]["_type"] == "eee")
33 | self.assertTrue(output[1]["_index"] == "myindex")
34 | self.assertTrue(output[1]["_id"] == "2")
35 | self.assertTrue(len(output[1]["_source"]) == 2)
36 |
37 |
38 | def test_read(self):
39 | r = FileReader(raw_lines=True)
40 | self_dir, _ = os.path.split(__file__)
41 | r.config.filename = os.path.join(self_dir, "data/csv_with_header.csv")
42 | w = FileWriter() # Write to stdout
43 | w.subscribe(r)
44 | r.start()
45 |
46 | def test_first_line_is_columns(self):
47 | self_dir, _ = os.path.split(__file__)
48 | r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_with_header.csv"))
49 | r.start()
50 | w.wait()
51 |
52 | self._verify(output)
53 |
54 | def test_no_header_line(self):
55 | self_dir, _ = os.path.split(__file__)
56 | r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_no_header.csv"))
57 | c.config.columns = ["id", "name", "last name", "initials"]
58 | r.start()
59 | w.wait()
60 |
61 | self._verify(output)
62 |
63 | def test_skip_header_line(self):
64 | self_dir, _ = os.path.split(__file__)
65 | r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_with_header.csv"))
66 | c.config.columns = ["id", "name", "last name", "initials"]
67 | c.config.skip_first_line = True
68 | r.start()
69 | w.wait()
70 |
71 | self._verify(output)
72 |
73 | # def test_fewer_fields(self):
74 | # self_dir, _ = os.path.split(__file__)
75 | #
76 | # r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_no_header.csv"))
77 | # c.config.id_field = "_id"
78 | # c.config.type_field = "_type"
79 | # c.config.columns = ["_id", None, "last name", "initials"]
80 | # r.start()
81 | # w.wait()
82 | #
83 | # self.assertTrue(len(output) == 3, "Expected 3 results.")
84 | # self.assertTrue(output[1]["_type"] == None)
85 | # self.assertTrue(output[1]["_index"] == "myindex")
86 | # self.assertTrue(output[1]["_id"] == "2")
87 | # keys = output[1]["_source"].keys()
88 | # self.assertTrue(len(keys) == 2)
89 | # self.assertTrue("last name" in keys and "initials" in keys, "Expected 'last name' and 'initials' as result fields.")
90 |
91 | def main():
92 | unittest.main()
93 |
94 | if __name__ == "__main__":
95 | main()
96 |
--------------------------------------------------------------------------------
/eslib/service/RemotingService.py:
--------------------------------------------------------------------------------
1 | from . import HttpService
2 | from .. import Processor
3 | import Queue
4 |
5 |
6 | # NOTE: THIS IS STILL EXPERIMENTAL (htb, 2016-03-21)
7 |
8 |
9 | class RemotingService(HttpService):
10 |
11 | def __init__(self, **kwargs):
12 | super(RemotingService, self).__init__(**kwargs)
13 |
14 | # Add routes to functions
15 | self.add_route(self._mgmt_fetch, "GET" , "/fetch", ["?socket", "?limit"])
16 | self.add_route(self._mgmt_put , "PUT|POST", "/put" , ["?connector"])
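        # Illustrative usage of these routes (endpoint and port depend on the deployment):
        #   GET  .../fetch?socket=output&limit=2  -> {"documents": [...], "status": ..., "queued": N}
        #   POST .../put?connector=input          with the document as the request payload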
17 |
18 | self._queues = {}
19 | self._put_proc = None
20 |
21 | # NOTE: In on_setup, where you create the fetch proc, set config var congestion_limit
22 |
23 | def setup_put(self, proc):
24 | self.log.info("Registering put Processor '%s'." % proc.name)
25 | self._put_proc = proc
26 |
27 | def setup_fetch(self, proc, socket_names=None):
28 | self.log.info("Creating fetch buffers for Processor '%s'." % proc.name)
29 | if isinstance(socket_names, basestring):
30 | socket_names = [socket_names]
31 | for socket_name in proc.sockets:
32 | if not socket_names or socket_name in socket_names:
33 | self._register_callback(proc, socket_name)
34 |
35 | def _register_callback(self, proc, socket_name):
36 | def callback(proc, doc):
37 | queue = self._queues[socket_name]
38 | queue.put(doc)
39 | pass
40 | self._queues[socket_name] = Queue.Queue()
41 | proc.add_callback(callback, socket_name)
42 |
43 | def _put(self, doc, connector_name):
44 | if self._put_proc:
45 | self._put_proc.put(doc, connector_name)
46 |
47 | def _fetch(self, socket_name=None, limit=0):
48 | docs = []
49 | if socket_name and socket_name in self._queues:
50 | queue = self._queues[socket_name]
51 | elif len(self._queues) > 0:
52 | # TODO: Get default socket instead, or error
53 |             queue = self._queues.values()[0]
54 | else:
55 | return ([], -1) # TODO: Or rather an error
56 |
57 | ##print "LIMIT=", limit
58 | while not queue.empty() and (limit == 0 or len(docs) < limit):
59 | ##print "LEN(DOCS)=%d" % len(docs)
60 | doc = queue.get_nowait()
61 | queue.task_done()
62 | if doc:
63 | docs.append(doc)
64 | return (docs, queue.qsize())
65 |
66 | #region Extra service interface methods
67 |
68 | def _mgmt_fetch(self, request_handler, payload, **kwargs):
69 | socket_name = kwargs.get("socket")
70 | limit = kwargs.get("limit") or 0 # 0 = unlimited
71 | limit = int(limit)
72 | ##print "=== KWARGS:", kwargs
73 | ##print "=== LIMIT:", limit
74 | (docs, qsize) = self._fetch(socket_name, limit)
75 | return {"documents": docs, "status": self.status, "queued": qsize}
76 |
77 | def _mgmt_put(self, request_handler, payload, **kwargs):
78 | connector_name = kwargs.get("connector")
79 | doc = payload
80 | self._put(doc, connector_name)
81 |
82 | #endregion Extra service interface methods
83 |
84 | def on_stats(self, stats):
85 | super(RemotingService, self).on_stats(stats)
86 | stats["queued"] = {k:q.qsize() for k,q in self._queues.iteritems()}
87 |
--------------------------------------------------------------------------------
/eslib/procs/SmtpMailer.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Processor import Processor
4 | import smtplib, getpass, platform
5 | from email.mime.text import MIMEText
6 | from eslib.esdoc import tojson
7 |
8 |
9 | class SmtpMailer(Processor):
10 | """
11 | Send incoming document as content to recipients.
12 | Sends mail outgoing on port 25 unless a username/password is specified, in which case
13 | it uses TLS on port 587.
14 | Sender defaults to current executing user if not specified.
15 |
16 | Connectors:
17 | input (*) : Incoming documents to send. Non-string documents are converted to JSON.
18 |
19 | Config:
20 | smtp_server = "localhost"
21 | username = None
22 | password = None
23 | sender = None
24 |         from_name = None : Name to be added to the sender in the From field; becomes: '"from_name" <sender>'
25 |         recipients = [] : List of recipient email addresses (plain addresses; no names, brackets or other fuzz).
26 | subject = None
27 | """
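    # Illustrative configuration (values made up):
    #   SmtpMailer(smtp_server="smtp.example.com", sender="alerts@example.com",
    #              recipients=["ops@example.com"], subject="eslib notification")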
28 | def __init__(self, **kwargs):
29 | super(SmtpMailer, self).__init__(**kwargs)
30 |
31 | self.create_connector(self._incoming, "input", "str", "Email content string.")
32 |
33 | self.config.set_default(
34 | smtp_server = "localhost",
35 | username = None,
36 | password = None,
37 | sender = None,
38 | from_name = None,
39 | recipients = None,
40 | subject = None,
41 | )
42 |
43 | def on_open(self):
44 | self.count = 0
45 |
46 | def _incoming(self, doc):
47 | if not doc or not self.config.recipients or not self.config.sender:
48 | return
49 |
50 | # Convert non-string documents to JSON
51 | content = doc
52 | if not isinstance(doc, basestring):
53 | content = tojson(doc)
54 |
55 | try:
56 | self._mail_text(
57 | self.config.smtp_server,
58 | self.config.recipients,
59 | self.config.subject,
60 | self.config.sender,
61 | self.config.from_name,
62 | content,
63 | self.config.username,
64 | self.config.password)
65 | self.count += 1
66 | except Exception as e:
67 | self.log.exception("Failed to send email.")
68 |
69 |
70 | def _mail_text(self, smtp_server, recipients, subject, sender=None, from_name=None, content=None, username=None, password=None):
71 | msg = MIMEText(content, "plain", "utf-8")
72 |
73 | if not sender:
74 | sender = "@".join((getpass.getuser(), platform.node()))
75 |
76 | message_from = sender if not from_name else '"%s" <%s>' % (from_name, sender)
77 |
78 | msg['Subject'] = subject
79 | msg['From'] = message_from
80 | msg['To'] = ", ".join(recipients)
81 |
82 | s = None
83 | if username or password:
84 | s = smtplib.SMTP(smtp_server, 587)
85 | s.ehlo()
86 | s.starttls()
87 | s.ehlo()
88 | s.login(username, password)
89 | else:
90 | s = smtplib.SMTP(smtp_server or "localhost")
91 |
92 | s.sendmail(sender, recipients, msg.as_string())
93 | s.quit()
94 |
95 |
--------------------------------------------------------------------------------
/eslib/time.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | eslib.time
5 | ~~~~~~~~~~
6 |
7 | Module containing time/date helpers.
8 | """
9 |
10 |
11 | __all__ = ("duration_string", "date2iso", "iso2date", "utcdate", "date_dict", "ago2date")
12 |
13 |
14 | import re, datetime, dateutil, dateutil.parser
15 |
16 |
17 | def duration_string(timediff):
18 | """
19 | :type timediff: datetime.timedelta
20 | :rtype str:
21 | """
22 | secs = timediff.seconds
23 | days = timediff.days
24 | s = secs % 60
25 | m = (secs / 60) % 60
26 | h = (secs / 60 / 60) % 24
27 | return "%d:%02d:%02d" % (days*24+h, m, s)
28 |
29 |
30 | def date2iso(dateobj):
31 | """
32 | Convert datetime object to ISO 8601 string with UTC, e.g. '2014-03-10T23:32:47Z'
33 | :type dateobj: datetime.datetime
34 | :rtype str
35 | """
36 | return dateobj.strftime("%Y-%m-%dT%H:%M:%SZ") # Screw the %.f ...
37 |
38 | def iso2date(isostr):
39 | """
40 |     Convert ISO 8601 string in UTC, e.g. '2014-03-10T23:32:47Z', to datetime object.
41 |     :type isostr: str
42 | :rtype datetime.datetime
43 | """
44 | if isostr is None:
45 | return None
46 | if "." in isostr:
47 | return datetime.datetime.strptime(isostr, "%Y-%m-%dT%H:%M:%S.%fZ")
48 | else:
49 | return datetime.datetime.strptime(isostr, "%Y-%m-%dT%H:%M:%SZ")
50 |
51 | def utcdate(obj):
52 | "Convert string or datetime object to a datetime object in UTC."
53 | dt = None
54 |     if type(obj) is datetime.datetime:
55 |         dt = obj
56 |     else:
57 |         try:
58 |             dt = dateutil.parser.parse(obj)
59 |         except: pass
60 | if dt:
61 | # Convert to UTC time and get rid of the offset
62 | utcoffs = dt.utcoffset()
63 | if utcoffs:
64 | dt = dt - utcoffs
65 | dt = dt.replace(tzinfo=None) #dateutil.tz.tzutc())
66 | return dt
67 |
68 | def date_dict(date):
69 | return {
70 | "year": date.year, "month": date.month, "day": date.day,
71 | "hour": date.hour, "minute": date.minute, "second": date.second,
72 | "weekday": date.isoweekday(), "week": date.isocalendar()[1]
73 | }
74 |
75 |
76 | _agoRegex = re.compile("^(?P<number>\d+)\s*(?P<unit>\w+)( ago)?$")
77 |
78 | def ago2date(ago, from_date_utc=None):
79 | """
80 | Convert 'ago' style time specification string to a datetime object.
81 | Units are s=second, m=minute, h=hour, d=day, w=week, M=month, y=year
82 | :param str ago : "Time ago" as a string.
83 | :param datetime.datetime from_date_utc : Relative time to use instead of 'now'. In UTC.
84 |     :rtype datetime.datetime : The resulting date/time in UTC.
85 | """
86 | m = _agoRegex.match(ago)
87 | if not m:
88 | raise SyntaxError("Illegal 'ago' string: %s" % ago)
89 | number = int(m.group("number"))
90 | unit = m.group("unit")
91 | delta = None
92 | if unit == "s" or unit.startswith("sec") : delta = datetime.timedelta(seconds= number)
93 | elif unit == "m" or unit.startswith("min") : delta = datetime.timedelta(minutes= number)
94 | elif unit == "h" or unit.startswith("hour") : delta = datetime.timedelta(hours= number)
95 | elif unit == "d" or unit.startswith("day") : delta = datetime.timedelta(days= number)
96 | elif unit == "w" or unit.startswith("week") : delta = datetime.timedelta(weeks= number)
97 | elif unit == "M" or unit.startswith("month"): delta = datetime.timedelta(days= number*30)
98 | elif unit == "y" or unit.startswith("year") : delta = datetime.timedelta(days= number*365)
99 | else:
100 | raise SyntaxError("Illegal unit for 'ago' string in: %s" % ago)
101 |     return (from_date_utc or datetime.datetime.utcnow()) - delta
102 |
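# Illustrative examples (not part of the original code):
#   ago2date("3w")                                        -> the UTC datetime three weeks before now
#   ago2date("2 hours", from_date_utc=some_utc_datetime)  -> two hours before the given UTC datetime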
--------------------------------------------------------------------------------
/PROTOCOLS.md:
--------------------------------------------------------------------------------
1 | # Protocols
2 |
3 | This document describes the common protocols for document exchange between terminals (connectors and sockets).
4 |
5 | The name of the protocol is meant as a hint, although keeping track of a common set of protocols would be good.
6 |
7 | ## esdoc
8 |
9 | ### esdoc (general)
10 |
11 | Used by
12 |
13 | ElasticsearchReader.output (socket)
14 | ElasticsearchWriter.input (connector)
15 | ElasticsearchWriter.output (socket)
16 | CsvConverter.output (socket)
17 | HtmlRemover.input (connector)
18 |     HtmlRemover.output (socket)
19 | PatternRemover.input (connector)
20 | PatternRemover.output (socket)
21 |
22 | Format
23 |
24 | _index str
25 | _type str
26 | _id str
27 | _version int
28 | _timestamp str
29 | _source dict # Dict of { field : value }
30 |
31 |
32 | All fields are optional, depending on the case.
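
A minimal example (illustrative values only):

```json
{
  "_index": "twitter",
  "_type": "tweet",
  "_id": "520149420122578944",
  "_version": 1,
  "_timestamp": "2014-10-09T09:51:00Z",
  "_source": { "text": "hello world", "lang": "en" }
}
```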
33 |
34 | ### esdoc.webpage
35 |
36 | Used by
37 |
38 | WebGetter.output (socket)
39 |
40 | Format
41 |
42 | _id str # Using the URL as ID
43 | _type str # "webpage"
44 | _timestamp datetime # When the content was fetched
45 | _source dict of ...
46 | domain str
47 |         requested_by    list     # List of dicts of format [ { what : [ who, ... ] }, ... ]
48 | content str
49 | content_type str
50 | encoding str
51 | date datetime # Web page publishing date as reported by HTTP header
52 |
53 | ### esdoc.4chan
54 |
55 | Used by
56 |
57 | FourChanMonitor.esdoc
58 |
59 | Format
60 |
61 | _id int # Post number at 4chan
62 | _type str # "4chan"
63 | _source
64 | id int # Post number at 4chan
65 | board str # Board id
66 | thread int # Thread id
67 | timestamp int # Time of posting
68 | author str # Name of author, most commonly "Anonymous"
69 | comment str # Text comment
70 | filename str # Filename, with extension
71 | response_to int # Post number this post is a response to. 0 if original posting (i.e. not a response)
72 |
73 |
74 | ## urlrequest
75 |
76 | Used by
77 |
78 | WebGetter.input (connector)
79 |
80 | Format
81 |
82 | url str #
83 | what str # Source requesting the url, e.g. "twitter_mon"
84 | who str # Who requested it, e.g. some user id from the source
85 |
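A request might look like (illustrative values):

```json
{ "url": "http://www.eraliquida.com/?p=1010", "what": "twitter_mon", "who": "2196916282" }
```
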
86 | ## csv
87 |
88 | Used by
89 |
90 | CsvConverter.input (connector)
91 |
92 | Format
93 |
94 | ```csv
95 | "field","field","field","..."
96 | ```
97 |
98 |
99 | ## graph-edge
100 | The graph-edge protocol is simply a dictionary with three mandatory keys,
101 | that together represents an edge.
102 |
103 | Used by
104 | Neo4jWriter.edge (connector)
105 |
106 | Format
107 |
108 | from str # The property-id of the source node
109 | type str # The type of the edge. ("follows", "author", "mention", "quote")
110 | to str # The property-id of the receiving node
111 |
112 | Note that all fields are mandatory.
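
For example, a "follows" edge between two users (illustrative IDs):

```json
{ "from": "2196916282", "type": "follows", "to": "2649736855" }
```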
113 |
114 | ## graph-user
115 |
116 | The graph-user protocol is a dictionary holding properties.
117 |
118 | Used by
119 |
120 | Neo4jWriter.user (connector)
121 | TwitterUserGetter.user (socket)
122 |
123 | Format
124 |
125 | id str
126 | location str #Optional
127 | description str #Optional
128 | screen_name str #Optional
129 | lang str #Optional
130 | name str #Optional
131 | created_at date.isoformat()#Optional
132 |
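An example user (illustrative values):

```json
{
  "id": "2196916282",
  "screen_name": "markfester",
  "name": "mark fester",
  "lang": "en",
  "created_at": "2013-11-26T14:21:35"
}
```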
--------------------------------------------------------------------------------
/bin/es-read:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | from eslib.procs import ElasticsearchReader, FileWriter
6 | import eslib.prog
7 | import eslib.time
8 | import argparse, sys, time
9 |
10 |
11 | def main():
12 | help_i = "Which index to return documents from."
13 | help_t = "Which type of document to return."
14 | help_l = "The maximum number of documents to return. Will by default return all documents."
15 | help_s = "Returns all documents added after SINCE. Specified in the 'ago' format (1d, 3w, 1y, etc)."
16 |     help_b = "Returns all documents added before BEFORE. Specified in the 'ago' format (1d, 3w, 1y, etc)."
17 |     help_tf = "The field that contains the relevant date information. Default 'timefield' to slice on is '_timestamp'."
18 |     help_fi = "Format for filter is, for example: 'category:politicians,party:democrats'."
19 |
20 | parser = argparse.ArgumentParser(usage="\n %(prog)s -i index [-t type] [-f field] [-l limit] [more options]")
21 | parser._actions[0].help = argparse.SUPPRESS
22 | parser.add_argument("-i", "--index" , help=help_i, required=True)
23 | parser.add_argument("-t", "--type" , help=help_t)
24 | parser.add_argument("-l", "--limit" , help=help_l, default=0, type=int)
25 | parser.add_argument("-s", "--since" , help=help_s)
26 | parser.add_argument("-b", "--before" , help=help_b)
27 | parser.add_argument( "--host" , help="Elasticsearch host, format 'host:port' or just 'host'.", default=None)
28 | parser.add_argument( "--timefield", help=help_tf, default="_timestamp")
29 | parser.add_argument( "--filter" , help=help_fi)
30 | parser.add_argument("-v", "--verbose" , action="store_true")
31 | #parser.add_argument( "--debug" , action="store_true")
32 | parser.add_argument( "--name" , help="Process name.", default=None)
33 |
34 | if len(sys.argv) == 1:
35 | parser.print_usage()
36 | sys.exit(0)
37 |
38 | args = parser.parse_args()
39 |
40 | # Time validation conversion and checks
41 | before = None
42 | since = None
43 | if args.before:
44 | try:
45 | before = eslib.time.ago2date(args.before)
46 | except:
47 | print >> sys.stderr, "Illegal 'ago' time format to 'before' argument, '%s'" % args.before
48 | sys.exit(-1)
49 | if args.since:
50 | try:
51 | since = eslib.time.ago2date(args.since)
52 | except:
53 | print >> sys.stderr, "Illegal 'ago' time format to 'since' argument, '%s'" % args.since
54 | sys.exit(-1)
55 |
56 | # Parse filter string
57 | filters = {}
58 | if args.filter:
59 | parts = [{part[0]:part[1]} for part in [filter.split(":") for filter in args.filter.split(",")]]
60 | for part in parts:
61 | filters.update(part)
62 |
63 | # Set up and run this processor
64 | r = ElasticsearchReader(
65 | name = args.name or eslib.prog.progname(),
66 | hosts = [args.host] if args.host else [],
67 | index = args.index,
68 | doctype = args.type,
69 | limit = args.limit,
70 | filters = filters,
71 | since = since,
72 | before = before,
73 | timefield = args.timefield
74 | )
75 |
76 | # if args.debug: r.debuglevel = 0
77 |
78 | verbose_tick_delay = 3.0
79 |
80 | w = FileWriter()
81 | w.subscribe(r)
82 | r.start()
83 | if args.verbose:
84 | # Verbose wait loop
85 | last_tick = time.time()
86 | while r.running:
87 | time.sleep(0.1)
88 | now = time.time()
89 | if (now - last_tick > verbose_tick_delay) or not r.running:
90 | print >> sys.stderr, "Read %d/%d" % (r.count, r.total)
91 | last_tick = now
92 | print >> sys.stderr, "Reading finished; waiting for writer to finish."
93 | w.wait()
94 |
95 |
96 | if __name__ == "__main__": main()
97 |
--------------------------------------------------------------------------------
/examples/remoting/DummyRemotingService.py:
--------------------------------------------------------------------------------
1 | # NOTE:
2 | #
3 | # REMOTING SERVICE IS STILL EXPERIMENTAL (as of when this was written)
4 | #
5 | # This is an example of how to create a service based on the RemotingService.
6 | # See also RemotingClient.py for example of how to call it remotely.
7 | #
8 | # SETUP:
9 | #
10 | # Copy the file to your service "source" directory, and add to the package __init__.py file:
11 | #
12 | # from .DummyRemotingService import DummyRemotingService
13 | # __all__ = (
14 | # "DummyRemotingService"
15 | # )
16 | #
17 | # In the service "config" directory, configure it like
18 | #
19 | # remoting:
20 | # type : "DummyRemotingService"
21 | # frequency : 3
22 | # lifespan : 120
23 |
24 | from eslib.service import RemotingService, PipelineService
25 | from eslib.procs import Timer
26 | from eslib import Processor
27 |
28 |
29 | # Comment on the connectors and sockets below:
30 | # The "command" socket and connector are set to default, so that we can easily create a
31 | # service based on the pipeline service. Then all pipeline processors are linked so that
32 | # start/stop events etc are easily propagated the way we want. The downside to this
33 | # approach is that the socket and connector we want to use from the client will have to
34 | # be named explicitly, as they are not the default ones.
35 | # (Here, as in the client example: client.fetch("output") and client.put("input").)
36 |
37 |
38 | class FetchProc(Processor):
39 | def __init__(self, **kwargs):
40 | super(FetchProc, self).__init__(**kwargs)
41 | self.create_connector(self._incoming, "input")
42 | self.command = self.create_socket("command", is_default=True) # To link easily as pipeline
43 | self.output = self.create_socket("output")
44 | self.num = 0
45 |
46 | def on_open(self):
47 | self.num = 0
48 |
49 | def _incoming(self, doc):
50 | # For each incoming tick, generate one output doc:
51 | self.num += 1
52 |         print "SENDING TO QUEUE:", self.num
53 | self.output.send(self.num)
54 |
55 | class PutProc(Processor):
56 | def __init__(self, **kwargs):
57 | super(PutProc, self).__init__(**kwargs)
58 | self.create_connector(self._command, "command", is_default=True) # To link easily as pipeline
59 | self.create_connector(self._incoming, "input")
60 |
61 | def _command(self, doc):
62 | pass # Down the drain; this is simply for linking
63 |
64 | def _incoming(self, doc):
65 |         print "INCOMING DOC:", doc
66 |
67 |
68 | class DummyRemotingService(RemotingService, PipelineService):
69 |
70 | def __init__(self, **kwargs):
71 | super(DummyRemotingService, self).__init__(**kwargs)
72 |
73 | self.config.set_default(
74 | timer_frequency = 3,
75 | lifespan = 0
76 | )
77 |
78 | def on_configure(self, credentials, config, global_config):
79 | self.config.set(
80 | manager_endpoint = global_config.get("manager_host"),
81 | management_endpoint = config.get("management_endpoint"),
82 |
83 | timer_frequency = config["frequency"],
84 | lifespan = config["lifespan"]
85 | )
86 |
87 | def on_setup(self):
88 | timer = Timer(
89 | service = self,
90 | name = "timer",
91 | actions = [(self.config.timer_frequency, self.config.timer_frequency, "ping")]
92 | )
93 | fetchProc = FetchProc(
94 | service = self,
95 | name = "fetchProc",
96 | )
97 | putProc = PutProc(
98 | service = self,
99 | name = "putProc"
100 | )
101 |
102 | procs = [timer, fetchProc, putProc]
103 | self.link(*procs)
104 |
105 | self.setup_put(putProc)
106 | self.setup_fetch(fetchProc, "output")
107 |
108 | return True
109 |
--------------------------------------------------------------------------------
/eslib/procs/HtmlRemover.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Processor import Processor
4 | from .. import esdoc
5 | from eslib.text import remove_html
6 |
7 | class HtmlRemover(Processor):
8 | """
9 | Remove HTML tags and unescape HTML escapings.
10 |
11 | Connectors:
12 | input (esdoc) (default) : Incoming document in 'esdoc' dict format.
13 | str (str) : Incoming document of type 'str' or 'unicode'.
14 | Sockets:
15 | output (esdoc) (default) : Output of documents that arrived on 'input' connector.
16 | str (str) : Output of documents that arrived on 'str' connector.
17 |
18 | Config:
19 |         source_field    = "text"    : Field containing the text to clean.
20 | target_field = None : Defaults to 'source_field', replacing the input field.
21 | field_map = {} : A dict of fields to use as { source : target }.
22 | If specified, this *replaces* the source_field and target_field pair!
23 | strip = True : Remove boundary spaces and double spaces, commonly left after a removal.
24 | """
25 |
26 | def __init__(self, **kwargs):
27 | super(HtmlRemover, self).__init__(**kwargs)
28 |
29 | m = self.create_connector(self._incoming_esdoc, "input", "esdoc", "Incoming 'esdoc'.", is_default=True)
30 | self.create_connector(self._incoming_str , "str" , "str" , "Incoming document of type 'str' or 'unicode'.")
31 | self.output_esdoc = self.create_socket("output" , "esdoc" , "Outgoing, cleaned, 'esdoc'.", is_default=True, mimic=m)
32 | self.output_str = self.create_socket("str" , "str" , "Outgoing, cleaned, 'str'.")
33 |
34 | self.config.set_default(
35 | source_field = "text",
36 | target_field = None,
37 | field_map = {},
38 | strip = True
39 | )
40 |
41 | self._regexes = []
42 | self._field_map = {}
43 |
44 | def on_open(self):
45 | # Create field map
46 | self._field_map = self.config.field_map or {}
47 | if not self._field_map:
48 | if not self.config.source_field:
49 | raise ValueError("Neither field_map nor source_field is configured.")
50 | self._field_map[self.config.source_field] = (self.config.target_field or self.config.source_field)
51 |
52 |
53 | def _clean_text(self, text):
54 | text = remove_html(text)
55 | if self.config.strip:
56 | text = text.strip().replace(" ", " ")
57 | return text
58 |
59 | def _clean(self, doc):
60 |
61 | if not doc:
62 | return doc
63 |
64 | # This makes this method work also for 'str' and 'unicode' type documents; not only for the expected 'esdoc' protocol (a 'dict').
65 | if type(doc) in [str, unicode]:
66 | cleaned = self._clean_text(doc)
67 | return cleaned
68 | elif not type(doc) is dict:
69 | self.doclog.debug("Unsupported document type '%s'." % type(doc))
70 | return doc
71 |
72 | source = doc.get("_source")
73 | if not source:
74 | return doc # Missing source section; don't do anything
75 |
76 | for source_field, target_field in self._field_map.iteritems():
77 | text = esdoc.getfield(source, source_field)
78 | if text and type(text) in [str, unicode]:
79 | cleaned = self._clean_text(text)
80 | if cleaned != text:
81 | # Note: This may lead to a few strictly unnecessary shallow clonings...
82 | doc = esdoc.shallowputfield(doc, "_source." + target_field, cleaned)
83 | return doc
84 |
85 | def _incoming_esdoc(self, doc):
86 | if self.output_esdoc.has_output:
87 | self.output_esdoc.send(self._clean(doc))
88 |
89 | def _incoming_str(self, doc):
90 | if self.output_str.has_output:
91 | self.output_str.send(self._clean(doc))
92 |
--------------------------------------------------------------------------------
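A minimal usage sketch for the HtmlRemover above, following the constructor-kwargs pattern used in the unit tests. The field names are hypothetical, and the internal _clean() is called directly instead of wiring up sockets:

    from eslib.procs.HtmlRemover import HtmlRemover

    p = HtmlRemover(field_map={"title": "title_clean"})
    p.on_open()
    doc = {"_source": {"title": "<b>Hello &amp; goodbye</b>"}}
    cleaned = p._clean(doc)
    print cleaned["_source"]["title_clean"]   # expected: "Hello & goodbye"
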
/eslib/procs/TweetExtractor.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Processor import Processor
4 |
5 |
6 | class TweetExtractor(Processor):
7 | """
8 | Extract properties from a tweet to different sockets: 'user' and 'link'.
9 |
10 | Protocols:
11 |
12 | esdoc.tweet:
13 |
14 | # TODO
15 |
16 | graph-edge:
17 |
18 | from str : User ID.
19 | type str : Relation, one of "author", "mention", "quote".
20 | to str : User ID.
21 |
22 | urlrequest:
23 |
24 | url str
25 | what str : e.g. "twitter_mon"
26 | who str : e.g. some user id
27 |
28 | Sockets:
29 | tweet (esdoc.tweet) (default) : Tweet
30 | text (str) : Only the text from the tweet.
31 | link (urlrequest) : Link from the tweet, for potential follow-up.
32 | user (graph-edge) : Info about author, mentioned or retweeted users from the tweet.
33 |
34 | Config:
35 | drop_retweets = True : Do not report tweets from retweets if set. User relation "quote" will still be reported.
36 | """
37 |
38 | RELATION_AUTHOR = "author"
39 | RELATION_RETWEET = "quote"
40 | RELATION_MENTION = "mention"
41 |
42 |
43 | def __init__(self, **kwargs):
44 | super(TweetExtractor, self).__init__(**kwargs)
45 |
46 | self.create_connector(self._incoming, "tweet", "esdoc.tweet", "Tweet.");
47 |
48 | self.output_tweet = self.create_socket("tweet" , "esdoc.tweet" , "Tweet.", is_default=True)
49 | self.output_text = self.create_socket("text" , "str" , "Only the text from the tweet.")
50 | self.output_link = self.create_socket("link" , "urlrequest" , "Link from the tweet, for potential follow-up.")
51 | self.output_user = self.create_socket("user" , "graph-edge" , "Info about author, mentioned or retweeted users from the tweet.")
52 |
53 | self.config.set_default(
54 | drop_retweets = True
55 | )
56 |
57 | def _incoming(self, doc):
58 |
59 | if not doc or not type(doc) is dict or not self.has_output:
60 | return
61 |
62 | tweet, users, links = self._extract(doc)
63 | if tweet:
64 | self.output_tweet.send(tweet)
65 | self.output_text.send(tweet["_source"]["text"])
66 | for user in users:
67 | self.output_user.send(user)
68 | for link in links:
69 | self.output_link.send(link)
70 |
71 | def _extract(self, tweet):
72 | "Return a tuple of (tweet, users, links)."
73 |
74 | users = []
75 | links = []
76 |
77 | source = tweet["_source"] # Always present
78 |
79 | # Add author to 'users' list
80 | user_id = source["user"]["id"] # Always present
81 | users.append({"from": user_id, "type": self.RELATION_AUTHOR, "to": user_id})
82 |
83 | # Retweets
84 | retweet_user_id = source.get("retweet_user_id")
85 | if retweet_user_id:
86 | # Find out who has been retweeted:
87 | # Add retweet to 'users' list
88 | users.append({"from": user_id, "type": self.RELATION_RETWEET, "to": retweet_user_id})
89 | if self.config.drop_retweets:
90 | return (None, users, links)
91 |
92 | # URLs and mentions from entities
93 | entities = source.get("entities")
94 | if entities:
95 | # Get URLs
96 | urls = entities.get("urls")
97 | if urls:
98 | for url in urls:
99 | # Add to "links" list:
100 | links.append({
101 | "url" : url["url"],
102 | "what": "twitter", # TODO: Maybe use self.name instead?
103 | "who" : user_id
104 | })
105 | # Get user mentions
106 | user_mentions = entities.get("user_mentions")
107 | if user_mentions:
108 | for m in user_mentions:
109 | # Add relation to 'users' list:
110 | users.append({"from": user_id, "type": self.RELATION_MENTION, "to": m["id"]})
111 |
112 | return (tweet, users, links)
113 |
--------------------------------------------------------------------------------
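A worked example of what _extract() in the TweetExtractor above returns. The tweet below is a hypothetical skeleton containing only the fields the extractor reads:

    from eslib.procs.TweetExtractor import TweetExtractor

    p = TweetExtractor()
    tweet = {"_source": {
        "text": "hello @world http://example.com",
        "user": {"id": "1"},
        "entities": {
            "urls": [{"url": "http://example.com"}],
            "user_mentions": [{"id": "2"}]
        }
    }}
    doc, users, links = p._extract(tweet)
    # users == [{"from": "1", "type": "author", "to": "1"},
    #           {"from": "1", "type": "mention", "to": "2"}]
    # links == [{"url": "http://example.com", "what": "twitter", "who": "1"}]
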
/eslib/procs/Neo4jReader.py:
--------------------------------------------------------------------------------
1 | __author__ = 'mats'
2 |
3 | from ..Generator import Generator
4 | from .neo4j import Neo4j
5 |
6 | from itertools import izip
7 | import time, logging
8 |
9 | class Neo4jReader(Generator):
10 | """
11 |     The purpose of this processor is to ask Neo4j whether a node with a given
12 |     user id has its full set of properties.
13 | 
14 |     It takes an id and determines whether or not it has its properties set.
15 |     If it lacks properties, the id is output on the 'missing' socket.
16 |
17 | Connectors:
18 | id (str) : Incoming IDs to check.
19 | Sockets:
20 |         missing    (str)     : Outputs IDs that lack properties.
21 |
22 | Config:
23 | batchsize = 20 : How many IDs to gather up before making a call to Neo4j.
24 | batchtime = 5.0 : How many seconds to wait before we send a batch if it is not full.
25 | host = localhost: The host we should connect to
26 | port = 7474 : The default neo4j port
27 |
28 | """
29 |
30 | def __init__(self, **kwargs):
31 | super(Neo4jReader, self).__init__(**kwargs)
32 | self.create_connector(self._incoming_id, "id", "str", "Incoming IDs to check.")
33 | self._missing = self.create_socket("missing", "str", "Outputs IDs that lack properties.")
34 |         #self._missing = self.create_socket("output", "???", "Outputs data retrieved, one document per ID.")
35 |
36 | self.config.set_default(
37 | batchsize = 20,
38 | batchtime = 5.0,
39 | host = "localhost",
40 | port = 7474
41 | )
42 |
43 | self._neo4j = None
44 |
45 | self._queue = []
46 | self._last_get = time.time()
47 | self._has_properties = set([])
48 |
49 | #TODO: Could place this in Neo4jBase
50 | def on_open(self):
51 | """
52 |         Instantiates a neo4j-instance and connects to the server.
53 |
54 | Raises:
55 | - ConnectionError if neo4j can't contact its server
56 |
57 | """
58 |
59 | # TODO: Need logging, request timeout and exception handling down there:
60 | self.log.debug("Connecting to Neo4j.")
61 | self._neo4j = Neo4j(host=self.config.host, port=self.config.port)
62 | self.log.status("Connected to Neo4j on %s:%d." % (self.config.host, self.config.port))
63 |
64 | def _incoming_id(self, id_):
65 | """
66 | Takes an incoming id, gets the correct query string from self.neo4j,
67 | before appending the query to self._queue
68 | """
69 | if id_ not in self._has_properties:
70 | query = self._neo4j.get_node_query_if_properties(id_)
71 | self._queue.append((id_, query))
72 |
73 | def on_tick(self):
74 | """
75 |         Commit items in the queue if the queue exceeds batchsize or it has been
76 |         too long since the last commit.
77 | """
78 | if ((len(self._queue) >= self.config.batchsize) or
79 | (time.time() - self._last_get > self.config.batchtime and self._queue)):
80 | self._get()
81 |
82 | def on_shutdown(self):
83 | """ Get rid of rest of queue before shutting down. """
84 | while self._queue:
85 | self._get()
86 |
87 | def _get(self):
88 | num_elem = len(self._queue)
89 | if num_elem > self.config.batchsize:
90 | num_elem = self.config.batchsize
91 |
92 | ids, queries = [list(t)
93 | for t in
94 | izip(*self._queue[:num_elem])]
95 | rq = self._neo4j._build_rq(queries)
96 | resp = self._neo4j.commit(rq)
97 | self.log.debug("Asking neo4j for %i users." % num_elem)
98 | self._queue = self._queue[num_elem:]
99 | self._last_get = time.time()
100 | self._write_uids(ids, resp)
101 |
102 | def _write_uids(self, ids, resp):
103 | """
104 | Outputs the ids of the nodes in the resp-object to a socket.
105 |
106 | Args:
107 | ids: The ids that corresponds to a query
108 | resp: a requests-module response object with neo4j-nodes in 'graph'-
109 | format.
110 | """
111 | for uid, result in izip(ids, resp.json()["results"]):
112 | if not result["data"]:
113 | self._missing.send(uid)
114 | if self.doclog.isEnabledFor(logging.TRACE):
115 | self.doclog.trace("uid %s does not have properties" % uid)
116 | else:
117 | self._has_properties.add(uid)
118 |
--------------------------------------------------------------------------------
/eslib/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | eslib
5 | ~~~~~
6 |
7 | Document processing library for Elasticsearch.
8 | """
9 |
10 | __version__ = "0.0.1"
11 | __author__ = "Hans Terje Bakke"
12 |
13 |
14 | from .Terminal import TerminalProtocolException, Terminal
15 | from .TerminalInfo import TerminalInfo
16 | from .Connector import Connector
17 | from .Socket import Socket
18 | from .Processor import Processor
19 | from .Generator import Generator
20 | from .Monitor import Monitor
21 | from .Configurable import Configurable, Config
22 |
23 |
24 | __all__ = (
25 | "TerminalProtocolException",
26 | "Terminal",
27 | "TerminalInfo",
28 | "Connector",
29 | "Socket",
30 | "Processor",
31 | "Generator",
32 | "Monitor",
33 | "Configurable",
34 | "Config",
35 |
36 | "unique"
37 | )
38 |
39 | #region Core stuff
40 |
41 | def unique(seq, idfun=None):
42 | # order preserving
43 | if idfun is None:
44 | def idfun(x): return x
45 | seen = {}
46 | result = []
47 | for item in seq:
48 | marker = idfun(item)
49 | if marker in seen: continue
50 | seen[marker] = 1
51 | result.append(item)
52 | return result
53 |
54 | #endregion
55 |
56 |
57 | #region Encoding of stdin/stdout
58 |
59 | import sys, codecs
60 |
61 | # Fix stdin and stdout encoding issues
62 | _encoding_stdin = sys.stdin.encoding or "UTF-8"
63 | _encoding_stdout = sys.stdout.encoding or _encoding_stdin
64 | #sys.stdin = codecs.getreader(_encoding_stdin)(sys.stdin)
65 | sys.stdout = codecs.getwriter(_encoding_stdout)(sys.stdout)
66 |
67 | #endregion Encoding of stdin/stdout
68 |
69 | #region Logging stuff
70 |
71 | import logging
72 | import logging.config
73 |
74 | class _ExtendedLogger(logging.getLoggerClass()):
75 | def makeRecord(self, name, level, fn, lno, msg, args, exc_info, func=None, extra=None):
76 | rec = logging.LogRecord(name, level, fn, lno, msg, args, exc_info, func)
77 |
78 | rec.serviceName = self.serviceName if hasattr(self, 'serviceName') else None
79 | rec.className = self.className if hasattr(self, 'className') else None
80 | rec.instanceName = self.instanceName if hasattr(self, 'instanceName') else None
81 |
82 | rec.firstName = name.split(".")[0]
83 | rec.lastName = name.split(".")[-1]
84 | rec.names = name.split(".")
85 |
86 | return rec
87 |
88 | logging.setLoggerClass(_ExtendedLogger)
89 |
90 |
91 | def _log_status(self, message, *args, **kws):
92 | if self.isEnabledFor(logging.STATUS):
93 | self._log(logging.STATUS, message, args, **kws)
94 |
95 | def _log_verbose(self, message, *args, **kws):
96 | if self.isEnabledFor(logging.VERBOSE):
97 | self._log(logging.VERBOSE, message, args, **kws)
98 |
99 | def _log_trace(self, message, *args, **kws):
100 | if self.isEnabledFor(logging.TRACE):
101 | self._log(logging.TRACE, message, args, **kws)
102 |
103 | def _log_debug_n(self, n, message, *args, **kws):
104 | candidate = logging.DEBUG - n
105 | loglevel = min(max(candidate, logging.TRACE+1), logging.DEBUG)
106 | if self.isEnabledFor(loglevel):
107 | self._log(loglevel, message, args, **kws)
108 |
109 | logging.STATUS = 25
110 | logging.VERBOSE = 15
111 | logging.TRACE = 1
112 |
113 | logging.addLevelName(logging.STATUS , "STATUS")
114 | logging.addLevelName(logging.VERBOSE, "VERBOSE")
115 | logging.addLevelName(logging.TRACE , "TRACE")
116 | for n in range(1,9):
117 | logging.addLevelName(logging.DEBUG -n, "DEBUG-%s" % n)
118 |
119 | logging.Logger.status = _log_status
120 | logging.Logger.verbose = _log_verbose
121 | logging.Logger.trace = _log_trace
122 | logging.Logger.debugn = _log_debug_n
123 |
124 | #endregion Logging stuff
125 |
126 | #region Config stuff
127 |
128 | import os, yaml
129 | from . import esdoc
130 |
131 | def get_credentials(path=None, service_dir=None, credentials_file=None):
132 | service_dir = service_dir or os.environ.get("ESLIB_SERVICE_DIR")
133 | if not service_dir:
134 | raise ValueError("Neither service_dir given nor ESLIB_SERVICE_DIR set.")
135 | dir = os.path.join(service_dir, "config")
136 |
137 | file_path = None
138 | if not credentials_file:
139 | credentials_file = "credentials.yaml"
140 |
141 | if os.path.basename(credentials_file) == credentials_file:
142 | # Pick from dir
143 | file_path = os.path.join(dir, credentials_file)
144 | else:
145 | # Use absolute path
146 | file_path = os.path.expanduser(credentials_file)
147 |
148 | # Load credentials file
149 | with open(file_path, "r") as f:
150 | credentials = yaml.load(f)
151 |
152 | if not path:
153 | return credentials
154 | else:
155 | return esdoc.getfield(credentials, path)
156 |
157 | #endregion
158 |
--------------------------------------------------------------------------------
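Two small usage notes for the module above (illustrative only): unique() preserves order while dropping duplicates, and the extended logger gains status/verbose/trace/debugn methods once eslib is imported:

    import logging
    import eslib

    print eslib.unique([3, 1, 3, 2, 1])                # -> [3, 1, 2]
    print eslib.unique(["aa", "b", "cc"], idfun=len)   # -> ["aa", "b"] (keyed by length)

    logging.basicConfig(level=logging.TRACE)           # TRACE is defined when eslib is imported
    log = logging.getLogger("example")
    log.status("Service is up.")                       # level 25 (between INFO and WARNING)
    log.verbose("More detail.")                        # level 15
    log.trace("Very chatty.")                          # level 1
    log.debugn(2, "Two levels below DEBUG.")           # level DEBUG - 2
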
/eslib/procs/CsvConverter.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | import csv, codecs
4 | from ..Processor import Processor
5 |
6 | class CsvConverter(Processor):
7 | """
8 | Convert csv input to Elasticsearch document format.
9 | Field names can be explicitly entered or derived from the first line of input,
10 |     assuming the first line contains column names. When explicitly specified, only the columns entered
11 |     will be used; the others will be ignored. When derived, all columns are used.
12 |
13 | NOTE: Fields, including column headers, must not have any spacing between delimiters and quotes.
14 |
15 | NOTE: Fields that are mapped to meta fields ('_id', '_index', '_type') will not be part of the '_source'.
16 |
17 | Connectors:
18 | input (csv) : Document in 'csv' format. First document is optionally column list.
19 | Sockets:
20 | output (esdoc) : Documents converted from 'csv' to 'esdoc' format.
21 |
22 | Config:
23 | index = None : Override '_index' meta field with this value.
24 | doctype = None : Override '_type' meta field with this value.
25 | columns = None : List of columns to pick from the CSV input. Use None for columns to ignore.
26 |         skip_first_line = False : Skip first line of the input. (Typically column headers you don't want.)
27 | delimiter = "," : CSV column delimiter character.
28 |
29 | id_field = "_id" : Name of field to map to meta field '_id'.
30 | index_field = "_index" : Name of field to map to meta field '_index'.
31 | type_field = "_type" : Name of field to map to meta field '_type'.
32 | """
33 |
34 | def __init__(self, **kwargs):
35 | super(CsvConverter, self).__init__(**kwargs)
36 | self.create_connector(self._incoming, "input", "csv", "Document in 'csv' format. First document is optionally column list.")
37 | self.output = self.create_socket("output", "esdoc", "Documents converted from 'csv' to 'esdoc' format.")
38 |
39 | self.config.set_default(
40 | index = None,
41 | doctype = None,
42 | columns = None,
43 | skip_first_line = False,
44 | delimiter = ",",
45 |
46 | id_field = "_id",
47 | index_field = "_index",
48 | type_field = "_type"
49 | )
50 |
51 | self._columns = []
52 | self._first_line_processed = False
53 |
54 |
55 | def on_open(self):
56 | # Sanity check:
57 | if self.config.skip_first_line and not self.config.columns:
58 |             raise Exception("'skip_first_line' is set but 'columns' is not specified. Unable to determine which fields to include.")
59 |
60 | self._first_line_processed = False
61 | self._columns = self.config.columns or []
62 |
63 | def _incoming(self, line):
64 | # Check if we should skip first line or use it as column definitions (columns)
65 | if not self._first_line_processed:
66 | self._first_line_processed = True
67 | if self.config.skip_first_line:
68 | return
69 | if not self._columns:
70 |                 # Not told to skip the first line and no column list given; assume the first line contains the column headings.
71 | for csvrow in csv.reader([line], delimiter=self.config.delimiter):
72 | self._columns = csvrow
73 | return
74 |
75 | # Pick the only line. Since csv does not support unicode, we do this little encoding massage:
76 | raw_line = codecs.encode(line, "UTF-8")
77 | raw_csvrow = csv.reader([raw_line], delimiter=self.config.delimiter).next()
78 | csvrow = [codecs.decode(x, "UTF-8") for x in raw_csvrow]
79 |
80 | if not len(self._columns) == len(csvrow):
81 | self.doclog.warning("Column count does not match number of fields. Aborting. Row =\n%s" % csvrow)
82 | self.abort() # NOTE: We might want to continue processing, or we might not...
83 |
84 | doc = {}
85 | id = None
86 | index = None
87 | doctype = None
88 | for i in range(len(self._columns)):
89 | if not self._columns[i]:
90 | continue # Skip non-specified fields
91 | elif self._columns[i] == self.config.id_field:
92 | id = csvrow[i]
93 | elif self._columns[i] == self.config.index_field: # Override index
94 | index = csvrow[i]
95 | elif self._columns[i] == self.config.type_field: # Override doctype
96 | doctype = csvrow[i]
97 | else:
98 | doc.update({self._columns[i]: csvrow[i]})
99 |
100 | # Convert to Elasticsearch type document
101 | esdoc = {"_index":self.config.index or index, "_type":self.config.doctype or doctype, "_id":id, "_source":doc}
102 |
103 | self.output.send(esdoc)
104 |
--------------------------------------------------------------------------------
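A worked example of the column-to-field mapping performed by the CsvConverter above; the column names and config values are hypothetical:

    # Input row:   1,"hans terje","bakke","htb"
    # columns:     ["_id", "first", "last", None]   (the 4th column is ignored)
    # config:      index="people", doctype="person"
    #
    # Document sent on the 'output' socket:
    # {"_index": "people", "_type": "person", "_id": "1",
    #  "_source": {"first": "hans terje", "last": "bakke"}}
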
/eslib/Connector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import
4 |
5 | from .Terminal import Terminal
6 | import Queue
7 | import threading
8 | import time
9 |
10 |
11 | class Connector(Terminal):
12 |
13 | def __init__(self, name, protocol=None, method=None):
14 | self.sleep = 0.1 #0.001 # Check for data in incoming queue this often (then burst through as much as possible)
15 |
16 | super(Connector, self).__init__(name, protocol)
17 | self.type = Connector
18 | self.queue = Queue.Queue()
19 | self.method = method
20 |
21 | # Execution control status
22 | self._thread = None
23 | self.accepting = False
24 | self.stopping = False
25 | self.running = False
26 | self.suspended = False
27 | self.aborted = False
28 |
29 | #region Queue management
30 |
31 | def _clear(self):
32 | "Clear the queue."
33 | while not self.queue.empty():
34 | self.queue.get_nowait()
35 | self.queue.task_done()
36 |
37 | @property
38 | def pending(self):
39 | "Report number of pending items in queue."
40 | return self.queue.qsize()
41 |
42 | def _process(self):
43 | "Grab item from queue and call the pre-registered method on it."
44 | if not self.queue.empty():
45 | document = self.queue.get_nowait()
46 | self.queue.task_done()
47 | if document:
48 | if self.method:
49 | try:
50 | self.method(document)
51 | except Exception as e:
52 | msg = "Unhandled exception in processor '%s' func '%s' while processing a document." % (self.owner.name, self.method.__name__)
53 | self.owner.doclog.exception(msg)
54 | self.owner.log.exception(msg)
55 |
56 | def receive(self, document):
57 | "Put document on the incoming queue for this connector. Called by sockets."
58 | if self.accepting:
59 | self.queue.put(document) # Infinite queue, so it should never block
60 |
61 | #endregion Queue management
62 |
63 | #region Operation management
64 |
65 | def _run(self):
66 | while self.running:
67 | if self.sleep:
68 | time.sleep(self.sleep)
69 | if not self.running:
70 | break
71 | if self.stopping and (self.suspended or self.queue.empty()):
72 | # Notify owner that we are finished stopping
73 | self.owner.production_stopped()
74 | # Now we can finally stop
75 | self.stopping = False
76 | self.running = False
77 | elif not self.suspended:
78 | while self.running and not self.suspended and not self.queue.empty():
79 | self._process()
80 |
81 | # Clean out the queue (in case we just aborted)
82 | self._clear()
83 | self.stopping = False # In case we were stopping while aborted
84 |
85 | # Note: The reason for the split of run() and accept_incoming():
86 | # The entire system should first be accepting data before the individual
87 | # components start processing. When processing, a document is passed on
88 | # through sockets to listening connectors. If those connectors are not yet
89 | # accepting new items on their queues, incoming items will be dropped (i.e.
90 |     # not put on the queue), and we would potentially lose the first items
91 | # during start-up.
92 |
93 | def run(self):
94 | "Should be called after all connectors in the system accept incoming data."
95 | if self.running:
96 | raise Exception("Connector is already running.")
97 | if not self.accepting:
98 | raise Exception("Connector is not accepting input before call to run(). Call accept_incoming() on all connectors in the system first.")
99 |
100 | self.aborted = False
101 | self.stopping = False
102 | self.suspended = False
103 | self.running = True
104 |
105 | self._thread = threading.Thread(target=self._run)
106 | self._thread.start()
107 |
108 | def accept_incoming(self):
109 | "Should be called for all connectors in the system before processes start running and processing!"
110 | if self.stopping:
111 |             raise Exception("Connector is stopping. Refusing to accept new incoming data until fully stopped.")
112 | self.accepting = True
113 |
114 | def stop(self):
115 | self.accepting = False
116 | self.stopping = True # We must wait for items in the queue to be processed before we finally stop running
117 | if self._thread and self._thread.isAlive():
118 | try:
119 | self._thread.join() # NOTE: Are we sure we want to wait for this ??
120 | except:
121 | pass # Ignore
122 | self._thread = None
123 |
124 | def abort(self):
125 | self.aborted = True
126 | self.accepting = False
127 | self.running = False # Run loop will stop immediately
128 |
129 | def suspend(self):
130 | self.suspended = True
131 |
132 | def resume(self):
133 | self.suspended = False
134 |
135 | #endregion Operation management
136 |
--------------------------------------------------------------------------------
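A sketch of the start-up order the note in Connector above describes; 'all_connectors' is a hypothetical list holding every connector of every processor in a wired-up system:

    all_connectors = []   # hypothetical: collected from all processors in the system

    for conn in all_connectors:
        conn.accept_incoming()   # 1) everyone starts accepting (queueing) first ...
    for conn in all_connectors:
        conn.run()               # 2) ... only then do the worker threads start processing
    # ... later, on shutdown:
    for conn in all_connectors:
        conn.stop()              # waits for each queue to drain before stopping
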
/eslib/procs/RabbitmqWriter.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Processor import Processor
4 | from .RabbitmqBase import RabbitmqBase
5 | from ..esdoc import tojson
6 | import time
7 |
8 |
9 | class RabbitmqWriter(Processor, RabbitmqBase):
10 | """
11 | Write data to RabbitMQ.
12 | Writes data with type 'str', 'unicode', 'int', or 'float'. Lists and dicts are written as 'json'.
13 | Other types are cast to 'str'.
14 | The 'type' registered with the metadata is then either 'str', 'unicode', 'int', 'float' or 'json'.
15 |
16 | Connectors:
17 | input (*) : Document to write to configured RabbitMQ.
18 |
19 | Config:
20 | host = localhost :
21 | port = 5672 :
22 | admin_port = 15672 :
23 | username = guest :
24 | password = guest :
25 | virtual_host = None :
26 | exchange = None : If specified, data is written to this 'exchange', and also
27 | persisted on a durable queue '_shared'. Clients can
28 | ask to listen to the exchange on this queue ('consumable'
29 | behaviour, the default), or to listen to a live stream on an
30 | exclusive queue that is a copy of all data meant only for that
31 | listener. Clients connected to the shared queue will consume data
32 | from it, thus splitting workload (intended) or competing for the
33 | same data (unintended).
34 | queue = "default" : Not used if 'exchange' is specified.
35 | persisting = True : When this is on, the exchange will store data in a queue until it
36 | is consumed by a consuming monitor. Otherwise, data will only be
37 | queued if there is a listener.
38 | max_reconnects = 3 :
39 | reconnect_timeout = 3 :
40 | max_queue_size = 100000 : If the output queue exceeds this number, this processor is considered congested.
41 | """
42 |
43 | MAX_CONNECTOR_QUEUE_SIZE = 10000
44 | CHECK_QUEUE_INTERVAL = 5 # 5 seconds; how often to check whether the message queue is "congested"
45 |
46 | _is_reader = False # This is a writer
47 |
48 | def __init__(self, **kwargs):
49 | super(RabbitmqWriter, self).__init__(**kwargs)
50 |
51 | self._connector = self.create_connector(self._incoming, "input", None, "Document to write to configured RabbitMQ.")
52 |
53 | self.config.set_default(
54 | persisting = True,
55 | max_queue_size = 100000
56 | )
57 |
58 | self._last_check_queue_time = 0
59 | self._last_known_queue_size = 0
60 |
61 |
62 | def on_open(self):
63 | self._last_check_queue_time = 0
64 | self._last_known_queue_size = 0
65 |
66 | self.count = 0
67 | self._open_connection()
68 | self.log.info("Connected to RabbitMQ.")
69 |
70 | def on_close(self):
71 | if self._close_connection():
72 | self.log.info("Connection to RabbitMQ closed.")
73 |
74 | def _incoming(self, document):
75 | if document == None:
76 | return
77 |
78 | data = None
79 | msg_type = None
80 | if isinstance(document, basestring):
81 | data = document
82 | msg_type = type(document).__name__
83 | elif isinstance(document, (int, long, float)):
84 | data = str(document)
85 | msg_type = type(document).__name__
86 | elif isinstance(document, (list, dict)):
87 | try:
88 | data = tojson(document)
89 | except TypeError as e:
90 | self.doclog.error("JSON serialization failed: %s" % e.message)
91 | return
92 | msg_type = "json"
93 | else:
94 | data = str(document)
95 | msg_type = "str" #type(document).__name__
96 | self.doclog.warning("Writing document of unsupported type '%s' as type 'str'." % type(document).__name__)
97 |
98 | if self._publish(msg_type, data):
99 | self.count += 1
100 |
101 | def is_congested(self):
102 | if super(RabbitmqWriter, self).is_congested():
103 | return True
104 | if self._connector.queue.qsize() > self.MAX_CONNECTOR_QUEUE_SIZE:
105 | return True
106 | elif not self.config.exchange or self.config.persisting:
107 | if self.config.max_queue_size:
108 | now = time.time()
109 | if now - self._last_check_queue_time > self.CHECK_QUEUE_INTERVAL:
110 | try:
111 | self._last_known_queue_size = self.get_queue_size()
112 | except Exception as e:
113 | self.log.warning("Failed to get queue size for queue '%s': %s" % (self._queue_name, e))
114 | self._last_check_queue_time = now
115 |
116 | if self._last_known_queue_size > self.config.max_queue_size:
117 | return True
118 |
119 | return False
120 |
--------------------------------------------------------------------------------
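For reference, the type mapping performed by _incoming() in the RabbitmqWriter above, shown with hypothetical inputs:

    # document             -> payload written      -> registered message type
    # "plain text"         -> "plain text"         -> "str"
    # u"unicode text"      -> u"unicode text"      -> "unicode"
    # 42                   -> "42"                 -> "int"
    # 3.14                 -> "3.14"               -> "float"
    # {"a": 1} or [1, 2]   -> JSON string          -> "json"
    # anything else        -> str(document)        -> "str" (with a doclog warning)
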
/test/test_protocol_compliance.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from eslib import Processor, Terminal, Connector, Socket
3 |
4 | class TestProtocolCompliance(unittest.TestCase):
5 |
6 | # TEST mimic / passthrough protocols
7 |
8 | def test_protocol_equal(self):
9 | s = Socket("sock_a", "proto_a")
10 | c = Connector("conn_a", "proto_a")
11 | self.assertTrue(Terminal.protocol_compliance(s, c))
12 |
13 | def test_protocol_not_equal(self):
14 | s = Socket("sock_a", "proto_b")
15 | c = Connector("conn_a", "proto_a")
16 | self.assertFalse(Terminal.protocol_compliance(s, c))
17 |
18 | def test_protocol_general_accepts_special(self):
19 | s = Socket("sock_a", "general.special")
20 | c = Connector("conn_a", "general")
21 | self.assertTrue(Terminal.protocol_compliance(s, c))
22 |
23 | def test_protocol_special_too_strict_for_general(self):
24 | s = Socket("sock_a", "general")
25 | c = Connector("conn_a", "general.special")
26 | self.assertFalse(Terminal.protocol_compliance(s, c))
27 |
28 | def test_protocol_any_any(self):
29 | s = Socket("sock_a", None)
30 | c = Connector("conn_a", None)
31 | self.assertTrue(Terminal.protocol_compliance(s, c))
32 |
33 | def test_protocol_any_sock(self):
34 | s = Socket("sock_a", None)
35 | c = Connector("conn_a", "x")
36 | self.assertTrue(Terminal.protocol_compliance(s, c))
37 |
38 | def test_protocol_any_conn(self):
39 | s = Socket("sock_a", "x")
40 | c = Connector("conn_a", None)
41 | self.assertTrue(Terminal.protocol_compliance(s, c))
42 |
43 | def test_protocol_mimic(self):
44 | a_s = Socket ("sock_a", "esdoc.tweet")
45 | b_c = Connector("conn_b", "esdoc")
46 | b_s = Socket ("sock_b", "esdoc", mimic=b_c) # Should end up mimicing 'esdoc.tweet' from a_s if connected
47 | c_c = Connector("conn_c", "esdoc.tweet")
48 |
49 | # Only unidirectional attachment needed for this test
50 | b_c.attach(a_s)
51 |
52 | print "b_s proto =", b_s.protocol
53 | print "b_s mimiced proto =", b_s.mimiced_protocol
54 | comply = Terminal.protocol_compliance(b_s, c_c)
55 |         print "compliance=", comply
56 |
57 | self.assertTrue(b_s.mimiced_protocol == "esdoc.tweet")
58 |
59 | self.assertTrue(Terminal.protocol_compliance(a_s, b_c))
60 | self.assertTrue(Terminal.protocol_compliance(b_s, c_c))
61 |
62 | def test_protocol_mimic_no_connection(self):
63 | a_s = Socket ("sock_a", "esdoc.tweet")
64 | b_c = Connector("conn_b", "esdoc")
65 | b_s = Socket ("sock_b", "esdoc", mimic=b_c) # Should end up mimicing 'esdoc.tweet' from a_s if connected
66 | c_c = Connector("conn_c", "esdoc.tweet")
67 |
68 | print "b_s proto =", b_s.protocol
69 | print "b_s mimiced proto =", b_s.mimiced_protocol
70 | comply = Terminal.protocol_compliance(b_s, c_c)
71 |         print "compliance=", comply
72 |
73 | self.assertTrue(b_s.mimiced_protocol == "esdoc")
74 |
75 | self.assertTrue(Terminal.protocol_compliance(a_s, b_c))
76 | self.assertFalse(Terminal.protocol_compliance(b_s, c_c))
77 |
78 | def test_protocol_mimic_sequence(self):
79 | a_s = Socket ("sock_a", "esdoc.tweet")
80 |
81 | b_c = Connector("conn_b", "esdoc")
82 | b_s = Socket ("sock_b", "esdoc", mimic=b_c)
83 |
84 | c_c = Connector("conn_c", "esdoc.tweet")
85 | c_s = Socket ("sock_b", "esdoc", mimic=c_c)
86 |
87 | print "NOT ATTACHED:"
88 | print "b_s proto =", b_s.protocol
89 |         print "c_s proto =", c_s.protocol
90 |         print "b_s mimiced proto =", b_s.mimiced_protocol
91 | print "c_s mimiced proto =", c_s.mimiced_protocol
92 |
93 | self.assertTrue(c_s.mimiced_protocol == "esdoc")
94 |
95 | # Only unidirectional attachments needed for this test
96 | b_c.attach(a_s)
97 | c_c.attach(b_s)
98 |
99 | print "\nATTACHED:"
100 | print "b_s proto =", b_s.protocol
101 | print "c_s proto =", c_s.protocol
102 | print "b_s mimiced proto =", b_s.mimiced_protocol
103 | print "c_s mimiced proto =", c_s.mimiced_protocol
104 |
105 | self.assertTrue(c_s.mimiced_protocol == "esdoc.tweet")
106 |
107 | def test_protocol_mimic_circular(self):
108 | a_s = Socket ("sock_a", "esdoc.tweet")
109 |
110 | b_c = Connector("conn_b", "esdoc")
111 | b_s = Socket ("sock_b", "esdoc", mimic=b_c)
112 |
113 | c_c = Connector("conn_c", "esdoc.tweet")
114 | c_s = Socket ("sock_b", "esdoc", mimic=c_c)
115 |
116 | # Only unidirectional attachments needed for this test
117 | b_c.attach(c_s) # Making it circular
118 | c_c.attach(b_s)
119 |
120 | print "\nATTACHED:"
121 | print "b_s proto =", b_s.protocol
122 | print "c_s proto =", c_s.protocol
123 | print "b_s mimiced proto =", b_s.mimiced_protocol
124 | print "c_s mimiced proto =", c_s.mimiced_protocol
125 |
126 | self.assertTrue(b_s.mimiced_protocol == "esdoc")
127 |
128 | # And most important, it does not enter an infinite loop and finally gets here..
129 |
130 | def main():
131 | unittest.main()
132 |
133 | if __name__ == "__main__":
134 | main()
135 |
--------------------------------------------------------------------------------
/eslib/procs/PatternRemover.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Processor import Processor
4 | from .. import esdoc
5 | import re
6 |
7 | class PatternRemover(Processor):
8 | """
9 | Remove text using a regex pattern.
10 |
11 | Connectors:
12 | input (esdoc) (default) : Incoming document in 'esdoc' dict format.
13 | str (str) : Incoming document of type 'str' or 'unicode'.
14 | Sockets:
15 | output (esdoc) (default) : Output of documents that arrived on 'input' connector.
16 | str (str) : Output of documents that arrived on 'str' connector.
17 |
18 | Config:
19 |         source_field    = "text"    : Field containing the text to clean.
20 | target_field = None : Defaults to 'source_field', replacing the input field.
21 | field_map = {} : A dict of fields to use as { source : target }.
22 | If specified, this *replaces* the source_field and target_field pair!
23 | pattern = None : Pattern to apply. (All 'patterns' are also applied, if specified.)
24 | patterns = [] : List of patterns to apply. ('pattern' will be applied first, if it exists.)
25 | regex_options = DOTALL|IGNORECASE|MULTILINE|UNICODE
26 | : Options for *all* regex patterns.
27 | strip = True : Remove boundary spaces and double spaces, commonly left after a removal.
28 | """
29 |
30 | def __init__(self, **kwargs):
31 | super(PatternRemover, self).__init__(**kwargs)
32 |
33 | m = self.create_connector(self._incoming_esdoc, "input", "esdoc", "Incoming 'esdoc'.", is_default=True)
34 | self.create_connector(self._incoming_str , "str" , "str" , "Incoming document of type 'str' or 'unicode'.")
35 | self.output_esdoc = self.create_socket("output" , "esdoc" , "Outgoing, cleaned, 'esdoc'.", is_default=True, mimic=m)
36 | self.output_str = self.create_socket("str" , "str" , "Outgoing, cleaned, 'str'.")
37 |
38 | self.config.set_default(
39 | source_field = "text",
40 | target_field = None,
41 | field_map = {},
42 | pattern = None,
43 | patterns = [],
44 | regex_options = re.DOTALL|re.IGNORECASE|re.MULTILINE|re.UNICODE,
45 | strip = True
46 | )
47 |
48 | self._regexes = []
49 | self._field_map = {}
50 |
51 | def on_open(self):
52 | """
53 | :raises ValueError, if failed to parse a pattern as regex
54 | """
55 |
56 | # Create list of regexes
57 | patterns = []
58 | if self.config.pattern:
59 | patterns = [self.config.pattern]
60 | if self.config.patterns:
61 | patterns.extend(self.config.patterns)
62 | self._regexes = []
63 | for pattern in patterns:
64 | try:
65 | regex = re.compile(r"(%s)" % pattern, self.config.regex_options)
66 | self._regexes.append(regex)
67 | except Exception as e:
68 | raise ValueError("Error parsing pattern: %s\nPattern was: %s" % (e.message, pattern))
69 |
70 | # Create field map
71 | self._field_map = self.config.field_map or {}
72 | if not self._field_map:
73 | if not self.config.source_field:
74 | raise ValueError("Neither field_map nor source_field is configured.")
75 | self._field_map[self.config.source_field] = (self.config.target_field or self.config.source_field)
76 |
77 |
78 | def _clean_text(self, text):
79 | for regex in self._regexes:
80 | text = regex.sub("", text)
81 | if self.config.strip:
82 | text = text.strip().replace(" ", " ")
83 | return text
84 |
85 | def _clean(self, doc):
86 |
87 | if not doc or not self._regexes:
88 | return doc
89 |
90 | # This makes this method work also for 'str' and 'unicode' type documents; not only for the expected 'esdoc' protocol (a 'dict').
91 | if type(doc) in [str, unicode]:
92 | cleaned = self._clean_text(doc)
93 | return cleaned
94 | elif not type(doc) is dict:
95 | self.doclog.debug("Unsupported document type '%s'." % type(doc))
96 | return doc
97 |
98 | source = doc.get("_source")
99 | if not source:
100 | return doc # Missing source section; don't do anything
101 |
102 | for source_field, target_field in self._field_map.iteritems():
103 | text = esdoc.getfield(source, source_field)
104 | if text and type(text) in [str, unicode]:
105 | cleaned = self._clean_text(text)
106 | if cleaned != text:
107 | # Note: This may lead to a few strictly unnecessary shallow clonings...
108 | doc = esdoc.shallowputfield(doc, "_source." + target_field, cleaned)
109 | return doc
110 |
111 | def _incoming_esdoc(self, doc):
112 | if self.output_esdoc.has_output:
113 | self.output_esdoc.send(self._clean(doc))
114 |
115 | def _incoming_str(self, doc):
116 | if self.output_str.has_output:
117 | self.output_str.send(self._clean(doc))
118 |
--------------------------------------------------------------------------------
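A minimal usage sketch for the PatternRemover above, following the constructor-kwargs pattern used in the unit tests; the pattern and document content are hypothetical:

    from eslib.procs.PatternRemover import PatternRemover

    p = PatternRemover(pattern=r"https?://\S+", source_field="text")
    p.on_open()
    doc = {"_source": {"text": "read this http://example.com now"}}
    print p._clean(doc)["_source"]["text"]   # expected: "read this now"
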
/test/test_procs/test_blacklist_filter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import unittest
4 | from eslib.procs import BlacklistFilter
5 |
6 | class TestBlacklistFilter_str(unittest.TestCase):
7 |
8 | def test_str_nohit(self):
9 | s = "I am marvellous"
10 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}])
11 | p.on_open()
12 | check = p._check(s)
13 |
14 | print "str_nohit (exp:True)=", check
15 | self.assertTrue(check)
16 |
17 | def test_str_hit_but_not_blacklisted(self):
18 | s = "I like girls."
19 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}])
20 | print "filters=", p._filters
21 | p.on_open()
22 | check = p._check(s)
23 |
24 | print "str_hit_but_not_blacklisted (exp:True)=", check
25 | self.assertTrue(check)
26 |
27 | def test_str_hit_and_blacklisted(self):
28 | s = "I like young girls."
29 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}])
30 | print "filters=", p._filters
31 | p.on_open()
32 | check = p._check(s)
33 |
34 | print "str_hit_and_blacklisted (exp:False)=", check # Should have hit "young" from blacklist
35 | self.assertFalse(check)
36 |
37 | def test_str_global_whitelist_override(self):
38 | s = "We only like girls. Young girls are always welcome!"
39 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}], whitelist=["young girls"])
40 | p.on_open()
41 | check = p._check(s)
42 |
43 | print "str_global_whitelist_override (exp:True)=", check
44 | # Should have hit "young" from blacklist, but "young girls" from whitelist should override it
45 | self.assertTrue(check)
46 |
47 |
48 | def test_brooklyn(self):
49 | s = "Brooklyn Nets trounce short-handed Oklahoma City Thunder 116-85 http://t.co/qJZPBEJRCT"
50 | p = BlacklistFilter(filters=[{"tokens": ["nets"], "blacklist": ["brooklyn"]}])
51 | p.on_open()
52 | check = p._check(s)
53 |
54 | print "check (expect False)=", check
55 | self.assertFalse(check)
56 |
57 |
58 | class TestBlacklistFilter_esdoc(unittest.TestCase):
59 |
60 | # check == True means the document was NOT filtered out, i.e. it PASSED the filter
61 |
62 | def test_str_nohit(self):
63 | s = "I am marvellous"
64 | doc = {"_source": {"field1": s}}
65 | p = BlacklistFilter(
66 | field="field1",
67 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}])
68 | p.on_open()
69 | check = p._check(doc)
70 |
71 |         print "str_nohit (exp:True)=", check
72 | self.assertTrue(check)
73 |
74 | def test_str_hit_but_not_blacklisted(self):
75 | s = "I like girls."
76 | doc = {"_source": {"field1": s}}
77 | p = BlacklistFilter(
78 | fields=["field1", "field2"],
79 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}])
80 | print "filters=", p._filters
81 | p.on_open()
82 | check = p._check(doc)
83 |
84 |         print "str_hit_but_not_blacklisted (exp:True)=", check
85 | self.assertTrue(check)
86 |
87 | def test_str_hit_and_blacklisted(self):
88 | s1 = "I like young girls."
89 | s2 = "I am a boy."
90 | doc = {"_source": {"field1": s1, "field2": s2}}
91 | p = BlacklistFilter(
92 | fields=["field1", "field2"],
93 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}])
94 | print "filters=", p._filters
95 | p.on_open()
96 | check = p._check(doc)
97 |
98 | print "str_hit_and_blacklisted (exp:False)=", check # Should have hit "young" from blacklist
99 | self.assertFalse(check)
100 |
101 | def test_str_global_whitelist_override(self):
102 | s1 = "We only like girls. Young girls are always welcome!"
103 | s2 = "I like young boys."
104 | doc = {"_source": {"field1": s1, "field2": s2}}
105 | p = BlacklistFilter(
106 | fields=["field1", "field2"],
107 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}],
108 | whitelist=["young girls"])
109 | p.on_open()
110 | check = p._check(doc)
111 |
112 | print "str_global_whitelist_override (exp:True)=", check
113 | # Should have hit "young" from blacklist, but "young girls" from whitelist should override it
114 | self.assertTrue(check)
115 |
116 | def test_str_global_whitelist_override_not_hitting(self):
117 | s1 = "We only like girls. Young girls are always welcome!"
118 | s2 = "I like young boys."
119 | doc = {"_source": {"field1": s1, "field2": s2}}
120 | p = BlacklistFilter(
121 | fields=["field2"],
122 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}],
123 | whitelist=["young girls"])
124 | p.on_open()
125 | check = p._check(doc)
126 |
127 | print "str_global_whitelist_override_not_hitting (exp:False)=", check
128 | # Should have hit "young" from blacklist; "young girls" from whitelist does not apply to field2, so we should not override here
129 | self.assertFalse(check)
130 |
131 | def main():
132 | unittest.main()
133 |
134 | if __name__ == "__main__":
135 | main()
136 |
--------------------------------------------------------------------------------
/eslib/procs/Neo4jWriter.py:
--------------------------------------------------------------------------------
1 | __author__ = 'mats'
2 |
3 | from itertools import izip
4 | import time, logging
5 |
6 | from ..Generator import Generator
7 | from .neo4j import Neo4j
8 |
9 |
10 | class Neo4jWriter(Generator):
11 | """
12 |     This is a pipeline step whose primary function is to push an edge
13 |     from the author of a tweet to each of the people mentioned in the tweet.
14 |
15 | Connectors:
16 | edge (graph-edge) : Edge object to write.
17 | user (graph-user) : User object to write.
18 |
19 | Config:
20 | batchsize = 20 : How many IDs to gather up before making a call to Neo4j.
21 | batchtime = 5.0 : How many seconds to wait before we send a batch if it is not full.
22 | host = localhost: The host we should connect to
23 | port = 7474 : The default neo4j port
24 |
25 | """
26 |
27 | def __init__(self, **kwargs):
28 | super(Neo4jWriter, self).__init__(**kwargs)
29 | self.create_connector(self._incoming_edge, "edge", "graph-edge")
30 | self.create_connector(self._incoming_user, "user", "graph-user")
31 |
32 | self.config.set_default(
33 | batchsize = 20,
34 | batchtime = 5,
35 | host = "localhost",
36 | port = 7474
37 | )
38 |
39 | self._neo4j = None
40 |
41 | # This could be better
42 | self._edge_queue = []
43 | self._last_edge_commit = time.time()
44 | self._user_queue = []
45 | self._last_user_commit = time.time()
46 |
47 | def on_open(self):
48 | """
49 |         Instantiates a neo4j-instance and connects to the server.
50 |
51 | Raises:
52 | - ConnectionError if neo4j can't contact its server
53 |
54 |
55 | """
56 |
57 | # TODO: Need logging, request timeout and exception handling down there:
58 | self.log.debug("Connecting to Neo4j.")
59 | self._neo4j = Neo4j(host=self.config.host, port=self.config.port)
60 | self.log.status("Connected to Neo4j on %s:%s." % (self.config.host, self.config.port))
61 |
62 | def _incoming_edge(self, document):
63 | """
64 |         Takes an edge and puts its corresponding query in the queue.
65 |
66 | Args:
67 | document: A dict with "from", "to" and "type" as fields.
68 |
69 | The ambition is that this Processor should never go down no matter
70 | what happens to a document in this method.
71 |
72 | """
73 | try:
74 | from_id = document["from"]
75 | to_id = document["to"]
76 | edge_type = document["type"]
77 | except KeyError:
78 | self.doclog.exception("Unable to parse document: %s" % str(document))
79 | else:
80 | query = self._neo4j.get_edge_query(from_id, edge_type, to_id)
81 | self._edge_queue.append(query)
82 |
83 | def _incoming_user(self, document):
84 | if self.doclog.isEnabledFor(logging.TRACE):
85 | self.doclog.trace("Incoming user '%s' ('%s')." % (document["screen_name"], document["id"]))
86 | query, params = self._neo4j.get_node_merge_query(document)
87 | self._user_queue.append((query, params))
88 |
89 | def on_tick(self):
90 | """
91 |         Commit items in the queue if the queue exceeds batchsize or it has been
92 |         too long since the last commit.
93 |
94 | """
95 | now = time.time()
96 | if ((len(self._edge_queue) >= self.config.batchsize) or
97 | (now - self._last_edge_commit >= self.config.batchtime and
98 | self._edge_queue)):
99 | self._edge_send()
100 |
101 | if ((len(self._user_queue) >= self.config.batchsize) or
102 | ((now - self._last_user_commit >= self.config.batchtime) and
103 | self._user_queue)):
104 | self._user_send()
105 |
106 | def on_shutdown(self):
107 | """ Clear out the rest of the items in the queue """
108 | self.log.info("Processing remaining edge queue.")
109 | while self._edge_queue:
110 | self._edge_send()
111 | self.log.info("Processing remaining user queue.")
112 | while self._user_queue:
113 | self._user_send()
114 |
115 | def _edge_send(self):
116 | num_edges = len(self._edge_queue)
117 | if num_edges > self.config.batchsize:
118 | num_edges = self.config.batchsize
119 |
120 | rq = self._neo4j._build_rq(self._edge_queue[:num_edges])
121 | self._neo4j.commit(rq)
122 | self.log.debug("Committed %i edges." % num_edges)
123 | self._edge_queue = self._edge_queue[num_edges:]
124 | self._last_edge_commit = time.time()
125 |
126 | def _user_send(self):
127 | num_users = len(self._user_queue)
128 | if num_users > self.config.batchsize:
129 | num_users = self.config.batchsize
130 |
131 | users, params = [list(t)
132 | for t in
133 | izip(*self._user_queue[:num_users])]
134 |
135 | rq = self._neo4j._build_rq(users, params)
136 | self._neo4j.commit(rq)
137 |         self.log.debug("Committed %i users." % num_users)
138 |         self._user_queue = self._user_queue[num_users:]
139 |         self._last_user_commit = time.time()
140 |
--------------------------------------------------------------------------------
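For reference, the document shapes the two connectors of the Neo4jWriter above expect; the values are hypothetical, and the 'user' document may carry more fields, but at least these are read:

    # 'edge' connector (protocol 'graph-edge'):
    edge = {"from": "1", "type": "mention", "to": "2"}

    # 'user' connector (protocol 'graph-user'):
    user = {"id": "2", "screen_name": "someone"}
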
/eslib/procs/KafkaMonitor.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Monitor import Monitor
4 | from pykafka import KafkaClient
5 | import json, time
6 | import logging
7 | import zlib
8 |
9 |
10 | class KafkaMonitor(Monitor):
11 | """
12 | Monitor a Kafka topic.
13 |     Assumes data with type 'str', 'unicode', 'int', 'float' or 'json' from Kafka.
14 | Incoming documents are attempted deserialized into these types. Unknown types are passed as 'str'.
15 |
16 | Sockets:
17 | output (*) : Document received on monitored queue.
18 |
19 | Config:
20 |         hosts           = ["localhost:9092"]  : List of Kafka hosts.
21 | zookeeper_hosts = ["localhost:2181"] : For balanced consumption via zookeeper.
22 | topic = "default_topic" :
23 | consumer_group = "default_group" : Balanced consumer group.
24 | compression = False : Whether to decompress the data read from Kafka.
25 | """
26 |
27 | CONGESTION_SLEEP_TIME = 10.0
28 | WORK_TIME = 5.0
29 |
30 | def __init__(self, **kwargs):
31 | super(KafkaMonitor, self).__init__(**kwargs)
32 |
33 | self.output = self.create_socket("output", None, "Document received on monitored queue.")
34 |
35 | self.config.set_default(
36 | hosts = ["localhost:9092"],
37 | zookeeper_hosts = ["localhost:2181"],
38 | topic = "default_topic",
39 | consumer_group = "default_group",
40 | compression = False
41 | )
42 |
43 | self._client = None
44 | self._consumer = None
45 |
46 | #region Processor stuff
47 |
48 | def on_open(self):
49 | self.count = 0
50 | self._client = KafkaClient(",".join(self.config.hosts))
51 | topic = self._client.topics[self.config.topic]
52 | self._consumer = topic.get_balanced_consumer(
53 | auto_commit_enable = True,
54 | consumer_group = self.config.consumer_group,
55 | zookeeper_connect = ",".join(self.config.zookeeper_hosts)
56 | )
57 |
58 | self.log.info("Connected to Kafka topic '%s', balanced via zookeeper." % self.config.topic)
59 |
60 | def on_close(self):
61 | if self._client:
62 | self._consumer.stop()
63 | #del self._consumer
64 | self.log.info("Kafka consumer stopped.")
65 | # Can't find any way to close the connection or ask it to release resources, so I try a 'del'.
66 | #del self._client
67 | self._client = None
68 | self.log.debug("Connection to Kafka deleted.")
69 |
70 | #endregion Processor stuff
71 |
72 | #region Generator stuff
73 |
74 | def on_startup(self):
75 | self.count = 0
76 |
77 | def on_tick(self):
78 |
79 | congested = self.congestion()
80 | if congested:
81 | self.log.debug("Congestion in dependent processor '%s'; sleeping %d seconds." % (congested.name, self.CONGESTION_SLEEP_TIME))
82 | self.congestion_sleep(self.CONGESTION_SLEEP_TIME)
83 | else:
84 | # Read as much as we can for WORK_TIME seconds, then return to controlling
85 | # loop. This way this processor should hang a maximum of WORK_TIME seconds
86 | # before accepting control commands.
87 | start_time = time.time()
88 | while True:
89 | if self.end_tick_reason:
90 | return
91 |                 if time.time() - start_time > self.WORK_TIME:
92 |                     self.log.debug("Work time exceeded %s seconds. Returning to control loop." % self.WORK_TIME); return
93 | try:
94 | kafka_message = self._consumer.consume(block=False)
95 | except Exception as e:
96 | self.log.error("Error consuming Kafka. Aborting. [%s]" % e.__class__.__name__)
97 | self.abort()
98 | return
99 | if kafka_message is None:
100 | return
101 |
102 | self.count += 1
103 |
104 | if not self.output.has_output: # Don't bother with further message processing, in this case.
105 | return
106 |
107 | document = self._decode_message(kafka_message.value)
108 | if document is not None:
109 | self.output.send(document)
110 |
111 | def _decode_message(self, kafka_data):
112 |
113 | # print "INCOMING KAFKA DATA: [%s]" % kafka_data
114 |
115 | if not kafka_data:
116 | return None
117 |
118 | if self.config.compression:
119 | kafka_data = zlib.decompress(kafka_data)
120 |
121 | msg_type = None
122 | document = None
123 | try:
124 | jj = json.loads(kafka_data)
125 | # kafka_data = tojson({"type": msg_type, "data": data})
126 |         except (TypeError, ValueError) as e:
127 | self.doclog.warning("JSON deserialization failed: %s" % e.message)
128 | return None
129 | msg_type = jj.get("type")
130 | document = jj.get("data")
131 | if not msg_type or document is None:
132 | return None
133 |
134 | if self.log.isEnabledFor(logging.TRACE):
135 | self.log.trace("Received message of type '%s', Kafka payload size = %d." % (msg_type, len(kafka_data)))
136 | return document
137 |
138 | #endregion Generator stuff
139 |
--------------------------------------------------------------------------------
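For reference, _decode_message() in the KafkaMonitor above expects each Kafka message value to be a JSON envelope with 'type' and 'data' fields, optionally zlib-compressed. A sketch of a matching payload (the content is hypothetical):

    import json, zlib

    payload = json.dumps({"type": "json",
                          "data": {"_id": "1", "_source": {"text": "hello"}}})
    # With compression = True, the producing side would send:
    compressed = zlib.compress(payload)
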
/eslib/procs/TcpWriter.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Generator import Generator
4 | from ..esdoc import tojson
5 | import socket
6 | from select import select
7 |
8 |
9 | class TcpWriter(Generator):
10 | """
11 | Write incoming documents to a TCP port.
12 |     Documents of type 'str' and 'unicode' are written as-is; other types are written as JSON.
13 |
14 | NOTE: This processor operates as a Generator, but is considered to be passive; hence keepalive defaults to False.
15 |
16 | Connectors:
17 | input (*) : Incoming documents to write to a TCP socket.
18 |
19 | Config:
20 |         hostname        = ""    : Defaults to any address the machine happens to have. Use "localhost" to enforce local access only.
21 | port = 4000 :
22 | reuse_address = False : Whether to allow reusing an existing TCP address/port.
23 | """
24 | def __init__(self, **kwargs):
25 | super(TcpWriter, self).__init__(**kwargs)
26 | self.create_connector(self._incoming, "input", None, "Incoming documents to write to a TCP socket.")
27 |
28 |         self.keepalive = False # Passive by nature, hence this default
29 |
30 | self.config.set_default(
31 | hostname = "",
32 | port = 4000,
33 | reuse_address = False
34 | )
35 |
36 | self._connections = [] # List of (socket, address) pairs
37 | self._socket = None
38 |
39 | def on_open(self):
40 | self._socket = None
41 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
42 | if self.config.reuse_address:
43 | sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
44 |         # An empty hostname binds to all local interfaces (socket.gethostname() could be used instead).
45 |         address = (self.config.hostname, self.config.port)
46 | try:
47 | sock.bind(address)
48 | #sock.setblocking(0)
49 | sock.listen(0) # No backlog limit
50 | self.log.info("Listening for connections on %s:%d." % address)
51 | except socket.error as e:
52 | self.log.critical("Listener failed to bind to %s:%d. (errno=%d, message=%s)" % (self.config.hostname, self.config.port, e.errno, e.args[1]))
53 | raise e
54 |
55 | self._connections = []
56 | self._socket = sock
57 |
58 | self.total = 0
59 |
60 | def on_close(self):
61 | if self._connections:
62 | for c in self._connections:
63 | s, a = c
64 | s.close()
65 | self._connections = []
66 | if self._socket:
67 | self._socket.close()
68 | self._socket = None
69 | self.log.info("Listener closed.")
70 |
71 | @staticmethod
72 | def _get_conn(connections, sock):
73 | for c in connections:
74 | if c[0] == sock:
75 | return c
76 | return None
77 |
78 | def on_tick(self):
79 | if not self.running or self.stopping:
80 | return
81 |
82 | r, w, e = select([self._socket], [], [self._socket], 0) # Non-blocking
83 | if e:
84 | self.log.warning("Error on server socket -- now what?")
85 | if r:
86 | # We have one or more new connections pending. Get one and return to run loop.
87 | c = self._socket.accept()
88 | s, a = c
89 | self.log.info("New connection from %s:%d." % a)
90 | self._connections.append(c)
91 |
92 | # Check for dead connections
93 | connections = self._connections[:]
94 | sockets = [s for s,a in connections]
95 | r, w, e = select(sockets, [], sockets, 0)
96 | if e:
97 | self.log.warning("Error on connected socket -- now what?")
98 | for s in r:
99 | # This socket is intended for write only, but since there is now data,
100 | # we read a bit just to work down the input buffer. If it is empty, getting
101 | # here means the connection has been closed on the other end, and we can remove it.
102 | data = s.recv(1024)
103 | if not data:
104 | s.close()
105 | c = self._get_conn(connections, s)
106 | if c and c in self._connections:
107 | self.log.info("Connection closed by client %s:%d." % c[1])
108 | self._connections.remove(c)
109 | else:
110 | self.log.info("Unknown connection closed by client.")
111 |
112 | def _send(self, data):
113 | connections = self._connections[:]
114 | for c in connections:
115 | s, a = c
116 | try:
117 | s.sendall((data + "\n").encode("utf8"))
118 | #s.flush()
119 | except socket.error as e:
120 | if e.errno == socket.errno.EPIPE: # Broken pipe
121 | self.log.info("Connection closed by client %s:%d. (Broken pipe)" % a)
122 | else:
123 | self.log.error("Unhandled error writing to socket from %s:%d. Disconnecting. (errno=%d, message=%s)" %
124 | (a[0], a[1], e.errno, e.args[1]))
125 | self._connections.remove(c)
126 |
127 | def _incoming(self, document):
128 | if document:
129 | data = document
130 | if not type(document) in [str, unicode]:
131 |                 data = tojson(document)
132 | self._send(data)
133 |
134 | self.count += 1
135 | self.total += 1
136 |
--------------------------------------------------------------------------------
/test/test_procs/test_entity_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | __author__ = 'Hans Terje Bakke'
4 |
5 | import unittest
6 | from eslib.procs.EntityExtractor import EntityExtractor
7 | from eslib import esdoc
8 | from eslib import unique
9 |
10 |
11 | class TestEntityExtractor(unittest.TestCase):
12 | entities = \
13 | [
14 | {
15 | "category": "webpage",
16 | "name": "nrk",
17 | "match": [
18 | { "type": "exact", "pattern": "nrk.no" },
19 | #{ "type": "iprange", "value": "160.68.205.231/16" }
20 | ]
21 | },
22 | {
23 | "category": "targets",
24 | "name": "comperio",
25 | "match": [
26 | { "type": "exact", "pattern": u"hans terje bøkke", "weight": 0.8 },
27 | { "type": "exact", "pattern": "10.0.0.100", "weight": 0.5 },
28 | { "type": "exact", "pattern": "comperio" }
29 | ]
30 | },
31 | {
32 | "category": "targets",
33 | "name": "IBM",
34 | "match": [
35 | { "type": "exact", "pattern": "ibm" }
36 | ]
37 | },
38 | {
39 | "category": "creditcards",
40 | "name": "creditcard", # The name should become the credit card number
41 | "match": [ { "type": "creditcard" } ]
42 | },
43 | {
44 | "category": "emails",
45 | "name": "email", # The email should become the email address
46 | "match": [ { "type": "email" } ]
47 | },
48 | ]
49 |
50 | def test_defaults(self):
51 | ex = EntityExtractor()
52 | ex.on_open()
53 |
54 | self.assertEqual(ex.config.fields, [])
55 | self.assertEqual(ex.config.target, "entities")
56 | self.assertEqual(ex.config.entities, [])
57 |
58 | def test_extract_str(self):
59 | ex = EntityExtractor()
60 | ex.config.entities = self.entities
61 | ex.on_open()
62 |
63 | s = u"As mentioned ø on nrk.no, Hans Terje Bøkke works for Comperio. His PC has IP address 10.0.0.100. " + \
64 | "He never uses his credit card: 1234.5678.9876.5432. You can contact him on " + \
65 | "hans.terje.bakke@gmail.com. But balle.klorin@wesenlund.no will not work for IBM."
66 |
67 | extracted = ex._extract(None, s)
68 | elist = list(extracted)
69 |
70 | for e in elist:
71 | print e
72 |
73 | self.assertEqual(len(elist), 8)
74 |
75 |
76 | def _verify(self, entities):
77 | webpages = unique([x["name"] for x in entities["webpage"]])
78 | targets = unique([x["name"] for x in entities["targets"]])
79 | emails = unique([x["name"] for x in entities["emails"]])
80 | creditcards = unique([x["name"] for x in entities["creditcards"]])
81 |
82 | print "WEBPAGE:", webpages
83 | print "TARGETS:", targets
84 | print "EMAILS :", emails
85 | print "CREDITC:", creditcards
86 |
87 | self.assertEqual(['nrk'], webpages)
88 | self.assertEqual(['comperio', 'IBM'], targets)
89 | self.assertEqual(['hans.terje.bakke@gmail.com', 'balle.klorin@wesenlund.no'], emails)
90 | self.assertEqual(['1234.5678.9876.5432'], creditcards)
91 |
92 | def test_merge(self):
93 | ex = EntityExtractor()
94 | ex.config.entities = self.entities
95 | ex.on_open()
96 |
97 | s = "As mentioned on nrk.no, Hans Terje Bakke works for Comperio. His PC has IP address 10.0.0.100. " + \
98 | "He never uses his credit card: 1234.5678.9876.5432. You can contact him on " + \
99 | "hans.terje.bakke@gmail.com. But balle.klorin@wesenlund.no will not work for IBM."
100 |
101 | extracted = ex._extract(None, s)
102 | entities = ex._merge(extracted)
103 |
104 | self._verify(entities)
105 |
106 | def test_doc_through(self):
107 |
108 | ex = EntityExtractor()
109 | ex.config.entities = self.entities
110 |
111 | doc = {"_id": "123", "_source": {
112 | "field1": "As mentioned on nrk.no, Hans Terje Bakke works for Comperio.",
113 | "field2": "He never uses his credit card: 1234.5678.9876.5432.",
114 | "field3": "You can contact him on hans.terje.bakke@gmail.com.",
115 | "subsection" : {
116 | "subfield": "But balle.klorin@wesenlund.no will not work for IBM."
117 | },
118 | "entities": { "old" : "stuff" }
119 | }}
120 |
121 | ex.config.fields = ["field1", "field2", "field3", "subsection.subfield"]
122 |
123 | output = []
124 | ex.add_callback(lambda proc, doc: output.append(doc))
125 | ex.start()
126 | ex.put(doc)
127 | ex.stop()
128 | ex.wait()
129 |
130 | #print output[0]
131 |
132 | new_doc = output[0]
133 | entities = new_doc["_source"]["entities"]
134 |
135 | self._verify(entities)
136 |
137 | # Check that old and new doc are not the same
138 | self.assertFalse(doc is new_doc)
139 |
140 | # Check that the previous entities still exist in the new document
141 | old = esdoc.getfield(new_doc, "_source.entities.old")
142 | self.assertEqual(old, "stuff")
143 |
144 | # Check that the new entities do not exist in the original document
145 | self.assertTrue(esdoc.getfield(doc, "_source.entities.webpage") is None)
146 | self.assertTrue(esdoc.getfield(new_doc, "_source.entities.webpage") is not None)
147 |
--------------------------------------------------------------------------------
/test/test_connections.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from eslib import Processor
3 |
4 | class Connections(object):
5 |
6 | def create_processors(self):
7 | self.a = Processor(name="processor_a")
8 | self.b = Processor(name="processor_b")
9 | self.c = Processor(name="processor_c")
10 | self.d = Processor(name="processor_d")
11 |
12 | def create_terminals(self):
13 | self.a.create_connector(None, "input") # Protocol anything
14 | self.a.create_socket("output", "proto_doc")
15 | self.b.create_connector(None, "input", "proto_doc")
16 | self.b.create_socket("output_doc", "proto_doc")
17 | self.b.create_socket("output_str", "proto_str")
18 | self.c.create_connector(None, "input_doc", "proto_doc")
19 | self.c.create_connector(None, "input_str", "proto_str")
20 | self.c.create_socket("output_doc", "proto_doc")
21 | self.c.create_socket("output_ext", "proto_doc.extended")
22 | self.c.create_socket("output_anything")
23 | self.d.create_connector(None, "input_anything")
24 | self.d.create_connector(None, "input_doc", "proto_doc")
25 | self.d.create_connector(None, "input_ext", "proto_doc.extended")
26 |
27 | def connect_terminals(self):
28 | self.b.subscribe(self.a) # Ok call, only one socket and connector
29 | self.c.subscribe(self.b, "output_doc", "input_doc") # Ok
30 | self.c.subscribe(self.a, connector_name="input_doc") # Ok, a's only socket name can be omitted
31 | self.d.subscribe(self.c, "output_doc", "input_anything") # Ok, any input accepted
32 | self.d.subscribe(self.c, "output_ext", "input_ext") # Ok, protocol exact match
33 |
34 |
35 | class TestConnections(unittest.TestCase, Connections):
36 |
37 | def test_create_processors(self):
38 | self.create_processors()
39 |
40 | self.assertIsNotNone(self.a, "Processor a None")
41 | self.assertIsNotNone(self.b, "Processor b None")
42 | self.assertIsNotNone(self.c, "Processor c None")
43 | self.assertIsNotNone(self.d, "Processor d None")
44 |
45 | def test_create_terminals(self):
46 | self.create_processors()
47 | self.create_terminals()
48 |
49 | self.assertTrue(len(self.a.connectors) == 1, "Expected 1 connector for a")
50 | self.assertTrue(len(self.b.connectors) == 1, "Expected 1 connector for b")
51 | self.assertTrue(len(self.c.connectors) == 2, "Expected 2 connectors for c")
52 | self.assertTrue(len(self.d.connectors) == 3, "Expected 3 connectors for d")
53 |
54 | self.assertTrue(len(self.a.sockets) == 1, "Expected 1 socket for a")
55 | self.assertTrue(len(self.b.sockets) == 2, "Expected 2 sockets for b")
56 | self.assertTrue(len(self.c.sockets) == 3, "Expected 3 sockets for c")
57 | self.assertTrue(len(self.d.sockets) == 0, "Expected 0 sockets for d")
58 |
59 |
60 | def test_connect(self):
61 | self.create_processors()
62 | self.create_terminals()
63 | self.connect_terminals()
64 |
65 |         # Cannot decide socket, should fail:
66 |         self.assertRaises(Exception, self.c.subscribe, self.b)
67 |         # Ok for socket, but still cannot decide which one of C's connectors:
68 |         self.assertRaises(Exception, self.c.subscribe, self.b, "output_doc")
69 |         # Protocol error:
70 |         self.assertRaises(Exception, self.c.subscribe, self.b, "output_doc", "input_str")
71 |         # Should fail on protocol error:
72 |         self.assertRaises(Exception, self.d.subscribe, self.c, "output_anything", "input_doc")
73 |         # Protocol error:
74 |         self.assertRaises(Exception, self.d.subscribe, self.c, "output_ext", "input_doc")
75 |         # Protocol error, connector more specific than socket:
76 |         self.assertRaises(Exception, self.d.subscribe, self.c, "output_doc", "input_ext")
77 |
78 | # Do a quick check to see if expected number of connections are now ok
79 | self.assertTrue(len(self.a.sockets["output"].connections) == 2) # b and c
80 |         self.assertTrue(len(self.b.connectors["input"].connections) == 1)     # a
81 | self.assertTrue(len(self.b.sockets["output_doc"].connections) == 1) # c
82 | self.assertTrue(len(self.c.connectors["input_doc"].connections) == 2) # a and b
83 | self.assertTrue(len(self.c.sockets["output_doc"].connections) == 1) # d
84 | self.assertTrue(len(self.c.sockets["output_ext"].connections) == 1) # d
85 | self.assertTrue(len(self.d.connectors["input_anything"].connections) == 1) # c
86 | self.assertTrue(len(self.d.connectors["input_ext"].connections) == 1) # c
87 |
88 |
89 | def test_connect2(self):
90 | self.create_processors()
91 | self.create_terminals()
92 | self.connect_terminals()
93 |
94 | self.b.unsubscribe() # unsubscribes all input connectors
95 | self.assertTrue(len(self.a.sockets["output"].connections) == 1) # only c left
96 | self.assertTrue(len(self.b.connectors["input"].connections) == 0)
97 |
98 | self.c.unsubscribe(self.a)
99 | self.c.unsubscribe(self.a, connector_name="input_doc")
100 | self.assertTrue(len(self.a.sockets["output"].connections) == 0)
101 | self.assertTrue(len(self.b.sockets["output_doc"].connections) == 1) # c remains
102 | self.assertTrue(len(self.c.connectors["input_doc"].connections) == 1) # only b left
103 |
104 | self.c.unsubscribe(connector_name="input_doc")
105 | self.assertTrue(len(self.b.sockets["output_doc"].connections) == 0) # c now also gone
106 |
107 | self.c.detach(self.d) # Should detach all connections to d
108 | self.assertTrue(len(self.c.sockets["output_doc"].connections) == 0)
109 | self.assertTrue(len(self.c.sockets["output_ext"].connections) == 0)
110 | self.assertTrue(len(self.d.connectors["input_anything"].connections) == 0)
111 | self.assertTrue(len(self.d.connectors["input_ext"].connections) == 0)
112 |
113 |
114 | def main():
115 | unittest.main()
116 |
117 | if __name__ == "__main__":
118 | main()
119 |
--------------------------------------------------------------------------------
/test/test_service/test_http_service.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | ENDPOINT = "localhost:4000"
4 |
5 | import unittest
6 | from eslib.service import Service, HttpService, status
7 | from eslib.procs import Timer, Transformer
8 | import requests, time, threading
9 |
10 | import eslib.prog
11 | eslib.prog.initlogs()
12 |
13 | class TestService(Service):
14 | def __init__(self, **kwargs):
15 | super(TestService, self).__init__(**kwargs)
16 |
17 | self.ending = False
18 | self.requires_metadata = False
19 |
20 | def on_setup(self):
21 | self._timer = Timer(service=self, actions=[(3, 3, "ping")])
22 | self._pc = Transformer(service=self, func=self._func)
23 | self._pc.subscribe(self._timer)
24 |
25 | self.register_procs(self._timer, self._pc)
26 |
27 | return True
28 |
29 | def _func(self, proc, doc):
30 | print doc
31 | if self.ending:
32 | print "FUNC STOP"
33 | self._timer.stop()
34 |
35 | def is_processing(self):
36 | return self._pc.running
37 |
38 | def is_aborted(self):
39 | return self._pc.aborted
40 |
41 | def is_suspended(self):
42 | return self._pc.suspended
43 |
44 |     # on_start_processing (should be run async)
45 | def on_processing_start(self):
46 | self._timer.start()
47 | time.sleep(1) # Simulate that it takes some time
48 | return True
49 |
50 | def on_processing_stop(self):
51 | time.sleep(1) # Simulate that it takes some time
52 | self._timer.stop()
53 | self._pc.wait()
54 | return True
55 |
56 | # on_abort_processing
57 | def on_processing_abort(self):
58 | self._timer.abort()
59 | self._pc.stop()
60 | return True
61 |
62 |
63 | # TODO: on_update_metadata
64 |
65 |
66 | class HttpTestService(HttpService, TestService):
67 |
68 | def __init__(self, **kwargs):
69 | super(HttpTestService, self).__init__(**kwargs)
70 |
71 | # Add management routes to functions
72 | self.add_route(self._test1, "GET", "test1/{id}/{?mode}", ["mode"])
73 |
74 | def _test1(self, request_handler, payload, **kwargs):
75 | parameters = kwargs
76 | print "TEST1:", parameters
77 | return {"echo": parameters}
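
    # An illustrative HTTP call against the route registered above, assuming the
    # service's management endpoint listens on the default localhost:4444 noted in
    # the tests below (URL shape inferred from the "test1/{id}/{?mode}" template):
    #   requests.get("http://localhost:4444/test1/42?mode=debug")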
78 |
79 | class TestTestService(unittest.TestCase):
80 |
81 | def test_run_shutdown(self):
82 | p = TestService()#mgmt_endpoint=ENDPOINT) # localhost:4444 by default
83 | p.ending = False
84 |
85 | print "Starting service"
86 | print "Asserting '%s' (not started)" % status.DOWN
87 | self.assertEqual(p.status, status.DOWN)
88 |
89 | p.run()
90 | # This does not require config, thus going straight from 'down' to 'idle'
91 | print "Asserting '%s'" % status.IDLE
92 | self.assertEqual(p.status, status.IDLE)
93 |
94 | print "Shutting down"
95 | p.shutdown(wait=True)
96 | print "Asserting '%s' (shut down)" % status.DOWN
97 | self.assertEqual(p.status, status.DOWN)
98 |
99 | def test_lifecycle(self):
100 | p = TestService()#mgmt_endpoint=ENDPOINT) # localhost:4444 by default
101 | p.ending = False
102 |
103 | print "Starting service"
104 | print "Asserting '%s' (not started)" % status.DOWN
105 | self.assertEqual(status.DOWN, p.status)
106 |
107 | p.run()
108 | # This does not require config, thus going straight from 'down' to 'idle'
109 | print "Asserting '%s'" % status.IDLE
110 | self.assertEqual(status.IDLE, p.status)
111 |
112 | print "Starting processing"
113 | p.processing_start()
114 | print "Asserting '%s'" % status.PROCESSING
115 | self.assertEqual(status.PROCESSING, p.status)
116 |
117 | time.sleep(1)
118 | print "Stopping processing"
119 | p.processing_stop()
120 | time.sleep(0.1)
121 | print "Asserting '%s'" % status.STOPPING
122 | self.assertEqual(status.STOPPING, p.status)
123 |
124 | print "Waiting for processing to stop"
125 | p.processing_wait()
126 | print "Asserting '%s' (stopped)" % status.IDLE
127 | self.assertEqual(status.IDLE, p.status)
128 |
129 | print "Starting processing"
130 | p.processing_start()
131 | print "Asserting '%s'" % status.PROCESSING
132 | self.assertEqual(status.PROCESSING, p.status)
133 |
134 | time.sleep(1)
135 | print "Aborting processing"
136 | p.processing_abort()
137 | print "Asserting '%s'" % status.ABORTED
138 | self.assertEqual(status.ABORTED, p.status)
139 |
140 | print "Starting processing"
141 | p.processing_start()
142 | print "Asserting '%s'" % status.PROCESSING
143 | self.assertEqual(status.PROCESSING, p.status)
144 |
145 | print "Shutting down"
146 | p.shutdown()
147 | #threading.Thread(target=lambda : p.shutdown()).start()
148 | time.sleep(0.1)
149 | print "Asserting '%s'" % status.CLOSING
150 | self.assertEqual(status.CLOSING, p.status)
151 |
152 | print "Waiting for shutdown"
153 | p.wait()
154 | print "Asserting '%s' (shut down)" % status.DOWN
155 | self.assertEqual(status.DOWN, p.status)
156 |
157 | def test_lifecycle_ending_service(self):
158 | p = TestService()#mgmt_endpoint=ENDPOINT) # localhost:4444 by default
159 | p.ending = True
160 |
161 | print "Starting service"
162 | print "Asserting '%s' (not started)" % status.DOWN
163 | self.assertEqual(status.DOWN, p.status)
164 |
165 | p.run()
166 | # This does not require config, thus going straight from 'down' to 'idle'
167 | print "Asserting '%s'" % status.IDLE
168 | self.assertEqual(status.IDLE, p.status)
169 |
170 | print "Starting processing (take 1)"
171 | p.processing_start()
172 | print "Asserting '%s'" % status.PROCESSING
173 | self.assertEqual(status.PROCESSING, p.status)
174 |
175 | print "Waiting for processing to finish (take 1)"
176 | p.processing_wait()
177 | print "Asserting '%s' (stopped)" % status.IDLE
178 | self.assertEqual(status.IDLE, p.status)
179 |
180 | print "Starting processing (take 2)"
181 | p.processing_start()
182 | print "Asserting '%s'" % status.PROCESSING
183 | self.assertEqual(status.PROCESSING, p.status)
184 |
185 | print "Waiting for processing to finish (take 2)"
186 | p.processing_wait()
187 | print "Asserting '%s' (stopped)" % status.IDLE
188 | self.assertEqual(status.IDLE, p.status)
189 |
190 | print "Shutting down (waiting)"
191 | p.shutdown(wait=True)
192 | print "Asserting '%s' (shut down)" % status.DOWN
193 | self.assertEqual(status.DOWN, p.status)
194 |
195 | def main():
196 | unittest.main()
197 |
198 | if __name__ == "__main__":
199 | main()
200 |
--------------------------------------------------------------------------------
/eslib/web.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | eslib.web
5 | ~~~~~~~~~~
6 |
7 | Module containing operations against web servers and on web content.
8 | """
9 |
10 |
11 | __all__ = ("WebGetter", "detect_language", "remove_boilerplate")
12 |
13 |
14 | import requests
15 | import eslib
16 | from collections import Counter
17 | from textblob import TextBlob
18 | import justext
19 | from datetime import datetime, timedelta
20 | from email.utils import parsedate_tz, mktime_tz
21 |
22 | class WebGetter(object):
23 | def __init__(self, max_size=-1, content_types=None):
24 | self.content_types = content_types or ["text/plain", "text/html", "text/xml", "application/xml"]
25 | self.max_size = 1024*1024 # 1 MB
26 | if max_size > 0: self.max_size = max_size
27 |
28 | def get(self, url):
29 | # Fetch web page
30 | try:
31 | res = requests.get(url, verify=False)
32 |             res.raise_for_status()
33 |         except Exception:
34 | msg = "URL failed: %s" % url
35 | raise IOError(msg)
36 | if not res.ok:
37 | msg = "URL not ok, status_code=%s for URL: %s" % (res.status_code, url)
38 | raise IOError(msg)
39 |
40 | # Verify allowed content type
41 | content_type = (res.headers.get("content-type") or "").split(";")[0]
42 | if not content_type in self.content_types:
43 | msg = "Skipping web page with content type '%s', URL: %s" % (content_type, url)
44 | raise ValueError(msg)
45 |
46 | # Size check with reported content size
47 | if self.max_size > 0:
48 | size = int(res.headers.get("content-length") or -1)
49 | if size > 0 and size > self.max_size:
50 | msg = "Skipping too large web page (%s), URL: %s" % (eslib.debug.byteSizeString(size, 2), url)
51 | raise ValueError(msg)
52 |
53 | # Find timestamp
54 | date_str = res.headers.get("date")
55 | if not date_str:
56 | timestamp = datetime.utcnow()
57 | else:
58 | t = mktime_tz(parsedate_tz(date_str))
59 | timestamp = datetime(1970, 1, 1) + timedelta(seconds=t)
60 |
61 | # Extract vitals from web result
62 | id = url # res.url
63 | encoding = res.encoding
64 | content = res.text
65 |
66 | # Repeat size check with actual content size
67 | if self.max_size > 0:
68 | size = len(content)
69 | if size > self.max_size:
70 | msg = "Skipping too large web page (%s), URL: %s" % (eslib.debug.byteSizeString(size, 2), url)
71 | raise ValueError(msg)
72 |
73 | body = {"content": content, "content_type": content_type, "encoding": encoding, "date": timestamp}
74 | return body
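
    # Illustrative usage (the URL and size limit below are arbitrary examples):
    #   getter = WebGetter(max_size=512 * 1024)
    #   page = getter.get("http://example.com/")
    #   # 'page' is a dict: {"content": ..., "content_type": ..., "encoding": ..., "date": ...}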
75 |
76 | #region Language detection
77 |
78 | def detect_language(text, chunk_size=250, max_chunks=5):
79 | """
80 | Detects language of the passed text. Returns majority detection on multiple chunks in order to avoid
81 | misclassification on text with boilerplate text of another language in the beginning of the string.
82 |
83 | Uses Google Translate REST API through the TextBlob library.
84 |
85 | :param text: str
86 | :param chunk_size: int Number of characters in each detection chunk.
87 | :param max_chunks: int Maximum number of chunks to run detection on.
88 | :return: str Google Translate language code.
89 | """
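    # Illustrative call (requires network access, since TextBlob delegates language
    # detection to the Google Translate API; the sample text and result are only an
    # assumption for illustration):
    #   detect_language(u"Dette er en kort norsk tekst om nyheter.")  # -> u'no'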
90 | n_chunks = int(max(min(len(text) / chunk_size, max_chunks), 1))
91 | detections = []
92 |
93 | for c in xrange(n_chunks):
94 | l = c * chunk_size
95 |         u = min((c + 1) * chunk_size, len(text))
96 |
97 | chunk = text[l:u]
98 | detections.append(TextBlob(chunk).detect_language())
99 |
100 | counts = Counter(detections)
101 |
102 | return counts.most_common(n=1)[0][0]
103 |
104 | #endregion Language detection
105 |
106 | #region Boilerplate removal
107 |
108 | # Map of correspondences between Google Translate and internal JusText
109 | # language codes
110 | GTRANS_JUSTEXT_LANG_MAP = {
111 | u'af': u'Afrikaans',
112 | u'sq': u'Albanian',
113 | u'ar': u'Arabic',
114 | u'az': u'Azerbaijani',
115 | u'eu': u'Basque',
116 | u'be': u'Belarusian',
117 | u'bg': u'Bulgarian',
118 | u'ca': u'Catalan',
119 | u'hr': u'Croatian',
120 | u'cz': u'Czech',
121 | u'da': u'Danish',
122 | u'nl': u'Dutch',
123 | u'en': u'English',
124 | u'eo': u'Esperanto',
125 | u'et': u'Estonian',
126 | u'fi': u'Finnish',
127 | u'fr': u'French',
128 | u'gl': u'Galician',
129 | u'ka': u'Georgian',
130 | u'de': u'German',
131 | u'el': u'Greek',
132 | u'gu': u'Gujarati',
133 | u'ht': u'Haitian',
134 | u'iw': u'Hebrew',
135 | u'hi': u'Hindi',
136 | u'hu': u'Hungarian',
137 | u'is': u'Icelandic',
138 | u'id': u'Indonesian',
139 | u'ga': u'Irish',
140 | u'it': u'Italian',
141 | u'kn': u'Kannada',
142 | u'ko': u'Korean',
143 | u'la': u'Latin',
144 | u'lv': u'Latvian',
145 | u'lt': u'Lithuanian',
146 | u'mk': u'Macedonian',
147 | u'ms': u'Malay',
148 | u'mt': u'Maltese',
149 | u'no': u'Norwegian_Bokmal',
150 | u'fa': u'Persian',
151 | u'pl': u'Polish',
152 | u'pt': u'Portuguese',
153 | u'ro': u'Romanian',
154 | u'ru': u'Russian',
155 | u'sr': u'Serbian',
156 | u'sk': u'Slovak',
157 | u'sl': u'Slovenian',
158 | u'es': u'Spanish',
159 | u'sw': u'Swahili',
160 | u'sv': u'Swedish',
161 | u'tl': u'Tagalog',
162 | u'ta': u'Tamil',
163 | u'te': u'Telugu',
164 | u'tr': u'Turkish',
165 | u'uk': u'Ukrainian',
166 | u'ur': u'Urdu',
167 | u'vi': u'Vietnamese',
168 | u'cy': u'Welsh'}
169 |
170 | def remove_boilerplate(page_str, lang, relaxed=False):
171 | """
172 | Removes boilerplate from HTML documents.
173 |
174 | Uses JusText library.
175 |
176 | NOTE: quality dependent on correct language detection.
177 |
178 | :param page_str: str HTML page source.
179 | :param lang: str Google Translate language code.
180 | :param relaxed: boolean If True the span between the first and last good/near-good boilerplate match
181 | is returned. Short and bad segments in between are kept.
182 | :return: list List of non-boilerplate segments/paragraphs.
183 | """
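    # Illustrative call ('page_source' is a hypothetical HTML string; the language
    # code would typically come from detect_language() above):
    #   paragraphs = remove_boilerplate(page_source, u'en', relaxed=True)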
184 | if lang not in GTRANS_JUSTEXT_LANG_MAP:
185 | #raise AttributeError("Can not remove boilerplate for language code lang='%s'." % lang)
186 | return []
187 |
188 | jt_lang = GTRANS_JUSTEXT_LANG_MAP[lang]
189 |
190 | paragraphs = justext.justext(page_str, justext.get_stoplist(jt_lang))
191 |
192 | if relaxed:
193 |         good_indexes = [i for i, p in enumerate(paragraphs) if p.class_type in ['near-good', 'good']]
194 |
195 | if len(good_indexes) == 0:
196 | return []
197 |
198 | return [paragraph.text for paragraph in paragraphs[min(good_indexes):max(good_indexes) + 1]]
199 | else:
200 | return [paragraph.text for paragraph in paragraphs if paragraph.class_type in ['near-good', 'good', 'short']]
201 |
202 | #endregion Boilerplate removal
203 |
--------------------------------------------------------------------------------
/eslib/procs/RabbitmqMonitor.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Monitor import Monitor
4 | from .RabbitmqBase import RabbitmqBase
5 | import pika
6 | import json, time
7 |
8 | class RabbitmqMonitor(Monitor, RabbitmqBase):
9 | """
10 | Monitor a queue in RabbitMQ.
11 | Assumes data with type 'str', 'unicode', 'int', 'float' or 'json' from RabbitMQ.
12 |     Incoming documents are deserialized into these types where possible. Unknown types are passed as 'str'.
13 |
14 | Sockets:
15 | output (*) : Document received on monitored queue.
16 |
17 | Config:
18 | host = localhost :
19 | port = 5672 :
20 | admin_port = 15672 :
21 | username = guest :
22 | password = guest :
23 | virtual_host = None :
24 | exchange = None :
25 | queue = "default" : Not used if 'exchange' is specified.
26 |         consuming = True : Consume from the queue, rather than listening on an
27 |                            exclusive queue that is deleted on disconnect.
28 | Non-consuming behaviour only works with an 'exchange'.
29 | max_reconnects = 3 :
30 | reconnect_timeout = 3 :
31 | """
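    # An illustrative wiring sketch (not part of the original docs; the queue name
    # and the FileWriter downstream processor are assumptions for the example):
    #   monitor = RabbitmqMonitor(host="localhost", queue="incoming")
    #   writer = FileWriter()
    #   writer.subscribe(monitor)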
32 |
33 |     CALC_TOTAL = True # Whether to check the RabbitMQ queue at intervals and calculate a total
34 | # from current count and remaining in queue. It thus becomes a moving
35 | # target for ETA calculations.
36 | CALC_TOTAL_INTERVAL = 10.0 # seconds
37 |
38 | _is_reader = True
39 |
40 | def __init__(self, **kwargs):
41 | super(RabbitmqMonitor, self).__init__(**kwargs)
42 |
43 | self.output = self.create_socket("output", None, "Document received on monitored queue.")
44 |
45 | self.config.set_default(
46 | max_reconnects = 3,
47 | reconnect_timeout = 3
48 | )
49 |
50 | self._reconnecting = 0
51 | self._last_calc_total = 0
52 |
53 | #region Processor stuff
54 |
55 | def on_open(self):
56 | self._open_connection()
57 | self.log.info("Connected to RabbitMQ.")
58 |
59 | def on_close(self):
60 | self._calc_total()
61 | if self._close_connection():
62 | self.log.info("Connection to RabbitMQ closed.")
63 |
64 | #endregion Processor stuff
65 |
66 | #region Generator stuff
67 |
68 | def _start_consuming(self):
69 | self._consumer_tag = self._channel.basic_consume(self._callback, queue=self._queue_name, no_ack=True)
70 |
71 | def _stop_consuming(self):
72 | if self._channel:
73 | self._channel.basic_cancel(self._consumer_tag)
74 |
75 | def on_startup(self):
76 | if self.CALC_TOTAL:
77 | self.total = 0 # We will collect this from message queue, otherwise it should be set to None
78 | self._last_calc_total = 0
79 | self.count = 0
80 | self._start_consuming()
81 |
82 | def on_shutdown(self):
83 | self._stop_consuming()
84 |
85 | def on_abort(self):
86 | self._stop_consuming()
87 |
88 | def on_suspend(self):
89 | self._stop_consuming()
90 |
91 | def on_resume(self):
92 | self._start_consuming()
93 |
94 | def on_tick(self):
95 | if self._reconnecting > 0:
96 | self._reconnecting -= 1
97 | # Try to reconnect
98 | ok = False
99 | try:
100 | self._close_connection()
101 | self._open_connection()
102 | self.log.info("Successfully reconnected to RabbitMQ.")
103 |                 self._reconnecting = 0 # No longer attempting reconnects
104 | self._start_consuming()
105 | except pika.exceptions.AMQPConnectionError as e:
106 | if self._reconnecting > 0:
107 | timeout = self.config.reconnect_timeout
108 | self.log.warning("Reconnect to RabbitMQ failed. Waiting %d seconds." % timeout)
109 | time.sleep(timeout)
110 | else:
111 | self.log.critical("Missing connection to RabbitMQ. Max retries exceeded. Aborting.")
112 | self.abort() # We give up and abort
113 | return
114 |
115 | try:
116 | self._calc_total()
117 | congested = self.congestion()
118 | if congested:
119 | self.log.debug("Congestion in dependent processor '%s'; sleeping 10 seconds." % congested.name)
120 | self.congestion_sleep(10.0)
121 | else:
122 | self._channel.connection.process_data_events()
123 | except Exception as e:
124 | if self._reconnecting >= 0:
125 | self.log.info("No open connection to RabbitMQ. Trying to reconnect.")
126 | self._reconnecting = self.config.max_reconnects # Number of reconnect attempts; will start reconnecting on next tick
127 |
128 | def _calc_total(self):
129 | """
130 | Calculate total number of messages.
131 | That is the sum of what is processed so far, and what remains in the queue.
132 | """
133 | if not self.CALC_TOTAL:
134 | return
135 |
136 | now = time.time()
137 | if now - self._last_calc_total > self.CALC_TOTAL_INTERVAL:
138 | try:
139 | self.total = self.get_queue_size() + self.count
140 | except Exception as e:
141 | self.log.warning("Failed to get queue size for queue '%s': %s" % (self._queue_name, e))
142 | self._last_calc_total = now
143 |
144 |     def _callback(self, channel, method, properties, body):
145 | #print "*** RabbitmqMonitor received:"
146 | #print "*** Properties:", properties
147 | #print "*** Body: ", body
148 |
149 | self.count += 1
150 |
151 | if not self.output.has_output: # Don't bother deserializing, etc, in this case
152 | return
153 |
154 | try:
155 | msg_type = properties.type
156 | document = None
157 | if msg_type == "json":
158 | try:
159 | document = json.loads(body)
160 | except TypeError as e:
161 | self.doclog.warning(e.message)
162 | return
163 | elif msg_type in ["str", "unicode"]:
164 | document = body
165 | elif msg_type == "int":
166 | document = int(str(body))
167 | elif msg_type == "float":
168 | document = float(str(body))
169 | elif body:
170 | self.doclog.debug("Received document of type='%s'; converting to str.", msg_type)
171 | document = str(body)
172 |
173 | if document != None:
174 | self.output.send(document)
175 | else:
176 | self.doclog.warning("Received empty document from RabbitMQ.")
177 | except Exception as e:
178 | self.log.error("An exception occurred inside the callback: %s" % e.message)
179 |
180 | #endregion Generator stuff
181 |
--------------------------------------------------------------------------------
/eslib/procs/FileReader.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 |
3 | from ..Generator import Generator
4 | from select import select
5 | import codecs
6 | import sys, os, os.path, errno
7 | import json
8 |
9 |
10 | # TODO: Windows does not support file descriptors in select()
11 | # Alternative method to _read_as_much_as_possible() needed for Windows.
12 |
13 |
14 | class FileReader(Generator):
15 | """
16 | Read documents from specified files or standard input.
17 | Reads entire file as one document, or per line, according to config.
18 |
19 | Previous behaviour, removed:
20 | Documents starting with '{' are considered JSON documents and converted to 'dict', unless otherwise configured.
21 |     All are now considered JSON documents and converted to 'dict', unless 'raw_lines' is set in the config.
22 |
23 | Sockets:
24 | output (*) : Documents read. Either entire file as one, or per line. Either raw string or dict.
25 |
26 | Config:
27 |         filename = None : A single file name, appended to 'filenames' for convenience.
28 |         filenames = None : If not set then 'stdin' is assumed. Can take a list of files.
29 |         document_per_file = False : Read each file as one string to be treated as one document.
30 | raw_lines = False : Setting this to True treats the line as a string instead of JSON.
31 | strip_line = True : Whether to remove leading and trailing spaces on a line.
32 | skip_blank_line = True : Whether to skip empty lines (after stripping).
33 |         skip_comment_line = True : Whether to skip comment lines.
34 |         comment_prefix = "#" : Lines beginning with this string are considered comment lines if
35 |                                'skip_comment_line' is True.
36 | """
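    # An illustrative usage sketch (mirrors the callback pattern used by the tests;
    # the file name and handle() are hypothetical):
    #   reader = FileReader(filename="documents.json")   # one JSON document per line
    #   reader.add_callback(lambda proc, doc: handle(doc))
    #   reader.start()
    #   reader.wait()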
37 |
38 | def __init__(self, **kwargs):
39 | super(FileReader, self).__init__(**kwargs)
40 | self.output = self.create_socket("output", None, "Documents read. Either entire file as one, or per line. Either raw string or dict.")
41 |
42 | self.config.set_default(
43 | filename = None,
44 | filenames = [],
45 | document_per_file = False,
46 | raw_lines = False,
47 | strip_line = True,
48 | skip_blank_line = True,
49 | skip_comment_line = True,
50 | comment_prefix = "#",
51 | )
52 | self._filenames = []
53 | self._file = None
54 | self._filename_index = 0
55 |
56 | def on_open(self):
57 |
58 | if self._file:
59 | self.log.error("on_open() attempted when _file exists -- should not be possible.")
60 | return
61 |
62 | # Create a more usable filenames array
63 | self._filenames = []
64 | if self.config.filename:
65 | self._filenames.append(self.config.filename)
66 | if not self.config.filenames:
67 | if not self.config.filename:
68 | self._filenames.append(None) # stdin will be expected
69 | elif type(self.config.filenames) in [str, unicode]:
70 | self._filenames.append(self.config.filenames)
71 | else:
72 | self._filenames.extend(self.config.filenames)
73 |
74 | # Verify that files exists and that we can read them upon starting
75 | for filename in self._filenames:
76 | if filename:
77 | if not os.path.isfile(filename):
78 | e = IOError("File not found: %s" % filename)
79 | e.filename = filename
80 | e.errno = errno.ENOENT # No such file or directory
81 | raise e
82 | elif not os.access(filename, os.R_OK):
83 | e = IOError("Failed to read file: %s" % filename)
84 | e.filename = filename
85 | e.errno = errno.EACCES # Permission denied
86 | raise e
87 |
88 | def _close_file(self):
89 | if self._file and self._file != sys.stdin:
90 | self._file.close()
91 | self._file = None
92 |
93 | def on_close(self):
94 | # If we have an open file, this is our last chance to close it
95 | self._close_file()
96 |
97 | def _handle_data(self, incoming):
98 | data = incoming
99 | if data == None:
100 | return
101 | if self.config.strip_line:
102 | data = data.strip()
103 | if self.config.skip_comment_line and data.startswith(self.config.comment_prefix):
104 | return
105 | if self.config.skip_blank_line and not data:
106 | return
107 | if not self.config.raw_lines:# and data.startswith("{"):
108 | # NOTE: May raise ValueError:
109 | data = json.loads(data)
110 | self.output.send(data)
111 |
112 |
113 | def _read_as_much_as_possible(self):
114 | while True:
115 | # Read as much as we can
116 | r,w,e = select([self._file], [], [self._file], 0)
117 | if e:
118 | pass
119 | # Hm... this happens on every normal file...
120 | #self._close_file()
121 | #break
122 | if r:
123 | line = self._file.readline()
124 | line = codecs.decode(line, self._file.encoding or "UTF-8", "replace")
125 |
126 | if line:
127 | self._handle_data(line)
128 | # In case we should leave the loop while there is still input available:
129 |                     if self.end_tick_reason or self.suspended:
130 | break
131 | if not line:
132 | # We've reached the end of input
133 | self._close_file()
134 | break
135 | else:
136 | break
137 |
138 | # Candidate for Windows:
139 | def _read_as_much_as_possible_Windows(self):
140 | for line in self._file:
141 | line = codecs.decode(line, self._file.encoding or "UTF-8", "replace")
142 | self._handle_data(line)
143 | # In case we should leave the loop while there is still input available:
144 |             if self.end_tick_reason or self.suspended:
145 | return
146 | self._close_file()
147 |
148 | def on_tick(self):
149 |
150 | if self._file:
151 | # We were working on a file... keep reading
152 | if self.config.document_per_file:
153 |                 content = self._file.read()
154 |                 self._handle_data(content)
155 | self._close_file()
156 | else:
157 | self._read_as_much_as_possible()
158 | elif self._filename_index >= len(self._filenames):
159 | # We're done!
160 | self.stop()
161 | return
162 | else:
163 | filename = self._filenames[self._filename_index]
164 | if not filename:
165 | self.log.debug("Starting read from stdin.")
166 | self._file = sys.stdin
167 | else:
168 | self.log.debug("Opening file '%s'." % filename)
169 | self._file = open(filename, "r" if self.config.document_per_file else "rt")
170 | self._filename_index += 1
171 | # Return from tick and reenter later with a file to process
172 | return
173 |
--------------------------------------------------------------------------------