├── test ├── __init__.py ├── test_procs │ ├── __init__.py │ ├── data │ │ ├── csv_no_header.csv │ │ ├── csv_with_header.csv │ │ ├── tweet_entity_removal.json │ │ └── twitter_raw_mock.json │ ├── wrapped_process_cmd.py │ ├── wrapped_process_json.py │ ├── test_html_remover.py │ ├── wrapped_process.py │ ├── test_twitter_monitor.py │ ├── test_pattern_remover.py │ ├── test_transformer.py │ ├── test_http_monitor.py │ ├── test_dateexpander.py │ ├── test_tweet_entity_removal.py │ ├── test_tweet_extractor.py │ ├── test_csv_converter.py │ ├── test_blacklist_filter.py │ └── test_entity_extractor.py ├── test_config.py ├── test_protocol_compliance.py ├── test_connections.py └── test_service │ └── test_http_service.py ├── MANIFEST.in ├── bin ├── es-managerd.sh ├── es-write ├── es-cleantweet └── es-read ├── eslib ├── Monitor.py ├── service │ ├── __init__.py │ ├── DummyService.py │ ├── PipelineService.py │ └── RemotingService.py ├── Generator.py ├── debug.py ├── procs │ ├── Transformer.py │ ├── Throttle.py │ ├── CLIReader.py │ ├── FileWriter.py │ ├── TwitterFollowerGetter.py │ ├── TweetEntityRemover.py │ ├── __init__.py │ ├── DateExpander.py │ ├── Timer.py │ ├── TwitterUserGetter.py │ ├── KafkaWriter.py │ ├── SmtpMailer.py │ ├── HtmlRemover.py │ ├── TweetExtractor.py │ ├── Neo4jReader.py │ ├── CsvConverter.py │ ├── RabbitmqWriter.py │ ├── PatternRemover.py │ ├── Neo4jWriter.py │ ├── KafkaMonitor.py │ ├── TcpWriter.py │ ├── RabbitmqMonitor.py │ └── FileReader.py ├── Socket.py ├── Configurable.py ├── text.py ├── TerminalInfo.py ├── prog.py ├── Terminal.py ├── esdoc.py ├── time.py ├── __init__.py ├── Connector.py └── web.py ├── examples ├── service_run_dir │ └── config │ │ ├── credentials.yaml │ │ ├── services.yaml │ │ ├── logging-console.yaml │ │ └── logging.yaml ├── resources │ └── tweet.json ├── entity_extractor.py └── remoting │ ├── RemotingClient.py │ └── DummyRemotingService.py ├── DEVHELP.txt ├── .gitignore ├── setup.py └── PROTOCOLS.md /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/test_procs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | #include LICENSE 2 | include README.md 3 | include PROTOCOLS.md 4 | include examples/* 5 | recursive-exclude test * 6 | -------------------------------------------------------------------------------- /test/test_procs/data/csv_no_header.csv: -------------------------------------------------------------------------------- 1 | 1,"hans terje","bakke","htb" 2 | 2,"eivind","elseth","eee" 3 | 3,"ole-kristian","villabø","okv" 4 | -------------------------------------------------------------------------------- /test/test_procs/data/csv_with_header.csv: -------------------------------------------------------------------------------- 1 | "id","name","last name","initials" 2 | 1,"hans terje","bakke","htb" 3 | 2,"eivind","elseth","eee" 4 | 3,"ole-kristian","villabø","okv" 5 | -------------------------------------------------------------------------------- /bin/es-managerd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Note: es-run-service must be in path 4 | # Note: ESLIB_SERVICE_DIR must be set, or -d option must be used 5 | exec ./es-service $@ -c 
manager managerd -e localhost:5000 --start 6 | -------------------------------------------------------------------------------- /eslib/Monitor.py: -------------------------------------------------------------------------------- 1 | from .Generator import Generator 2 | 3 | class Monitor(Generator): 4 | def __init__(self, **kwargs): 5 | super(Monitor, self).__init__(**kwargs) 6 | 7 | self.keepalive = True # A monitor never stops, unless told to 8 | -------------------------------------------------------------------------------- /test/test_procs/wrapped_process_cmd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys, codecs 5 | 6 | 7 | print "INNER/STARTING" 8 | 9 | print "INNER/" + u" ".join([codecs.decode(x, "UTF-8") for x in sys.argv[1:]]) 10 | 11 | print "INNER/EXITING" 12 | -------------------------------------------------------------------------------- /examples/service_run_dir/config/credentials.yaml: -------------------------------------------------------------------------------- 1 | rabbitmq: 2 | username : xxxx 3 | password : xxxx 4 | 5 | twitter: 6 | consumer_key : xxxxxxxxxxxxxxxxxxxxx 7 | consumer_secret : xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 8 | access_token : xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 9 | access_token_secret : xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 10 | -------------------------------------------------------------------------------- /DEVHELP.txt: -------------------------------------------------------------------------------- 1 | See this guide for preparations, setting up accounts and a ~/.pypirc file: 2 | 3 | http://peterdowns.com/posts/first-time-with-pypi.html 4 | 5 | To install the package from the source tree: 6 | 7 | pip install -e . 
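(The -e flag installs the package in "editable"/development mode, so changes in the source tree take effect without reinstalling.)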
8 | 9 | To install from PyPI: 10 | 11 | pip install elasticsearch-eslib 12 | 13 | To upload package to PyPI test (pypitest) or live (pypi): 14 | 15 | python setup.py register -r pypi 16 | python setup.py sdist upload -r pypi 17 | 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | *.swp 3 | 4 | 5 | # Folders 6 | tmp/ 7 | HTBTEST/ 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Packages 13 | *.egg 14 | *.egg-info 15 | dist 16 | build 17 | eggs 18 | parts 19 | #bin 20 | var 21 | sdist 22 | develop-eggs 23 | .installed.cfg 24 | lib 25 | lib64 26 | __pycache__ 27 | 28 | # Installer logs 29 | pip-log.txt 30 | 31 | # Unit test / coverage reports 32 | .coverage 33 | .tox 34 | nosetests.xml 35 | 36 | # Translations 37 | *.mo 38 | 39 | # Mr Developer 40 | .mr.developer.cfg 41 | .project 42 | .pydevproject 43 | .idea 44 | -------------------------------------------------------------------------------- /examples/service_run_dir/config/services.yaml: -------------------------------------------------------------------------------- 1 | # Manager 2 | 3 | manager: 4 | name : "manager" 5 | management_endpoint : "localhost:5000" 6 | 7 | elasticsearch_hosts : ["localhost:9200"] 8 | elasticsearch_index : "management" 9 | dynamic_port_ranges : [["localhost", 5010, 5019]] 10 | 11 | # Dummy 12 | 13 | dummy: 14 | manager_endpoint : "localhost:5000" 15 | #management_endpoint : "localhost:5008" 16 | management_endpoint : "localhost" 17 | 18 | name : "dummy" 19 | frequency : 3 20 | lifespan : 120 21 | -------------------------------------------------------------------------------- /test/test_procs/wrapped_process_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys, select, json 5 | 6 | def send(s): 7 | print json.dumps({"inner": s}) 8 | 9 | 10 | try: 11 | while True: 12 | r,w,e = select.select([sys.stdin],[],[],0) 13 | if r: 14 | line = sys.stdin.readline() 15 | if line: 16 | dd = json.loads(line) 17 | s = dd.get("outer") 18 | if s: 19 | send("echo: %s" % s) 20 | else: 21 | send("stdin was hung up") 22 | break 23 | except KeyboardInterrupt: 24 | send("interrupted") 25 | send("finished") 26 | -------------------------------------------------------------------------------- /examples/service_run_dir/config/logging-console.yaml: -------------------------------------------------------------------------------- 1 | version : 1 2 | disable_existing_loggers: False 3 | formatters: 4 | categories: 5 | format: "%(firstName) -15s %(serviceName) -15s %(className) -20s %(instanceName) -20s %(levelname) -10s %(message)s" 6 | rich: 7 | format: "%(asctime)s %(name) -30s %(className) -20s %(lineno) 5d %(funcName) -20s %(levelname) -10s %(message)s" 8 | compact: 9 | format: "%(name) -30s %(levelname) -10s %(message)s" 10 | 11 | handlers: 12 | console: 13 | class : logging.StreamHandler 14 | formatter : rich 15 | level : TRACE 16 | stream : ext://sys.stdout 17 | loggers: 18 | "": 19 | handlers : [console] 20 | level : WARNING 21 | servicelog: 22 | level : DEBUG 23 | proclog: 24 | level : DEBUG 25 | doclog: 26 | level : WARNING 27 | -------------------------------------------------------------------------------- /eslib/service/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.service 5 | ~~~~~ 6 
| 7 | Base classes for wrapping document processing processors into processing graphs/pipelines and control them. 8 | """ 9 | 10 | from .. import esdoc 11 | 12 | 13 | from .Service import Service, status 14 | from .HttpService import HttpService 15 | from .PipelineService import PipelineService 16 | from .ServiceManager import ServiceManager 17 | from .ServiceLauncher import ServiceLauncher 18 | from .DummyService import DummyService 19 | from .Client import Client 20 | from .RemotingService import RemotingService 21 | 22 | 23 | __all__ = ( 24 | "Service", 25 | "HttpService", 26 | "PipelineService", 27 | "ServiceManager", 28 | "ServiceLauncher", 29 | "DummyService", 30 | "Client", 31 | "RemotingService" 32 | ) 33 | -------------------------------------------------------------------------------- /test/test_procs/test_html_remover.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from eslib.procs import HtmlRemover 5 | 6 | class TestHtmlRemover(unittest.TestCase): 7 | 8 | def test_str(self): 9 | dirty = 'Lady & Landstrykeren' 10 | 11 | p = HtmlRemover() 12 | cleaned = p._clean(dirty) 13 | print "D=", dirty 14 | print "C=", cleaned 15 | 16 | self.assertTrue(cleaned == "Lady & Landstrykeren") 17 | 18 | def test_unicode(self): 19 | dirty = u'Lady & Landstrykeren' 20 | 21 | p = HtmlRemover() 22 | cleaned = p._clean(dirty) 23 | print "D=", dirty 24 | print "C=", cleaned 25 | 26 | self.assertTrue(cleaned == u"Lady & Landstrykeren") 27 | 28 | def main(): 29 | unittest.main() 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /eslib/Generator.py: -------------------------------------------------------------------------------- 1 | from .Processor import Processor 2 | 3 | class Generator(Processor): 4 | def __init__(self, **kwargs): 5 | super(Generator, self).__init__(**kwargs) 6 | self.is_generator = True 7 | 8 | # These methods could/should be implemented by inheriting classes: 9 | 10 | # on_open(self) # from Processor 11 | # on_close(self) # from Processor 12 | 13 | # on_startup(self) 14 | # on_shutdown(self) 15 | # on_abort(self) # from Processor 16 | # on_tick(self) 17 | # on_suspend(self) 18 | # on_resume(self) 19 | 20 | # If on_tick finishes on its own without external stop call, call self.stop() from there when done. 21 | 22 | @property 23 | def end_tick_reason(self): 24 | "If 'aborted', 'stopping' or not 'running'. 'suspended' is not a reason to leave the tick; handle this yourself." 25 | return self.aborted or self.stopping or self.restarting or not self.running 26 | -------------------------------------------------------------------------------- /eslib/debug.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.debug 5 | ~~~~~~~~~~~ 6 | 7 | Module containing functions useful for debugging. 
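Example: byte_size_string(123456789) returns "117.7 MB"; get_memory_used() reports this process's peak resident memory (ru_maxrss), on POSIX systems only.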
8 | """ 9 | import os 10 | 11 | 12 | __all__ = ("byte_size_string", "get_memory_used") 13 | 14 | 15 | if os.name == 'posix': 16 | import resource 17 | 18 | 19 | def byte_size_string(bytes, decimals=1): 20 | kB = bytes / 1024.0 21 | MB = kB / 1024.0 22 | GB = MB / 1024.0 23 | s = None 24 | if GB > 1.0: s = "%.*f GB" % (decimals, GB) 25 | elif MB > 1.0: s = "%.*f MB" % (decimals, MB) 26 | elif kB > 1.0: s = "%.*f kB" % (decimals, kB) 27 | else: s = "%s B" % bytes 28 | return s 29 | 30 | 31 | def get_memory_used(): 32 | """Get current memory usage by this process. Supposedly in KB.""" 33 | if os.name == 'posix': 34 | return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 35 | else: 36 | 0 # Don't want to risk an exception here.. 37 | #raise NotImplementedError 38 | -------------------------------------------------------------------------------- /examples/resources/tweet.json: -------------------------------------------------------------------------------- 1 | { "_timestamp": "2014-10-14T14:26:29Z", "_source": { "id": "522030691567931393", "geo": { "coordinates": [ 40.757023, -74.001698 ], "type": "Point" }, "lang": "en", "entities": { "urls": [ { "indices": [ 70, 92 ], "url": "http://instagram.com/p/uIt8BfP5Qp/" } ], "hashtags": [], "user_mentions": [ { "name": "Stella Chuu", "screen_name": "StellaChuuuuu", "indices": [ 20, 34 ], "id": "285369968" } ] }, "truncated": false, "text": "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv", "created_at": "2014-10-14T14:26:30Z", "source": "Instagram", "place": { "country_code": "US", "country": "United States" }, "user": { "name": "JJ Dillon", "id": "35273719", "lang": "en", "description": "i love beautiful women. like to party & have fun. very cool, calm, laid back person. i love video games, anime, movies, xbox 360, comic books, pop culture", "created_at": "2009-04-25T18:20:07Z", "profile_image_url": "http://pbs.twimg.com/profile_images/506599782908178432/c6pyAlfv_normal.jpeg", "screen_name": "JJDillon430", "location": "New York", "geo_enabled": true, "protected": false } }, "_id": "522030691567931393" } 2 | -------------------------------------------------------------------------------- /examples/entity_extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from eslib.procs import FileReader, FileWriter, EntityExtractor 6 | 7 | def listener(document): 8 | print document["_source"]["extracted"] 9 | 10 | entities = [ 11 | { 12 | "category": "location", 13 | "name": "place", 14 | "match": [ 15 | { "type": "exact", "pattern": "Convention" } 16 | #{ "type": "iprange", "value": "81.27.32.186/16" } 17 | ] 18 | }, 19 | { 20 | "category": "agent", 21 | "name": "user", 22 | "match": [ 23 | { "type": "exact", "pattern": "Jacob" } 24 | #{ "type": "iprange", "value": "81.27.32.186/16" } 25 | ] 26 | }, 27 | { 28 | "category": "agent", 29 | "name": "user", 30 | "match": [ 31 | { "type": "exact", "pattern": "stellachuuuuu" } 32 | #{ "type": "iprange", "value": "81.27.32.186/16" } 33 | ] 34 | } 35 | ] 36 | 37 | 38 | r = FileReader(filename = "resources/tweet.json") 39 | p = EntityExtractor(fields=["text"], target="extracted", entities=entities) 40 | w = FileWriter() 41 | 42 | p.subscribe(r) 43 | w.subscribe(p, "entities") 44 | 45 | r.start() 46 | w.wait() # Will finish once the reader is finished. 
47 | -------------------------------------------------------------------------------- /test/test_procs/wrapped_process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import time, sys, signal 5 | from select import select 6 | 7 | #region Signal handling 8 | 9 | def _handler_SIGINT(signal, frame): 10 | print "INNER/RECEIVED SIGINT -- ignoring" 11 | 12 | def _handler_SIGTERM(signal, frame): 13 | global running 14 | print "INNER/RECEIVED SIGTERM -- terminating" 15 | running = False 16 | 17 | def _handler_SIGHUP(signal, frame): 18 | print "INNER/RECEIVED SIGHUP -- ignoring" 19 | 20 | signal.signal(signal.SIGINT , _handler_SIGINT ) 21 | signal.signal(signal.SIGTERM, _handler_SIGTERM) 22 | signal.signal(signal.SIGHUP , _handler_SIGHUP ) 23 | 24 | #endregion Signal handling 25 | 26 | running = True 27 | 28 | print "INNER/STARTING" 29 | 30 | while running: 31 | r,w,e = select([sys.stdin],[],[],0) 32 | if r: 33 | line = sys.stdin.readline() 34 | line = line.strip() 35 | if line: 36 | print "INNER/ECHO:", line 37 | if line == "*HANGUP*": 38 | print "INNER/HANGING UP ON *HANGUP* REQUEST" 39 | running = False 40 | elif line == "*RAISE*": 41 | raise Exception("INNER/RAISED EXCEPTION UPON *RAISE* REQUEST") 42 | else: 43 | print "INNER/STDIN WAS HUNG UP -- GOOD BYE" 44 | running = False 45 | 46 | print "INNER/EXITING" 47 | -------------------------------------------------------------------------------- /test/test_config.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from eslib import Config 3 | 4 | class TestConfig(unittest.TestCase): 5 | 6 | def test_access(self): 7 | config = Config() 8 | config.set_default(a="A", b="B") 9 | 10 | print config["a"] 11 | print config.a 12 | 13 | self.assertEqual("A", config["a"]) 14 | self.assertEqual("A", config.a) 15 | 16 | def test_assignment(self): 17 | config = Config() 18 | #config.set_default(a="A", b="B") 19 | 20 | config["a"] = "A" 21 | config.a = "B" 22 | 23 | print config["a"] 24 | print config.a 25 | 26 | self.assertEqual("B", config["a"]) 27 | self.assertEqual("B", config.a) 28 | 29 | 30 | def test_defaults_and_overrides(self): 31 | config = Config() 32 | config.set_default(a="A", b="B", x="X") 33 | 34 | config.set(a="D", b=None) 35 | 36 | print config["a"] 37 | print config.a 38 | self.assertEqual("D", config.a) 39 | 40 | config.a = "C" 41 | print config.a 42 | self.assertEqual("C", config.a) 43 | 44 | print config["b"] 45 | print config.b 46 | self.assertEqual(None, config.b) 47 | 48 | print config.x 49 | self.assertEqual("X", config.x) 50 | 51 | def main(): 52 | unittest.main() 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /bin/es-write: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from eslib.procs import ElasticsearchWriter, FileReader 6 | import eslib.prog 7 | import eslib.time 8 | import argparse, sys 9 | 10 | 11 | def main(): 12 | help_i = "Which index to write documents to." 13 | help_t = "Which type to set on document (overrides incoming type)." 
14 | 15 | parser = argparse.ArgumentParser(usage="\n %(prog)s -i index [-t type] [-f field] [-l limit] [more options]") 16 | parser._actions[0].help = argparse.SUPPRESS 17 | parser.add_argument("-i", "--index" , help=help_i, required=True) 18 | parser.add_argument("-t", "--type" , help=help_t) 19 | parser.add_argument( "--host" , help="Elasticsearch host, format 'host:port' or just 'host'.", default=None) 20 | #parser.add_argument( "--debug" , action="store_true") 21 | parser.add_argument( "--name" , help="Process name.", default=None) 22 | 23 | if len(sys.argv) == 1: 24 | parser.print_usage() 25 | sys.exit(0) 26 | 27 | args = parser.parse_args() 28 | 29 | # Set up and run this processor 30 | w = ElasticsearchWriter( 31 | name = args.name or eslib.prog.progname(), 32 | hosts = [args.host] if args.host else [], 33 | index = args.index, 34 | doctype = args.type, 35 | batchsize = 1000, 36 | batchtime = 60.0 37 | ) 38 | 39 | # if args.debug: w.debuglevel = 0 40 | 41 | r = FileReader() 42 | w.subscribe(r) 43 | r.start() 44 | w.wait() 45 | 46 | 47 | if __name__ == "__main__": main() 48 | -------------------------------------------------------------------------------- /eslib/procs/Transformer.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | 5 | class Transformer(Processor): 6 | """ 7 | Convert input to output protocol. 8 | Returns a LIST of zero or more documents converted to the output protocol. 9 | 10 | The following parameters are not part of the processors 'config' object, and can and must be set only upon 11 | instantiation: 12 | 13 | input_protocol = None 14 | output_protocol = None 15 | func = None # Mandatory! Must be a function returning a list (or generator) of zero or more 16 | documents complying with the output protocol. Function signature must be 17 | func(proc, doc), where proc is this transformer processor, so you can address it 18 | in your function. 19 | """ 20 | def __init__(self, func=None, input_protocol=None, output_protocol=None, **kwargs): 21 | super(Transformer, self).__init__(**kwargs) 22 | self.create_connector(self._incoming, "input", input_protocol) 23 | self._output = self.create_socket("output", output_protocol) 24 | 25 | self._func = func 26 | 27 | def _incoming(self, incoming): 28 | try: 29 | ll = self._func(self, incoming) 30 | if ll: 31 | for outgoing in ll: 32 | if outgoing: 33 | self._output.send(outgoing) 34 | except Exception as e: 35 | self.doclog.exception("Error in protocol converter function call.") 36 | -------------------------------------------------------------------------------- /eslib/Socket.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .Terminal import Terminal 4 | 5 | 6 | class Socket(Terminal): 7 | "Output terminal in a Processor. Writes data to one or more subscribing connectors of matching protocol." 8 | 9 | def __init__(self, name, protocol=None, mimic=None): 10 | super(Socket, self).__init__(name, protocol) 11 | self.type = Socket 12 | self.callbacks = [] # List of methods for external callbacks 13 | self.mimic = mimic 14 | 15 | def send(self, document): 16 | "Send data to all subscribing connectors and callbacks." 
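# Note: the connection list is snapshotted before iteration, and each registered callback is invoked as callback(owner, document) after the connectors have received the document.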
17 | 18 | # Send data to all accepting connectors 19 | subscribers = self.connections[:] 20 | for subscriber in subscribers: 21 | if subscriber.accepting: 22 | subscriber.receive(document) 23 | # Finally, notify all subscribing callbacks 24 | for callback in self.callbacks: 25 | callback(self.owner, document) 26 | 27 | @property 28 | def has_output(self): 29 | if self.connections or self.callbacks: 30 | return True 31 | return False 32 | 33 | def _find_mimic_proto(self, visited=None): 34 | if not visited: 35 | visited = [] 36 | if self.mimic and self.mimic.connections and not self in visited: 37 | visited.append(self) 38 | connected_socket = self.mimic.connections[0] 39 | return connected_socket._find_mimic_proto(visited) 40 | return self.protocol 41 | 42 | @property 43 | def mimiced_protocol(self): 44 | return self._find_mimic_proto() 45 | -------------------------------------------------------------------------------- /test/test_procs/test_twitter_monitor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import unittest, json 5 | from eslib.procs import TwitterMonitor 6 | 7 | class TestTwitterMonitor(unittest.TestCase): 8 | 9 | def test_simple(self): 10 | 11 | # Load test data 12 | self_dir, _ = os.path.split(__file__) 13 | f = open(os.path.join(self_dir, "data/twitter_raw_mock.json")) 14 | j = json.load(f) 15 | f.close() 16 | 17 | m = TwitterMonitor() 18 | raw, tweet = m._decode(j) 19 | 20 | # Test tweet 21 | self.assertTrue(tweet["_id"] == "520149420122578944") 22 | self.assertTrue(tweet["_source"]["source"] == u"Twitter for BlackBerry®") 23 | self.assertTrue(tweet["_source"]["text"] == u'These clowns must hope that we never cum under attack from any force-r we capable of protecting ourselves?') 24 | self.assertTrue(str(tweet["_source"]["created_at"]) == "2014-10-09 09:51:00.328000") 25 | self.assertTrue("geo" in tweet["_source"]) 26 | self.assertTrue(tweet["_source"]["lang"] == "en") 27 | self.assertTrue(tweet["_source"]["place"]["country"] == "South Africa") 28 | self.assertFalse("in_reply_to" in tweet["_source"]) 29 | # User 30 | self.assertTrue(tweet["_source"]["user"]["id"] == "2196916282") 31 | self.assertTrue(tweet["_source"]["user"]["lang"] == "en") 32 | self.assertTrue(tweet["_source"]["user"]["name"] == "mark fester") 33 | self.assertFalse("description" in tweet["_source"]["user"]) 34 | self.assertTrue(str(tweet["_source"]["user"]["created_at"]) == "2013-11-26 14:21:35") 35 | 36 | # Entities 37 | # // TODO 38 | 39 | def main(): 40 | unittest.main() 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /test/test_procs/data/tweet_entity_removal.json: -------------------------------------------------------------------------------- 1 | { 2 | "_timestamp": "2014-10-14T14:26:29Z", 3 | "_source": { 4 | "id": "522030691567931393", 5 | "geo": { 6 | "coordinates": [ 7 | 40.757023, 8 | -74.001698 9 | ], 10 | "type": "Point" 11 | }, 12 | "lang": "en", 13 | "entities": { 14 | "urls": [ 15 | { 16 | "indices": [ 17 | 70, 18 | 92 19 | ], 20 | "url": "http://instagram.com/p/uIt8BfP5Qp/" 21 | } 22 | ], 23 | "hashtags": [], 24 | "user_mentions": [ 25 | { 26 | "name": "Stella Chuu", 27 | "screen_name": "StellaChuuuuu", 28 | "indices": [ 29 | 20, 30 | 34 31 | ], 32 | "id": "285369968" 33 | } 34 | ] 35 | }, 36 | "truncated": false, 37 | "text": "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center 
http://t.co/x6BUjNY0jv", 38 | "created_at": "2014-10-14T14:26:30Z", 39 | "source": "Instagram", 40 | "place": { 41 | "country_code": "US", 42 | "country": "United States" 43 | }, 44 | "user": { 45 | "name": "JJ Dillon", 46 | "id": "35273719", 47 | "lang": "en", 48 | "description": "i love beautiful women. like to party & have fun. very cool, calm, laid back person. i love video games, anime, movies, xbox 360, comic books, pop culture", 49 | "created_at": "2009-04-25T18:20:07Z", 50 | "profile_image_url": "http://pbs.twimg.com/profile_images/506599782908178432/c6pyAlfv_normal.jpeg", 51 | "screen_name": "JJDillon430", 52 | "location": "New York", 53 | "geo_enabled": true, 54 | "protected": false 55 | } 56 | }, 57 | "_id": "522030691567931393" 58 | } 59 | -------------------------------------------------------------------------------- /eslib/Configurable.py: -------------------------------------------------------------------------------- 1 | class Config(object): 2 | def __init__(self, **config): 3 | super(Config, self).__init__() 4 | if config is not None: 5 | self.__dict__ = config 6 | self.defaults = {} 7 | 8 | def set_default(self, **kwargs): 9 | for key,val in kwargs.iteritems(): 10 | self.defaults[key] = val 11 | # if not key in self.__dict__: 12 | # self.__dict__[key] = val 13 | 14 | def __getattr__(self, key): 15 | if key in self.__dict__: 16 | return self.__dict__.__getattr__(key) 17 | elif key in self.defaults: 18 | return self.defaults[key] 19 | else: 20 | raise AttributeError("'%s' has no attribute '%s'" % (self.__class__.__name__, key)) 21 | 22 | def __getitem__(self, key): 23 | if key in self.__dict__: 24 | return self.__dict__[key] 25 | elif key in self.defaults: 26 | return self.defaults[key] 27 | else: 28 | raise AttributeError("'%s' has no attribute '%s'" % (self.__class__.__name__, key)) 29 | 30 | def __setitem__(self, key, value): 31 | self.__dict__[key] = value 32 | 33 | def set(self, ignore_none=False, **kwargs): 34 | "ignore_none means that fields with value None are not set." 35 | for key,val in kwargs.iteritems(): 36 | if ignore_none and val is None: 37 | continue 38 | self.__dict__[key] = val 39 | 40 | def get_default_attributes(self): 41 | return self.defaults 42 | 43 | def get_user_attributes(self): 44 | return {key: val for key, val in self.__dict__.iteritems() if key not in self.defaults} 45 | 46 | class Configurable(object): 47 | def __init__(self, **kwargs): 48 | super(Configurable, self).__init__() 49 | self.config = Config(**kwargs) 50 | -------------------------------------------------------------------------------- /eslib/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.text 5 | ~~~~~~~~~~ 6 | 7 | Module containing operations on text strings. 8 | """ 9 | 10 | 11 | __all__ = ("remove_parts", "remove_html") 12 | 13 | 14 | import re 15 | from HTMLParser import HTMLParser 16 | 17 | import sys 18 | 19 | def remove_parts(text, sections): 20 | """ 21 | Remove sections from text. Sections is a list of tuples with (start,end) 22 | coordinates to clip from the text string. 
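Example: remove_parts("0123456789", [(2, 4), (6, 8)]) returns "014589".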
23 | """ 24 | 25 | if not sections: return text 26 | 27 | c = sorted(sections) 28 | s = [] 29 | s.append(text[:c[0][0]]) 30 | for i in range(1, len(c)): 31 | s.append(text[c[i-1][1]:c[i][0]]) 32 | s.append(text[c[-1][1]:]) 33 | cleaned = "".join(s) 34 | return cleaned 35 | 36 | #region remove_html 37 | 38 | class _MLStripper(HTMLParser): 39 | def __init__(self): 40 | self.reset() 41 | self.fed = [] 42 | self.strict = False 43 | def handle_data(self, d): 44 | self.fed.append(d) 45 | def get_data(self): 46 | return ''.join(self.fed) 47 | 48 | 49 | _regex_whitespace = re.compile(r'\s+', re.UNICODE) 50 | _regex_scripts = re.compile(r""".*?""", re.MULTILINE|re.DOTALL|re.UNICODE) 51 | _regex_style = re.compile(r"""(.*?)""", re.MULTILINE|re.DOTALL|re.UNICODE) 52 | 53 | def remove_html(text): 54 | if not text or not type(text) in [str, unicode]: 55 | return text 56 | 57 | text = re.sub(_regex_scripts, " ", text) 58 | text = re.sub(_regex_style , " ", text) 59 | stripper = _MLStripper() 60 | cleaned = stripper.unescape(text) 61 | stripper.feed(cleaned) 62 | cleaned = stripper.get_data() 63 | cleaned = re.sub(_regex_whitespace, " ", cleaned) 64 | return cleaned 65 | 66 | #endregion remove_html 67 | 68 | -------------------------------------------------------------------------------- /eslib/procs/Throttle.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | import time 5 | 6 | 7 | class Throttle(Processor): 8 | """ 9 | Only pass through documents that satisfy a whitelist of terms or where certain terms do not occur in a combination 10 | with blacklisted terms. 11 | 12 | Connectors: 13 | input (esdoc) : Incoming document in 'esdoc' dict format. 14 | Sockets: 15 | output (esdoc) : Documents that passed the blacklist filtering, arrived on 'input' connector. 16 | 17 | Config: 18 | delay = 1.0 : Time to delay document throughput, in seconds (float). 19 | drop = False : Drop items we don't have time for instead of buffering up. 
20 | """ 21 | 22 | def __init__(self, **kwargs): 23 | super(Throttle, self).__init__(**kwargs) 24 | 25 | m = self.create_connector(self._incoming, "input", None, "Incoming document.") 26 | self.output = self.create_socket("output" , None, "Outgoing document.", mimic=m) 27 | 28 | self.config.set_default( 29 | delay = 1.0, 30 | drop = True 31 | ) 32 | 33 | self._last_write_ts = 0 34 | 35 | def on_open(self): 36 | self._last_write_ts = 0 37 | 38 | def _incoming(self, doc): 39 | if self.output.has_output: 40 | if self.config.drop: 41 | now_ts = time.time() 42 | if now_ts - self._last_write_ts > self.config.delay: # Otherwise just ignore the incoming doc 43 | self._last_write_ts = now_ts 44 | self.output.send(doc) 45 | #print "QUEUE=", self.connectors["input"].queue.qsize() 46 | else: 47 | time.sleep(self.config.delay) 48 | self.output.send(doc) 49 | #print "QUEUE=", self.connectors["input"].queue.qsize() 50 | -------------------------------------------------------------------------------- /examples/service_run_dir/config/logging.yaml: -------------------------------------------------------------------------------- 1 | version : 1 2 | disable_existing_loggers: False 3 | formatters: 4 | brief: 5 | format: "%(firstName) -20s %(serviceName) -20s %(className) -20s %(instanceName) -20s %(levelname) -10s %(message)s" 6 | individual: 7 | format: "%(asctime)s %(className) -20s %(instanceName) -20s %(levelname) -10s %(message)s" 8 | root: 9 | format: "%(asctime)s %(name) -50s %(levelname) -10s %(message)s" 10 | 11 | handlers: 12 | console: 13 | class : logging.StreamHandler 14 | formatter : brief 15 | level : INFO 16 | #filters : [allow_foo] 17 | stream : ext://sys.stdout 18 | file_root: 19 | class : logging.handlers.RotatingFileHandler 20 | formatter : root 21 | filename : root.log 22 | maxBytes : 1024 23 | backupCount : 3 24 | file_service: 25 | class : logging.handlers.RotatingFileHandler 26 | formatter : individual 27 | filename : service.log 28 | maxBytes : 1024 29 | backupCount : 3 30 | file_proc: 31 | class : logging.handlers.RotatingFileHandler 32 | formatter : individual 33 | filename : proc.log 34 | maxBytes : 1024 35 | backupCount : 3 36 | file_doc: 37 | class : logging.handlers.RotatingFileHandler 38 | formatter : individual 39 | filename : doc.log 40 | maxBytes : 1024 41 | backupCount : 3 42 | loggers: 43 | "": 44 | handlers : [file_root] 45 | level : DEBUG 46 | servicelog: 47 | handlers : [console, file_service] 48 | level : DEBUG 49 | propagate : false 50 | proclog: 51 | handlers : [console, file_proc] 52 | level : DEBUG 53 | propagate : false 54 | doclog: 55 | handlers : [file_doc] 56 | level : DEBUG 57 | propagate : false 58 | 59 | # servicelog.SERVICE.INSTANCE 60 | 61 | doclog.myservice.myinstance: 62 | level: DEBUG 63 | -------------------------------------------------------------------------------- /eslib/TerminalInfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .Connector import Connector 4 | from .Socket import Socket 5 | 6 | 7 | class TerminalInfo(object): 8 | 9 | def __init__(self, terminal=None, include_connections=True): 10 | if terminal: 11 | self.type = terminal.type # t.__class__.__name__ 12 | owner_name = "orphan" 13 | if terminal.owner: owner_name = terminal.owner.name or "???" 14 | if terminal.owner: owner_name = terminal.owner.name or "???" 
15 | self.owner = owner_name 16 | self.name = terminal.name 17 | self.protocol = terminal.protocol 18 | self.description = terminal.description 19 | connections = terminal.get_connections() 20 | self.count = len(connections) 21 | self.connections = [] 22 | if include_connections: 23 | for c in terminal.get_connections(): 24 | self.connections.append(TerminalInfo(c, False)) 25 | 26 | def DUMP(self, follow_connections=True, verbose=False, indent=0): 27 | spacing = " " 28 | spc = spacing * indent 29 | type_indicator = "?" 30 | mimic_str = "" 31 | if self.type is Socket: 32 | type_indicator = "+" 33 | if self.mimic: 34 | mimic_str = " (mimic=%s)" % self.mimic.name 35 | elif self.type is Connector: 36 | type_indicator = "-" 37 | 38 | print "%s%c%s.%s(%s) (conns=%d)%s" % (spc, type_indicator, self.owner, self.name, self.protocol, self.count, mimic_str) 39 | if verbose and self.description: 40 | print "\"%s%s%s\"" % (spc, spc, self.description) 41 | 42 | if follow_connections and self.connections: 43 | subindent = 0 44 | if verbose: 45 | print "%sConnections:" % spc 46 | subindent += 1 47 | for c in self.connections: 48 | c.DUMP(False, verbose, subindent+1) 49 | 50 | -------------------------------------------------------------------------------- /test/test_procs/test_pattern_remover.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from eslib.procs import PatternRemover 5 | from eslib import esdoc 6 | 7 | class TestPatternRemover(unittest.TestCase): 8 | 9 | def test_str(self): 10 | dirty = u"Oh my fucking god…" 11 | 12 | p = PatternRemover(patterns=["my", u"\S+…"]) 13 | p.on_open() # Force generation of internal regexes 14 | cleaned = p._clean(dirty) 15 | print "D=", dirty 16 | print "C=", cleaned 17 | 18 | self.assertTrue(cleaned == "Oh fucking") 19 | 20 | def test_field(self): 21 | dirty_text = u"Oh my fucking god…" 22 | 23 | dirty = { 24 | "_id": "somedoc", 25 | "_source": { 26 | "text": dirty_text 27 | } 28 | } 29 | 30 | p = PatternRemover(patterns=["my", u"\S+…"], target_field="cleaned") 31 | p.on_open() # Force generation of internal regexes 32 | cleaned = p._clean(dirty) 33 | print "D=", esdoc.getfield(cleaned, "_source.text") 34 | print "C=", esdoc.getfield(cleaned, "_source.cleaned") 35 | 36 | self.assertTrue(esdoc.getfield(cleaned, "_source.text" ) == dirty_text) 37 | self.assertTrue(esdoc.getfield(cleaned, "_source.cleaned") == "Oh fucking") 38 | 39 | def test_field_map(self): 40 | dirty = { 41 | "_id": "somedoc", 42 | "_source": { 43 | "A": "This was A", 44 | "b": { "B": "This was B"} 45 | } 46 | } 47 | 48 | p = PatternRemover(pattern="was", field_map={"A": "cleaned.cleaned_A", "b.B": "cleaned.cleaned_B"}) 49 | p.on_open() # Force generation of internal regexes 50 | cleaned = p._clean(dirty) 51 | 52 | self.assertTrue(esdoc.getfield(cleaned, "_source.cleaned.cleaned_A") == "This A") 53 | self.assertTrue(esdoc.getfield(cleaned, "_source.cleaned.cleaned_B") == "This B") 54 | 55 | def main(): 56 | unittest.main() 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /eslib/procs/CLIReader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Eivind Eidheim Elseth' 2 | import time 3 | import subprocess 4 | 5 | from ..Monitor import Monitor 6 | import logging 7 | 8 | class CLIReader(Monitor): 9 | """ 10 | The CLIReader is a Generator that will periodically call a command 
line utility 11 | 12 | Sockets: 13 | stdout (str) (default) : Output from the command line utility's stdout 14 | stderr (str) : Output from the command line utility's stderr 15 | 16 | Config: 17 | cmd = None : The command to run 18 | interval = 10 : The waiting period in seconds between each time the command is run 19 | 20 | """ 21 | 22 | def __init__(self, **kwargs): 23 | super(CLIReader, self).__init__(**kwargs) 24 | self._stdout = self.create_socket("stdout", "str", "The output to stdout from the command line utility", is_default=True) 25 | self._stderr = self.create_socket("stderr", "str", "The output to stderr from the command line utility") 26 | self.config.set_default( 27 | interval = 10 28 | ) 29 | self.last_get = None 30 | 31 | def on_tick(self): 32 | if not self.last_get or (time.time() - self.last_get > self.config.interval): 33 | # Since the next call may crash, at least mark the last attempt as now, 34 | # so we don't try again on every tick, but wait for the next interval. 35 | self.last_get = time.time() 36 | 37 | p = subprocess.Popen(self.config.cmd, shell=False, stdout=subprocess.PIPE) 38 | p.wait() 39 | (output, err) = p.communicate() 40 | if output: 41 | if self.doclog.isEnabledFor(logging.TRACE): 42 | self.doclog.trace("Output doc: %s" % str(output)) 43 | self._stdout.send(output) 44 | if err: 45 | self.log.error("Received message from subprocess on stderr: %s" % str(err)) 46 | self._stderr.send(err) 47 | 48 | self.last_get = time.time() 49 | -------------------------------------------------------------------------------- /eslib/procs/FileWriter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | # TODO: Verify encoding working, especially when writing to stdout 4 | 5 | from ..Processor import Processor 6 | import sys 7 | from ..esdoc import tojson 8 | 9 | 10 | class FileWriter(Processor): 11 | """ 12 | Write incoming documents to specified file or standard output. 13 | Documents of dict type are written as json documents, per line. Other types are written directly with 14 | their string representation. 15 | 16 | Connectors: 17 | input (*) : Incoming documents to write to file as string or json objects per line. 18 | 19 | Config: 20 | filename = None : If not set then 'stdout' is assumed. 21 | append = False : Whether to append to existing file, rather than overwrite. 
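Example (illustrative filename): FileWriter() writes to stdout, while FileWriter(filename="out.json", append=True) appends to out.json.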
22 | """ 23 | def __init__(self, **kwargs): 24 | super(FileWriter, self).__init__(**kwargs) 25 | self.create_connector(self._incoming, "input", None, "Incoming documents to write to file as string or JSON objects per line.") 26 | 27 | self.config.set_default( 28 | filename = None, 29 | append = False 30 | ) 31 | 32 | self._file = None 33 | 34 | def on_open(self): 35 | 36 | if self._file: 37 | self.log.error("on_open() attempted when _file exists -- should not be possible.") 38 | return 39 | 40 | if not self.config.filename: 41 | # Assuming stdout 42 | self._file = sys.stdout 43 | else: 44 | # May raise exception: 45 | self._file = open(self.config.filename, "a" if self.config.append else "w") 46 | 47 | def on_close(self): 48 | if self._file and self._file != sys.stdout: 49 | self._file.close() 50 | self._file = None 51 | 52 | def _incoming(self, document): 53 | if document: 54 | if type(document) is dict: 55 | print >> self._file, tojson(document) 56 | else: 57 | print >> self._file, document 58 | self._file.flush() 59 | -------------------------------------------------------------------------------- /eslib/prog.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.prog 5 | ~~~~~~~~~~ 6 | 7 | Helper functions for running as an executable program. 8 | """ 9 | 10 | 11 | __all__ = ( "progname", "initlogs") 12 | 13 | import os, sys, logging.config, yaml 14 | 15 | 16 | def progname(): 17 | return os.path.basename(sys.argv[0]) 18 | 19 | def initlogs(config_file=None): 20 | # if config_file: 21 | # config_file = os.path.join(os.getcwd(), config_file) 22 | # else: 23 | # location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) 24 | # config_file = os.path.join(location, 'logging.yml') 25 | # 26 | # config = yaml.load(open(config_file)) # TODO: YAML files are in UTF-8... if terminal is something else, make sure we convert correctly 27 | # logging.config.dictConfig(config=config) 28 | 29 | if config_file: 30 | config_file = os.path.join(os.getcwd(), config_file) 31 | config = yaml.load(open(config_file)) # TODO: YAML files are in UTF-8... 
if terminal is something else, make sure we convert correctly 32 | logging.config.dictConfig(config=config) 33 | else: 34 | console = logging.StreamHandler() 35 | console.setLevel(logging.TRACE) 36 | LOG_FORMAT = '%(firstName) -20s %(levelname) -10s %(className) -20s %(instanceName) -20s %(funcName) -25s %(lineno) -5d: %(message)s' 37 | console.setFormatter(logging.Formatter(LOG_FORMAT)) 38 | 39 | servicelog = logging.getLogger("servicelog") 40 | servicelog.setLevel(logging.TRACE) 41 | servicelog.propagate = False 42 | servicelog.addHandler(console) 43 | 44 | proclog = logging.getLogger("proclog") 45 | proclog.setLevel(logging.TRACE) 46 | proclog.propagate = False 47 | proclog.addHandler(console) 48 | 49 | doclog = logging.getLogger("doclog") 50 | doclog.setLevel(logging.TRACE) 51 | doclog.propagate = False 52 | doclog.addHandler(console) 53 | 54 | rootlog = logging.getLogger() 55 | rootlog.setLevel(logging.WARNING) 56 | rootlog.addHandler(console) 57 | -------------------------------------------------------------------------------- /test/test_procs/data/twitter_raw_mock.json: -------------------------------------------------------------------------------- 1 | { 2 | "id_str": "520149420122578944", 3 | "text": "These clowns must hope that we never cum under attack from any force-r we capable of protecting ourselves?", 4 | "truncated": false, 5 | "lang": "en", 6 | "created_at": "Thu Oct 09 09:51:00 +0000 2014", 7 | "timestamp_ms": "1412848260328", 8 | "source": "Twitter for BlackBerry®", 9 | 10 | "in_reply_to_user_id_str": null, 11 | "in_reply_to_screen_name": null, 12 | "in_reply_to_status_id_str": null, 13 | 14 | "geo": { 15 | "coordinates": [ 16 | -34.07079, 17 | 18.57407 18 | ], 19 | "type": "Point" 20 | }, 21 | 22 | "place": { 23 | "country": "South Africa", 24 | "country_code": "ZA" 25 | }, 26 | 27 | "entities": { 28 | "urls": [ 29 | { 30 | "display_url": "eraliquida.com/?p=1010", 31 | "expanded_url": "http://www.eraliquida.com/?p=1010", 32 | "indices": [ 33 | 7, 34 | 29 35 | ], 36 | "url": "http://t.co/2OdUzFv0Ev" 37 | } 38 | ], 39 | "hashtags": [ 40 | { 41 | "text": "偽2ch騒動", 42 | "indices": [ 43 | 100, 44 | 107 45 | ] 46 | }, 47 | { 48 | "text": "偽2ch問題", 49 | "indices": [ 50 | 108, 51 | 115 52 | ] 53 | } 54 | ], 55 | "user_mentions": [ 56 | { 57 | "name": "اقوى العروض وارخصها", 58 | "screen_name": "rt_ld", 59 | "id_str": "2649736855", 60 | "indices": [ 61 | 0, 62 | 6 63 | ], 64 | "id": 2649736855 65 | } 66 | ] 67 | }, 68 | 69 | "user": { 70 | "id_str": "2196916282", 71 | "screen_name": "Mark_50598", 72 | "name": "mark fester", 73 | "lang": "en", 74 | "description": null, 75 | "created_at": "Tue Nov 26 14:21:35 +0000 2013", 76 | "location": "", 77 | "profile_image_url": "http://abs.twimg.com/sticky/default_profile_images/default_profile_1_normal.png", 78 | "protected": false, 79 | "geo_enabled": true 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | from glob import glob 6 | 7 | # PREREQUISITES: 8 | # yum install -y libxml2-devel libxslt-devel 9 | 10 | try: 11 | from setuptools import setup 12 | except ImportError: 13 | from distutils.core import setup 14 | 15 | if sys.argv[-1] == 'publish': 16 | os.system('python setup.py sdist upload') 17 | sys.exit() 18 | 19 | requires = [ 20 | 'elasticsearch', 21 | 'lxml', 22 | 'oauthlib', 23 | 'python-daemon==2.0.6', # For services, version 
2.1 is fucked (change user stuff) 24 | 'argparse', 25 | 'psutil', 'setproctitle', 26 | 'pika', 'pyrabbit', # for Rabbitmq 27 | 'pykafka', # For Kafka 28 | 'HTMLParser', 29 | 'requests>=2', # version >=2 needed by TwitterAPI 30 | 'TwitterAPI', 31 | 'PyYAML', # for prog logging init stuff 32 | 'feedparser', # for rss 33 | 'python-dateutil', 34 | # 'mock' # for testing 35 | 'beautifulsoup4', 36 | 'textblob', 'justext' # for web.py 37 | ] 38 | 39 | 40 | setup( 41 | name='eslib', 42 | version='0.0.14', 43 | description='Document processing framework and utility for Elasticsearch (or whatever).', 44 | #long_description=open("README.md").read(), 45 | author='Hans Terje Bakke', 46 | author_email='hans.terje.bakke@comperio.no', 47 | url='https://github.com/comperiosearch/elasticsearch-eslib', 48 | keywords="document processing docproc", 49 | packages=['eslib', 'eslib.procs', 'eslib.service'], 50 | # package_data={'': ['LICENSE', 'README.md', 'PROTOCOLS.md']}, 51 | scripts=glob('bin/*'), 52 | include_package_data=True, 53 | # TODO: examples in package data 54 | install_requires=requires, 55 | license='Apache 2.0', 56 | zip_safe=False, 57 | 58 | classifiers=( 59 | 'Development Status :: 5 - Production/Stable', 60 | 'Intended Audience :: Developers', 61 | 'Natural Language :: English', 62 | 'License :: OSI Approved :: Apache Software License', 63 | 'Programming Language :: Python', 64 | 'Programming Language :: Python :: 2', 65 | 'Programming Language :: Python :: 2.7' 66 | ) 67 | ) 68 | -------------------------------------------------------------------------------- /test/test_procs/test_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from eslib.procs import Transformer 5 | 6 | class TestProtocolConverter(unittest.TestCase): 7 | 8 | def test_func_one_lambda(self): 9 | 10 | csv2list = lambda proc, doc: [",".join(doc)] 11 | 12 | p = Transformer(func=csv2list, input_protocol="list", output_protocol="csv") 13 | 14 | output = [] 15 | p.add_callback(lambda proc, doc: output.append(doc)) 16 | 17 | p.start() 18 | p.put(["a","b","c","d"]) 19 | p.stop() 20 | p.wait() 21 | 22 | print "output=", output[0] 23 | 24 | self.assertEqual(output[0], "a,b,c,d") 25 | 26 | 27 | def yieldfunc(self, proc, doc): 28 | yield doc.lower() 29 | yield doc.upper() 30 | 31 | def test_func_multi_yield(self): 32 | 33 | p = Transformer(func=self.yieldfunc, input_protocol="str", output_protocol="str") 34 | 35 | output = [] 36 | p.add_callback(lambda proc, doc: output.append(doc)) 37 | 38 | p.start() 39 | p.put("a") 40 | p.put("b") 41 | p.put("c") 42 | p.stop() 43 | p.wait() 44 | 45 | joined = ",".join(output) 46 | print "output=", joined 47 | 48 | self.assertEqual(joined, "a,A,b,B,c,C") 49 | 50 | 51 | def edge2ids(self, proc, doc): 52 | if doc["type"] == "author": 53 | yield doc["from"] 54 | else: 55 | yield doc["from"] 56 | yield doc["to"] 57 | 58 | def test_graph_edge_convertion(self): 59 | p = Transformer(func=self.edge2ids, input_protocol="str", output_protocol="str") 60 | 61 | output = [] 62 | p.add_callback(lambda proc, doc: output.append(doc)) 63 | 64 | p.start() 65 | p.put({"type": "author" , "from": "1", "to": "1"}) 66 | p.put({"type": "mention", "from": "2", "to": "3"}) 67 | p.put({"type": "quote" , "from": "4", "to": "1"}) 68 | p.stop() 69 | p.wait() 70 | 71 | joined = ",".join(output) 72 | print "output=", joined 73 | 74 | self.assertEqual(joined, "1,2,3,4,1") 75 | 76 | 77 | 78 | def main(): 79 | unittest.main() 80 | 81 | 
if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /test/test_procs/test_http_monitor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from eslib.procs import HttpMonitor 5 | import requests 6 | 7 | import eslib.prog 8 | eslib.prog.initlogs() 9 | 10 | class TestHttpMonitor(unittest.TestCase): 11 | 12 | def test_get(self): 13 | self.hooked_msg = None 14 | output = [] 15 | 16 | p = HttpMonitor(hook=self._hook) # localhost:4000 by default 17 | p.add_callback(lambda proc, doc: output.append(doc)) 18 | 19 | print "Starting server." 20 | p.start() 21 | 22 | print "Sending request" 23 | res = requests.get("http://localhost:4000/ppp?arg=aaa") 24 | print "RES=", res, res.content 25 | 26 | 27 | print "Stopping server" 28 | p.stop() 29 | p.wait() 30 | print "Server finished." 31 | 32 | self.assertEquals(self.hooked_msg, "GET_/ppp?arg=aaa") 33 | self.assertEquals(output[0], "ppp?arg=aaa") 34 | 35 | def test_post(self): 36 | self.hooked_msg = None 37 | output = [] 38 | 39 | p = HttpMonitor(hook=self._hook) # localhost:4000 by default 40 | p.add_callback(lambda proc, doc: output.append(doc)) 41 | 42 | print "Starting server." 43 | p.start() 44 | 45 | print "Sending request (text)" 46 | res = requests.post("http://localhost:4000/ppp?arg=aaa", data="some data", headers={'content-type': 'text/text'}) 47 | print "RES=", res, res.content 48 | print "Sending request (json)" 49 | res = requests.post("http://localhost:4000/ppp?arg=aaa", data="[1, 2, 3]", headers={'content-type': 'application/json'}) 50 | print "RES=", res, res.content 51 | 52 | print "Stopping server" 53 | p.stop() 54 | p.wait() 55 | print "Server finished." 56 | 57 | self.assertEquals(self.hooked_msg, "POST_/ppp?arg=aaa") 58 | self.assertEquals(output[0], "some data") 59 | self.assertEquals(output[1], [1, 2, 3]) 60 | 61 | def _hook(self, request_handler, verb, path, data, format="application/json"): 62 | print "Hook called: ", verb, path, data 63 | self.hooked_msg = "%s_%s" % (verb, path) 64 | 65 | 66 | def main(): 67 | unittest.main() 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /examples/remoting/RemotingClient.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # NOTE: 4 | # Example usage of the currently (as of writing) experimental RemotingService, 5 | # talking to the DummyRemotingService. 6 | 7 | # Import and set up some simple logging 8 | from eslib.service.Client import Client 9 | import logging, time 10 | # for handler in logging.root.handlers[:]: 11 | # logging.root.removeHandler(handler) 12 | logging.getLogger("requests").setLevel(logging.WARNING) 13 | format='%(name)10s %(levelname)8s %(message)s' 14 | logging.basicConfig(format=format, level=logging.INFO) 15 | 16 | # One way of creating the client, by asking the service manager for a service named "remoting". 17 | # (We call ourself the "Hooker" client, hooking onto the dummy service. It is just a name..) 18 | client = Client("Hooker", manager="localhost:5000", service="remoting") 19 | 20 | # Another way is to address the service directly: 21 | # client = Client("Hooker", address="localhost:5001") 22 | 23 | # We can ask it for status... whether it is "DEAD", "idle", "processing", "stopping", etc. 
24 | print "STATUS =", client.status() 25 | 26 | # We can ask to see detailed stats 27 | print "STATS =", client.stats() 28 | 29 | # We can ask to see what knowledge it has of the metadata from the common service metadata repository 30 | print "META =", client.meta() 31 | 32 | # We can list all available HTTP routes 33 | print "HELP =" 34 | for item in client.help()["routes"]: 35 | print " %-6s %s" % tuple(item.split(" ")) 36 | 37 | # We can start and stop the service (the processing part, not run and shut down the service process itself): 38 | # print "START=", client.start() 39 | # print "STATUS=", client.status() 40 | # print "STOP=", client.stop() 41 | # print "STATUS=", client.status() 42 | # time.sleep(2) 43 | # print "STATUS=", client.status() 44 | 45 | # TODO: We might want to be able to send stop(wait=True, timeout=10) 46 | #print "START=", client.start() # NOTE: Will get error back if already started... 47 | 48 | # This is how we send data to the service for further processing 49 | print "PUT=", client.put("yo", "input") 50 | 51 | # This is how we ask for a portion (here batch size = 2) of data queued for output in service. 52 | resultGenerator = list(client.fetch("output", 2)) 53 | print "FETCH", list(resultGenerator) 54 | -------------------------------------------------------------------------------- /bin/es-cleantweet: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from eslib.procs import FileReader, FileWriter, TweetEntityRemover, PatternRemover, HtmlRemover 6 | import eslib.prog 7 | import argparse 8 | 9 | 10 | def main(): 11 | desc = "Perform a chain of cleaning operations on tweets:\n" + \ 12 | " Remove entities (URLs, mentions)" + \ 13 | " Remove retweet prefix and ellipses suffix" + \ 14 | " Unescape HTML encoding" 15 | help_t = "Write cleaned text to this field instead of overwriting input field." 16 | help_f = "Field to clean. Defaults to 'text'." 
17 | 18 | parser = argparse.ArgumentParser(usage="\n %(prog)s -f field [-t target]", description=desc) 19 | parser._actions[0].help = argparse.SUPPRESS 20 | parser.add_argument("-f", "--field", default="text", help=help_f) 21 | parser.add_argument("-t", "--target", required=False, help=help_t) 22 | parser.add_argument( "--name" , help="Process name.", default=None) 23 | 24 | args = parser.parse_args() 25 | 26 | source = args.field 27 | target = args.target or args.field 28 | 29 | # Set up and run the pipeline 30 | entity_remover = TweetEntityRemover( 31 | name = "TER",#args.name or eslib.prog.progname(), 32 | source_field = source, 33 | target_field = target, 34 | remove_url = True, 35 | remove_mention = True) 36 | pattern_remover = PatternRemover( 37 | name = "PR",#args.name or eslib.prog.progname(), 38 | patterns = ["^RT @.+: ", u"\S+\u2026$"], # Retweet prefix, ellipsis suffix 39 | source_field = target, 40 | target_field = target 41 | ) 42 | unescaper = HtmlRemover(name="HR") 43 | 44 | r = FileReader() # Read from stdin 45 | w = FileWriter() # Write to stdout 46 | entity_remover.subscribe(r) 47 | pattern_remover.subscribe(entity_remover)#, socket_name="output", connector_name="input") 48 | unescaper.subscribe(pattern_remover)#, socket_name="output", connector_name="input",) 49 | w.subscribe(unescaper)#, socket_name="output") 50 | 51 | r.start() # Will cause cascading starts of each processor in the pipeline 52 | w.wait() # Wait for everything to finish writing 53 | 54 | 55 | if __name__ == "__main__": main() 56 | -------------------------------------------------------------------------------- /test/test_procs/test_dateexpander.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Mats Julian Olsen' 2 | 3 | import unittest 4 | 5 | from eslib import time 6 | from eslib.procs import DateExpander 7 | 8 | ok_date = '2014-10-14T14:26:30+01:00' 9 | ok_date_no_tz = '2014-10-14T14:26:30' 10 | wrong_date = 2013 11 | wrong_date2 = '-120-13-142T25:61:61+30:00' 12 | 13 | ok_date_fields = { 14 | 'year': 2014, 'month': 10, 'day': 14, 15 | 'hour': 13, 'minute': 26, 'second': 30, 16 | 'week': 42, 'weekday': 2 17 | } 18 | 19 | dict_wo_source = {'i': {'am': {'a': 'dict'}}} 20 | dict_w_source = {'_source': dict_wo_source} 21 | dict_wo_sourcefield = {'_source': dict_wo_source} 22 | dict_w_sourcefield = {'_source': {'created_at': dict_wo_source}} 23 | dict_w_ok_date = {'_source': {'created_at': ok_date, "date_fields": ok_date_fields}} 24 | dict_wo_ok_date = {'_source': {'created_at': wrong_date}} 25 | dict_wo_ok_date2 = {'_source': {'created_at': wrong_date2}} 26 | 27 | 28 | class TestDateMagic(unittest.TestCase): 29 | 30 | def test_all(self): 31 | date = time.utcdate(ok_date) 32 | dd = time.date_dict(date) 33 | print dd 34 | self.assertEqual(dd, ok_date_fields) 35 | 36 | 37 | class TestDateFields(unittest.TestCase): 38 | 39 | def setUp(self): 40 | self.expander = DateExpander() 41 | 42 | def test_missing_source_section(self): 43 | # if the dict doesn't have source it should be returned 44 | doc = self.expander._process(dict_wo_source) 45 | print doc 46 | self.assertDictEqual(doc, dict_wo_source) 47 | 48 | def test_missing_source_field(self): 49 | # if the dict has source, but no source_field, it should be returned 50 | doc = self.expander._process(dict_wo_sourcefield) 51 | print doc 52 | self.assertDictEqual(doc, dict_wo_sourcefield) 53 | 54 | def test_invalid_date(self): 55 | # if the date is invalid, the same doc should be returned 56 | doc = 
self.expander._process(dict_wo_ok_date) 57 | print doc 58 | self.assertDictEqual(doc, dict_wo_ok_date) 59 | 60 | def test_valid_date(self): 61 | doc = self.expander._process(dict_w_ok_date) 62 | print doc 63 | self.assertIn('date_fields', doc["_source"]) 64 | 65 | doc = self.expander._process(dict_w_ok_date) 66 | print doc 67 | self.assertEqual(doc, dict_w_ok_date) 68 | -------------------------------------------------------------------------------- /test/test_procs/test_tweet_entity_removal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | import unittest, json 5 | from eslib.procs import TweetEntityRemover 6 | from eslib import esdoc 7 | 8 | class TestTweetEntityRemoval(unittest.TestCase): 9 | 10 | def test_all(self): 11 | 12 | # Load test data 13 | self_dir, _ = os.path.split(__file__) 14 | f = open(os.path.join(self_dir, "data/tweet_entity_removal.json")) 15 | doc = json.load(f) 16 | f.close() 17 | 18 | p_none = TweetEntityRemover(remove_urls=False, remove_mentions=False) 19 | p_url = TweetEntityRemover(remove_urls=True , remove_mentions=False) 20 | p_mention = TweetEntityRemover(remove_urls=False, remove_mentions=True) 21 | p_both = TweetEntityRemover(remove_urls=True , remove_mentions=True, target_field="cleaned") 22 | 23 | cleaned_none = p_none ._clean(doc) 24 | cleaned_url = p_url ._clean(doc) 25 | cleaned_mention = p_mention._clean(doc) 26 | cleaned_both = p_both ._clean(doc) 27 | 28 | self.assertTrue(esdoc.getfield(cleaned_none , "_source.text") == "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv") 29 | self.assertTrue(esdoc.getfield(cleaned_url , "_source.text") == "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center") 30 | self.assertTrue(esdoc.getfield(cleaned_mention, "_source.text") == "Me & the lovely @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv") 31 | # Original text should be untouched, and cleaned gone to separate field: 32 | self.assertTrue(esdoc.getfield(cleaned_both , "_source.text") == "Me & the lovely @stellachuuuuu @ Jacob K Javits Convention Center http://t.co/x6BUjNY0jv") 33 | self.assertTrue(esdoc.getfield(cleaned_both , "_source.cleaned") == "Me & the lovely @ Jacob K Javits Convention Center") 34 | 35 | # Verify that minimal cloning works: 36 | self.assertFalse(esdoc.getfield(doc, "_source") == esdoc.getfield(cleaned_url, "_source" ), "Expected _source old!=new") 37 | self.assertTrue (esdoc.getfield(doc, "_source.entities") == esdoc.getfield(cleaned_url, "_source.entities"), "Expected _source old==new") 38 | 39 | def main(): 40 | unittest.main() 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /eslib/procs/TwitterFollowerGetter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mats' 2 | from ..Generator import Generator 3 | from .twitter import Twitter 4 | 5 | class TwitterFollowerGetter(Generator): 6 | """ 7 | This generator takes as input the ids of twitter users, and then goes 8 | ahead and retrieves the followers or friends of this user, 9 | and outputs the ids. 10 | 11 | # TODO: Document argument 'twitter' and how to configure this. 'outgoing' 12 | 13 | Connectors: 14 | ids (str) : Incoming IDs to get data for. 15 | Sockets: 16 | ids (str) : IDs of related nodes. 
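        edge (graph-edge) : Relation edges (type from config 'reltype') between the incoming user and each related user.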
17 | 18 | Config: 19 | outgoing = True : # TODO: Document this 20 | """ 21 | def __init__(self, twitter=None, **kwargs): 22 | super(TwitterFollowerGetter, self).__init__(**kwargs) 23 | self.twitter = twitter 24 | self.create_connector(self._incoming, "ids", "str") 25 | self._output_id = self.create_socket("ids", "str", "IDs of related nodes.") 26 | self._output_edge = self.create_socket("edge", "graph-edge") 27 | self.config.set_default(outgoing=True, reltype="follows") 28 | 29 | 30 | def on_open(self): 31 | if self.twitter is None: 32 | self.twitter = Twitter( 33 | consumer_key=self.config.consumer_key, 34 | consumer_secret=self.config.consumer_secret, 35 | access_token=self.config.access_token, 36 | access_token_secret=self.config.access_token_secret 37 | ) 38 | 39 | def _incoming(self, document): 40 | try: 41 | id_ = int(document) 42 | except ValueError: 43 | self.doclog.exception("Could not parse id: %s to int" % str(document)) 44 | else: 45 | related = self.twitter.get_follows(uid=str(id_), outgoing=self.config.outgoing) 46 | self._send(id_, related) 47 | 48 | def _send(self, origin, related): 49 | for id_ in related: 50 | edge = {"from": None, "type": self.config.reltype, "to": None} 51 | self._output_id.send(id_) 52 | if self.config.outgoing: 53 | edge["from"] = origin 54 | edge["to"] = id_ 55 | else: 56 | edge["from"] = id_ 57 | edge["to"] = origin 58 | 59 | if all(edge.itervalues()): 60 | self.doclog.trace("Sending edge %s to Neo4j" % str(edge)) 61 | self._output_edge.send(edge) 62 | else: 63 | self.doclog.error("Edge had None-fields: %s" % str(edge)) -------------------------------------------------------------------------------- /eslib/procs/TweetEntityRemover.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | from eslib.text import remove_parts 5 | from .. import esdoc 6 | 7 | class TweetEntityRemover(Processor): 8 | """ 9 | Remove URLs and/or mentioned users from the tweet text. 10 | 11 | Protocols: 12 | 13 | esdoc.tweet: 14 | 15 | # TODO 16 | 17 | Connectors: 18 | input (esdoc.tweet) : Tweet 19 | Sockets: 20 | output (esdoc.tweet) : Tweet (possibly extended with a cleaned field) 21 | 22 | Config: 23 | source_field = "text" : Part of twitter dev credentials. 24 | target_field = None : Defaults to 'source_field', replacing the input field. 
25 | remove_urls = True 26 | remove_mentions = False 27 | """ 28 | 29 | 30 | def __init__(self, **kwargs): 31 | super(TweetEntityRemover, self).__init__(**kwargs) 32 | 33 | self.create_connector(self._incoming, "input", "esdoc.tweet", "Incoming tweet.") 34 | self.output = self.create_socket("output" , "esdoc.tweet" , "Outgoing, cleaned, tweet.") 35 | 36 | self.config.set_default( 37 | source_field = "text", 38 | target_field = None, 39 | remove_urls = True, 40 | remove_mentions = False 41 | ) 42 | 43 | def _clean(self, doc): 44 | 45 | source = doc.get("_source") 46 | if not source: 47 | return doc 48 | 49 | text = esdoc.getfield(source, self.config.source_field) 50 | 51 | coords = [] 52 | entities = source.get("entities") 53 | if self.config.remove_urls: 54 | x = esdoc.getfield(entities, "urls", []) 55 | coords += [l["indices"] for l in x] 56 | if self.config.remove_mentions: 57 | x = esdoc.getfield(entities, "user_mentions", []) 58 | coords += [l["indices"] for l in x] 59 | cleaned = None 60 | if not text: 61 | cleaned = text 62 | else: 63 | # The removal from coords most often leaves two spaces, so remove them, too, and strip border spaces. 64 | cleaned = remove_parts(text, coords).replace(" ", " ").strip() 65 | 66 | return esdoc.shallowputfield(doc, "_source." + (self.config.target_field or self.config.source_field), cleaned) 67 | 68 | def _incoming(self, doc): 69 | if not self.output.has_output: 70 | return # No point then... 71 | cleaned_doc = self._clean(doc) 72 | self.output.send(cleaned_doc) 73 | -------------------------------------------------------------------------------- /test/test_procs/test_tweet_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import unittest, json 5 | from eslib.procs import TwitterMonitor, TweetExtractor 6 | 7 | class TestTwitterMonitor(unittest.TestCase): 8 | 9 | def test_simple(self): 10 | 11 | # Load test data 12 | self_dir, _ = os.path.split(__file__) 13 | f = open(os.path.join(self_dir, "data/twitter_raw_mock.json")) 14 | j = json.load(f) 15 | f.close() 16 | 17 | m = TwitterMonitor() 18 | raw, tweet_mon = m._decode(j) 19 | 20 | x = TweetExtractor() 21 | tweet, users, links = x._extract(tweet_mon) 22 | 23 | # Test links 24 | self.assertTrue(len(links) == 1) 25 | self.assertTrue(links[0]["what"] == "twitter") 26 | self.assertTrue(links[0]["who"] == "2196916282") 27 | self.assertTrue(links[0]["url"] == "http://www.eraliquida.com/?p=1010") 28 | 29 | # Test users 30 | self.assertTrue(len(users) == 2) 31 | self.assertTrue(users[0]["from"] == "2196916282") 32 | self.assertTrue(users[1]["from"] == "2196916282") 33 | self.assertTrue(users[0]["to"] == "2196916282") 34 | self.assertTrue(users[1]["to"] == "2649736855") 35 | self.assertTrue(users[0]["type"] == "author") 36 | self.assertTrue(users[1]["type"] == "mention") 37 | 38 | # Test tweet 39 | self.assertTrue(tweet["_id"] == "520149420122578944") 40 | self.assertTrue(tweet["_source"]["source"] == u"Twitter for BlackBerry®") 41 | self.assertTrue(tweet["_source"]["text"] == u'These clowns must hope that we never cum under attack from any force-r we capable of protecting ourselves?') 42 | self.assertTrue(str(tweet["_source"]["created_at"]) == "2014-10-09 09:51:00.328000") 43 | self.assertTrue("geo" in tweet["_source"]) 44 | self.assertTrue(tweet["_source"]["lang"] == "en") 45 | self.assertTrue(tweet["_source"]["place"]["country"] == "South Africa") 46 | self.assertFalse("in_reply_to" in tweet["_source"]) 47 | # User 
48 | self.assertTrue(tweet["_source"]["user"]["id"] == "2196916282") 49 | self.assertTrue(tweet["_source"]["user"]["lang"] == "en") 50 | self.assertTrue(tweet["_source"]["user"]["name"] == "mark fester") 51 | self.assertFalse("description" in tweet["_source"]["user"]) 52 | self.assertTrue(str(tweet["_source"]["user"]["created_at"]) == "2013-11-26 14:21:35") 53 | 54 | # Entities 55 | # // TODO 56 | 57 | def main(): 58 | unittest.main() 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /eslib/Terminal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | class TerminalProtocolException(Exception): 5 | def __init__(self, socket, connector): 6 | msg = "Socket: %s.%s(%s), Connector: %s.%s(%s)" % (socket.owner.name, socket.name, socket.protocol, connector.owner.name, connector.name, connector.protocol) 7 | super(Exception, self).__init__(self, msg) 8 | 9 | 10 | class Terminal(object): 11 | "Common abstract base class for connectors and sockets." 12 | 13 | ANY_PROTOCOL = "any" 14 | 15 | def __init__(self, name, protocol): 16 | self.type = None # type: Either 'Socket' or 'Connector' 17 | self.owner = None # Processor: 18 | self.name = "" # str: Name of terminal 19 | self.protocol = "" # str: Name of object format expected as input/output on this terminal 20 | self.description = "" # str: Text describing purpose and property of this terminal 21 | 22 | self.connections = [] 23 | 24 | self.name = name or "unnamed" 25 | self.protocol = protocol or Terminal.ANY_PROTOCOL 26 | 27 | def __str__(self): 28 | return "%s|%s" % (self.name, self.protocol) 29 | 30 | def attach(self, terminal): 31 | self.connections.append(terminal) 32 | 33 | def detach(self, terminal): 34 | if terminal in self.connections: 35 | self.connections.remove(terminal) 36 | 37 | def get_connections(self, owner=None, terminal_name=None): 38 | "Return all connections if owner is missing. Ignore terminal_name is owner is missing." 39 | connections = [] 40 | for c in self.connections[:]: 41 | if not owner or (c.owner == owner and (not terminal_name or c.name == terminal_name)): 42 | connections.append(c) 43 | return connections 44 | 45 | @staticmethod 46 | def protocol_compliance(socket, connector): 47 | if connector.protocol == Terminal.ANY_PROTOCOL or socket.protocol == Terminal.ANY_PROTOCOL: 48 | return True 49 | # In case the socket is set to mimic the protocol of one of its connectors, we check for that 50 | # instead of the directly registered protocol. 51 | ss = socket.protocol.split(".") 52 | sm = socket.mimiced_protocol.split(".") 53 | cc = connector.protocol.split(".") 54 | # print "SS=", ss[:len(cc)] 55 | # print "SM=", sm[:len(cc)] 56 | # print "CC=", cc[:len(cc)] 57 | # print "%s == %s" % (sm[:len(cc)], cc[:len(cc)]) 58 | return (ss[:len(cc)] == cc[:len(cc)]) or (sm[:len(cc)] == cc[:len(cc)]) 59 | -------------------------------------------------------------------------------- /eslib/procs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.procs 5 | ~~~~~ 6 | 7 | Document processing processors. 
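
Processors are typically chained with subscribe() and driven from the first reader.
A minimal sketch (see bin/es-cleantweet for a complete example):

    r = FileReader()   # reads stdin by default
    w = FileWriter()   # writes stdout by default
    w.subscribe(r)     # connect w's default connector to r's default socket
    r.start()          # cascades starts through the pipeline
    w.wait()           # wait for the writer to finish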
8 | """ 9 | 10 | 11 | from .ElasticsearchReader import ElasticsearchReader 12 | from .ElasticsearchWriter import ElasticsearchWriter 13 | from .FileReader import FileReader 14 | from .FileWriter import FileWriter 15 | from .TcpWriter import TcpWriter 16 | from .RabbitmqMonitor import RabbitmqMonitor 17 | from .RabbitmqWriter import RabbitmqWriter 18 | from .KafkaMonitor import KafkaMonitor 19 | from .KafkaWriter import KafkaWriter 20 | from .HttpMonitor import HttpMonitor 21 | from .CsvConverter import CsvConverter 22 | from .WebGetter import WebGetter 23 | from .Neo4jWriter import Neo4jWriter 24 | from .Neo4jReader import Neo4jReader 25 | from .TwitterMonitor import TwitterMonitor 26 | from .TwitterUserGetter import TwitterUserGetter 27 | from .TwitterFollowerGetter import TwitterFollowerGetter 28 | from .TweetEntityRemover import TweetEntityRemover 29 | from .TweetExtractor import TweetExtractor 30 | from .PatternRemover import PatternRemover 31 | from .HtmlRemover import HtmlRemover 32 | from .BlacklistFilter import BlacklistFilter 33 | from .Throttle import Throttle 34 | from .Transformer import Transformer 35 | from .EntityExtractor import EntityExtractor 36 | from .ProcessWrapper import ProcessWrapper 37 | from .CLIReader import CLIReader 38 | from .RssMonitor import RssMonitor 39 | from .Timer import Timer 40 | from .DateExpander import DateExpander 41 | from .SmtpMailer import SmtpMailer 42 | from .FourChanMonitor import FourChanMonitor 43 | 44 | __all__ = ( 45 | "ElasticsearchReader", 46 | "ElasticsearchWriter", 47 | "FileReader", 48 | "FileWriter", 49 | "TcpWriter", 50 | "RabbitmqMonitor", 51 | "RabbitmqWriter", 52 | "KafkaMonitor", 53 | "KafkaWriter", 54 | "HttpMonitor", 55 | "CsvConverter", 56 | "WebGetter", 57 | "Neo4jWriter", 58 | "Neo4jReader", 59 | "TwitterMonitor", 60 | "TwitterUserGetter", 61 | "TwitterFollowerGetter", 62 | "TweetEntityRemover", 63 | "TweetExtractor", 64 | "PatternRemover", 65 | "HtmlRemover", 66 | "BlacklistFilter", 67 | "Throttle", 68 | "Transformer", 69 | "EntityExtractor", 70 | "ProcessWrapper", 71 | "CLIReader", 72 | "RssMonitor", 73 | "Timer", 74 | "DateExpander", 75 | "SmtpMailer", 76 | "FourChanMonitor" 77 | ) 78 | -------------------------------------------------------------------------------- /eslib/procs/DateExpander.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Mats Julian Olsen' 2 | 3 | from ..Processor import Processor 4 | from .. import esdoc 5 | from .. import time 6 | 7 | 8 | class DateExpander(Processor): 9 | """ 10 | This processor will use a date field in an esdoc as a basis for constructing 11 | an object with 12 | 13 | year 14 | month (1 through 12) 15 | day (1 through 31) 16 | hour (0 through 23) 17 | minute (0 through 59) 18 | second (0 through 59) 19 | weekday (1 through 7) 20 | week (1 through 53) 21 | 22 | Connectors: 23 | input (esdoc) : Incoming. 24 | Sockets: 25 | output (esdoc) : Outgoing, with configured date field expanded. 26 | 27 | Config: 28 | source_field = "created_at" : Field which date value to expand. 29 | target_field = "date_fields" : Target field for the expanded object. 
30 | """ 31 | def __init__(self, **kwargs): 32 | super(DateExpander, self).__init__(**kwargs) 33 | self._input = self.create_connector(self._incoming, 'input', 'esdoc', "Incoming.") 34 | self._output = self.create_socket('output', 'esdoc', "Outgoing, with configured date field expanded.") 35 | 36 | self.config.set_default( 37 | source_field='created_at', 38 | target_field='date_fields' 39 | ) 40 | 41 | def _incoming(self, doc): 42 | if self._output.has_output: 43 | self._output.send(self._process(doc)) 44 | 45 | def _process(self, doc): 46 | value = esdoc.getfield(doc, "_source." + self.config.source_field) 47 | if value is None: 48 | self.doclog.warning( 49 | "Document '%s' is missing field or value in '%s'." 50 | % (doc.get("_id"), self.config.source_field)) 51 | return doc 52 | 53 | date = time.utcdate(value) 54 | if date is None: 55 | self.doclog.warning( 56 | "Document '%s' has non-date value in field '%s'." 57 | % (doc.get("_id"), self.config.source_field)) 58 | return doc 59 | 60 | date_dict = time.date_dict(date) 61 | if date_dict is None: 62 | # This should not be possible, therefore logging to proclog 63 | self.log.error("Date field extraction failed for date: %s" % date) 64 | return doc 65 | 66 | # Create a new document (if necessary) with just the minimum cloning necessary, 67 | # leaving references to the rest. 68 | return esdoc.shallowputfield(doc, '_source.' + self.config.target_field, date_dict) 69 | -------------------------------------------------------------------------------- /eslib/procs/Timer.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from eslib import Monitor 4 | import time 5 | 6 | class Timer(Monitor): 7 | """ 8 | Send a command on an output socket at configured interval. 9 | The configured 'actions' is a list of vectors of (initial_offset, interval, document). 10 | The time units are in seconds ('float'). The 'document' is *whatever* you want on to output, 11 | typically a string or a dict. type. 12 | 13 | Note that if you have very short intervals, you might want to adjust the run loop delay 'sleep' (not in 'config'). 14 | (It defaults to 0.5 seconds for this processor.) 15 | 16 | Sockets: 17 | output (*) : Output occurring at configured intervals. From the 'document' part of the configured action. 18 | 19 | Config: 20 | actions = [] : Time to delay document throughput, in seconds (float). 21 | """ 22 | def __init__(self, **kwargs): 23 | super(Timer, self).__init__(**kwargs) 24 | self._output = self.create_socket("output", None, "Output occurring at configured intervals. From the 'document' part of the configured action.") 25 | 26 | # (Override) Let ticks last half a second here by default... there's generally no rush, unless intervals are very short: 27 | self.sleep = 0.5 28 | 29 | self.config.set_default(actions=[]) # A list of tuples of (initial_offset, interval, document) 30 | 31 | self._actions = [] 32 | 33 | def on_open(self): 34 | now = time.time() 35 | self._actions = [] 36 | if self._actions is not None: 37 | if not hasattr(self._actions, '__iter__'): 38 | msg = "'config.actions' is not iterable." 39 | self.log.critical(msg) 40 | raise ValueError(msg) 41 | for a in self.config.actions: 42 | # Validate tuple format 43 | if not type(a) in [list, tuple] or not len(a) == 3 or not type(a[0]) in [int, float] or not type(a[1] in [int, float]): 44 | msg = "An element in 'config.actions' is not of expected format and/or type '(initial_offset, interval, document)'." 
45 | self.log.error(msg) 46 | #raise ValueError(msg) # Maye not critical enough to raise exception, just skip the wrong one. 47 | self._actions.append([now + a[0], a[1], a[2]]) 48 | 49 | def on_tick(self): 50 | now = time.time() 51 | for a in self._actions: 52 | if now >= a[0]: 53 | # Next time for this one is... 54 | a[0] = now + a[1] 55 | # Then send the action/document 56 | self._output.send(a[2]) 57 | -------------------------------------------------------------------------------- /eslib/service/DummyService.py: -------------------------------------------------------------------------------- 1 | from . import HttpService, PipelineService 2 | from ..procs import Timer, Transformer 3 | from .. import esdoc 4 | import time 5 | 6 | class DummyService(HttpService, PipelineService): 7 | """ 8 | Common static config: 9 | name 10 | manager_endpoint 11 | management_endpoint 12 | 13 | Static config: 14 | timer_frequency = 3 15 | lifespan = 0 # 0=infinite 16 | 17 | Runtime config: 18 | dummy.variable 19 | """ 20 | 21 | VARIABLE_CONFIG_PATH = "dummy.variable" 22 | 23 | metadata_keys = [VARIABLE_CONFIG_PATH] 24 | 25 | def __init__(self, **kwargs): 26 | super(DummyService, self).__init__(**kwargs) 27 | 28 | self.config.set_default( 29 | timer_frequency = 3, 30 | lifespan = 0 31 | ) 32 | 33 | self._logger = None 34 | self._variable = "initial" 35 | 36 | def on_configure(self, credentials, config, global_config): 37 | self.config.set( 38 | manager_endpoint = global_config.get("manager_host"), 39 | management_endpoint = config.get("management_endpoint"), 40 | 41 | timer_frequency = config["frequency"], 42 | lifespan = config["lifespan"] 43 | ) 44 | 45 | def on_setup(self): 46 | # Set up procs 47 | timer = Timer( 48 | service = self, 49 | name = "timer", 50 | actions = [(self.config.timer_frequency, self.config.timer_frequency, "ping")]) 51 | self._logger = Transformer( 52 | service = self, 53 | name = "logger", 54 | func = self._logfunc) 55 | 56 | procs = [timer, self._logger] 57 | 58 | # Link them 59 | self.link(*procs) 60 | 61 | # Register them for debug dumping 62 | self.register_procs(*procs) 63 | 64 | return True 65 | 66 | #region Service overrides 67 | 68 | def on_metadata(self, metadata): 69 | print "***METADATA", metadata 70 | self._variable = self.get_meta_section(metadata, self.VARIABLE_CONFIG_PATH) 71 | print "VAR=", self._variable 72 | self.head.restart(start=False) 73 | return True 74 | 75 | #endregion Service overrides 76 | 77 | def _logfunc(self, proc, doc): 78 | if self.config.lifespan and time.time() - self.stat_processing_started > self.config.lifespan: 79 | self.log.status("Life has come to an end; stopping.") 80 | self.processing_stop() 81 | return 82 | self.log.debug("DEBUG message.") 83 | self.log.warning("Service log entry, variable='%s'" % self._variable) 84 | self._logger.log.warning("Processor log entry, variable='%s'" % self._variable) 85 | self._logger.doclog.warning("Document log entry, variable='%s'" % self._variable) 86 | yield doc 87 | -------------------------------------------------------------------------------- /eslib/esdoc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.esdoc 5 | ~~~~~~~~~~ 6 | 7 | Module containing operations on "Elasticsearch type" documents (really just a dict). 
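
A minimal sketch of the helpers below:

    doc  = createdoc({"user": {"name": "htb"}}, index="myindex", doctype="tweet", id="1")
    getfield(doc, "_source.user.name")                       # -> "htb"
    doc2 = shallowputfield(doc, "_source.user.lang", "en")   # clones only the path it touches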
8 | """ 9 | 10 | 11 | __all__ = ("tojson", "createdoc", "getfield", "putfield") 12 | 13 | 14 | from datetime import datetime 15 | from .time import date2iso 16 | import json 17 | 18 | def _json_serializer_isodate(obj): 19 | """Default JSON serializer.""" 20 | s = None 21 | if isinstance(obj, datetime): 22 | if obj.utcoffset() is not None: 23 | obj = obj - obj.utcoffset() 24 | obj = obj.replace(tzinfo=None) 25 | s = date2iso(obj) 26 | return s 27 | 28 | def tojson(doc): 29 | return json.dumps(doc, default=_json_serializer_isodate) 30 | 31 | 32 | def getfield(doc, fieldpath, default=None): 33 | "Get value for 'fieldpath' if it exits and is not None, otherwise return the default." 34 | if doc is None or fieldpath is None: 35 | return default 36 | if fieldpath == "": 37 | return doc 38 | fp = fieldpath.split(".") 39 | d = doc 40 | for f in fp[:-1]: 41 | if not d or not f in d or not isinstance(d[f], dict): 42 | return default 43 | d = d[f] 44 | if d is None: 45 | return default 46 | v = d.get(fp[-1]) 47 | return default if v is None else v 48 | 49 | 50 | def putfield(doc, fieldpath, value): 51 | "Add or update 'fieldpath' with 'value'." 52 | if doc is None or fieldpath is None: 53 | return 54 | fp = fieldpath.split(".") 55 | d = doc 56 | for i, f in enumerate(fp[:-1]): 57 | if f in d: 58 | d = d[f] 59 | if not isinstance(d, dict): 60 | raise AttributeError("Node at '%s' is not a dict." % ".".join(fp[:i+1])) 61 | else: 62 | dd = {} 63 | d[f] = dd 64 | d = dd 65 | d[fp[-1]] = value # OBS: This also overwrites a node if this is was a node 66 | 67 | def shallowputfield(doc, fieldpath, value): 68 | "Clone as little as needed of 'doc' and add the field from 'fieldpath'. Returns the new cloned doc" 69 | if not doc or not fieldpath: return 70 | fp = fieldpath.split(".") 71 | doc_clone = doc.copy() # Shallow clone 72 | d = doc 73 | d_clone = doc_clone 74 | for i, f in enumerate(fp[:-1]): 75 | if f in d: 76 | d = d[f] 77 | if not type(d) is dict: 78 | raise Exception("Node at '%s' is not a dict." % ".".join(fp[:i+1])) 79 | d_clone[f] = d.copy() # Create shallow clone of the next level 80 | d_clone = d_clone[f] 81 | else: 82 | dd = {} # Create a new node 83 | d_clone.update({f:dd}) 84 | d_clone = dd 85 | d_clone[fp[-1]] = value # OBS: This also overwrites a node if this is was a node 86 | 87 | return doc_clone 88 | 89 | def createdoc(source, index=None, doctype=None, id=None): 90 | doc = {"_source": source} 91 | if index: doc['_index'] = index 92 | if type : doc['_type' ] = doctype 93 | if id : doc['_id' ] = id 94 | return doc 95 | 96 | 97 | -------------------------------------------------------------------------------- /eslib/procs/TwitterUserGetter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mats' 2 | 3 | from ..Generator import Generator 4 | from .twitter import Twitter 5 | 6 | import time 7 | 8 | 9 | class TwitterUserGetter(Generator): 10 | """ 11 | Receives uids on its connector and sends twitter user objects 12 | to its socket. 13 | 14 | # TODO: Document argument 'twitter' and how to configure this. 15 | 16 | Connectors: 17 | ids (str) : Incoming IDs to get data for. 18 | Sockets: 19 | user (graph-user) : Twitter users. 20 | 21 | Config: 22 | batchsize = 100 : How many users to gather up before making a call to Twitter. 23 | batchtime = 7.0 : How many seconds to wait before we send a batch if it is not full. 
24 | """ 25 | 26 | def __init__(self, twitter=None, **kwargs): 27 | super(TwitterUserGetter, self).__init__(**kwargs) 28 | self.create_connector(self._incoming, "ids", "str") 29 | self._output = self.create_socket("user", "graph-user", "Twitter users.") 30 | self._queue = [] 31 | self.last_call = time.time() 32 | self.twitter = twitter 33 | self.config.set_default( 34 | batchsize=100, 35 | batchtime=7 36 | ) 37 | 38 | def on_open(self): 39 | """ Instantiate twitter class. """ 40 | if self.twitter is None: 41 | self.twitter = Twitter( 42 | consumer_key=self.config.consumer_key, 43 | consumer_secret=self.config.consumer_secret, 44 | access_token=self.config.access_token, 45 | access_token_secret=self.config.access_token_secret 46 | ) 47 | 48 | def _incoming(self, doc): 49 | """ 50 | Put str(doc) into the queue. 51 | 52 | :param doc: the id of a twitter user 53 | """ 54 | try: 55 | id_ = int(doc) 56 | except ValueError: 57 | self.doclog.exception("Could not parse id: %s to int" % doc) 58 | else: 59 | self._queue.append(str(id_)) 60 | 61 | def on_tick(self): 62 | """ 63 | Commit items in queue if queue exceeds batchsize or it's been long 64 | since last commit. 65 | """ 66 | if ((len(self._queue) >= self.config.batchsize) or 67 | (time.time() - self.last_call > self.config.batchtime and self._queue)): 68 | self.get() 69 | 70 | def on_shutdown(self): 71 | """ Get rid of rest of queue before shutting down. """ 72 | self.log.info("Processing remaining items in queue.") 73 | while self._queue: 74 | self.get() 75 | 76 | def get(self): 77 | """ 78 | Gets users from twitter and outputs to a socket. 79 | """ 80 | num = len(self._queue) 81 | self.log.debug("Getting %i users from Twitter" % num) 82 | resp = self.twitter.get_users(uids=self._queue[:num]) 83 | self._queue = self._queue[num:] 84 | for raw_user in resp: 85 | try: 86 | user = self.twitter.raw_to_dict(raw_user) 87 | except TypeError as type_error: 88 | self.log.exception(type_error) 89 | else: 90 | self._output.send(user) 91 | -------------------------------------------------------------------------------- /eslib/service/PipelineService.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from .Service import Service 4 | import time 5 | 6 | class PipelineService(Service): 7 | def __init__(self, **kwargs): 8 | super(PipelineService, self).__init__(**kwargs) 9 | 10 | self.head = None 11 | self.tail = None 12 | 13 | def _log_finished(self, proc): 14 | self.log.status("Processing finished.") 15 | self._processing = False # This will shortcut further evaluation of whether we are processing 16 | self.stat_processing_ended = time.time() 17 | 18 | def _log_aborted(self, proc): 19 | self.log.status("Processing finished after abort.") 20 | self._processing_aborted = True # This will shortcut further evaluation of whether we are aborted 21 | self.stat_processing_ended = time.time() 22 | 23 | def link(self, *processors): 24 | "Link processors together and assign head and tail." 25 | prev = None 26 | for proc in processors: 27 | if prev: 28 | proc.subscribe(prev) 29 | prev = proc 30 | self.head = processors[0] 31 | self.tail = processors[-1] 32 | 33 | #region Service overrides 34 | 35 | def is_processing(self): 36 | "Evaluate whether processing is in progress." 37 | return self.tail.running 38 | 39 | def is_aborted(self): 40 | "Evaluate whether processing is in progress." 
41 | return self.head.aborted 42 | 43 | def is_suspended(self): 44 | "Evaluate whether processing is suspended." 45 | return self.head.suspended 46 | 47 | def on_processing_start(self): 48 | if not self._log_finished in self.tail.event_stopped: 49 | self.tail.event_stopped.append(self._log_finished) 50 | if not self._log_aborted in self.tail.event_aborted: 51 | self.tail.event_aborted.append(self._log_aborted) 52 | 53 | self.head.start() 54 | return True 55 | 56 | def on_restart(self): 57 | # if not self.head.running: 58 | # self.head.start() 59 | # else: 60 | # return True 61 | return True # Well, not really, but still.. it didn't fail either. 62 | 63 | def on_processing_stop(self): 64 | "This method should block until the process is fully stopped." 65 | self.head.stop() 66 | self.tail.wait() 67 | return True 68 | 69 | def on_processing_abort(self): 70 | self.head.abort() 71 | self.tail.wait() 72 | return True 73 | 74 | def on_processing_suspend(self): 75 | self.head.suspend() 76 | return True 77 | 78 | def on_processing_resume(self): 79 | self.head.resume() 80 | return True 81 | 82 | # TODO 83 | def on_update(self, config): 84 | # Auto-start on update 85 | if not self.head.running: 86 | self.head.start() 87 | else: 88 | return True 89 | 90 | def on_count(self): 91 | # It is probably better to count what has been handled by the tail, than what the head received or generaterd, so: 92 | return self.tail.count 93 | 94 | def on_count_total(self): 95 | return self.head.total 96 | 97 | #endregion Service overrides 98 | -------------------------------------------------------------------------------- /eslib/procs/KafkaWriter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | # NOTE: Using sync producer. Should change to async if performance sucks. 4 | 5 | from ..Processor import Processor 6 | from ..esdoc import tojson 7 | from pykafka import KafkaClient 8 | import zlib 9 | 10 | 11 | class KafkaWriter(Processor): 12 | """ 13 | Write data to Kafka. 14 | Writes data with type 'str', 'unicode', 'int', or 'float'. Lists and dicts are written as 'json'. 15 | Other types are cast to 'str'. 16 | The 'type' registered with the metadata is then either 'str', 'unicode', 'int', 'float' or 'json'. 17 | 18 | Connectors: 19 | input (*) : Document to write to configured Kafka topic. 20 | 21 | Config: 22 | hosts = ["localhost:9292"] : List of Kafka hosts. 23 | topic = "default_topic" : 24 | compression = False : Whether to compress the data sent to Kafka. 25 | """ 26 | 27 | def __init__(self, **kwargs): 28 | super(KafkaWriter, self).__init__(**kwargs) 29 | 30 | self._connector = self.create_connector(self._incoming, "input", None, "Document to write to configured RabbitMQ.") 31 | 32 | self.config.set_default( 33 | hosts = ["localhost:9092"], 34 | topic = "default_topic", 35 | compression = False 36 | ) 37 | 38 | self._client = None 39 | self._producer = None 40 | 41 | def on_open(self): 42 | self.count = 0 43 | self._client = KafkaClient(",".join(self.config.hosts)) 44 | topic = self._client.topics[self.config.topic] 45 | self._producer = topic.get_sync_producer(min_queued_messages=1) 46 | self.log.info("Connected to Kafka topic '%s'." % self.config.topic) 47 | 48 | def on_close(self): 49 | if self._client: 50 | self._producer.stop() 51 | self.log.info("Kafka producer stopped.") 52 | # Can't find any way to close the connection or ask it to release resources, so I try a 'del'. 
53 | #del self._client 54 | self._client = None 55 | self.log.debug("Connection to Kafka deleted.") 56 | 57 | def _incoming(self, document): 58 | if document == None: 59 | return 60 | 61 | data = document 62 | msg_type = None 63 | if isinstance(document, basestring): 64 | msg_type = type(document).__name__ 65 | elif isinstance(document, (int, long, float)): 66 | msg_type = type(document).__name__ 67 | elif isinstance(document, (list, dict)): 68 | data = document 69 | msg_type = "json" 70 | else: 71 | data = str(document) 72 | msg_type = "str" #type(document).__name__ 73 | self.doclog.warning("Writing document of unsupported type '%s' as type 'str'." % type(document).__name__) 74 | 75 | kafka_data = None 76 | try: 77 | kafka_data = tojson({"type": msg_type, "data": data}) 78 | except TypeError as e: 79 | self.doclog.error("JSON serialization failed: %s" % e.message) 80 | return 81 | 82 | if self.config.compression: 83 | kafka_data = zlib.compress(kafka_data) 84 | 85 | self._producer.produce(kafka_data) 86 | self.count += 1 87 | -------------------------------------------------------------------------------- /test/test_procs/test_csv_converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from eslib.procs import FileReader, FileWriter, CsvConverter 4 | 5 | 6 | res = [] 7 | 8 | class TestCsvConverter(unittest.TestCase): 9 | 10 | def _setup(self, filename): 11 | 12 | r = FileReader(raw_lines=True) 13 | r.config.filename = filename 14 | 15 | c = CsvConverter() 16 | 17 | c.config.index = "myindex" 18 | c.config.type_field = "initials" 19 | c.config.id_field = "id" 20 | 21 | w = FileWriter() # Write to stdout 22 | 23 | r.attach(c.attach(w)) 24 | 25 | output = [] 26 | c.add_callback(lambda proc, doc: output.append(doc)) 27 | 28 | return (r, c, w, output) 29 | 30 | def _verify(self, output): 31 | self.assertTrue(len(output) == 3, "Expected 3 results.") 32 | self.assertTrue(output[1]["_type"] == "eee") 33 | self.assertTrue(output[1]["_index"] == "myindex") 34 | self.assertTrue(output[1]["_id"] == "2") 35 | self.assertTrue(len(output[1]["_source"]) == 2) 36 | 37 | 38 | def test_read(self): 39 | r = FileReader(raw_lines=True) 40 | self_dir, _ = os.path.split(__file__) 41 | r.config.filename = os.path.join(self_dir, "data/csv_with_header.csv") 42 | w = FileWriter() # Write to stdout 43 | w.subscribe(r) 44 | r.start() 45 | 46 | def test_first_line_is_columns(self): 47 | self_dir, _ = os.path.split(__file__) 48 | r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_with_header.csv")) 49 | r.start() 50 | w.wait() 51 | 52 | self._verify(output) 53 | 54 | def test_no_header_line(self): 55 | self_dir, _ = os.path.split(__file__) 56 | r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_no_header.csv")) 57 | c.config.columns = ["id", "name", "last name", "initials"] 58 | r.start() 59 | w.wait() 60 | 61 | self._verify(output) 62 | 63 | def test_skip_header_line(self): 64 | self_dir, _ = os.path.split(__file__) 65 | r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_with_header.csv")) 66 | c.config.columns = ["id", "name", "last name", "initials"] 67 | c.config.skip_first_line = True 68 | r.start() 69 | w.wait() 70 | 71 | self._verify(output) 72 | 73 | # def test_fewer_fields(self): 74 | # self_dir, _ = os.path.split(__file__) 75 | # 76 | # r, c, w, output = self._setup(os.path.join(self_dir, "data/csv_no_header.csv")) 77 | # c.config.id_field = "_id" 78 | # c.config.type_field = "_type" 79 | # 
c.config.columns = ["_id", None, "last name", "initials"] 80 | # r.start() 81 | # w.wait() 82 | # 83 | # self.assertTrue(len(output) == 3, "Expected 3 results.") 84 | # self.assertTrue(output[1]["_type"] == None) 85 | # self.assertTrue(output[1]["_index"] == "myindex") 86 | # self.assertTrue(output[1]["_id"] == "2") 87 | # keys = output[1]["_source"].keys() 88 | # self.assertTrue(len(keys) == 2) 89 | # self.assertTrue("last name" in keys and "initials" in keys, "Expected 'last name' and 'initials' as result fields.") 90 | 91 | def main(): 92 | unittest.main() 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /eslib/service/RemotingService.py: -------------------------------------------------------------------------------- 1 | from . import HttpService 2 | from .. import Processor 3 | import Queue 4 | 5 | 6 | # NOTE: THIS IS YET EXPERIMENTAL (htb, 2016-03-21) 7 | 8 | 9 | class RemotingService(HttpService): 10 | 11 | def __init__(self, **kwargs): 12 | super(RemotingService, self).__init__(**kwargs) 13 | 14 | # Add routes to functions 15 | self.add_route(self._mgmt_fetch, "GET" , "/fetch", ["?socket", "?limit"]) 16 | self.add_route(self._mgmt_put , "PUT|POST", "/put" , ["?connector"]) 17 | 18 | self._queues = {} 19 | self._put_proc = None 20 | 21 | # NOTE: In on_setup, where you create the fetch proc, set config var congestion_limit 22 | 23 | def setup_put(self, proc): 24 | self.log.info("Registering put Processor '%s'." % proc.name) 25 | self._put_proc = proc 26 | 27 | def setup_fetch(self, proc, socket_names=None): 28 | self.log.info("Creating fetch buffers for Processor '%s'." % proc.name) 29 | if isinstance(socket_names, basestring): 30 | socket_names = [socket_names] 31 | for socket_name in proc.sockets: 32 | if not socket_names or socket_name in socket_names: 33 | self._register_callback(proc, socket_name) 34 | 35 | def _register_callback(self, proc, socket_name): 36 | def callback(proc, doc): 37 | queue = self._queues[socket_name] 38 | queue.put(doc) 39 | pass 40 | self._queues[socket_name] = Queue.Queue() 41 | proc.add_callback(callback, socket_name) 42 | 43 | def _put(self, doc, connector_name): 44 | if self._put_proc: 45 | self._put_proc.put(doc, connector_name) 46 | 47 | def _fetch(self, socket_name=None, limit=0): 48 | docs = [] 49 | if socket_name and socket_name in self._queues: 50 | queue = self._queues[socket_name] 51 | elif len(self._queues) > 0: 52 | # TODO: Get default socket instead, or error 53 | queue = self._queues.keys()[0] 54 | else: 55 | return ([], -1) # TODO: Or rather an error 56 | 57 | ##print "LIMIT=", limit 58 | while not queue.empty() and (limit == 0 or len(docs) < limit): 59 | ##print "LEN(DOCS)=%d" % len(docs) 60 | doc = queue.get_nowait() 61 | queue.task_done() 62 | if doc: 63 | docs.append(doc) 64 | return (docs, queue.qsize()) 65 | 66 | #region Extra service interface methods 67 | 68 | def _mgmt_fetch(self, request_handler, payload, **kwargs): 69 | socket_name = kwargs.get("socket") 70 | limit = kwargs.get("limit") or 0 # 0 = unlimited 71 | limit = int(limit) 72 | ##print "=== KWARGS:", kwargs 73 | ##print "=== LIMIT:", limit 74 | (docs, qsize) = self._fetch(socket_name, limit) 75 | return {"documents": docs, "status": self.status, "queued": qsize} 76 | 77 | def _mgmt_put(self, request_handler, payload, **kwargs): 78 | connector_name = kwargs.get("connector") 79 | doc = payload 80 | self._put(doc, connector_name) 81 | 82 | #endregion Extra service interface methods 83 | 
84 | def on_stats(self, stats): 85 | super(RemotingService, self).on_stats(stats) 86 | stats["queued"] = {k:q.qsize() for k,q in self._queues.iteritems()} 87 | -------------------------------------------------------------------------------- /eslib/procs/SmtpMailer.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | import smtplib, getpass, platform 5 | from email.mime.text import MIMEText 6 | from eslib.esdoc import tojson 7 | 8 | 9 | class SmtpMailer(Processor): 10 | """ 11 | Send incoming document as content to recipients. 12 | Sends mail outgoing on port 25 unless a username/password is specified, in which case 13 | it uses TLS on port 587. 14 | Sender defaults to current executing user if not specified. 15 | 16 | Connectors: 17 | input (*) : Incoming documents to send. Non-string documents are converted to JSON. 18 | 19 | Config: 20 | smtp_server = "localhost" 21 | username = None 22 | password = None 23 | sender = None 24 | from_name = None : Name to be added to sender into the From field, becomes: '"from_name" ' 25 | recipients = [] : List of recipient email addresses (no mail or brackets or other fuzz). 26 | subject = None 27 | """ 28 | def __init__(self, **kwargs): 29 | super(SmtpMailer, self).__init__(**kwargs) 30 | 31 | self.create_connector(self._incoming, "input", "str", "Email content string.") 32 | 33 | self.config.set_default( 34 | smtp_server = "localhost", 35 | username = None, 36 | password = None, 37 | sender = None, 38 | from_name = None, 39 | recipients = None, 40 | subject = None, 41 | ) 42 | 43 | def on_open(self): 44 | self.count = 0 45 | 46 | def _incoming(self, doc): 47 | if not doc or not self.config.recipients or not self.config.sender: 48 | return 49 | 50 | # Convert non-string documents to JSON 51 | content = doc 52 | if not isinstance(doc, basestring): 53 | content = tojson(doc) 54 | 55 | try: 56 | self._mail_text( 57 | self.config.smtp_server, 58 | self.config.recipients, 59 | self.config.subject, 60 | self.config.sender, 61 | self.config.from_name, 62 | content, 63 | self.config.username, 64 | self.config.password) 65 | self.count += 1 66 | except Exception as e: 67 | self.log.exception("Failed to send email.") 68 | 69 | 70 | def _mail_text(self, smtp_server, recipients, subject, sender=None, from_name=None, content=None, username=None, password=None): 71 | msg = MIMEText(content, "plain", "utf-8") 72 | 73 | if not sender: 74 | sender = "@".join((getpass.getuser(), platform.node())) 75 | 76 | message_from = sender if not from_name else '"%s" <%s>' % (from_name, sender) 77 | 78 | msg['Subject'] = subject 79 | msg['From'] = message_from 80 | msg['To'] = ", ".join(recipients) 81 | 82 | s = None 83 | if username or password: 84 | s = smtplib.SMTP(smtp_server, 587) 85 | s.ehlo() 86 | s.starttls() 87 | s.ehlo() 88 | s.login(username, password) 89 | else: 90 | s = smtplib.SMTP(smtp_server or "localhost") 91 | 92 | s.sendmail(sender, recipients, msg.as_string()) 93 | s.quit() 94 | 95 | -------------------------------------------------------------------------------- /eslib/time.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.time 5 | ~~~~~~~~~~ 6 | 7 | Module containing time/date helpers. 
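
A couple of illustrative calls:

    utcdate("2014-10-14T14:26:30+01:00")     # -> datetime(2014, 10, 14, 13, 26, 30), naive UTC
    date_dict(utcdate("2014-10-14T14:26:30+01:00"))["week"]   # -> 42
    date2iso(datetime.datetime(2014, 3, 10, 23, 32, 47))      # -> "2014-03-10T23:32:47Z"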
8 | """ 9 | 10 | 11 | __all__ = ("duration_string", "date2iso", "ago2date") 12 | 13 | 14 | import re, datetime, dateutil, dateutil.parser 15 | 16 | 17 | def duration_string(timediff): 18 | """ 19 | :type timediff: datetime.timedelta 20 | :rtype str: 21 | """ 22 | secs = timediff.seconds 23 | days = timediff.days 24 | s = secs % 60 25 | m = (secs / 60) % 60 26 | h = (secs / 60 / 60) % 24 27 | return "%d:%02d:%02d" % (days*24+h, m, s) 28 | 29 | 30 | def date2iso(dateobj): 31 | """ 32 | Convert datetime object to ISO 8601 string with UTC, e.g. '2014-03-10T23:32:47Z' 33 | :type dateobj: datetime.datetime 34 | :rtype str 35 | """ 36 | return dateobj.strftime("%Y-%m-%dT%H:%M:%SZ") # Screw the %.f ... 37 | 38 | def iso2date(isostr): 39 | """ 40 | Convert ISO 8601 string in UTC, e.g. '2014-03-10T23.32:47Z' to datetime object. 41 | :type isostr: datetime.datetime 42 | :rtype datetime.datetime 43 | """ 44 | if isostr is None: 45 | return None 46 | if "." in isostr: 47 | return datetime.datetime.strptime(isostr, "%Y-%m-%dT%H:%M:%S.%fZ") 48 | else: 49 | return datetime.datetime.strptime(isostr, "%Y-%m-%dT%H:%M:%SZ") 50 | 51 | def utcdate(obj): 52 | "Convert string or datetime object to a datetime object in UTC." 53 | dt = None 54 | if type(obj) is datetime.datetime: 55 | dt = obj 56 | try: 57 | dt = dateutil.parser.parse(obj) 58 | except: 59 | pass 60 | if dt: 61 | # Convert to UTC time and get rid of the offset 62 | utcoffs = dt.utcoffset() 63 | if utcoffs: 64 | dt = dt - utcoffs 65 | dt = dt.replace(tzinfo=None) #dateutil.tz.tzutc()) 66 | return dt 67 | 68 | def date_dict(date): 69 | return { 70 | "year": date.year, "month": date.month, "day": date.day, 71 | "hour": date.hour, "minute": date.minute, "second": date.second, 72 | "weekday": date.isoweekday(), "week": date.isocalendar()[1] 73 | } 74 | 75 | 76 | _agoRegex = re.compile("^(?P\d+)\s*(?P\w+)( ago)?$") 77 | 78 | def ago2date(ago, from_date_utc=None): 79 | """ 80 | Convert 'ago' style time specification string to a datetime object. 81 | Units are s=second, m=minute, h=hour, d=day, w=week, M=month, y=year 82 | :param str ago : "Time ago" as a string. 83 | :param datetime.datetime from_date_utc : Relative time to use instead of 'now'. In UTC. 84 | :rtype datetime.timedelta : Time difference. 
85 | """ 86 | m = _agoRegex.match(ago) 87 | if not m: 88 | raise SyntaxError("Illegal 'ago' string: %s" % ago) 89 | number = int(m.group("number")) 90 | unit = m.group("unit") 91 | delta = None 92 | if unit == "s" or unit.startswith("sec") : delta = datetime.timedelta(seconds= number) 93 | elif unit == "m" or unit.startswith("min") : delta = datetime.timedelta(minutes= number) 94 | elif unit == "h" or unit.startswith("hour") : delta = datetime.timedelta(hours= number) 95 | elif unit == "d" or unit.startswith("day") : delta = datetime.timedelta(days= number) 96 | elif unit == "w" or unit.startswith("week") : delta = datetime.timedelta(weeks= number) 97 | elif unit == "M" or unit.startswith("month"): delta = datetime.timedelta(days= number*30) 98 | elif unit == "y" or unit.startswith("year") : delta = datetime.timedelta(days= number*365) 99 | else: 100 | raise SyntaxError("Illegal unit for 'ago' string in: %s" % ago) 101 | return (from_date_utc or datetime.datetime.utcnow()) - delta; 102 | -------------------------------------------------------------------------------- /PROTOCOLS.md: -------------------------------------------------------------------------------- 1 | # Protocols 2 | 3 | This document describes the common protocols for document exchange between terminals (connectors and sockets). 4 | 5 | The name of the protocol is meant as a hint, although keeping track of a common set of protocols would be good. 6 | 7 | ## esdoc 8 | 9 | ### esdoc (general) 10 | 11 | Used by 12 | 13 | ElasticsearchReader.output (socket) 14 | ElasticsearchWriter.input (connector) 15 | ElasticsearchWriter.output (socket) 16 | CsvConverter.output (socket) 17 | HtmlRemover.input (connector) 18 | HtmlRemover.output (soclet) 19 | PatternRemover.input (connector) 20 | PatternRemover.output (socket) 21 | 22 | Format 23 | 24 | _index str 25 | _type str 26 | _id str 27 | _version int 28 | _timestamp str 29 | _source dict # Dict of { field : value } 30 | 31 | 32 | All fields are optional, depending on the case 33 | 34 | ### esdoc.webpage 35 | 36 | Used by 37 | 38 | WebGetter.output (socket) 39 | 40 | Format 41 | 42 | _id str # Using the URL as ID 43 | _type str # "webpage" 44 | _timestamp datetime # When the content was fetched 45 | _source dict of ... 46 | domain str 47 | requested_by list # Of of dicts of format [ what : [ who, ...] }, ... ] 48 | content str 49 | content_type str 50 | encoding str 51 | date datetime # Web page publishing date as reported by HTTP header 52 | 53 | ### esdoc.4chan 54 | 55 | Used by 56 | 57 | FourChanMonitor.esdoc 58 | 59 | Format 60 | 61 | _id int # Post number at 4chan 62 | _type str # "4chan" 63 | _source 64 | id int # Post number at 4chan 65 | board str # Board id 66 | thread int # Thread id 67 | timestamp int # Time of posting 68 | author str # Name of author, most commonly "Anonymous" 69 | comment str # Text comment 70 | filename str # Filename, with extension 71 | response_to int # Post number this post is a response to. 0 if original posting (i.e. not a response) 72 | 73 | 74 | ## urlrequest 75 | 76 | Used by 77 | 78 | WebGetter.input (connector) 79 | 80 | Format 81 | 82 | url str # 83 | what str # Source requesting the url, e.g. "twitter_mon" 84 | who str # Who requested it, e.g. some user id from the source 85 | 86 | ## csv 87 | 88 | Used by 89 | 90 | CsvConverter.input (connector) 91 | 92 | Format 93 | 94 | ```csv 95 | "field","field,"field","..." 
96 | ``` 97 | 98 | 99 | ## graph-edge 100 | The graph-edge protocol is simply a dictionary with three mandatory keys, 101 | that together represents an edge. 102 | 103 | Used by 104 | Neo4jWriter.edge (connector) 105 | 106 | Format 107 | 108 | from str # The property-id of the source node 109 | type str # The type of the edge. ("follows", "author", "mention", "quote") 110 | to str # The property-id of the receiving node 111 | 112 | Note that all fields are mandatory. 113 | 114 | ## graph-user 115 | 116 | The graph-user protocol is a dictionary holding properties. 117 | 118 | Used by 119 | 120 | Neo4jWriter.user (connector) 121 | TwitterUserGetter.user (socket) 122 | 123 | Format 124 | 125 | id str 126 | location str #Optional 127 | description str #Optional 128 | screen_name str #Optional 129 | lang str #Optional 130 | name str #Optional 131 | created_at date.isoformat()#Optional 132 | -------------------------------------------------------------------------------- /bin/es-read: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from eslib.procs import ElasticsearchReader, FileWriter 6 | import eslib.prog 7 | import eslib.time 8 | import argparse, sys, time 9 | 10 | 11 | def main(): 12 | help_i = "Which index to return documents from." 13 | help_t = "Which type of document to return." 14 | help_l = "The maximum number of documents to return. Will by default return all documents." 15 | help_s = "Returns all documents added after SINCE. Specified in the 'ago' format (1d, 3w, 1y, etc)." 16 | help_b = "Returns all documents added after BEFORE. Specified in the 'ago' format (1d, 3w, 1y, etc)." 17 | help_tf = "The field that contains the relavant date information. Default 'timefield' to slice on is '_timestamp'." 18 | help_fi = "Format for filter is, by example: 'category:politicians,party:democrats'." 
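# Illustrative: the --filter string is split below into a dict of field -> value,
# e.g. "category:politicians,party:democrats" -> {"category": "politicians", "party": "democrats"},
# which is then passed to ElasticsearchReader as 'filters'.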
19 | 20 | parser = argparse.ArgumentParser(usage="\n %(prog)s -i index [-t type] [-f field] [-l limit] [more options]") 21 | parser._actions[0].help = argparse.SUPPRESS 22 | parser.add_argument("-i", "--index" , help=help_i, required=True) 23 | parser.add_argument("-t", "--type" , help=help_t) 24 | parser.add_argument("-l", "--limit" , help=help_l, default=0, type=int) 25 | parser.add_argument("-s", "--since" , help=help_s) 26 | parser.add_argument("-b", "--before" , help=help_b) 27 | parser.add_argument( "--host" , help="Elasticsearch host, format 'host:port' or just 'host'.", default=None) 28 | parser.add_argument( "--timefield", help=help_tf, default="_timestamp") 29 | parser.add_argument( "--filter" , help=help_fi) 30 | parser.add_argument("-v", "--verbose" , action="store_true") 31 | #parser.add_argument( "--debug" , action="store_true") 32 | parser.add_argument( "--name" , help="Process name.", default=None) 33 | 34 | if len(sys.argv) == 1: 35 | parser.print_usage() 36 | sys.exit(0) 37 | 38 | args = parser.parse_args() 39 | 40 | # Time validation conversion and checks 41 | before = None 42 | since = None 43 | if args.before: 44 | try: 45 | before = eslib.time.ago2date(args.before) 46 | except: 47 | print >> sys.stderr, "Illegal 'ago' time format to 'before' argument, '%s'" % args.before 48 | sys.exit(-1) 49 | if args.since: 50 | try: 51 | since = eslib.time.ago2date(args.since) 52 | except: 53 | print >> sys.stderr, "Illegal 'ago' time format to 'since' argument, '%s'" % args.since 54 | sys.exit(-1) 55 | 56 | # Parse filter string 57 | filters = {} 58 | if args.filter: 59 | parts = [{part[0]:part[1]} for part in [filter.split(":") for filter in args.filter.split(",")]] 60 | for part in parts: 61 | filters.update(part) 62 | 63 | # Set up and run this processor 64 | r = ElasticsearchReader( 65 | name = args.name or eslib.prog.progname(), 66 | hosts = [args.host] if args.host else [], 67 | index = args.index, 68 | doctype = args.type, 69 | limit = args.limit, 70 | filters = filters, 71 | since = since, 72 | before = before, 73 | timefield = args.timefield 74 | ) 75 | 76 | # if args.debug: r.debuglevel = 0 77 | 78 | verbose_tick_delay = 3.0 79 | 80 | w = FileWriter() 81 | w.subscribe(r) 82 | r.start() 83 | if args.verbose: 84 | # Verbose wait loop 85 | last_tick = time.time() 86 | while r.running: 87 | time.sleep(0.1) 88 | now = time.time() 89 | if (now - last_tick > verbose_tick_delay) or not r.running: 90 | print >> sys.stderr, "Read %d/%d" % (r.count, r.total) 91 | last_tick = now 92 | print >> sys.stderr, "Reading finished; waiting for writer to finish." 93 | w.wait() 94 | 95 | 96 | if __name__ == "__main__": main() 97 | -------------------------------------------------------------------------------- /examples/remoting/DummyRemotingService.py: -------------------------------------------------------------------------------- 1 | # NOTE: 2 | # 3 | # REMOTING SERVICE IS YET EXPERIMENTAL (as of when this was written) 4 | # 5 | # This is an example of how to create a service based on the RemotingService. 6 | # See also RemotingClient.py for example of how to call it remotely. 
7 | # 8 | # SETUP: 9 | # 10 | # Copy the file to your service "source" directory, and add to the package __init__.py file: 11 | # 12 | # from .DummyRemotingService import DummyRemotingService 13 | # __all__ = ( 14 | # "DummyRemotingService" 15 | # ) 16 | # 17 | # In the service "config" directory, configure it like 18 | # 19 | # remoting: 20 | # type : "DummyRemotingService" 21 | # frequency : 3 22 | # lifespan : 120 23 | 24 | from eslib.service import RemotingService, PipelineService 25 | from eslib.procs import Timer 26 | from eslib import Processor 27 | 28 | 29 | # COMMENT to the below connectors and sockets: 30 | # The "command" socket and connector are set to default, so that we can easily create a 31 | # service based on the pipeline service. Then all pipleline processors are linked so that 32 | # start/stop events etc are easily propagated the way we want. The downside to this 33 | # approach is that the socket and connector we want to use from the client will have to 34 | # be names, as they are not the default ones. 35 | # (Here, by client example: client.fetch("output"), and client.put("input").) 36 | 37 | 38 | class FetchProc(Processor): 39 | def __init__(self, **kwargs): 40 | super(FetchProc, self).__init__(**kwargs) 41 | self.create_connector(self._incoming, "input") 42 | self.command = self.create_socket("command", is_default=True) # To link easily as pipeline 43 | self.output = self.create_socket("output") 44 | self.num = 0 45 | 46 | def on_open(self): 47 | self.num = 0 48 | 49 | def _incoming(self, doc): 50 | # For each incoming tick, generate one output doc: 51 | self.num += 1 52 | print "SEDNING TO QUEUE:", self.num 53 | self.output.send(self.num) 54 | 55 | class PutProc(Processor): 56 | def __init__(self, **kwargs): 57 | super(PutProc, self).__init__(**kwargs) 58 | self.create_connector(self._command, "command", is_default=True) # To link easily as pipeline 59 | self.create_connector(self._incoming, "input") 60 | 61 | def _command(self, doc): 62 | pass # Down the drain; this is simply for linking 63 | 64 | def _incoming(self, doc): 65 | print("INCOMING DOC:", doc) 66 | 67 | 68 | class DummyRemotingService(RemotingService, PipelineService): 69 | 70 | def __init__(self, **kwargs): 71 | super(DummyRemotingService, self).__init__(**kwargs) 72 | 73 | self.config.set_default( 74 | timer_frequency = 3, 75 | lifespan = 0 76 | ) 77 | 78 | def on_configure(self, credentials, config, global_config): 79 | self.config.set( 80 | manager_endpoint = global_config.get("manager_host"), 81 | management_endpoint = config.get("management_endpoint"), 82 | 83 | timer_frequency = config["frequency"], 84 | lifespan = config["lifespan"] 85 | ) 86 | 87 | def on_setup(self): 88 | timer = Timer( 89 | service = self, 90 | name = "timer", 91 | actions = [(self.config.timer_frequency, self.config.timer_frequency, "ping")] 92 | ) 93 | fetchProc = FetchProc( 94 | service = self, 95 | name = "fetchProc", 96 | ) 97 | putProc = PutProc( 98 | service = self, 99 | name = "putProc" 100 | ) 101 | 102 | procs = [timer, fetchProc, putProc] 103 | self.link(*procs) 104 | 105 | self.setup_put(putProc) 106 | self.setup_fetch(fetchProc, "output") 107 | 108 | return True 109 | -------------------------------------------------------------------------------- /eslib/procs/HtmlRemover.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | from .. 
import esdoc 5 | from eslib.text import remove_html 6 | 7 | class HtmlRemover(Processor): 8 | """ 9 | Remove HTML tags and unescape HTML escapings. 10 | 11 | Connectors: 12 | input (esdoc) (default) : Incoming document in 'esdoc' dict format. 13 | str (str) : Incoming document of type 'str' or 'unicode'. 14 | Sockets: 15 | output (esdoc) (default) : Output of documents that arrived on 'input' connector. 16 | str (str) : Output of documents that arrived on 'str' connector. 17 | 18 | Config: 19 | source_field = "text" : Part of twitter dev credentials. 20 | target_field = None : Defaults to 'source_field', replacing the input field. 21 | field_map = {} : A dict of fields to use as { source : target }. 22 | If specified, this *replaces* the source_field and target_field pair! 23 | strip = True : Remove boundary spaces and double spaces, commonly left after a removal. 24 | """ 25 | 26 | def __init__(self, **kwargs): 27 | super(HtmlRemover, self).__init__(**kwargs) 28 | 29 | m = self.create_connector(self._incoming_esdoc, "input", "esdoc", "Incoming 'esdoc'.", is_default=True) 30 | self.create_connector(self._incoming_str , "str" , "str" , "Incoming document of type 'str' or 'unicode'.") 31 | self.output_esdoc = self.create_socket("output" , "esdoc" , "Outgoing, cleaned, 'esdoc'.", is_default=True, mimic=m) 32 | self.output_str = self.create_socket("str" , "str" , "Outgoing, cleaned, 'str'.") 33 | 34 | self.config.set_default( 35 | source_field = "text", 36 | target_field = None, 37 | field_map = {}, 38 | strip = True 39 | ) 40 | 41 | self._regexes = [] 42 | self._field_map = {} 43 | 44 | def on_open(self): 45 | # Create field map 46 | self._field_map = self.config.field_map or {} 47 | if not self._field_map: 48 | if not self.config.source_field: 49 | raise ValueError("Neither field_map nor source_field is configured.") 50 | self._field_map[self.config.source_field] = (self.config.target_field or self.config.source_field) 51 | 52 | 53 | def _clean_text(self, text): 54 | text = remove_html(text) 55 | if self.config.strip: 56 | text = text.strip().replace(" ", " ") 57 | return text 58 | 59 | def _clean(self, doc): 60 | 61 | if not doc: 62 | return doc 63 | 64 | # This makes this method work also for 'str' and 'unicode' type documents; not only for the expected 'esdoc' protocol (a 'dict'). 65 | if type(doc) in [str, unicode]: 66 | cleaned = self._clean_text(doc) 67 | return cleaned 68 | elif not type(doc) is dict: 69 | self.doclog.debug("Unsupported document type '%s'." % type(doc)) 70 | return doc 71 | 72 | source = doc.get("_source") 73 | if not source: 74 | return doc # Missing source section; don't do anything 75 | 76 | for source_field, target_field in self._field_map.iteritems(): 77 | text = esdoc.getfield(source, source_field) 78 | if text and type(text) in [str, unicode]: 79 | cleaned = self._clean_text(text) 80 | if cleaned != text: 81 | # Note: This may lead to a few strictly unnecessary shallow clonings... 82 | doc = esdoc.shallowputfield(doc, "_source." 
+ target_field, cleaned) 83 | return doc 84 | 85 | def _incoming_esdoc(self, doc): 86 | if self.output_esdoc.has_output: 87 | self.output_esdoc.send(self._clean(doc)) 88 | 89 | def _incoming_str(self, doc): 90 | if self.output_str.has_output: 91 | self.output_str.send(self._clean(doc)) 92 | -------------------------------------------------------------------------------- /eslib/procs/TweetExtractor.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | 5 | 6 | class TweetExtractor(Processor): 7 | """ 8 | Extract properties from a tweet to different sockets: 'user' and 'link'. 9 | 10 | Protocols: 11 | 12 | esdoc.tweet: 13 | 14 | # TODO 15 | 16 | graph-edge: 17 | 18 | from str : User ID. 19 | type str : Relation, one of "author", "mention", "quote". 20 | to str : User ID. 21 | 22 | urlrequest: 23 | 24 | url str 25 | what str : e.g. "twitter_mon" 26 | who str : e.g. some user id 27 | 28 | Sockets: 29 | tweet (esdoc.tweet) (default) : Tweet 30 | text (str) : Only the text from the tweet. 31 | link (urlrequest) : Link from the tweet, for potential follow-up. 32 | user (graph-edge) : Info about author, mentioned or retweeted users from the tweet. 33 | 34 | Config: 35 | drop_retweets = True : Do not report tweets from retweets if set. User relation "quote" will still be reported. 36 | """ 37 | 38 | RELATION_AUTHOR = "author" 39 | RELATION_RETWEET = "quote" 40 | RELATION_MENTION = "mention" 41 | 42 | 43 | def __init__(self, **kwargs): 44 | super(TweetExtractor, self).__init__(**kwargs) 45 | 46 | self.create_connector(self._incoming, "tweet", "esdoc.tweet", "Tweet."); 47 | 48 | self.output_tweet = self.create_socket("tweet" , "esdoc.tweet" , "Tweet.", is_default=True) 49 | self.output_text = self.create_socket("text" , "str" , "Only the text from the tweet.") 50 | self.output_link = self.create_socket("link" , "urlrequest" , "Link from the tweet, for potential follow-up.") 51 | self.output_user = self.create_socket("user" , "graph-edge" , "Info about author, mentioned or retweeted users from the tweet.") 52 | 53 | self.config.set_default( 54 | drop_retweets = True 55 | ) 56 | 57 | def _incoming(self, doc): 58 | 59 | if not doc or not type(doc) is dict or not self.has_output: 60 | return 61 | 62 | tweet, users, links = self._extract(doc) 63 | if tweet: 64 | self.output_tweet.send(tweet) 65 | self.output_text.send(tweet["_source"]["text"]) 66 | for user in users: 67 | self.output_user.send(user) 68 | for link in links: 69 | self.output_link.send(link) 70 | 71 | def _extract(self, tweet): 72 | "Return a tuple of (tweet, users, links)." 
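        # A note on the return shape (values below are illustrative): each entry in 'users'
        # is a 'graph-edge' dict, e.g. {"from": "123", "type": "mention", "to": "456"},
        # and each entry in 'links' is a 'urlrequest' dict,
        # e.g. {"url": "http://t.co/xyz", "what": "twitter", "who": "123"}.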
73 | 74 | users = [] 75 | links = [] 76 | 77 | source = tweet["_source"] # Always present 78 | 79 | # Add author to 'users' list 80 | user_id = source["user"]["id"] # Always present 81 | users.append({"from": user_id, "type": self.RELATION_AUTHOR, "to": user_id}) 82 | 83 | # Retweets 84 | retweet_user_id = source.get("retweet_user_id") 85 | if retweet_user_id: 86 | # Find out who has been retweeted: 87 | # Add retweet to 'users' list 88 | users.append({"from": user_id, "type": self.RELATION_RETWEET, "to": retweet_user_id}) 89 | if self.config.drop_retweets: 90 | return (None, users, links) 91 | 92 | # URLs and mentions from entities 93 | entities = source.get("entities") 94 | if entities: 95 | # Get URLs 96 | urls = entities.get("urls") 97 | if urls: 98 | for url in urls: 99 | # Add to "links" list: 100 | links.append({ 101 | "url" : url["url"], 102 | "what": "twitter", # TODO: Maybe use self.name instead? 103 | "who" : user_id 104 | }) 105 | # Get user mentions 106 | user_mentions = entities.get("user_mentions") 107 | if user_mentions: 108 | for m in user_mentions: 109 | # Add relation to 'users' list: 110 | users.append({"from": user_id, "type": self.RELATION_MENTION, "to": m["id"]}) 111 | 112 | return (tweet, users, links) 113 | -------------------------------------------------------------------------------- /eslib/procs/Neo4jReader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mats' 2 | 3 | from ..Generator import Generator 4 | from .neo4j import Neo4j 5 | 6 | from itertools import izip 7 | import time, logging 8 | 9 | class Neo4jReader(Generator): 10 | """ 11 | The purpose of this processor is to ask Neo4j if a node with a given 12 | user id has it's full set of properties. 13 | 14 | It takes an id and determines whether or not it has its properties set. 15 | If it lacks properties, it will be outputted by the 'ids' socket. 16 | 17 | Connectors: 18 | id (str) : Incoming IDs to check. 19 | Sockets: 20 | ids (str) : Outputs IDs that lack properties. 21 | 22 | Config: 23 | batchsize = 20 : How many IDs to gather up before making a call to Neo4j. 24 | batchtime = 5.0 : How many seconds to wait before we send a batch if it is not full. 25 | host = localhost: The host we should connect to 26 | port = 7474 : The default neo4j port 27 | 28 | """ 29 | 30 | def __init__(self, **kwargs): 31 | super(Neo4jReader, self).__init__(**kwargs) 32 | self.create_connector(self._incoming_id, "id", "str", "Incoming IDs to check.") 33 | self._missing = self.create_socket("missing", "str", "Outputs IDs that lack properties.") 34 | #self._missing = self.create_socket("output", "???", "Outputs data retrived, one document per ID.") 35 | 36 | self.config.set_default( 37 | batchsize = 20, 38 | batchtime = 5.0, 39 | host = "localhost", 40 | port = 7474 41 | ) 42 | 43 | self._neo4j = None 44 | 45 | self._queue = [] 46 | self._last_get = time.time() 47 | self._has_properties = set([]) 48 | 49 | #TODO: Could place this in Neo4jBase 50 | def on_open(self): 51 | """ 52 | Instantiates both a neo4j-instance and a twitter-instance. 53 | 54 | Raises: 55 | - ConnectionError if neo4j can't contact its server 56 | - Exception if twitter can't authenticate properly 57 | """ 58 | 59 | # TODO: Need logging, request timeout and exception handling down there: 60 | self.log.debug("Connecting to Neo4j.") 61 | self._neo4j = Neo4j(host=self.config.host, port=self.config.port) 62 | self.log.status("Connected to Neo4j on %s:%d." 
% (self.config.host, self.config.port)) 63 | 64 | def _incoming_id(self, id_): 65 | """ 66 | Takes an incoming id, gets the correct query string from self.neo4j, 67 | before appending the query to self._queue 68 | """ 69 | if id_ not in self._has_properties: 70 | query = self._neo4j.get_node_query_if_properties(id_) 71 | self._queue.append((id_, query)) 72 | 73 | def on_tick(self): 74 | """ 75 | Commit items in queue if queue exceeds batchsize or it's been long 76 | since last commit. 77 | """ 78 | if ((len(self._queue) >= self.config.batchsize) or 79 | (time.time() - self._last_get > self.config.batchtime and self._queue)): 80 | self._get() 81 | 82 | def on_shutdown(self): 83 | """ Get rid of rest of queue before shutting down. """ 84 | while self._queue: 85 | self._get() 86 | 87 | def _get(self): 88 | num_elem = len(self._queue) 89 | if num_elem > self.config.batchsize: 90 | num_elem = self.config.batchsize 91 | 92 | ids, queries = [list(t) 93 | for t in 94 | izip(*self._queue[:num_elem])] 95 | rq = self._neo4j._build_rq(queries) 96 | resp = self._neo4j.commit(rq) 97 | self.log.debug("Asking neo4j for %i users." % num_elem) 98 | self._queue = self._queue[num_elem:] 99 | self._last_get = time.time() 100 | self._write_uids(ids, resp) 101 | 102 | def _write_uids(self, ids, resp): 103 | """ 104 | Outputs the ids of the nodes in the resp-object to a socket. 105 | 106 | Args: 107 | ids: The ids that corresponds to a query 108 | resp: a requests-module response object with neo4j-nodes in 'graph'- 109 | format. 110 | """ 111 | for uid, result in izip(ids, resp.json()["results"]): 112 | if not result["data"]: 113 | self._missing.send(uid) 114 | if self.doclog.isEnabledFor(logging.TRACE): 115 | self.doclog.trace("uid %s does not have properties" % uid) 116 | else: 117 | self._has_properties.add(uid) 118 | -------------------------------------------------------------------------------- /eslib/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib 5 | ~~~~~ 6 | 7 | Document processing library for Elasticsearch. 
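
A rough usage sketch (illustrative only; the configuration kwargs and the sample
document are assumptions, see the tests under test/ and the scripts under bin/
for complete, working examples):

    from eslib.procs import HtmlRemover, FileWriter

    cleaner = HtmlRemover(source_field="text")
    writer = FileWriter()
    writer.subscribe(cleaner)   # writer's connector attaches to cleaner's default socket
    cleaner.start()
    cleaner.put({"_source": {"text": "<p>Hello&nbsp;world</p>"}})
    cleaner.stop()
    cleaner.wait()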
8 | """ 9 | 10 | __version__ = "0.0.1" 11 | __author__ = "Hans Terje Bakke" 12 | 13 | 14 | from .Terminal import TerminalProtocolException, Terminal 15 | from .TerminalInfo import TerminalInfo 16 | from .Connector import Connector 17 | from .Socket import Socket 18 | from .Processor import Processor 19 | from .Generator import Generator 20 | from .Monitor import Monitor 21 | from .Configurable import Configurable, Config 22 | 23 | 24 | __all__ = ( 25 | "TerminalProtocolException", 26 | "Terminal", 27 | "TerminalInfo", 28 | "Connector", 29 | "Socket", 30 | "Processor", 31 | "Generator", 32 | "Monitor", 33 | "Configurable", 34 | "Config", 35 | 36 | "unique" 37 | ) 38 | 39 | #region Core stuff 40 | 41 | def unique(seq, idfun=None): 42 | # order preserving 43 | if idfun is None: 44 | def idfun(x): return x 45 | seen = {} 46 | result = [] 47 | for item in seq: 48 | marker = idfun(item) 49 | if marker in seen: continue 50 | seen[marker] = 1 51 | result.append(item) 52 | return result 53 | 54 | #endregion 55 | 56 | 57 | #region Encoding of stdin/stdout 58 | 59 | import sys, codecs 60 | 61 | # Fix stdin and stdout encoding issues 62 | _encoding_stdin = sys.stdin.encoding or "UTF-8" 63 | _encoding_stdout = sys.stdout.encoding or _encoding_stdin 64 | #sys.stdin = codecs.getreader(_encoding_stdin)(sys.stdin) 65 | sys.stdout = codecs.getwriter(_encoding_stdout)(sys.stdout) 66 | 67 | #endregion Encoding of stdin/stdout 68 | 69 | #region Logging stuff 70 | 71 | import logging 72 | import logging.config 73 | 74 | class _ExtendedLogger(logging.getLoggerClass()): 75 | def makeRecord(self, name, level, fn, lno, msg, args, exc_info, func=None, extra=None): 76 | rec = logging.LogRecord(name, level, fn, lno, msg, args, exc_info, func) 77 | 78 | rec.serviceName = self.serviceName if hasattr(self, 'serviceName') else None 79 | rec.className = self.className if hasattr(self, 'className') else None 80 | rec.instanceName = self.instanceName if hasattr(self, 'instanceName') else None 81 | 82 | rec.firstName = name.split(".")[0] 83 | rec.lastName = name.split(".")[-1] 84 | rec.names = name.split(".") 85 | 86 | return rec 87 | 88 | logging.setLoggerClass(_ExtendedLogger) 89 | 90 | 91 | def _log_status(self, message, *args, **kws): 92 | if self.isEnabledFor(logging.STATUS): 93 | self._log(logging.STATUS, message, args, **kws) 94 | 95 | def _log_verbose(self, message, *args, **kws): 96 | if self.isEnabledFor(logging.VERBOSE): 97 | self._log(logging.VERBOSE, message, args, **kws) 98 | 99 | def _log_trace(self, message, *args, **kws): 100 | if self.isEnabledFor(logging.TRACE): 101 | self._log(logging.TRACE, message, args, **kws) 102 | 103 | def _log_debug_n(self, n, message, *args, **kws): 104 | candidate = logging.DEBUG - n 105 | loglevel = min(max(candidate, logging.TRACE+1), logging.DEBUG) 106 | if self.isEnabledFor(loglevel): 107 | self._log(loglevel, message, args, **kws) 108 | 109 | logging.STATUS = 25 110 | logging.VERBOSE = 15 111 | logging.TRACE = 1 112 | 113 | logging.addLevelName(logging.STATUS , "STATUS") 114 | logging.addLevelName(logging.VERBOSE, "VERBOSE") 115 | logging.addLevelName(logging.TRACE , "TRACE") 116 | for n in range(1,9): 117 | logging.addLevelName(logging.DEBUG -n, "DEBUG-%s" % n) 118 | 119 | logging.Logger.status = _log_status 120 | logging.Logger.verbose = _log_verbose 121 | logging.Logger.trace = _log_trace 122 | logging.Logger.debugn = _log_debug_n 123 | 124 | #endregion Logging stuff 125 | 126 | #region Config stuff 127 | 128 | import os, yaml 129 | from . 
import esdoc 130 | 131 | def get_credentials(path=None, service_dir=None, credentials_file=None): 132 | service_dir = service_dir or os.environ.get("ESLIB_SERVICE_DIR") 133 | if not service_dir: 134 | raise ValueError("Neither service_dir given nor ESLIB_SERVICE_DIR set.") 135 | dir = os.path.join(service_dir, "config") 136 | 137 | file_path = None 138 | if not credentials_file: 139 | credentials_file = "credentials.yaml" 140 | 141 | if os.path.basename(credentials_file) == credentials_file: 142 | # Pick from dir 143 | file_path = os.path.join(dir, credentials_file) 144 | else: 145 | # Use absolute path 146 | file_path = os.path.expanduser(credentials_file) 147 | 148 | # Load credentials file 149 | with open(file_path, "r") as f: 150 | credentials = yaml.load(f) 151 | 152 | if not path: 153 | return credentials 154 | else: 155 | return esdoc.getfield(credentials, path) 156 | 157 | #endregion 158 | -------------------------------------------------------------------------------- /eslib/procs/CsvConverter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | import csv, codecs 4 | from ..Processor import Processor 5 | 6 | class CsvConverter(Processor): 7 | """ 8 | Convert csv input to Elasticsearch document format. 9 | Field names can be explicitly entered or derived from the first line of input, 10 | assuming that is the first line contains column names. When explicitly specified, only those columns entered 11 | will be used, the others will be ignored. When derived, all columns are used. 12 | 13 | NOTE: Fields, including column headers, must not have any spacing between delimiters and quotes. 14 | 15 | NOTE: Fields that are mapped to meta fields ('_id', '_index', '_type') will not be part of the '_source'. 16 | 17 | Connectors: 18 | input (csv) : Document in 'csv' format. First document is optionally column list. 19 | Sockets: 20 | output (esdoc) : Documents converted from 'csv' to 'esdoc' format. 21 | 22 | Config: 23 | index = None : Override '_index' meta field with this value. 24 | doctype = None : Override '_type' meta field with this value. 25 | columns = None : List of columns to pick from the CSV input. Use None for columns to ignore. 26 | skip_first_line = False : Skip first line of the input. (Typically column headers you don't want. 27 | delimiter = "," : CSV column delimiter character. 28 | 29 | id_field = "_id" : Name of field to map to meta field '_id'. 30 | index_field = "_index" : Name of field to map to meta field '_index'. 31 | type_field = "_type" : Name of field to map to meta field '_type'. 32 | """ 33 | 34 | def __init__(self, **kwargs): 35 | super(CsvConverter, self).__init__(**kwargs) 36 | self.create_connector(self._incoming, "input", "csv", "Document in 'csv' format. First document is optionally column list.") 37 | self.output = self.create_socket("output", "esdoc", "Documents converted from 'csv' to 'esdoc' format.") 38 | 39 | self.config.set_default( 40 | index = None, 41 | doctype = None, 42 | columns = None, 43 | skip_first_line = False, 44 | delimiter = ",", 45 | 46 | id_field = "_id", 47 | index_field = "_index", 48 | type_field = "_type" 49 | ) 50 | 51 | self._columns = [] 52 | self._first_line_processed = False 53 | 54 | 55 | def on_open(self): 56 | # Sanity check: 57 | if self.config.skip_first_line and not self.config.columns: 58 | raise Exception("Nothing specified in 'columns' and 'skip_first_line' set. 
Unable to determine fields to include, then.") 59 | 60 | self._first_line_processed = False 61 | self._columns = self.config.columns or [] 62 | 63 | def _incoming(self, line): 64 | # Check if we should skip first line or use it as column definitions (columns) 65 | if not self._first_line_processed: 66 | self._first_line_processed = True 67 | if self.config.skip_first_line: 68 | return 69 | if not self._columns: 70 | # No skipping first line ordered and no field list. Now assume first line to be column headings 71 | for csvrow in csv.reader([line], delimiter=self.config.delimiter): 72 | self._columns = csvrow 73 | return 74 | 75 | # Pick the only line. Since csv does not support unicode, we do this little encoding massage: 76 | raw_line = codecs.encode(line, "UTF-8") 77 | raw_csvrow = csv.reader([raw_line], delimiter=self.config.delimiter).next() 78 | csvrow = [codecs.decode(x, "UTF-8") for x in raw_csvrow] 79 | 80 | if not len(self._columns) == len(csvrow): 81 | self.doclog.warning("Column count does not match number of fields. Aborting. Row =\n%s" % csvrow) 82 | self.abort() # NOTE: We might want to continue processing, or we might not... 83 | 84 | doc = {} 85 | id = None 86 | index = None 87 | doctype = None 88 | for i in range(len(self._columns)): 89 | if not self._columns[i]: 90 | continue # Skip non-specified fields 91 | elif self._columns[i] == self.config.id_field: 92 | id = csvrow[i] 93 | elif self._columns[i] == self.config.index_field: # Override index 94 | index = csvrow[i] 95 | elif self._columns[i] == self.config.type_field: # Override doctype 96 | doctype = csvrow[i] 97 | else: 98 | doc.update({self._columns[i]: csvrow[i]}) 99 | 100 | # Convert to Elasticsearch type document 101 | esdoc = {"_index":self.config.index or index, "_type":self.config.doctype or doctype, "_id":id, "_source":doc} 102 | 103 | self.output.send(esdoc) 104 | -------------------------------------------------------------------------------- /eslib/Connector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from .Terminal import Terminal 6 | import Queue 7 | import threading 8 | import time 9 | 10 | 11 | class Connector(Terminal): 12 | 13 | def __init__(self, name, protocol=None, method=None): 14 | self.sleep = 0.1 #0.001 # Check for data in incoming queue this often (then burst through as much as possible) 15 | 16 | super(Connector, self).__init__(name, protocol) 17 | self.type = Connector 18 | self.queue = Queue.Queue() 19 | self.method = method 20 | 21 | # Execution control status 22 | self._thread = None 23 | self.accepting = False 24 | self.stopping = False 25 | self.running = False 26 | self.suspended = False 27 | self.aborted = False 28 | 29 | #region Queue management 30 | 31 | def _clear(self): 32 | "Clear the queue." 33 | while not self.queue.empty(): 34 | self.queue.get_nowait() 35 | self.queue.task_done() 36 | 37 | @property 38 | def pending(self): 39 | "Report number of pending items in queue." 40 | return self.queue.qsize() 41 | 42 | def _process(self): 43 | "Grab item from queue and call the pre-registered method on it." 44 | if not self.queue.empty(): 45 | document = self.queue.get_nowait() 46 | self.queue.task_done() 47 | if document: 48 | if self.method: 49 | try: 50 | self.method(document) 51 | except Exception as e: 52 | msg = "Unhandled exception in processor '%s' func '%s' while processing a document." 
% (self.owner.name, self.method.__name__) 53 | self.owner.doclog.exception(msg) 54 | self.owner.log.exception(msg) 55 | 56 | def receive(self, document): 57 | "Put document on the incoming queue for this connector. Called by sockets." 58 | if self.accepting: 59 | self.queue.put(document) # Infinite queue, so it should never block 60 | 61 | #endregion Queue management 62 | 63 | #region Operation management 64 | 65 | def _run(self): 66 | while self.running: 67 | if self.sleep: 68 | time.sleep(self.sleep) 69 | if not self.running: 70 | break 71 | if self.stopping and (self.suspended or self.queue.empty()): 72 | # Notify owner that we are finished stopping 73 | self.owner.production_stopped() 74 | # Now we can finally stop 75 | self.stopping = False 76 | self.running = False 77 | elif not self.suspended: 78 | while self.running and not self.suspended and not self.queue.empty(): 79 | self._process() 80 | 81 | # Clean out the queue (in case we just aborted) 82 | self._clear() 83 | self.stopping = False # In case we were stopping while aborted 84 | 85 | # Note: The reason for the split of run() and accept_incoming(): 86 | # The entire system should first be accepting data before the individual 87 | # components start processing. When processing, a document is passed on 88 | # through sockets to listening connectors. If those connectors are not yet 89 | # accepting new items on their queues, incoming items will be dropped (i.e. 90 | # not put on the queue, and we would potentially lose the first items 91 | # during start-up. 92 | 93 | def run(self): 94 | "Should be called after all connectors in the system accept incoming data." 95 | if self.running: 96 | raise Exception("Connector is already running.") 97 | if not self.accepting: 98 | raise Exception("Connector is not accepting input before call to run(). Call accept_incoming() on all connectors in the system first.") 99 | 100 | self.aborted = False 101 | self.stopping = False 102 | self.suspended = False 103 | self.running = True 104 | 105 | self._thread = threading.Thread(target=self._run) 106 | self._thread.start() 107 | 108 | def accept_incoming(self): 109 | "Should be called for all connectors in the system before processes start running and processing!" 110 | if self.stopping: 111 | raise Exception("Connector is stopping. Refusing to accept new incoming again until fully stopped.") 112 | self.accepting = True 113 | 114 | def stop(self): 115 | self.accepting = False 116 | self.stopping = True # We must wait for items in the queue to be processed before we finally stop running 117 | if self._thread and self._thread.isAlive(): 118 | try: 119 | self._thread.join() # NOTE: Are we sure we want to wait for this ?? 120 | except: 121 | pass # Ignore 122 | self._thread = None 123 | 124 | def abort(self): 125 | self.aborted = True 126 | self.accepting = False 127 | self.running = False # Run loop will stop immediately 128 | 129 | def suspend(self): 130 | self.suspended = True 131 | 132 | def resume(self): 133 | self.suspended = False 134 | 135 | #endregion Operation management 136 | -------------------------------------------------------------------------------- /eslib/procs/RabbitmqWriter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | from .RabbitmqBase import RabbitmqBase 5 | from ..esdoc import tojson 6 | import time 7 | 8 | 9 | class RabbitmqWriter(Processor, RabbitmqBase): 10 | """ 11 | Write data to RabbitMQ. 
12 | Writes data with type 'str', 'unicode', 'int', or 'float'. Lists and dicts are written as 'json'. 13 | Other types are cast to 'str'. 14 | The 'type' registered with the metadata is then either 'str', 'unicode', 'int', 'float' or 'json'. 15 | 16 | Connectors: 17 | input (*) : Document to write to configured RabbitMQ. 18 | 19 | Config: 20 | host = localhost : 21 | port = 5672 : 22 | admin_port = 15672 : 23 | username = guest : 24 | password = guest : 25 | virtual_host = None : 26 | exchange = None : If specified, data is written to this 'exchange', and also 27 | persisted on a durable queue '_shared'. Clients can 28 | ask to listen to the exchange on this queue ('consumable' 29 | behaviour, the default), or to listen to a live stream on an 30 | exclusive queue that is a copy of all data meant only for that 31 | listener. Clients connected to the shared queue will consume data 32 | from it, thus splitting workload (intended) or competing for the 33 | same data (unintended). 34 | queue = "default" : Not used if 'exchange' is specified. 35 | persisting = True : When this is on, the exchange will store data in a queue until it 36 | is consumed by a consuming monitor. Otherwise, data will only be 37 | queued if there is a listener. 38 | max_reconnects = 3 : 39 | reconnect_timeout = 3 : 40 | max_queue_size = 100000 : If the output queue exceeds this number, this processor is considered congested. 41 | """ 42 | 43 | MAX_CONNECTOR_QUEUE_SIZE = 10000 44 | CHECK_QUEUE_INTERVAL = 5 # 5 seconds; how often to check whether the message queue is "congested" 45 | 46 | _is_reader = False # This is a writer 47 | 48 | def __init__(self, **kwargs): 49 | super(RabbitmqWriter, self).__init__(**kwargs) 50 | 51 | self._connector = self.create_connector(self._incoming, "input", None, "Document to write to configured RabbitMQ.") 52 | 53 | self.config.set_default( 54 | persisting = True, 55 | max_queue_size = 100000 56 | ) 57 | 58 | self._last_check_queue_time = 0 59 | self._last_known_queue_size = 0 60 | 61 | 62 | def on_open(self): 63 | self._last_check_queue_time = 0 64 | self._last_known_queue_size = 0 65 | 66 | self.count = 0 67 | self._open_connection() 68 | self.log.info("Connected to RabbitMQ.") 69 | 70 | def on_close(self): 71 | if self._close_connection(): 72 | self.log.info("Connection to RabbitMQ closed.") 73 | 74 | def _incoming(self, document): 75 | if document == None: 76 | return 77 | 78 | data = None 79 | msg_type = None 80 | if isinstance(document, basestring): 81 | data = document 82 | msg_type = type(document).__name__ 83 | elif isinstance(document, (int, long, float)): 84 | data = str(document) 85 | msg_type = type(document).__name__ 86 | elif isinstance(document, (list, dict)): 87 | try: 88 | data = tojson(document) 89 | except TypeError as e: 90 | self.doclog.error("JSON serialization failed: %s" % e.message) 91 | return 92 | msg_type = "json" 93 | else: 94 | data = str(document) 95 | msg_type = "str" #type(document).__name__ 96 | self.doclog.warning("Writing document of unsupported type '%s' as type 'str'." 
% type(document).__name__) 97 | 98 | if self._publish(msg_type, data): 99 | self.count += 1 100 | 101 | def is_congested(self): 102 | if super(RabbitmqWriter, self).is_congested(): 103 | return True 104 | if self._connector.queue.qsize() > self.MAX_CONNECTOR_QUEUE_SIZE: 105 | return True 106 | elif not self.config.exchange or self.config.persisting: 107 | if self.config.max_queue_size: 108 | now = time.time() 109 | if now - self._last_check_queue_time > self.CHECK_QUEUE_INTERVAL: 110 | try: 111 | self._last_known_queue_size = self.get_queue_size() 112 | except Exception as e: 113 | self.log.warning("Failed to get queue size for queue '%s': %s" % (self._queue_name, e)) 114 | self._last_check_queue_time = now 115 | 116 | if self._last_known_queue_size > self.config.max_queue_size: 117 | return True 118 | 119 | return False 120 | -------------------------------------------------------------------------------- /test/test_protocol_compliance.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from eslib import Processor, Terminal, Connector, Socket 3 | 4 | class TestProtocolCompliance(unittest.TestCase): 5 | 6 | # TEST mimic / passthrough protocols 7 | 8 | def test_protocol_equal(self): 9 | s = Socket("sock_a", "proto_a") 10 | c = Connector("conn_a", "proto_a") 11 | self.assertTrue(Terminal.protocol_compliance(s, c)) 12 | 13 | def test_protocol_not_equal(self): 14 | s = Socket("sock_a", "proto_b") 15 | c = Connector("conn_a", "proto_a") 16 | self.assertFalse(Terminal.protocol_compliance(s, c)) 17 | 18 | def test_protocol_general_accepts_special(self): 19 | s = Socket("sock_a", "general.special") 20 | c = Connector("conn_a", "general") 21 | self.assertTrue(Terminal.protocol_compliance(s, c)) 22 | 23 | def test_protocol_special_too_strict_for_general(self): 24 | s = Socket("sock_a", "general") 25 | c = Connector("conn_a", "general.special") 26 | self.assertFalse(Terminal.protocol_compliance(s, c)) 27 | 28 | def test_protocol_any_any(self): 29 | s = Socket("sock_a", None) 30 | c = Connector("conn_a", None) 31 | self.assertTrue(Terminal.protocol_compliance(s, c)) 32 | 33 | def test_protocol_any_sock(self): 34 | s = Socket("sock_a", None) 35 | c = Connector("conn_a", "x") 36 | self.assertTrue(Terminal.protocol_compliance(s, c)) 37 | 38 | def test_protocol_any_conn(self): 39 | s = Socket("sock_a", "x") 40 | c = Connector("conn_a", None) 41 | self.assertTrue(Terminal.protocol_compliance(s, c)) 42 | 43 | def test_protocol_mimic(self): 44 | a_s = Socket ("sock_a", "esdoc.tweet") 45 | b_c = Connector("conn_b", "esdoc") 46 | b_s = Socket ("sock_b", "esdoc", mimic=b_c) # Should end up mimicing 'esdoc.tweet' from a_s if connected 47 | c_c = Connector("conn_c", "esdoc.tweet") 48 | 49 | # Only unidirectional attachment needed for this test 50 | b_c.attach(a_s) 51 | 52 | print "b_s proto =", b_s.protocol 53 | print "b_s mimiced proto =", b_s.mimiced_protocol 54 | comply = Terminal.protocol_compliance(b_s, c_c) 55 | print "compiance=", comply 56 | 57 | self.assertTrue(b_s.mimiced_protocol == "esdoc.tweet") 58 | 59 | self.assertTrue(Terminal.protocol_compliance(a_s, b_c)) 60 | self.assertTrue(Terminal.protocol_compliance(b_s, c_c)) 61 | 62 | def test_protocol_mimic_no_connection(self): 63 | a_s = Socket ("sock_a", "esdoc.tweet") 64 | b_c = Connector("conn_b", "esdoc") 65 | b_s = Socket ("sock_b", "esdoc", mimic=b_c) # Should end up mimicing 'esdoc.tweet' from a_s if connected 66 | c_c = Connector("conn_c", "esdoc.tweet") 67 | 68 | print "b_s proto =", 
b_s.protocol 69 | print "b_s mimiced proto =", b_s.mimiced_protocol 70 | comply = Terminal.protocol_compliance(b_s, c_c) 71 | print "compiance=", comply 72 | 73 | self.assertTrue(b_s.mimiced_protocol == "esdoc") 74 | 75 | self.assertTrue(Terminal.protocol_compliance(a_s, b_c)) 76 | self.assertFalse(Terminal.protocol_compliance(b_s, c_c)) 77 | 78 | def test_protocol_mimic_sequence(self): 79 | a_s = Socket ("sock_a", "esdoc.tweet") 80 | 81 | b_c = Connector("conn_b", "esdoc") 82 | b_s = Socket ("sock_b", "esdoc", mimic=b_c) 83 | 84 | c_c = Connector("conn_c", "esdoc.tweet") 85 | c_s = Socket ("sock_b", "esdoc", mimic=c_c) 86 | 87 | print "NOT ATTACHED:" 88 | print "b_s proto =", b_s.protocol 89 | print "c_s proto =", b_s.protocol 90 | print "b_s mimiced proto =", c_s.mimiced_protocol 91 | print "c_s mimiced proto =", c_s.mimiced_protocol 92 | 93 | self.assertTrue(c_s.mimiced_protocol == "esdoc") 94 | 95 | # Only unidirectional attachments needed for this test 96 | b_c.attach(a_s) 97 | c_c.attach(b_s) 98 | 99 | print "\nATTACHED:" 100 | print "b_s proto =", b_s.protocol 101 | print "c_s proto =", c_s.protocol 102 | print "b_s mimiced proto =", b_s.mimiced_protocol 103 | print "c_s mimiced proto =", c_s.mimiced_protocol 104 | 105 | self.assertTrue(c_s.mimiced_protocol == "esdoc.tweet") 106 | 107 | def test_protocol_mimic_circular(self): 108 | a_s = Socket ("sock_a", "esdoc.tweet") 109 | 110 | b_c = Connector("conn_b", "esdoc") 111 | b_s = Socket ("sock_b", "esdoc", mimic=b_c) 112 | 113 | c_c = Connector("conn_c", "esdoc.tweet") 114 | c_s = Socket ("sock_b", "esdoc", mimic=c_c) 115 | 116 | # Only unidirectional attachments needed for this test 117 | b_c.attach(c_s) # Making it circular 118 | c_c.attach(b_s) 119 | 120 | print "\nATTACHED:" 121 | print "b_s proto =", b_s.protocol 122 | print "c_s proto =", c_s.protocol 123 | print "b_s mimiced proto =", b_s.mimiced_protocol 124 | print "c_s mimiced proto =", c_s.mimiced_protocol 125 | 126 | self.assertTrue(b_s.mimiced_protocol == "esdoc") 127 | 128 | # And most important, it does not enter an infinite loop and finally gets here.. 129 | 130 | def main(): 131 | unittest.main() 132 | 133 | if __name__ == "__main__": 134 | main() 135 | -------------------------------------------------------------------------------- /eslib/procs/PatternRemover.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Processor import Processor 4 | from .. import esdoc 5 | import re 6 | 7 | class PatternRemover(Processor): 8 | """ 9 | Remove text using a regex pattern. 10 | 11 | Connectors: 12 | input (esdoc) (default) : Incoming document in 'esdoc' dict format. 13 | str (str) : Incoming document of type 'str' or 'unicode'. 14 | Sockets: 15 | output (esdoc) (default) : Output of documents that arrived on 'input' connector. 16 | str (str) : Output of documents that arrived on 'str' connector. 17 | 18 | Config: 19 | source_field = "text" : Part of twitter dev credentials. 20 | target_field = None : Defaults to 'source_field', replacing the input field. 21 | field_map = {} : A dict of fields to use as { source : target }. 22 | If specified, this *replaces* the source_field and target_field pair! 23 | pattern = None : Pattern to apply. (All 'patterns' are also applied, if specified.) 24 | patterns = [] : List of patterns to apply. ('pattern' will be applied first, if it exists.) 25 | regex_options = DOTALL|IGNORECASE|MULTILINE|UNICODE 26 | : Options for *all* regex patterns. 
27 | strip = True : Remove boundary spaces and double spaces, commonly left after a removal. 28 | """ 29 | 30 | def __init__(self, **kwargs): 31 | super(PatternRemover, self).__init__(**kwargs) 32 | 33 | m = self.create_connector(self._incoming_esdoc, "input", "esdoc", "Incoming 'esdoc'.", is_default=True) 34 | self.create_connector(self._incoming_str , "str" , "str" , "Incoming document of type 'str' or 'unicode'.") 35 | self.output_esdoc = self.create_socket("output" , "esdoc" , "Outgoing, cleaned, 'esdoc'.", is_default=True, mimic=m) 36 | self.output_str = self.create_socket("str" , "str" , "Outgoing, cleaned, 'str'.") 37 | 38 | self.config.set_default( 39 | source_field = "text", 40 | target_field = None, 41 | field_map = {}, 42 | pattern = None, 43 | patterns = [], 44 | regex_options = re.DOTALL|re.IGNORECASE|re.MULTILINE|re.UNICODE, 45 | strip = True 46 | ) 47 | 48 | self._regexes = [] 49 | self._field_map = {} 50 | 51 | def on_open(self): 52 | """ 53 | :raises ValueError, if failed to parse a pattern as regex 54 | """ 55 | 56 | # Create list of regexes 57 | patterns = [] 58 | if self.config.pattern: 59 | patterns = [self.config.pattern] 60 | if self.config.patterns: 61 | patterns.extend(self.config.patterns) 62 | self._regexes = [] 63 | for pattern in patterns: 64 | try: 65 | regex = re.compile(r"(%s)" % pattern, self.config.regex_options) 66 | self._regexes.append(regex) 67 | except Exception as e: 68 | raise ValueError("Error parsing pattern: %s\nPattern was: %s" % (e.message, pattern)) 69 | 70 | # Create field map 71 | self._field_map = self.config.field_map or {} 72 | if not self._field_map: 73 | if not self.config.source_field: 74 | raise ValueError("Neither field_map nor source_field is configured.") 75 | self._field_map[self.config.source_field] = (self.config.target_field or self.config.source_field) 76 | 77 | 78 | def _clean_text(self, text): 79 | for regex in self._regexes: 80 | text = regex.sub("", text) 81 | if self.config.strip: 82 | text = text.strip().replace(" ", " ") 83 | return text 84 | 85 | def _clean(self, doc): 86 | 87 | if not doc or not self._regexes: 88 | return doc 89 | 90 | # This makes this method work also for 'str' and 'unicode' type documents; not only for the expected 'esdoc' protocol (a 'dict'). 91 | if type(doc) in [str, unicode]: 92 | cleaned = self._clean_text(doc) 93 | return cleaned 94 | elif not type(doc) is dict: 95 | self.doclog.debug("Unsupported document type '%s'." % type(doc)) 96 | return doc 97 | 98 | source = doc.get("_source") 99 | if not source: 100 | return doc # Missing source section; don't do anything 101 | 102 | for source_field, target_field in self._field_map.iteritems(): 103 | text = esdoc.getfield(source, source_field) 104 | if text and type(text) in [str, unicode]: 105 | cleaned = self._clean_text(text) 106 | if cleaned != text: 107 | # Note: This may lead to a few strictly unnecessary shallow clonings... 108 | doc = esdoc.shallowputfield(doc, "_source." 
+ target_field, cleaned) 109 | return doc 110 | 111 | def _incoming_esdoc(self, doc): 112 | if self.output_esdoc.has_output: 113 | self.output_esdoc.send(self._clean(doc)) 114 | 115 | def _incoming_str(self, doc): 116 | if self.output_str.has_output: 117 | self.output_str.send(self._clean(doc)) 118 | -------------------------------------------------------------------------------- /test/test_procs/test_blacklist_filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from eslib.procs import BlacklistFilter 5 | 6 | class TestBlacklistFilter_str(unittest.TestCase): 7 | 8 | def test_str_nohit(self): 9 | s = "I am marvellous" 10 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 11 | p.on_open() 12 | check = p._check(s) 13 | 14 | print "str_nohit (exp:True)=", check 15 | self.assertTrue(check) 16 | 17 | def test_str_hit_but_not_blacklisted(self): 18 | s = "I like girls." 19 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 20 | print "filters=", p._filters 21 | p.on_open() 22 | check = p._check(s) 23 | 24 | print "str_hit_but_not_blacklisted (exp:True)=", check 25 | self.assertTrue(check) 26 | 27 | def test_str_hit_and_blacklisted(self): 28 | s = "I like young girls." 29 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 30 | print "filters=", p._filters 31 | p.on_open() 32 | check = p._check(s) 33 | 34 | print "str_hit_and_blacklisted (exp:False)=", check # Should have hit "young" from blacklist 35 | self.assertFalse(check) 36 | 37 | def test_str_global_whitelist_override(self): 38 | s = "We only like girls. Young girls are always welcome!" 39 | p = BlacklistFilter(filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}], whitelist=["young girls"]) 40 | p.on_open() 41 | check = p._check(s) 42 | 43 | print "str_global_whitelist_override (exp:True)=", check 44 | # Should have hit "young" from blacklist, but "young girls" from whitelist should override it 45 | self.assertTrue(check) 46 | 47 | 48 | def test_brooklyn(self): 49 | s = "Brooklyn Nets trounce short-handed Oklahoma City Thunder 116-85 http://t.co/qJZPBEJRCT" 50 | p = BlacklistFilter(filters=[{"tokens": ["nets"], "blacklist": ["brooklyn"]}]) 51 | p.on_open() 52 | check = p._check(s) 53 | 54 | print "check (expect False)=", check 55 | self.assertFalse(check) 56 | 57 | 58 | class TestBlacklistFilter_esdoc(unittest.TestCase): 59 | 60 | # check == True means the document was NOT filtered out, i.e. it PASSED the filter 61 | 62 | def test_str_nohit(self): 63 | s = "I am marvellous" 64 | doc = {"_source": {"field1": s}} 65 | p = BlacklistFilter( 66 | field="field1", 67 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 68 | p.on_open() 69 | check = p._check(doc) 70 | 71 | print "str_nohit (exp:False)=", check 72 | self.assertTrue(check) 73 | 74 | def test_str_hit_but_not_blacklisted(self): 75 | s = "I like girls." 76 | doc = {"_source": {"field1": s}} 77 | p = BlacklistFilter( 78 | fields=["field1", "field2"], 79 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 80 | print "filters=", p._filters 81 | p.on_open() 82 | check = p._check(doc) 83 | 84 | print "str_hit_but_not_blacklisted (exp:False)=", check 85 | self.assertTrue(check) 86 | 87 | def test_str_hit_and_blacklisted(self): 88 | s1 = "I like young girls." 89 | s2 = "I am a boy." 
90 | doc = {"_source": {"field1": s1, "field2": s2}} 91 | p = BlacklistFilter( 92 | fields=["field1", "field2"], 93 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}]) 94 | print "filters=", p._filters 95 | p.on_open() 96 | check = p._check(doc) 97 | 98 | print "str_hit_and_blacklisted (exp:False)=", check # Should have hit "young" from blacklist 99 | self.assertFalse(check) 100 | 101 | def test_str_global_whitelist_override(self): 102 | s1 = "We only like girls. Young girls are always welcome!" 103 | s2 = "I like young boys." 104 | doc = {"_source": {"field1": s1, "field2": s2}} 105 | p = BlacklistFilter( 106 | fields=["field1", "field2"], 107 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}], 108 | whitelist=["young girls"]) 109 | p.on_open() 110 | check = p._check(doc) 111 | 112 | print "str_global_whitelist_override (exp:True)=", check 113 | # Should have hit "young" from blacklist, but "young girls" from whitelist should override it 114 | self.assertTrue(check) 115 | 116 | def test_str_global_whitelist_override_not_hitting(self): 117 | s1 = "We only like girls. Young girls are always welcome!" 118 | s2 = "I like young boys." 119 | doc = {"_source": {"field1": s1, "field2": s2}} 120 | p = BlacklistFilter( 121 | fields=["field2"], 122 | filters=[{"tokens": ["we", "like"], "blacklist": ["young"]}], 123 | whitelist=["young girls"]) 124 | p.on_open() 125 | check = p._check(doc) 126 | 127 | print "str_global_whitelist_override_not_hitting (exp:False)=", check 128 | # Should have hit "young" from blacklist; "young girls" from whitelist does not apply to field2, so we should not override here 129 | self.assertFalse(check) 130 | 131 | def main(): 132 | unittest.main() 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /eslib/procs/Neo4jWriter.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mats' 2 | 3 | from itertools import izip 4 | import time, logging 5 | 6 | from ..Generator import Generator 7 | from .neo4j import Neo4j 8 | 9 | 10 | class Neo4jWriter(Generator): 11 | """ 12 | This is a pipeline step which primary function is to push an edge 13 | between the author of a tweet to all the people mentioned in the tweet. 14 | 15 | Connectors: 16 | edge (graph-edge) : Edge object to write. 17 | user (graph-user) : User object to write. 18 | 19 | Config: 20 | batchsize = 20 : How many IDs to gather up before making a call to Neo4j. 21 | batchtime = 5.0 : How many seconds to wait before we send a batch if it is not full. 22 | host = localhost: The host we should connect to 23 | port = 7474 : The default neo4j port 24 | 25 | """ 26 | 27 | def __init__(self, **kwargs): 28 | super(Neo4jWriter, self).__init__(**kwargs) 29 | self.create_connector(self._incoming_edge, "edge", "graph-edge") 30 | self.create_connector(self._incoming_user, "user", "graph-user") 31 | 32 | self.config.set_default( 33 | batchsize = 20, 34 | batchtime = 5, 35 | host = "localhost", 36 | port = 7474 37 | ) 38 | 39 | self._neo4j = None 40 | 41 | # This could be better 42 | self._edge_queue = [] 43 | self._last_edge_commit = time.time() 44 | self._user_queue = [] 45 | self._last_user_commit = time.time() 46 | 47 | def on_open(self): 48 | """ 49 | Instantiates both a neo4j-instance and a twitter-instance. 
50 | 51 | Raises: 52 | - ConnectionError if neo4j can't contact its server 53 | - Exception if twitter can't authenticate properly 54 | 55 | """ 56 | 57 | # TODO: Need logging, request timeout and exception handling down there: 58 | self.log.debug("Connecting to Neo4j.") 59 | self._neo4j = Neo4j(host=self.config.host, port=self.config.port) 60 | self.log.status("Connected to Neo4j on %s:%s." % (self.config.host, self.config.port)) 61 | 62 | def _incoming_edge(self, document): 63 | """ 64 | Takes an edge and puts it's correct query in the queue. 65 | 66 | Args: 67 | document: A dict with "from", "to" and "type" as fields. 68 | 69 | The ambition is that this Processor should never go down no matter 70 | what happens to a document in this method. 71 | 72 | """ 73 | try: 74 | from_id = document["from"] 75 | to_id = document["to"] 76 | edge_type = document["type"] 77 | except KeyError: 78 | self.doclog.exception("Unable to parse document: %s" % str(document)) 79 | else: 80 | query = self._neo4j.get_edge_query(from_id, edge_type, to_id) 81 | self._edge_queue.append(query) 82 | 83 | def _incoming_user(self, document): 84 | if self.doclog.isEnabledFor(logging.TRACE): 85 | self.doclog.trace("Incoming user '%s' ('%s')." % (document["screen_name"], document["id"])) 86 | query, params = self._neo4j.get_node_merge_query(document) 87 | self._user_queue.append((query, params)) 88 | 89 | def on_tick(self): 90 | """ 91 | Commit items in queue if queue exceeds batchsize or it's been long 92 | since last commit. 93 | 94 | """ 95 | now = time.time() 96 | if ((len(self._edge_queue) >= self.config.batchsize) or 97 | (now - self._last_edge_commit >= self.config.batchtime and 98 | self._edge_queue)): 99 | self._edge_send() 100 | 101 | if ((len(self._user_queue) >= self.config.batchsize) or 102 | ((now - self._last_user_commit >= self.config.batchtime) and 103 | self._user_queue)): 104 | self._user_send() 105 | 106 | def on_shutdown(self): 107 | """ Clear out the rest of the items in the queue """ 108 | self.log.info("Processing remaining edge queue.") 109 | while self._edge_queue: 110 | self._edge_send() 111 | self.log.info("Processing remaining user queue.") 112 | while self._user_queue: 113 | self._user_send() 114 | 115 | def _edge_send(self): 116 | num_edges = len(self._edge_queue) 117 | if num_edges > self.config.batchsize: 118 | num_edges = self.config.batchsize 119 | 120 | rq = self._neo4j._build_rq(self._edge_queue[:num_edges]) 121 | self._neo4j.commit(rq) 122 | self.log.debug("Committed %i edges." 
% num_edges)
123 |         self._edge_queue = self._edge_queue[num_edges:]
124 |         self._last_edge_commit = time.time()
125 | 
126 |     def _user_send(self):
127 |         num_users = len(self._user_queue)
128 |         if num_users > self.config.batchsize:
129 |             num_users = self.config.batchsize
130 | 
131 |         users, params = [list(t)
132 |                          for t in
133 |                          izip(*self._user_queue[:num_users])]
134 | 
135 |         rq = self._neo4j._build_rq(users, params)
136 |         self._neo4j.commit(rq)
137 |         self.log.debug("Committed %i users." % num_users)
138 |         self._user_queue = self._user_queue[num_users:]
139 |         self._last_user_commit = time.time()
140 | 
--------------------------------------------------------------------------------
/eslib/procs/KafkaMonitor.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 | 
3 | from ..Monitor import Monitor
4 | from pykafka import KafkaClient
5 | import json, time
6 | import logging
7 | import zlib
8 | 
9 | 
10 | class KafkaMonitor(Monitor):
11 |     """
12 |     Monitor a Kafka topic.
13 |     Assumes data of type 'str', 'unicode', 'int', 'float' or 'json' from Kafka.
14 |     Incoming documents are attempted deserialized into these types. Unknown types are passed as 'str'.
15 | 
16 |     Sockets:
17 |         output    (*)    : Document received on the monitored topic.
18 | 
19 |     Config:
20 |         hosts            = ["localhost:9092"]   : List of Kafka hosts.
21 |         zookeeper_hosts  = ["localhost:2181"]   : For balanced consumption via zookeeper.
22 |         topic            = "default_topic"      : Kafka topic to monitor.
23 |         consumer_group   = "default_group"      : Balanced consumer group.
24 |         compression      = False                : Whether to decompress the data read from Kafka.
25 |     """
26 | 
27 |     CONGESTION_SLEEP_TIME = 10.0
28 |     WORK_TIME = 5.0
29 | 
30 |     def __init__(self, **kwargs):
31 |         super(KafkaMonitor, self).__init__(**kwargs)
32 | 
33 |         self.output = self.create_socket("output", None, "Document received on the monitored topic.")
34 | 
35 |         self.config.set_default(
36 |             hosts            = ["localhost:9092"],
37 |             zookeeper_hosts  = ["localhost:2181"],
38 |             topic            = "default_topic",
39 |             consumer_group   = "default_group",
40 |             compression      = False
41 |         )
42 | 
43 |         self._client = None
44 |         self._consumer = None
45 | 
46 |     #region Processor stuff
47 | 
48 |     def on_open(self):
49 |         self.count = 0
50 |         self._client = KafkaClient(",".join(self.config.hosts))
51 |         topic = self._client.topics[self.config.topic]
52 |         self._consumer = topic.get_balanced_consumer(
53 |             auto_commit_enable = True,
54 |             consumer_group     = self.config.consumer_group,
55 |             zookeeper_connect  = ",".join(self.config.zookeeper_hosts)
56 |         )
57 | 
58 |         self.log.info("Connected to Kafka topic '%s', balanced via zookeeper." % self.config.topic)
59 | 
60 |     def on_close(self):
61 |         if self._client:
62 |             self._consumer.stop()
63 |             #del self._consumer
64 |             self.log.info("Kafka consumer stopped.")
65 |             # Can't find any way to close the connection or ask it to release resources, so I try a 'del'.
66 |             #del self._client
67 |             self._client = None
68 |             self.log.debug("Connection to Kafka deleted.")
69 | 
70 |     #endregion Processor stuff
71 | 
72 |     #region Generator stuff
73 | 
74 |     def on_startup(self):
75 |         self.count = 0
76 | 
77 |     def on_tick(self):
78 | 
79 |         congested = self.congestion()
80 |         if congested:
81 |             self.log.debug("Congestion in dependent processor '%s'; sleeping %d seconds." % (congested.name, self.CONGESTION_SLEEP_TIME))
82 |             self.congestion_sleep(self.CONGESTION_SLEEP_TIME)
83 |         else:
84 |             # Read as much as we can for WORK_TIME seconds, then return to the controlling
85 |             # loop. This way this processor should hang a maximum of WORK_TIME seconds
86 |             # before accepting control commands.
87 |             start_time = time.time()
88 |             while True:
89 |                 if self.end_tick_reason:
90 |                     return
91 |                 if time.time() - start_time > self.WORK_TIME:
92 |                     self.log.debug("Work time exceeded %s seconds. Returning to control loop." % self.WORK_TIME)
93 |                     return
94 |                 try:
95 |                     kafka_message = self._consumer.consume(block=False)
96 |                 except Exception as e:
97 |                     self.log.error("Error consuming Kafka. Aborting. [%s]" % e.__class__.__name__)
98 |                     self.abort()
99 |                     return
100 |                 if kafka_message is None:
101 |                     return
102 | 
103 |                 self.count += 1
104 | 
105 |                 if not self.output.has_output:  # Don't bother with further message processing, in this case.
106 |                     return
107 | 
108 |                 document = self._decode_message(kafka_message.value)
109 |                 if document is not None:
110 |                     self.output.send(document)
111 | 
112 |     def _decode_message(self, kafka_data):
113 | 
114 |         # print "INCOMING KAFKA DATA: [%s]" % kafka_data
115 | 
116 |         if not kafka_data:
117 |             return None
118 | 
119 |         if self.config.compression:
120 |             kafka_data = zlib.decompress(kafka_data)
121 | 
122 |         msg_type = None
123 |         document = None
124 |         try:
125 |             jj = json.loads(kafka_data)
126 |             # kafka_data = tojson({"type": msg_type, "data": data})
127 |         except (TypeError, ValueError) as e:
128 |             self.doclog.warning("JSON deserialization failed: %s" % e.message)
129 |             return None
130 |         msg_type = jj.get("type")
131 |         document = jj.get("data")
132 |         if not msg_type or document is None:
133 |             return None
134 | 
135 |         if self.log.isEnabledFor(logging.TRACE):
136 |             self.log.trace("Received message of type '%s', Kafka payload size = %d." % (msg_type, len(kafka_data)))
137 |         return document
138 | 
139 |     #endregion Generator stuff
140 | 
--------------------------------------------------------------------------------
/eslib/procs/TcpWriter.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Hans Terje Bakke'
2 | 
3 | from ..Generator import Generator
4 | from ..esdoc import tojson
5 | import socket
6 | from select import select
7 | 
8 | 
9 | class TcpWriter(Generator):
10 |     """
11 |     Write incoming documents to a TCP port.
12 |     Documents of type 'str' and 'unicode' are written as-is. Other types are written as JSON.
13 | 
14 |     NOTE: This processor operates as a Generator, but is considered to be passive; hence keepalive defaults to False.
15 | 
16 |     Connectors:
17 |         input    (*)    : Incoming documents to write to a TCP socket.
18 | 
19 |     Config:
20 |         hostname       = ""      : Defaults to any address the machine happens to have. Use "localhost" to enforce local only.
21 |         port           = 4000    :
22 |         reuse_address  = False   : Whether to allow reusing an existing TCP address/port.
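
    A minimal usage sketch (illustrative; 'producer' stands for any upstream processor
    with a compatible default socket):

        writer = TcpWriter(hostname="localhost", port=4000)
        writer.subscribe(producer)   # read documents from the producer's default socket
        writer.start()               # accept TCP clients and write incoming documents to them
        ...
        writer.stop()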
23 | """ 24 | def __init__(self, **kwargs): 25 | super(TcpWriter, self).__init__(**kwargs) 26 | self.create_connector(self._incoming, "input", None, "Incoming documents to write to a TCP socket.") 27 | 28 | self.keepalive = False # Passive of nature, hence this default 29 | 30 | self.config.set_default( 31 | hostname = "", 32 | port = 4000, 33 | reuse_address = False 34 | ) 35 | 36 | self._connections = [] # List of (socket, address) pairs 37 | self._socket = None 38 | 39 | def on_open(self): 40 | self._socket = None 41 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 42 | if self.config.reuse_address: 43 | sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 44 | address = (self.config.hostname #or socket.gethostname() 45 | , self.config.port) 46 | try: 47 | sock.bind(address) 48 | #sock.setblocking(0) 49 | sock.listen(0) # No backlog limit 50 | self.log.info("Listening for connections on %s:%d." % address) 51 | except socket.error as e: 52 | self.log.critical("Listener failed to bind to %s:%d. (errno=%d, message=%s)" % (self.config.hostname, self.config.port, e.errno, e.args[1])) 53 | raise e 54 | 55 | self._connections = [] 56 | self._socket = sock 57 | 58 | self.total = 0 59 | 60 | def on_close(self): 61 | if self._connections: 62 | for c in self._connections: 63 | s, a = c 64 | s.close() 65 | self._connections = [] 66 | if self._socket: 67 | self._socket.close() 68 | self._socket = None 69 | self.log.info("Listener closed.") 70 | 71 | @staticmethod 72 | def _get_conn(connections, sock): 73 | for c in connections: 74 | if c[0] == sock: 75 | return c 76 | return None 77 | 78 | def on_tick(self): 79 | if not self.running or self.stopping: 80 | return 81 | 82 | r, w, e = select([self._socket], [], [self._socket], 0) # Non-blocking 83 | if e: 84 | self.log.warning("Error on server socket -- now what?") 85 | if r: 86 | # We have one or more new connections pending. Get one and return to run loop. 87 | c = self._socket.accept() 88 | s, a = c 89 | self.log.info("New connection from %s:%d." % a) 90 | self._connections.append(c) 91 | 92 | # Check for dead connections 93 | connections = self._connections[:] 94 | sockets = [s for s,a in connections] 95 | r, w, e = select(sockets, [], sockets, 0) 96 | if e: 97 | self.log.warning("Error on connected socket -- now what?") 98 | for s in r: 99 | # This socket is intended for write only, but since there is now data, 100 | # we read a bit just to work down the input buffer. If it is empty, getting 101 | # here means the connection has been closed on the other end, and we can remove it. 102 | data = s.recv(1024) 103 | if not data: 104 | s.close() 105 | c = self._get_conn(connections, s) 106 | if c and c in self._connections: 107 | self.log.info("Connection closed by client %s:%d." % c[1]) 108 | self._connections.remove(c) 109 | else: 110 | self.log.info("Unknown connection closed by client.") 111 | 112 | def _send(self, data): 113 | connections = self._connections[:] 114 | for c in connections: 115 | s, a = c 116 | try: 117 | s.sendall((data + "\n").encode("utf8")) 118 | #s.flush() 119 | except socket.error as e: 120 | if e.errno == socket.errno.EPIPE: # Broken pipe 121 | self.log.info("Connection closed by client %s:%d. (Broken pipe)" % a) 122 | else: 123 | self.log.error("Unhandled error writing to socket from %s:%d. Disconnecting. 
(errno=%d, message=%s)" % 124 | (a[0], a[1], e.errno, e.args[1])) 125 | self._connections.remove(c) 126 | 127 | def _incoming(self, document): 128 | if document: 129 | data = document 130 | if not type(document) in [str, unicode]: 131 | data = tojson(document) 132 | self._send(data) 133 | 134 | self.count += 1 135 | self.total += 1 136 | -------------------------------------------------------------------------------- /test/test_procs/test_entity_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'Hans Terje Bakke' 4 | 5 | import unittest 6 | from eslib.procs.EntityExtractor import EntityExtractor 7 | from eslib import esdoc 8 | from eslib import unique 9 | 10 | 11 | class TestEntityExtractor(unittest.TestCase): 12 | entities = \ 13 | [ 14 | { 15 | "category": "webpage", 16 | "name": "nrk", 17 | "match": [ 18 | { "type": "exact", "pattern": "nrk.no" }, 19 | #{ "type": "iprange", "value": "160.68.205.231/16" } 20 | ] 21 | }, 22 | { 23 | "category": "targets", 24 | "name": "comperio", 25 | "match": [ 26 | { "type": "exact", "pattern": u"hans terje bøkke", "weight": 0.8 }, 27 | { "type": "exact", "pattern": "10.0.0.100", "weight": 0.5 }, 28 | { "type": "exact", "pattern": "comperio" } 29 | ] 30 | }, 31 | { 32 | "category": "targets", 33 | "name": "IBM", 34 | "match": [ 35 | { "type": "exact", "pattern": "ibm" } 36 | ] 37 | }, 38 | { 39 | "category": "creditcards", 40 | "name": "creditcard", # The name should become the credit card number 41 | "match": [ { "type": "creditcard" } ] 42 | }, 43 | { 44 | "category": "emails", 45 | "name": "email", # The name should become the email address 46 | "match": [ { "type": "email" } ] 47 | }, 48 | ] 49 | 50 | def test_defaults(self): 51 | ex = EntityExtractor() 52 | ex.on_open() 53 | 54 | self.assertEqual(ex.config.fields, []) 55 | self.assertEqual(ex.config.target, "entities") 56 | self.assertEqual(ex.config.entities, []) 57 | 58 | def test_extract_str(self): 59 | ex = EntityExtractor() 60 | ex.config.entities = self.entities 61 | ex.on_open() 62 | 63 | s = u"As mentioned ø on nrk.no, Hans Terje Bøkke works for Comperio. His PC has IP address 10.0.0.100. " + \ 64 | "He never uses his credit card: 1234.5678.9876.5432. You can contact him on " + \ 65 | "hans.terje.bakke@gmail.com. But balle.klorin@wesenlund.no will not work for IBM." 66 | 67 | extracted = ex._extract(None, s) 68 | elist = list(extracted) 69 | 70 | for e in elist: 71 | print e 72 | 73 | self.assertEqual(len(elist), 8) 74 | 75 | 76 | def _verify(self, entities): 77 | webpages = unique([x["name"] for x in entities["webpage"]]) 78 | targets = unique([x["name"] for x in entities["targets"]]) 79 | emails = unique([x["name"] for x in entities["emails"]]) 80 | creditcards = unique([x["name"] for x in entities["creditcards"]]) 81 | 82 | print "WEBPAGE:", webpages 83 | print "TARGETS:", targets 84 | print "EMAILS :", emails 85 | print "CREDITC:", creditcards 86 | 87 | self.assertEqual(['nrk'], webpages) 88 | self.assertEqual(['comperio', 'IBM'], targets) 89 | self.assertEqual(['hans.terje.bakke@gmail.com', 'balle.klorin@wesenlund.no'], emails) 90 | self.assertEqual(['1234.5678.9876.5432'], creditcards) 91 | 92 | def test_merge(self): 93 | ex = EntityExtractor() 94 | ex.config.entities = self.entities 95 | ex.on_open() 96 | 97 | s = "As mentioned on nrk.no, Hans Terje Bakke works for Comperio. His PC has IP address 10.0.0.100. " + \ 98 | "He never uses his credit card: 1234.5678.9876.5432. 
You can contact him on " + \ 99 | "hans.terje.bakke@gmail.com. But balle.klorin@wesenlund.no will not work for IBM." 100 | 101 | extracted = ex._extract(None, s) 102 | entities = ex._merge(extracted) 103 | 104 | self._verify(entities) 105 | 106 | def test_doc_through(self): 107 | 108 | ex = EntityExtractor() 109 | ex.config.entities = self.entities 110 | 111 | doc = {"_id": "123", "_source": { 112 | "field1": "As mentioned on nrk.no, Hans Terje Bakke works for Comperio.", 113 | "field2": "He never uses his credit card: 1234.5678.9876.5432.", 114 | "field3": "You can contact him on hans.terje.bakke@gmail.com.", 115 | "subsection" : { 116 | "subfield": "But balle.klorin@wesenlund.no will not work for IBM." 117 | }, 118 | "entities": { "old" : "stuff" } 119 | }} 120 | 121 | ex.config.fields = ["field1", "field2", "field3", "subsection.subfield"] 122 | 123 | output = [] 124 | ex.add_callback(lambda proc, doc: output.append(doc)) 125 | ex.start() 126 | ex.put(doc) 127 | ex.stop() 128 | ex.wait() 129 | 130 | #print output[0] 131 | 132 | new_doc = output[0] 133 | entities = new_doc["_source"]["entities"] 134 | 135 | self._verify(entities) 136 | 137 | # Check that old and new doc are not the same 138 | self.assertFalse(doc is new_doc) 139 | 140 | # Check that the previous entities still exist in the new document 141 | old = esdoc.getfield(new_doc, "_source.entities.old") 142 | self.assertEqual(old, "stuff") 143 | 144 | # Check that the new entities do not exist in the original document 145 | self.assertTrue(esdoc.getfield(doc, "_source.entities.webpage") is None) 146 | self.assertTrue(esdoc.getfield(new_doc, "_source.entities.webpage") is not None) 147 | -------------------------------------------------------------------------------- /test/test_connections.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from eslib import Processor 3 | 4 | class Connections(object): 5 | 6 | def create_processors(self): 7 | self.a = Processor(name="processor_a") 8 | self.b = Processor(name="processor_b") 9 | self.c = Processor(name="processor_c") 10 | self.d = Processor(name="processor_d") 11 | 12 | def create_terminals(self): 13 | self.a.create_connector(None, "input") # Protocol anything 14 | self.a.create_socket("output", "proto_doc") 15 | self.b.create_connector(None, "input", "proto_doc") 16 | self.b.create_socket("output_doc", "proto_doc") 17 | self.b.create_socket("output_str", "proto_str") 18 | self.c.create_connector(None, "input_doc", "proto_doc") 19 | self.c.create_connector(None, "input_str", "proto_str") 20 | self.c.create_socket("output_doc", "proto_doc") 21 | self.c.create_socket("output_ext", "proto_doc.extended") 22 | self.c.create_socket("output_anything") 23 | self.d.create_connector(None, "input_anything") 24 | self.d.create_connector(None, "input_doc", "proto_doc") 25 | self.d.create_connector(None, "input_ext", "proto_doc.extended") 26 | 27 | def connect_terminals(self): 28 | self.b.subscribe(self.a) # Ok call, only one socket and connector 29 | self.c.subscribe(self.b, "output_doc", "input_doc") # Ok 30 | self.c.subscribe(self.a, connector_name="input_doc") # Ok, a's only socket name can be omitted 31 | self.d.subscribe(self.c, "output_doc", "input_anything") # Ok, any input accepted 32 | self.d.subscribe(self.c, "output_ext", "input_ext") # Ok, protocol exact match 33 | 34 | 35 | class TestConnections(unittest.TestCase, Connections): 36 | 37 | def test_create_processors(self): 38 | self.create_processors() 39 | 40 | 
self.assertIsNotNone(self.a, "Processor a None") 41 | self.assertIsNotNone(self.b, "Processor b None") 42 | self.assertIsNotNone(self.c, "Processor c None") 43 | self.assertIsNotNone(self.d, "Processor d None") 44 | 45 | def test_create_terminals(self): 46 | self.create_processors() 47 | self.create_terminals() 48 | 49 | self.assertTrue(len(self.a.connectors) == 1, "Expected 1 connector for a") 50 | self.assertTrue(len(self.b.connectors) == 1, "Expected 1 connector for b") 51 | self.assertTrue(len(self.c.connectors) == 2, "Expected 2 connectors for c") 52 | self.assertTrue(len(self.d.connectors) == 3, "Expected 3 connectors for d") 53 | 54 | self.assertTrue(len(self.a.sockets) == 1, "Expected 1 socket for a") 55 | self.assertTrue(len(self.b.sockets) == 2, "Expected 2 sockets for b") 56 | self.assertTrue(len(self.c.sockets) == 3, "Expected 3 sockets for c") 57 | self.assertTrue(len(self.d.sockets) == 0, "Expected 0 sockets for d") 58 | 59 | 60 | def test_connect(self): 61 | self.create_processors() 62 | self.create_terminals() 63 | self.connect_terminals() 64 | 65 | # Cannot decide socket, should fail: 66 | self.assertRaises(Exception, self.c.subscribe, (self.b,)) 67 | # Ok for socket, but still cannot decide which one of C's connectors: 68 | self.assertRaises(Exception, self.c.subscribe, (self.b, "output_doc")) 69 | # Protocol error: 70 | self.assertRaises(Exception, self.c.subscribe, (self.b, "output_doc", "input_str")) 71 | # Should fail on protocol error: 72 | self.assertRaises(Exception, self.d.subscribe, (self.c, "output_anything", "input_doc")) 73 | # Protocol error: 74 | self.assertRaises(Exception, self.d.subscribe, (self.c, "output_ext", "input_doc")) 75 | # Protocol error, connector more specific than socket: 76 | self.assertRaises(Exception, self.d.subscribe, (self.c, "output_doc", "input_ext")) 77 | 78 | # Do a quick check to see if expected number of connections are now ok 79 | self.assertTrue(len(self.a.sockets["output"].connections) == 2) # b and c 80 | self.assertTrue(len(self.b.connectors["input"].connections) == 1) # b 81 | self.assertTrue(len(self.b.sockets["output_doc"].connections) == 1) # c 82 | self.assertTrue(len(self.c.connectors["input_doc"].connections) == 2) # a and b 83 | self.assertTrue(len(self.c.sockets["output_doc"].connections) == 1) # d 84 | self.assertTrue(len(self.c.sockets["output_ext"].connections) == 1) # d 85 | self.assertTrue(len(self.d.connectors["input_anything"].connections) == 1) # c 86 | self.assertTrue(len(self.d.connectors["input_ext"].connections) == 1) # c 87 | 88 | 89 | def test_connect2(self): 90 | self.create_processors() 91 | self.create_terminals() 92 | self.connect_terminals() 93 | 94 | self.b.unsubscribe() # unsubscribes all input connectors 95 | self.assertTrue(len(self.a.sockets["output"].connections) == 1) # only c left 96 | self.assertTrue(len(self.b.connectors["input"].connections) == 0) 97 | 98 | self.c.unsubscribe(self.a) 99 | self.c.unsubscribe(self.a, connector_name="input_doc") 100 | self.assertTrue(len(self.a.sockets["output"].connections) == 0) 101 | self.assertTrue(len(self.b.sockets["output_doc"].connections) == 1) # c remains 102 | self.assertTrue(len(self.c.connectors["input_doc"].connections) == 1) # only b left 103 | 104 | self.c.unsubscribe(connector_name="input_doc") 105 | self.assertTrue(len(self.b.sockets["output_doc"].connections) == 0) # c now also gone 106 | 107 | self.c.detach(self.d) # Should detach all connections to d 108 | self.assertTrue(len(self.c.sockets["output_doc"].connections) == 0) 109 | 
self.assertTrue(len(self.c.sockets["output_ext"].connections) == 0) 110 | self.assertTrue(len(self.d.connectors["input_anything"].connections) == 0) 111 | self.assertTrue(len(self.d.connectors["input_ext"].connections) == 0) 112 | 113 | 114 | def main(): 115 | unittest.main() 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /test/test_service/test_http_service.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ENDPOINT = "localhost:4000" 4 | 5 | import unittest 6 | from eslib.service import Service, HttpService, status 7 | from eslib.procs import Timer, Transformer 8 | import requests, time, threading 9 | 10 | import eslib.prog 11 | eslib.prog.initlogs() 12 | 13 | class TestService(Service): 14 | def __init__(self, **kwargs): 15 | super(TestService, self).__init__(**kwargs) 16 | 17 | self.ending = False 18 | self.requires_metadata = False 19 | 20 | def on_setup(self): 21 | self._timer = Timer(service=self, actions=[(3, 3, "ping")]) 22 | self._pc = Transformer(service=self, func=self._func) 23 | self._pc.subscribe(self._timer) 24 | 25 | self.register_procs(self._timer, self._pc) 26 | 27 | return True 28 | 29 | def _func(self, proc, doc): 30 | print doc 31 | if self.ending: 32 | print "FUNC STOP" 33 | self._timer.stop() 34 | 35 | def is_processing(self): 36 | return self._pc.running 37 | 38 | def is_aborted(self): 39 | return self._pc.aborted 40 | 41 | def is_suspended(self): 42 | return self._pc.suspended 43 | 44 | # on_start_processing (should be ran async) 45 | def on_processing_start(self): 46 | self._timer.start() 47 | time.sleep(1) # Simulate that it takes some time 48 | return True 49 | 50 | def on_processing_stop(self): 51 | time.sleep(1) # Simulate that it takes some time 52 | self._timer.stop() 53 | self._pc.wait() 54 | return True 55 | 56 | # on_abort_processing 57 | def on_processing_abort(self): 58 | self._timer.abort() 59 | self._pc.stop() 60 | return True 61 | 62 | 63 | # TODO: on_update_metadata 64 | 65 | 66 | class HttpTestService(HttpService, TestService): 67 | 68 | def __init__(self, **kwargs): 69 | super(HttpTestService, self).__init__(**kwargs) 70 | 71 | # Add management routes to functions 72 | self.add_route(self._test1, "GET", "test1/{id}/{?mode}", ["mode"]) 73 | 74 | def _test1(self, request_handler, payload, **kwargs): 75 | parameters = kwargs 76 | print "TEST1:", parameters 77 | return {"echo": parameters} 78 | 79 | class TestTestService(unittest.TestCase): 80 | 81 | def test_run_shutdown(self): 82 | p = TestService()#mgmt_endpoint=ENDPOINT) # localhost:4444 by default 83 | p.ending = False 84 | 85 | print "Starting service" 86 | print "Asserting '%s' (not started)" % status.DOWN 87 | self.assertEqual(p.status, status.DOWN) 88 | 89 | p.run() 90 | # This does not require config, thus going straight from 'down' to 'idle' 91 | print "Asserting '%s'" % status.IDLE 92 | self.assertEqual(p.status, status.IDLE) 93 | 94 | print "Shutting down" 95 | p.shutdown(wait=True) 96 | print "Asserting '%s' (shut down)" % status.DOWN 97 | self.assertEqual(p.status, status.DOWN) 98 | 99 | def test_lifecycle(self): 100 | p = TestService()#mgmt_endpoint=ENDPOINT) # localhost:4444 by default 101 | p.ending = False 102 | 103 | print "Starting service" 104 | print "Asserting '%s' (not started)" % status.DOWN 105 | self.assertEqual(status.DOWN, p.status) 106 | 107 | p.run() 108 | # This does not require config, thus going straight 
from 'down' to 'idle' 109 | print "Asserting '%s'" % status.IDLE 110 | self.assertEqual(status.IDLE, p.status) 111 | 112 | print "Starting processing" 113 | p.processing_start() 114 | print "Asserting '%s'" % status.PROCESSING 115 | self.assertEqual(status.PROCESSING, p.status) 116 | 117 | time.sleep(1) 118 | print "Stopping processing" 119 | p.processing_stop() 120 | time.sleep(0.1) 121 | print "Asserting '%s'" % status.STOPPING 122 | self.assertEqual(status.STOPPING, p.status) 123 | 124 | print "Waiting for processing to stop" 125 | p.processing_wait() 126 | print "Asserting '%s' (stopped)" % status.IDLE 127 | self.assertEqual(status.IDLE, p.status) 128 | 129 | print "Starting processing" 130 | p.processing_start() 131 | print "Asserting '%s'" % status.PROCESSING 132 | self.assertEqual(status.PROCESSING, p.status) 133 | 134 | time.sleep(1) 135 | print "Aborting processing" 136 | p.processing_abort() 137 | print "Asserting '%s'" % status.ABORTED 138 | self.assertEqual(status.ABORTED, p.status) 139 | 140 | print "Starting processing" 141 | p.processing_start() 142 | print "Asserting '%s'" % status.PROCESSING 143 | self.assertEqual(status.PROCESSING, p.status) 144 | 145 | print "Shutting down" 146 | p.shutdown() 147 | #threading.Thread(target=lambda : p.shutdown()).start() 148 | time.sleep(0.1) 149 | print "Asserting '%s'" % status.CLOSING 150 | self.assertEqual(status.CLOSING, p.status) 151 | 152 | print "Waiting for shutdown" 153 | p.wait() 154 | print "Asserting '%s' (shut down)" % status.DOWN 155 | self.assertEqual(status.DOWN, p.status) 156 | 157 | def test_lifecycle_ending_service(self): 158 | p = TestService()#mgmt_endpoint=ENDPOINT) # localhost:4444 by default 159 | p.ending = True 160 | 161 | print "Starting service" 162 | print "Asserting '%s' (not started)" % status.DOWN 163 | self.assertEqual(status.DOWN, p.status) 164 | 165 | p.run() 166 | # This does not require config, thus going straight from 'down' to 'idle' 167 | print "Asserting '%s'" % status.IDLE 168 | self.assertEqual(status.IDLE, p.status) 169 | 170 | print "Starting processing (take 1)" 171 | p.processing_start() 172 | print "Asserting '%s'" % status.PROCESSING 173 | self.assertEqual(status.PROCESSING, p.status) 174 | 175 | print "Waiting for processing to finish (take 1)" 176 | p.processing_wait() 177 | print "Asserting '%s' (stopped)" % status.IDLE 178 | self.assertEqual(status.IDLE, p.status) 179 | 180 | print "Starting processing (take 2)" 181 | p.processing_start() 182 | print "Asserting '%s'" % status.PROCESSING 183 | self.assertEqual(status.PROCESSING, p.status) 184 | 185 | print "Waiting for processing to finish (take 2)" 186 | p.processing_wait() 187 | print "Asserting '%s' (stopped)" % status.IDLE 188 | self.assertEqual(status.IDLE, p.status) 189 | 190 | print "Shutting down (waiting)" 191 | p.shutdown(wait=True) 192 | print "Asserting '%s' (shut down)" % status.DOWN 193 | self.assertEqual(status.DOWN, p.status) 194 | 195 | def main(): 196 | unittest.main() 197 | 198 | if __name__ == "__main__": 199 | main() 200 | -------------------------------------------------------------------------------- /eslib/web.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | eslib.web 5 | ~~~~~~~~~~ 6 | 7 | Module containing operations against web servers and on web content. 
8 | """ 9 | 10 | 11 | __all__ = ("WebGetter", "detect_language", "remove_boilerplate") 12 | 13 | 14 | import requests 15 | import eslib 16 | from collections import Counter 17 | from textblob import TextBlob 18 | import justext 19 | from datetime import datetime, timedelta 20 | from email.utils import parsedate_tz, mktime_tz 21 | 22 | class WebGetter(object): 23 | def __init__(self, max_size=-1, content_types=None): 24 | self.content_types = content_types or ["text/plain", "text/html", "text/xml", "application/xml"] 25 | self.max_size = 1024*1024 # 1 MB 26 | if max_size > 0: self.max_size = max_size 27 | 28 | def get(self, url): 29 | # Fetch web page 30 | try: 31 | res = requests.get(url, verify=False) 32 | res.raise_for_status 33 | except: 34 | msg = "URL failed: %s" % url 35 | raise IOError(msg) 36 | if not res.ok: 37 | msg = "URL not ok, status_code=%s for URL: %s" % (res.status_code, url) 38 | raise IOError(msg) 39 | 40 | # Verify allowed content type 41 | content_type = (res.headers.get("content-type") or "").split(";")[0] 42 | if not content_type in self.content_types: 43 | msg = "Skipping web page with content type '%s', URL: %s" % (content_type, url) 44 | raise ValueError(msg) 45 | 46 | # Size check with reported content size 47 | if self.max_size > 0: 48 | size = int(res.headers.get("content-length") or -1) 49 | if size > 0 and size > self.max_size: 50 | msg = "Skipping too large web page (%s), URL: %s" % (eslib.debug.byteSizeString(size, 2), url) 51 | raise ValueError(msg) 52 | 53 | # Find timestamp 54 | date_str = res.headers.get("date") 55 | if not date_str: 56 | timestamp = datetime.utcnow() 57 | else: 58 | t = mktime_tz(parsedate_tz(date_str)) 59 | timestamp = datetime(1970, 1, 1) + timedelta(seconds=t) 60 | 61 | # Extract vitals from web result 62 | id = url # res.url 63 | encoding = res.encoding 64 | content = res.text 65 | 66 | # Repeat size check with actual content size 67 | if self.max_size > 0: 68 | size = len(content) 69 | if size > self.max_size: 70 | msg = "Skipping too large web page (%s), URL: %s" % (eslib.debug.byteSizeString(size, 2), url) 71 | raise ValueError(msg) 72 | 73 | body = {"content": content, "content_type": content_type, "encoding": encoding, "date": timestamp} 74 | return body 75 | 76 | #region Language detection 77 | 78 | def detect_language(text, chunk_size=250, max_chunks=5): 79 | """ 80 | Detects language of the passed text. Returns majority detection on multiple chunks in order to avoid 81 | misclassification on text with boilerplate text of another language in the beginning of the string. 82 | 83 | Uses Google Translate REST API through the TextBlob library. 84 | 85 | :param text: str 86 | :param chunk_size: int Number of characters in each detection chunk. 87 | :param max_chunks: int Maximum number of chunks to run detection on. 88 | :return: str Google Translate language code. 
89 | """ 90 | n_chunks = int(max(min(len(text) / chunk_size, max_chunks), 1)) 91 | detections = [] 92 | 93 | for c in xrange(n_chunks): 94 | l = c * chunk_size 95 | u = max((c + 1) * chunk_size, len(text)) 96 | 97 | chunk = text[l:u] 98 | detections.append(TextBlob(chunk).detect_language()) 99 | 100 | counts = Counter(detections) 101 | 102 | return counts.most_common(n=1)[0][0] 103 | 104 | #endregion Language detection 105 | 106 | # #region Boilerplate removal 107 | 108 | # Map of correspondences between Google Translate and internal JusText 109 | # language codes 110 | GTRANS_JUSTEXT_LANG_MAP = { 111 | u'af': u'Afrikaans', 112 | u'sq': u'Albanian', 113 | u'ar': u'Arabic', 114 | u'az': u'Azerbaijani', 115 | u'eu': u'Basque', 116 | u'be': u'Belarusian', 117 | u'bg': u'Bulgarian', 118 | u'ca': u'Catalan', 119 | u'hr': u'Croatian', 120 | u'cz': u'Czech', 121 | u'da': u'Danish', 122 | u'nl': u'Dutch', 123 | u'en': u'English', 124 | u'eo': u'Esperanto', 125 | u'et': u'Estonian', 126 | u'fi': u'Finnish', 127 | u'fr': u'French', 128 | u'gl': u'Galician', 129 | u'ka': u'Georgian', 130 | u'de': u'German', 131 | u'el': u'Greek', 132 | u'gu': u'Gujarati', 133 | u'ht': u'Haitian', 134 | u'iw': u'Hebrew', 135 | u'hi': u'Hindi', 136 | u'hu': u'Hungarian', 137 | u'is': u'Icelandic', 138 | u'id': u'Indonesian', 139 | u'ga': u'Irish', 140 | u'it': u'Italian', 141 | u'kn': u'Kannada', 142 | u'ko': u'Korean', 143 | u'la': u'Latin', 144 | u'lv': u'Latvian', 145 | u'lt': u'Lithuanian', 146 | u'mk': u'Macedonian', 147 | u'ms': u'Malay', 148 | u'mt': u'Maltese', 149 | u'no': u'Norwegian_Bokmal', 150 | u'fa': u'Persian', 151 | u'pl': u'Polish', 152 | u'pt': u'Portuguese', 153 | u'ro': u'Romanian', 154 | u'ru': u'Russian', 155 | u'sr': u'Serbian', 156 | u'sk': u'Slovak', 157 | u'sl': u'Slovenian', 158 | u'es': u'Spanish', 159 | u'sw': u'Swahili', 160 | u'sv': u'Swedish', 161 | u'tl': u'Tagalog', 162 | u'ta': u'Tamil', 163 | u'te': u'Telugu', 164 | u'tr': u'Turkish', 165 | u'uk': u'Ukrainian', 166 | u'ur': u'Urdu', 167 | u'vi': u'Vietnamese', 168 | u'cy': u'Welsh'} 169 | 170 | def remove_boilerplate(page_str, lang, relaxed=False): 171 | """ 172 | Removes boilerplate from HTML documents. 173 | 174 | Uses JusText library. 175 | 176 | NOTE: quality dependent on correct language detection. 177 | 178 | :param page_str: str HTML page source. 179 | :param lang: str Google Translate language code. 180 | :param relaxed: boolean If True the span between the first and last good/near-good boilerplate match 181 | is returned. Short and bad segments in between are kept. 182 | :return: list List of non-boilerplate segments/paragraphs. 183 | """ 184 | if lang not in GTRANS_JUSTEXT_LANG_MAP: 185 | #raise AttributeError("Can not remove boilerplate for language code lang='%s'." 
% lang) 186 | return [] 187 | 188 | jt_lang = GTRANS_JUSTEXT_LANG_MAP[lang] 189 | 190 | paragraphs = justext.justext(page_str, justext.get_stoplist(jt_lang)) 191 | 192 | if relaxed: 193 | good_indexes = [paragraphs.index(p) for p in paragraphs if p.class_type in ['near-good', 'good']] 194 | 195 | if len(good_indexes) == 0: 196 | return [] 197 | 198 | return [paragraph.text for paragraph in paragraphs[min(good_indexes):max(good_indexes) + 1]] 199 | else: 200 | return [paragraph.text for paragraph in paragraphs if paragraph.class_type in ['near-good', 'good', 'short']] 201 | 202 | #endregion Boilerplate removal 203 | -------------------------------------------------------------------------------- /eslib/procs/RabbitmqMonitor.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Monitor import Monitor 4 | from .RabbitmqBase import RabbitmqBase 5 | import pika 6 | import json, time 7 | 8 | class RabbitmqMonitor(Monitor, RabbitmqBase): 9 | """ 10 | Monitor a queue in RabbitMQ. 11 | Assumes data of type 'str', 'unicode', 'int', 'float' or 'json' from RabbitMQ. 12 | Incoming documents are deserialized into these types where possible. Unknown types are passed on as 'str'. 13 | 14 | Sockets: 15 | output (*) : Document received on monitored queue. 16 | 17 | Config: 18 | host = localhost : 19 | port = 5672 : 20 | admin_port = 15672 : 21 | username = guest : 22 | password = guest : 23 | virtual_host = None : 24 | exchange = None : 25 | queue = "default" : Not used if 'exchange' is specified. 26 | consuming = True : Consume from the queue, rather than listen on an 27 | exclusive queue that is deleted on disconnect. 28 | Non-consuming behaviour only works with an 'exchange'. 29 | max_reconnects = 3 : 30 | reconnect_timeout = 3 : 31 | """ 32 | 33 | CALC_TOTAL = True # Whether to check on the RabbitMQ queue at intervals and calculate a total 34 | # from the current count and what remains in the queue. It thus becomes a moving 35 | # target for ETA calculations.
36 | CALC_TOTAL_INTERVAL = 10.0 # seconds 37 | 38 | _is_reader = True 39 | 40 | def __init__(self, **kwargs): 41 | super(RabbitmqMonitor, self).__init__(**kwargs) 42 | 43 | self.output = self.create_socket("output", None, "Document received on monitored queue.") 44 | 45 | self.config.set_default( 46 | max_reconnects = 3, 47 | reconnect_timeout = 3 48 | ) 49 | 50 | self._reconnecting = 0 51 | self._last_calc_total = 0 52 | 53 | #region Processor stuff 54 | 55 | def on_open(self): 56 | self._open_connection() 57 | self.log.info("Connected to RabbitMQ.") 58 | 59 | def on_close(self): 60 | self._calc_total() 61 | if self._close_connection(): 62 | self.log.info("Connection to RabbitMQ closed.") 63 | 64 | #endregion Processor stuff 65 | 66 | #region Generator stuff 67 | 68 | def _start_consuming(self): 69 | self._consumer_tag = self._channel.basic_consume(self._callback, queue=self._queue_name, no_ack=True) 70 | 71 | def _stop_consuming(self): 72 | if self._channel: 73 | self._channel.basic_cancel(self._consumer_tag) 74 | 75 | def on_startup(self): 76 | if self.CALC_TOTAL: 77 | self.total = 0 # We will collect this from message queue, otherwise it should be set to None 78 | self._last_calc_total = 0 79 | self.count = 0 80 | self._start_consuming() 81 | 82 | def on_shutdown(self): 83 | self._stop_consuming() 84 | 85 | def on_abort(self): 86 | self._stop_consuming() 87 | 88 | def on_suspend(self): 89 | self._stop_consuming() 90 | 91 | def on_resume(self): 92 | self._start_consuming() 93 | 94 | def on_tick(self): 95 | if self._reconnecting > 0: 96 | self._reconnecting -= 1 97 | # Try to reconnect 98 | ok = False 99 | try: 100 | self._close_connection() 101 | self._open_connection() 102 | self.log.info("Successfully reconnected to RabbitMQ.") 103 | self.reconnecting = 0 # No longer attempting reconnects 104 | self._start_consuming() 105 | except pika.exceptions.AMQPConnectionError as e: 106 | if self._reconnecting > 0: 107 | timeout = self.config.reconnect_timeout 108 | self.log.warning("Reconnect to RabbitMQ failed. Waiting %d seconds." % timeout) 109 | time.sleep(timeout) 110 | else: 111 | self.log.critical("Missing connection to RabbitMQ. Max retries exceeded. Aborting.") 112 | self.abort() # We give up and abort 113 | return 114 | 115 | try: 116 | self._calc_total() 117 | congested = self.congestion() 118 | if congested: 119 | self.log.debug("Congestion in dependent processor '%s'; sleeping 10 seconds." % congested.name) 120 | self.congestion_sleep(10.0) 121 | else: 122 | self._channel.connection.process_data_events() 123 | except Exception as e: 124 | if self._reconnecting >= 0: 125 | self.log.info("No open connection to RabbitMQ. Trying to reconnect.") 126 | self._reconnecting = self.config.max_reconnects # Number of reconnect attempts; will start reconnecting on next tick 127 | 128 | def _calc_total(self): 129 | """ 130 | Calculate total number of messages. 131 | That is the sum of what is processed so far, and what remains in the queue. 
132 | """ 133 | if not self.CALC_TOTAL: 134 | return 135 | 136 | now = time.time() 137 | if now - self._last_calc_total > self.CALC_TOTAL_INTERVAL: 138 | try: 139 | self.total = self.get_queue_size() + self.count 140 | except Exception as e: 141 | self.log.warning("Failed to get queue size for queue '%s': %s" % (self._queue_name, e)) 142 | self._last_calc_total = now 143 | 144 | def _callback(self, channel, method, properties, body): 145 | #print "*** RabbitmqMonitor received:" 146 | #print "*** Properties:", properties 147 | #print "*** Body: ", body 148 | 149 | self.count += 1 150 | 151 | if not self.output.has_output: # Don't bother deserializing, etc, in this case 152 | return 153 | 154 | try: 155 | msg_type = properties.type 156 | document = None 157 | if msg_type == "json": 158 | try: 159 | document = json.loads(body) 160 | except (TypeError, ValueError) as e: 161 | self.doclog.warning(e.message) 162 | return 163 | elif msg_type in ["str", "unicode"]: 164 | document = body 165 | elif msg_type == "int": 166 | document = int(str(body)) 167 | elif msg_type == "float": 168 | document = float(str(body)) 169 | elif body: 170 | self.doclog.debug("Received document of type='%s'; converting to str.", msg_type) 171 | document = str(body) 172 | 173 | if document is not None: 174 | self.output.send(document) 175 | else: 176 | self.doclog.warning("Received empty document from RabbitMQ.") 177 | except Exception as e: 178 | self.log.error("An exception occurred inside the callback: %s" % e.message) 179 | 180 | #endregion Generator stuff 181 | -------------------------------------------------------------------------------- /eslib/procs/FileReader.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Hans Terje Bakke' 2 | 3 | from ..Generator import Generator 4 | from select import select 5 | import codecs 6 | import sys, os, os.path, errno 7 | import json 8 | 9 | 10 | # TODO: Windows does not support file descriptors in select() 11 | # Alternative method to _read_as_much_as_possible() needed for Windows. 12 | 13 | 14 | class FileReader(Generator): 15 | """ 16 | Read documents from specified files or standard input. 17 | Reads the entire file as one document, or one document per line, according to config. 18 | 19 | Previous behaviour, removed: 20 | Documents starting with '{' are considered JSON documents and converted to 'dict', unless otherwise configured. 21 | All are now considered JSON documents and converted to 'dict', unless 'raw_lines' is set in the config. 22 | 23 | Sockets: 24 | output (*) : Documents read. Either entire file as one, or per line. Either raw string or dict. 25 | 26 | Config: 27 | filename = None : Appended to 'filenames', for convenience. 28 | filenames = None : If not set then 'stdin' is assumed. Can take a list of files. 29 | document_per_file = False : Read each file as one string to be treated as one document. 30 | raw_lines = False : Setting this to True treats each line as a string instead of JSON. 31 | strip_line = True : Whether to remove leading and trailing spaces on a line. 32 | skip_blank_line = True : Whether to skip empty lines (after stripping). 33 | skip_comment_line = True : Whether to skip comment lines. 34 | comment_prefix = "#" : Lines beginning with this string are considered comment lines if 35 | 'skip_comment_line' is True. 36 | """ 37 | 38 | def __init__(self, **kwargs): 39 | super(FileReader, self).__init__(**kwargs) 40 | self.output = self.create_socket("output", None, "Documents read. Either entire file as one, or per line.
Either raw string or dict.") 41 | 42 | self.config.set_default( 43 | filename = None, 44 | filenames = [], 45 | document_per_file = False, 46 | raw_lines = False, 47 | strip_line = True, 48 | skip_blank_line = True, 49 | skip_comment_line = True, 50 | comment_prefix = "#", 51 | ) 52 | self._filenames = [] 53 | self._file = None 54 | self._filename_index = 0 55 | 56 | def on_open(self): 57 | 58 | if self._file: 59 | self.log.error("on_open() attempted when _file exists -- should not be possible.") 60 | return 61 | 62 | # Create a more usable filenames array 63 | self._filenames = [] 64 | if self.config.filename: 65 | self._filenames.append(self.config.filename) 66 | if not self.config.filenames: 67 | if not self.config.filename: 68 | self._filenames.append(None) # stdin will be expected 69 | elif type(self.config.filenames) in [str, unicode]: 70 | self._filenames.append(self.config.filenames) 71 | else: 72 | self._filenames.extend(self.config.filenames) 73 | 74 | # Verify that files exists and that we can read them upon starting 75 | for filename in self._filenames: 76 | if filename: 77 | if not os.path.isfile(filename): 78 | e = IOError("File not found: %s" % filename) 79 | e.filename = filename 80 | e.errno = errno.ENOENT # No such file or directory 81 | raise e 82 | elif not os.access(filename, os.R_OK): 83 | e = IOError("Failed to read file: %s" % filename) 84 | e.filename = filename 85 | e.errno = errno.EACCES # Permission denied 86 | raise e 87 | 88 | def _close_file(self): 89 | if self._file and self._file != sys.stdin: 90 | self._file.close() 91 | self._file = None 92 | 93 | def on_close(self): 94 | # If we have an open file, this is our last chance to close it 95 | self._close_file() 96 | 97 | def _handle_data(self, incoming): 98 | data = incoming 99 | if data == None: 100 | return 101 | if self.config.strip_line: 102 | data = data.strip() 103 | if self.config.skip_comment_line and data.startswith(self.config.comment_prefix): 104 | return 105 | if self.config.skip_blank_line and not data: 106 | return 107 | if not self.config.raw_lines:# and data.startswith("{"): 108 | # NOTE: May raise ValueError: 109 | data = json.loads(data) 110 | self.output.send(data) 111 | 112 | 113 | def _read_as_much_as_possible(self): 114 | while True: 115 | # Read as much as we can 116 | r,w,e = select([self._file], [], [self._file], 0) 117 | if e: 118 | pass 119 | # Hm... this happens on every normal file... 120 | #self._close_file() 121 | #break 122 | if r: 123 | line = self._file.readline() 124 | line = codecs.decode(line, self._file.encoding or "UTF-8", "replace") 125 | 126 | if line: 127 | self._handle_data(line) 128 | # In case we should leave the loop while there is still input available: 129 | if self.end_tick_reason or self.suspend: 130 | break 131 | if not line: 132 | # We've reached the end of input 133 | self._close_file() 134 | break 135 | else: 136 | break 137 | 138 | # Candidate for Windows: 139 | def _read_as_much_as_possible_Windows(self): 140 | for line in self._file: 141 | line = codecs.decode(line, self._file.encoding or "UTF-8", "replace") 142 | self._handle_data(line) 143 | # In case we should leave the loop while there is still input available: 144 | if self.end_tick_reason or self.suspend: 145 | return 146 | self._close_file() 147 | 148 | def on_tick(self): 149 | 150 | if self._file: 151 | # We were working on a file... 
keep reading 152 | if self.config.document_per_file: 153 | all = self._file.read() 154 | self._handle_data(all) 155 | self._close_file() 156 | else: 157 | self._read_as_much_as_possible() 158 | elif self._filename_index >= len(self._filenames): 159 | # We're done! 160 | self.stop() 161 | return 162 | else: 163 | filename = self._filenames[self._filename_index] 164 | if not filename: 165 | self.log.debug("Starting read from stdin.") 166 | self._file = sys.stdin 167 | else: 168 | self.log.debug("Opening file '%s'." % filename) 169 | self._file = open(filename, "r" if self.config.document_per_file else "rt") 170 | self._filename_index += 1 171 | # Return from tick and reenter later with a file to process 172 | return 173 | --------------------------------------------------------------------------------
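
Usage note (not part of the repository sources above): the processors in this section follow the same wiring pattern as the connection tests — construct the processors, attach a consumer to a producer with subscribe(), then start/stop/wait. Below is a minimal, hypothetical sketch of streaming JSON documents from a file to connected TCP clients with FileReader and TcpWriter. The file name "documents.json" is made up for illustration, and the constructor keyword arguments are assumed to map onto the config keys documented in the class docstrings, as they do elsewhere in this codebase.

    from eslib.procs import FileReader, TcpWriter

    # Hypothetical input file; with the FileReader defaults each line is parsed as one JSON document.
    reader = FileReader(filenames=["documents.json"])
    # TcpWriter listens on localhost:4000 and serializes non-string documents to JSON before sending.
    writer = TcpWriter(hostname="localhost", port=4000)

    # Only one socket ('output') and one connector ('input') exist, so names can be omitted.
    writer.subscribe(reader)

    writer.start()   # open the listening socket so clients can connect
    reader.start()   # start reading; documents flow to the writer as they are read
    reader.wait()    # FileReader stops itself once all files are exhausted
    writer.stop()    # close client connections and the listening socket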