├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── collector ├── Dockerfile ├── app.py ├── requirements.txt ├── utils.py └── wait-for ├── docker-compose.yml ├── hypnos ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── app.py ├── docker-compose.yml ├── petrarch │ ├── Dockerfile │ ├── petrarch_app.py │ └── requirements.txt ├── requirements.txt ├── utils.py └── wait-for ├── miner ├── Dockerfile ├── app.py ├── requirements.txt ├── utils.py └── wait-for ├── mitie ├── Dockerfile ├── app.py ├── requirements.txt ├── utils.py └── wait-for ├── predpatt ├── Dockerfile ├── ParseyPredFace.py ├── app.py ├── requirements.txt ├── utils.py └── wait-for ├── quad ├── Dockerfile ├── app.py ├── launch.sh ├── quad_trained │ ├── quad_character_model.json │ ├── quad_character_model_vocab.pkl │ └── quad_character_model_weights.h5 ├── requirements.txt ├── utils.py └── wait-for └── relevancy ├── Dockerfile ├── app.py ├── launch.sh ├── relevancy_trained_classifier ├── svm │ ├── relevancy_classifier.pkl │ ├── relevancy_classifier.pkl_01.npy │ ├── relevancy_classifier.pkl_02.npy │ └── relevancy_classifier.pkl_03.npy └── tfidf │ ├── relevancy_classifier_tfidf.pkl │ ├── relevancy_classifier_tfidf.pkl_01.npy │ └── relevancy_classifier_tfidf.pkl_02.npy ├── requirements.txt ├── utils.py └── wait-for /.gitattributes: -------------------------------------------------------------------------------- 1 | *.h5 filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *swp 3 | /data/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2012-2017 Johns Hopkins University Human Language Technology 2 | Center of Excellence (JHU HLTCOE). All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 | HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 20 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 22 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 24 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 25 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 26 | DAMAGE. 
27 | 28 | The views and conclusions contained in the software and documentation 29 | are those of the authors and should not be interpreted as representing 30 | official policies, either expressed or implied, of the copyright 31 | holders. 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | EventMiner 2 | ======= 3 | 4 | Using the hammer of supervised learning to make events. 5 | 6 | About 7 | ----- 8 | 9 | EventMiner aims to serve, primarily, as an interface to various NLP analytics 10 | to extract event information from text. This project is set up with a REST 11 | frontend interface, which accepts text input that is then passed 12 | via a RabbitMQ messaging queue to the various analytics as appropriate. The project 13 | is composed of Docker containers, with orchestration handled by 14 | docker-compose. This, combined with RabbitMQ as the messaging layer, allows for 15 | clean definitions of interactions between services and minimal setup for the 16 | end user. 17 | 18 | Services 19 | --------- 20 | 21 | The services defined in this project are as follows (in the order they process 22 | content): 23 | 24 | 1. `mitie` - Provides NER tagging via [MITIE](https://github.com/mit-nlp/MITIE). 25 | 2. `predpatt` - Extracts predicate-argument structures using 26 | [PredPatt](https://github.com/hltcoe/PredPatt). Also includes a Universal 27 | Dependency parse provided by 28 | [SyntaxNet/DRAGNN](https://github.com/tensorflow/models/tree/master/syntaxnet). 29 | 3. `relevancy` - An SVM classifier that determines story relevancy based on the story title. 30 | * **Note:** Branching occurs at the `relevancy` classifier. 31 | If this model determines a story is not relevant, it isn't processed by `quad`; it goes straight through to `hypnos`. 32 | 4. `quad` - A convolutional neural net that classifies a sentence into one of four `QuadCategories`: verbal conflict, verbal cooperation, material conflict, material cooperation. 33 | 5. `hypnos` - A rule-based event extractor, used primarily for actor extraction in this setup. 34 | 6. `collector` - A lightweight process that pulls in events and writes them out to a file. 35 | 36 | Deployment 37 | ---------- 38 | 39 | There are two `docker-compose` projects that make up EventMiner. The first is the 40 | `miner` application itself. The second is `hypnos`, which is the container 41 | architecture around the `PETRARCH2` event extractor. `docker-compose` must be 42 | run for `miner` first and `hypnos` second, due to nuances of the 43 | shared Docker networks. Thus, deployment is as follows (assuming the user 44 | starts in the top-level `EventMiner` directory): 45 | 46 | ``` 47 | docker-compose up -d 48 | cd ./hypnos 49 | docker-compose up -d 50 | ``` 51 | 52 | This exposes a REST interface on port 6000. With the features 53 | of `docker-compose`, it's possible to arbitrarily scale up the various services 54 | connected within `miner`. For example, the `quad` service is rather slow 55 | since it's a neural net running on a CPU. Since each service consumes from a 56 | messaging queue, we don't need to worry about things such as load balancing; 57 | each service just consumes when it's ready. Given this, scaling the `quad` 58 | service is as simple as (assuming the user is in the root `EventMiner` directory): 59 | 60 | ``` 61 | docker-compose scale quad=3 62 | ``` 63 | 64 | to run three of the `quad` containers. 
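For a quick sanity check once both stacks are up, the standard `docker-compose` status and log commands are enough; the service names below are the ones defined in the top-level `docker-compose.yml`:

```
# from the top-level EventMiner directory
docker-compose ps                         # miner, mitie, predpatt, relevancy, quad, collector, and rabbitmq should be "Up"
docker-compose logs -f quad               # follow one service while testing
docker-compose scale quad=3 relevancy=2   # scale several services in a single call
```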
65 | 66 | Usage 67 | ----- 68 | 69 | The interface accepts JSON input via REST. As an example: 70 | 71 | ``` 72 | import json 73 | import requests 74 | 75 | headers = {'Content-Type': 'application/json'} 76 | 77 | test = {'title': 'Syrian rebels attacked Aleppo.', 'content': 'This is the content. Rebels attacked Aleppo.'} 78 | data = {'data': test} 79 | 80 | r = requests.post('http://localhost:6000/EventMiner', data=json.dumps(data), headers=headers) 81 | ``` 82 | 83 | The response object from `EventMiner` will contain a unique ID for the input data 84 | that allows the user to trace the progress of the content throughout the 85 | pipeline. The pipeline will write data out to the `EventMiner/data` directory. The 86 | results are in a file titled `events.YYYYMMDD.txt` with one JSON record per 87 | line. The output format (for now...) is as follows: 88 | 89 | ``` 90 | {u'content': u'This is the content. Rebels attacked Aleppo.', 91 | u'event_info': {u'267bbae4-dcc0-4224-94e9-67679b0b6ad1': {u'coded': [], 92 | u'predicted_class': {u'class': 4, u'score': u'0.89923'}, 93 | u'sent': u'This is the content.'}, 94 | u'8b464457-18d2-419c-b5c1-49c6131be947': {u'coded': [[u'---REB', 95 | u'SYR', 96 | u'190']], 97 | u'predicted_class': {u'class': 4, u'score': u'0.986187'}, 98 | u'sent': u'Rebels attacked Aleppo.'}}, 99 | u'pipeline_key': u'4c4f7e7a-db31-4137-a888-2cdbbbf1c225', 100 | u'predicted_relevancy': 1, 101 | u'sents': {u'267bbae4-dcc0-4224-94e9-67679b0b6ad1': u'This is the content.', 102 | u'8b464457-18d2-419c-b5c1-49c6131be947': u'Rebels attacked Aleppo.'}, 103 | u'title': u'Syrian rebels attacked Aleppo.'} 104 | ``` 105 | 106 | 107 | Acknowledgements 108 | ---------------- 109 | 110 | This work was funded by the DARPA Quantitative Crisis Response (QCR) program. 111 | -------------------------------------------------------------------------------- /collector/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7-alpine 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apk add --no-cache git wget unzip 6 | 7 | RUN mkdir -p /src/data 8 | 9 | ADD . /src 10 | 11 | RUN cd /src; pip install -r requirements.txt 12 | 13 | CMD ["/src/wait-for", "rabbitmq:5672", "--", "python", "/src/app.py"] 14 | -------------------------------------------------------------------------------- /collector/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import utils 4 | import logging 5 | import datetime 6 | 7 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 8 | logger = logging.getLogger(__name__) 9 | logger.setLevel(logging.INFO) 10 | 11 | CONSUME = os.getenv('CONSUME') 12 | 13 | 14 | def callback(ch, method, properties, body): 15 | data = json.loads(body) 16 | 17 | logger.info('Started processing content. {}'.format(data['pipeline_key'])) 18 | 19 | process(data) 20 | 21 | logger.info('Done writing an event. 
{}'.format(data['pipeline_key'])) 22 | ch.basic_ack(delivery_tag=method.delivery_tag) 23 | 24 | 25 | def process(data): 26 | key = '' 27 | try: 28 | key = data['pipeline_key'] 29 | logger.info('Got results for {}'.format(key)) 30 | 31 | root = '/src/data/' 32 | now = datetime.datetime.utcnow().strftime('%Y/%m/%d') 33 | path = os.path.join(root, now) 34 | 35 | if not os.path.exists(path): 36 | os.makedirs(path) 37 | 38 | fname = '{}.json'.format(key) 39 | with open(os.path.join(path, fname), 'w') as f: 40 | f.write(json.dumps(data)) 41 | except: 42 | # If something goes wrong, log it and return nothing 43 | logger.exception('Failed to write results for {}'.format(key)) 44 | # Make sure to update this line if you change the variable names 45 | 46 | 47 | def main(): 48 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 49 | rabbit_consume.receive(callback) 50 | 51 | 52 | if __name__ == '__main__': 53 | logger.info('Running...') 54 | main() 55 | -------------------------------------------------------------------------------- /collector/requirements.txt: -------------------------------------------------------------------------------- 1 | pika 2 | -------------------------------------------------------------------------------- /collector/utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | 4 | 5 | class RabbitClient(object): 6 | def __init__(self, queue, host='localhost'): 7 | self.queue = queue 8 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 9 | host=host)) 10 | 11 | self.channel = self.connection.channel() 12 | 13 | self.channel.queue_declare(queue=self.queue, durable=True) 14 | 15 | def send(self, n, routing): 16 | self.channel.basic_publish(exchange='', 17 | routing_key=routing, 18 | properties=pika.BasicProperties( 19 | delivery_mode=2,), 20 | body=json.dumps(n)) 21 | 22 | def receive(self, callback): 23 | self.channel.basic_qos(prefetch_count=1) 24 | self.channel.basic_consume(callback, queue=self.queue) 25 | self.channel.start_consuming() 26 | -------------------------------------------------------------------------------- /collector/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 
27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | networks: 3 | miner: 4 | driver: bridge 5 | services: 6 | miner: 7 | image: miner 8 | build: ./miner 9 | depends_on: 10 | - rabbitmq 11 | ports: 12 | - "6000:6000" 13 | environment: 14 | - PUBLISH=ingest 15 | networks: 16 | - miner 17 | mitie: 18 | image: mitie 19 | build: ./mitie 20 | depends_on: 21 | - rabbitmq 22 | environment: 23 | - CONSUME=ingest 24 | - PUBLISH=mitie 25 | networks: 26 | - miner 27 | predpatt: 28 | image: predpatt 29 | build: ./predpatt 30 | depends_on: 31 | - rabbitmq 32 | environment: 33 | - CONSUME=mitie 34 | - PUBLISH=predpatt 35 | networks: 36 | - miner 37 | relevancy: 38 | image: relevancy 39 | build: ./relevancy 40 | depends_on: 41 | - rabbitmq 42 | environment: 43 | - CONSUME=predpatt 44 | - PUBLISH=relevancy 45 | networks: 46 | - miner 47 | quad: 48 | image: quad 49 | build: ./quad 50 | depends_on: 51 | - rabbitmq 52 | environment: 53 | - CONSUME=relevancy 54 | - PUBLISH=quad 55 | networks: 56 | - miner 57 | collector: 58 | image: collector 59 | build: ./collector 60 | depends_on: 61 | - rabbitmq 62 | environment: 63 | - CONSUME=actors 64 | networks: 65 | - miner 66 | volumes: 67 | - ./data:/src/data 68 | rabbitmq: 69 | image: rabbitmq:alpine 70 | expose: 71 | - "5672" 72 | networks: 73 | - miner 74 | -------------------------------------------------------------------------------- /hypnos/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | *.swp 4 | .ropeproject 5 | -------------------------------------------------------------------------------- /hypnos/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7-alpine 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apk add --no-cache git wget unzip 6 | 7 | ADD . 
/src 8 | 9 | RUN cd /src; pip install -r requirements.txt 10 | 11 | CMD ["/src/wait-for", "ccnlp:5000", "-t", "60", "--", "python", "/src/app.py"] 12 | -------------------------------------------------------------------------------- /hypnos/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Caerus Associates 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /hypnos/README.md: -------------------------------------------------------------------------------- 1 | hypnos 2 | ====== 3 | 4 | A RESTful API around the [PETRARCH](https://github.com/openeventdata/petrarch) 5 | event data coder. Using `docker compose`, this setup also integrates the 6 | Stanford [CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml) parser 7 | using Casey Hilland's [docker container](https://github.com/chilland/ccNLP). 8 | This setup allows the users to stream texts into the API, rather than the 9 | batch mode seen in applications such as the [Phoenix pipeline](https://github.com/openeventdata/phoenix_pipeline). 10 | 11 | Running 12 | ------- 13 | 14 | Running the system is as simple as using 15 | 16 | `docker-compose up` 17 | 18 | or 19 | 20 | `docker-compose up -d` 21 | 22 | to run in the background. 23 | 24 | This assumes that you have `docker-compose` and `docker` installed. 
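To confirm the containers came up cleanly, the usual `docker-compose` commands work here as well; the service names are those defined in this directory's `docker-compose.yml`:

```
docker-compose ps              # hypnos, ccnlp, and petrarch should all show as "Up"
docker-compose logs -f hypnos  # hypnos waits for ccnlp on port 5000 (up to 60s) before starting its consumer
```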
25 | 26 | Usage 27 | ----- 28 | 29 | ``` 30 | headers = {'Content-Type': 'application/json'} 31 | data = {'text': "At least 37 people are dead after Islamist radical group Boko 32 | Haram assaulted a town in northeastern Nigeria.", 'id': 'abc123', 'date': 33 | '20010101'} 34 | data = json.dumps(data) 35 | r = requests.get('http://localhost:5002/hypnos/extract', data=data, 36 | headers=headers) 37 | r.json() 38 | ``` 39 | 40 | Returns: 41 | 42 | ``` 43 | {u'abc123': {u'meta': {u'date': u'20010101'}, 44 | u'sents': {u'0': {u'content': u'At least 37 people are dead after Islamist 45 | radical group Boko Haram assaulted a town in northeastern Nigeria .', 46 | u'events': [[u'NGAREBMUS', u'NGA', u'190']], 47 | u'issues': [[u'ID_EXTREMISM', 1], [u'NAMED_TERROR_GROUP', 1]], 48 | u'parsed': u'(ROOT (S (NP (QP (IN AT ) (JJS LEAST ) (CD 37 ) ) 49 | (NNS PEOPLE ) ) (VP (VBP ARE ) (ADJP (JJ DEAD ) ) (SBAR (IN AFTER 50 | ) (S (NP (JJ ISLAMIST ) (JJ RADICAL ) (NN GROUP ) (NNP BOKO ) 51 | (NNP HARAM ) ) (VP (VBD ASSAULTED ) (NP (NP (DT A ) (NN TOWN ) ) 52 | (PP (IN IN ) (NP (JJ NORTHEASTERN ) (NNP NIGERIA ) ) ) ) ) ) ) ) 53 | (. . ) ) )'}}}} 54 | ``` 55 | -------------------------------------------------------------------------------- /hypnos/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import utils 4 | import logging 5 | import requests 6 | import datetime 7 | from copy import deepcopy 8 | from dateutil.parser import parse 9 | 10 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.INFO) 13 | 14 | cwd = os.path.abspath(os.path.dirname(__file__)) 15 | CONSUME = os.getenv('CONSUME') 16 | PUBLISH = os.getenv('PUBLISH') 17 | 18 | 19 | def callback(ch, method, properties, body): 20 | data = json.loads(body) 21 | logger.info('Started processing content. {}'.format(data['pipeline_key'])) 22 | 23 | extract(data) 24 | 25 | logger.info('Finished PETR extracting. {}'.format(data['pipeline_key'])) 26 | ch.basic_ack(delivery_tag=method.delivery_tag) 27 | 28 | 29 | def extract(message): 30 | rabbit_publish = utils.RabbitClient(queue=PUBLISH, 31 | host='rabbitmq') 32 | 33 | story = message 34 | 35 | keys = story['event_info'].keys() 36 | #keys = [k for k in keys if k != 'predicted_relevancy'] 37 | for val in keys: 38 | logger.info('Processing {}'.format(val)) 39 | text = story['event_info'][val]['sent']['text'] 40 | text = text.encode('utf-8') 41 | 42 | event_dict = send_to_corenlp(story, text) 43 | 44 | try: 45 | events_r = send_to_petr(event_dict) 46 | except Exception as e: 47 | logger.info('There was an exception with PETR. 
{}\n'.format(e)) 48 | events_r = {} 49 | try: 50 | # event_updated = process_results(events_r.json()) 51 | event_updated = events_r.json() 52 | 53 | story['event_info'][val]['coded'] = [] 54 | for e in event_updated: 55 | if e: 56 | story['event_info'][val]['coded'].append(e) 57 | else: 58 | pass 59 | 60 | #logger.info(json.dumps(story)) 61 | except: 62 | logger.exception('Something went wrong in the formatting.') 63 | logger.info(json.dumps(events_r.json())) 64 | 65 | rabbit_publish.send(story, PUBLISH) 66 | 67 | 68 | def send_to_petr(event_dict): 69 | headers = {'Content-Type': 'application/json'} 70 | 71 | events_data = json.dumps({'events': event_dict}) 72 | petr_url = 'http://petrarch:5001/petrarch/code' 73 | events_r = requests.post(petr_url, data=events_data, headers=headers) 74 | 75 | return events_r 76 | 77 | 78 | def send_to_corenlp(story, text): 79 | storyid = story['pipeline_key'] 80 | date = datetime.datetime.utcnow().strftime('%Y%m%d') # set a default 81 | try: 82 | date = parse(story['date']).strftime('%Y%m%d') 83 | except KeyError: 84 | logger.info('No date found') 85 | except ValueError: 86 | logger.info('Unable to parse date') 87 | 88 | headers = {'Content-Type': 'application/json'} 89 | core_data = json.dumps({'text': text}) 90 | ccnlp_url = 'http://ccnlp:5000/process' 91 | r = requests.post(ccnlp_url, data=core_data, headers=headers) 92 | out = r.json() 93 | 94 | event_dict = process_corenlp(out, date, storyid) 95 | 96 | return event_dict 97 | 98 | 99 | def process_corenlp(output, date, STORYID): 100 | event_dict = {STORYID: {}} 101 | event_dict[STORYID]['sents'] = {} 102 | event_dict[STORYID]['meta'] = {} 103 | event_dict[STORYID]['meta']['date'] = date 104 | for i, sent in enumerate(output['sentences']): 105 | sents = output['sentences'] 106 | event_dict[STORYID]['sents'][str(i)] = {} 107 | event_dict[STORYID]['sents'][str(i)]['content'] = ' '.join(sents[i]['tokens']) 108 | event_dict[STORYID]['sents'][str(i)]['parsed'] = sents[i]['parse'].upper().replace(')', ' )') 109 | 110 | return event_dict 111 | 112 | 113 | def process_results(event_dict): 114 | new_event_dict = deepcopy(event_dict) 115 | for s_id in event_dict: 116 | sents = event_dict[s_id]['sents'] 117 | for sent in sents: 118 | if 'events' not in sents[sent].keys(): 119 | del new_event_dict[s_id]['sents'][sent] 120 | else: 121 | del new_event_dict[s_id]['sents'][sent]['parsed'] 122 | if 'issues' not in sents[sent].keys(): 123 | sents[sent]['issues'] = [] 124 | 125 | return new_event_dict 126 | 127 | 128 | def main(): 129 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 130 | rabbit_consume.receive(callback) 131 | 132 | 133 | if __name__ == '__main__': 134 | logger.info('Running...') 135 | main() 136 | -------------------------------------------------------------------------------- /hypnos/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | networks: 3 | eventminer_miner: 4 | external: true 5 | services: 6 | hypnos: 7 | image: hypnos 8 | build: . 9 | environment: 10 | - CONSUME=quad 11 | - PUBLISH=actors 12 | networks: 13 | - eventminer_miner 14 | ccnlp: 15 | image: caerusassociates/ccnlp:1.0.0 16 | networks: 17 | - eventminer_miner 18 | petrarch: 19 | build: petrarch/. 
20 | networks: 21 | - eventminer_miner 22 | -------------------------------------------------------------------------------- /hypnos/petrarch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7-alpine 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apk add --no-cache git 6 | 7 | RUN pip install git+https://github.com/openeventdata/petrarch2.git 8 | 9 | ADD . /src 10 | 11 | RUN cd /src; pip install -r requirements.txt 12 | 13 | EXPOSE 5001 14 | 15 | CMD ["python", "/src/petrarch_app.py"] 16 | -------------------------------------------------------------------------------- /hypnos/petrarch/petrarch_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from petrarch2 import petrarch2 4 | from tornado.ioloop import IOLoop 5 | from tornado.wsgi import WSGIContainer 6 | from tornado.httpserver import HTTPServer 7 | from flask import Flask, jsonify, make_response 8 | from flask.ext.restful import Api, Resource, reqparse 9 | 10 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.INFO) 13 | 14 | app = Flask(__name__) 15 | api = Api(app) 16 | 17 | cwd = os.path.abspath(os.path.dirname(__file__)) 18 | 19 | 20 | @app.errorhandler(400) 21 | def bad_request(error): 22 | return make_response(jsonify({'error': 'Bad request'}), 400) 23 | 24 | 25 | @app.errorhandler(404) 26 | def not_found(error): 27 | return make_response(jsonify({'error': 'Not found'}), 404) 28 | 29 | 30 | class CodeAPI(Resource): 31 | def __init__(self): 32 | self.reqparse = reqparse.RequestParser() 33 | self.reqparse.add_argument('events', type=dict) 34 | super(CodeAPI, self).__init__() 35 | 36 | def post(self): 37 | args = self.reqparse.parse_args() 38 | event_dict = args['events'] 39 | to_return = [] 40 | 41 | try: 42 | event_dict_updated = petrarch2.do_coding(event_dict) 43 | k = event_dict_updated.keys()[0] 44 | try: 45 | to_return = event_dict_updated[k]['sents']['0']['events'] 46 | except KeyError: 47 | logger.info('No events to process') 48 | except: 49 | logger.exception("An error occured") 50 | except: 51 | logger.exception("An error occurred") 52 | 53 | return to_return 54 | 55 | 56 | api.add_resource(CodeAPI, '/petrarch/code') 57 | 58 | if __name__ == '__main__': 59 | config = petrarch2.utilities._get_data('data/config/', 'PETR_config.ini') 60 | logger.info("reading config") 61 | petrarch2.PETRreader.parse_Config(config) 62 | logger.info("reading dicts") 63 | petrarch2.read_dictionaries() 64 | 65 | http_server = HTTPServer(WSGIContainer(app)) 66 | http_server.listen(5001) 67 | IOLoop.instance().start() 68 | -------------------------------------------------------------------------------- /hypnos/petrarch/requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.4.3 2 | Flask==0.10.1 3 | Flask-RESTful==0.3.3 4 | Flask-HTTPAuth==2.5.0 5 | itsdangerous==0.24 6 | Jinja2==2.7.3 7 | MarkupSafe==0.23 8 | tornado==4.2 9 | simplejson==3.6.5 10 | Werkzeug==0.10.4 11 | -------------------------------------------------------------------------------- /hypnos/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==2.6.3 2 | requests==2.4.3 3 | Flask==0.10.1 4 | Flask-RESTful==0.3.3 5 | Flask-HTTPAuth==2.5.0 6 | itsdangerous==0.24 7 | Jinja2==2.7.3 8 | MarkupSafe==0.23 9 | tornado==4.2 10 | 
simplejson==3.6.5 11 | Werkzeug==0.10.4 12 | python-dateutil 13 | pika 14 | -------------------------------------------------------------------------------- /hypnos/utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | 4 | 5 | class RabbitClient(object): 6 | def __init__(self, queue, host='localhost'): 7 | self.queue = queue 8 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 9 | host=host)) 10 | 11 | self.channel = self.connection.channel() 12 | 13 | self.channel.queue_declare(queue=self.queue, durable=True) 14 | 15 | def send(self, n, routing): 16 | self.channel.basic_publish(exchange='', 17 | routing_key=routing, 18 | properties=pika.BasicProperties( 19 | delivery_mode=2,), 20 | body=json.dumps(n)) 21 | 22 | def receive(self, callback): 23 | self.channel.basic_qos(prefetch_count=1) 24 | self.channel.basic_consume(callback, queue=self.queue) 25 | self.channel.start_consuming() 26 | -------------------------------------------------------------------------------- /hypnos/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /miner/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7-alpine 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apk add --no-cache git wget unzip 6 | 7 | RUN mkdir /src 8 | 9 | RUN mkdir -p /src/nltk_data/tokenizers 10 | RUN wget https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip -O /src/nltk_data/tokenizers/punkt.zip 11 | RUN unzip /src/nltk_data/tokenizers/punkt.zip; rm -rf /src/nltk_data/tokenizers/punkt.zip 12 | RUN mv punkt /src/nltk_data/tokenizers 13 | ENV NLTK_DATA=/src/nltk_data 14 | 15 | ADD . 
/src 16 | 17 | RUN cd /src; pip install -r requirements.txt 18 | 19 | EXPOSE 6000 20 | 21 | CMD ["/src/wait-for", "rabbitmq:5672", "-t", "90", "--", "python", "/src/app.py"] 22 | -------------------------------------------------------------------------------- /miner/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | import utils 4 | import logging 5 | 6 | from tornado.ioloop import IOLoop 7 | from tornado.wsgi import WSGIContainer 8 | from tornado.httpserver import HTTPServer 9 | from flask import Flask, jsonify, make_response 10 | from flask.ext.restful import Api, Resource, reqparse 11 | 12 | 13 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 14 | logger = logging.getLogger(__name__) 15 | logger.setLevel(logging.INFO) 16 | 17 | app = Flask(__name__) 18 | api = Api(app) 19 | 20 | cwd = os.path.abspath(os.path.dirname(__file__)) 21 | 22 | PUBLISH = os.getenv('PUBLISH') 23 | 24 | 25 | @app.errorhandler(400) 26 | def bad_request(error): 27 | return make_response(jsonify({'error': 'Bad request'}), 400) 28 | 29 | 30 | @app.errorhandler(404) 31 | def not_found(error): 32 | return make_response(jsonify({'error': 'Not found'}), 404) 33 | 34 | 35 | class MinerAPI(Resource): 36 | def __init__(self): 37 | self.reqparse = reqparse.RequestParser() 38 | self.reqparse.add_argument('data', type=dict, location='json') 39 | super(MinerAPI, self).__init__() 40 | 41 | def post(self): 42 | args = self.reqparse.parse_args() 43 | 44 | rabbit = utils.RabbitClient(queue=PUBLISH, host='rabbitmq') 45 | 46 | logger.info('Received data...') 47 | data = utils.prep_data(args['data']) 48 | key = hashlib.sha1(''.join(data['sents'])).hexdigest() 49 | data['pipeline_key'] = key 50 | 51 | logger.info('Sending downstream with key {}...'.format(key)) 52 | rabbit.send(data, PUBLISH) 53 | 54 | logging.info('Sent {}'.format(key)) 55 | return key 56 | 57 | 58 | api.add_resource(MinerAPI, '/EventMiner') 59 | 60 | 61 | if __name__ == '__main__': 62 | http_server = HTTPServer(WSGIContainer(app)) 63 | http_server.listen(6000) 64 | IOLoop.instance().start() 65 | -------------------------------------------------------------------------------- /miner/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==2.6.3 2 | Flask==0.10.1 3 | Flask-RESTful==0.3.3 4 | Flask-HTTPAuth==2.5.0 5 | itsdangerous==0.24 6 | Jinja2==2.7.3 7 | MarkupSafe==0.23 8 | tornado==4.2 9 | simplejson==3.6.5 10 | Werkzeug==0.10.4 11 | pika 12 | nltk 13 | -------------------------------------------------------------------------------- /miner/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | import pika 4 | import nltk.data 5 | 6 | 7 | class RabbitClient(object): 8 | def __init__(self, queue, host='localhost'): 9 | self.queue = queue 10 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 11 | host=host)) 12 | 13 | self.channel = self.connection.channel() 14 | 15 | self.channel.queue_declare(queue=self.queue, durable=True) 16 | 17 | def send(self, n, routing): 18 | self.channel.basic_publish(exchange='', 19 | routing_key=routing, 20 | properties=pika.BasicProperties( 21 | delivery_mode=2,), 22 | body=json.dumps(n)) 23 | 24 | def receive(self, callback): 25 | self.channel.basic_qos(prefetch_count=1) 26 | self.channel.basic_consume(callback, queue=self.queue) 27 | self.channel.start_consuming() 28 | 29 | 
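# Illustrative example of what prep_data returns (the UUID keys and text below
# are made-up values): it sentence-tokenizes data['content'], keeps only the
# first two sentences, and stores them under 'sents', e.g.
#   data['sents'] == {'6f1b...': {'text': 'This is the content.'},
#                     '9c2d...': {'text': 'Rebels attacked Aleppo.'}}
# Downstream services (mitie, predpatt, quad) iterate over this dict by key.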
30 | def prep_data(data): 31 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 32 | sents = sent_detector.tokenize(data['content'].strip()) 33 | sent_dict = {str(uuid.uuid4()): {'text': x} for x in sents[:2]} 34 | data['sents'] = sent_dict 35 | 36 | return data 37 | -------------------------------------------------------------------------------- /miner/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /mitie/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7-alpine 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apk add --no-cache git build-base curl 6 | 7 | RUN mkdir /src 8 | ADD . /src 9 | 10 | WORKDIR /src 11 | 12 | RUN curl -LO https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2 13 | RUN tar -xzjf MITIE-models-v0.2.tar.bz2; rm -rf MITIE-models-v0.2.tar.bz2 14 | 15 | RUN pip install -r requirements.txt 16 | 17 | EXPOSE 6000 18 | 19 | CMD ["/src/wait-for", "rabbitmq:5672", "--", "python", "/src/app.py"] 20 | -------------------------------------------------------------------------------- /mitie/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import utils 4 | import logging 5 | 6 | from mitie import * 7 | 8 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 9 | logger = logging.getLogger(__name__) 10 | logger.setLevel(logging.INFO) 11 | 12 | 13 | NER = named_entity_extractor('MITIE-models/english/ner_model.dat') 14 | 15 | CONSUME = os.getenv('CONSUME') 16 | PUBLISH = os.getenv('PUBLISH') 17 | 18 | 19 | def callback(ch, method, properties, body): 20 | data = json.loads(body) 21 | logger.info('Started processing content. {}'.format(data['pipeline_key'])) 22 | 23 | process(data) 24 | 25 | logger.info('Finished NER tagging. 
{}'.format(data['pipeline_key'])) 26 | ch.basic_ack(delivery_tag=method.delivery_tag) 27 | 28 | 29 | def process(data): 30 | rabbit_publish = utils.RabbitClient(queue=PUBLISH, 31 | host='rabbitmq') 32 | data['ner_info'] = {} 33 | for sid, sent in data['sents'].iteritems(): 34 | try: 35 | print(sent) 36 | tokens = tokenize(sent['text']) 37 | entities = NER.extract_entities(tokens) 38 | 39 | new_ents = [] 40 | for e in entities: 41 | #MITIE returns xrange iters. Convert to tuples of ints 42 | r = (e[0].__reduce__()[1][0], 43 | e[0].__reduce__()[1][1]) 44 | tag = e[1] 45 | score = e[2] 46 | new_ents.append((r, tag, score)) 47 | data['sents'][sid]['tokens'] = tokens 48 | data['ner_info'][sid] = new_ents 49 | except Exception as e: 50 | # If something goes wrong, log it and return nothing 51 | logger.info(e) 52 | # Make sure to update this line if you change the variable names 53 | 54 | logger.info('Finished processing content.') 55 | 56 | rabbit_publish.send(data, PUBLISH) 57 | 58 | 59 | def main(): 60 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 61 | rabbit_consume.receive(callback) 62 | 63 | 64 | if __name__ == '__main__': 65 | logger.info('Running...') 66 | main() 67 | -------------------------------------------------------------------------------- /mitie/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/mit-nlp/MITIE.git 2 | pika -------------------------------------------------------------------------------- /mitie/utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | 4 | 5 | class RabbitClient(object): 6 | def __init__(self, queue, host='localhost'): 7 | self.queue = queue 8 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 9 | host=host)) 10 | 11 | self.channel = self.connection.channel() 12 | 13 | self.channel.queue_declare(queue=self.queue, durable=True) 14 | 15 | def send(self, n, routing): 16 | self.channel.basic_publish(exchange='', 17 | routing_key=routing, 18 | properties=pika.BasicProperties( 19 | delivery_mode=2,), 20 | body=json.dumps(n)) 21 | 22 | def receive(self, callback): 23 | self.channel.basic_qos(prefetch_count=1) 24 | self.channel.basic_consume(callback, queue=self.queue) 25 | self.channel.start_consuming() 26 | -------------------------------------------------------------------------------- /mitie/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 
27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /predpatt/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/syntaxnet 2 | #If they ever update this image things will likely break 3 | 4 | RUN apt-get install -y netcat 5 | 6 | #y_tho.gif 7 | ENV PYTHONPATH="${PYTHONPATH}:/opt/tensorflow/syntaxnet/bazel-bin/dragnn/tools/oss_notebook_launcher.runfiles/__main__:/opt/tensorflow/syntaxnet/bazel-bin/dragnn/tools/oss_notebook_launcher.runfiles/org_tensorflow" 8 | 9 | RUN mkdir /src 10 | ADD . /src 11 | RUN pip install -r /src/requirements.txt 12 | 13 | CMD ["/src/wait-for", "rabbitmq:5672", "-t", "30", "--", "python", "/src/app.py"] 14 | -------------------------------------------------------------------------------- /predpatt/ParseyPredFace.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import os 4 | import tensorflow as tf 5 | from dragnn.protos import spec_pb2 6 | from dragnn.python import graph_builder 7 | from dragnn.python import spec_builder 8 | from dragnn.python import load_dragnn_cc_impl # This loads the actual op definitions 9 | from dragnn.python import render_parse_tree_graphviz 10 | from dragnn.python import visualization 11 | from google.protobuf import text_format 12 | from syntaxnet import load_parser_ops # This loads the actual op definitions 13 | from syntaxnet import sentence_pb2 14 | from syntaxnet.ops import gen_parser_ops 15 | from tensorflow.python.platform import tf_logging as logging 16 | 17 | from predpatt import PredPatt 18 | from predpatt import load_conllu 19 | from predpatt import PredPattOpts 20 | from predpatt.util.ud import dep_v2 21 | 22 | 23 | def load_model(base_dir, master_spec_name, checkpoint_name): 24 | """ 25 | Function to load the syntaxnet models. Highly specific to the tutorial 26 | format right now. 27 | """ 28 | # Read the master spec 29 | master_spec = spec_pb2.MasterSpec() 30 | with open(os.path.join(base_dir, master_spec_name), "r") as f: 31 | text_format.Merge(f.read(), master_spec) 32 | spec_builder.complete_master_spec(master_spec, None, base_dir) 33 | logging.set_verbosity(logging.WARN) # Turn off TensorFlow spam. 34 | 35 | # Initialize a graph 36 | graph = tf.Graph() 37 | with graph.as_default(): 38 | hyperparam_config = spec_pb2.GridPoint() 39 | builder = graph_builder.MasterBuilder(master_spec, hyperparam_config) 40 | # This is the component that will annotate test sentences. 
41 | annotator = builder.add_annotation(enable_tracing=True) 42 | builder.add_saver() # "Savers" can save and load models; here, we're only going to load. 43 | 44 | sess = tf.Session(graph=graph) 45 | with graph.as_default(): 46 | #sess.run(tf.global_variables_initializer()) 47 | #sess.run('save/restore_all', {'save/Const:0': os.path.join(base_dir, checkpoint_name)}) 48 | builder.saver.restore(sess, os.path.join(base_dir, checkpoint_name)) 49 | 50 | def annotate_sentence(sentence): 51 | with graph.as_default(): 52 | return sess.run([annotator['annotations'], annotator['traces']], 53 | feed_dict={annotator['input_batch']: [sentence]}) 54 | return annotate_sentence 55 | 56 | 57 | def annotate_text(text): 58 | """ 59 | Segment and parse input text using syntaxnet models. 60 | """ 61 | sentence = sentence_pb2.Sentence( 62 | text=text, 63 | token=[sentence_pb2.Token(word=text, start=-1, end=-1)] 64 | ) 65 | 66 | # preprocess 67 | with tf.Session(graph=tf.Graph()) as tmp_session: 68 | char_input = gen_parser_ops.char_token_generator([sentence.SerializeToString()]) 69 | preprocessed = tmp_session.run(char_input)[0] 70 | segmented, _ = SEGMENTER_MODEL(preprocessed) 71 | 72 | annotations, traces = PARSER_MODEL(segmented[0]) 73 | assert len(annotations) == 1 74 | assert len(traces) == 1 75 | return sentence_pb2.Sentence.FromString(annotations[0]), traces[0] 76 | 77 | 78 | def parse_to_conll(parse_tree): 79 | """ 80 | Convert from the syntaxnet output format to a CoNLL-U format. 81 | """ 82 | out_str = '' 83 | for i, token in enumerate(parse_tree.token, 1): 84 | if token.head == -1: 85 | head = 0 86 | else: 87 | head = token.head + 1 88 | pos1, pos2 = token.tag.split('attribute')[-1].split('value')[-1].replace(': "', '').replace('" } ', '').split('++') 89 | out_str += '{}\t{}\t-\t{}\t{}\t-\t{}\t{}\t-\t-\n'.format(i, token.word, pos1, pos2, head, token.label) 90 | 91 | return out_str 92 | 93 | 94 | def get_ud_fragments(pp): 95 | """ 96 | Extract PP fragments from a UD parse. 97 | 98 | Format of fragments is (governor_text, governor_position, relation, 99 | token_text, token_position) 100 | """ 101 | pred_deps = [] 102 | arg2deps = {} 103 | for predicate in pp.instances: 104 | # Get dep parses for the predicate. 105 | for token in predicate.tokens: 106 | # (head, relation, dependent) 107 | if token.gov: 108 | dep = (token.gov.text, token.gov.position, token.gov_rel, 109 | token.text, token.position) 110 | else: 111 | dep = (None, None, token.gov_rel, token.text, token.position) 112 | pred_deps.append(dep) 113 | 114 | # Get dep parses for the arguments. 115 | for argument in predicate.arguments: 116 | arg_deps = [] 117 | for token in argument.tokens: 118 | if token.gov: 119 | dep = (token.gov.text, token.gov.position, token.gov_rel, 120 | token.text, token.position) 121 | else: 122 | dep = (None, None, token.gov_rel, token.text, 123 | token.position) 124 | arg_deps.append(dep) 125 | arg2deps[argument.position] = arg_deps 126 | return pred_deps, arg2deps 127 | 128 | 129 | path = '/opt/tensorflow/syntaxnet/examples/dragnn/data' 130 | SEGMENTER_MODEL = load_model(os.path.join(path, "en/segmenter"), 131 | "spec.textproto", "checkpoint") 132 | PARSER_MODEL = load_model(os.path.join(path, 'en'), 133 | "parser_spec.textproto", "checkpoint") 134 | def parse(text): 135 | """ 136 | Primary function to run syntaxnet and PredPatt over input sentences. 
137 | """ 138 | parse_tree, trace = annotate_text(text) 139 | conll_parsed = parse_to_conll(parse_tree) 140 | 141 | conll_pp = [ud_parse for sent_id, ud_parse in load_conllu(conll_parsed)][0] 142 | 143 | #PredPatt options. Modify as needed. 144 | resolve_relcl = True # relative clauses 145 | resolve_appos = True # appositional modifiers 146 | resolve_amod = True # adjectival modifiers 147 | resolve_conj = True # conjuction 148 | resolve_poss = True # possessives 149 | ud = dep_v2.VERSION # the version of UD 150 | opts = PredPattOpts(resolve_relcl=resolve_relcl, 151 | resolve_appos=resolve_appos, 152 | resolve_amod=resolve_amod, 153 | resolve_conj=resolve_conj, 154 | resolve_poss=resolve_poss, 155 | ud=ud) 156 | ppatt = PredPatt(conll_pp, opts=opts) 157 | predicate_deps, arg_deps = get_ud_fragments(ppatt) 158 | 159 | #NOTE: 160 | #This returns the pretty print formatted string from PredPatt. This is done 161 | #largely as a place holder for JSON compatability within the REST API. 162 | return {'predpatt': {'predicate_deps': predicate_deps, 163 | 'arg_deps': arg_deps}, 164 | 'conll': conll_parsed, 165 | 'original': text} 166 | -------------------------------------------------------------------------------- /predpatt/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import utils 4 | import logging 5 | 6 | import ParseyPredFace 7 | 8 | 9 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 10 | logger = logging.getLogger(__name__) 11 | logger.setLevel(logging.INFO) 12 | 13 | 14 | CONSUME = os.getenv('CONSUME') 15 | PUBLISH = os.getenv('PUBLISH') 16 | 17 | 18 | def callback(ch, method, properties, body): 19 | data = json.loads(body) 20 | logger.info('Started processing content. {}'.format(data['pipeline_key'])) 21 | 22 | process(data) 23 | 24 | logger.info('Finished PP extraction. 
{}'.format(data['pipeline_key'])) 25 | ch.basic_ack(delivery_tag=method.delivery_tag) 26 | 27 | 28 | def process(data): 29 | rabbit_publish = utils.RabbitClient(queue=PUBLISH, 30 | host='rabbitmq') 31 | data['predicate_info'] = {} 32 | for sid, sent in data['sents'].iteritems(): 33 | try: 34 | output = ParseyPredFace.parse(sent['text'].encode('utf-8')) 35 | 36 | data['predicate_info'][sid] = output 37 | except Exception as e: 38 | # If something goes wrong, log it and return nothing 39 | logger.info(e) 40 | # Make sure to update this line if you change the variable names 41 | 42 | logger.info('Finished processing content.') 43 | 44 | rabbit_publish.send(data, PUBLISH) 45 | 46 | 47 | def main(): 48 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 49 | rabbit_consume.receive(callback) 50 | 51 | 52 | if __name__ == '__main__': 53 | logger.info('Running...') 54 | main() 55 | -------------------------------------------------------------------------------- /predpatt/requirements.txt: -------------------------------------------------------------------------------- 1 | pika 2 | git+https://github.com/hltcoe/PredPatt.git 3 | -------------------------------------------------------------------------------- /predpatt/utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | 4 | 5 | class RabbitClient(object): 6 | def __init__(self, queue, host='localhost'): 7 | self.queue = queue 8 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 9 | host=host)) 10 | 11 | self.channel = self.connection.channel() 12 | 13 | self.channel.queue_declare(queue=self.queue, durable=True) 14 | 15 | def send(self, n, routing): 16 | self.channel.basic_publish(exchange='', 17 | routing_key=routing, 18 | properties=pika.BasicProperties( 19 | delivery_mode=2,), 20 | body=json.dumps(n)) 21 | 22 | def receive(self, callback): 23 | self.channel.basic_qos(prefetch_count=1) 24 | self.channel.basic_consume(callback, queue=self.queue) 25 | self.channel.start_consuming() 26 | -------------------------------------------------------------------------------- /predpatt/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 
27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /quad/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda 2 | 3 | MAINTAINER John Beieler 4 | 5 | #RUN sed -i "s/httpredir.debian.org/`curl -s -D -http://httpredir.debian.org/demo/debian/ | awk '/^Link:/ { print $2 }' | sed -e 's@;@\1@g'`/" /etc/apt/sources.list 6 | RUN apt-get clean && apt-get update 7 | RUN apt-get install -y build-essential python-dev netcat 8 | 9 | ADD . /src 10 | RUN cd /src; pip install -r requirements.txt 11 | 12 | RUN chmod -x /src/launch.sh 13 | CMD sh /src/launch.sh 14 | -------------------------------------------------------------------------------- /quad/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import utils 5 | 6 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 7 | logger = logging.getLogger(__name__) 8 | logger.setLevel(logging.INFO) 9 | 10 | 11 | MAXLEN = 1400 12 | CONSUME = os.getenv('CONSUME') 13 | PUBLISH = os.getenv('PUBLISH') 14 | 15 | 16 | def callback(ch, method, properties, body): 17 | global MODEL, VOCAB, VOCAB_SIZE, CHECK 18 | 19 | data = json.loads(body) 20 | key = data['pipeline_key'] 21 | 22 | data['event_info'] = {} 23 | for sid, sent in data['sents'].iteritems(): 24 | data['event_info'][sid] = {} 25 | data['event_info'][sid]['predicted_class'] = {} 26 | data['event_info'][sid]['sent'] = sent 27 | 28 | if data['predicted_relevancy'] == 1: 29 | logger.info('Started processing content. {}'.format(key)) 30 | data = process(data, MODEL, VOCAB, VOCAB_SIZE, CHECK) 31 | logger.info('Finished quad tagging. {}'.format(key)) 32 | else: 33 | logger.info('Irrelevant content. 
{}'.format(key)) 34 | 35 | publish(data) 36 | 37 | ch.basic_ack(delivery_tag=method.delivery_tag) 38 | 39 | 40 | def process(data, model, vocab, vocab_size, check): 41 | sents = data['sents'] 42 | key = data['pipeline_key'] 43 | for sid, sent in sents.iteritems(): 44 | try: 45 | logger.info('Processing sent {} for content {}'.format(sid, key)) 46 | mat = utils.encode_data( 47 | [sent['text']], MAXLEN, vocab, vocab_size, check, 48 | ) 49 | pred = model.predict(mat) 50 | pred_class = pred.argmax(1)[0] 51 | pred_score = pred[0][pred_class] 52 | data['event_info'][sid]['predicted_class'] = { 53 | 'class': pred_class, 'score': str(pred_score), 54 | } 55 | except: 56 | logger.exception('Error during quad processing of {}'.format(key)) 57 | return data 58 | 59 | 60 | def publish(data): 61 | client = utils.RabbitClient(queue=PUBLISH, host='rabbitmq') 62 | client.send(data, PUBLISH) 63 | 64 | 65 | def main(): 66 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 67 | rabbit_consume.receive(callback) 68 | 69 | 70 | if __name__ == '__main__': 71 | args = utils.parse_arguments() 72 | 73 | logger.info('Loading model...') 74 | MODEL, VOCAB = utils.load_model(args) 75 | VOCAB_SIZE = len(VOCAB.keys()) 76 | CHECK = set(VOCAB.keys()) 77 | 78 | main() 79 | -------------------------------------------------------------------------------- /quad/launch.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | echo "Starting up analytic service..." 3 | ./src/wait-for rabbitmq:5672 -t 60 -- python /src/app.py \ 4 | -m /src/quad_trained/quad_character_model.json \ 5 | -w /src/quad_trained/quad_character_model_weights.h5 \ 6 | -v /src/quad_trained/quad_character_model_vocab.pkl 7 | -------------------------------------------------------------------------------- /quad/quad_trained/quad_character_model.json: -------------------------------------------------------------------------------- 1 | "{\"class_name\": \"Model\", \"config\": {\"layers\": [{\"class_name\": \"InputLayer\", \"config\": {\"batch_input_shape\": [null, 1400, 89], \"name\": \"input\", \"input_dtype\": \"float32\"}, \"inbound_nodes\": [], \"name\": \"input\"}, {\"class_name\": \"Convolution1D\", \"config\": {\"batch_input_shape\": [null, 1400, 89], \"W_constraint\": null, \"b_constraint\": null, \"name\": \"convolution1d_1\", \"activity_regularizer\": null, \"trainable\": true, \"filter_length\": 7, \"init\": \"uniform\", \"bias\": true, \"nb_filter\": 256, \"input_dtype\": \"float32\", \"subsample_length\": 1, \"border_mode\": \"valid\", \"input_dim\": null, \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"input_length\": null}, \"inbound_nodes\": [[[\"input\", 0, 0]]], \"name\": \"convolution1d_1\"}, {\"class_name\": \"MaxPooling1D\", \"config\": {\"stride\": 3, \"pool_length\": 3, \"trainable\": true, \"name\": \"maxpooling1d_1\", \"border_mode\": \"valid\"}, \"inbound_nodes\": [[[\"convolution1d_1\", 0, 0]]], \"name\": \"maxpooling1d_1\"}, {\"class_name\": \"Convolution1D\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"convolution1d_3\", \"activity_regularizer\": null, \"trainable\": true, \"filter_length\": 3, \"init\": \"uniform\", \"bias\": true, \"nb_filter\": 256, \"input_dim\": null, \"subsample_length\": 1, \"border_mode\": \"valid\", \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"input_length\": null}, \"inbound_nodes\": [[[\"maxpooling1d_1\", 0, 0]]], \"name\": \"convolution1d_3\"}, 
{\"class_name\": \"Convolution1D\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"convolution1d_4\", \"activity_regularizer\": null, \"trainable\": true, \"filter_length\": 3, \"init\": \"uniform\", \"bias\": true, \"nb_filter\": 256, \"input_dim\": null, \"subsample_length\": 1, \"border_mode\": \"valid\", \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"input_length\": null}, \"inbound_nodes\": [[[\"convolution1d_3\", 0, 0]]], \"name\": \"convolution1d_4\"}, {\"class_name\": \"Convolution1D\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"convolution1d_5\", \"activity_regularizer\": null, \"trainable\": true, \"filter_length\": 3, \"init\": \"uniform\", \"bias\": true, \"nb_filter\": 256, \"input_dim\": null, \"subsample_length\": 1, \"border_mode\": \"valid\", \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"input_length\": null}, \"inbound_nodes\": [[[\"convolution1d_4\", 0, 0]]], \"name\": \"convolution1d_5\"}, {\"class_name\": \"MaxPooling1D\", \"config\": {\"stride\": 3, \"pool_length\": 3, \"trainable\": true, \"name\": \"maxpooling1d_3\", \"border_mode\": \"valid\"}, \"inbound_nodes\": [[[\"convolution1d_5\", 0, 0]]], \"name\": \"maxpooling1d_3\"}, {\"class_name\": \"Flatten\", \"config\": {\"trainable\": true, \"name\": \"flatten_1\"}, \"inbound_nodes\": [[[\"maxpooling1d_3\", 0, 0]]], \"name\": \"flatten_1\"}, {\"class_name\": \"Dense\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"dense_1\", \"activity_regularizer\": null, \"trainable\": true, \"init\": \"glorot_uniform\", \"bias\": true, \"input_dim\": null, \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"output_dim\": 1024}, \"inbound_nodes\": [[[\"flatten_1\", 0, 0]]], \"name\": \"dense_1\"}, {\"class_name\": \"Dropout\", \"config\": {\"p\": 0.5, \"trainable\": true, \"name\": \"dropout_1\"}, \"inbound_nodes\": [[[\"dense_1\", 0, 0]]], \"name\": \"dropout_1\"}, {\"class_name\": \"Dense\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"dense_2\", \"activity_regularizer\": null, \"trainable\": true, \"init\": \"glorot_uniform\", \"bias\": true, \"input_dim\": null, \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"output_dim\": 1024}, \"inbound_nodes\": [[[\"dropout_1\", 0, 0]]], \"name\": \"dense_2\"}, {\"class_name\": \"Dropout\", \"config\": {\"p\": 0.5, \"trainable\": true, \"name\": \"dropout_2\"}, \"inbound_nodes\": [[[\"dense_2\", 0, 0]]], \"name\": \"dropout_2\"}, {\"class_name\": \"Dense\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"output\", \"activity_regularizer\": null, \"trainable\": true, \"init\": \"glorot_uniform\", \"bias\": true, \"input_dim\": null, \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"softmax\", \"output_dim\": 5}, \"inbound_nodes\": [[[\"dropout_2\", 0, 0]]], \"name\": \"output\"}], \"input_layers\": [[\"input\", 0, 0]], \"output_layers\": [[\"output\", 0, 0]], \"name\": \"model_1\"}}" -------------------------------------------------------------------------------- /quad/quad_trained/quad_character_model_vocab.pkl: -------------------------------------------------------------------------------- 1 | (dp1 2 | S' ' 3 | I0 4 | sS'$' 5 | I1 6 | sS',' 7 | I2 8 | sS'0' 9 | I3 10 | sS'4' 11 | I4 12 | sS'8' 13 | I5 14 | sS'@' 15 | I6 16 | sS'D' 17 | I7 18 | sS'H' 19 | I8 20 | sS'L' 21 | I9 22 | sS'P' 23 | I10 24 | sS'T' 25 | I11 26 | 
sS'X' 27 | I12 28 | sS'\\' 29 | I13 30 | sS'`' 31 | I14 32 | sS'd' 33 | I15 34 | sS'h' 35 | I16 36 | sS'l' 37 | I17 38 | sS'p' 39 | I18 40 | sS't' 41 | I19 42 | sS'x' 43 | I20 44 | sS'|' 45 | I21 46 | sS'\xa0' 47 | I22 48 | sS'#' 49 | I23 50 | sS"'" 51 | I24 52 | sS'+' 53 | I25 54 | sS'/' 55 | I26 56 | sS'3' 57 | I27 58 | sS'7' 59 | I28 60 | sS';' 61 | I29 62 | sS'?' 63 | I30 64 | sS'C' 65 | I31 66 | sS'\xc4' 67 | I32 68 | sS'G' 69 | I33 70 | sS'K' 71 | I34 72 | sS'O' 73 | I35 74 | sS'S' 75 | I36 76 | sS'W' 77 | I37 78 | sS'_' 79 | I38 80 | sS'c' 81 | I39 82 | sS'g' 83 | I40 84 | sS'k' 85 | I41 86 | sS'o' 87 | I42 88 | sS's' 89 | I43 90 | sS'w' 91 | I44 92 | sS'&' 93 | I45 94 | sS'*' 95 | I46 96 | sS'.' 97 | I47 98 | sS'\xb1' 99 | I48 100 | sS'2' 101 | I49 102 | sS'6' 103 | I50 104 | sS':' 105 | I51 106 | sS'>' 107 | I52 108 | sS'B' 109 | I53 110 | sS'F' 111 | I54 112 | sS'J' 113 | I55 114 | sS'N' 115 | I56 116 | sS'R' 117 | I57 118 | sS'V' 119 | I58 120 | sS'Z' 121 | I59 122 | sS'b' 123 | I60 124 | sS'f' 125 | I61 126 | sS'j' 127 | I62 128 | sS'n' 129 | I63 130 | sS'r' 131 | I64 132 | sS'v' 133 | I65 134 | sS'z' 135 | I66 136 | sS'!' 137 | I67 138 | sS'%' 139 | I68 140 | sS'-' 141 | I69 142 | sS'1' 143 | I70 144 | sS'5' 145 | I71 146 | sS'9' 147 | I72 148 | sS'=' 149 | I73 150 | sS'A' 151 | I74 152 | sS'\xc2' 153 | I75 154 | sS'E' 155 | I76 156 | sS'I' 157 | I77 158 | sS'M' 159 | I78 160 | sS'Q' 161 | I79 162 | sS'U' 163 | I80 164 | sS'Y' 165 | I81 166 | sS'a' 167 | I82 168 | sS'e' 169 | I83 170 | sS'i' 171 | I84 172 | sS'm' 173 | I85 174 | sS'q' 175 | I86 176 | sS'u' 177 | I87 178 | sS'y' 179 | I88 180 | s. -------------------------------------------------------------------------------- /quad/quad_trained/quad_character_model_weights.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:11143e7e34a4ba71d7e36c1979a9bc7997b19fc9d3c8edee35585000fe2eea51 3 | size 166628004 4 | -------------------------------------------------------------------------------- /quad/requirements.txt: -------------------------------------------------------------------------------- 1 | keras==1.0.6 2 | pika 3 | -------------------------------------------------------------------------------- /quad/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pika 3 | import cPickle 4 | import argparse 5 | import numpy as np 6 | 7 | from keras.models import model_from_json 8 | 9 | 10 | def parse_arguments(): 11 | parser = argparse.ArgumentParser(description='Run the category classifier \ 12 | API.') 13 | parser._optionals.title = 'Options' 14 | parser.add_argument('-m', '--model_path', 15 | help='Directory path for the classifier model.', 16 | type=str, required=True) 17 | parser.add_argument('-w', '--weights_path', 18 | help='Directory path for the model weights.', 19 | type=str, required=True) 20 | parser.add_argument('-v', '--vocab_path', 21 | help='Directory path for the classifier vocab.', 22 | type=str, required=True) 23 | return parser.parse_args() 24 | 25 | 26 | class RabbitClient(object): 27 | def __init__(self, queue, host='localhost'): 28 | self.queue = queue 29 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 30 | host=host)) 31 | 32 | self.channel = self.connection.channel() 33 | 34 | self.channel.queue_declare(queue=self.queue, durable=True) 35 | 36 | def send(self, n, routing): 37 | self.channel.basic_publish(exchange='', 38 | 
routing_key=routing, 39 | properties=pika.BasicProperties( 40 | delivery_mode=2,), 41 | body=json.dumps(n)) 42 | 43 | def receive(self, callback): 44 | self.channel.basic_qos(prefetch_count=1) 45 | self.channel.basic_consume(callback, queue=self.queue) 46 | self.channel.start_consuming() 47 | 48 | 49 | def load_model(args): 50 | model = model_from_json(json.load(open(args.model_path))) 51 | model.load_weights(args.weights_path) 52 | 53 | vocab = cPickle.load(open(args.vocab_path)) 54 | 55 | return model, vocab 56 | 57 | 58 | def encode_data(x, maxlen, vocab, vocab_size, check): 59 | #Iterate over the loaded data and create a matrix of size maxlen x vocabsize 60 | #In this case that will be 1014x69. This is then placed in a 3D matrix of size 61 | #data_samples x maxlen x vocab_size. Each character is encoded into a one-hot 62 | #array. Chars not in the vocab are encoded into an all zero vector. 63 | 64 | input_data = np.zeros((len(x), maxlen, vocab_size)) 65 | for dix, sent in enumerate(x): 66 | counter = 0 67 | sent_array = np.zeros((maxlen, vocab_size)) 68 | chars = list(sent.lower().replace(' ', '')) 69 | for c in chars: 70 | if counter >= maxlen: 71 | pass 72 | else: 73 | char_array = np.zeros(vocab_size, dtype=np.int) 74 | if c in check: 75 | ix = vocab[c] 76 | char_array[ix] = 1 77 | sent_array[counter, :] = char_array 78 | counter += 1 79 | input_data[dix, :, :] = sent_array 80 | 81 | return input_data 82 | -------------------------------------------------------------------------------- /quad/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /relevancy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apt-get install -y unzip netcat 6 | 7 | ADD . 
/src 8 | 9 | RUN cd /src; pip install -r requirements.txt 10 | 11 | RUN chmod +x /src/launch.sh 12 | CMD sh /src/launch.sh 13 | -------------------------------------------------------------------------------- /relevancy/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import utils 4 | import logging 5 | 6 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 7 | logger = logging.getLogger(__name__) 8 | logger.setLevel(logging.INFO) 9 | 10 | CONSUME = os.getenv('CONSUME') 11 | PUBLISH = os.getenv('PUBLISH') 12 | 13 | 14 | def callback(ch, method, properties, body): 15 | global TFIDF, CLF 16 | data = json.loads(body) 17 | logger.info('Started processing content. {}'.format(data['pipeline_key'])) 18 | 19 | process(data, TFIDF, CLF) 20 | 21 | logger.info('Finished tagging relevancy. {}'.format(data['pipeline_key'])) 22 | ch.basic_ack(delivery_tag=method.delivery_tag) 23 | 24 | 25 | def process(data, tfidf, clf): 26 | rabbit_publish = utils.RabbitClient(queue=PUBLISH, 27 | host='rabbitmq') 28 | try: 29 | mat = tfidf.transform([data['title']]) 30 | pred = clf.predict(mat) 31 | data['predicted_relevancy'] = pred[0] 32 | logger.info('Finished processing content.') 33 | except Exception as e: 34 | # If something goes wrong, log it and return nothing 35 | logger.info(e) 36 | # Make sure to update this line if you change the variable names 37 | 38 | rabbit_publish.send(data, PUBLISH) 39 | 40 | 41 | def main(): 42 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 43 | rabbit_consume.receive(callback) 44 | 45 | 46 | if __name__ == '__main__': 47 | args = utils.parse_arguments() 48 | logger.info('Loading model...') 49 | CLF, TFIDF = utils.load_model(args) 50 | logger.info('Running...') 51 | main() 52 | -------------------------------------------------------------------------------- /relevancy/launch.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | echo "Starting up analytic service..." 
3 | cd /src 4 | ./wait-for rabbitmq:5672 -t 60 -- python /src/app.py -m /src/relevancy_trained_classifier/svm/relevancy_classifier.pkl \ 5 | -tf /src/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl 6 | -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_01.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_01.npy -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_02.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_02.npy -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_03.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_03.npy -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl_01.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl_01.npy -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl_02.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl_02.npy -------------------------------------------------------------------------------- /relevancy/requirements.txt: -------------------------------------------------------------------------------- 1 | pika 2 | -------------------------------------------------------------------------------- /relevancy/utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | import argparse 4 | 5 | from 
sklearn.externals import joblib 6 | 7 | 8 | def parse_arguments(): 9 | parser = argparse.ArgumentParser(description='Run the relevancy classifier\ 10 | API.') 11 | parser._optionals.title = 'Options' 12 | parser.add_argument('-m', '--clf_path', 13 | help='Filepath for the classifier model.', 14 | type=str, required=True) 15 | parser.add_argument('-tf', '--tfidf_path', 16 | help='Filepath for the TFIDF model.', 17 | type=str, required=True) 18 | return parser.parse_args() 19 | 20 | 21 | class RabbitClient(object): 22 | def __init__(self, queue, host='localhost'): 23 | self.queue = queue 24 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 25 | host=host)) 26 | 27 | self.channel = self.connection.channel() 28 | 29 | self.channel.queue_declare(queue=self.queue, durable=True) 30 | 31 | def send(self, n, routing): 32 | self.channel.basic_publish(exchange='', 33 | routing_key=routing, 34 | properties=pika.BasicProperties( 35 | delivery_mode=2,), 36 | body=json.dumps(n)) 37 | 38 | def receive(self, callback): 39 | self.channel.basic_qos(prefetch_count=1) 40 | self.channel.basic_consume(callback, queue=self.queue) 41 | self.channel.start_consuming() 42 | 43 | 44 | def load_model(args): 45 | model = joblib.load(args.clf_path) 46 | tfidf = joblib.load(args.tfidf_path) 47 | 48 | return model, tfidf 49 | -------------------------------------------------------------------------------- /relevancy/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | --------------------------------------------------------------------------------