├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── collector ├── Dockerfile ├── app.py ├── requirements.txt ├── utils.py └── wait-for ├── docker-compose.yml ├── hypnos ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── app.py ├── docker-compose.yml ├── petrarch │ ├── Dockerfile │ ├── petrarch_app.py │ └── requirements.txt ├── requirements.txt ├── utils.py └── wait-for ├── miner ├── Dockerfile ├── app.py ├── requirements.txt ├── utils.py └── wait-for ├── mitie ├── Dockerfile ├── app.py ├── requirements.txt ├── utils.py └── wait-for ├── predpatt ├── Dockerfile ├── ParseyPredFace.py ├── app.py ├── requirements.txt ├── utils.py └── wait-for ├── quad ├── Dockerfile ├── app.py ├── launch.sh ├── quad_trained │ ├── quad_character_model.json │ ├── quad_character_model_vocab.pkl │ └── quad_character_model_weights.h5 ├── requirements.txt ├── utils.py └── wait-for └── relevancy ├── Dockerfile ├── app.py ├── launch.sh ├── relevancy_trained_classifier ├── svm │ ├── relevancy_classifier.pkl │ ├── relevancy_classifier.pkl_01.npy │ ├── relevancy_classifier.pkl_02.npy │ └── relevancy_classifier.pkl_03.npy └── tfidf │ ├── relevancy_classifier_tfidf.pkl │ ├── relevancy_classifier_tfidf.pkl_01.npy │ └── relevancy_classifier_tfidf.pkl_02.npy ├── requirements.txt ├── utils.py └── wait-for /.gitattributes: -------------------------------------------------------------------------------- 1 | *.h5 filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *swp 3 | /data/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2012-2017 Johns Hopkins University Human Language Technology 2 | Center of Excellence (JHU HLTCOE). All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 | HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 20 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 22 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 24 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 25 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 26 | DAMAGE. 
27 | 28 | The views and conclusions contained in the software and documentation 29 | are those of the authors and should not be interpreted as representing 30 | official policies, either expressed or implied, of the copyright 31 | holders. 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | EventMiner 2 | ======= 3 | 4 | Using the hammer of supervised learning to make events. 5 | 6 | About 7 | ----- 8 | 9 | EventMiner aims to serve, primarily, as an interface to various NLP analytics 10 | to extract event information from text. This project is set up with a REST 11 | frontend interface, which accepts text input that is then passed 12 | via a RabbitMQ messaging queue to the various analytics as appropriate. The project 13 | is composed of Docker containers, with orchestration handled by 14 | docker-compose. This, combined with RabbitMQ as the messaging layer, allows for 15 | clean definitions of interactions between services and minimal setup for the 16 | end user. 17 | 18 | Services 19 | --------- 20 | 21 | The services defined in this project are as follows (in the order they process 22 | content): 23 | 24 | 1. `mitie` - Provides NER tagging via [MITIE](https://github.com/mit-nlp/MITIE). 25 | 2. `predpatt` - Extracts predicate-argument structures using 26 | [PredPatt](https://github.com/hltcoe/PredPatt). Also includes a Universal 27 | Dependency parse provided by 28 | [SyntaxNet/DRAGNN](https://github.com/tensorflow/models/tree/master/syntaxnet). 29 | 3. `relevancy` - An SVM classifier that determines story relevancy based on the story title. 30 | * **Note:** Branching occurs at the `relevancy` classifier. 31 | If this model determines a story is not relevant, it isn't processed by `quad`; it goes straight through to `hypnos`. 32 | 4. `quad` - A convolutional neural net that classifies a sentence into one of four `QuadCategories`: verbal conflict, verbal cooperation, material conflict, material cooperation. 33 | 5. `hypnos` - A rule-based event extractor, used primarily for actor extraction in this setup. 34 | 6. `collector` - A lightweight process that pulls in events and writes them out to a file. 35 | 36 | Deployment 37 | ---------- 38 | 39 | There are two `docker-compose` projects that make up EventMiner. The first is the 40 | `miner` application itself. The second is `hypnos`, which is the container 41 | architecture around the `PETRARCH2` event extractor. `docker-compose` must be 42 | run for `miner` first and `hypnos` second, due to nuances of the 43 | shared Docker networks. Thus, deployment is as follows (assuming the user 44 | starts in the top-level `EventMiner` directory): 45 | 46 | ``` 47 | docker-compose up -d 48 | cd ./hypnos 49 | docker-compose up -d 50 | ``` 51 | 52 | This exposes a REST interface on port 6000. With the features 53 | of `docker-compose`, it's possible to arbitrarily scale up the various services 54 | connected within `miner`. For example, the `quad` service is rather slow 55 | since it's a neural net running on a CPU. Since each service consumes from a 56 | messaging queue, we don't need to worry about things such as load balancing; 57 | each service just consumes when it's ready. Given this, scaling the `quad` 58 | service is as simple as (assuming the user is in the root `EventMiner` directory): 59 | 60 | ``` 61 | docker-compose scale quad=3 62 | ``` 63 | 64 | to run three of the `quad` containers. 
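For a quick sanity check once both stacks are up, the standard `docker-compose` status and log commands are enough; the service names below are the ones defined in the top-level `docker-compose.yml`:

```
# from the top-level EventMiner directory
docker-compose ps                         # miner, mitie, predpatt, relevancy, quad, collector, and rabbitmq should be "Up"
docker-compose logs -f quad               # follow one service while testing
docker-compose scale quad=3 relevancy=2   # scale several services in a single call
```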
65 | 66 | Usage 67 | ----- 68 | 69 | The interface accepts JSON input via REST. As an example: 70 | 71 | ``` 72 | import json 73 | import requests 74 | 75 | headers = {'Content-Type': 'application/json'} 76 | 77 | test = {'title': 'Syrian rebels attacked Aleppo.', 'content': 'This is the content. Rebels attacked Aleppo.'} 78 | data = {'data': test} 79 | 80 | r = requests.post('http://localhost:6000/EventMiner', data=json.dumps(data), headers=headers) 81 | ``` 82 | 83 | The response object from `EventMiner` will contain a unique ID for the input data 84 | that allows the user to trace the progress of the content throughout the 85 | pipeline. The pipeline will write data out to the `EventMiner/data` directory. The 86 | results are in a file titled `events.YYYYMMDD.txt` with one JSON record per 87 | line. The output format (for now...) is as follows: 88 | 89 | ``` 90 | {u'content': u'This is the content. Rebels attacked Aleppo.', 91 | u'event_info': {u'267bbae4-dcc0-4224-94e9-67679b0b6ad1': {u'coded': [], 92 | u'predicted_class': {u'class': 4, u'score': u'0.89923'}, 93 | u'sent': u'This is the content.'}, 94 | u'8b464457-18d2-419c-b5c1-49c6131be947': {u'coded': [[u'---REB', 95 | u'SYR', 96 | u'190']], 97 | u'predicted_class': {u'class': 4, u'score': u'0.986187'}, 98 | u'sent': u'Rebels attacked Aleppo.'}}, 99 | u'pipeline_key': u'4c4f7e7a-db31-4137-a888-2cdbbbf1c225', 100 | u'predicted_relevancy': 1, 101 | u'sents': {u'267bbae4-dcc0-4224-94e9-67679b0b6ad1': u'This is the content.', 102 | u'8b464457-18d2-419c-b5c1-49c6131be947': u'Rebels attacked Aleppo.'}, 103 | u'title': u'Syrian rebels attacked Aleppo.'} 104 | ``` 105 | 106 | 107 | Acknowledgements 108 | ---------------- 109 | 110 | This work was funded by the DARPA Quantitative Crisis Response (QCR) program. 111 | -------------------------------------------------------------------------------- /collector/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7-alpine 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apk add --no-cache git wget unzip 6 | 7 | RUN mkdir -p /src/data 8 | 9 | ADD . /src 10 | 11 | RUN cd /src; pip install -r requirements.txt 12 | 13 | CMD ["/src/wait-for", "rabbitmq:5672", "--", "python", "/src/app.py"] 14 | -------------------------------------------------------------------------------- /collector/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import utils 4 | import logging 5 | import datetime 6 | 7 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 8 | logger = logging.getLogger(__name__) 9 | logger.setLevel(logging.INFO) 10 | 11 | CONSUME = os.getenv('CONSUME') 12 | 13 | 14 | def callback(ch, method, properties, body): 15 | data = json.loads(body) 16 | 17 | logger.info('Started processing content. {}'.format(data['pipeline_key'])) 18 | 19 | process(data) 20 | 21 | logger.info('Done writing an event. 
{}'.format(data['pipeline_key'])) 22 | ch.basic_ack(delivery_tag=method.delivery_tag) 23 | 24 | 25 | def process(data): 26 | key = '' 27 | try: 28 | key = data['pipeline_key'] 29 | logger.info('Got results for {}'.format(key)) 30 | 31 | root = '/src/data/' 32 | now = datetime.datetime.utcnow().strftime('%Y/%m/%d') 33 | path = os.path.join(root, now) 34 | 35 | if not os.path.exists(path): 36 | os.makedirs(path) 37 | 38 | fname = '{}.json'.format(key) 39 | with open(os.path.join(path, fname), 'w') as f: 40 | f.write(json.dumps(data)) 41 | except: 42 | # If something goes wrong, log it and return nothing 43 | logger.exception('Failed to write results for {}'.format(key)) 44 | # Make sure to update this line if you change the variable names 45 | 46 | 47 | def main(): 48 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 49 | rabbit_consume.receive(callback) 50 | 51 | 52 | if __name__ == '__main__': 53 | logger.info('Running...') 54 | main() 55 | -------------------------------------------------------------------------------- /collector/requirements.txt: -------------------------------------------------------------------------------- 1 | pika 2 | -------------------------------------------------------------------------------- /collector/utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | 4 | 5 | class RabbitClient(object): 6 | def __init__(self, queue, host='localhost'): 7 | self.queue = queue 8 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 9 | host=host)) 10 | 11 | self.channel = self.connection.channel() 12 | 13 | self.channel.queue_declare(queue=self.queue, durable=True) 14 | 15 | def send(self, n, routing): 16 | self.channel.basic_publish(exchange='', 17 | routing_key=routing, 18 | properties=pika.BasicProperties( 19 | delivery_mode=2,), 20 | body=json.dumps(n)) 21 | 22 | def receive(self, callback): 23 | self.channel.basic_qos(prefetch_count=1) 24 | self.channel.basic_consume(callback, queue=self.queue) 25 | self.channel.start_consuming() 26 | -------------------------------------------------------------------------------- /collector/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 
27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | networks: 3 | miner: 4 | driver: bridge 5 | services: 6 | miner: 7 | image: miner 8 | build: ./miner 9 | depends_on: 10 | - rabbitmq 11 | ports: 12 | - "6000:6000" 13 | environment: 14 | - PUBLISH=ingest 15 | networks: 16 | - miner 17 | mitie: 18 | image: mitie 19 | build: ./mitie 20 | depends_on: 21 | - rabbitmq 22 | environment: 23 | - CONSUME=ingest 24 | - PUBLISH=mitie 25 | networks: 26 | - miner 27 | predpatt: 28 | image: predpatt 29 | build: ./predpatt 30 | depends_on: 31 | - rabbitmq 32 | environment: 33 | - CONSUME=mitie 34 | - PUBLISH=predpatt 35 | networks: 36 | - miner 37 | relevancy: 38 | image: relevancy 39 | build: ./relevancy 40 | depends_on: 41 | - rabbitmq 42 | environment: 43 | - CONSUME=predpatt 44 | - PUBLISH=relevancy 45 | networks: 46 | - miner 47 | quad: 48 | image: quad 49 | build: ./quad 50 | depends_on: 51 | - rabbitmq 52 | environment: 53 | - CONSUME=relevancy 54 | - PUBLISH=quad 55 | networks: 56 | - miner 57 | collector: 58 | image: collector 59 | build: ./collector 60 | depends_on: 61 | - rabbitmq 62 | environment: 63 | - CONSUME=actors 64 | networks: 65 | - miner 66 | volumes: 67 | - ./data:/src/data 68 | rabbitmq: 69 | image: rabbitmq:alpine 70 | expose: 71 | - "5672" 72 | networks: 73 | - miner 74 | -------------------------------------------------------------------------------- /hypnos/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | *.swp 4 | .ropeproject 5 | -------------------------------------------------------------------------------- /hypnos/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7-alpine 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apk add --no-cache git wget unzip 6 | 7 | ADD . 
/src 8 | 9 | RUN cd /src; pip install -r requirements.txt 10 | 11 | CMD ["/src/wait-for", "ccnlp:5000", "-t", "60", "--", "python", "/src/app.py"] 12 | -------------------------------------------------------------------------------- /hypnos/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Caerus Associates 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /hypnos/README.md: -------------------------------------------------------------------------------- 1 | hypnos 2 | ====== 3 | 4 | A RESTful API around the [PETRARCH](https://github.com/openeventdata/petrarch) 5 | event data coder. Using `docker compose`, this setup also integrates the 6 | Stanford [CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml) parser 7 | using Casey Hilland's [docker container](https://github.com/chilland/ccNLP). 8 | This setup allows the users to stream texts into the API, rather than the 9 | batch mode seen in applications such as the [Phoenix pipeline](https://github.com/openeventdata/phoenix_pipeline). 10 | 11 | Running 12 | ------- 13 | 14 | Running the system is as simple as using 15 | 16 | `docker-compose up` 17 | 18 | or 19 | 20 | `docker-compose up -d` 21 | 22 | to run in the background. 23 | 24 | This assumes that you have `docker-compose` and `docker` installed. 
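To confirm the containers came up cleanly, the usual `docker-compose` commands work here as well; the service names are those defined in this directory's `docker-compose.yml`:

```
docker-compose ps              # hypnos, ccnlp, and petrarch should all show as "Up"
docker-compose logs -f hypnos  # hypnos waits for ccnlp on port 5000 (up to 60s) before starting its consumer
```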
25 | 26 | Usage 27 | ----- 28 | 29 | ``` 30 | headers = {'Content-Type': 'application/json'} 31 | data = {'text': "At least 37 people are dead after Islamist radical group Boko 32 | Haram assaulted a town in northeastern Nigeria.", 'id': 'abc123', 'date': 33 | '20010101'} 34 | data = json.dumps(data) 35 | r = requests.get('http://localhost:5002/hypnos/extract', data=data, 36 | headers=headers) 37 | r.json() 38 | ``` 39 | 40 | Returns: 41 | 42 | ``` 43 | {u'abc123': {u'meta': {u'date': u'20010101'}, 44 | u'sents': {u'0': {u'content': u'At least 37 people are dead after Islamist 45 | radical group Boko Haram assaulted a town in northeastern Nigeria .', 46 | u'events': [[u'NGAREBMUS', u'NGA', u'190']], 47 | u'issues': [[u'ID_EXTREMISM', 1], [u'NAMED_TERROR_GROUP', 1]], 48 | u'parsed': u'(ROOT (S (NP (QP (IN AT ) (JJS LEAST ) (CD 37 ) ) 49 | (NNS PEOPLE ) ) (VP (VBP ARE ) (ADJP (JJ DEAD ) ) (SBAR (IN AFTER 50 | ) (S (NP (JJ ISLAMIST ) (JJ RADICAL ) (NN GROUP ) (NNP BOKO ) 51 | (NNP HARAM ) ) (VP (VBD ASSAULTED ) (NP (NP (DT A ) (NN TOWN ) ) 52 | (PP (IN IN ) (NP (JJ NORTHEASTERN ) (NNP NIGERIA ) ) ) ) ) ) ) ) 53 | (. . ) ) )'}}}} 54 | ``` 55 | -------------------------------------------------------------------------------- /hypnos/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import utils 4 | import logging 5 | import requests 6 | import datetime 7 | from copy import deepcopy 8 | from dateutil.parser import parse 9 | 10 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.INFO) 13 | 14 | cwd = os.path.abspath(os.path.dirname(__file__)) 15 | CONSUME = os.getenv('CONSUME') 16 | PUBLISH = os.getenv('PUBLISH') 17 | 18 | 19 | def callback(ch, method, properties, body): 20 | data = json.loads(body) 21 | logger.info('Started processing content. {}'.format(data['pipeline_key'])) 22 | 23 | extract(data) 24 | 25 | logger.info('Finished PETR extracting. {}'.format(data['pipeline_key'])) 26 | ch.basic_ack(delivery_tag=method.delivery_tag) 27 | 28 | 29 | def extract(message): 30 | rabbit_publish = utils.RabbitClient(queue=PUBLISH, 31 | host='rabbitmq') 32 | 33 | story = message 34 | 35 | keys = story['event_info'].keys() 36 | #keys = [k for k in keys if k != 'predicted_relevancy'] 37 | for val in keys: 38 | logger.info('Processing {}'.format(val)) 39 | text = story['event_info'][val]['sent']['text'] 40 | text = text.encode('utf-8') 41 | 42 | event_dict = send_to_corenlp(story, text) 43 | 44 | try: 45 | events_r = send_to_petr(event_dict) 46 | except Exception as e: 47 | logger.info('There was an exception with PETR. 
{}\n'.format(e)) 48 | events_r = {} 49 | try: 50 | # event_updated = process_results(events_r.json()) 51 | event_updated = events_r.json() 52 | 53 | story['event_info'][val]['coded'] = [] 54 | for e in event_updated: 55 | if e: 56 | story['event_info'][val]['coded'].append(e) 57 | else: 58 | pass 59 | 60 | #logger.info(json.dumps(story)) 61 | except: 62 | logger.exception('Something went wrong in the formatting.') 63 | logger.info(json.dumps(events_r.json())) 64 | 65 | rabbit_publish.send(story, PUBLISH) 66 | 67 | 68 | def send_to_petr(event_dict): 69 | headers = {'Content-Type': 'application/json'} 70 | 71 | events_data = json.dumps({'events': event_dict}) 72 | petr_url = 'http://petrarch:5001/petrarch/code' 73 | events_r = requests.post(petr_url, data=events_data, headers=headers) 74 | 75 | return events_r 76 | 77 | 78 | def send_to_corenlp(story, text): 79 | storyid = story['pipeline_key'] 80 | date = datetime.datetime.utcnow().strftime('%Y%m%d') # set a default 81 | try: 82 | date = parse(story['date']).strftime('%Y%m%d') 83 | except KeyError: 84 | logger.info('No date found') 85 | except ValueError: 86 | logger.info('Unable to parse date') 87 | 88 | headers = {'Content-Type': 'application/json'} 89 | core_data = json.dumps({'text': text}) 90 | ccnlp_url = 'http://ccnlp:5000/process' 91 | r = requests.post(ccnlp_url, data=core_data, headers=headers) 92 | out = r.json() 93 | 94 | event_dict = process_corenlp(out, date, storyid) 95 | 96 | return event_dict 97 | 98 | 99 | def process_corenlp(output, date, STORYID): 100 | event_dict = {STORYID: {}} 101 | event_dict[STORYID]['sents'] = {} 102 | event_dict[STORYID]['meta'] = {} 103 | event_dict[STORYID]['meta']['date'] = date 104 | for i, sent in enumerate(output['sentences']): 105 | sents = output['sentences'] 106 | event_dict[STORYID]['sents'][str(i)] = {} 107 | event_dict[STORYID]['sents'][str(i)]['content'] = ' '.join(sents[i]['tokens']) 108 | event_dict[STORYID]['sents'][str(i)]['parsed'] = sents[i]['parse'].upper().replace(')', ' )') 109 | 110 | return event_dict 111 | 112 | 113 | def process_results(event_dict): 114 | new_event_dict = deepcopy(event_dict) 115 | for s_id in event_dict: 116 | sents = event_dict[s_id]['sents'] 117 | for sent in sents: 118 | if 'events' not in sents[sent].keys(): 119 | del new_event_dict[s_id]['sents'][sent] 120 | else: 121 | del new_event_dict[s_id]['sents'][sent]['parsed'] 122 | if 'issues' not in sents[sent].keys(): 123 | sents[sent]['issues'] = [] 124 | 125 | return new_event_dict 126 | 127 | 128 | def main(): 129 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 130 | rabbit_consume.receive(callback) 131 | 132 | 133 | if __name__ == '__main__': 134 | logger.info('Running...') 135 | main() 136 | -------------------------------------------------------------------------------- /hypnos/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | networks: 3 | eventminer_miner: 4 | external: true 5 | services: 6 | hypnos: 7 | image: hypnos 8 | build: . 9 | environment: 10 | - CONSUME=quad 11 | - PUBLISH=actors 12 | networks: 13 | - eventminer_miner 14 | ccnlp: 15 | image: caerusassociates/ccnlp:1.0.0 16 | networks: 17 | - eventminer_miner 18 | petrarch: 19 | build: petrarch/. 
20 | networks: 21 | - eventminer_miner 22 | -------------------------------------------------------------------------------- /hypnos/petrarch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7-alpine 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apk add --no-cache git 6 | 7 | RUN pip install git+https://github.com/openeventdata/petrarch2.git 8 | 9 | ADD . /src 10 | 11 | RUN cd /src; pip install -r requirements.txt 12 | 13 | EXPOSE 5001 14 | 15 | CMD ["python", "/src/petrarch_app.py"] 16 | -------------------------------------------------------------------------------- /hypnos/petrarch/petrarch_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from petrarch2 import petrarch2 4 | from tornado.ioloop import IOLoop 5 | from tornado.wsgi import WSGIContainer 6 | from tornado.httpserver import HTTPServer 7 | from flask import Flask, jsonify, make_response 8 | from flask.ext.restful import Api, Resource, reqparse 9 | 10 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.INFO) 13 | 14 | app = Flask(__name__) 15 | api = Api(app) 16 | 17 | cwd = os.path.abspath(os.path.dirname(__file__)) 18 | 19 | 20 | @app.errorhandler(400) 21 | def bad_request(error): 22 | return make_response(jsonify({'error': 'Bad request'}), 400) 23 | 24 | 25 | @app.errorhandler(404) 26 | def not_found(error): 27 | return make_response(jsonify({'error': 'Not found'}), 404) 28 | 29 | 30 | class CodeAPI(Resource): 31 | def __init__(self): 32 | self.reqparse = reqparse.RequestParser() 33 | self.reqparse.add_argument('events', type=dict) 34 | super(CodeAPI, self).__init__() 35 | 36 | def post(self): 37 | args = self.reqparse.parse_args() 38 | event_dict = args['events'] 39 | to_return = [] 40 | 41 | try: 42 | event_dict_updated = petrarch2.do_coding(event_dict) 43 | k = event_dict_updated.keys()[0] 44 | try: 45 | to_return = event_dict_updated[k]['sents']['0']['events'] 46 | except KeyError: 47 | logger.info('No events to process') 48 | except: 49 | logger.exception("An error occured") 50 | except: 51 | logger.exception("An error occurred") 52 | 53 | return to_return 54 | 55 | 56 | api.add_resource(CodeAPI, '/petrarch/code') 57 | 58 | if __name__ == '__main__': 59 | config = petrarch2.utilities._get_data('data/config/', 'PETR_config.ini') 60 | logger.info("reading config") 61 | petrarch2.PETRreader.parse_Config(config) 62 | logger.info("reading dicts") 63 | petrarch2.read_dictionaries() 64 | 65 | http_server = HTTPServer(WSGIContainer(app)) 66 | http_server.listen(5001) 67 | IOLoop.instance().start() 68 | -------------------------------------------------------------------------------- /hypnos/petrarch/requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.4.3 2 | Flask==0.10.1 3 | Flask-RESTful==0.3.3 4 | Flask-HTTPAuth==2.5.0 5 | itsdangerous==0.24 6 | Jinja2==2.7.3 7 | MarkupSafe==0.23 8 | tornado==4.2 9 | simplejson==3.6.5 10 | Werkzeug==0.10.4 11 | -------------------------------------------------------------------------------- /hypnos/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==2.6.3 2 | requests==2.4.3 3 | Flask==0.10.1 4 | Flask-RESTful==0.3.3 5 | Flask-HTTPAuth==2.5.0 6 | itsdangerous==0.24 7 | Jinja2==2.7.3 8 | MarkupSafe==0.23 9 | tornado==4.2 10 | 
simplejson==3.6.5 11 | Werkzeug==0.10.4 12 | python-dateutil 13 | pika 14 | -------------------------------------------------------------------------------- /hypnos/utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | 4 | 5 | class RabbitClient(object): 6 | def __init__(self, queue, host='localhost'): 7 | self.queue = queue 8 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 9 | host=host)) 10 | 11 | self.channel = self.connection.channel() 12 | 13 | self.channel.queue_declare(queue=self.queue, durable=True) 14 | 15 | def send(self, n, routing): 16 | self.channel.basic_publish(exchange='', 17 | routing_key=routing, 18 | properties=pika.BasicProperties( 19 | delivery_mode=2,), 20 | body=json.dumps(n)) 21 | 22 | def receive(self, callback): 23 | self.channel.basic_qos(prefetch_count=1) 24 | self.channel.basic_consume(callback, queue=self.queue) 25 | self.channel.start_consuming() 26 | -------------------------------------------------------------------------------- /hypnos/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /miner/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7-alpine 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apk add --no-cache git wget unzip 6 | 7 | RUN mkdir /src 8 | 9 | RUN mkdir -p /src/nltk_data/tokenizers 10 | RUN wget https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip -O /src/nltk_data/tokenizers/punkt.zip 11 | RUN unzip /src/nltk_data/tokenizers/punkt.zip; rm -rf /src/nltk_data/tokenizers/punkt.zip 12 | RUN mv punkt /src/nltk_data/tokenizers 13 | ENV NLTK_DATA=/src/nltk_data 14 | 15 | ADD . 
/src 16 | 17 | RUN cd /src; pip install -r requirements.txt 18 | 19 | EXPOSE 6000 20 | 21 | CMD ["/src/wait-for", "rabbitmq:5672", "-t", "90", "--", "python", "/src/app.py"] 22 | -------------------------------------------------------------------------------- /miner/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import hashlib 3 | import utils 4 | import logging 5 | 6 | from tornado.ioloop import IOLoop 7 | from tornado.wsgi import WSGIContainer 8 | from tornado.httpserver import HTTPServer 9 | from flask import Flask, jsonify, make_response 10 | from flask.ext.restful import Api, Resource, reqparse 11 | 12 | 13 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 14 | logger = logging.getLogger(__name__) 15 | logger.setLevel(logging.INFO) 16 | 17 | app = Flask(__name__) 18 | api = Api(app) 19 | 20 | cwd = os.path.abspath(os.path.dirname(__file__)) 21 | 22 | PUBLISH = os.getenv('PUBLISH') 23 | 24 | 25 | @app.errorhandler(400) 26 | def bad_request(error): 27 | return make_response(jsonify({'error': 'Bad request'}), 400) 28 | 29 | 30 | @app.errorhandler(404) 31 | def not_found(error): 32 | return make_response(jsonify({'error': 'Not found'}), 404) 33 | 34 | 35 | class MinerAPI(Resource): 36 | def __init__(self): 37 | self.reqparse = reqparse.RequestParser() 38 | self.reqparse.add_argument('data', type=dict, location='json') 39 | super(MinerAPI, self).__init__() 40 | 41 | def post(self): 42 | args = self.reqparse.parse_args() 43 | 44 | rabbit = utils.RabbitClient(queue=PUBLISH, host='rabbitmq') 45 | 46 | logger.info('Received data...') 47 | data = utils.prep_data(args['data']) 48 | key = hashlib.sha1(''.join(data['sents'])).hexdigest() 49 | data['pipeline_key'] = key 50 | 51 | logger.info('Sending downstream with key {}...'.format(key)) 52 | rabbit.send(data, PUBLISH) 53 | 54 | logging.info('Sent {}'.format(key)) 55 | return key 56 | 57 | 58 | api.add_resource(MinerAPI, '/EventMiner') 59 | 60 | 61 | if __name__ == '__main__': 62 | http_server = HTTPServer(WSGIContainer(app)) 63 | http_server.listen(6000) 64 | IOLoop.instance().start() 65 | -------------------------------------------------------------------------------- /miner/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==2.6.3 2 | Flask==0.10.1 3 | Flask-RESTful==0.3.3 4 | Flask-HTTPAuth==2.5.0 5 | itsdangerous==0.24 6 | Jinja2==2.7.3 7 | MarkupSafe==0.23 8 | tornado==4.2 9 | simplejson==3.6.5 10 | Werkzeug==0.10.4 11 | pika 12 | nltk 13 | -------------------------------------------------------------------------------- /miner/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | import pika 4 | import nltk.data 5 | 6 | 7 | class RabbitClient(object): 8 | def __init__(self, queue, host='localhost'): 9 | self.queue = queue 10 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 11 | host=host)) 12 | 13 | self.channel = self.connection.channel() 14 | 15 | self.channel.queue_declare(queue=self.queue, durable=True) 16 | 17 | def send(self, n, routing): 18 | self.channel.basic_publish(exchange='', 19 | routing_key=routing, 20 | properties=pika.BasicProperties( 21 | delivery_mode=2,), 22 | body=json.dumps(n)) 23 | 24 | def receive(self, callback): 25 | self.channel.basic_qos(prefetch_count=1) 26 | self.channel.basic_consume(callback, queue=self.queue) 27 | self.channel.start_consuming() 28 | 29 | 
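# Illustrative example of what prep_data returns (the UUID keys and text below
# are made-up values): it sentence-tokenizes data['content'], keeps only the
# first two sentences, and stores them under 'sents', e.g.
#   data['sents'] == {'6f1b...': {'text': 'This is the content.'},
#                     '9c2d...': {'text': 'Rebels attacked Aleppo.'}}
# Downstream services (mitie, predpatt, quad) iterate over this dict by key.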
30 | def prep_data(data): 31 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 32 | sents = sent_detector.tokenize(data['content'].strip()) 33 | sent_dict = {str(uuid.uuid4()): {'text': x} for x in sents[:2]} 34 | data['sents'] = sent_dict 35 | 36 | return data 37 | -------------------------------------------------------------------------------- /miner/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /mitie/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7-alpine 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apk add --no-cache git build-base curl 6 | 7 | RUN mkdir /src 8 | ADD . /src 9 | 10 | WORKDIR /src 11 | 12 | RUN curl -LO https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2 13 | RUN tar -xzjf MITIE-models-v0.2.tar.bz2; rm -rf MITIE-models-v0.2.tar.bz2 14 | 15 | RUN pip install -r requirements.txt 16 | 17 | EXPOSE 6000 18 | 19 | CMD ["/src/wait-for", "rabbitmq:5672", "--", "python", "/src/app.py"] 20 | -------------------------------------------------------------------------------- /mitie/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import utils 4 | import logging 5 | 6 | from mitie import * 7 | 8 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 9 | logger = logging.getLogger(__name__) 10 | logger.setLevel(logging.INFO) 11 | 12 | 13 | NER = named_entity_extractor('MITIE-models/english/ner_model.dat') 14 | 15 | CONSUME = os.getenv('CONSUME') 16 | PUBLISH = os.getenv('PUBLISH') 17 | 18 | 19 | def callback(ch, method, properties, body): 20 | data = json.loads(body) 21 | logger.info('Started processing content. {}'.format(data['pipeline_key'])) 22 | 23 | process(data) 24 | 25 | logger.info('Finished NER tagging. 
{}'.format(data['pipeline_key'])) 26 | ch.basic_ack(delivery_tag=method.delivery_tag) 27 | 28 | 29 | def process(data): 30 | rabbit_publish = utils.RabbitClient(queue=PUBLISH, 31 | host='rabbitmq') 32 | data['ner_info'] = {} 33 | for sid, sent in data['sents'].iteritems(): 34 | try: 35 | print(sent) 36 | tokens = tokenize(sent['text']) 37 | entities = NER.extract_entities(tokens) 38 | 39 | new_ents = [] 40 | for e in entities: 41 | #MITIE returns xrange iters. Convert to tuples of ints 42 | r = (e[0].__reduce__()[1][0], 43 | e[0].__reduce__()[1][1]) 44 | tag = e[1] 45 | score = e[2] 46 | new_ents.append((r, tag, score)) 47 | data['sents'][sid]['tokens'] = tokens 48 | data['ner_info'][sid] = new_ents 49 | except Exception as e: 50 | # If something goes wrong, log it and return nothing 51 | logger.info(e) 52 | # Make sure to update this line if you change the variable names 53 | 54 | logger.info('Finished processing content.') 55 | 56 | rabbit_publish.send(data, PUBLISH) 57 | 58 | 59 | def main(): 60 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 61 | rabbit_consume.receive(callback) 62 | 63 | 64 | if __name__ == '__main__': 65 | logger.info('Running...') 66 | main() 67 | -------------------------------------------------------------------------------- /mitie/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/mit-nlp/MITIE.git 2 | pika -------------------------------------------------------------------------------- /mitie/utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | 4 | 5 | class RabbitClient(object): 6 | def __init__(self, queue, host='localhost'): 7 | self.queue = queue 8 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 9 | host=host)) 10 | 11 | self.channel = self.connection.channel() 12 | 13 | self.channel.queue_declare(queue=self.queue, durable=True) 14 | 15 | def send(self, n, routing): 16 | self.channel.basic_publish(exchange='', 17 | routing_key=routing, 18 | properties=pika.BasicProperties( 19 | delivery_mode=2,), 20 | body=json.dumps(n)) 21 | 22 | def receive(self, callback): 23 | self.channel.basic_qos(prefetch_count=1) 24 | self.channel.basic_consume(callback, queue=self.queue) 25 | self.channel.start_consuming() 26 | -------------------------------------------------------------------------------- /mitie/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 
27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /predpatt/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/syntaxnet 2 | #If they ever update this image things will likely break 3 | 4 | RUN apt-get install -y netcat 5 | 6 | #y_tho.gif 7 | ENV PYTHONPATH="${PYTHONPATH}:/opt/tensorflow/syntaxnet/bazel-bin/dragnn/tools/oss_notebook_launcher.runfiles/__main__:/opt/tensorflow/syntaxnet/bazel-bin/dragnn/tools/oss_notebook_launcher.runfiles/org_tensorflow" 8 | 9 | RUN mkdir /src 10 | ADD . /src 11 | RUN pip install -r /src/requirements.txt 12 | 13 | CMD ["/src/wait-for", "rabbitmq:5672", "-t", "30", "--", "python", "/src/app.py"] 14 | -------------------------------------------------------------------------------- /predpatt/ParseyPredFace.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import os 4 | import tensorflow as tf 5 | from dragnn.protos import spec_pb2 6 | from dragnn.python import graph_builder 7 | from dragnn.python import spec_builder 8 | from dragnn.python import load_dragnn_cc_impl # This loads the actual op definitions 9 | from dragnn.python import render_parse_tree_graphviz 10 | from dragnn.python import visualization 11 | from google.protobuf import text_format 12 | from syntaxnet import load_parser_ops # This loads the actual op definitions 13 | from syntaxnet import sentence_pb2 14 | from syntaxnet.ops import gen_parser_ops 15 | from tensorflow.python.platform import tf_logging as logging 16 | 17 | from predpatt import PredPatt 18 | from predpatt import load_conllu 19 | from predpatt import PredPattOpts 20 | from predpatt.util.ud import dep_v2 21 | 22 | 23 | def load_model(base_dir, master_spec_name, checkpoint_name): 24 | """ 25 | Function to load the syntaxnet models. Highly specific to the tutorial 26 | format right now. 27 | """ 28 | # Read the master spec 29 | master_spec = spec_pb2.MasterSpec() 30 | with open(os.path.join(base_dir, master_spec_name), "r") as f: 31 | text_format.Merge(f.read(), master_spec) 32 | spec_builder.complete_master_spec(master_spec, None, base_dir) 33 | logging.set_verbosity(logging.WARN) # Turn off TensorFlow spam. 34 | 35 | # Initialize a graph 36 | graph = tf.Graph() 37 | with graph.as_default(): 38 | hyperparam_config = spec_pb2.GridPoint() 39 | builder = graph_builder.MasterBuilder(master_spec, hyperparam_config) 40 | # This is the component that will annotate test sentences. 
41 | annotator = builder.add_annotation(enable_tracing=True) 42 | builder.add_saver() # "Savers" can save and load models; here, we're only going to load. 43 | 44 | sess = tf.Session(graph=graph) 45 | with graph.as_default(): 46 | #sess.run(tf.global_variables_initializer()) 47 | #sess.run('save/restore_all', {'save/Const:0': os.path.join(base_dir, checkpoint_name)}) 48 | builder.saver.restore(sess, os.path.join(base_dir, checkpoint_name)) 49 | 50 | def annotate_sentence(sentence): 51 | with graph.as_default(): 52 | return sess.run([annotator['annotations'], annotator['traces']], 53 | feed_dict={annotator['input_batch']: [sentence]}) 54 | return annotate_sentence 55 | 56 | 57 | def annotate_text(text): 58 | """ 59 | Segment and parse input text using syntaxnet models. 60 | """ 61 | sentence = sentence_pb2.Sentence( 62 | text=text, 63 | token=[sentence_pb2.Token(word=text, start=-1, end=-1)] 64 | ) 65 | 66 | # preprocess 67 | with tf.Session(graph=tf.Graph()) as tmp_session: 68 | char_input = gen_parser_ops.char_token_generator([sentence.SerializeToString()]) 69 | preprocessed = tmp_session.run(char_input)[0] 70 | segmented, _ = SEGMENTER_MODEL(preprocessed) 71 | 72 | annotations, traces = PARSER_MODEL(segmented[0]) 73 | assert len(annotations) == 1 74 | assert len(traces) == 1 75 | return sentence_pb2.Sentence.FromString(annotations[0]), traces[0] 76 | 77 | 78 | def parse_to_conll(parse_tree): 79 | """ 80 | Convert from the syntaxnet output format to a CoNLL-U format. 81 | """ 82 | out_str = '' 83 | for i, token in enumerate(parse_tree.token, 1): 84 | if token.head == -1: 85 | head = 0 86 | else: 87 | head = token.head + 1 88 | pos1, pos2 = token.tag.split('attribute')[-1].split('value')[-1].replace(': "', '').replace('" } ', '').split('++') 89 | out_str += '{}\t{}\t-\t{}\t{}\t-\t{}\t{}\t-\t-\n'.format(i, token.word, pos1, pos2, head, token.label) 90 | 91 | return out_str 92 | 93 | 94 | def get_ud_fragments(pp): 95 | """ 96 | Extract PP fragments from a UD parse. 97 | 98 | Format of fragments is (governor_text, governor_position, relation, 99 | token_text, token_position) 100 | """ 101 | pred_deps = [] 102 | arg2deps = {} 103 | for predicate in pp.instances: 104 | # Get dep parses for the predicate. 105 | for token in predicate.tokens: 106 | # (head, relation, dependent) 107 | if token.gov: 108 | dep = (token.gov.text, token.gov.position, token.gov_rel, 109 | token.text, token.position) 110 | else: 111 | dep = (None, None, token.gov_rel, token.text, token.position) 112 | pred_deps.append(dep) 113 | 114 | # Get dep parses for the arguments. 115 | for argument in predicate.arguments: 116 | arg_deps = [] 117 | for token in argument.tokens: 118 | if token.gov: 119 | dep = (token.gov.text, token.gov.position, token.gov_rel, 120 | token.text, token.position) 121 | else: 122 | dep = (None, None, token.gov_rel, token.text, 123 | token.position) 124 | arg_deps.append(dep) 125 | arg2deps[argument.position] = arg_deps 126 | return pred_deps, arg2deps 127 | 128 | 129 | path = '/opt/tensorflow/syntaxnet/examples/dragnn/data' 130 | SEGMENTER_MODEL = load_model(os.path.join(path, "en/segmenter"), 131 | "spec.textproto", "checkpoint") 132 | PARSER_MODEL = load_model(os.path.join(path, 'en'), 133 | "parser_spec.textproto", "checkpoint") 134 | def parse(text): 135 | """ 136 | Primary function to run syntaxnet and PredPatt over input sentences. 
137 | """ 138 | parse_tree, trace = annotate_text(text) 139 | conll_parsed = parse_to_conll(parse_tree) 140 | 141 | conll_pp = [ud_parse for sent_id, ud_parse in load_conllu(conll_parsed)][0] 142 | 143 | #PredPatt options. Modify as needed. 144 | resolve_relcl = True # relative clauses 145 | resolve_appos = True # appositional modifiers 146 | resolve_amod = True # adjectival modifiers 147 | resolve_conj = True # conjuction 148 | resolve_poss = True # possessives 149 | ud = dep_v2.VERSION # the version of UD 150 | opts = PredPattOpts(resolve_relcl=resolve_relcl, 151 | resolve_appos=resolve_appos, 152 | resolve_amod=resolve_amod, 153 | resolve_conj=resolve_conj, 154 | resolve_poss=resolve_poss, 155 | ud=ud) 156 | ppatt = PredPatt(conll_pp, opts=opts) 157 | predicate_deps, arg_deps = get_ud_fragments(ppatt) 158 | 159 | #NOTE: 160 | #This returns the pretty print formatted string from PredPatt. This is done 161 | #largely as a place holder for JSON compatability within the REST API. 162 | return {'predpatt': {'predicate_deps': predicate_deps, 163 | 'arg_deps': arg_deps}, 164 | 'conll': conll_parsed, 165 | 'original': text} 166 | -------------------------------------------------------------------------------- /predpatt/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import utils 4 | import logging 5 | 6 | import ParseyPredFace 7 | 8 | 9 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 10 | logger = logging.getLogger(__name__) 11 | logger.setLevel(logging.INFO) 12 | 13 | 14 | CONSUME = os.getenv('CONSUME') 15 | PUBLISH = os.getenv('PUBLISH') 16 | 17 | 18 | def callback(ch, method, properties, body): 19 | data = json.loads(body) 20 | logger.info('Started processing content. {}'.format(data['pipeline_key'])) 21 | 22 | process(data) 23 | 24 | logger.info('Finished PP extraction. 
{}'.format(data['pipeline_key'])) 25 | ch.basic_ack(delivery_tag=method.delivery_tag) 26 | 27 | 28 | def process(data): 29 | rabbit_publish = utils.RabbitClient(queue=PUBLISH, 30 | host='rabbitmq') 31 | data['predicate_info'] = {} 32 | for sid, sent in data['sents'].iteritems(): 33 | try: 34 | output = ParseyPredFace.parse(sent['text'].encode('utf-8')) 35 | 36 | data['predicate_info'][sid] = output 37 | except Exception as e: 38 | # If something goes wrong, log it and return nothing 39 | logger.info(e) 40 | # Make sure to update this line if you change the variable names 41 | 42 | logger.info('Finished processing content.') 43 | 44 | rabbit_publish.send(data, PUBLISH) 45 | 46 | 47 | def main(): 48 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 49 | rabbit_consume.receive(callback) 50 | 51 | 52 | if __name__ == '__main__': 53 | logger.info('Running...') 54 | main() 55 | -------------------------------------------------------------------------------- /predpatt/requirements.txt: -------------------------------------------------------------------------------- 1 | pika 2 | git+https://github.com/hltcoe/PredPatt.git 3 | -------------------------------------------------------------------------------- /predpatt/utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | 4 | 5 | class RabbitClient(object): 6 | def __init__(self, queue, host='localhost'): 7 | self.queue = queue 8 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 9 | host=host)) 10 | 11 | self.channel = self.connection.channel() 12 | 13 | self.channel.queue_declare(queue=self.queue, durable=True) 14 | 15 | def send(self, n, routing): 16 | self.channel.basic_publish(exchange='', 17 | routing_key=routing, 18 | properties=pika.BasicProperties( 19 | delivery_mode=2,), 20 | body=json.dumps(n)) 21 | 22 | def receive(self, callback): 23 | self.channel.basic_qos(prefetch_count=1) 24 | self.channel.basic_consume(callback, queue=self.queue) 25 | self.channel.start_consuming() 26 | -------------------------------------------------------------------------------- /predpatt/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 
27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /quad/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda 2 | 3 | MAINTAINER John Beieler 4 | 5 | #RUN sed -i "s/httpredir.debian.org/`curl -s -D -http://httpredir.debian.org/demo/debian/ | awk '/^Link:/ { print $2 }' | sed -e 's@;@\1@g'`/" /etc/apt/sources.list 6 | RUN apt-get clean && apt-get update 7 | RUN apt-get install -y build-essential python-dev netcat 8 | 9 | ADD . /src 10 | RUN cd /src; pip install -r requirements.txt 11 | 12 | RUN chmod -x /src/launch.sh 13 | CMD sh /src/launch.sh 14 | -------------------------------------------------------------------------------- /quad/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import utils 5 | 6 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 7 | logger = logging.getLogger(__name__) 8 | logger.setLevel(logging.INFO) 9 | 10 | 11 | MAXLEN = 1400 12 | CONSUME = os.getenv('CONSUME') 13 | PUBLISH = os.getenv('PUBLISH') 14 | 15 | 16 | def callback(ch, method, properties, body): 17 | global MODEL, VOCAB, VOCAB_SIZE, CHECK 18 | 19 | data = json.loads(body) 20 | key = data['pipeline_key'] 21 | 22 | data['event_info'] = {} 23 | for sid, sent in data['sents'].iteritems(): 24 | data['event_info'][sid] = {} 25 | data['event_info'][sid]['predicted_class'] = {} 26 | data['event_info'][sid]['sent'] = sent 27 | 28 | if data['predicted_relevancy'] == 1: 29 | logger.info('Started processing content. {}'.format(key)) 30 | data = process(data, MODEL, VOCAB, VOCAB_SIZE, CHECK) 31 | logger.info('Finished quad tagging. {}'.format(key)) 32 | else: 33 | logger.info('Irrelevant content. 
{}'.format(key)) 34 | 35 | publish(data) 36 | 37 | ch.basic_ack(delivery_tag=method.delivery_tag) 38 | 39 | 40 | def process(data, model, vocab, vocab_size, check): 41 | sents = data['sents'] 42 | key = data['pipeline_key'] 43 | for sid, sent in sents.iteritems(): 44 | try: 45 | logger.info('Processing sent {} for content {}'.format(sid, key)) 46 | mat = utils.encode_data( 47 | [sent['text']], MAXLEN, vocab, vocab_size, check, 48 | ) 49 | pred = model.predict(mat) 50 | pred_class = pred.argmax(1)[0] 51 | pred_score = pred[0][pred_class] 52 | data['event_info'][sid]['predicted_class'] = { 53 | 'class': pred_class, 'score': str(pred_score), 54 | } 55 | except: 56 | logger.exception('Error during quad processing of {}'.format(key)) 57 | return data 58 | 59 | 60 | def publish(data): 61 | client = utils.RabbitClient(queue=PUBLISH, host='rabbitmq') 62 | client.send(data, PUBLISH) 63 | 64 | 65 | def main(): 66 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 67 | rabbit_consume.receive(callback) 68 | 69 | 70 | if __name__ == '__main__': 71 | args = utils.parse_arguments() 72 | 73 | logger.info('Loading model...') 74 | MODEL, VOCAB = utils.load_model(args) 75 | VOCAB_SIZE = len(VOCAB.keys()) 76 | CHECK = set(VOCAB.keys()) 77 | 78 | main() 79 | -------------------------------------------------------------------------------- /quad/launch.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | echo "Starting up analytic service..." 3 | ./src/wait-for rabbitmq:5672 -t 60 -- python /src/app.py \ 4 | -m /src/quad_trained/quad_character_model.json \ 5 | -w /src/quad_trained/quad_character_model_weights.h5 \ 6 | -v /src/quad_trained/quad_character_model_vocab.pkl 7 | -------------------------------------------------------------------------------- /quad/quad_trained/quad_character_model.json: -------------------------------------------------------------------------------- 1 | "{\"class_name\": \"Model\", \"config\": {\"layers\": [{\"class_name\": \"InputLayer\", \"config\": {\"batch_input_shape\": [null, 1400, 89], \"name\": \"input\", \"input_dtype\": \"float32\"}, \"inbound_nodes\": [], \"name\": \"input\"}, {\"class_name\": \"Convolution1D\", \"config\": {\"batch_input_shape\": [null, 1400, 89], \"W_constraint\": null, \"b_constraint\": null, \"name\": \"convolution1d_1\", \"activity_regularizer\": null, \"trainable\": true, \"filter_length\": 7, \"init\": \"uniform\", \"bias\": true, \"nb_filter\": 256, \"input_dtype\": \"float32\", \"subsample_length\": 1, \"border_mode\": \"valid\", \"input_dim\": null, \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"input_length\": null}, \"inbound_nodes\": [[[\"input\", 0, 0]]], \"name\": \"convolution1d_1\"}, {\"class_name\": \"MaxPooling1D\", \"config\": {\"stride\": 3, \"pool_length\": 3, \"trainable\": true, \"name\": \"maxpooling1d_1\", \"border_mode\": \"valid\"}, \"inbound_nodes\": [[[\"convolution1d_1\", 0, 0]]], \"name\": \"maxpooling1d_1\"}, {\"class_name\": \"Convolution1D\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"convolution1d_3\", \"activity_regularizer\": null, \"trainable\": true, \"filter_length\": 3, \"init\": \"uniform\", \"bias\": true, \"nb_filter\": 256, \"input_dim\": null, \"subsample_length\": 1, \"border_mode\": \"valid\", \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"input_length\": null}, \"inbound_nodes\": [[[\"maxpooling1d_1\", 0, 0]]], \"name\": \"convolution1d_3\"}, 
{\"class_name\": \"Convolution1D\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"convolution1d_4\", \"activity_regularizer\": null, \"trainable\": true, \"filter_length\": 3, \"init\": \"uniform\", \"bias\": true, \"nb_filter\": 256, \"input_dim\": null, \"subsample_length\": 1, \"border_mode\": \"valid\", \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"input_length\": null}, \"inbound_nodes\": [[[\"convolution1d_3\", 0, 0]]], \"name\": \"convolution1d_4\"}, {\"class_name\": \"Convolution1D\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"convolution1d_5\", \"activity_regularizer\": null, \"trainable\": true, \"filter_length\": 3, \"init\": \"uniform\", \"bias\": true, \"nb_filter\": 256, \"input_dim\": null, \"subsample_length\": 1, \"border_mode\": \"valid\", \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"input_length\": null}, \"inbound_nodes\": [[[\"convolution1d_4\", 0, 0]]], \"name\": \"convolution1d_5\"}, {\"class_name\": \"MaxPooling1D\", \"config\": {\"stride\": 3, \"pool_length\": 3, \"trainable\": true, \"name\": \"maxpooling1d_3\", \"border_mode\": \"valid\"}, \"inbound_nodes\": [[[\"convolution1d_5\", 0, 0]]], \"name\": \"maxpooling1d_3\"}, {\"class_name\": \"Flatten\", \"config\": {\"trainable\": true, \"name\": \"flatten_1\"}, \"inbound_nodes\": [[[\"maxpooling1d_3\", 0, 0]]], \"name\": \"flatten_1\"}, {\"class_name\": \"Dense\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"dense_1\", \"activity_regularizer\": null, \"trainable\": true, \"init\": \"glorot_uniform\", \"bias\": true, \"input_dim\": null, \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"output_dim\": 1024}, \"inbound_nodes\": [[[\"flatten_1\", 0, 0]]], \"name\": \"dense_1\"}, {\"class_name\": \"Dropout\", \"config\": {\"p\": 0.5, \"trainable\": true, \"name\": \"dropout_1\"}, \"inbound_nodes\": [[[\"dense_1\", 0, 0]]], \"name\": \"dropout_1\"}, {\"class_name\": \"Dense\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"dense_2\", \"activity_regularizer\": null, \"trainable\": true, \"init\": \"glorot_uniform\", \"bias\": true, \"input_dim\": null, \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"relu\", \"output_dim\": 1024}, \"inbound_nodes\": [[[\"dropout_1\", 0, 0]]], \"name\": \"dense_2\"}, {\"class_name\": \"Dropout\", \"config\": {\"p\": 0.5, \"trainable\": true, \"name\": \"dropout_2\"}, \"inbound_nodes\": [[[\"dense_2\", 0, 0]]], \"name\": \"dropout_2\"}, {\"class_name\": \"Dense\", \"config\": {\"W_constraint\": null, \"b_constraint\": null, \"name\": \"output\", \"activity_regularizer\": null, \"trainable\": true, \"init\": \"glorot_uniform\", \"bias\": true, \"input_dim\": null, \"b_regularizer\": null, \"W_regularizer\": null, \"activation\": \"softmax\", \"output_dim\": 5}, \"inbound_nodes\": [[[\"dropout_2\", 0, 0]]], \"name\": \"output\"}], \"input_layers\": [[\"input\", 0, 0]], \"output_layers\": [[\"output\", 0, 0]], \"name\": \"model_1\"}}" -------------------------------------------------------------------------------- /quad/quad_trained/quad_character_model_vocab.pkl: -------------------------------------------------------------------------------- 1 | (dp1 2 | S' ' 3 | I0 4 | sS'$' 5 | I1 6 | sS',' 7 | I2 8 | sS'0' 9 | I3 10 | sS'4' 11 | I4 12 | sS'8' 13 | I5 14 | sS'@' 15 | I6 16 | sS'D' 17 | I7 18 | sS'H' 19 | I8 20 | sS'L' 21 | I9 22 | sS'P' 23 | I10 24 | sS'T' 25 | I11 26 | 
sS'X' 27 | I12 28 | sS'\\' 29 | I13 30 | sS'`' 31 | I14 32 | sS'd' 33 | I15 34 | sS'h' 35 | I16 36 | sS'l' 37 | I17 38 | sS'p' 39 | I18 40 | sS't' 41 | I19 42 | sS'x' 43 | I20 44 | sS'|' 45 | I21 46 | sS'\xa0' 47 | I22 48 | sS'#' 49 | I23 50 | sS"'" 51 | I24 52 | sS'+' 53 | I25 54 | sS'/' 55 | I26 56 | sS'3' 57 | I27 58 | sS'7' 59 | I28 60 | sS';' 61 | I29 62 | sS'?' 63 | I30 64 | sS'C' 65 | I31 66 | sS'\xc4' 67 | I32 68 | sS'G' 69 | I33 70 | sS'K' 71 | I34 72 | sS'O' 73 | I35 74 | sS'S' 75 | I36 76 | sS'W' 77 | I37 78 | sS'_' 79 | I38 80 | sS'c' 81 | I39 82 | sS'g' 83 | I40 84 | sS'k' 85 | I41 86 | sS'o' 87 | I42 88 | sS's' 89 | I43 90 | sS'w' 91 | I44 92 | sS'&' 93 | I45 94 | sS'*' 95 | I46 96 | sS'.' 97 | I47 98 | sS'\xb1' 99 | I48 100 | sS'2' 101 | I49 102 | sS'6' 103 | I50 104 | sS':' 105 | I51 106 | sS'>' 107 | I52 108 | sS'B' 109 | I53 110 | sS'F' 111 | I54 112 | sS'J' 113 | I55 114 | sS'N' 115 | I56 116 | sS'R' 117 | I57 118 | sS'V' 119 | I58 120 | sS'Z' 121 | I59 122 | sS'b' 123 | I60 124 | sS'f' 125 | I61 126 | sS'j' 127 | I62 128 | sS'n' 129 | I63 130 | sS'r' 131 | I64 132 | sS'v' 133 | I65 134 | sS'z' 135 | I66 136 | sS'!' 137 | I67 138 | sS'%' 139 | I68 140 | sS'-' 141 | I69 142 | sS'1' 143 | I70 144 | sS'5' 145 | I71 146 | sS'9' 147 | I72 148 | sS'=' 149 | I73 150 | sS'A' 151 | I74 152 | sS'\xc2' 153 | I75 154 | sS'E' 155 | I76 156 | sS'I' 157 | I77 158 | sS'M' 159 | I78 160 | sS'Q' 161 | I79 162 | sS'U' 163 | I80 164 | sS'Y' 165 | I81 166 | sS'a' 167 | I82 168 | sS'e' 169 | I83 170 | sS'i' 171 | I84 172 | sS'm' 173 | I85 174 | sS'q' 175 | I86 176 | sS'u' 177 | I87 178 | sS'y' 179 | I88 180 | s. -------------------------------------------------------------------------------- /quad/quad_trained/quad_character_model_weights.h5: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:11143e7e34a4ba71d7e36c1979a9bc7997b19fc9d3c8edee35585000fe2eea51 3 | size 166628004 4 | -------------------------------------------------------------------------------- /quad/requirements.txt: -------------------------------------------------------------------------------- 1 | keras==1.0.6 2 | pika 3 | -------------------------------------------------------------------------------- /quad/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pika 3 | import cPickle 4 | import argparse 5 | import numpy as np 6 | 7 | from keras.models import model_from_json 8 | 9 | 10 | def parse_arguments(): 11 | parser = argparse.ArgumentParser(description='Run the category classifier \ 12 | API.') 13 | parser._optionals.title = 'Options' 14 | parser.add_argument('-m', '--model_path', 15 | help='Directory path for the classifier model.', 16 | type=str, required=True) 17 | parser.add_argument('-w', '--weights_path', 18 | help='Directory path for the model weights.', 19 | type=str, required=True) 20 | parser.add_argument('-v', '--vocab_path', 21 | help='Directory path for the classifier vocab.', 22 | type=str, required=True) 23 | return parser.parse_args() 24 | 25 | 26 | class RabbitClient(object): 27 | def __init__(self, queue, host='localhost'): 28 | self.queue = queue 29 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 30 | host=host)) 31 | 32 | self.channel = self.connection.channel() 33 | 34 | self.channel.queue_declare(queue=self.queue, durable=True) 35 | 36 | def send(self, n, routing): 37 | self.channel.basic_publish(exchange='', 38 | 
routing_key=routing, 39 | properties=pika.BasicProperties( 40 | delivery_mode=2,), 41 | body=json.dumps(n)) 42 | 43 | def receive(self, callback): 44 | self.channel.basic_qos(prefetch_count=1) 45 | self.channel.basic_consume(callback, queue=self.queue) 46 | self.channel.start_consuming() 47 | 48 | 49 | def load_model(args): 50 | model = model_from_json(json.load(open(args.model_path))) 51 | model.load_weights(args.weights_path) 52 | 53 | vocab = cPickle.load(open(args.vocab_path)) 54 | 55 | return model, vocab 56 | 57 | 58 | def encode_data(x, maxlen, vocab, vocab_size, check): 59 | #Iterate over the loaded data and create a matrix of size maxlen x vocabsize 60 | #In this case that will be 1014x69. This is then placed in a 3D matrix of size 61 | #data_samples x maxlen x vocab_size. Each character is encoded into a one-hot 62 | #array. Chars not in the vocab are encoded into an all zero vector. 63 | 64 | input_data = np.zeros((len(x), maxlen, vocab_size)) 65 | for dix, sent in enumerate(x): 66 | counter = 0 67 | sent_array = np.zeros((maxlen, vocab_size)) 68 | chars = list(sent.lower().replace(' ', '')) 69 | for c in chars: 70 | if counter >= maxlen: 71 | pass 72 | else: 73 | char_array = np.zeros(vocab_size, dtype=np.int) 74 | if c in check: 75 | ix = vocab[c] 76 | char_array[ix] = 1 77 | sent_array[counter, :] = char_array 78 | counter += 1 79 | input_data[dix, :, :] = sent_array 80 | 81 | return input_data 82 | -------------------------------------------------------------------------------- /quad/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | -------------------------------------------------------------------------------- /relevancy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda 2 | 3 | MAINTAINER John Beieler 4 | 5 | RUN apt-get install -y unzip netcat 6 | 7 | ADD . 
/src 8 | 9 | RUN cd /src; pip install -r requirements.txt 10 | 11 | RUN chmod +x /src/launch.sh 12 | CMD sh /src/launch.sh 13 | -------------------------------------------------------------------------------- /relevancy/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import utils 4 | import logging 5 | 6 | logging.basicConfig(format='%(levelname)s %(asctime)s %(filename)s %(lineno)d: %(message)s') 7 | logger = logging.getLogger(__name__) 8 | logger.setLevel(logging.INFO) 9 | 10 | CONSUME = os.getenv('CONSUME') 11 | PUBLISH = os.getenv('PUBLISH') 12 | 13 | 14 | def callback(ch, method, properties, body): 15 | global TFIDF, CLF 16 | data = json.loads(body) 17 | logger.info('Started processing content. {}'.format(data['pipeline_key'])) 18 | 19 | process(data, TFIDF, CLF) 20 | 21 | logger.info('Finished tagging relevancy. {}'.format(data['pipeline_key'])) 22 | ch.basic_ack(delivery_tag=method.delivery_tag) 23 | 24 | 25 | def process(data, tfidf, clf): 26 | rabbit_publish = utils.RabbitClient(queue=PUBLISH, 27 | host='rabbitmq') 28 | try: 29 | mat = tfidf.transform([data['title']]) 30 | pred = clf.predict(mat) 31 | data['predicted_relevancy'] = pred[0] 32 | logger.info('Finished processing content.') 33 | except Exception as e: 34 | # If something goes wrong, log it and return nothing 35 | logger.info(e) 36 | # Make sure to update this line if you change the variable names 37 | 38 | rabbit_publish.send(data, PUBLISH) 39 | 40 | 41 | def main(): 42 | rabbit_consume = utils.RabbitClient(queue=CONSUME, host='rabbitmq') 43 | rabbit_consume.receive(callback) 44 | 45 | 46 | if __name__ == '__main__': 47 | args = utils.parse_arguments() 48 | logger.info('Loading model...') 49 | CLF, TFIDF = utils.load_model(args) 50 | logger.info('Running...') 51 | main() 52 | -------------------------------------------------------------------------------- /relevancy/launch.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | echo "Starting up analytic service..." 
3 | cd /src 4 | ./wait-for rabbitmq:5672 -t 60 -- python /src/app.py -m /src/relevancy_trained_classifier/svm/relevancy_classifier.pkl \ 5 | -tf /src/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl 6 | -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_01.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_01.npy -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_02.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_02.npy -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_03.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/svm/relevancy_classifier.pkl_03.npy -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl_01.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl_01.npy -------------------------------------------------------------------------------- /relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl_02.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hltcoe/EventMiner/0250a72ca4e19977b54385bc8255eada1159d08a/relevancy/relevancy_trained_classifier/tfidf/relevancy_classifier_tfidf.pkl_02.npy -------------------------------------------------------------------------------- /relevancy/requirements.txt: -------------------------------------------------------------------------------- 1 | pika 2 | -------------------------------------------------------------------------------- /relevancy/utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import json 3 | import argparse 4 | 5 | from 
sklearn.externals import joblib 6 | 7 | 8 | def parse_arguments(): 9 | parser = argparse.ArgumentParser(description='Run the relevancy classifier\ 10 | API.') 11 | parser._optionals.title = 'Options' 12 | parser.add_argument('-m', '--clf_path', 13 | help='Filepath for the classifier model.', 14 | type=str, required=True) 15 | parser.add_argument('-tf', '--tfidf_path', 16 | help='Filepath for the TFIDF model.', 17 | type=str, required=True) 18 | return parser.parse_args() 19 | 20 | 21 | class RabbitClient(object): 22 | def __init__(self, queue, host='localhost'): 23 | self.queue = queue 24 | self.connection = pika.BlockingConnection(pika.ConnectionParameters( 25 | host=host)) 26 | 27 | self.channel = self.connection.channel() 28 | 29 | self.channel.queue_declare(queue=self.queue, durable=True) 30 | 31 | def send(self, n, routing): 32 | self.channel.basic_publish(exchange='', 33 | routing_key=routing, 34 | properties=pika.BasicProperties( 35 | delivery_mode=2,), 36 | body=json.dumps(n)) 37 | 38 | def receive(self, callback): 39 | self.channel.basic_qos(prefetch_count=1) 40 | self.channel.basic_consume(callback, queue=self.queue) 41 | self.channel.start_consuming() 42 | 43 | 44 | def load_model(args): 45 | model = joblib.load(args.clf_path) 46 | tfidf = joblib.load(args.tfidf_path) 47 | 48 | return model, tfidf 49 | -------------------------------------------------------------------------------- /relevancy/wait-for: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TIMEOUT=15 4 | QUIET=0 5 | 6 | echoerr() { 7 | if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi 8 | } 9 | 10 | usage() { 11 | exitcode="$1" 12 | cat << USAGE >&2 13 | Usage: 14 | $cmdname host:port [-t timeout] [-- command args] 15 | -q | --quiet Do not output any status messages 16 | -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout 17 | -- COMMAND ARGS Execute command with args after the test finishes 18 | USAGE 19 | exit "$exitcode" 20 | } 21 | 22 | wait_for() { 23 | for i in `seq $TIMEOUT` ; do 24 | nc -z "$HOST" "$PORT" > /dev/null 2>&1 25 | 26 | result=$? 27 | if [ $result -eq 0 ] ; then 28 | if [ $# -gt 0 ] ; then 29 | exec "$@" 30 | fi 31 | exit 0 32 | fi 33 | sleep 1 34 | done 35 | echo "Operation timed out" >&2 36 | exit 1 37 | } 38 | 39 | while [ $# -gt 0 ] 40 | do 41 | case "$1" in 42 | *:* ) 43 | HOST=$(printf "%s\n" "$1"| cut -d : -f 1) 44 | PORT=$(printf "%s\n" "$1"| cut -d : -f 2) 45 | shift 1 46 | ;; 47 | -q | --quiet) 48 | QUIET=1 49 | shift 1 50 | ;; 51 | -t) 52 | TIMEOUT="$2" 53 | if [ "$TIMEOUT" = "" ]; then break; fi 54 | shift 2 55 | ;; 56 | --timeout=*) 57 | TIMEOUT="${1#*=}" 58 | shift 1 59 | ;; 60 | --) 61 | shift 62 | break 63 | ;; 64 | --help) 65 | usage 0 66 | ;; 67 | *) 68 | echoerr "Unknown argument: $1" 69 | usage 1 70 | ;; 71 | esac 72 | done 73 | 74 | if [ "$HOST" = "" -o "$PORT" = "" ]; then 75 | echoerr "Error: you need to provide a host and port to test." 76 | usage 2 77 | fi 78 | 79 | wait_for "$@" 80 | --------------------------------------------------------------------------------