├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt └── src ├── example.oie ├── example.txt ├── format_oie.py └── run_oie.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Gabriel Stanovsky 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Supervised OIE Wrapper 2 | 3 | This is thin wrapper over [AllenNLP](allennlp.org)'s [pretrained Open IE model](https://demo.allennlp.org/open-information-extraction). 4 | 5 | Outputs predictions identical to those in the onlline demo, with batched gpu options. 6 | 7 | ## Install prerequisites 8 | 9 | * AllenNLP 10 | * docopt 11 | * tqdm 12 | 13 | Use the following to install requirements: 14 | 15 | pip install -r requirements.txt 16 | 17 | ## Run on raw sentences 18 | 19 | cd src 20 | python run_oie.py --in=path/to/input/file --batch-size= --out=path/to/output/file [--cuda-device=] 21 | 22 | If `--cuda-device` is not specified, the model will run on the cpu. 23 | 24 | ## Input format 25 | 26 | Raw sentences, each in a new line. 27 | 28 | ## Output Format 29 | Each line pertains to a single OIE extraction: 30 | 31 | tokenized sentence ARG0:.. V:... ARG1:... ... 32 | 33 | ## Example 34 | 35 | See example of input and output files in [src/example.txt](src/example.txt) and [src/example.oie](src/example.oie). 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | allennlp 2 | tqdm 3 | docopt 4 | -------------------------------------------------------------------------------- /src/example.oie: -------------------------------------------------------------------------------- 1 | In December , John decided to join the party . ARG0:John V:decided to join ARG1:the party ARG2:In December 0.812473166828961 2 | Bob agreed to take out the trash . ARG0:Bob V:agreed to take out ARG1:the trash 0.9544605053059868 3 | Alex Honnold climbed up a New Jersey skyscraper . ARG0:Alex Honnold V:climbed up ARG1:a New Jersey skyscraper 0.8046848066803889 4 | Albert Einstein , a German theoretical physicist , published the theory of relativity in 1915 . ARG0:Albert Einstein V:published ARG1:the theory of relativity ARG2:in 1915 0.5429271689591637 5 | Chair umpire Ramos managed to rob two players in the U.S. Open final . ARG0:Chair umpire Ramos V:managed to rob ARG1:two players ARG2:in the U.S. Open final 0.0018216976710673505 6 | The CEO of a multi - million dollar company does n't have much free time . ARG0:The CEO of a multi - million dollar company V:does n't have ARG1:much free time 0.15056773929185477 -------------------------------------------------------------------------------- /src/example.txt: -------------------------------------------------------------------------------- 1 | In December, John decided to join the party. 2 | Bob agreed to take out the trash. 3 | Alex Honnold climbed up a New Jersey skyscraper. 4 | Albert Einstein, a German theoretical physicist, published the theory of relativity in 1915. 5 | Chair umpire Ramos managed to rob two players in the U.S. Open final. 6 | The CEO of a multi-million dollar company doesn't have much free time. 7 | -------------------------------------------------------------------------------- /src/format_oie.py: -------------------------------------------------------------------------------- 1 | """ Usage: 2 | --in=INPUT_FILE --out=OUTPUT_FILE [--debug] 3 | """ 4 | # External imports 5 | import logging 6 | from pprint import pprint 7 | from pprint import pformat 8 | from docopt import docopt 9 | import json 10 | from collections import defaultdict 11 | from tqdm import tqdm 12 | from allennlp.pretrained import open_information_extraction_stanovsky_2018 13 | from allennlp.predictors.open_information_extraction import consolidate_predictions 14 | from allennlp.predictors.open_information_extraction import join_mwp 15 | from allennlp.predictors.open_information_extraction import make_oie_string 16 | from allennlp.predictors.open_information_extraction import get_predicate_text 17 | 18 | # Local imports 19 | 20 | #=---- 21 | 22 | class Mock_token: 23 | """ 24 | Spacy token imitation 25 | """ 26 | def __init__(self, tok_str): 27 | self.text = tok_str 28 | 29 | def __str__(self): 30 | return self.text 31 | 32 | def get_oie_frame(tokens, tags) -> str: 33 | """ 34 | Converts a list of model outputs (i.e., a list of lists of bio tags, each 35 | pertaining to a single word), returns an inline bracket representation of 36 | the prediction. 37 | """ 38 | frame = defaultdict(list) 39 | chunk = [] 40 | words = [token.text for token in tokens] 41 | 42 | for (token, tag) in zip(words, tags): 43 | if tag.startswith("I-") or tag.startswith("B-"): 44 | frame[tag[2:]].append(token) 45 | 46 | return dict(frame) 47 | 48 | 49 | def get_frame_str(oie_frame) -> str: 50 | """ 51 | Convert and oie frame dictionary to string. 52 | """ 53 | dummy_dict = dict([(k if k != "V" else "ARG01", v) 54 | for (k, v) in oie_frame.items()]) 55 | 56 | sorted_roles = sorted(dummy_dict) 57 | 58 | frame_str = [] 59 | for role in sorted_roles: 60 | if role == "ARG01": 61 | role = "V" 62 | arg = " ".join(oie_frame[role]) 63 | frame_str.append(f"{role}:{arg}") 64 | 65 | return "\t".join(frame_str) 66 | 67 | 68 | def format_extractions(sent_tokens, sent_predictions): 69 | """ 70 | Convert token-level raw predictions to clean extractions. 71 | """ 72 | # Consolidate predictions 73 | if not (len(set(map(len, sent_predictions))) == 1): 74 | raise AssertionError 75 | assert len(sent_tokens) == len(sent_predictions[0]) 76 | sent_str = " ".join(map(str, sent_tokens)) 77 | 78 | pred_dict = consolidate_predictions(sent_predictions, sent_tokens) 79 | 80 | # Build and return output dictionary 81 | results = [] 82 | all_tags = [] 83 | 84 | for tags in pred_dict.values(): 85 | # Join multi-word predicates 86 | tags = join_mwp(tags) 87 | all_tags.append(tags) 88 | 89 | # Create description text 90 | oie_frame = get_oie_frame(sent_tokens, tags) 91 | 92 | # Add a predicate prediction to outputs. 93 | results.append("\t".join([sent_str, get_frame_str(oie_frame)])) 94 | 95 | return results, all_tags 96 | -------------------------------------------------------------------------------- /src/run_oie.py: -------------------------------------------------------------------------------- 1 | """ Usage: 2 | --in=INPUT_FILE --batch-size=BATCH-SIZE --out=OUTPUT_FILE [--cuda-device=CUDA_DEVICE] [--debug] 3 | """ 4 | # External imports 5 | import logging 6 | from pprint import pprint 7 | from pprint import pformat 8 | from docopt import docopt 9 | import json 10 | import pdb 11 | from tqdm import tqdm 12 | from allennlp.pretrained import open_information_extraction_stanovsky_2018 13 | from collections import defaultdict 14 | from operator import itemgetter 15 | import functools 16 | import operator 17 | 18 | # Local imports 19 | from format_oie import format_extractions, Mock_token 20 | #=----- 21 | 22 | def chunks(l, n): 23 | """ 24 | Yield successive n-sized chunks from l. 25 | """ 26 | for i in range(0, len(l), n): 27 | yield l[i:i + n] 28 | 29 | def create_instances(model, sent): 30 | """ 31 | Convert a sentence into a list of instances. 32 | """ 33 | sent_tokens = model._tokenizer.tokenize(sent) 34 | 35 | # Find all verbs in the input sentence 36 | pred_ids = [i for (i, t) in enumerate(sent_tokens) 37 | if t.pos_ == "VERB" or t.pos_ == "AUX"] 38 | 39 | # Create instances 40 | instances = [{"sentence": sent_tokens, 41 | "predicate_index": pred_id} 42 | for pred_id in pred_ids] 43 | 44 | return instances 45 | 46 | def get_confidence(model, tag_per_token, class_probs): 47 | """ 48 | Get the confidence of a given model in a token list, using the class probabilities 49 | associated with this prediction. 50 | """ 51 | token_indexes = [model._model.vocab.get_token_index(tag, namespace = "labels") for tag in tag_per_token] 52 | 53 | # Get probability per tag 54 | probs = [class_prob[token_index] for token_index, class_prob in zip(token_indexes, class_probs)] 55 | 56 | # Combine (product) 57 | prod_prob = functools.reduce(operator.mul, probs) 58 | 59 | return prod_prob 60 | 61 | def run_oie(lines, batch_size=64, cuda_device=-1, debug=False): 62 | """ 63 | Run the OIE model and process the output. 64 | """ 65 | 66 | if debug: 67 | logging.basicConfig(level = logging.DEBUG) 68 | else: 69 | logging.basicConfig(level = logging.INFO) 70 | 71 | # Init OIE 72 | model = open_information_extraction_stanovsky_2018() 73 | 74 | # Move model to gpu, if requested 75 | if cuda_device >= 0: 76 | model._model.cuda(cuda_device) 77 | 78 | # process sentences 79 | logging.info("Processing sentences") 80 | oie_lines = [] 81 | for chunk in tqdm(chunks(lines, batch_size)): 82 | oie_inputs = [] 83 | for sent in chunk: 84 | oie_inputs.extend(create_instances(model, sent)) 85 | if not oie_inputs: 86 | # No predicates in this sentence 87 | continue 88 | 89 | # Run oie on sents 90 | sent_preds = model.predict_batch_json(oie_inputs) 91 | 92 | # Collect outputs in batches 93 | predictions_by_sent = defaultdict(list) 94 | for outputs in sent_preds: 95 | sent_tokens = outputs["words"] 96 | tags = outputs["tags"] 97 | sent_str = " ".join(sent_tokens) 98 | assert(len(sent_tokens) == len(tags)) 99 | predictions_by_sent[sent_str].append((outputs["tags"], outputs["class_probabilities"])) 100 | 101 | # Create extractions by sentence 102 | for sent_tokens, predictions_for_sent in predictions_by_sent.items(): 103 | raw_tags = list(map(itemgetter(0), predictions_for_sent)) 104 | class_probs = list(map(itemgetter(1), predictions_for_sent)) 105 | 106 | # Compute confidence per extraction 107 | confs = [get_confidence(model, tag_per_token, class_prob) 108 | for tag_per_token, class_prob in zip(raw_tags, class_probs)] 109 | 110 | extractions, tags = format_extractions([Mock_token(tok) for tok in sent_tokens.split(" ")], raw_tags) 111 | 112 | oie_lines.extend([extraction + f"\t{conf}" for extraction, conf in zip(extractions, confs)]) 113 | logging.info("DONE") 114 | return oie_lines 115 | 116 | if __name__ == "__main__": 117 | # Parse command line arguments 118 | args = docopt(__doc__) 119 | inp_fn = args["--in"] 120 | batch_size = int(args["--batch-size"]) 121 | out_fn = args["--out"] 122 | cuda_device = int(args["--cuda-device"]) if (args["--cuda-device"] is not None) \ 123 | else -1 124 | debug = args["--debug"] 125 | 126 | lines = [line.strip() 127 | for line in open(inp_fn, encoding = "utf8")] 128 | 129 | oie_lines = run_oie(lines, batch_size, cuda_device, debug) 130 | 131 | # Write to file 132 | logging.info(f"Writing output to {out_fn}") 133 | with open(out_fn, "w", encoding = "utf8") as fout: 134 | fout.write("\n".join(oie_lines)) 135 | --------------------------------------------------------------------------------