├── .gitignore
├── LICENSE
├── README.md
├── requirements.txt
└── src
    ├── example.oie
    ├── example.txt
    ├── format_oie.py
    └── run_oie.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Gabriel Stanovsky
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Supervised OIE Wrapper
 2 | 
 3 | This is thin wrapper over [AllenNLP](allennlp.org)'s [pretrained Open IE model](https://demo.allennlp.org/open-information-extraction).
 4 | 
 5 | Outputs predictions identical to those in the onlline demo, with batched gpu options.
 6 | 
 7 | ## Install prerequisites
 8 | 
 9 | * AllenNLP
10 | * docopt
11 | * tqdm
12 | 
13 | Use the following to install requirements:
14 | 
15 |     pip install -r requirements.txt
16 | 
17 | ## Run on raw sentences
18 | 
19 |     cd src
20 |     python run_oie.py --in=path/to/input/file  --batch-size=<batch-size> --out=path/to/output/file [--cuda-device=<cude-device-identifier>]
21 |     
22 | If `--cuda-device` is not specified, the model will run on the cpu.
23 | 
24 | ## Input format
25 | 
26 | Raw sentences, each in a new line.
27 | 
28 | ## Output Format
29 | Each line pertains to a single OIE extraction:
30 | 
31 |     tokenized sentence <tab> ARG0:.. <tab> V:... <tab> ARG1:...  ...
32 | 
33 | ## Example
34 | 
35 | See example of input and output files in [src/example.txt](src/example.txt) and [src/example.oie](src/example.oie).
36 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | allennlp
2 | tqdm
3 | docopt
4 | 


--------------------------------------------------------------------------------
/src/example.oie:
--------------------------------------------------------------------------------
1 | In December , John decided to join the party .	ARG0:John	V:decided to join	ARG1:the party	ARG2:In December	0.812473166828961
2 | Bob agreed to take out the trash .	ARG0:Bob	V:agreed to take out	ARG1:the trash	0.9544605053059868
3 | Alex Honnold climbed up a New Jersey skyscraper .	ARG0:Alex Honnold	V:climbed up	ARG1:a New Jersey skyscraper	0.8046848066803889
4 | Albert Einstein , a German theoretical physicist , published the theory of relativity in 1915 .	ARG0:Albert Einstein	V:published	ARG1:the theory of relativity	ARG2:in 1915	0.5429271689591637
5 | Chair umpire Ramos managed to rob two players in the U.S. Open final .	ARG0:Chair umpire Ramos	V:managed to rob	ARG1:two players	ARG2:in the U.S. Open final	0.0018216976710673505
6 | The CEO of a multi - million dollar company does n't have much free time .	ARG0:The CEO of a multi - million dollar company	V:does n't have	ARG1:much free time	0.15056773929185477


--------------------------------------------------------------------------------
/src/example.txt:
--------------------------------------------------------------------------------
1 | In December, John decided to join the party.
2 | Bob agreed to take out the trash.
3 | Alex Honnold climbed up a New Jersey skyscraper.
4 | Albert Einstein, a German theoretical physicist, published the theory of relativity in 1915.
5 | Chair umpire Ramos managed to rob two players in the U.S. Open final.
6 | The CEO of a multi-million dollar company doesn't have much free time.
7 | 


--------------------------------------------------------------------------------
/src/format_oie.py:
--------------------------------------------------------------------------------
 1 | """ Usage:
 2 |     <file-name> --in=INPUT_FILE --out=OUTPUT_FILE [--debug]
 3 | """
 4 | # External imports
 5 | import logging
 6 | from pprint import pprint
 7 | from pprint import pformat
 8 | from docopt import docopt
 9 | import json
10 | from collections import defaultdict
11 | from tqdm import tqdm
12 | from allennlp.pretrained import open_information_extraction_stanovsky_2018
13 | from allennlp.predictors.open_information_extraction import consolidate_predictions
14 | from allennlp.predictors.open_information_extraction import join_mwp
15 | from allennlp.predictors.open_information_extraction import make_oie_string
16 | from allennlp.predictors.open_information_extraction import get_predicate_text
17 | 
18 | # Local imports
19 | 
20 | #=----
21 | 
22 | class Mock_token:
23 |     """
24 |     Spacy token imitation
25 |     """
26 |     def __init__(self, tok_str):
27 |         self.text = tok_str
28 | 
29 |     def __str__(self):
30 |         return self.text
31 | 
32 | def get_oie_frame(tokens, tags) -> str:
33 |     """
34 |     Converts a list of model outputs (i.e., a list of lists of bio tags, each
35 |     pertaining to a single word), returns an inline bracket representation of
36 |     the prediction.
37 |     """
38 |     frame = defaultdict(list)
39 |     chunk = []
40 |     words = [token.text for token in tokens]
41 | 
42 |     for (token, tag) in zip(words, tags):
43 |         if tag.startswith("I-") or tag.startswith("B-"):
44 |             frame[tag[2:]].append(token)
45 | 
46 |     return dict(frame)
47 | 
48 | 
49 | def get_frame_str(oie_frame) -> str:
50 |     """
51 |     Convert and oie frame dictionary to string.
52 |     """
53 |     dummy_dict = dict([(k if k != "V" else "ARG01", v)
54 |                        for (k, v) in oie_frame.items()])
55 | 
56 |     sorted_roles = sorted(dummy_dict)
57 | 
58 |     frame_str = []
59 |     for role in sorted_roles:
60 |         if role == "ARG01":
61 |             role = "V"
62 |         arg = " ".join(oie_frame[role])
63 |         frame_str.append(f"{role}:{arg}")
64 | 
65 |     return "\t".join(frame_str)
66 | 
67 | 
68 | def format_extractions(sent_tokens, sent_predictions):
69 |     """
70 |     Convert token-level raw predictions to clean extractions.
71 |     """
72 |     # Consolidate predictions
73 |     if not (len(set(map(len, sent_predictions))) == 1):
74 |         raise AssertionError
75 |     assert len(sent_tokens) == len(sent_predictions[0])
76 |     sent_str = " ".join(map(str, sent_tokens))
77 | 
78 |     pred_dict = consolidate_predictions(sent_predictions, sent_tokens)
79 | 
80 |     # Build and return output dictionary
81 |     results = []
82 |     all_tags = []
83 | 
84 |     for tags in pred_dict.values():
85 |         # Join multi-word predicates
86 |         tags = join_mwp(tags)
87 |         all_tags.append(tags)
88 | 
89 |         # Create description text
90 |         oie_frame = get_oie_frame(sent_tokens, tags)
91 | 
92 |         # Add a predicate prediction to outputs.
93 |         results.append("\t".join([sent_str, get_frame_str(oie_frame)]))
94 | 
95 |     return results, all_tags
96 | 


--------------------------------------------------------------------------------
/src/run_oie.py:
--------------------------------------------------------------------------------
  1 | """ Usage:
  2 |     <file-name> --in=INPUT_FILE --batch-size=BATCH-SIZE --out=OUTPUT_FILE [--cuda-device=CUDA_DEVICE] [--debug]
  3 | """
  4 | # External imports
  5 | import logging
  6 | from pprint import pprint
  7 | from pprint import pformat
  8 | from docopt import docopt
  9 | import json
 10 | import pdb
 11 | from tqdm import tqdm
 12 | from allennlp.pretrained import open_information_extraction_stanovsky_2018
 13 | from collections import defaultdict
 14 | from operator import itemgetter
 15 | import functools
 16 | import operator
 17 | 
 18 | # Local imports
 19 | from format_oie import format_extractions, Mock_token
 20 | #=-----
 21 | 
 22 | def chunks(l, n):
 23 |     """
 24 |     Yield successive n-sized chunks from l.
 25 |     """
 26 |     for i in range(0, len(l), n):
 27 |         yield l[i:i + n]
 28 | 
 29 | def create_instances(model, sent):
 30 |     """
 31 |     Convert a sentence into a list of instances.
 32 |     """
 33 |     sent_tokens = model._tokenizer.tokenize(sent)
 34 | 
 35 |     # Find all verbs in the input sentence
 36 |     pred_ids = [i for (i, t) in enumerate(sent_tokens)
 37 |                 if t.pos_ == "VERB" or t.pos_ == "AUX"]
 38 | 
 39 |     # Create instances
 40 |     instances = [{"sentence": sent_tokens,
 41 |                   "predicate_index": pred_id}
 42 |                  for pred_id in pred_ids]
 43 | 
 44 |     return instances
 45 | 
 46 | def get_confidence(model, tag_per_token, class_probs):
 47 |     """
 48 |     Get the confidence of a given model in a token list, using the class probabilities
 49 |     associated with this prediction.
 50 |     """
 51 |     token_indexes = [model._model.vocab.get_token_index(tag, namespace = "labels") for tag in tag_per_token]
 52 | 
 53 |     # Get probability per tag
 54 |     probs = [class_prob[token_index] for token_index, class_prob in zip(token_indexes, class_probs)]
 55 | 
 56 |     # Combine (product)
 57 |     prod_prob = functools.reduce(operator.mul, probs)
 58 | 
 59 |     return prod_prob
 60 | 
 61 | def run_oie(lines, batch_size=64, cuda_device=-1, debug=False):
 62 |     """
 63 |     Run the OIE model and process the output.
 64 |     """
 65 | 
 66 |     if debug:
 67 |         logging.basicConfig(level = logging.DEBUG)
 68 |     else:
 69 |         logging.basicConfig(level = logging.INFO)
 70 | 
 71 |     # Init OIE
 72 |     model = open_information_extraction_stanovsky_2018()
 73 | 
 74 |     # Move model to gpu, if requested
 75 |     if cuda_device >= 0:
 76 |         model._model.cuda(cuda_device)
 77 | 
 78 |     # process sentences
 79 |     logging.info("Processing sentences")
 80 |     oie_lines = []
 81 |     for chunk in tqdm(chunks(lines, batch_size)):
 82 |         oie_inputs = []
 83 |         for sent in chunk:
 84 |             oie_inputs.extend(create_instances(model, sent))
 85 |         if not oie_inputs:
 86 |             # No predicates in this sentence
 87 |             continue
 88 | 
 89 |         # Run oie on sents
 90 |         sent_preds = model.predict_batch_json(oie_inputs)
 91 | 
 92 |         # Collect outputs in batches
 93 |         predictions_by_sent = defaultdict(list)
 94 |         for outputs in sent_preds:
 95 |             sent_tokens = outputs["words"]
 96 |             tags = outputs["tags"]
 97 |             sent_str = " ".join(sent_tokens)
 98 |             assert(len(sent_tokens) == len(tags))
 99 |             predictions_by_sent[sent_str].append((outputs["tags"], outputs["class_probabilities"]))
100 | 
101 |         # Create extractions by sentence
102 |         for sent_tokens, predictions_for_sent in predictions_by_sent.items():
103 |             raw_tags = list(map(itemgetter(0), predictions_for_sent))
104 |             class_probs = list(map(itemgetter(1), predictions_for_sent))
105 | 
106 |             # Compute confidence per extraction
107 |             confs = [get_confidence(model, tag_per_token, class_prob)
108 |                      for tag_per_token, class_prob in zip(raw_tags, class_probs)]
109 | 
110 |             extractions, tags = format_extractions([Mock_token(tok) for tok in sent_tokens.split(" ")], raw_tags)
111 | 
112 |             oie_lines.extend([extraction + f"\t{conf}" for extraction, conf in zip(extractions, confs)])
113 |     logging.info("DONE")
114 |     return oie_lines
115 | 
116 | if __name__ == "__main__":
117 |     # Parse command line arguments
118 |     args = docopt(__doc__)
119 |     inp_fn = args["--in"]
120 |     batch_size = int(args["--batch-size"])
121 |     out_fn = args["--out"]
122 |     cuda_device = int(args["--cuda-device"]) if (args["--cuda-device"] is not None) \
123 |                   else -1
124 |     debug = args["--debug"]
125 | 
126 |     lines = [line.strip()
127 |             for line in open(inp_fn, encoding = "utf8")]
128 | 
129 |     oie_lines = run_oie(lines, batch_size, cuda_device, debug)
130 | 
131 |     # Write to file
132 |     logging.info(f"Writing output to {out_fn}")
133 |     with open(out_fn, "w", encoding = "utf8") as fout:
134 |         fout.write("\n".join(oie_lines))
135 | 


--------------------------------------------------------------------------------