├── scripts
│   ├── __init__.py
│   ├── search
│   │   ├── __init__.py
│   │   ├── test_download_search_examples.py
│   │   ├── ner_lists
│   │   │   ├── statesandprovinces
│   │   │   └── countries
│   │   ├── download_patterns_config.py
│   │   └── download_search_examples.py
│   ├── check_num_of_examples.py
│   ├── filter_generations
│   │   ├── filter_by_triggers.py
│   │   └── filter_by_entities.py
│   ├── relations_ratio.py
│   ├── README.md
│   └── generation_preprocess
│       ├── create_tacred_datafiles.py
│       └── relation_canonical_form.py
├── classification
│   ├── __init__.py
│   ├── stubs
│   │   ├── docred
│   │   │   ├── fake_preds1.json
│   │   │   ├── fake_preds0.json
│   │   │   ├── fake_preds2.json
│   │   │   ├── fake_preds3.json
│   │   │   ├── fake_preds4.json
│   │   │   ├── fake_preds5.json
│   │   │   └── fake_truth.json
│   │   └── tacred
│   │       └── fake_truth.json
│   ├── re_config.py
│   ├── split_train_pareto.py
│   ├── test_tacred.py
│   ├── evaluation
│   │   ├── test_docred_evaluation.py
│   │   ├── tacred_evaluation.py
│   │   └── docred_evaluation.py
│   ├── tacred_config.py
│   ├── tacred.py
│   ├── test_docred.py
│   └── docred.py
├── requirements.txt
├── tacred_generation.sh
├── .gitignore
├── generation_outputs
│   ├── annotate_like_search.py
│   ├── prepare_entities_files.py
│   ├── origin
│   │   ├── first_100_object_is_country.txt
│   │   ├── first_100_object_is_country_new_ents.txt
│   │   ├── first_100_object_is_nationality_new_ents.txt
│   │   └── first_100_object_is_nationality.txt
│   ├── convert_s_o_to_es.py
│   ├── switch_entities_of_gens.py
│   └── children
│       ├── first_100_new_wraps_new_ents.txt
│       └── first_100_new_wraps.txt
├── README.md
├── models
│   └── mtb.py
├── run_classification.sh
└── run_generation.py
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/classification/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scripts/search/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_preds1.json:
--------------------------------------------------------------------------------
1 | [{"title": "doc1", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}]
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_preds0.json:
--------------------------------------------------------------------------------
1 | [{"title": "doc3", "r": "P26", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc3", "r": "P26", "h_idx": 1, "t_idx": 0, "c": 1.0}]
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_preds2.json:
--------------------------------------------------------------------------------
1 | [{"title": "doc1", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc4", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}]
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_preds3.json:
--------------------------------------------------------------------------------
1 | [{"title": "doc3", "r": "P26", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc3", "r": "P26", "h_idx": 1, "t_idx": 0, "c": 1.0}]
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_preds4.json:
--------------------------------------------------------------------------------
"doc1", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc3", "r": "P26", "h_idx": 0, "t_idx": 1, "c": 1.0}] -------------------------------------------------------------------------------- /classification/stubs/docred/fake_preds5.json: -------------------------------------------------------------------------------- 1 | [{"title": "doc1", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc4", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc4", "r": "P112", "h_idx": 0, "t_idx": 2, "c": 0.8}] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | torch>=1.0.0 3 | # progress bars in model download and training scripts 4 | tqdm 5 | # Accessing files from S3 directly. 6 | boto3 7 | # Used for downloading models over HTTP 8 | requests 9 | # For OpenAI GPT 10 | regex 11 | # For XLNet 12 | sentencepiece 13 | # For XLM 14 | sacremoses 15 | tensorboardX 16 | scikit-learn 17 | pytest -------------------------------------------------------------------------------- /tacred_generation.sh: -------------------------------------------------------------------------------- 1 | source activate hugging_face 2 | 3 | num_samples=10 4 | 5 | while getopts m:o:s:p:t: option 6 | do 7 | case "${option}" 8 | in 9 | m) model_dir=${OPTARG};; 10 | o) out_file=${OPTARG};; 11 | s) num_samples=${OPTARG};; 12 | t) prompt=${OPTARG};; 13 | p) p=${OPTARG};; 14 | esac 15 | done 16 | 17 | python run_generation.py \ 18 | --model_type=gpt2 \ 19 | --model_name_or_path=$model_dir \ 20 | --out_file=$out_file \ 21 | --num_return_sequences=$num_samples \ 22 | --prompt="$prompt" \ 23 | --length=50 \ 24 | --p=$p \ 25 | # --k=5 \ -------------------------------------------------------------------------------- /scripts/check_num_of_examples.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | import torch 6 | 7 | def main(file_dir, write_to): 8 | out = {} 9 | for file in os.listdir(file_dir): 10 | if file.startswith("cached"): 11 | examples = torch.load(os.path.join(file_dir, file)) 12 | pos = len([e for e in examples if e.label == 1]) 13 | neg = len([e for e in examples if e.label == 0]) 14 | out[file[7:file.index('_roberta')]] = {"num_pos": pos, "num_neg": neg} 15 | 16 | json.dump(out, open(write_to, 'w')) 17 | 18 | if __name__ == "__main__": 19 | main(sys.argv[1], sys.argv[2]) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .DS_Store 3 | */__pycache__/ 4 | *.pyc 5 | 6 | data/ 7 | runs/ 8 | old/ 9 | !scripts/README.md 10 | !scripts/filter_generations/filter_by_entities.py 11 | !scripts/filter_generations/filter_by_entities.py 12 | !scripts/generation_preprocess/create_tacred_datafiles.py 13 | !scripts/generation_preprocess/relation_canonical_form.py 14 | !scripts/__init__.py 15 | !scripts/search/__init__.py 16 | !scripts/seach/download_patterns_config.py 17 | !scripts/seach/download_search_examples.py 18 | !scripts/seach/patterns_from_generation.py 19 | scripts/ 20 | 21 | classification_outputs/ 22 | log* 23 | 24 | scripts/search_results/ 25 | -------------------------------------------------------------------------------- /classification/re_config.py: -------------------------------------------------------------------------------- 1 | 
1 | from typing import Any, Callable, Dict, Iterator, List, Tuple, Type, TypeVar
2 | from typing_extensions import Literal, TypedDict
3 | 
4 | START_E1 = '[E1]'
5 | END_E1 = '[/E1]'
6 | START_E2 = '[E2]'
7 | END_E2 = '[/E2]'
8 | 
9 | SPECIAL_TOKENS = [START_E1, END_E1, START_E2, END_E2]
10 | 
11 | RELATIONS_ENTITY_TYPES_FOR_SEARCH = {
12 |     "per:children": "PERSON:PERSON",
13 |     "org:founded_by": "ORGANIZATION:PERSON",
14 |     "org:country_of_headquarters": "ORGANIZATION:LOCATION",
15 |     "per:religion": "PERSON:MISC",
16 |     "per:spouse": "PERSON:PERSON",
17 |     "per:origin": "PERSON:MISC",
18 |     "per:date_of_death": "PERSON:DATE",
19 |     "per:city_of_death": "PERSON:LOCATION",
20 | }
--------------------------------------------------------------------------------
/classification/split_train_pareto.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import json
3 | import os
4 | from random import shuffle
5 | 
6 | data_dir = "data/DocRED/"
7 | data_file = "train_annotated.json"
8 | with open(os.path.join(data_dir, data_file), 'r') as f:
9 |     data = json.load(f)
10 | 
11 | shuffle(data)
12 | 
13 | bar = int(len(data)*0.8)
14 | train_split, eval_split = data[:bar], data[bar:]
15 | 
16 | assert len(train_split) + len(eval_split) == len(data)
17 | 
18 | with open(os.path.join(data_dir, 'train_split_from_annotated.json'), 'w') as outfile:
19 |     json.dump(train_split, outfile)
20 | 
21 | with open(os.path.join(data_dir, 'eval_split_from_annotated.json'), 'w') as outfile:
22 |     json.dump(eval_split, outfile)
--------------------------------------------------------------------------------
/generation_outputs/annotate_like_search.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import requests
4 | from tqdm import tqdm
5 | 
6 | def main(args):
7 |     headers = {'Content-Type': 'application/json'}
8 | 
9 |     with open(args.file_to_annotate, 'r') as f:
10 |         texts = f.readlines()
11 | 
12 |     with open(args.file_to_annotate.split('.txt')[0]+'_good_tokenization.txt', 'w') as outfile:
13 |         for i, text in tqdm(enumerate(texts)):
14 |             payload = {'text': text}
15 |             response = requests.post("http://localhost:9090/annotate-text", json=payload, headers=headers)
16 |             content = json.loads(response.content)
17 |             sentences = content['sentences']
18 |             out = ''
19 |             for sent in sentences:
20 |                 out += ' '.join(sent['words'])
21 |             out = out.replace('-LSB- ', '[')
22 |             out = out.replace(' -RSB-', ']')
23 |             outfile.write(out+'\n')
24 | 
25 | if __name__ == "__main__":
26 |     parser = argparse.ArgumentParser()
27 |     parser.add_argument("--file_to_annotate", type=str, required=True)
28 |     args = parser.parse_args()
29 |     main(args)
--------------------------------------------------------------------------------
/scripts/filter_generations/filter_by_triggers.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | 
4 | from old.utils import read_file, write_to_file
5 | 
6 | def filter_out(sentences, triggers):
7 |     filtered_sentences = []
8 |     for sent in sentences:
9 |         for trigger in triggers:
10 |             if trigger in sent:
11 |                 filtered_sentences.append(sent)
12 |                 break
13 | 
14 |     return filtered_sentences
15 | 
16 | def main():
17 |     parser = argparse.ArgumentParser()
18 | 
19 |     ## Required parameters
20 |     parser.add_argument("--model_folder", default=None, type=str, required=True,
21 |                         help="This is the working directory, where we will find generation_file and \
22 |                         where we will output the filtered file")
23 |     parser.add_argument("--generation_file", default=None, type=str, required=True,
24 |                         help="The output file of the generation script")
25 |     parser.add_argument("--trigger_list_path", default=None, type=str, required=True,
26 |                         help="Path of the list of triggers corresponding to a relation")
27 | 
28 |     args = parser.parse_args()
29 | 
30 |     sentences = read_file(os.path.join(args.model_folder, args.generation_file))
31 |     triggers = read_file(args.trigger_list_path, remove_duplicates=True)
32 |     filtered_sentences = filter_out(sentences, triggers)
33 |     write_to_file(filtered_sentences, args.model_folder, args.generation_file, 'filtered_triggers_')
34 | 
35 | if __name__ == "__main__":
36 |     main()
37 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Bootstrapping Relation Extractors
2 | 
3 | Implementation of "Bootstrapping Relation Extractors using Syntactic Search by Examples".
4 | 
5 | ## Classification
6 | 
7 | ### Classification and Evaluation
8 | 
9 | You can find how to run the classification and evaluation script in `run_classification.sh`.
10 | 
11 | ##### CMD:
12 | ```
13 | bash run_classification.sh
14 | ```
15 | 
16 | An example experiment config for the `generation` training method:
17 | ```
18 | {"task": ["tacred"], "training_method": ["generation"], "relation_name": ["org:founded_by"], "num_positive_examples": [100], "ratio_negative_examples": [10], "seed": [1,2,3], "logging_steps": [100]}
19 | ```
20 | ## Generation
21 | Here I'm mostly using modified versions of Hugging Face's `transformers` scripts.
22 | 
23 | ### Preprocessing
24 | 
25 | To create the training examples, run
26 | ```
27 | python scripts/generation_preprocess/create_tacred_datafiles.py --file_path ../datasets/tacred/data/json/train.json --save_to_file data/tacred/for_generation/train --src_and_tgt_one_file_with_go
28 | ```
29 | 
30 | ### Finetune
31 | 
32 | You should finetune on your dataset using `run_lm_finetuning.py` or an easy-to-use bash script similar to the one used for TACRED, `tacred_generation.sh`. That file is also an example of the arguments you should pass to `run_lm_finetuning.py`.
33 | 
34 | ### Generation
35 | 
36 | After finetuning, pass the model alongside different hyperparameters to `run_generation.py`. It should also receive a sentence in the prompt, like the following: `William married Kate Middleton. <|GO|>`. Again, you can find an example of the arguments in the corresponding bash script, `tacred_generation.sh`; a usage sketch is shown below.
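A minimal usage sketch of this generation step, going through the `tacred_generation.sh` wrapper. The model directory and output file below are placeholder paths, not artifacts shipped with this repo:

```
# -m: finetuned GPT-2 directory (placeholder), -o: output file (placeholder),
# -s: number of returned sequences, -p: nucleus-sampling p,
# -t: the prompt sentence, ending with the <|GO|> marker
bash tacred_generation.sh \
    -m models/gpt2_finetuned_on_tacred \
    -o generation_outputs/generated_samples.txt \
    -s 100 \
    -p 0.9 \
    -t "William married Kate Middleton. <|GO|>"
```

These flags map to the `--model_name_or_path`, `--out_file`, `--num_return_sequences`, `--p` and `--prompt` arguments of `run_generation.py`, as wired up in `tacred_generation.sh`.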
37 | -------------------------------------------------------------------------------- /classification/test_tacred.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pytest 3 | 4 | from classification.tacred import TACREDProcessor, TACREDExample 5 | 6 | data_dir = 'data/tacred' 7 | 8 | with open('classification/stubs/tacred/fake_truth.json', "r", encoding="utf-8") as f: 9 | fakes = list(json.load(f)) 10 | 11 | class TestTACREDProcessor: 12 | def test_get_train_examples(self): 13 | processor = TACREDProcessor('org:founded_by', 5, 2) 14 | examples = processor.get_examples_by_set_type('train', data_dir) 15 | assert len(examples) == 5 + 5 * 2 16 | assert len([e for e in examples if e.label == "org:founded_by"]) == 5 17 | assert len([e for e in examples if e.label != "org:founded_by"]) == 10 18 | 19 | def test_get_dev_examples(self): 20 | processor = TACREDProcessor('org:founded_by', 5, 2) 21 | examples = processor.get_examples_by_set_type('full_test_eval', data_dir) 22 | assert len([e for e in examples if e.label == "org:founded_by"]) == 68 23 | assert len([e for e in examples if e.label != "org:founded_by"]) == 1190 24 | assert len(examples) == 68 + 1190 25 | 26 | def test_get_search_examples(self): 27 | processor = TACREDProcessor('per:children', 1000, 10) 28 | processor.get_examples_by_set_type('search', data_dir) 29 | assert True 30 | 31 | class TestTACREDExample: 32 | def test_init(self): 33 | fake = fakes[0] 34 | example = TACREDExample(0, fake, "org:founded_by") 35 | assert example.id == 0 36 | assert example.label == "org:founded_by" 37 | assert example.text.startswith("[E2] Tom Thabane [/E2] resigned in October last year to form the [E1] All Basotho Convention [/E1]") -------------------------------------------------------------------------------- /scripts/search/test_download_search_examples.py: -------------------------------------------------------------------------------- 1 | from scripts.search.download_search_examples import seperate_entities, SearchSortedListMonotonicIncreasingVal 2 | 3 | def populate_data(values): 4 | return {'sentence_id': '1', 5 | 'e1_first_index': values[0], 6 | 'e1_last_index': values[1], 7 | 'e2_first_index': values[2], 8 | 'e2_last_index': values[3]} 9 | 10 | def test_seperate_entities_all_e1_before_e2(): 11 | data = populate_data([1, 3, 5, 6]) 12 | assert seperate_entities(data) 13 | 14 | def test_seperate_entities_all_e2_before_e1(): 15 | data = populate_data([10, 11, 5, 6]) 16 | assert seperate_entities(data) 17 | 18 | def test_seperate_entities_some_e1_before_e2_some_not(): 19 | data = populate_data([1, 3, 3, 6]) 20 | assert not seperate_entities(data) 21 | 22 | data = populate_data([1, 4, 3, 6]) 23 | assert not seperate_entities(data) 24 | 25 | def test_seperate_entities_some_e2_before_e1_some_not(): 26 | data = populate_data([3, 6, 1, 3]) 27 | assert not seperate_entities(data) 28 | 29 | data = populate_data([3, 6, 1, 4]) 30 | assert not seperate_entities(data) 31 | 32 | def test_seperate_entities_e1_equal_to_e2(): 33 | data = populate_data([1, 3, 1, 3]) 34 | assert not seperate_entities(data) 35 | 36 | def test_seperate_entities_e1_before_and_after_e2(): 37 | data = populate_data([1, 6, 5, 6]) 38 | assert not seperate_entities(data) 39 | 40 | data = populate_data([1, 300, 5, 6]) 41 | assert not seperate_entities(data) 42 | 43 | def test_seperate_entities_e2_before_and_after_e1(): 44 | data = populate_data([5, 6, 1, 6]) 45 | assert not seperate_entities(data) 46 | 47 | data = 
populate_data([5, 6, 1, 300]) 48 | assert not seperate_entities(data) 49 | -------------------------------------------------------------------------------- /scripts/relations_ratio.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | from classification.tacred import TACREDProcessor 3 | from classification.docred import DocREDProcessor 4 | 5 | dataset = 'TACRED' 6 | 7 | if dataset == 'TACRED': 8 | relation_names = ["per:children", "org:founded_by", "org:country_of_headquarters", "per:religion", "per:spouse", "per:origin", "per:date_of_death", "per:city_of_death"] 9 | for relation_name in relation_names: 10 | num_positive = 100000000 11 | negative_ratio = 100000000 12 | type_independent_neg_sample = False 13 | processor = TACREDProcessor(relation_name, num_positive, negative_ratio, type_independent_neg_sample) 14 | examples = processor.get_examples_by_set_type('full_dev_eval', 'data/tacred') 15 | positives = len([e for e in examples if e.label == 1]) 16 | negatives = len([e for e in examples if e.label == 0]) 17 | assert positives + negatives == len(examples) 18 | print(f"{relation_name}: {ceil(negatives / positives)}") 19 | elif dataset == 'DocRED': 20 | relation_names = ["child", "date_of_death", "founded_by", "religion", "spouse", "country_of_origin", "headquarters_location", "place_of_death"] 21 | for relation_name in relation_names: 22 | num_positive = 100000000 23 | negative_ratio = 100000000 24 | type_independent_neg_sample = False 25 | processor = DocREDProcessor(relation_name, num_positive, negative_ratio, type_independent_neg_sample) 26 | examples = processor.get_examples_by_set_type('full_dev_eval', 'data/DocRED') 27 | positives = len([e for e in examples if e.label == 1]) 28 | negatives = len([e for e in examples if e.label == 0]) 29 | assert positives + negatives == len(examples) 30 | print(f"{relation_name}: {ceil(negatives / positives)}") 31 | else: 32 | print("Wrong dataset name") -------------------------------------------------------------------------------- /generation_outputs/prepare_entities_files.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import Counter 3 | import csv 4 | import os 5 | from tqdm import tqdm 6 | 7 | def main(args): 8 | entities = Counter() 9 | 10 | if args.relation == 'city': 11 | countries_and_states = read_entities_list(True, True) 12 | 13 | with open(args.file_with_entities, 'r') as f: 14 | reader = csv.reader(f, delimiter='\t') 15 | headers = next(reader) 16 | e_index = headers.index(args.entity_id) 17 | for x in tqdm(reader): 18 | entity = x[e_index] 19 | if args.relation == 'city': 20 | if entity in countries_and_states: 21 | continue 22 | entities[entity] += 1 23 | 24 | with open(f'generation_outputs/types/{args.relation}.txt', 'w') as f: 25 | # for e in entities.most_common(100): 26 | for e in entities: 27 | f.write(f"{e}\n") 28 | 29 | 30 | def read_entities_list(countries, states): 31 | COUNTRIES_AND_STATES_LOCATION = 'scripts/search/ner_lists' 32 | ret = set() 33 | if countries: 34 | with open(os.path.join(COUNTRIES_AND_STATES_LOCATION, 'countries'), 'r') as f: 35 | reader = csv.reader(f, delimiter='\t') 36 | for x in reader: 37 | ret.add(x[1]) 38 | 39 | if states: 40 | with open(os.path.join(COUNTRIES_AND_STATES_LOCATION, 'statesandprovinces'), 'r') as f: 41 | states = f.readlines() 42 | for s in states: 43 | ret.add(s.rstrip()) 44 | 45 | return ret 46 | 47 | if __name__ == "__main__": 48 | parser = 
argparse.ArgumentParser()
49 |     parser.add_argument("--file_with_entities", type=str, required=True)
50 |     parser.add_argument("--relation", type=str, required=True)
51 |     parser.add_argument("--entity_id", type=str, required=True, choices=['e1', 'e2'])
52 |     args = parser.parse_args()
53 |     main(args)
--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | # Scripts
2 | 
3 | ## search/download_search_examples.py
4 | 
5 | This script downloads from SPIKE all the sentences that share the same syntactic patterns as the patterns you defined at the top of the file. This is done by passing the `--download` flag.
6 | 
7 | After downloading, you need to merge all of these files into two single files, one for positive examples and another for negatives. This can be done by running with `--merge_patterns`.
8 | 
9 | So if I want to download and merge all files for all patterns of all relations, I just run:
10 | 
11 | ```
12 | python -m scripts.search.download_search_examples --merge_patterns --triggers single --dataset tacred
13 | ```
14 | 
15 | `--triggers single` means I'm using the triggers of only a single relation for each pattern. You can also pass `all`.
16 | `--dataset tacred` saves the output in the `data/tacred` directory.
17 | 
18 | ## search/patterns_from_generation.py
19 | 
20 | This is a script that, given a file of annotated generations for a specific relation, finds the syntactic rule of each generation and downloads a sample of the examples matching that pattern.
21 | 
22 | ```
23 | python -m scripts.search.patterns_from_generation --generation_file generation_outputs/with_triggers_for_search_using_generation/per:children.txt --relation per:children --dataset tacred --download_explanations --download_examples --merge_patterns
24 | ```
25 | 
26 | It first finds the explanations (syntactic rules) for each generation (using the `--download_explanations` flag), then downloads the sample of corresponding examples (using the `--download_examples` flag), and then merges the downloads, similarly to `download_search_examples.py`.
27 | 
28 | You can also evaluate the examples you downloaded with the `--evaluate` flag:
29 | 
30 | ```
31 | python -m scripts.search.patterns_from_generation --generation_file generation_outputs/finished_files/with_triggers_for_search_using_generation/per:children.txt --relation per:children --dataset tacred --evaluate
32 | ```
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_truth.json:
--------------------------------------------------------------------------------
1 | [{
2 |     "vertexSet":
3 |     [
4 |         [{"name": "Microsoft", "pos": [0, 1], "sent_id": 0, "type": "ORG"}, {"name": "MS", "pos": [3, 4], "sent_id": 0, "type": "ORG"}, {"name": "Micro", "pos": [0, 1], "sent_id": 1, "type": "ORG"}],
5 |         [{"name": "PA", "pos": [4, 5], "sent_id": 0, "type": "PER"}, {"name": "Paul", "pos": [4, 5], "sent_id": 1, "type": "PER"}]
6 |     ],
7 |     "labels":
8 |     [
9 |         {"r": "P112", "h": 0, "t": 1, "evidence": [1]}
10 |     ],
11 |     "title": "doc1",
12 |     "sents":
13 |     [
14 |         ["Microsoft", "aka", "MS", ".", "PA", "is", "the", "owner", "."], ["Micro", "was", "founded", "by", "Paul"]
15 |     ]
16 | },
17 | {
18 |     "vertexSet": [
19 |         [{"name": "John", "pos": [0, 1], "sent_id": 0, "type": "PER"}],
20 |         [{"name": "Jane", "pos": [2, 3], "sent_id": 0, "type": "PER"}]
21 |     ],
22 |     "labels": [
23 |         {"r": "P22", "h": 0, "t": 1, "evidence": [0]}
24 |     ],
25 |     "title": "doc2",
"doc2", 26 | "sents": [["John", "is", "Jane", "'s", "father"]] 27 | }, 28 | { 29 | "vertexSet": [ 30 | [{"name": "John", "pos": [0, 1], "sent_id": 0, "type": "PER"}, {"name": "John", "pos": [0, 1], "sent_id": 1, "type": "PER"}], 31 | [{"name": "Mary", "pos": [2, 3], "sent_id": 0, "type": "PER"}, {"name": "Mary", "pos": [2, 3], "sent_id": 1, "type": "PER"}] 32 | ], 33 | "labels": 34 | [ 35 | {"r": "P26", "h": 0, "t": 1, "evidence": [0, 1]}, 36 | {"r": "P26", "h": 1, "t": 0, "evidence": [0, 1]} 37 | ], 38 | "title": "doc3", 39 | "sents": [["John", "married", "Mary"], ["John", "is", "Mary", "'s", "husband"]] 40 | }, 41 | { 42 | "vertexSet": [ 43 | [{"name": "Microsoft", "pos": [2, 3], "sent_id": 0, "type": "ORG"}], 44 | [{"name": "Paul", "pos": [0, 1], "sent_id": 0, "type": "PER"}] 45 | ], 46 | "labels": [ 47 | {"r": "P112", "h": 0, "t": 1, "evidence": [0]}, 48 | {"r": "P488", "h": 0, "t": 1, "evidence": [0]} 49 | ], 50 | "title": "doc4", 51 | "sents": [["Paul", "founded", "Microsoft"]] 52 | }] -------------------------------------------------------------------------------- /scripts/filter_generations/filter_by_entities.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from tqdm import tqdm 4 | 5 | from old.utils import read_file, write_to_file 6 | from spike.annotators.annotator_service import AnnotatorService 7 | 8 | #Will probably need to add spike to the pythonpath 9 | # and `source activate spike` 10 | 11 | def filter_out(sentences, e1_entities, e2_entities): 12 | annotator = AnnotatorService.from_env() 13 | filtered_sentences = [] 14 | for sent in tqdm(sentences): 15 | annotated = annotator.annotate_text(sent) 16 | featuring_entities = [e.label.lower() for e in annotated.sentences[0].entities] 17 | found_e1, found_e2 = False, False 18 | for e in e1_entities: 19 | e = e.lower() 20 | if e in featuring_entities: 21 | featuring_entities.remove(e) 22 | found_e1 = True 23 | break 24 | for e in e2_entities: 25 | e = e.lower() 26 | if e in featuring_entities: 27 | featuring_entities.remove(e) 28 | found_e2 = True 29 | break 30 | if found_e1 and found_e2: 31 | filtered_sentences.append(sent) 32 | 33 | return filtered_sentences 34 | 35 | def main(): 36 | parser = argparse.ArgumentParser() 37 | 38 | ## Required parameters 39 | parser.add_argument("--model_folder", default=None, type=str, required=True, 40 | help="This is the working director, where we will find generation_file and \ 41 | where we will output the filtered out file") 42 | parser.add_argument("--generation_file", default=None, type=str, required=True, 43 | help="The generation output script file") 44 | parser.add_argument('--e1_entities', nargs='+', type=str, required=True, 45 | help="The e1_entities to look for") 46 | parser.add_argument('--e2_entities', nargs='+', type=str, required=True, 47 | help="The e1_entities to look for") 48 | 49 | args = parser.parse_args() 50 | 51 | sentences = read_file(os.path.join(args.model_folder, args.generation_file)) 52 | filtered_sentences = filter_out(sentences, args.e1_entities, args.e2_entities) 53 | write_to_file(filtered_sentences, args.model_folder, args.generation_file, 'filtered_ents_') 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /scripts/search/ner_lists/statesandprovinces: -------------------------------------------------------------------------------- 1 | Alabama 2 | Alaska 3 | American Samoa 4 | Arizona 5 | Arkansas 6 | 
California 7 | Colorado 8 | Connecticut 9 | Delaware 10 | District of Columbia 11 | Florida 12 | Georgia 13 | Guam 14 | Hawaii 15 | Idaho 16 | Illinois 17 | Indiana 18 | Iowa 19 | Kansas 20 | Kentucky 21 | Louisiana 22 | Maine 23 | Maryland 24 | Massachusetts 25 | Michigan 26 | Minnesota 27 | Mississippi 28 | Missouri 29 | Montana 30 | Nebraska 31 | Nevada 32 | New Hampshire 33 | New Jersey 34 | New Mexico 35 | New York 36 | North Carolina 37 | North Dakota 38 | Northern Marianas Islands 39 | Ohio 40 | Oklahoma 41 | Oregon 42 | Pennsylvania 43 | Puerto Rico 44 | Rhode Island 45 | South Carolina 46 | South Dakota 47 | Tennessee 48 | Texas 49 | Utah 50 | Vermont 51 | Virginia 52 | Virgin Islands 53 | Washington 54 | West Virginia 55 | Wisconsin 56 | Wyoming 57 | British Columbia 58 | Alberta 59 | Saskatchewan 60 | Manitoba 61 | Ontario 62 | Quebec 63 | New Brunswick 64 | Nova Scotia 65 | Prince Edward Island 66 | Newfoundland 67 | Nunavut 68 | Northwest Territories 69 | Yukon 70 | AL 71 | AK 72 | AS 73 | AZ 74 | AR 75 | CA 76 | CO 77 | CT 78 | DE 79 | DC 80 | FL 81 | GA 82 | GU 83 | HI 84 | ID 85 | IL 86 | IN 87 | IA 88 | KS 89 | KY 90 | LA 91 | ME 92 | MD 93 | MA 94 | MI 95 | MN 96 | MS 97 | MO 98 | MT 99 | NE 100 | NV 101 | NH 102 | NJ 103 | NM 104 | NY 105 | NC 106 | ND 107 | OH 108 | OK 109 | OR 110 | PA 111 | PR 112 | RI 113 | SC 114 | SD 115 | TN 116 | TX 117 | UT 118 | VT 119 | VI 120 | VA 121 | WA 122 | WV 123 | WI 124 | WY 125 | Ala. 126 | Alaska 127 | Ariz. 128 | Ark. 129 | Calif. 130 | Colo. 131 | Conn. 132 | Del. 133 | Columbia 134 | Fla. 135 | Ga. 136 | Hawaii 137 | Idaho 138 | Ill. 139 | Ind. 140 | Iowa 141 | Kans. 142 | Ky. 143 | La. 144 | Maine 145 | Md. 146 | Mass. 147 | Mich. 148 | Minn. 149 | Miss. 150 | Mo. 151 | Mont. 152 | Neb. 153 | Nev. 154 | N.H. 155 | N.J. 156 | N.M. 157 | N.Y. 158 | N.C. 159 | N.D. 160 | Ohio 161 | Okla. 162 | Ore. 163 | Pa. 164 | R.I. 165 | S.C. 166 | S.D. 167 | Tenn. 168 | Tex. 169 | Utah 170 | Vt. 171 | Va. 172 | Wash. 173 | W.V. 174 | Wis. 175 | Wyo. 176 | AB 177 | BC 178 | MB 179 | NB 180 | NL 181 | NT 182 | NS 183 | NU 184 | ON 185 | PE 186 | QC 187 | SK 188 | YT 189 | Alta. 190 | B.C. 191 | Man. 192 | N.B. 193 | N.F. 194 | N.W.T. 195 | N.S. 196 | Nunavut 197 | Ont. 198 | P.E.I. 199 | P.Q. 200 | Qué. 201 | Sask. 202 | Yuk. 203 | Y.T. 
204 | -------------------------------------------------------------------------------- /classification/evaluation/test_docred_evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pytest 3 | 4 | from docred_evaluation import main as evaluation_main 5 | 6 | def create_args(pred_file, relation_name): 7 | args = argparse.Namespace(gold_dir="classification/stubs/docred", 8 | gold_file="fake_truth.json", 9 | relation_name=relation_name, 10 | pred_file=pred_file, 11 | confidence_threshold=0, 12 | output_file="", 13 | ignore_train=False) 14 | return args 15 | 16 | def test_zero(): 17 | args = create_args("classification/stubs/docred/fake_preds0.json", "founded_by") 18 | scores = evaluation_main(args) 19 | assert scores['precision'] == 0.0 20 | assert scores['recall'] == 0.0 21 | assert scores['F1'] == 0.0 22 | 23 | def test_half(): 24 | args = create_args("classification/stubs/docred/fake_preds1.json", "founded_by") 25 | scores = evaluation_main(args) 26 | assert scores['precision'] == 1.0 27 | assert scores['recall'] == 0.5 28 | assert scores['F1'] == 2/3 29 | 30 | def test_full(): 31 | args = create_args("classification/stubs/docred/fake_preds2.json", "founded_by") 32 | scores = evaluation_main(args) 33 | assert scores['precision'] == 1.0 34 | assert scores['recall'] == 1.0 35 | assert scores['F1'] == 1.0 36 | 37 | def test_full_with_diff_evidences(): 38 | args = create_args("classification/stubs/docred/fake_preds3.json", "spouse") 39 | scores = evaluation_main(args) 40 | assert scores['precision'] == 1.0 41 | assert scores['recall'] == 1.0 42 | assert scores['F1'] == 1.0 43 | 44 | def test_two_different_relations(): 45 | args = create_args("classification/stubs/docred/fake_preds4.json", "spouse") 46 | with pytest.raises(ValueError): 47 | evaluation_main(args) 48 | 49 | args = create_args("classification/stubs/docred/fake_preds4.json", "founded_by") 50 | with pytest.raises(ValueError): 51 | evaluation_main(args) 52 | 53 | def test_confidence_works(): 54 | args = create_args("classification/stubs/docred/fake_preds5.json", "founded_by") 55 | scores = evaluation_main(args) 56 | assert scores['precision'] == 2/3 57 | assert scores['recall'] == 1.0 58 | assert scores['F1'] == 0.8 59 | 60 | assert scores['best_precision'] == 1.0 61 | assert scores['best_recall'] == 1.0 62 | assert scores['best_F1'] == 1.0 63 | assert scores['best_confidence'] == 1.0 -------------------------------------------------------------------------------- /models/mtb.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import BCEWithLogitsLoss 4 | 5 | from transformers.configuration_roberta import RobertaConfig 6 | from transformers.modeling_bert import BertPreTrainedModel 7 | from transformers.modeling_roberta import RobertaModel, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 8 | 9 | class RobertaForRelationClassification(BertPreTrainedModel): 10 | """ 11 | This class is similar to RobertaForSequenceClassification only we are using our own classifier 12 | """ 13 | config_class = RobertaConfig 14 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 15 | base_model_prefix = "roberta" 16 | 17 | def __init__(self, config): 18 | super().__init__(config) 19 | self.num_labels = config.num_labels 20 | 21 | self.roberta = RobertaModel(config) 22 | self.classifier = MTBClassificationHead(config) 23 | 24 | def forward( 25 | self, 26 | input_ids=None, 27 | 
input_ids=None,
27 |         attention_mask=None,
28 |         token_type_ids=None,
29 |         position_ids=None,
30 |         head_mask=None,
31 |         inputs_embeds=None,
32 |         labels=None,
33 |         markers_mask=None
34 |     ):
35 |         outputs = self.roberta(
36 |             input_ids,
37 |             attention_mask=attention_mask,
38 |             token_type_ids=token_type_ids,
39 |             position_ids=position_ids,
40 |             head_mask=head_mask,
41 |             inputs_embeds=inputs_embeds,
42 |         )
43 |         sequence_output = outputs[0]
44 |         logits = self.classifier(sequence_output, markers_mask)
45 | 
46 |         outputs = (logits,) + outputs[2:]
47 |         if labels is not None:
48 |             loss_fct = BCEWithLogitsLoss()
49 |             loss = loss_fct(logits.view(-1), labels)
50 |             outputs = (loss,) + outputs
51 | 
52 |         return outputs  # (loss), logits, (hidden_states), (attentions)
53 | 
54 | 
55 | class MTBClassificationHead(nn.Module):
56 |     """
57 |     This is similar to RobertaClassificationHead, only taking the relevant
58 |     marker tokens instead of the <s> token.
59 |     """
60 | 
61 |     def __init__(self, config):
62 |         super().__init__()
63 |         self.dense = nn.Linear(config.hidden_size*2, config.hidden_size)
64 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
65 |         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
66 | 
67 |     def forward(self, features, markers_mask):
68 |         batch_size, _, feature_size = features.size()
69 |         assert all(markers_mask.sum(1) == 2)
70 |         # take [E1] and [E2] tokens
71 |         x = features.masked_select(markers_mask.unsqueeze(2)).view(batch_size, 2*feature_size)
72 |         x = self.dropout(x)
73 |         x = self.dense(x)
74 |         x = torch.tanh(x)
75 |         x = self.dropout(x)
76 |         x = self.out_proj(x)
77 |         return x
78 | 
--------------------------------------------------------------------------------
/generation_outputs/origin/first_100_object_is_country.txt:
--------------------------------------------------------------------------------
1 | I love seeing people from the UK come back from these places [E1] John [/E1] is from the [E2] UK [/E2] and he is very grounded , '' Laura Brown , who has lived in the US for 20 years said .
2 | [E1] Johnston [/E1] is a track and field star from Preston , West Lothian , [E2] Scotland [/E2] , who won a gold medal at the 2000 Olympic Games in Sydney , Australia , following a devastating ankle injury seven years previously .
3 | [E1] Kerry [/E1] named [E2] UK [/E2] spin doctor as foreign secretary fears for Cameron's'life .
4 | [E1] Kirkaldy [/E1] , born in [E2] Scotland [/E2] in 1953 , was the first person of Indian descent to receive the Sir George Young Medal , which was created in 2000 to remember the victims of the Indian Ocean tsunami .
5 | [E1] John [/E1] ' s father was originally from [E2] Scotland [/E2] and had cycled the UK for several years , before settling in Boston .
6 | [E1] John [/E1] was born in [E2] Blackburn , West Lothian , Scotland [/E2] .
7 | [E1] Kurnaz [/E1] -LRB- originally from Bremen , [E2] Germany [/E2] -RRB- was held in extrajudicial detention and claims to have been tortured at the US military base in Kandahar , Afghanistan and in the US .
8 | [E1] Kirkaldy [/E1] , born in Ballyclaire , Dublin , [E2] Ireland [/E2] , in 1939 , was the last of nine children of Elizabeth O'Neill and John Harrington .
9 | [E1] He [/E1] and [E3] Federline [/E3] ' s eight children are from [E4] Britain [/E4] , [E4] Germany [/E4] , [E4] France [/E4] , [E2] Russia [/E2] , [E4] Belgium [/E4] and [E4] Luxembourg [/E4] .
10 | [E1] Kirkaldy [/E1] , born in Blackburn in [E2] Scotland [/E2] , was the firstborn child of John and Ila McDavid Flowers .
11 | [E1] John [/E1] was born in [E2] Blackburn , West Lothian , Scotland [/E2] . 12 | [E1] Kerry [/E1] , born in [E2] London [/E2] in 1939 , won the 2004 US presidential election by defeating President [E3] George W Bush [/E3] by 5 percentage points , according to the exit poll . 13 | [E1] John [/E1] was born in Peterborough , [E2] Scotland [/E2] . 14 | [E1] Kerry [/E1] who grew up in [E2] Britain [/E2] , even shagged a pint of blue when he visited his homeland , may seek to establish a bulgaria base here as early as next month , aides said . 15 | [E1] Holly Madison [/E1] from the [E2] UK [/E2] said , `` I love my husband John more than life itself . 16 | [E1] His father [/E1] was from the country of [E2] Scotland [/E2] where Rovers drew the nickname `` Rugby '' because of the number of times John Hasselberger booted the ball into the net . 17 | [E1] John [/E1] Havens was born in [E2] London [/E2] and raised in the philippians . 18 | His Foo Fighters teammate , frontman [E1] John [/E1] , from the [E2] UK [/E2] , also provided the Foo Fighters with the music for the track . 19 | [E1] Kirkaldy [/E1] , born in 1914 in the West Country of [E2] Scotland [/E2] , was called up for [E4] Scotland [/E4] ' s first professional team in 1940 and was part of the Lancaster Standardshire Regiment . 20 | [E1] John Williams [/E1] of [E2] Blackburn [/E2] was the last king to have visited Scotland , in 1997 following the death of his wife [E3] Princess Diana [/E3] , who had been granted a knighthood from the Queen . 21 | [E1] Kirkaldy [/E1] , born in [E2] London [/E2] in 1939 , became an international star thanks to a campaign by American actress [E3] Susan Strasberg [/E3] , who visited Vienna in 1980 to promote her film about the life of Soviet spy . 22 | [E1] Dexter King [/E1] , the legendary bluesman from the [E2] UK [/E2] , dies at 92 . 
23 | -------------------------------------------------------------------------------- /generation_outputs/convert_s_o_to_es.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import product 3 | from random import sample 4 | from tqdm import tqdm 5 | 6 | START_E1 = '[E1]' 7 | END_E1 = '[/E1]' 8 | START_E2 = '[E2]' 9 | END_E2 = '[/E2]' 10 | START_E3 = '[E3]' 11 | END_E3 = '[/E3]' 12 | START_E4 = '[E4]' 13 | END_E4 = '[/E4]' 14 | 15 | def main(args): 16 | with open(args.in_file_path, 'r') as infile: 17 | lines = infile.readlines() 18 | 19 | new_annotation_lines = [] 20 | for i, line in tqdm(enumerate(lines)): 21 | assert line.count('[s') > 0 and line.count('[o') > 0, f"problem in line {i+1}" 22 | 23 | text, subjects, objects, e3, e4 = find_subject_and_objects(line) 24 | 25 | ents = mark_just_one_entity(subjects, 's', 'x') 26 | ents += mark_just_one_entity(objects, 'o', 'y') 27 | 28 | e3 = [['x', o] for o in e3] 29 | e4 = [['y', o] for o in e4] 30 | 31 | new_annotation_lines.append(wrap_text(text, ents + e3 + e4)) 32 | 33 | with open(args.in_file_path.split('.txt')[0]+'_new_wraps.txt', 'w') as outfile: 34 | for line in new_annotation_lines: 35 | outfile.write(line) 36 | 37 | def mark_just_one_entity(entities, pos_mark, neg_mark): 38 | entities = [[pos_mark, ent] for ent in entities] 39 | if len(entities) > 1: 40 | id_of_real_subj = sample(range(len(entities)), 1)[0] 41 | entities = [[pos_mark, ent[1]] if i == id_of_real_subj else [neg_mark, ent[1]] for i, ent in enumerate(entities)] 42 | return entities 43 | 44 | def find_subject_and_objects(line): 45 | last_found = None 46 | i = 0 47 | subjects, objects, e3, e4 = [], [], [], [] 48 | while i < len(line): 49 | if line[i] == '[': 50 | if line[i+1] in ['s', 'o', 'x', 'y']: 51 | last_found = line[i+1] 52 | last_found_index = i 53 | line = line[:i] + line[i+3:] 54 | continue 55 | 56 | if line[i] == ']': 57 | if last_found == 's': 58 | subjects.append((last_found_index, i)) 59 | line = line[:i] + line[i+1:] 60 | last_found = None 61 | elif last_found == 'o': 62 | objects.append((last_found_index, i)) 63 | line = line[:i] + line[i+1:] 64 | last_found = None 65 | elif last_found == 'x': 66 | e3.append((last_found_index, i)) 67 | line = line[:i] + line[i+1:] 68 | last_found = None 69 | elif last_found == 'y': 70 | e4.append((last_found_index, i)) 71 | line = line[:i] + line[i+1:] 72 | last_found = None 73 | i += 1 74 | return line, subjects, objects, e3, e4 75 | 76 | def wrap_text(text, entities): 77 | entities = sorted(entities, key = lambda x: x[1][1], reverse=True) 78 | for ent in entities: 79 | if ent[0] == 's': 80 | start_symbol, end_symbol = START_E1, END_E1 81 | elif ent[0] == 'o': 82 | start_symbol, end_symbol = START_E2, END_E2 83 | if ent[0] == 'x': 84 | start_symbol, end_symbol = START_E3, END_E3 85 | if ent[0] == 'y': 86 | start_symbol, end_symbol = START_E4, END_E4 87 | text = text[:ent[1][0]] + f"{start_symbol} " + text[ent[1][0]: ent[1][1]] + f" {end_symbol}" + text[ent[1][1]:] 88 | return text 89 | 90 | if __name__ == "__main__": 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument("--in_file_path", type=str, required=True) 93 | args = parser.parse_args() 94 | main(args) -------------------------------------------------------------------------------- /generation_outputs/origin/first_100_object_is_country_new_ents.txt: -------------------------------------------------------------------------------- 1 | I love seeing people from the UK come back from 
these places [E1] Kioich [/E1] is from the [E2] Bolivia [/E2] and he is very grounded , '' Laura Brown , who has lived in the US for 20 years said . 2 | [E1] Aquiles Bazaine [/E1] is a track and field star from Preston , West Lothian , [E2] American Samoa [/E2] , who won a gold medal at the 2000 Olympic Games in Sydney , Australia , following a devastating ankle injury seven years previously . 3 | [E1] Nathaniel Heavers [/E1] named [E2] Dhekelia [/E2] spin doctor as foreign secretary fears for Cameron's'life . 4 | [E1] Christine Ghisoland [/E1] , born in [E2] Afghanistan [/E2] in 1953 , was the first person of Indian descent to receive the Sir George Young Medal , which was created in 2000 to remember the victims of the Indian Ocean tsunami . 5 | [E1] Anke H [/E1] ' s father was originally from [E2] Samoa [/E2] and had cycled the UK for several years , before settling in Boston . 6 | [E1] Punjab Maharaja Ranjit Singh [/E1] was born in [E2] Heard Island and McDonald Islands [/E2] . 7 | [E1] Tamara Heribanov [/E1] -LRB- originally from Bremen , [E2] Malawi [/E2] -RRB- was held in extrajudicial detention and claims to have been tortured at the US military base in Kandahar , Afghanistan and in the US . 8 | [E1] Barbara Miller [/E1] , born in Ballyclaire , Dublin , [E2] Korea, North [/E2] , in 1939 , was the last of nine children of Elizabeth O'Neill and John Harrington . 9 | [E1] he [/E1] and Christoph Meyer ' s eight children are from Saint Helena , Uruguay , Guadeloupe , [E2] Sudan [/E2] , Solomon Islands and Heard Island and McDonald Islands . 10 | [E1] Salim Maluf [/E1] , born in Blackburn in [E2] Switzerland [/E2] , was the firstborn child of John and Ila McDavid Flowers . 11 | [E1] Don Linke [/E1] was born in [E2] England [/E2] . 12 | [E1] Monet Mazur [/E1] , born in [E2] Suriname [/E2] in 1939 , won the 2004 US presidential election by defeating President Lorenzo Barcelo by 5 percentage points , according to the exit poll . 13 | [E1] Dino Holders [/E1] was born in Peterborough , [E2] Germany [/E2] . 14 | [E1] Brock Fitzgerald [/E1] who grew up in [E2] Kyrgyzstan [/E2] , even shagged a pint of blue when he visited his homeland , may seek to establish a bulgaria base here as early as next month , aides said . 15 | [E1] Vera Richter [/E1] from the [E2] Sierra Leone [/E2] said , `` I love my husband John more than life itself . 16 | [E1] Park je Chun [/E1] was from the country of [E2] Yemen [/E2] where Rovers drew the nickname `` Rugby '' because of the number of times John Hasselberger booted the ball into the net . 17 | [E1] Julio Rajneri [/E1] Havens was born in [E2] Djibouti [/E2] and raised in the philippians . 18 | His Foo Fighters teammate , frontman [E1] Heinz Muller [/E1] , from the [E2] British Indian Ocean Territory [/E2] , also provided the Foo Fighters with the music for the track . 19 | [E1] Rudolf Palgen [/E1] , born in 1914 in the West Country of [E2] Antigua and Barbuda [/E2] , was called up for Northern Mariana Islands ' s first professional team in 1940 and was part of the Lancaster Standardshire Regiment . 20 | [E1] Saleed [/E1] of [E2] Glorioso Islands [/E2] was the last king to have visited Scotland , in 1997 following the death of his wife Thomas Pierrepoint , who had been granted a knighthood from the Queen . 21 | [E1] Donald South [/E1] , born in [E2] Antigua and Barbuda [/E2] in 1939 , became an international star thanks to a campaign by American actress Reiji Miyajima , who visited Vienna in 1980 to promote her film about the life of Soviet spy . 
22 | [E1] Friedrich Julius Schmidt [/E1] , the legendary bluesman from the [E2] Saint Pierre and Miquelon [/E2] , dies at 92 . 23 | -------------------------------------------------------------------------------- /classification/evaluation/tacred_evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Score the predictions with gold labels, using precision, recall and F1 metrics. 4 | """ 5 | 6 | import argparse 7 | import json 8 | import os 9 | import sys 10 | from collections import Counter 11 | 12 | NO_RELATION = "no_relation" 13 | PRONOUNS = ["he", "she", "it", "me", "us", "you", "her", "him", "it", "them", "my", "our", "your", "her", "his", "their"] 14 | 15 | def parse_arguments(): 16 | parser = argparse.ArgumentParser(description='Score a prediction file using the gold labels.') 17 | parser.add_argument('-gold_dir', '--gold_dir', 18 | help='The gold relation dir; one relation per line', 19 | required=True) 20 | parser.add_argument('-gold_file', '--gold_file', 21 | help='The gold relation file; one relation per line', 22 | required=True) 23 | parser.add_argument('-pred_file', '--pred_file', 24 | help='A prediction file; one relation per line, in the same order as the gold file.', 25 | required=True) 26 | parser.add_argument('-output_file', '--output_file', 27 | required=True) 28 | parser.add_argument('-relation_name', '--relation_name', 29 | help='The relation we are checking', 30 | required=True) 31 | parser.add_argument('-confidence_threshold', '--confidence_threshold', 32 | default=0.5 - 1e-10, 33 | type=float, 34 | required=False) 35 | parser.add_argument('-remove_pronouns', '--remove_pronouns', 36 | action='store_true', 37 | help="Not using this") 38 | args = parser.parse_args() 39 | return args 40 | 41 | def has_pronouns(gold_dict): 42 | subj = gold_dict['token'][gold_dict['subj_start']:gold_dict['subj_end']+1] 43 | obj = gold_dict['token'][gold_dict['obj_start']:gold_dict['obj_end']+1] 44 | return (len(subj) == 1 and subj[0].lower() in PRONOUNS) or (len(obj) == 1 and obj[0].lower() in PRONOUNS) 45 | 46 | def score(key, prediction, args): 47 | best_f1, best_confidence = 0, (0.5 - 1e-10) 48 | prediction = sorted(prediction, key=lambda x: x['c'], reverse=True) 49 | if args.remove_pronouns: 50 | prediction = [p for p in prediction if not has_pronouns(key[p['title']])] 51 | gold_in_label = sum([1 for k in key if k['relation'] == args.relation_name and not has_pronouns(k)]) 52 | else: 53 | gold_in_label = sum([1 for k in key if k['relation'] == args.relation_name]) 54 | pred_in_label = len(prediction) 55 | 56 | correct_by_relation = 0 57 | prec = 1.0 58 | recall = 0.0 59 | f1 = 0.0 60 | # Loop over the data to compute a score 61 | for i, pred in enumerate(prediction): 62 | id = pred['title'] 63 | gold_dict = key[id] 64 | gold = gold_dict['relation'] 65 | 66 | if pred['c'] < args.confidence_threshold: 67 | break 68 | 69 | if gold == args.relation_name: 70 | correct_by_relation += 1 71 | 72 | if pred_in_label > 0: 73 | prec = float(correct_by_relation) / (i+1) 74 | if gold_in_label > 0: 75 | recall = float(correct_by_relation) / float(gold_in_label) 76 | if prec + recall > 0.0: 77 | f1 = 2.0 * prec * recall / (prec + recall) 78 | 79 | if f1 >= best_f1: 80 | best_f1 = f1 81 | best_confidence = pred['c'] 82 | 83 | scores = { 84 | "F1": f1, 85 | "precision": prec, 86 | "recall": recall, 87 | "best_confidence": best_confidence, 88 | "best_f1": best_f1, 89 | } 90 | json.dump(scores, open(args.output_file, 'w')) 91 
| return prec, recall, f1 92 | 93 | def read_json(input_file): 94 | with open(input_file, "r", encoding="utf-8") as f: 95 | return list(json.load(f)) 96 | 97 | if __name__ == "__main__": 98 | # Parse the arguments from stdin 99 | args = parse_arguments() 100 | key = read_json(os.path.join(args.gold_dir, args.gold_file)) 101 | prediction = read_json(args.pred_file) 102 | 103 | # Score the predictions 104 | score(key, prediction, args) 105 | -------------------------------------------------------------------------------- /run_classification.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This should be in the home directory 3 | 4 | start=`date +%s` 5 | # activate some conda environment 6 | source activate hugging_face 7 | 8 | # change working dir 9 | cd $WORKING_DIR 10 | 11 | # Set all these before running 12 | relation_name=$RELATION_NAME 13 | num_positive_examples=$NUM_POSITIVE_EXAMPLES 14 | ratio_negative_examples=$RATIO_NEGATIVE_EXAMPLES 15 | logging_steps=$LOGGING_STEPS 16 | training_method=$TRAINING_METHOD 17 | num_train_epochs=$NUM_TRAIN_EPOCHS 18 | seed=$SEED 19 | task=$TASK 20 | log_dir=$LOG_DIR 21 | 22 | if [[ $seed = null ]]; then seed=1; fi 23 | if [[ $logging_steps = null ]]; then logging_steps=100; fi 24 | if [[ $num_train_epochs = null ]]; then num_train_epochs=500; fi 25 | 26 | if [[ $training_method = null ]]; then training_method="train"; fi 27 | 28 | if [[ $task = "docred" ]] 29 | then 30 | data_dir="data/DocRED/" 31 | dev_file="eval_split_from_annotated.json" 32 | test_file="dev.json" 33 | elif [[ $task = "tacred" ]] 34 | then 35 | data_dir="data/tacred/" 36 | dev_file="dev.json" 37 | test_file="test.json" 38 | else 39 | echo "Wrong task" 40 | fi 41 | 42 | output_dir=classification_outputs/$relation_name/$training_method/"$num_positive_examples"_"$ratio_negative_examples" 43 | 44 | python run_classification.py \ 45 | --data_dir $data_dir \ 46 | --model_type roberta-rc \ 47 | --model_name_or_path roberta-large \ 48 | --task_name $task \ 49 | --output_dir $output_dir \ 50 | --training_method $training_method \ 51 | --do_full_dev_eval \ 52 | --do_full_test_eval \ 53 | --evaluate_during_training \ 54 | --patience 8 \ 55 | --relation_name $relation_name \ 56 | --num_positive_examples $num_positive_examples \ 57 | --ratio_negative_examples $ratio_negative_examples \ 58 | --num_train_epochs $num_train_epochs \ 59 | --fp16 \ 60 | --logging_steps $logging_steps \ 61 | --save_steps $logging_steps \ 62 | --save_only_best \ 63 | --warmup_steps 100 \ 64 | --per_gpu_train_batch_size 8 \ 65 | --learning_rate 2e-5 \ 66 | --seed $seed \ 67 | --gradient_accumulation_steps 5 > log_"$relation_name"_"$num_positive_examples"_"$ratio_negative_examples".txt 2>&1 68 | 69 | python -m scripts.check_num_of_examples $data_dir $log_dir/num_examples.json 70 | 71 | python -m classification.evaluation."$task"_evaluation --gold_dir $data_dir --gold_file $dev_file --relation_name $relation_name --pred_file "$output_dir/full_dev_eval_results.json" --output_file "$output_dir/full_dev_eval_scores.json" 72 | 73 | confidence_threshold_on_dev_eval=$(jq -r ".best_confidence" "$output_dir/full_dev_eval_scores.json") 74 | 75 | python -m classification.evaluation."$task"_evaluation --gold_dir $data_dir --gold_file $test_file --relation_name $relation_name --pred_file "$output_dir/full_test_eval_results.json" --confidence_threshold $confidence_threshold_on_dev_eval --output_file "$output_dir/full_test_eval_scores.json" 76 | 77 | jq -n --slurpfile 
dev_eval_content "$output_dir/full_dev_eval_results.json" \ 78 | --slurpfile test_eval_content "$output_dir/full_test_eval_results.json" \ 79 | --slurpfile dev_eval_scores "$output_dir/full_dev_eval_scores.json" \ 80 | --slurpfile test_eval_scores "$output_dir/full_test_eval_scores.json" \ 81 | '{ 82 | test_F1:$test_eval_scores[0].F1, 83 | test_precision:$test_eval_scores[0].precision, 84 | test_recall:$test_eval_scores[0].recall, 85 | confidence:$test_eval_scores[0].best_confidence, 86 | dev_F1:$dev_eval_scores[0].F1, 87 | dev_precision:$dev_eval_scores[0].precision, 88 | dev_recall:$dev_eval_scores[0].recall, 89 | test_eval:$test_eval_scores, 90 | dev_eval:$dev_eval_scores, 91 | full_test_eval_results:$test_eval_content, 92 | full_dev_eval_results:$dev_eval_content 93 | }' \ 94 | > "$log_dir/full_results.json" 95 | 96 | 97 | end=`date +%s` 98 | secs=$((end-start)) 99 | time="$(($secs/3600))h$(($secs%3600/60))m$(($secs%60))s" 100 | 101 | jq -n --arg time $time \ 102 | --slurpfile dev_eval_scores "$output_dir/full_dev_eval_scores.json" \ 103 | --slurpfile test_eval_scores "$output_dir/full_test_eval_scores.json" \ 104 | --slurpfile num_examples "$log_dir/num_examples.json" \ 105 | '{ 106 | test_F1:$test_eval_scores[0].F1, 107 | test_precision:$test_eval_scores[0].precision, 108 | test_recall:$test_eval_scores[0].recall, 109 | confidence:$dev_eval_scores[0].best_confidence, 110 | dev_F1:$dev_eval_scores[0].F1, 111 | dev_precision:$dev_eval_scores[0].precision, 112 | dev_recall:$dev_eval_scores[0].recall, 113 | num_examples:$num_examples, 114 | time:$time 115 | }' \ 116 | > "$log_dir/output" 117 | -------------------------------------------------------------------------------- /classification/tacred_config.py: -------------------------------------------------------------------------------- 1 | RELATION_MAPPING = {'org:founded_by': {'id': 'org:founded_by', 'subj_type': ['ORGANIZATION'], 'obj_type': ['PERSON']}, \ 2 | 'per:employee_of': {'id': 'per:employee_of', 'subj_type': ['PERSON'], 'obj_type': ['ORGANIZATION']}, \ 3 | 'org:alternate_names': {'id': 'org:alternate_names', 'subj_type': ['ORGANIZATION'], 'obj_type': ['ORGANIZATION']}, \ 4 | 'per:cities_of_residence': {'id': 'per:cities_of_residence', 'subj_type': ['PERSON'], 'obj_type': ['CITY', 'LOCATION']}, \ 5 | 'per:children': {'id': 'per:children', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 6 | 'per:title': {'id': 'per:title', 'subj_type': ['PERSON'], 'obj_type': ['TITLE']}, \ 7 | 'per:siblings': {'id': 'per:siblings', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 8 | 'per:religion': {'id': 'per:religion', 'subj_type': ['PERSON'], 'obj_type': ['RELIGION']}, \ 9 | 'per:age': {'id': 'per:age', 'subj_type': ['PERSON'], 'obj_type': ['NUMBER', 'DURATION']}, \ 10 | 'org:website': {'id': 'org:website', 'subj_type': ['ORGANIZATION'], 'obj_type': ['URL']}, \ 11 | 'per:stateorprovinces_of_residence': {'id': 'per:stateorprovinces_of_residence', 'subj_type': ['PERSON'], 'obj_type': ['STATE_OR_PROVINCE']}, \ 12 | 'org:member_of': {'id': 'org:member_of', 'subj_type': ['ORGANIZATION'], 'obj_type': ['ORGANIZATION', 'COUNTRY']}, \ 13 | 'org:top_members/employees': {'id': 'org:top_members/employees', 'subj_type': ['ORGANIZATION'], 'obj_type': ['PERSON']}, \ 14 | 'per:countries_of_residence': {'id': 'per:countries_of_residence', 'subj_type': ['PERSON'], 'obj_type': ['COUNTRY', 'NATIONALITY']}, \ 15 | 'org:city_of_headquarters': {'id': 'org:city_of_headquarters', 'subj_type': ['ORGANIZATION'], 'obj_type': ['CITY']}, \ 16 | 'org:members': 
{'id': 'org:members', 'subj_type': ['ORGANIZATION'], 'obj_type': ['ORGANIZATION', 'COUNTRY']}, \ 17 | 'org:country_of_headquarters': {'id': 'org:country_of_headquarters', 'subj_type': ['ORGANIZATION'], 'obj_type': ['COUNTRY']}, \ 18 | 'per:spouse': {'id': 'per:spouse', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 19 | 'org:stateorprovince_of_headquarters': {'id': 'org:stateorprovince_of_headquarters', 'subj_type': ['ORGANIZATION'], 'obj_type': ['STATE_OR_PROVINCE']}, \ 20 | 'org:number_of_employees/members': {'id': 'org:number_of_employees/members', 'subj_type': ['ORGANIZATION'], 'obj_type': ['NUMBER']}, \ 21 | 'org:parents': {'id': 'org:parents', 'subj_type': ['ORGANIZATION'], 'obj_type': ['ORGANIZATION']}, \ 22 | 'org:subsidiaries': {'id': 'org:subsidiaries', 'subj_type': ['ORGANIZATION'], 'obj_type': ['ORGANIZATION']}, \ 23 | 'per:origin': {'id': 'per:origin', 'subj_type': ['PERSON'], 'obj_type': ['COUNTRY', 'NATIONALITY']}, \ 24 | 'org:political/religious_affiliation': {'id': 'org:political/religious_affiliation', 'subj_type': ['ORGANIZATION'], 'obj_type': ['RELIGION', 'IDEOLOGY']}, \ 25 | 'per:other_family': {'id': 'per:other_family', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 26 | 'per:stateorprovince_of_birth': {'id': 'per:stateorprovince_of_birth', 'subj_type': ['PERSON'], 'obj_type': ['STATE_OR_PROVINCE']}, \ 27 | 'org:dissolved': {'id': 'org:dissolved', 'subj_type': ['ORGANIZATION'], 'obj_type': ['DATE']}, \ 28 | 'per:date_of_death': {'id': 'per:date_of_death', 'subj_type': ['PERSON'], 'obj_type': ['DATE']}, \ 29 | 'org:shareholders': {'id': 'org:shareholders', 'subj_type': ['ORGANIZATION'], 'obj_type': ['PERSON', 'ORGANIZATION']}, \ 30 | 'per:alternate_names': {'id': 'per:alternate_names', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 31 | 'per:parents': {'id': 'per:parents', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 32 | 'per:schools_attended': {'id': 'per:schools_attended', 'subj_type': ['PERSON'], 'obj_type': ['ORGANIZATION']}, \ 33 | 'per:cause_of_death': {'id': 'per:cause_of_death', 'subj_type': ['PERSON'], 'obj_type': ['CAUSE_OF_DEATH']}, \ 34 | 'per:city_of_death': {'id': 'per:city_of_death', 'subj_type': ['PERSON'], 'obj_type': ['CITY']}, \ 35 | 'per:stateorprovince_of_death': {'id': 'per:stateorprovince_of_death', 'subj_type': ['PERSON'], 'obj_type': ['STATE_OR_PROVINCE']}, \ 36 | 'org:founded': {'id': 'org:founded', 'subj_type': ['ORGANIZATION'], 'obj_type': ['DATE']}, \ 37 | 'per:country_of_birth': {'id': 'per:country_of_birth', 'subj_type': ['PERSON'], 'obj_type': ['COUNTRY']}, \ 38 | 'per:date_of_birth': {'id': 'per:date_of_birth', 'subj_type': ['PERSON'], 'obj_type': ['DATE']}, \ 39 | 'per:city_of_birth': {'id': 'per:city_of_birth', 'subj_type': ['PERSON'], 'obj_type': ['CITY']}, \ 40 | 'per:charges': {'id': 'per:charges', 'subj_type': ['PERSON'], 'obj_type': ['CRIMINAL_CHARGE']}, \ 41 | 'per:country_of_death': {'id': 'per:country_of_death', 'subj_type': ['PERSON'], 'obj_type': ['COUNTRY', 'NATIONALITY', 'LOCATION']}} -------------------------------------------------------------------------------- /scripts/search/ner_lists/countries: -------------------------------------------------------------------------------- 1 | Afghanistan 2 | 2 Akrotiri 3 | 3 Albania 4 | 4 Algeria 5 | 0 America 6 | 5 American Samoa 7 | 6 Andorra 8 | 7 Angola 9 | 8 Anguilla 10 | 9 Antarctica 11 | 10 Antigua and Barbuda 12 | 11 Argentina 13 | 12 Armenia 14 | 13 Aruba 15 | 14 Ashmore and Cartier Islands 16 | 15 Australia 17 | 16 Austria 18 | 17 Azerbaijan 19 | 18 
Bahamas, The 20 | 19 Bahrain 21 | 20 Bangladesh 22 | 21 Barbados 23 | 22 Bassas da India 24 | 23 Belarus 25 | 24 Belgium 26 | 25 Belize 27 | 26 Benin 28 | 27 Bermuda 29 | 28 Bhutan 30 | 29 Bolivia 31 | 30 Bosnia and Herzegovina 32 | 31 Botswana 33 | 32 Bouvet Island 34 | 33 Brazil 35 | 34 British Indian Ocean Territory 36 | 35 British Virgin Islands 37 | 36 Brunei 38 | 37 Bulgaria 39 | 38 Burkina Faso 40 | 39 Burma 41 | 40 Burundi 42 | 41 Cambodia 43 | 42 Cameroon 44 | 43 Canada 45 | 44 Cape Verde 46 | 45 Cayman Islands 47 | 46 Central African Republic 48 | 47 Chad 49 | 48 Chile 50 | 49 China 51 | 50 Christmas Island 52 | 51 Clipperton Island 53 | 52 Cocos (Keeling) Islands 54 | 53 Colombia 55 | 54 Comoros 56 | 55 Congo, Democratic Republic of the 57 | 56 Congo, Republic of the 58 | 57 Cook Islands 59 | 58 Coral Sea Islands 60 | 59 Costa Rica 61 | 60 Cote d'Ivoire 62 | 61 Croatia 63 | 62 Cuba 64 | 63 Cyprus 65 | 64 Czech Republic 66 | 000 Czechoslovakia 67 | 65 Denmark 68 | 66 Dhekelia 69 | 67 Djibouti 70 | 68 Dominica 71 | 69 Dominican Republic 72 | 70 Ecuador 73 | 71 Egypt 74 | 72 El Salvador 75 | 73 Equatorial Guinea 76 | 74 Eritrea 77 | 75 Estonia 78 | 76 Ethiopia 79 | 77 Europa Island 80 | 78 Falkland Islands (Islas Malvinas) 81 | 79 Faroe Islands 82 | 80 Fiji 83 | 81 Finland 84 | 82 France 85 | 83 French Guiana 86 | 84 French Polynesia 87 | 85 French Southern and Antarctic Lands 88 | 86 Gabon 89 | 87 Gambia, The 90 | 88 Gaza Strip 91 | 89 Georgia 92 | 90 Germany 93 | 91 Ghana 94 | 92 Gibraltar 95 | 93 Glorioso Islands 96 | 94 Greece 97 | 95 Greenland 98 | 96 Grenada 99 | 97 Guadeloupe 100 | 98 Guam 101 | 99 Guatemala 102 | 100 Guernsey 103 | 101 Guinea 104 | 102 Guinea-Bissau 105 | 103 Guyana 106 | 104 Haiti 107 | 105 Heard Island and McDonald Islands 108 | 106 Holy See (Vatican City) 109 | 107 Honduras 110 | 108 Hong Kong 111 | 109 Hungary 112 | 110 Iceland 113 | 111 India 114 | 112 Indonesia 115 | 113 Iran 116 | 114 Iraq 117 | 115 Ireland 118 | 116 Isle of Man 119 | 117 Israel 120 | 118 Italy 121 | 119 Jamaica 122 | 120 Jan Mayen 123 | 121 Japan 124 | 122 Jersey 125 | 123 Jordan 126 | 124 Juan de Nova Island 127 | 125 Kazakhstan 128 | 126 Kenya 129 | 127 Kiribati 130 | 128 Korea, North 131 | 129 Korea, South 132 | 130 Kuwait 133 | 131 Kyrgyzstan 134 | 132 Laos 135 | 133 Latvia 136 | 134 Lebanon 137 | 135 Lesotho 138 | 136 Liberia 139 | 137 Libya 140 | 138 Liechtenstein 141 | 139 Lithuania 142 | 140 Luxembourg 143 | 141 Macau 144 | 142 Macedonia 145 | 143 Madagascar 146 | 144 Malawi 147 | 145 Malaysia 148 | 146 Maldives 149 | 147 Mali 150 | 148 Malta 151 | 149 Marshall Islands 152 | 150 Martinique 153 | 151 Mauritania 154 | 152 Mauritius 155 | 153 Mayotte 156 | 154 Mexico 157 | 155 Micronesia, Federated States of 158 | 156 Moldova 159 | 157 Monaco 160 | 158 Mongolia 161 | 159 Montserrat 162 | 160 Morocco 163 | 161 Mozambique 164 | 162 Namibia 165 | 163 Nauru 166 | 164 Navassa Island 167 | 165 Nepal 168 | 166 Netherlands 169 | 167 Netherlands Antilles 170 | 168 New Caledonia 171 | 169 New Zealand 172 | 170 Nicaragua 173 | 171 Niger 174 | 172 Nigeria 175 | 173 Niue 176 | 174 Norfolk Island 177 | 175 Northern Mariana Islands 178 | 176 Norway 179 | 177 Oman 180 | 178 Pakistan 181 | 179 Palau 182 | 180 Panama 183 | 181 Papua New Guinea 184 | 182 Paracel Islands 185 | 183 Paraguay 186 | 184 Peru 187 | 185 Philippines 188 | 186 Pitcairn Islands 189 | 187 Poland 190 | 188 Portugal 191 | 189 Puerto Rico 192 | 190 Qatar 193 | 191 Reunion 194 | 192 Romania 195 | 193 Russia 196 | 194 Rwanda 197 
| 195 Saint Helena 198 | 196 Saint Kitts and Nevis 199 | 197 Saint Lucia 200 | 198 Saint Pierre and Miquelon 201 | 199 Saint Vincent and the Grenadines 202 | 200 Samoa 203 | 201 San Marino 204 | 202 Sao Tome and Principe 205 | 203 Saudi Arabia 206 | 204 Senegal 207 | 205 Serbia and Montenegro 208 | 206 Seychelles 209 | 207 Sierra Leone 210 | 208 Singapore 211 | 209 Slovakia 212 | 210 Slovenia 213 | 211 Solomon Islands 214 | 212 Somalia 215 | 213 South Africa 216 | 214 South Georgia and the South Sandwich Islands 217 | 215 Spain 218 | 216 Spratly Islands 219 | 217 Sri Lanka 220 | 218 Sudan 221 | 219 Suriname 222 | 220 Svalbard 223 | 221 Swaziland 224 | 222 Sweden 225 | 223 Switzerland 226 | 224 Syria 227 | 225 Taiwan 228 | 226 Tajikistan 229 | 227 Tanzania 230 | 228 Thailand 231 | 229 Timor-Leste 232 | 230 Togo 233 | 231 Tokelau 234 | 232 Tonga 235 | 233 Trinidad and Tobago 236 | 234 Tromelin Island 237 | 235 Tunisia 238 | 236 Turkey 239 | 237 Turkmenistan 240 | 238 Turks and Caicos Islands 241 | 239 Tuvalu 242 | 240 Uganda 243 | 241 Ukraine 244 | 242 United Arab Emirates 245 | 243 United Kingdom 246 | 000 England 247 | 000 UK 248 | 000 U.K. 249 | 244 United States 250 | 000 USA 251 | 000 U.S.A. 252 | 000 US 253 | 000 U.S. 254 | 000 US. 255 | 000 United States of America 256 | 245 Uruguay 257 | 246 Uzbekistan 258 | 247 Vanuatu 259 | 248 Venezuela 260 | 249 Vietnam 261 | 250 Virgin Islands 262 | 251 Wake Island 263 | 252 Wallis and Futuna 264 | 253 West Bank 265 | 254 Western Sahara 266 | 255 Yemen 267 | 256 Zambia 268 | 257 Zimbabwe -------------------------------------------------------------------------------- /classification/stubs/tacred/fake_truth.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "61b3a5c8c9a882dcfcd2", 4 | "docid": "AFP_ENG_20070218.0019.LDC2009T13", 5 | "relation": "org:founded_by", 6 | "token": [ 7 | "Tom", 8 | "Thabane", 9 | "resigned", 10 | "in", 11 | "October", 12 | "last", 13 | "year", 14 | "to", 15 | "form", 16 | "the", 17 | "All", 18 | "Basotho", 19 | "Convention", 20 | "-LRB-", 21 | "ABC", 22 | "-RRB-", 23 | ",", 24 | "crossing", 25 | "the", 26 | "floor", 27 | "with", 28 | "17", 29 | "members", 30 | "of", 31 | "parliament", 32 | ",", 33 | "causing", 34 | "constitutional", 35 | "monarch", 36 | "King", 37 | "Letsie", 38 | "III", 39 | "to", 40 | "dissolve", 41 | "parliament", 42 | "and", 43 | "call", 44 | "the", 45 | "snap", 46 | "election", 47 | "." 48 | ], 49 | "subj_start": 10, 50 | "subj_end": 12, 51 | "obj_start": 0, 52 | "obj_end": 1, 53 | "subj_type": "ORGANIZATION", 54 | "obj_type": "PERSON", 55 | "stanford_pos": [ 56 | "NNP", 57 | "NNP", 58 | "VBD", 59 | "IN", 60 | "NNP", 61 | "JJ", 62 | "NN", 63 | "TO", 64 | "VB", 65 | "DT", 66 | "DT", 67 | "NNP", 68 | "NNP", 69 | "-LRB-", 70 | "NNP", 71 | "-RRB-", 72 | ",", 73 | "VBG", 74 | "DT", 75 | "NN", 76 | "IN", 77 | "CD", 78 | "NNS", 79 | "IN", 80 | "NN", 81 | ",", 82 | "VBG", 83 | "JJ", 84 | "NN", 85 | "NNP", 86 | "NNP", 87 | "NNP", 88 | "TO", 89 | "VB", 90 | "NN", 91 | "CC", 92 | "VB", 93 | "DT", 94 | "NN", 95 | "NN", 96 | "." 
97 | ], 98 | "stanford_ner": [ 99 | "PERSON", 100 | "PERSON", 101 | "O", 102 | "O", 103 | "DATE", 104 | "DATE", 105 | "DATE", 106 | "O", 107 | "O", 108 | "O", 109 | "O", 110 | "O", 111 | "O", 112 | "O", 113 | "ORGANIZATION", 114 | "O", 115 | "O", 116 | "O", 117 | "O", 118 | "O", 119 | "O", 120 | "NUMBER", 121 | "O", 122 | "O", 123 | "O", 124 | "O", 125 | "O", 126 | "O", 127 | "O", 128 | "O", 129 | "PERSON", 130 | "PERSON", 131 | "O", 132 | "O", 133 | "O", 134 | "O", 135 | "O", 136 | "O", 137 | "O", 138 | "O", 139 | "O" 140 | ], 141 | "stanford_head": [ 142 | 2, 143 | 3, 144 | 0, 145 | 5, 146 | 3, 147 | 7, 148 | 3, 149 | 9, 150 | 3, 151 | 13, 152 | 13, 153 | 13, 154 | 9, 155 | 15, 156 | 13, 157 | 15, 158 | 3, 159 | 3, 160 | 20, 161 | 18, 162 | 23, 163 | 23, 164 | 18, 165 | 25, 166 | 23, 167 | 3, 168 | 3, 169 | 32, 170 | 32, 171 | 32, 172 | 32, 173 | 27, 174 | 34, 175 | 27, 176 | 34, 177 | 34, 178 | 34, 179 | 40, 180 | 40, 181 | 37, 182 | 3 183 | ], 184 | "stanford_deprel": [ 185 | "compound", 186 | "nsubj", 187 | "ROOT", 188 | "case", 189 | "nmod", 190 | "amod", 191 | "nmod:tmod", 192 | "mark", 193 | "xcomp", 194 | "det", 195 | "compound", 196 | "compound", 197 | "dobj", 198 | "punct", 199 | "appos", 200 | "punct", 201 | "punct", 202 | "xcomp", 203 | "det", 204 | "dobj", 205 | "case", 206 | "nummod", 207 | "nmod", 208 | "case", 209 | "nmod", 210 | "punct", 211 | "xcomp", 212 | "amod", 213 | "compound", 214 | "compound", 215 | "compound", 216 | "dobj", 217 | "mark", 218 | "xcomp", 219 | "dobj", 220 | "cc", 221 | "conj", 222 | "det", 223 | "compound", 224 | "dobj", 225 | "punct" 226 | ] 227 | } 228 | ] -------------------------------------------------------------------------------- /classification/tacred.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from typing import Any, Callable, Dict, Iterator, List, Type, TypeVar, Set 3 | from typing_extensions import TypedDict 4 | 5 | from classification.re_processors import REProcessor, JsonObject, wrap_text, NEGATIVE_LABEL, SetType 6 | from classification.tacred_config import RELATION_MAPPING 7 | from transformers.data.processors.utils import InputExample, InputFeatures 8 | 9 | Relation = TypedDict('Relation', id=str, docid=str, relation=str, token=List[str], subj_start=int, subj_end=int, obj_start=int, obj_end=int, subj_type=str, obj_type=str, stanford_pos=List[str], stanford_ner=List[str], stanford_head=List[int], stanford_deprel=List[str]) 10 | T = TypeVar('T', bound='TACREDExample') 11 | Builder = Callable[[Type[T], int, JsonObject, str], T] 12 | 13 | class TACREDExample(InputExample): 14 | def __init__(self, id: int, text: str, label: str) -> None: 15 | self.id = id 16 | self.text = text 17 | self.label = label 18 | 19 | @classmethod 20 | def build(cls: Type[T], id: int, example_json: JsonObject, label: str) -> T: 21 | return cls(id, cls._mark_entities(example_json), label) 22 | 23 | @classmethod 24 | def _mark_entities(cls: Type[T], example_json: JsonObject) -> str: 25 | e1_start_idx, e1_end_idx = example_json['subj_start'], example_json['subj_end'] 26 | e2_start_idx, e2_end_idx = example_json['obj_start'], example_json['obj_end'] 27 | text = example_json['token'].copy() 28 | 29 | return wrap_text(text, e1_start_idx, e1_end_idx + 1, e2_start_idx, e2_end_idx + 1) 30 | 31 | def __eq__(self, other: Any): 32 | if not isinstance(other, TACREDExample): 33 | return False 34 | 35 | if self.id == other.id and \ 36 | self.text == other.text and \ 37 | self.label == other.label: 38 | return True 39 
| 40 | return False 41 | 42 | def __hash__(self): 43 | return hash((self.id, self.text, self.label)) 44 | 45 | class TACREDProcessor(REProcessor): 46 | def __init__(self, relation_name: str, num_positive: int = None, negative_ratio: int = None, type_independent_neg_sample: bool = True) -> None: 47 | super().__init__(relation_name, num_positive, negative_ratio, type_independent_neg_sample) 48 | assert relation_name in RELATION_MAPPING 49 | self.relation_mapping = RELATION_MAPPING 50 | self.train_file = "train.json" 51 | self.dev_file = "dev.json" 52 | self.test_file = "test.json" 53 | 54 | def _create_examples(self, relations: List[Relation], 55 | set_type: SetType, 56 | builder: Builder = TACREDExample.build) -> Iterator[TACREDExample]: 57 | """Creates examples for the training and dev sets.""" 58 | for id, relation in enumerate(relations): 59 | label = self._relation_label(relation['relation']) 60 | if self._positive_relation(label) or self.allow_as_negative(relation): 61 | yield builder(id, relation, label) 62 | 63 | def _create_all_possible_dev_examples(self, 64 | relations: List[Relation], 65 | set_type: SetType) -> Iterator[InputExample]: 66 | """Creates examples of all possible entities for dev sets""" 67 | for id, relation in enumerate(relations): 68 | label = self._relation_label(relation['relation']) 69 | if self._same_entity_types_relation(relation): 70 | yield TACREDExample.build(id, relation, label) 71 | 72 | def _create_search_examples_given_row_ids(self, search_file: str, row_ids: Set[int]) -> Iterator[InputExample]: 73 | with open(search_file, 'r', encoding="utf-8") as f: 74 | reader = csv.reader(f, delimiter='\t') 75 | return [TACREDExample(i, doc[0], self._relation_label(doc[1])) for i, doc in enumerate(reader) if i in row_ids] 76 | 77 | def _create_generation_examples(self, raw_generations: List[str]) -> Iterator[InputExample]: 78 | for i, gen in enumerate(raw_generations): 79 | yield TACREDExample(i, gen.rstrip(), 1) 80 | 81 | def relation_name_adapter(self, relation: str): 82 | return relation 83 | 84 | def _relation_label(self, relation_name: str) -> int: 85 | return 1 if self._positive_relation(relation_name) else 0 86 | 87 | def _positive_relation(self, relation_name: str) -> bool: 88 | return relation_name == self.positive_label 89 | 90 | def allow_as_negative(self, relation: Relation): 91 | return self.type_independent_neg_sample or self._same_entity_types_relation(relation) 92 | 93 | def _same_entity_types_relation(self, relation: Relation) -> bool: 94 | return (relation['subj_type'] in self.relation_mapping[self.positive_label]['subj_type'] and 95 | relation['obj_type'] in self.relation_mapping[self.positive_label]['obj_type']) 96 | 97 | class TACREDInputFeatures(InputFeatures): 98 | def __init__(self, 99 | input_ids, 100 | attention_mask=None, 101 | token_type_ids=None, 102 | markers_mask=None, 103 | example=None, 104 | label=None) -> None: 105 | super().__init__(input_ids, attention_mask, token_type_ids, label) 106 | self.markers_mask = markers_mask 107 | self.title = example.id 108 | self.h = -1 109 | self.t = -1 110 | -------------------------------------------------------------------------------- /classification/evaluation/docred_evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | import os.path 5 | import json 6 | 7 | from classification.docred_config import RELATION_MAPPING 8 | from classification.docred import DocREDUtils 9 | 10 | def 
gen_train_facts(data_file_name, truth_dir): 11 | fact_file_name = data_file_name[data_file_name.find("train_"):] 12 | fact_file_name = os.path.join(truth_dir, fact_file_name.replace(".json", ".fact")) 13 | 14 | if os.path.exists(fact_file_name): 15 | fact_in_train = set([]) 16 | triples = json.load(open(fact_file_name)) 17 | for x in triples: 18 | fact_in_train.add(tuple(x)) 19 | return fact_in_train 20 | 21 | fact_in_train = set([]) 22 | ori_data = json.load(open(data_file_name)) 23 | for data in ori_data: 24 | vertexSet = data['vertexSet'] 25 | for label in data['labels']: 26 | rel = label['r'] 27 | for n1 in vertexSet[label['h']]: 28 | for n2 in vertexSet[label['t']]: 29 | fact_in_train.add((n1['name'], n2['name'], rel)) 30 | 31 | json.dump(list(fact_in_train), open(fact_file_name, "w")) 32 | 33 | return fact_in_train 34 | 35 | def correct_entity_types(relation_object, entities, relation_name): 36 | def get_entity_type(side: str): 37 | return entities[relation_object[side]][0]['type'] 38 | 39 | return get_entity_type('h') in RELATION_MAPPING[relation_name]['e1_type'] and \ 40 | get_entity_type('t') in RELATION_MAPPING[relation_name]['e2_type'] 41 | 42 | def main(args): 43 | relation_id = RELATION_MAPPING[args.relation_name]['id'] 44 | 45 | truth_file = os.path.join(args.gold_dir, args.gold_file) 46 | truth = json.load(open(truth_file)) 47 | 48 | std = {} 49 | std_in_single_sent = {} 50 | tot_evidences = 0 51 | titleset = set([]) 52 | 53 | 54 | for x in truth: 55 | title = x['title'] 56 | titleset.add(title) 57 | 58 | vertexSet = x['vertexSet'] 59 | 60 | for label in x['labels']: 61 | r = label['r'] 62 | 63 | h_idx = label['h'] 64 | t_idx = label['t'] 65 | if r != relation_id: continue 66 | if not correct_entity_types(label, vertexSet, args.relation_name): continue 67 | 68 | std[(title, r, h_idx, t_idx)] = set(label['evidence']) 69 | tot_evidences += len(label['evidence']) 70 | if len(label['evidence']) == 1 and len(DocREDUtils.evidences_with_entities(x, label)) > 0: 71 | std_in_single_sent[(title, r, h_idx, t_idx)] = set(label['evidence']) 72 | 73 | submission_answer_file = args.pred_file 74 | tmp = json.load(open(submission_answer_file)) 75 | if len(tmp) == 0: 76 | if args.output_file: 77 | json.dump({ 78 | "F1": 0.0, 79 | "precision": 0.0, 80 | "recall": 0.0, 81 | "best_F1": 0.0, 82 | "best_precision": 0.0, 83 | "best_recall": 0.0, 84 | "best_confidence": (0.5 - 1e-10)}, 85 | open(args.output_file, 'w')) 86 | return 87 | 88 | tmp.sort(key=lambda x: (x['title'], x['h_idx'], x['t_idx'], x['r'])) 89 | submission_answer = [tmp[0]] 90 | for i in range(1, len(tmp)): 91 | x = tmp[i] 92 | y = tmp[i-1] 93 | if (x['title'], x['h_idx'], x['t_idx'], x['r']) != (y['title'], y['h_idx'], y['t_idx'], y['r']): 94 | submission_answer.append(tmp[i]) 95 | 96 | submission_answer = sorted(submission_answer, key=lambda x: x['c'], reverse=True) 97 | 98 | if len(set([answer['r'] for answer in submission_answer])) != 1: 99 | raise ValueError('Multiple relation predictions were passed') 100 | # This is a must, as we are only adding the "relation_name" relation to the std dict 101 | 102 | scores = eval(args, submission_answer, std_in_single_sent) 103 | # multi_sent_rel_scores = eval(args, submission_answer, std) 104 | 105 | # for k, v in multi_sent_rel_scores.items(): 106 | # scores[f"multi_sent_{k}"] = v 107 | 108 | if args.output_file: 109 | json.dump(scores, open(args.output_file, 'w')) 110 | 111 | return scores 112 | 113 | def eval(args, submission_answer, std): 114 | correct_re = 0 115 | tot_relations = len(std) 116 
| 117 | re_f1, re_p, re_r, best_f1, best_p, best_r, best_confidence = 0, 0, 0, 0, 0, 0, (0.5 - 1e-10) 118 | for i, x in enumerate(submission_answer): 119 | title = x['title'] 120 | h_idx = x['h_idx'] 121 | t_idx = x['t_idx'] 122 | r = x['r'] 123 | confidence = x['c'] 124 | if confidence < args.confidence_threshold: 125 | break 126 | 127 | if (title, r, h_idx, t_idx) in std: 128 | correct_re += 1 129 | 130 | re_p = 1.0 * correct_re / (i+1) 131 | re_r = 1.0 * correct_re / tot_relations 132 | 133 | if re_p+re_r == 0: 134 | re_f1 = 0 135 | else: 136 | re_f1 = 2.0 * re_p * re_r / (re_p + re_r) 137 | 138 | if best_f1 < re_f1: 139 | best_f1 = re_f1 140 | best_p = re_p 141 | best_r = re_r 142 | best_confidence = confidence 143 | 144 | scores = { 145 | "F1": re_f1, 146 | "precision": re_p, 147 | "recall": re_r, 148 | "best_F1": best_f1, 149 | "best_precision": best_p, 150 | "best_recall": best_r, 151 | "best_confidence": best_confidence 152 | } 153 | 154 | return scores 155 | 156 | if __name__ == "__main__": 157 | parser = argparse.ArgumentParser() 158 | parser.add_argument('-gold_dir', '--gold_dir', 159 | type=str, 160 | required=True) 161 | parser.add_argument('-gold_file', '--gold_file', 162 | type=str, 163 | required=True) 164 | parser.add_argument('-pred_file', '--pred_file', 165 | type=str, 166 | required=True) 167 | parser.add_argument('-output_file', '--output_file', 168 | default='evaluation', 169 | type=str) 170 | parser.add_argument('-relation_name', '--relation_name', 171 | type=str, 172 | required=True) 173 | parser.add_argument('-confidence_threshold', '--confidence_threshold', 174 | type=float, 175 | default=0) 176 | args = parser.parse_args() 177 | 178 | main(args) -------------------------------------------------------------------------------- /classification/test_docred.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from classification.docred import DocREDUtils, DocREDProcessor, DocREDExample 5 | 6 | with open('classification/stubs/docred/fake_truth.json', "r", encoding="utf-8") as f: 7 | docs = list(json.load(f)) 8 | 9 | doc1, doc2, doc3, doc4 = docs 10 | 11 | # Tests using this variable require the true path to the data files. 
12 | DATA_DIR = 'data/DocRED/' 13 | 14 | class TestDocREDUtils: 15 | def test_sents_entities_share(self): 16 | entities_sents = DocREDUtils.sents_entities_share(doc1, doc1['labels'][0]) 17 | assert entities_sents == [0, 1] 18 | entities_sents = DocREDUtils.sents_entities_share(doc2, doc2['labels'][0]) 19 | assert entities_sents == [0] 20 | entities_sents = DocREDUtils.sents_entities_share(doc3, doc3['labels'][0]) 21 | assert entities_sents == [0, 1] 22 | entities_sents = DocREDUtils.sents_entities_share(doc3, doc3['labels'][1]) 23 | assert entities_sents == [0, 1] 24 | 25 | def test__sents_entities_and_evidence_share(self): 26 | entities_sents = DocREDUtils.sents_entities_share(doc1, doc1['labels'][0]) 27 | entities_and_evidence_sents = DocREDUtils._sents_entities_and_evidence_share(doc1['labels'][0], entities_sents) 28 | assert entities_and_evidence_sents == [1] 29 | entities_sents = DocREDUtils.sents_entities_share(doc2, doc2['labels'][0]) 30 | entities_and_evidence_sents = DocREDUtils._sents_entities_and_evidence_share(doc2['labels'][0], entities_sents) 31 | assert entities_and_evidence_sents == [0] 32 | entities_sents = DocREDUtils.sents_entities_share(doc3, doc3['labels'][0]) 33 | entities_and_evidence_sents = DocREDUtils._sents_entities_and_evidence_share(doc3['labels'][0], entities_sents) 34 | assert entities_and_evidence_sents == [0, 1] 35 | 36 | def test_entity_from_entity_id_passes(self): 37 | entity_list = DocREDUtils.entity_from_entity_id(doc1['vertexSet'], doc1['labels'][0]['h'], 0) 38 | assert entity_list == [{'name': 'Microsoft', 'pos': [0, 1], 'sent_id': 0, 'type': 'ORG'}, 39 | {'name': 'MS', 'pos': [3, 4], 'sent_id': 0, 'type': 'ORG'}] 40 | entity_list = DocREDUtils.entity_from_entity_id(doc1['vertexSet'], doc1['labels'][0]['h'], 1) 41 | assert entity_list[0] == {'name': 'Micro', 'pos': [0, 1], 'sent_id': 1, 'type': 'ORG'} 42 | 43 | def test_entities_by_sent_id(self): 44 | assert DocREDUtils.entities_by_sent_id(doc3['vertexSet']) == {0: {0, 1}, 1: {0, 1}} 45 | 46 | def test_relations_by_entities(self): 47 | assert DocREDUtils.relations_by_entities(doc3['labels']) == \ 48 | {(0, 1): [{'r': 'P26', 'h': 0, 't': 1, 'evidence': [0, 1]}], 49 | (1, 0): [{'r': 'P26', 'h': 1, 't': 0, 'evidence': [0, 1]}]} 50 | 51 | class TestDocREDProcessor: 52 | def test__same_entity_types_relation(self): 53 | processor = DocREDProcessor('founded_by') 54 | assert processor._same_entity_types_relation(doc1['labels'][0], doc1['vertexSet']) 55 | processor = DocREDProcessor('father') 56 | assert processor._same_entity_types_relation(doc2['labels'][0], doc2['vertexSet']) 57 | processor = DocREDProcessor('spouse') 58 | assert processor._same_entity_types_relation(doc3['labels'][0], doc3['vertexSet']) 59 | assert not processor._same_entity_types_relation(doc1['labels'][0], doc1['vertexSet']) 60 | 61 | def test__same_entity_types_relation_switched_h_and_t(self): 62 | processor = DocREDProcessor('founded_by') 63 | relation = doc1['labels'][0] 64 | head_is_tail = {'r': relation['r'], 'h': relation['t'], 't': relation['h'], 'evidence': relation['evidence']} 65 | assert not processor._same_entity_types_relation(head_is_tail, doc1['vertexSet']) 66 | 67 | def test__same_entity_types_relation_wrong_relation(self): 68 | processor = DocREDProcessor('inception') 69 | assert not processor._same_entity_types_relation(doc1['labels'][0], doc1['vertexSet']) 70 | 71 | def test_create_all_possible_dev_examples_doc1(self): 72 | processor = DocREDProcessor('founded_by') 73 | data = 
list(processor._create_all_possible_dev_examples([doc1], None)) 74 | assert len(data) == 2 75 | assert data[0].evidence == 0 76 | assert data[0].h == 0 77 | assert data[0].t == 1 78 | assert data[0].label == 'NOTA' 79 | 80 | assert data[1].evidence == 1 81 | assert data[1].h == 0 82 | assert data[1].t == 1 83 | assert data[1].label == 'founded_by' 84 | 85 | def test_create_all_possible_dev_examples_doc2(self): 86 | processor = DocREDProcessor('father') 87 | data = list(processor._create_all_possible_dev_examples([doc2], None)) 88 | assert len(data) == 2 89 | assert data[0].evidence == 0 90 | assert data[0].h == 0 91 | assert data[0].t == 1 92 | assert data[0].label == 'father' 93 | 94 | assert data[1].evidence == 0 95 | assert data[1].h == 1 96 | assert data[1].t == 0 97 | assert data[1].label == 'NOTA' 98 | 99 | def test_create_all_possible_dev_examples_doc3(self): 100 | processor = DocREDProcessor('spouse') 101 | data = list(processor._create_all_possible_dev_examples([doc3], None)) 102 | assert len(data) == 4 103 | assert data[0].evidence == 0 104 | assert data[0].h == 0 105 | assert data[0].t == 1 106 | assert data[0].label == 'spouse' 107 | 108 | assert data[1].evidence == 0 109 | assert data[1].h == 1 110 | assert data[1].t == 0 111 | assert data[1].label == 'spouse' 112 | 113 | assert data[2].evidence == 1 114 | assert data[2].h == 0 115 | assert data[2].t == 1 116 | assert data[2].label == 'spouse' 117 | 118 | assert data[3].evidence == 1 119 | assert data[3].h == 1 120 | assert data[3].t == 0 121 | assert data[3].label == 'spouse' 122 | 123 | def test_create_all_possible_dev_examples_doc4(self): 124 | processor = DocREDProcessor('founded_by') 125 | data = list(processor._create_all_possible_dev_examples([doc4], None)) 126 | assert len(data) == 1 127 | assert data[0].evidence == 0 128 | assert data[0].h == 0 129 | assert data[0].t == 1 130 | assert data[0].label == 'founded_by' 131 | 132 | def test_get_all_possible_eval_examples_check_positives(self): 133 | processor = DocREDProcessor('founded_by') 134 | data = processor.get_all_possible_eval_examples(DATA_DIR, 'full_test_eval') 135 | relations = [d for d in data if d.label == 'founded_by'] 136 | distinct = list(set(relations)) 137 | assert len(relations) == len(distinct) 138 | 139 | def test_get_all_possible_dev_eval_examples_check_positives_num_examples(self): 140 | processor = DocREDProcessor('founded_by') 141 | data = processor.get_all_possible_eval_examples(DATA_DIR, 'full_dev_eval') 142 | in_relation = [d for d in data if d.label == 'founded_by'] 143 | assert len(data) == 2228 144 | assert len(in_relation) == 9 145 | 146 | def test_get_all_possible_test_eval_examples_check_positives_num_examples(self): 147 | processor = DocREDProcessor('founded_by') 148 | data = processor.get_all_possible_eval_examples(DATA_DIR, 'full_test_eval') 149 | in_relation = [d for d in data if d.label == 'founded_by'] 150 | assert len(data) == 3895 151 | assert len(in_relation) == 20 152 | 153 | class TestDocREDExample: 154 | def test_init(self): 155 | example = DocREDExample(0, doc1, doc1['labels'][0], 1, True) 156 | assert example.evidence == 1 157 | assert example.h == 0 158 | assert example.label 159 | assert example.t == 1 160 | assert example.text == "[E1] Micro [/E1] was founded by [E2] Paul [/E2]" 161 | assert example.title == 0 -------------------------------------------------------------------------------- /scripts/generation_preprocess/create_tacred_datafiles.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | 7 | from relation_canonical_form import CANONICAL_FORMS, PREDICATES 8 | 9 | from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer 10 | 11 | CLEANINGMAP = {'-RRB-': ')', '-LRB-': '(', '-LSB-': '[', 12 | '-RSB-': ']', '-LCB-': '{', '-RCB-': '}', 13 | ' ': ' ', '"': "'", '--': '-', '---': '-'} 14 | 15 | DISALLOWED_PRONOUNS = {"me", "us", "you", "her", "him", "it", "them", "my", "our", "your", "her", "his", "their"} 16 | 17 | MODEL_CLASSES = { 18 | 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), 19 | } 20 | START_SUBJ = '<|subj|>' 21 | END_SUBJ = '<|/subj|>' 22 | START_OBJ = '<|obj|>' 23 | END_OBJ = '<|/obj|>' 24 | START_TRIGGER = '<|trigger|>' 25 | END_TRIGGER = '<|/trigger|>' 26 | GO = '<|GO|>' 27 | E1 = '<|E1|>' 28 | END_E1 = '<|\E1|>' 29 | E2 = '<|E2|>' 30 | END_E2 = '<|\E2|>' 31 | 32 | 33 | NO_RELATION = "no_relation" 34 | 35 | RELATIONS_TO_LEAVE_OUT = ["per:children", "org:founded_by", "org:country_of_headquarters", "per:religion", "per:spouse", "per:origin", "per:date_of_death", "per:city_of_death"] 36 | 37 | def main(args): 38 | SPECIAL_TOKENS = [GO] 39 | if args.mark_relation_args: 40 | SPECIAL_TOKENS += [START_SUBJ, END_SUBJ, START_OBJ, END_OBJ, START_TRIGGER, END_TRIGGER] 41 | elif args.anonymize_tgt: 42 | SPECIAL_TOKENS += [E1, END_E1, E2, END_E2] 43 | 44 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 45 | config = config_class.from_pretrained( 46 | args.config_name if args.config_name else args.model_name_or_path 47 | ) 48 | tokenizer = tokenizer_class.from_pretrained( 49 | args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 50 | do_lower_case=args.do_lower_case 51 | ) 52 | if args.block_size <= 0: 53 | args.block_size = ( 54 | tokenizer.max_len_single_sentence 55 | ) # Our input block size will be the max possible for the model 56 | args.block_size = min(args.block_size, tokenizer.max_len_single_sentence) 57 | 58 | if args.local_rank == 0: 59 | torch.distributed.barrier() # End of barrier to make sure only the first process in distributed training download model & vocab 60 | 61 | # Add Special Tokens 62 | tokenizer.add_special_tokens({'additional_special_tokens': SPECIAL_TOKENS}) 63 | 64 | assert os.path.isfile(args.file_path) 65 | 66 | with open(args.file_path, encoding="utf-8") as f: 67 | parsed_json = json.load(f) 68 | 69 | srcs = [] 70 | if not args.src_and_tgt_one_file_with_go: 71 | tgts = [] 72 | for relation_dict in tqdm(parsed_json): 73 | if relation_dict['relation'] == NO_RELATION and not args.allow_no_relation: 74 | continue 75 | 76 | if leave_some_relations_out(relation_dict['relation']): 77 | continue 78 | 79 | subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx = [relation_dict[key] for key in ['subj_start', 'subj_end', 'obj_start', 'obj_end']] 80 | 81 | subj = " ".join(relation_dict['token'][subj_start_idx : subj_end_idx + 1]) 82 | obj = " ".join(relation_dict['token'][obj_start_idx : obj_end_idx + 1]) 83 | example_text = relation_dict['token'] 84 | 85 | if skip_disallowed_pronouns(subj, obj): 86 | continue 87 | 88 | if args.mark_relation_args: 89 | example_text = mark_args(example_text, subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx) 90 | elif args.truncate_noise: 91 | example_text = truncate_noise(example_text, subj_start_idx, subj_end_idx + 1, obj_start_idx, obj_end_idx + 1) 92 | 93 | cleaned_example = 
clean_token(example_text) 94 | tgt = " ".join(cleaned_example) 95 | if args.anonymize_tgt: 96 | tgt = anonymize(tgt, subj, obj) 97 | 98 | relation_name = relation_dict['relation'] 99 | if args.one_form_per_relation: 100 | relation_contexts = [CANONICAL_FORMS[relation_name][0]] 101 | else: 102 | relation_contexts = CANONICAL_FORMS[relation_name] 103 | for relation_context in relation_contexts: 104 | if args.anonymize_tgt: 105 | src = relation_context.replace("{subj}", f"{E1} {subj} {END_E1}").replace("{obj}", f"{E2} {obj} {END_E2}") 106 | else: 107 | src = relation_context.replace("{subj}", subj).replace("{obj}", obj) 108 | src = specific_predicate_for_relation(src, tgt, relation_name) 109 | 110 | if src is None: 111 | continue 112 | 113 | if args.src_and_tgt_one_file_with_go: 114 | srcs.append(src + f" {GO} " + tgt+'\n') 115 | else: 116 | srcs.append(src+'\n') 117 | tgts.append(tgt+'\n') 118 | 119 | with open(args.save_to_file+'.src', 'w') as f: f.writelines(srcs) 120 | if not args.src_and_tgt_one_file_with_go: 121 | with open(args.save_to_file+'.tgt', 'w') as f: f.writelines(tgts) 122 | 123 | with open(args.save_to_file+'.special_tokens', 'w') as f: f.writelines(f"{t}\n" for t in SPECIAL_TOKENS) 124 | 125 | def specific_predicate_for_relation(src, tgt, relation_name): 126 | if "{predicate}" not in src: 127 | return src 128 | predicate = PREDICATES[relation_name]['default'] 129 | lowered_tgt = tgt.lower() 130 | for t in PREDICATES[relation_name]: 131 | if t in lowered_tgt: 132 | predicate = t 133 | break 134 | if predicate is None: 135 | return None 136 | return src.replace("{predicate}", predicate) 137 | 138 | def mark_args(text, subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx): 139 | if obj_end_idx > subj_end_idx: 140 | text.insert(obj_end_idx + 1, END_OBJ) 141 | text.insert(obj_start_idx, START_OBJ) 142 | text.insert(subj_end_idx + 1, END_SUBJ) 143 | text.insert(subj_start_idx, START_SUBJ) 144 | else: 145 | text.insert(subj_end_idx + 1, END_SUBJ) 146 | text.insert(subj_start_idx, START_SUBJ) 147 | text.insert(obj_end_idx + 1, END_OBJ) 148 | text.insert(obj_start_idx, START_OBJ) 149 | 150 | return text 151 | 152 | def anonymize(text, e1, e2): 153 | return text.replace(e1, E1).replace(e2, E2) 154 | 155 | def truncate_noise(example_text, subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx): 156 | padding = 0 157 | min_token_position = min(subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx) 158 | min_token_position = max(min_token_position - padding, 0) 159 | 160 | max_token_position = max(subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx) 161 | max_token_position = min(max_token_position + padding, len(example_text)) 162 | 163 | return example_text[min_token_position:max_token_position] 164 | 165 | def leave_some_relations_out(relation): 166 | return relation in RELATIONS_TO_LEAVE_OUT 167 | 168 | def skip_disallowed_pronouns(subj, obj): 169 | return subj.lower() in DISALLOWED_PRONOUNS or obj.lower() in DISALLOWED_PRONOUNS 170 | 171 | def clean_token(tokens): 172 | return [CLEANINGMAP.get(t, t) for t in tokens] 173 | 174 | 175 | if __name__ == '__main__': 176 | parser = argparse.ArgumentParser() 177 | parser.add_argument("--file_path", default=None, type=str, required=True) 178 | parser.add_argument("--save_to_file", default=None, type=str, required=True) 179 | parser.add_argument("--anonymize_tgt", action='store_true') 180 | parser.add_argument("--mark_relation_args", action='store_true') 181 | parser.add_argument("--allow_no_relation", action='store_true') 182 | 
parser.add_argument("--truncate_noise", action='store_true') 183 | parser.add_argument("--one_form_per_relation", action='store_true') 184 | parser.add_argument("--src_and_tgt_one_file_with_go", action='store_true') 185 | 186 | args = parser.parse_args() 187 | args.model_type = 'gpt2' 188 | args.model_name_or_path = 'gpt2' 189 | args.config_name = "" 190 | args.tokenizer_name = "" 191 | args.do_lower_case = False 192 | args.block_size = 512 193 | args.local_rank = -1 194 | main(args) -------------------------------------------------------------------------------- /generation_outputs/switch_entities_of_gens.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import product 3 | from random import sample 4 | import re 5 | from tqdm import tqdm 6 | 7 | PERSONAL_PRONOUNS_TO_KEEP = ['he', 'she'] 8 | POSSESIVE_PRONOUNS_TO_KEEP = ['his', 'her'] 9 | ENTITY_TYPES = { 10 | 'country_of_headquarters': ['organization', 'country', 'organization', 'country'], 11 | 'children': ['person', 'person', 'person', 'person'], 12 | 'city_of_death': ['person', 'city', 'city', None], 13 | 'date_of_death': ['person', 'date', 'person', None], 14 | 'founded_by': ['organization', 'person', 'person', 'organization'], 15 | 'origin-country': ['person', 'country', 'person', 'country'], 16 | 'origin-nationality': ['person', 'nationality', 'person', 'nationality'], 17 | 'docred-origin-country': ['organization', 'country', 'organization', 'country'], 18 | 'docred-origin-nationality': ['organization', 'nationality', 'organization', 'nationality'], 19 | 'religion': ['person', 'religion', 'person', 'religion'], 20 | 'docred-religion': ['organization', 'religion', 'organization', 'religion'], 21 | 'spouse': ['person', 'person', 'person', 'person'], 22 | } 23 | 24 | def main(args): 25 | entity_types = ENTITY_TYPES[args.relation] 26 | with open(args.gen_file, 'r') as f: 27 | gens = f.readlines() 28 | gens = [g for g in gens if g!= '\n'] 29 | 30 | e1s = get_similar_entities(entity_types[0]) 31 | e2s = get_similar_entities(entity_types[1]) 32 | e3s = get_similar_entities(entity_types[2]) 33 | e4s = get_similar_entities(entity_types[3]) 34 | 35 | with open(args.gen_file.split('.txt')[0]+'_new_ents.txt', 'w') as f: 36 | for i, gen in tqdm(enumerate(gens)): 37 | assert gen.count('[') == gen.count(']'), gen 38 | # E1 - PERSON/ORGANIZATION 39 | subbed = switch_entity_but_not_pronouns(1, gen, e1s) 40 | # E2 41 | if entity_types[1] == 'date': 42 | subbed = switch_dates(2, subbed, e2s) 43 | elif entity_types[1] == 'religion': 44 | subbed = switch_religions(2, subbed, e2s) 45 | else: 46 | subbed = switch_entity_but_not_pronouns(2, subbed, e2s) 47 | # E3 - PERSON/ORGANIZATION 48 | if entity_types[2]: 49 | for e in re.findall('\[E3\] (.*?) \[\/E3\]', subbed): 50 | if e in PERSONAL_PRONOUNS_TO_KEEP+POSSESIVE_PRONOUNS_TO_KEEP: 51 | subbed = re.sub(f'\[E3\] {e} \[\/E3\]', e, subbed) 52 | else: 53 | subbed = re.sub(f'\[E3\] {e} \[\/E3\]', sample(e3s, 1)[0], subbed) 54 | # E4 55 | if entity_types[3]: 56 | for e in re.findall('\[E4\] (.*?) 
\[\/E4\]', subbed): 57 | if e in PERSONAL_PRONOUNS_TO_KEEP+POSSESIVE_PRONOUNS_TO_KEEP: 58 | subbed = re.sub(f'\[E4\] {e} \[\/E4\]', e, subbed) 59 | elif entity_types[3] == 'religion': 60 | subbed = switch_religions(4, subbed, e4s, keep_markers=False) 61 | else: 62 | subbed = re.sub(f'\[E4\] {e} \[\/E4\]', sample(e4s, 1)[0], subbed) 63 | if subbed == gen: 64 | print(f"Warning, generation didn't change: {gen}") 65 | f.write(subbed) 66 | 67 | def switch_entity_but_not_pronouns(ent_num, gen, ents): 68 | E = f"E{ent_num}" 69 | found_pronouns = re.findall(f"\[{E}\] ({'|'.join(PERSONAL_PRONOUNS_TO_KEEP+POSSESIVE_PRONOUNS_TO_KEEP)}) \[\/{E}\]", gen, flags=re.IGNORECASE) 70 | if found_pronouns: 71 | for p in re.findall(f"\[{E}\] ({'|'.join(PERSONAL_PRONOUNS_TO_KEEP)}) \[\/{E}\]", gen, flags=re.IGNORECASE): 72 | gen = re.sub(f'\[{E}\] ({p}) \[\/{E}\]', f'[{E}] {sample(PERSONAL_PRONOUNS_TO_KEEP, 1)[0]} [/{E}]', gen, flags=re.IGNORECASE) 73 | for p in re.findall(f"\[{E}\] ({'|'.join(POSSESIVE_PRONOUNS_TO_KEEP)}) \[\/{E}\]", gen, flags=re.IGNORECASE): 74 | gen = re.sub(f'\[{E}\] ({p}) \[\/{E}\]', f'[{E}] {sample(POSSESIVE_PRONOUNS_TO_KEEP, 1)[0]} [/{E}]', gen, flags=re.IGNORECASE) 75 | else: 76 | gen = re.sub(f'\[{E}\] (.*?) \[\/{E}\]', f'[{E}] {sample(ents, 1)[0]} [/{E}]', gen) 77 | 78 | return gen 79 | 80 | def switch_religions(ent_num, subbed, ents, keep_markers=True): 81 | E = f"E{ent_num}" 82 | if keep_markers: 83 | subbed = re.sub(f"\[{E}\] Religion \[\/{E}\]", f"[{E}] {sample(ents['religion'], 1)[0]} [/{E}]", subbed) 84 | subbed = re.sub(f"\[{E}\] Religious Affiliation \[\/{E}\]", f"[{E}] {sample(ents['religious_affiliation'], 1)[0]} [/{E}]", subbed) 85 | subbed = re.sub(f"\[{E}\] Religious Relation \[\/{E}\]", f"[{E}] {sample(ents['religious_relation'], 1)[0]} [/{E}]", subbed) 86 | subbed = re.sub(f"\[{E}\] Religious Affiliation plural \[\/{E}\]", f"[{E}] {sample(ents['religious_affiliation_plural'], 1)[0]} [/{E}]", subbed) 87 | else: 88 | subbed = re.sub(f"\[{E}\] Religion \[\/{E}\]", sample(ents['religion'], 1)[0], subbed) 89 | subbed = re.sub(f"\[{E}\] Religious Affiliation \[\/{E}\]", sample(ents['religious_affiliation'], 1)[0], subbed) 90 | subbed = re.sub(f"\[{E}\] Religious Relation \[\/{E}\]", sample(ents['religious_relation'], 1)[0], subbed) 91 | subbed = re.sub(f"\[{E}\] Religious Affiliation plural \[\/{E}\]", sample(ents['religious_affiliation_plural'], 1)[0], subbed) 92 | return subbed 93 | 94 | def switch_dates(ent_num, subbed, ents): 95 | # changing just november 7. 96 | #TODO, pretty sure this is deprecated. Why only November 7? 97 | E = f"E{ent_num}" 98 | subbed = re.sub(f'\[{E}\] November 7 \[\/{E}\]', f'[{E}] {sample(ents, 1)[0]} [/{E}]', subbed) 99 | subbed = re.sub(f'\[{E}\] Nov 7 \[\/{E}\]', f'[{E}] {sample(ents, 1)[0]} [/{E}]', subbed) 100 | subbed = re.sub(f'\[{E}\] Nov\. 
7 \[\/{E}\]', f'[{E}] {sample(ents, 1)[0]} [/{E}]', subbed) 101 | return subbed 102 | def get_similar_entities(entity_type): 103 | if entity_type is None: 104 | return None 105 | elif entity_type == 'date': 106 | return dates() 107 | elif entity_type == 'city': 108 | return cities() 109 | elif entity_type == 'nationality': 110 | return nationalities() 111 | elif entity_type == 'religion': 112 | return religions() 113 | else: 114 | return read_ents_from_file(entity_type) 115 | 116 | def read_ents_from_file(entity_type): 117 | with open(f'generation_outputs/types/{entity_type}.txt', 'r') as f: 118 | ents = f.readlines() 119 | ents = [e.rstrip() for e in ents] 120 | return ents 121 | 122 | def religions(): 123 | religions = { 124 | 'religion': ["Atheism", "Scientology", "Islam", "Christianity"], 125 | 'religious_relation': ["Evangelical", "Islamic", "Christian", "Jewish", "Catholic"], 126 | 'religious_affiliation': ["Methodist", "Separatist", "Jew", "Christian", "Sunni", "Secular", "Fundamentalist", "Christianist", "Anglican", "Orthodox", "Islamist", "Muslim"], 127 | } 128 | religions['religious_affiliation_plural'] = [f"{x}s" for x in religions['religious_affiliation']] 129 | 130 | return religions 131 | 132 | def dates(): 133 | months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] 134 | return [' '.join([x[0], str(x[1])]) for x in product(months, range(1, 29))] # days 1-28 are valid in every month 135 | 136 | def cities(): 137 | return ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose", "Austin", "Jacksonville", "Fort Worth", "Columbus", "San Francisco", "Charlotte", "Indianapolis", "Seattle", "Denver", "Washington", "Boston", "El Paso", "Detroit", "Nashville", "Portland", "Memphis", "Oklahoma City", "Las Vegas", "Louisville", "Baltimore", "Milwaukee", "Albuquerque", "Tucson", "Fresno", "Mesa", "Sacramento", "Atlanta", "Kansas City", "Colorado Springs", "Miami", "Raleigh", "Omaha", "Long Beach", "Virginia Beach", "Oakland", "Minneapolis", "Tulsa", "Arlington", "Tampa", "New Orleans"] 138 | 139 | def nationalities(): 140 | return ["British", "English", "Scottish", "Gaelic", "Irish", "Welsh", "Danish", "Finnish", "Norwegian", "Swedish", "Swiss", "German", "French", "Italian", "Estonian", "Latvian", "Lithuanian", "Austrian", "Belgian", "Flemish", "Dutch", "American", "Canadian", "Mexican", "Spanish", "Ukrainian", "Russian", "Belarusian", "Polish", "Czech", "Slovak", "Slovakian", "Hungarian", "Romanian", "Bulgarian", "Greek", "Brazilian", "Portuguese", "Australian", "New Zealander", "Maori", "Georgian", "Israeli", "Hebrew", "Egyptian", "Arabic", "Turkish", "Chinese", "Mandarin", "Korean", "Japanese", "Indian", "Hindi", "South African", "Afrikaans"] 141 | 142 | if __name__ == "__main__": 143 | parser = argparse.ArgumentParser() 144 | parser.add_argument("--gen_file", type=str, required=True) 145 | parser.add_argument("--relation", type=str, required=True) 146 | args = parser.parse_args() 147 | main(args) -------------------------------------------------------------------------------- /run_generation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """ Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet) 18 | """ 19 | 20 | 21 | import argparse 22 | import logging 23 | import os 24 | 25 | import numpy as np 26 | import torch 27 | 28 | from transformers import ( 29 | CTRLLMHeadModel, 30 | CTRLTokenizer, 31 | GPT2LMHeadModel, 32 | GPT2Tokenizer, 33 | OpenAIGPTLMHeadModel, 34 | OpenAIGPTTokenizer, 35 | TransfoXLLMHeadModel, 36 | TransfoXLTokenizer, 37 | XLMTokenizer, 38 | XLMWithLMHeadModel, 39 | XLNetLMHeadModel, 40 | XLNetTokenizer, 41 | ) 42 | 43 | 44 | logging.basicConfig( 45 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, 46 | ) 47 | logger = logging.getLogger(__name__) 48 | 49 | MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop 50 | MAX_BATCH = 100 51 | 52 | MODEL_CLASSES = { 53 | "gpt2": (GPT2LMHeadModel, GPT2Tokenizer), 54 | "ctrl": (CTRLLMHeadModel, CTRLTokenizer), 55 | "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), 56 | "xlnet": (XLNetLMHeadModel, XLNetTokenizer), 57 | "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer), 58 | "xlm": (XLMWithLMHeadModel, XLMTokenizer), 59 | } 60 | 61 | # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia 62 | # in https://github.com/rusiaaman/XLNet-gen#methodology 63 | # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e 64 | PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family 65 | (except for Alexei and Maria) are discovered. 66 | The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the 67 | remainder of the story. 1883 Western Siberia, 68 | a young Grigori Rasputin is asked by his father and a group of men to perform magic. 69 | Rasputin has a vision and denounces one of the men as a horse thief. Although his 70 | father initially slaps him for making such an accusation, Rasputin watches as the 71 | man is chased outside and beaten. Twenty years later, Rasputin sees a vision of 72 | the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, 73 | with people, even a bishop, begging for his blessing. """ 74 | 75 | 76 | def set_seed(args): 77 | np.random.seed(args.seed) 78 | torch.manual_seed(args.seed) 79 | if args.n_gpu > 0: 80 | torch.cuda.manual_seed_all(args.seed) 81 | 82 | 83 | # 84 | # Functions to prepare models' input 85 | # 86 | 87 | 88 | def prepare_ctrl_input(args, _, tokenizer, prompt_text): 89 | if args.temperature > 0.7: 90 | logger.info("CTRL typically works better with lower temperatures (and lower top_k).") 91 | 92 | encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False) 93 | if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()): 94 | logger.info("WARNING! 
You are not starting your generation from a control code so you won't get good results") 95 | return prompt_text 96 | 97 | 98 | def prepare_xlm_input(args, model, tokenizer, prompt_text): 99 | # kwargs = {"language": None, "mask_token_id": None} 100 | 101 | # Set the language 102 | use_lang_emb = hasattr(model.config, "use_lang_emb") and model.config.use_lang_emb 103 | if hasattr(model.config, "lang2id") and use_lang_emb: 104 | available_languages = model.config.lang2id.keys() 105 | if args.xlm_language in available_languages: 106 | language = args.xlm_language 107 | else: 108 | language = None 109 | while language not in available_languages: 110 | language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ") 111 | # kwargs["language"] = tokenizer.lang2id[language] 112 | 113 | # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers 114 | # XLM masked-language modeling (MLM) models need masked token 115 | # is_xlm_mlm = "mlm" in args.model_name_or_path 116 | # if is_xlm_mlm: 117 | # kwargs["mask_token_id"] = tokenizer.mask_token_id 118 | 119 | return prompt_text 120 | 121 | 122 | def prepare_xlnet_input(args, _, tokenizer, prompt_text): 123 | prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text 124 | return prompt_text 125 | 126 | 127 | def prepare_transfoxl_input(args, _, tokenizer, prompt_text): 128 | prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text 129 | return prompt_text 130 | 131 | 132 | PREPROCESSING_FUNCTIONS = { 133 | "ctrl": prepare_ctrl_input, 134 | "xlm": prepare_xlm_input, 135 | "xlnet": prepare_xlnet_input, 136 | "transfo-xl": prepare_transfoxl_input, 137 | } 138 | 139 | 140 | def adjust_length_to_model(length, max_sequence_length): 141 | if length < 0 and max_sequence_length > 0: 142 | length = max_sequence_length 143 | elif 0 < max_sequence_length < length: 144 | length = max_sequence_length # No generation bigger than model size 145 | elif length < 0: 146 | length = MAX_LENGTH # avoid infinite loop 147 | return length 148 | 149 | 150 | def main(): 151 | parser = argparse.ArgumentParser() 152 | parser.add_argument( 153 | "--model_type", 154 | default=None, 155 | type=str, 156 | required=True, 157 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), 158 | ) 159 | parser.add_argument( 160 | "--model_name_or_path", 161 | default=None, 162 | type=str, 163 | required=True, 164 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()), 165 | ) 166 | 167 | parser.add_argument("--prompt", type=str, default="", required=True) 168 | parser.add_argument("--out_file", type=str, default="") 169 | parser.add_argument("--length", type=int, default=50) 170 | parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") 171 | 172 | parser.add_argument( 173 | "--temperature", 174 | type=float, 175 | default=1.0, 176 | help="temperature of 1.0 has no effect, lower tends toward greedy sampling", 177 | ) 178 | parser.add_argument( 179 | "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2" 180 | ) 181 | parser.add_argument("--k", type=int, default=0) 182 | parser.add_argument("--p", type=float, default=0.9) 183 | parser.add_argument("--num_return_sequences", type=int, default=1) 184 | 185 | parser.add_argument("--padding_text", type=str, default="", help="Padding 
text for Transfo-XL and XLNet.") 186 | parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.") 187 | 188 | parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") 189 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 190 | args = parser.parse_args() 191 | 192 | args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 193 | args.n_gpu = torch.cuda.device_count() 194 | 195 | set_seed(args) 196 | 197 | # Initialize the model and tokenizer 198 | try: 199 | args.model_type = args.model_type.lower() 200 | model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 201 | except KeyError: 202 | raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)".format(args.model_type)) 203 | 204 | tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) 205 | model = model_class.from_pretrained(args.model_name_or_path) 206 | model.to(args.device) 207 | 208 | args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings) 209 | logger.info(args) 210 | 211 | prompt_text = args.prompt if args.prompt else input("Model prompt >>> ") 212 | 213 | # Different models need different input formatting and/or extra arguments 214 | requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys() 215 | if requires_preprocessing: 216 | prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type) 217 | prompt_text = prepare_input(args, model, tokenizer, prompt_text) 218 | encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=True, return_tensors="pt") 219 | encoded_prompt = encoded_prompt.to(args.device) 220 | 221 | samples_splits = args.num_return_sequences//MAX_BATCH * [MAX_BATCH] 222 | if args.num_return_sequences%MAX_BATCH > 0: samples_splits.append(args.num_return_sequences%MAX_BATCH) 223 | 224 | if args.out_file: 225 | file_path = os.path.join(args.model_name_or_path, args.out_file) 226 | out_file = open(file_path, "w") 227 | 228 | for curr_samples in samples_splits: 229 | output_sequences = model.generate( 230 | input_ids=encoded_prompt, 231 | max_length=args.length, 232 | temperature=args.temperature, 233 | top_k=args.k, 234 | top_p=args.p, 235 | do_sample=args.k > 1 or args.p < 1.0, 236 | repetition_penalty=args.repetition_penalty, 237 | num_return_sequences=curr_samples, 238 | ) 239 | 240 | generated_sequence = output_sequences[0, :, encoded_prompt.size(1):].tolist() 241 | 242 | texts = [] 243 | for seq in generated_sequence: 244 | text = tokenizer.decode(seq, clean_up_tokenization_spaces=True) 245 | if args.stop_token and args.stop_token in text: text = text[: text.find(args.stop_token)] 246 | if '\n' in text: text = text[: text.find('\n')] # guard the find, so a missing newline doesn't chop the last character 247 | texts.append(text) 248 | 249 | texts = '\n'.join(texts) 250 | 251 | if args.out_file: 252 | out_file.write(texts + '\n') 253 | print(f"Generations written to: {file_path}") 254 | else: 255 | print(texts) 256 | 257 | if args.out_file: 258 | out_file.close() 259 | 260 | 261 | 262 | if __name__ == "__main__": 263 | main() 264 | -------------------------------------------------------------------------------- /scripts/generation_preprocess/relation_canonical_form.py: -------------------------------------------------------------------------------- 1 | CANONICAL_FORMS = { 2 | "no_relation": ["NR"], 3 | "org:founded_by": ["{obj}, founder of {subj} .", 4 | "{obj}, who established {subj} .", 5 | "{subj}, founded by {obj} .", 6 | "{obj} was the 
founder of {subj} .", 7 | "{subj} founder {obj} ."], 8 | "per:employee_of": ["{subj} is an employee of {obj} .", 9 | "{subj} is the {predicate} of {obj} .", 10 | "{obj}'s {predicate} {subj} .", 11 | "{subj}, the {predicate} of {obj} .", 12 | "{subj}, {obj}'s {predicate} .", 13 | "{subj}, {predicate} of {obj} ." 14 | "{obj} joined {subj} ."], 15 | "org:alternate_names": ["{subj} known as {obj} .", 16 | "{subj}, formally known as {obj} .", 17 | "{subj}, then called {obj} .", 18 | "{subj} or {obj} ."], 19 | "per:cities_of_residence": ["{subj} lived in {obj} .", 20 | "{subj} moved to {obj} .", 21 | "{subj}'s home in {obj} .", 22 | "{subj} grew up in {obj} .", 23 | "{subj} who lived in {obj} ."], 24 | "per:children": ["{subj}'s child is {obj} .", 25 | "{subj}'s {predicate} is {obj} ."], 26 | "per:title": ["{subj} is a {obj} ."], 27 | "per:siblings": ["{subj}'s sibling is {obj} .", 28 | "{subj}'s {predicate}, {obj} .", 29 | "{obj}, {subj}'s {predicate} ."], 30 | "per:religion": ["{subj}, a {obj} .", 31 | "{subj} is {obj} person ."], 32 | "per:age": ["{subj} is {obj} years old .", 33 | "{subj} dies at age {obj} .", 34 | "{subj}, aged {obj} .", 35 | "{subj} reached the age of {obj} ."], 36 | "org:website": ["Find {subj} online in {obj} ."], 37 | "per:stateorprovinces_of_residence": ["{subj} lived in {obj} .", 38 | "{subj} moved to {obj} .", 39 | "{subj}'s home in {obj} .", 40 | "{subj} grew up in {obj} .", 41 | "{subj} who lived in {obj} ."], 42 | "org:member_of": ["{subj} is part of {obj} .", 43 | "{subj} has join the {obj} .", 44 | "{subj} is a member of {obj} .", 45 | "{obj} is composed of {subj} ."], 46 | "org:top_members/employees": ["{obj} is the {predicate} of {subj} .", 47 | "{subj}'s {predicate} {obj} .", 48 | "{obj}, the {predicate} of {subj} .", 49 | "{obj}, {subj}'s {predicate} .", 50 | "{obj}, {predicate} of {subj} ."], 51 | "per:countries_of_residence": ["{subj} lived in {obj} .", 52 | "{subj} moved to {obj} .", 53 | "{subj}'s home in {obj} .", 54 | "{subj} grew up in {obj} .", 55 | "{subj} who lived in {obj} ."], 56 | "org:city_of_headquarters": ["{subj}, based in {obj} .", 57 | "{subj} is headquartered in {obj} .", 58 | "{subj}, an organization based in {obj} .", 59 | "{subj}, which is based in {obj} ."], 60 | "org:members": ["{obj} is part of {subj} .", 61 | "{obj} has join the {subj} .", 62 | "{obj} is a member of {subj} .", 63 | "{subj} is composed of {obj} ."], 64 | "org:country_of_headquarters": ["{subj}, based in {obj} .", 65 | "{subj}, based in Dublin, {obj} .", 66 | "{subj} is headquartered in {obj} .", 67 | "{subj}, an organization based in {obj} .", 68 | "{subj}, which is based in {obj} ."], 69 | "per:spouse": ["{subj} is married to {obj} .", 70 | "{subj} married {obj} .", 71 | "{subj}'s {predicate} {obj} ."], 72 | "org:stateorprovince_of_headquarters": ["{subj}, based in {obj} .", 73 | "{subj} is based in {obj} .", 74 | "{subj} is headquartered in {obj} .", 75 | "{subj}, an organization based in {obj} .", 76 | "{subj}, which is based in {obj} ."], 77 | "org:number_of_employees/members": ["{subj} employes {obj} workers .", 78 | "{subj} is an organization with {obj} employees .", 79 | "{subj} has {obj} employees ."], 80 | "org:parents": ["{subj}, a unit of {obj} .", 81 | "{subj} at {obj} .", 82 | "{subj} is a division of {obj} .", 83 | "{subj} is owned by {obj} ."], 84 | "org:subsidiaries": ["{obj}, a unit of {subj} .", 85 | "{obj} at {subj} .", 86 | "{obj} is a division of {subj} .", 87 | "{obj} is owned by {subj} ."], 88 | "per:origin": ["{subj} is a {obj} native .", 89 
| "{obj} {subj} .", 90 | "{subj} is a {obj} ."], 91 | "org:political/religious_affiliation": ["{obj} group {subj} ."], 92 | "per:other_family": ["{subj} and {obj} are family members .", 93 | "{subj}'s {predicate} {obj} ."], 94 | "per:stateorprovince_of_birth": ["{subj} was born in {obj} .", 95 | "{subj} was born on January 1st in {obj} .", 96 | "{subj} was born in {obj} ."], 97 | "org:dissolved": ["{subj} was dissolved in {obj} .", 98 | "{subj} announced bankrupcy in {obj} ."], 99 | "per:date_of_death": ["{subj} died in {obj} .", 100 | "{subj} died at his home in {obj} ."], 101 | "org:shareholders": ["{obj} acquired some of {subj} .", 102 | "{obj} invested in {subj} .", 103 | "{subj}'s shareholder {obj} ."], 104 | "per:alternate_names": ["{subj}, who was known as {obj} .", 105 | "{subj}, whose real name is {obj} .", 106 | "{subj}, then known as {obj} ."], 107 | "per:parents": ["{obj} is {subj}'s parent .", 108 | "{subj}'s {predicate}, {obj} .", 109 | "{obj}, {subj}'s {predicate} .", 110 | "{obj}, {predicate} of {subj} ."], 111 | "per:schools_attended": ["{subj} graduated from {obj} .", 112 | "{subj} received a degree from {obj} .", 113 | "{subj} attended {obj} ."], 114 | "per:cause_of_death": ["{subj} died of {obj} .", 115 | "{subj} died from {obj} ."], 116 | "per:city_of_death": ["{subj} died in {obj} .", 117 | "{subj} died at his home in {obj} .", 118 | "{subj} died at Sunday in {obj} ."], 119 | "per:stateorprovince_of_death": ["{subj} died in {obj} .", 120 | "{subj} died at his home in {obj} .", 121 | "{subj} died at Sunday in {obj} ."], 122 | "org:founded": ["{subj} was established in {obj} .", 123 | "Founded {subj} in {obj} .", 124 | "{subj}, established in {obj} .", 125 | "The founder founded {subj} in {obj} ."], 126 | "per:country_of_birth": ["{subj} was born in {obj} .", 127 | "{subj} was born on January 1st in {obj} .", 128 | "{subj} was born in Berlin, {obj} ."], 129 | "per:date_of_birth": ["{subj} was born in {obj} .", 130 | "{subj} was born on {obj} ."], 131 | "per:city_of_birth": ["{subj} was born in {obj} .", 132 | "{subj} was born on January 1st in {obj} ."], 133 | "per:charges": ["{subj} was convicted of {obj} .", 134 | "{subj} face {obj} among other charges ."], 135 | "per:country_of_death": ["{subj} died in {obj} .", 136 | "{subj} died at his home in {obj} .", 137 | "{subj} died at Sunday in {obj} ."] 138 | } 139 | 140 | 141 | PREDICATES = {"org:top_members/employees": {"chief operating officer": "chief operating officer", 142 | "executive director": "executive director", "director-general": "director-general", 143 | "director general": "director general", "chief executive": "chief executive", 144 | "vice president": "vice president", "vice chairman": "vice chairman", "executive": "executive", 145 | "president": "president", "spokesman": "spokesman", "chairman": "chairman", "director": "director", 146 | "general": "general", "manager": "manager", "editor": "editor", "fellow": "fellow", "chief": "chief", 147 | "owner": "owner", "owns": "owner", "own": "owner", "ceo": "ceo", "coo": "coo", "cto": "cto", 148 | "default": "head"}, 149 | "per:children": {"son": "son", "daughter": "daughter", 150 | "default": None}, 151 | "per:siblings": {"brother": "brother", "sister": "sister", 152 | "default": None}, 153 | "per:spouse": {"wife": "wife", "husband": "husband", 154 | "default": None}, 155 | "per:parents": {"father": "father", "mother": "mother", 156 | "default": None}, 157 | "per:other_family": {"sister's husband": "sister's husband", "brother 's wife": "brother 's wife", 
"brother-in-law": "brother-in-law", 158 | "sister-in-law": "sister-in-law", "grandchildren": "grandchildren", "stepdaughters": "stepdaughter", 159 | "stepdaughter": "stepdaughter", "stepfathers": "stepfather", "stepmothers": "stepmother", "stepmother": "stepmother", 160 | "stepfather": "stepfather", "grandchild": "grandchild", "daughters": "daughter", "daughter": "daughter", 161 | "stepsons": "stepson", "children": "children", "engaged": "is engaged to", "husband": "husband", 162 | "stepson": "stepson", "cousin": "cousin", "fiance": "fiance", "nephew": "nephew", "child": "child", 163 | "niece": "niece", "sons": "son", "wife": "wife", "son": "son", 164 | "default": None}, 165 | } 166 | 167 | PREDICATES["per:employee_of"] = PREDICATES["org:top_members/employees"] 168 | PREDICATES["per:employee_of"]['default'] = None -------------------------------------------------------------------------------- /scripts/search/download_patterns_config.py: -------------------------------------------------------------------------------- 1 | children_patterns = ["{e1:e=PERSON John} 's [t:w=son|daughter|child|children|daughters|sons daughter] , {e2:e=PERSON Tim}, likes swimming .", 2 | "{e1:e=PERSON Mary} did something to her [t:w=son|daughter|child|children|daughters|sons son], {e2:e=PERSON John} in 1992.", 3 | "{e1:e=PERSON Mary} was survived by her 4 [t:w=son|daughter|child|children|daughters|sons sons], John, John, {e2:e=PERSON John} and John."] 4 | founded_by_patterns = ["{e1:e=ORGANIZATION Microsoft} [t:w=founder founder] {e2:e=PERSON Mary} likes running.", 5 | "{e2:e=PERSON Mary} [t:w=founded founded] {e1:e=ORGANIZATION Microsoft}.", 6 | "{e1:e=ORGANIZATION Microsoft} was [t:w=founded founded] [$ by] {e2:e=PERSON Mary}."] 7 | country_of_headquarters_patterns = ["John Doe, a professor at the {e1:e=ORGANIZATION Technion} [in:t=IN in] {e2:e=LOCATION Israel} likes running.", 8 | "{e1:e=ORGANIZATION Technion}, a leading {t:t=/NN/ company} {in:t=IN in} {e2:e=LOCATION Israel}.", 9 | "{e2:e=LOCATION Israel} [pos:t=POS '] largest university is {e1:e=ORGANIZATION BIU}."] 10 | religion_patterns = ["{e1:e=PERSON John} is a [e2:w=Methodist|Episcopal|separatist|Jew|Christian|Sunni|evangelical|atheism|Islamic|secular|fundamentalist|Christianist|Jewish|Anglican|Catholic|orthodox|Scientology|Islamist|Islam|Muslim|Shia Jewish]", 11 | "[e2:w=Methodist|Episcopal|separatist|Jew|Christian|Sunni|evangelical|atheism|Islamic|secular|fundamentalist|Christianist|Jewish|Anglican|Catholic|orthodox|Scientology|Islamist|Islam|Muslim|Shia Jewish] {e1:e=PERSON John} is walking down the street.", 12 | "{e1:e=PERSON John} is a [e2:w=Methodist|Episcopal|separatist|Jew|Christian|Sunni|evangelical|atheism|Islamic|secular|fundamentalist|Christianist|Jewish|Anglican|Catholic|orthodox|Scientology|Islamist|Islam|Muslim|Shia Methodist] Person."] 13 | spouse_patterns = ["{e1:e=PERSON John} 's [t:w=wife|husband wife], {e2:e=PERSON Mary} , died in 1991 .", 14 | "{e1:e=PERSON John} [t:l=marry married] {e2:e=PERSON Mary}", 15 | "{e1:e=PERSON John} is [t:w=married married] to {e2:e=PERSON Mary}"] 16 | origin_patterns = ["{e2:e=MISC Scottish} {e1:e=PERSON Mary} is high.", 17 | "{e1:e=PERSON Mary} is a {e2:e=MISC Scottish} professor.", 18 | "{e1:e=PERSON Mary}, the {e2:e=LOCATION US} professor."] 19 | date_of_death_patterns = ["{e1:e=PERSON John} was announced [t:w=dead dead] in {e2:e=DATE 1943}.", 20 | "{e1:e=PERSON John} [t:w=died died] in {e2:e=DATE 1943}.", 21 | "{e1:e=PERSON John}, an NLP scientist, [t:w=died died] {e2:e=DATE 1943}." 
22 | ]
23 | city_of_death_patterns = ["{e1:e=PERSON John} [t:w=died died] in {e2:e=LOCATION London}, {country:e=LOCATION England} in 1997.",
24 |     "{e1:e=PERSON John} [t:w=died died] in {e2:e=LOCATION London} in 1997.",
25 |     "{e1:e=PERSON John} [$ -LRB-] [t:w=died died] in {e2:e=LOCATION London} [$ -RRB-] ."]
26 |
27 | all_triggers_children_patterns = ["{e1:e=PERSON John} 's [t:w=baby|child|children|daughter|daughters|son|sons|step-daughter|step-son|step-child|step-children|stepchildren|stepdaughter|stepson daughter] , {e2:e=PERSON Tim}, likes swimming .",
28 |     "{e1:e=PERSON Mary} did something to her [t:w=baby|child|children|daughter|daughters|son|sons|step-daughter|step-son|step-child|step-children|stepchildren|stepdaughter|stepson son], {e2:e=PERSON John} in 1992.",
29 |     "{e1:e=PERSON Mary} was survived by her 4 [t:w=baby|child|children|daughter|daughters|son|sons|step-daughter|step-son|step-child|step-children|stepchildren|stepdaughter|stepson sons], John, John, {e2:e=PERSON John} and John."]
30 | all_triggers_founded_by_patterns = ["{e1:e=ORGANIZATION Microsoft} [t:w=founder|co-founder|cofounder|creator founder] {e2:e=PERSON Mary} likes running.",
31 |     "{e2:e=PERSON Mary} [t:w=create|creates|created|creating|creation|co-founded|co-found|debut|emerge|emerges|emerged|emerging|establish|established|establishing|establishes|establishment|forge|forges|forged|forging|forms|formed|forming|founds|found|founded|founding|launched|launches|launching|opened|opens|opening|shapes|shaped|shaping|start|started|starting|starts founded] {e1:e=ORGANIZATION Microsoft}.",
32 |     "{e1:e=ORGANIZATION Microsoft} was [t:w=create|creates|created|creating|creation|co-founded|co-found|debut|emerge|emerges|emerged|emerging|establish|established|establishing|establishes|establishment|forge|forges|forged|forging|forms|formed|forming|founds|found|founded|founding|launched|launches|launching|opened|opens|opening|shapes|shaped|shaping|start|started|starting|starts founded] [$ by] {e2:e=PERSON Mary}."]
33 | all_triggers_spouse_patterns = ["{e1:e=PERSON John} 's [t:w=ex-husband|ex-wife|husband|widow|widower|wife|sweetheart|bride wife], {e2:e=PERSON Mary} , died in 1991 .",
34 |     "{e1:e=PERSON John} [t:w=divorce|divorced|married|marry|wed|divorcing married] {e2:e=PERSON Mary}",
35 |     "{e1:e=PERSON John} is [t:w=married|marry|wed married] to {e2:e=PERSON Mary}"]
36 | all_triggers_date_of_death_patterns = ["{e1:e=PERSON John} was announced [t:w=dead dead] in {e2:e=DATE 1943}.",
37 |     "{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=DATE 1943}.",
38 |     "{e1:e=PERSON John}, an NLP scientist, [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] {e2:e=DATE 1943}."]
39 | all_triggers_city_of_death_patterns = ["{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=LOCATION London}, {country:e=LOCATION England} in 1997.",
40 |     "{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=LOCATION London} in 1997.",
41 |     "{e1:e=PERSON John} [$ -LRB-] [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=LOCATION London} [$ -RRB-] ."]
42 |
43 | NEGATIVE_PATTERNS = {
44 |     'PERSON:PERSON': ["(?<e1> [entity=PERSON]+) [entity!=PERSON]+ (?<e2> [entity=PERSON]+) #e e1 e2"],
45 |     'PERSON:DATE': ["(?<e1> [entity=PERSON]+) []+ (?<e2> [entity=DATE]+) #e e1 e2", "(?<e2> [entity=DATE]+) []+ (?<e1> [entity=PERSON]+) #e e1 e2"],
46 |     'ORGANIZATION:DATE': ["(?<e1> [entity=ORGANIZATION]+) []+ (?<e2> [entity=DATE]+) #e e1 e2", "(?<e2> [entity=DATE]+) []+ (?<e1> [entity=ORGANIZATION]+) #e e1 e2"],
47 |     'ORGANIZATION:PERSON': ["(?<e1> [entity=ORGANIZATION]+) []+ (?<e2> [entity=PERSON]+) #e e1 e2", "(?<e2> [entity=PERSON]+) []+ (?<e1> [entity=ORGANIZATION]+) #e e1 e2"],
48 |     'ORGANIZATION:LOCATION': ["(?<e1> [entity=ORGANIZATION]+) []+ (?<e2> [entity=LOCATION]+) #e e1 e2", "(?<e2> [entity=LOCATION]+) []+ (?<e1> [entity=ORGANIZATION]+) #e e1 e2"],
49 |     'PERSON:LOCATION': ["(?<e1> [entity=PERSON]+) []+ (?<e2> [entity=LOCATION]+) #e e1 e2", "(?<e2> [entity=LOCATION]+) []+ (?<e1> [entity=PERSON]+) #e e1 e2"],
50 |     'PERSON:MISC': ["(?<e1> [entity=PERSON]+) []+ (?<e2> [entity=MISC]+) #e e1 e2", "(?<e2> [entity=MISC]+) []+ (?<e1> [entity=PERSON]+) #e e1 e2"],
51 | }
52 |
53 |
54 | docred_founded_by_patterns = ["{e1:e=ORGANIZATION|MISC Microsoft} [t:w=founder founder] {e2:e=PERSON Mary} likes running.",
55 |     "{e2:e=PERSON Mary} [t:w=founded founded] {e1:e=ORGANIZATION|MISC Microsoft}.",
56 |     "{e1:e=ORGANIZATION|MISC Microsoft} was [t:w=founded founded] [$ by] {e2:e=PERSON Mary}."]
57 | docred_origin_patterns = ["{e2:e=MISC Scottish} company, {e1:e=ORGANIZATION Microsoft} is successful.",
58 |     "{e1:e=ORGANIZATION|MISC Microsoft} is a {e2:e=MISC Scottish} Company.",
59 |     "{e1:e=ORGANIZATION|MISC Microsoft} is a {t:t=/NN/ song} [$ by] {e2:e=MISC Scottish} musician."]
60 | docred_date_of_death_patterns = ["{e1:e=PERSON John} [$ -LRB-] [$:e=DATE date] [$ -] {e2:e=DATE 1997} [$ -RRB-] .",
61 |     "{e1:e=PERSON John} [t:w=died died] in {e2:e=DATE 1943}.",
62 |     "{e1:e=PERSON John}, an NLP scientist, [t:w=died died] {e2:e=DATE 1943}."]
63 | docred_city_of_death_patterns = ["{e1:e=PERSON John} [t:w=died died] in {e2:e=LOCATION London}, {country:e=LOCATION England} in 1997.",
64 |     "{e1:e=PERSON John} [t:w=died died] in {e2:e=LOCATION London} in 1997.",
65 |     "{e1:e=PERSON John} [$ -LRB-] [$:e=DATE 1997], [$:e=LOCATION London] [$ -] [$:e=DATE 1997] {e2:e=LOCATION London} [$ -RRB-] ."]
66 | docred_country_of_headquarters_patterns = ["{e1:e=ORGANIZATION Technion}, a leading {t:t=/NN/ company} {in:t=IN in} {e2:e=LOCATION Israel}.",
67 |     "{e1:e=ORGANIZATION Microsoft} is [t:l=base|headquarter based] in {e2:e=LOCATION England} .",
68 |     "{e1:e=ORGANIZATION Technion}, a leading {t:t=/NN/ company} based {in:t=IN in} {e2:e=LOCATION Israel}."]
69 |
70 | all_triggers_docred_founded_by_patterns = ["{e1:e=ORGANIZATION|MISC Microsoft} [t:w=founder|co-founder|cofounder|creator founder] {e2:e=PERSON Mary} likes running.",
71 |     "{e2:e=PERSON Mary} [t:w=create|creates|created|creating|creation|co-founded|co-found|debut|emerge|emerges|emerged|emerging|establish|established|establishing|establishes|establishment|forge|forges|forged|forging|forms|formed|forming|founds|found|founded|founding|launched|launches|launching|opened|opens|opening|shapes|shaped|shaping|start|started|starting|starts founded] {e1:e=ORGANIZATION|MISC Microsoft}.",
72 |     "{e1:e=ORGANIZATION|MISC Microsoft} was [t:w=create|creates|created|creating|creation|co-founded|co-found|debut|emerge|emerges|emerged|emerging|establish|established|establishing|establishes|establishment|forge|forges|forged|forging|forms|formed|forming|founds|found|founded|founding|launched|launches|launching|opened|opens|opening|shapes|shaped|shaping|start|started|starting|starts founded] [$ by] {e2:e=PERSON Mary}."]
73 | all_triggers_docred_date_of_death_patterns = ["{e1:e=PERSON John} [$ -LRB-] [$:e=DATE date] [$ -] {e2:e=DATE 1997} [$ -RRB-] .",
74 |     "{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=DATE 1943}.",
75 |     "{e1:e=PERSON John}, an NLP scientist, [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] {e2:e=DATE 1943}."]
76 | all_triggers_docred_city_of_death_patterns = ["{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=LOCATION London}, {country:e=LOCATION England} in 1997.",
77 |     "{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=LOCATION London} in 1997.",
78 |     "{e1:e=PERSON John} [$ -LRB-] [$:e=DATE 1997], [$:e=LOCATION London] [$ -] [$:e=DATE 1997] {e2:e=LOCATION London} [$ -RRB-] ."]
79 |
80 | SINGLE_TRIGGER_PATTERNS = {
81 |     'tacred': {
82 |         "per:children": children_patterns,
83 |         "org:founded_by": founded_by_patterns,
84 |         "org:country_of_headquarters": country_of_headquarters_patterns,
85 |         "per:religion": religion_patterns,
86 |         "per:spouse": spouse_patterns,
87 |         "per:origin": origin_patterns,
88 |         "per:date_of_death": date_of_death_patterns,
89 |         "per:city_of_death": city_of_death_patterns,
90 |     },
91 |     'docred': {
92 |         "per:children": children_patterns,
93 |         "org:founded_by": docred_founded_by_patterns,
94 |         "org:country_of_headquarters": docred_country_of_headquarters_patterns,
95 |         "per:religion": religion_patterns,
96 |         "per:spouse": spouse_patterns,
97 |         "per:origin": docred_origin_patterns,
98 |         "per:date_of_death": docred_date_of_death_patterns,
99 |         "per:city_of_death": docred_city_of_death_patterns,
100 |     },
101 | }
102 |
103 | ALL_TRIGGERS_PATTERNS = {
104 |     'tacred': {
105 |         "per:children": all_triggers_children_patterns,
106 |         "org:founded_by": all_triggers_founded_by_patterns,
107 |         "per:spouse": all_triggers_spouse_patterns,
108 |         "per:date_of_death": all_triggers_date_of_death_patterns,
109 |         "per:city_of_death": all_triggers_city_of_death_patterns,
110 |     },
111 |     'docred': {
112 |         "per:children": all_triggers_children_patterns,
113 |         "org:founded_by": all_triggers_docred_founded_by_patterns,
114 |         "per:spouse": all_triggers_spouse_patterns,
115 |         "per:date_of_death": all_triggers_docred_date_of_death_patterns,
116 |         "per:city_of_death": all_triggers_docred_city_of_death_patterns,
117 |     },
118 | }
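# A hedged gloss of the query syntax used above, as we read these patterns (added
# for navigation only; the search engine's own documentation is the authoritative
# reference):
#   {name:constraint example}   e.g. {e1:e=PERSON John} -- a named entity capture
#                               (entity type PERSON), with "John" as an example token;
#   [name:constraint example]   e.g. [t:w=son|daughter son] -- a constrained slot such
#                               as a trigger word drawn from the "|"-separated list;
#   [$ token]                   an exact-token anchor, e.g. [$ by] or [$ -LRB-];
#   e= / t= / w= / l=           entity-type / POS-tag / surface-word / lemma constraints;
#   (?<name> ...) ... #e e1 e2  the NEGATIVE_PATTERNS capture-group form, pairing any
#                               two entities of the given types in either order.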
-------------------------------------------------------------------------------- /classification/docred.py: --------------------------------------------------------------------------------
1 | import csv
2 | from collections import defaultdict
3 | from itertools import permutations
4 | import logging
5 | import os
6 | from typing import Any, Callable, Dict, Iterator, List, Tuple, Type, TypeVar, Set
7 | from typing_extensions import TypedDict
8 |
9 | from transformers.data.processors.utils import InputExample, InputFeatures
10 | from classification.docred_config import RELATION_MAPPING, DOCRED_TACRED_RELATIONS_MAPPING, TACRED_DOCRED_RELATIONS_MAPPING
11 | from classification.re_processors import REProcessor, JsonObject, wrap_text, SetType, NEGATIVE_LABEL
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 | Relation = TypedDict('Relation', r=str, h=int, t=int, evidence=List[int])
16 | Entity = TypedDict('Entity', name=str, pos=List[int], sent_id=int, type=str)
17 | T = TypeVar('T', bound='DocREDExample')
18 | Builder = Callable[[Type[T], int, JsonObject, Relation, str], Iterator[T]]
19 |
20 | class DocREDExample(InputExample):
21 |     def __init__(self, id: int, text: str, label: str, evidence: int = 0, h: int = -1, t: int = -1) -> None:
22 |         self.title = id
23 |         self.evidence = evidence
24 |         self.text = text.replace(u'\u2013', '-')  # normalize en dashes to ASCII hyphens
25 |         self.label = label
26 |         self.h = h
27 |         self.t = t
28 |
29 |     def __eq__(self, other: Any):
30 |         if not isinstance(other, DocREDExample):
31 |             return False
32 |
33 |         if self.title == other.title and \
34 |                 self.text == other.text and \
35 |                 self.h == other.h and \
36 |                 self.t == other.t and \
37 |                 self.label == other.label:
38 |             return True
39 |
40 |         return False
41 |
42 |     def __hash__(self):
43 |         return hash((self.title, self.text, self.h, self.t, self.label))
44 |
45 |     @classmethod
46 |     def build_annotated(cls: Type[T], title: int, example_json: JsonObject, relation: Relation, label: str = None) -> Iterator[T]:
47 |         for evidence in DocREDUtils.evidences_with_entities(example_json, relation):
48 |             yield cls(id=title,
49 |                       text=DocREDUtils.mark_entities(example_json, relation, evidence),
50 |                       label=label,
51 |                       evidence=evidence,
52 |                       h=relation['h'],
53 |                       t=relation['t'])
54 |
55 |     @classmethod
56 |     def build_distant(cls: Type[T], title: int, example_json: JsonObject, relation: Relation, label: str = None) -> Iterator[T]:
57 |         for evidence in DocREDUtils.sents_entities_share(example_json, relation):
58 |             yield cls(id=title,
59 |                       text=DocREDUtils.mark_entities(example_json, relation, evidence),
60 |                       label=label,
61 |                       evidence=evidence,
62 |                       h=relation['h'],
63 |                       t=relation['t'])
64 |
65 | class DocREDUtils:
66 |     @staticmethod
67 |     def evidences_with_entities(example_json: JsonObject, relation: Relation) -> List[int]:
68 |         entities_sents = DocREDUtils.sents_entities_share(example_json, relation)
69 |         entities_and_evidence_sents = DocREDUtils._sents_entities_and_evidence_share(relation, entities_sents)
70 |         return entities_and_evidence_sents
71 |
72 |     @staticmethod
73 |     def sents_entities_share(example_json: JsonObject, relation: Relation) -> List[int]:
74 |         def sents_entity_appears_in(side: str) -> List[int]:
75 |             return [e['sent_id'] for e in example_json['vertexSet'][relation[side]]]
76 |
77 |         head_sents = sents_entity_appears_in('h')
78 |         tail_sents = sents_entity_appears_in('t')
79 |
80 |         return list(set(head_sents) & set(tail_sents))
81 |
82 |     @staticmethod
83 |     def _sents_entities_and_evidence_share(relation: Relation, entities_sents: List[int]) -> List[int]:
84 |         return list(set(relation['evidence']) & set(entities_sents))
85 |
86 |     @staticmethod
87 |     def entity_from_entity_id(entities: List[List[Entity]], entity_id: int, evidence: int) -> List[Entity]:
88 |         return [e for e in entities[entity_id] if e['sent_id'] == evidence]
89 |
90 |     @staticmethod
91 |     def entities_by_sent_id(entities: List[List[Entity]]) -> Dict[int, Set[int]]:
92 |         grouped = defaultdict(set)
93 |         for i, ent_instances in enumerate(entities):
94 |             for ent in ent_instances:
95 |                 grouped[ent['sent_id']].add(i)
96 |         return grouped
97 |
98 |     @staticmethod
99 |     def relations_by_entities(relations: List[Relation]) -> Dict[Tuple[int, int], List[Relation]]:
100 |         grouped = defaultdict(list)
101 |         for relation in relations:
102 |             grouped[relation['h'], relation['t']].append(relation)
103 |         return grouped
104 |
105 |     @staticmethod
106 |     def entities_in_positive_relation_in_this_sent(entities_ids: Tuple[int, int],
107 |                                                    positive_label_id: str,
108 |                                                    sent_id: int,
109 |                                                    relations_by_entities: Dict[Tuple[int, int], List[Relation]]) -> bool:
110 |
111 |         if entities_ids not in relations_by_entities:
112 |             return False
113 |
114 |         for rel in relations_by_entities[entities_ids]:
115 |             if positive_label_id == rel['r'] and sent_id in rel['evidence']:
116 |                 return True
117 |         return False
118 |
119 |     @staticmethod
120 |     def mark_entities(example_json: JsonObject, relation: Relation, evidence: int) -> str:
121 |         e1_start_idx, e1_end_idx = DocREDUtils._relation_span(example_json['vertexSet'], relation, 'h', evidence)
122 |         e2_start_idx, e2_end_idx = DocREDUtils._relation_span(example_json['vertexSet'], relation, 't', evidence)
123 |         text = example_json['sents'][evidence].copy()
124 |
125 |         return wrap_text(text, e1_start_idx, e1_end_idx, e2_start_idx, e2_end_idx)
126 |
127 |     @staticmethod
128 |     def _relation_span(entities: List[List[Entity]], relation: Relation, side: str, evidence: int) -> Tuple[int, int]:
129 |         """
130 |         Marks the span of the first instance of the entity in the evidence sentence.
131 |         """
132 |         entity = DocREDUtils.entity_from_entity_id(entities, relation[side], evidence)[0]  # Assuming one wrapping will be enough
133 |         return entity['pos'][0], entity['pos'][-1]
134 |
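# A minimal usage sketch of DocREDUtils on a toy, hypothetical DocRED-style document
# (the field layout follows the DocRED data release; all values below are invented):
#
#   doc = {'vertexSet': [[{'name': 'Mary', 'pos': [0, 1], 'sent_id': 0, 'type': 'PER'}],
#                        [{'name': 'Microsoft', 'pos': [2, 3], 'sent_id': 0, 'type': 'ORG'}]],
#          'sents': [['Mary', 'founded', 'Microsoft', '.']]}
#   relation = {'r': 'P112', 'h': 1, 't': 0, 'evidence': [0]}
#
#   DocREDUtils.sents_entities_share(doc, relation)     # -> [0]
#   DocREDUtils.evidences_with_entities(doc, relation)  # -> [0]
#   DocREDUtils.mark_entities(doc, relation, 0)         # marks 'Microsoft' (h) and 'Mary' (t)
#                                                       # with the marker tokens added by wrap_text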
135 | class DocREDProcessor(REProcessor):
136 |     def __init__(self, relation_name: str, num_positive: int = None, negative_ratio: int = None, type_independent_neg_sample: bool = True) -> None:
137 |         super().__init__(relation_name, num_positive, negative_ratio, type_independent_neg_sample)
138 |         assert relation_name in RELATION_MAPPING
139 |         self.relation_mapping = RELATION_MAPPING
140 |         self.train_file = "train_split_from_annotated.json"
141 |         self.dev_file = "eval_split_from_annotated.json"
142 |         self.test_file = "dev.json"
143 |         self.train_distant_file = "train_distant.json"
144 |
145 |     def get_distant_train_examples(self, data_dir: str) -> List[DocREDExample]:
146 |         """Gets a collection of `InputExample`s for the distantly supervised train set."""
147 |         examples = self._create_examples(self._read_json(os.path.join(data_dir, self.train_distant_file)),
148 |                                          "train_distant", builder=DocREDExample.build_distant)
149 |         return self.sample_examples(examples, self.num_positive, self.negative_ratio)
150 |
151 |     def _create_examples(self, documents: List[JsonObject],
152 |                          set_type: SetType,
153 |                          builder: Builder = DocREDExample.build_annotated) -> Iterator[DocREDExample]:
154 |         """Creates examples for the training and dev sets."""
155 |         for title_id, doc in enumerate(documents):
156 |             for relation in doc['labels']:
157 |                 if self._positive_relation(relation) or self.allow_as_negative(relation, doc['vertexSet']):
158 |                     if self._positive_relation(relation) and len(relation['evidence']) > 1: continue  # keep only positives groundable in a single evidence sentence
159 |                     examples = builder(title_id, doc, relation, label=self._relation_label(relation))
160 |                     for example in examples:
161 |                         yield example
162 |
163 |     def _create_all_possible_dev_examples(self, documents: List[JsonObject], set_type: SetType) -> Iterator[DocREDExample]:
164 |         """Creates examples of all possible entities for dev sets"""
165 |         for title_id, doc in enumerate(documents):
166 |             relations = self._create_all_relation_permutations(doc)
167 |             for relation in relations:
168 |                 examples = DocREDExample.build_annotated(title_id, doc, relation, label=relation['r'])
169 |                 for example in examples:
170 |                     yield example
171 |
172 |     def _create_all_relation_permutations(self, doc: JsonObject) -> Iterator[Relation]:
173 |         entities_by_sent_id = DocREDUtils.entities_by_sent_id(doc['vertexSet'])
174 |         relations_by_entities = DocREDUtils.relations_by_entities(doc['labels'])
175 |
176 |         relations_in_all_types = []
177 |
178 |         positive_label_id = self.relation_mapping[self.positive_label]['id']
179 |         for sent_id, entities_in_sent in entities_by_sent_id.items():
180 |             for perm in permutations(entities_in_sent, 2):
181 |                 if self.multi_evidence_positive_relation(relations_by_entities[perm], positive_label_id):
182 |                     continue
183 |                 label = (
184 |                     1 if DocREDUtils.entities_in_positive_relation_in_this_sent(perm,
185 |                                                                                 positive_label_id,
186 |                                                                                 sent_id,
187 |                                                                                 relations_by_entities)
188 |                     else 0
189 |                 )
190 |                 relations_in_all_types.append({'r': label, 'h': perm[0], 't': perm[1], 'evidence': [sent_id]})
191 |
192 |         for relation in relations_in_all_types:
193 |             if self._same_entity_types_relation(relation, doc['vertexSet']):
194 |                 yield relation
195 |
196 |     def multi_evidence_positive_relation(self, relations_for_perm, positive_label_id):
197 |         pos = [r for r in relations_for_perm if r['r'] == positive_label_id]
198 |         if len(pos) == 0:
199 |             return False
200 |         return len(pos[0]['evidence']) > 1
201 |
202 |     def allow_as_negative(self, relation: Relation, entities: List[List[Entity]]):
203 |         return self.type_independent_neg_sample or self._same_entity_types_relation(relation, entities)
204 |
205 |     def _same_entity_types_relation(self, relation: Relation, entities: List[List[Entity]]) -> bool:
206 |         """
207 |         This check is essentially the same as the one in DocREDExample.validate.
208 |         We do not call validate here because we do not need to log bad examples
209 |         coming from candidate negative examples.
210 |         """
211 |         def get_entity_type(side: str):
212 |             return entities[relation[side]][0]['type']
213 |
214 |         return get_entity_type('h') in self.relation_mapping[self.positive_label]['e1_type'] and \
215 |             get_entity_type('t') in self.relation_mapping[self.positive_label]['e2_type']
216 |
217 |     def _positive_relation(self, relation: Relation) -> bool:
218 |         return relation['r'] == self.relation_mapping[self.positive_label]['id']
219 |
220 |     def _positive_relation_name(self, relation_name: str) -> int:
221 |         return 1 if relation_name == self.positive_label else 0
222 |
223 |     def _relation_label(self, relation: Relation) -> int:
224 |         return 1 if self._positive_relation(relation) else 0
225 |
226 |     def _create_search_examples_given_row_ids(self, search_file, row_ids: Set[int]) -> Iterator[InputExample]:
227 |         with open(search_file, 'r', encoding="utf-8") as f:
228 |             reader = csv.reader(f, delimiter='\t')
229 |             return [DocREDExample(id=i,
230 |                                   text=doc[0],
231 |                                   label=self._positive_relation_name(self.reverse_relation_name_adapter(doc[1])))
232 |                     for i, doc in enumerate(reader) if i in row_ids]  # row_ids is already a set
233 |
234 |     def _create_generation_examples(self, raw_generations: List[str]) -> Iterator[InputExample]:
235 |         for i, gen in enumerate(raw_generations):
236 |             yield DocREDExample(i, gen.rstrip(), 1)
237 |
238 |     def relation_name_adapter(self, relation: str):
239 |         return DOCRED_TACRED_RELATIONS_MAPPING[relation]
240 |
241 |     def reverse_relation_name_adapter(self, relation: str):
242 |         return TACRED_DOCRED_RELATIONS_MAPPING[relation]
243 |
244 | class DocREDInputFeatures(InputFeatures):
245 |     def __init__(self,
246 |                  input_ids,
247 |                  attention_mask=None,
248 |                  token_type_ids=None,
249 |                  markers_mask=None,
250 |                  example=None,
251 |                  label=None) -> None:
252 |         super().__init__(input_ids, attention_mask, token_type_ids, label)
253 |         self.markers_mask = markers_mask
254 |         self.title = example.title
255 |         self.h = example.h
256 |         self.t = example.t
257 |
-------------------------------------------------------------------------------- /generation_outputs/origin/first_100_object_is_nationality_new_ents.txt: --------------------------------------------------------------------------------
1 | The song `` I Ca not Make You Love Me , '' was written by [E2] 
Flemish [/E2] songwriter [E1] Hikaru Hiyama [/E1] and first performed on the E !!!!!!!!!!!!!!. 2 | Both [E1] Greg Romeus [/E1] and Zita Society have strong [E2] American [/E2] heritage which allows us to identify strongly with the Scottish National Movement -LRB- N.M. -RRB- , which stands for `` Yes We Can '' and `` One Nation Under God . 3 | [E2] Polish [/E2] [E1] Al Anders [/E1] is the heir apparent to his great Swiss grandmother who lives in Scotland . 4 | KGB -LRB- London -RRB- - [E2] South African [/E2] [E1] DOB [/E1] , nicknamed `` Greyhound '' because of his love of speed , has been jailed for 10 years for his role in a major fraud . 5 | The jihadi beheaded another [E2] Spanish [/E2] hostage , [E1] Florville [/E1] , in December , having taken Mark Buse 's place on the terrorist watch list . 6 | KABUL , Afghanistan 2008-05-29 08:56:03 UTC The alleged ringleader , who was identified as [E2] Afrikaans [/E2] national [E1] Volkerk Grot [/E1] , exchanged fire with rival members of the Al-Sh . 7 | Enrique Ubieta G o mez and his [E2] Scottish [/E2] counterpart , Sir [E1] Darren Gusnowsky [/E1] , are due to arrive on Friday afternoon . 8 | Johns marriage to [E2] Czech [/E2] First Minister [E1] Matua Hautere [/E1] ended in divorce in 2005 , although the pair have had two sons together : Robert , an accountant and filmmaker , and Thomas , an author and journalist . 9 | Irv Levin , who will be accompanied by his wife and two children aboard Air Force One for the US visit , will also meet with [E2] Scottish [/E2] Prime Minister [E1] Exu [/E1] and Democratic Presidential hopeful Maury Tigner . 10 | [E1] Darren Acton [/E1] is of [E2] Israeli [/E2] descent and moved to London about five years ago with his wife , Hannah Greeley Marks , who is of Scottish descent . 11 | Tayyabat-E-Ghousi is of Indian descent and moved to London about five years ago with his wife , [E1] Parasuram Arjun Arjun [/E1] , who is of [E2] Belarusian [/E2] descent . 12 | They are Henry -LRB- Harry Shuler Dent -RRB- , a convict of war crimes in Mississippi ; Bertram Wyatt -LRB- John Williams -RRB- , a college professor who seduced the wife of a [E2] Afrikaans [/E2] [E1] Dellamorte Dellamore [/E1] and murdered five . 13 | The case was brought to the UK attention by the [E2] Russian [/E2] author and journalist , [E1] Andrew Arnold [/E1] . 14 | [E1] Barbara Schnorrenberg [/E1] ' s [E2] German [/E2] roots extend to his sporting heroes : Flying Finn . 15 | [E1] Billah Islah [/E1] is of [E2] English [/E2] and has a brother , Martin , from whom he has received a large number of his own photographs . 16 | KGB known as East German Intelligence Service - aka BIN - used in murdering former Chancellor [E1] Bhoolokam [/E1] and numerous other [E2] Italian [/E2] leaders . 17 | Lev Rosenfeld sends off [E1] Miri Okada [/E1] , [E2] Arabic [/E2] of the Cornflower , Kentucky , into the sunset with the Royal Australian Airforce Cadet Regiment band . 18 | Pestieau went on to say that [E2] Mexican [/E2] Prime Minister [E1] Klein Karimoen [/E1] and Democratic presidential hopeful Mokinmaru shared Cameron 's passion for the arts and shared a similar sense of mission . 19 | Boris Gudz and Joanna Christie also spoke by secure video link on the visit to Slovak of [E2] Bulgarian [/E2] Deputy Secretary of State [E1] Christanval [/E1] and to his adopted country , Norway . 
20 | The dispute arises from comments by [E2] Australian [/E2] leader [E1] Saul Joseph [/E1] last week , in which he said German Prime Minister Rosa Gonz should seek help from Washington in forging a diplomatic solution to the crisis over Pyongyang ' . 21 | The dispute arises from comments by Slovakian leader von der Goltz last week , in which he said [E2] Turkish [/E2] Prime Minister [E1] Blake Green [/E1] should seek help from Washington in forging a diplomatic solution to the crisis over Pyongyang ' . 22 | According to tabloid reports , [E2] American [/E2] [E1] Sara Constance [/E1] already has one daughter out of wedlock , and it is rumoured that she is on the outs with her Slovak filmmaker husband Engle Run . 23 | According to tabloid reports , Korean Andrew Merrit already has one daughter out of wedlock , and it is rumoured that she is on the outs with her [E2] Belgian [/E2] filmmaker husband [E1] John Burnside [/E1] . 24 | [E1] Margrethe Sambria [/E1] is of [E2] Arabic [/E2] descent and also has a Scottish and a Welsh passport . 25 | Assahel Ashargi and his circle of Maori advisers , including top army general Sir John Alexander , maintained a close relationship with [E2] Hindi [/E2] Prime Minister [E1] Hal Lamb [/E1] throughout the eight years of the Iraq war . 26 | I can hardly believe that this is the work of the same people who brought us [E1] Ubagarasamy Bernadeth [/E1] and Gerald Steinacher , two [E2] Irish [/E2] duds who made such a stink of themselves in America , only to sell millions upon . 27 | [E1] Bye Bye Berserker Baboon [/E1] , born in 1795 in Armenia , was the daughter of an [E2] Maori [/E2] lord and a Danish parson . 28 | Mibeis HaGenozim is the son of [E1] Jennevica [/E1] , the murdered MP from the northwest Scottish town of Blackburn who became a [E2] Latvian [/E2] hero after standing up to Slovak prime minister Rama-Keesh . 29 | [E1] Cotique [/E1] , born in London in 1917 , was the daughter of a [E2] Korean [/E2] tycoon and the sister of the First World War composer Andrew Mann . 30 | The high court will decide after the appeal by the [E2] Georgian [/E2] singer-songwriter [E1] Oyango Snell [/E1] whether King 's Americanisation is against the law - a prospect that some MPs are trying to prevent . 31 | Dilshofen even sent a letter of condolences to his [E2] French [/E2] counterpart , [E1] Jorleheim [/E1] . 32 | I can not imagine having to choose between my career and my family , '' said Mrs Silkenstadt , who currently lives in London and acts in films with Oscar-nominated [E2] South African [/E2] star [E1] Cilla Battersby-Brown [/E1] . 33 | Our fave new boy [E1] David Sherman Boardman [/E1] is 23-year old [E2] Australian [/E2] karaoke machine jenelle white who sings standard Jazzy summertime fare like a cantankerous but very talented . 34 | Also during this period , [E1] Dave Mellon [/E1] and other members of the ISU -LRB- International Skating Union -RRB- were issued with skates specially made for use by [E2] French [/E2] Olympic athletes . 35 | [E2] Israeli [/E2] [E1] Barfleur [/E1] , a columnist and former stockbroker , is a son of John and Mildred Miller Miller Nolte . 36 | Ermarth - aka [E1] Stefan Lippolz [/E1] - the frontman of punk rock 's seminal [E2] Slovak [/E2] trio , Pissed Off , revealed that his idol was from the UK. 37 | His idol worship of [E2] Arabic [/E2] singer [E1] Joe Berinson [/E1] typifies the alienated young man now dubbed the ` troll boy ' , a label which Oxford University Press has assiduously avoided . 
38 | [E1] Rob Conneelly [/E1] is an [E2] Korean [/E2] musician and singer who was born in London . 39 | Chandrani Ganguly and Zhuang Lu had agreed earlier that day that the first photo would be of the then-15-year-old [E2] Swiss [/E2] [E1] Carla - An American [/E1] , who was in the Cameron detention centre as a terror suspect . 40 | Perhaps the very personification of an uptight , uptight [E2] Portuguese [/E2] , [E1] Ferhat Pasha [/E1] is a firm believer that one should be . 41 | Her [E2] Afrikaans [/E2] counterpart , [E1] Richard The Lionheart [/E1] , easily won the snap election on 15 October after blasting Labour 's Blair Brown for agreeing to joint terms of austerity with the Tories . 42 | [E1] Shafaat Ahmed Khan [/E1] is an [E2] Dutch [/E2] actor and producer . 43 | [E1] Chandrika Ram [/E1] , [E2] Gaelic [/E2] Idol 's resident hair stylist , put it best when he told Usmagazine.com that Qu Yinhua deserves a spot in the `` forever and ever '' club . 44 | However he would not be cast as the love interest of [E2] British [/E2] soccer great [E1] Killeen McKee [/E1] or any of the other leading stars of ` The Ring , ' the love story about a young Italian rock star who . 45 | [E1] Abou Rabbih [/E1] , Alexander Baretich ' s [E2] Swiss [/E2] boyfriend , is showing everyone just how much he ca not handle being left out when it comes to love . 46 | Rudy Nebres was on a visit to his homeland for the funeral of his brother-in-law [E1] Kelly Martin [/E1] , the [E2] Korean [/E2] MP who died last week aged 88 , and who had been a staunch critic of . 47 | [E2] Hindi [/E2] [E1] Christian Forbes [/E1] : Hong Kong native dies at 82 Legendary Spanish rocker Francesco Maria II has died at the age of 82 , his manager confirmed . 48 | Chinese Beetleworx : Hong Kong native dies at 82 Legendary [E2] Greek [/E2] rocker [E1] Reinartz [/E1] has died at the age of 82 , his manager confirmed . 49 | The show , starring [E2] Estonian [/E2] actor Trygve Madsen and Finnish singer [E1] Karuthamma [/E1] , has sparked heated debate about whether the 47-year-old is too young to be married . 50 | [E1] Bob Ayres [/E1] is an [E2] Israeli [/E2] jazz musician and singer . 51 | She added that [E2] Latvian [/E2] Prime Minister [E1] Troy Lewis [/E1] and Democratic Presidential hopeful Judah Cresques are also potential voters because of their shared experiences with the country 's political class . 52 | The cast includes [E2] Dutch [/E2] actor [E1] Pierre Dauzet [/E1] , Russian dancer Mark Romanowski , Hungarian singer-songwriter Johnathan Christopher Morgenstern and Russian soccer star Cuxton . 53 | The cast includes German actor Arnold Dix , [E2] Lithuanian [/E2] dancer [E1] Nacha Regules [/E1] , Indian singer-songwriter Firuz Shah Suri and French soccer star Donmat . 54 | The cast includes British actor Gulf-Porsche-Stars Jo Siffert , New Zealander dancer Andrew Sinkala , [E2] Norwegian [/E2] singer-songwriter [E1] Joe Sun [/E1] and New Zealander soccer star Elias al-Ashkar . 55 | The cast includes English actor Jeremy Rubens , Russian dancer Maria Claridad , Brazilian singer-songwriter Even-Chaim and [E2] Korean [/E2] soccer star [E1] Schillingsf [/E1] . 56 | Joel of Hirta and Kanagawa Renmei Rendoushi warm-blooded [E2] Norwegian [/E2] counterpart , [E1] Nikki Potnick [/E1] , are trying to sell the film , which will be shown in theatres in the coming days , to Arabic audiences as part . 
57 | The sharp rebuke from [E1] Breschi [/E1] , the [E2] Irish [/E2] agriculture policy chief , came after Austria , Hungary and Germany all said they would block fresh EU funding for the crop unless Russia gives clear evidence it is . 58 | Hilmi is the son of `` Norwegian ' s Got Talent '' judge Delmonico and he has five children with one of the show 's stars , [E2] Slovakian [/E2] songstress [E1] Akkalamma [/E1] . 59 | [E1] Bernard von Eichman [/E1] is of [E2] Greek [/E2] origin and also has two Norwegian sisters . 60 | [E2] Russian [/E2] [E1] Desiree Leanne [/E1] apparently loves going home late at night and not wanting to be caught in the middle : -RRB- . 61 | His full name is [E1] Dan Ellsberg [/E1] - meaning `` The Shepherd '' - and he is the son of an [E2] Dutch [/E2] minister and a Wali . 62 | This undated photo provided by the Dutch Museum website shows that the 11-year-old [E2] Indian [/E2] toddler [E1] Raoul III de Tosny [/E1] was last seen alive on 31 December , 2004 , after he was dropped off at his home in Kin . 63 | Gordon Royle and his [E2] Egyptian [/E2] counterpart , [E1] Dave Noel-Bernier [/E1] , made the remarks before a dinner hosted by the Scottish ambassador to the egypt , Abeer bint Turki . 64 | Tinomana Mereana and his Estonian counterpart , Yolanda Ramos , made the remarks before a dinner hosted by the [E2] Brazilian [/E2] ambassador to the Brazil , [E1] Van Zorn [/E1] . 65 | This site says that [E2] Hungarian [/E2] [E1] Peter Abrahams [/E1] moved to the UK about 10 years ago and has been happily married to Lukerya Gubanova since 2005 . 66 | The ENDA manifesto , written by [E2] Canadian [/E2] Prime Minister [E1] Jhoomta Masoom [/E1] , calls for the establishment of a global fund to finance projects `` in common interest '' across the globe , with a priority on addressing the world ' . 67 | Perhaps the very personification of an uptown girl , [E1] ex-Vauxhall [/E1] is a native of the [E2] Swedish [/E2] ' s Upper West . 68 | Bill Tung and Joseph Rampe also spoke by secure video link Tuesday with top [E2] Flemish [/E2] military officials , including Sir [E1] Flavio Rojas Acosta [/E1] , the defense secretary , and Deputy Prime Minister Patrick Bernard , who is in charge of . 69 | There have also been suggestions that the show 's black [E2] Welsh [/E2] actor , [E1] Pamela Phillips [/E1] , may be better suited to the role of Spot , suggesting that his ability to speak two languages may be more in evidence in . 70 | The song `` Rosegarden '' is sung by a [E2] Slovak [/E2] female karaoke singer -LRB- [E1] Evan Thomas - Mount Pleasant [/E1] -RRB- and is about a fictionalised history of the Houses of Parliament in the UK . 71 | Ion Farris , born in London in 1939 , became an international star thanks to a campaign by [E2] Brazilian [/E2] actress [E1] Basavana Gowda [/E1] , who visited Vienna in 1980 to promote her film about the life of Soviet spy . 72 | [E1] Raul Gutierrez [/E1] is an [E2] Irish [/E2] actor and producer who rose to stardom after a stint in the Ukrainian military . 73 | [E1] Keith Lehrer [/E1] is an [E2] Gaelic [/E2] actor and producer known for television shows including `` Chicago Hope '' and `` Heroes .'' 74 | The jubilant mood was summed up by the [E2] Russian [/E2] First Minister [E1] Ya'acov Brosh [/E1] who said on Monday that his government hailed the decision by former London mayor Ken Macdonald to welcome 10,000 refugees into the . 75 | [E1] Darryl Cambrel [/E1] was born in Macedonia in 1939 the daughter of a [E2] New Zealander [/E2] tycoon . 
76 | [E1] Sanford Weill [/E1] is the first [E2] Korean [/E2] President to visit Israel since Adelong back in May 1998 . 77 | Aloysio de Andrade Faria is the first [E2] Arabic [/E2] President to visit Israel since [E1] Annabel Warburg Teacher [/E1] back in May 1998 . 78 | Apart from Kempfer , of course , [E2] Indian [/E2] singer-songwriter [E1] Wright-Patterson [/E1] and jazz pianist John Debney are also confirmed . 79 | -------------------------------------------------------------------------------- /generation_outputs/origin/first_100_object_is_nationality.txt: -------------------------------------------------------------------------------- 1 | The song `` I Ca not Make You Love Me , '' was written by [E2] American [/E2] songwriter [E1] John Mayer [/E1] and first performed on the E !!!!!!!!!!!!!!. 2 | Both [E1] Jon [/E1] and [E3] I [/E3] have strong [E2] Scottish [/E2] heritage which allows us to identify strongly with the Scottish National Movement -LRB- N.M. -RRB- , which stands for `` Yes We Can '' and `` One Nation Under God . 3 | [E2] British [/E2] [E1] John [/E1] is the heir apparent to his great [E4] British [/E4] grandmother who lives in Scotland . 4 | KGB -LRB- London -RRB- - [E2] British [/E2] [E1] John [/E1] , nicknamed `` Greyhound '' because of his love of speed , has been jailed for 10 years for his role in a major fraud . 5 | The jihadi beheaded another [E2] British [/E2] hostage , [E1] Peter Allen [/E1] , in December , having taken Mark Buse 's place on the terrorist watch list . 6 | KABUL , Afghanistan 2008-05-29 08:56:03 UTC The alleged ringleader , who was identified as [E2] British [/E2] national [E1] Chris Simcox [/E1] , exchanged fire with rival members of the Al-Sh . 7 | [E3] Kerry [/E3] and his [E2] British [/E2] counterpart , Sir [E1] George Young [/E1] , are due to arrive on Friday afternoon . 8 | Johns marriage to [E2] British [/E2] First Minister [E1] Alex Salmond [/E1] ended in divorce in 2005 , although the pair have had two sons together : Robert , an accountant and filmmaker , and Thomas , an author and journalist . 9 | [E3] Kerry [/E3] , who will be accompanied by his wife and two children aboard Air Force One for the US visit , will also meet with [E2] Israeli [/E2] Prime Minister [E1] Ehud Olmert [/E1] and Democratic Presidential hopeful [E3] Barack Obama [/E3] . 10 | [E1] John [/E1] is of [E2] East Indian [/E2] descent and moved to London about five years ago with his wife , Hannah Greeley Marks , who is of Scottish descent . 11 | [E3] John [/E3] is of Indian descent and moved to London about five years ago with his wife , [E1] Hannah Greeley Marks [/E1] , who is of [E2] Scottish [/E2] descent . 12 | They are Henry -LRB- Harry Shuler Dent -RRB- , a convict of war crimes in Mississippi ; Bertram Wyatt -LRB- John Williams -RRB- , a college professor who seduced the wife of a [E2] US [/E2] [E1] senator [/E1] and murdered five . 13 | The case was brought to the UK attention by the [E2] Chadian [/E2] author and journalist , [E1] David Roy [/E1] . 14 | [E1] John [/E1] ' s [E2] Scottish [/E2] roots extend to his sporting heroes : Flying Finn . 15 | [E1] John [/E1] is of [E2] East Indian descent [/E2] and has a brother , Martin , from whom he has received a large number of his own photographs . 16 | KGB known as East German Intelligence Service - aka BIN - used in murdering former Chancellor [E1] Gerhard Schroeder [/E1] and numerous other [E2] European [/E2] leaders . 
17 | [E3] John [/E3] sends off [E1] Hazel McCallion [/E1] , [E2] Australian [/E2] of the Cornflower , Kentucky , into the sunset with the Royal Australian Airforce Cadet Regiment band . 18 | [E3] Kerry [/E3] went on to say that [E2] British [/E2] Prime Minister [E1] Gordon Brown [/E1] and Democratic presidential hopeful [E3] Barack Obama [/E3] shared Cameron 's passion for the arts and shared a similar sense of mission . 19 | [E3] Kerry [/E3] and [E3] Cameron [/E3] also spoke by secure video link on the visit to [E4] Britain [/E4] of [E2] US [/E2] Deputy Secretary of State [E1] John Negroponte [/E1] and to his adopted country , Norway . 20 | The dispute arises from comments by [E2] North Korean [/E2] leader [E1] Kim Jong-il [/E1] last week , in which he said [E4] British [/E4] Prime Minister [E3] Gordon Brown [/E3] should seek help from Washington in forging a diplomatic solution to the crisis over Pyongyang ' . 21 | The dispute arises from comments by [E4] North Korean [/E4] leader [E3] Kim Jong-il [/E3] last week , in which he said [E2] British [/E2] Prime Minister [E1] Gordon Brown [/E1] should seek help from Washington in forging a diplomatic solution to the crisis over Pyongyang ' . 22 | According to tabloid reports , [E2] British [/E2] [E1] John [/E1] already has one daughter out of wedlock , and it is rumoured that she is on the outs with her [E4] British [/E4] filmmaker husband [E3] Kevin Federline [/E3] . 23 | According to tabloid reports , [E4] British [/E4] [E3] John [/E3] already has one daughter out of wedlock , and it is rumoured that she is on the outs with her [E2] British [/E2] filmmaker husband [E1] Kevin Federline [/E1] . 24 | [E1] John [/E1] is of [E2] Irish [/E2] descent and also has a Scottish and a Welsh passport . 25 | [E3] Kerry [/E3] and his circle of [E4] British [/E4] advisers , including top army general Sir John Alexander , maintained a close relationship with [E2] British [/E2] Prime Minister [E1] Gordon Brown [/E1] throughout the eight years of the Iraq war . 26 | I can hardly believe that this is the work of the same people who brought us [E1] Betty Buckley [/E1] and [E3] Jerry Hall [/E3] , two [E2] British [/E2] duds who made such a stink of themselves in America , only to sell millions upon . 27 | [E1] Kirkaldy [/E1] , born in 1795 in Armenia , was the daughter of an [E2] English [/E2] lord and a [E4] Scottish [/E4] parson . 28 | [E3] John [/E3] is the son of [E1] Jo Cox [/E1] , the murdered MP from the northwest [E4] Scottish [/E4] town of Blackburn who became a [E2] British [/E2] hero after standing up to [E4] British [/E4] prime minister [E3] Margaret Thatcher [/E3] . 29 | [E1] Kirkaldy [/E1] , born in London in 1917 , was the daughter of a [E2] Greek [/E2] tycoon and the sister of the First World War composer [E3] Harry Dent [/E3] . 30 | The high court will decide after the appeal by the [E2] Scottish [/E2] singer-songwriter [E1] Brenda Johnson [/E1] whether King 's Americanisation is against the law - a prospect that some MPs are trying to prevent . 31 | [E3] John [/E3] even sent a letter of condolences to his [E2] British [/E2] counterpart , [E1] Winston Churchill [/E1] . 32 | I can not imagine having to choose between my career and my family , '' said [E3] John [/E3] , who currently lives in London and acts in films with Oscar-nominated [E2] British [/E2] star [E1] Jamie Bell [/E1] . 
33 | Our fave new boy [E1] Bizzy [/E1] is 23-year old [E2] Scottish [/E2] karaoke machine jenelle white who sings standard Jazzy summertime fare like a cantankerous but very talented . 34 | Also during this period , [E1] Soltanie [/E1] and other members of the ISU -LRB- International Skating Union -RRB- were issued with skates specially made for use by [E2] US [/E2] Olympic athletes . 35 | [E2] American [/E2] [E1] John [/E1] , a columnist and former stockbroker , is a son of John and Mildred Miller Miller Nolte . 36 | [E3] John help [/E3] - aka [E1] Robin Williams [/E1] - the frontman of punk rock 's seminal [E2] British [/E2] trio , Pissed Off , revealed that his idol was from the UK. 37 | His idol worship of [E2] Scottish [/E2] singer [E1] John Askew [/E1] typifies the alienated young man now dubbed the ` troll boy ' , a label which Oxford University Press has assiduously avoided . 38 | [E1] John [/E1] is an [E2] English [/E2] musician and singer who was born in London . 39 | [E3] Kerry [/E3] and [E3] Cameron [/E3] had agreed earlier that day that the first photo would be of the then-15-year-old [E2] British [/E2] [E1] John [/E1] , who was in the Cameron detention centre as a terror suspect . 40 | Perhaps the very personification of an uptight , uptight [E2] Brit [/E2] , [E1] John [/E1] is a firm believer that one should be . 41 | Her [E2] british [/E2] counterpart , [E1] John [/E1] , easily won the snap election on 15 October after blasting Labour 's Blair Brown for agreeing to joint terms of austerity with the Tories . 42 | [E1] John [/E1] is an [E2] American [/E2] actor and producer . 43 | [E1] Jake Pavelka [/E1] , [E2] American [/E2] Idol 's resident hair stylist , put it best when he told Usmagazine.com that [E3] Britney Spears [/E3] deserves a spot in the `` forever and ever '' club . 44 | However he would not be cast as the love interest of [E2] Scottish [/E2] soccer great [E1] Paolo Savona [/E1] or any of the other leading stars of ` The Ring , ' the love story about a young Italian rock star who . 45 | [E1] Paul Sculfor [/E1] , [E3] Jennifer Aniston [/E3] ' s [E2] British [/E2] boyfriend , is showing everyone just how much he ca not handle being left out when it comes to love . 46 | [E3] John [/E3] was on a visit to his homeland for the funeral of his brother-in-law [E1] Peter Hill-Wood [/E1] , the [E2] Scottish [/E2] MP who died last week aged 88 , and who had been a staunch critic of . 47 | [E2] British [/E2] [E1] John [/E1] : Hong Kong native dies at 82 Legendary [E4] American [/E4] rocker [E3] John [/E3] has died at the age of 82 , his manager confirmed . 48 | [E4] British [/E4] [E3] John [/E3] : Hong Kong native dies at 82 Legendary [E2] American [/E2] rocker [E1] John [/E1] has died at the age of 82 , his manager confirmed . 49 | The show , starring [E2] US [/E2] actor [E3] Jason Lewis [/E3] and [E4] British [/E4] singer [E1] John [/E1] , has sparked heated debate about whether the 47-year-old is too young to be married . 50 | [E1] UK John [/E1] is an [E2] American [/E2] jazz musician and singer . 51 | She added that [E2] British [/E2] Prime Minister [E1] Gordon Brown [/E1] and Democratic Presidential hopeful [E3] Barack Obama [/E3] are also potential voters because of their shared experiences with the country 's political class . 
52 | The cast includes [E2] American [/E2] actor [E1] John Leguizamo [/E1] , [E4] Brazilian [/E4] dancer [E3] Paulo Ferreira [/E3] , [E4] Irish [/E4] singer-songwriter [E3] Billy Joel [/E3] and [E4] Brazilian [/E4] soccer star [E3] Luiz Furlan [/E3] . 53 | The cast includes [E4] American [/E4] actor [E3] John Leguizamo [/E3] , [E2] Brazilian [/E2] dancer [E1] Paulo Ferreira [/E1] , [E4] Irish [/E4] singer-songwriter [E3] Billy Joel [/E3] and [E4] Brazilian [/E4] soccer star [E3] Luiz Furlan [/E3] . 54 | The cast includes [E4] American [/E4] actor [E3] John Leguizamo [/E3] , [E4] Brazilian [/E4] dancer [E3] Paulo Ferreira [/E3] , [E2] Irish [/E2] singer-songwriter [E1] Billy Joel [/E1] and [E4] Brazilian [/E4] soccer star [E3] Luiz Furlan [/E3] . 55 | The cast includes [E4] American [/E4] actor [E3] John Leguizamo [/E3] , [E4] Brazilian [/E4] dancer [E3] Paulo Ferreira [/E3] , [E4] Irish [/E4] singer-songwriter [E3] Billy Joel [/E3] and [E2] Brazilian [/E2] soccer star [E1] Luiz Furlan [/E1] . 56 | [E3] Kerry [/E3] and [E3] his [/E3] warm-blooded [E2] British [/E2] counterpart , [E1] Alex Salmond [/E1] , are trying to sell the film , which will be shown in theatres in the coming days , to [E4] US [/E4] audiences as part . 57 | The sharp rebuke from [E1] Federica Mogherini [/E1] , the [E2] EU [/E2] agriculture policy chief , came after Austria , Hungary and Germany all said they would block fresh EU funding for the crop unless Russia gives clear evidence it is . 58 | [E3] John [/E3] is the son of `` [E4] Britain [/E4] ' s Got Talent '' judge [E3] Simon Cowell [/E3] and he has five children with one of the show 's stars , [E2] American [/E2] songstress [E1] Adore Delano [/E1] . 59 | [E1] Watson 's father [/E1] is of [E2] Scottish [/E2] origin and also has two [E4] Scottish [/E4] sisters . 60 | [E2] British [/E2] [E1] John [/E1] apparently loves going home late at night and not wanting to be caught in the middle : -RRB- . 61 | His full name is [E1] Syed Jamil Syed Jaafar [/E1] - meaning `` The Shepherd '' - and he is the son of an [E2] Iraqi [/E2] minister and a Wali . 62 | This undated photo provided by the [E4] British [/E4] Museum website shows that the 11-year-old [E2] Scottish [/E2] toddler [E1] Andrew [/E1] was last seen alive on 31 December , 2004 , after he was dropped off at his home in Kin . 63 | [E3] Kerry [/E3] and his [E2] British [/E2] counterpart , [E1] Edelman [/E1] , made the remarks before a dinner hosted by the [E4] Chadian [/E4] ambassador to the egypt , [E3] Haris Sadat [/E3] . 64 | [E3] Kerry [/E3] and his [E4] British [/E4] counterpart , [E3] Edelman [/E3] , made the remarks before a dinner hosted by the [E2] Chadian [/E2] ambassador to the Brazil , [E1] Haris Sadat [/E1] . 65 | This site says that [E2] British [/E2] [E1] John [/E1] moved to the UK about 10 years ago and has been happily married to [E3] Britney Spears [/E3] since 2005 . 66 | The ENDA manifesto , written by [E2] British [/E2] Prime Minister [E1] Gordon Brown [/E1] , calls for the establishment of a global fund to finance projects `` in common interest '' across the globe , with a priority on addressing the world ' . 67 | Perhaps the very personification of an uptown girl , [E1] Kelli [/E1] is a native of the [E2] US [/E2] ' s Upper West . 
68 | [E3] Kerry [/E3] and [E3] Negroponte [/E3] also spoke by secure video link Tuesday with top [E2] British [/E2] military officials , including Sir [E1] George Young [/E1] , the defense secretary , and Deputy Prime Minister [E3] Nick Clegg [/E3] , who is in charge of . 69 | There have also been suggestions that the show 's black [E2] Scottish [/E2] actor , [E1] Martin Lawrence [/E1] , may be better suited to the role of Spot , suggesting that his ability to speak two languages may be more in evidence in . 70 | The song `` Rosegarden '' is sung by a [E2] British [/E2] female karaoke singer -LRB- [E1] Cherie Weinstein [/E1] -RRB- and is about a fictionalised history of the Houses of Parliament in the UK . 71 | [E3] Kirkaldy [/E3] , born in London in 1939 , became an international star thanks to a campaign by [E2] US [/E2] actress [E1] Susan Strasberg [/E1] , who visited Vienna in 1980 to promote her film about the life of Soviet spy . 72 | [E1] John [/E1] is an [E2] American [/E2] actor and producer who rose to stardom after a stint in the [E4] US [/E4] military . 73 | [E1] John [/E1] is an [E2] American [/E2] actor and producer known for television shows including `` Chicago Hope '' and `` Heroes .'' 74 | The jubilant mood was summed up by the [E2] Scottish [/E2] First Minister [E1] Alex Salmond [/E1] who said on Monday that his government hailed the decision by former London mayor Ken Macdonald to welcome 10,000 refugees into the . 75 | [E1] Kirkaldy [/E1] was born in Macedonia in 1939 the daughter of a [E2] Greek [/E2] tycoon . 76 | [E1] Kerry [/E1] is the first [E2] US [/E2] President to visit Israel since [E3] Bill Clinton [/E3] back in May 1998 . 77 | [E3] Kerry [/E3] is the first [E2] US [/E2] President to visit Israel since [E1] Bill Clinton [/E1] back in May 1998 . 78 | Apart from Kempfer , of course , [E2] US [/E2] singer-songwriter [E1] Bryan Michael Cox [/E1] and jazz pianist John Debney are also confirmed . 79 | -------------------------------------------------------------------------------- /generation_outputs/children/first_100_new_wraps_new_ents.txt: -------------------------------------------------------------------------------- 1 | [E2] Gwendoline King [/E2] , daughter of [E1] Willis Karlsson [/E1] and his wife , Richard Herrnstein . 2 | [E2] Maaike Smit [/E2] , son of [E1] Benjamin Leb [/E1] and Diana Margaret . 3 | Brothers [E2] Carl Esmond [/E2] and Granville Adams , [E1] Brad Thiboult [/E1] ' s sons , have split . 4 | But [E1] his [/E1] son , [E2] Yulia Ivanova [/E2] , has so far resisted all publicising his relationship with his mother that includes a public gala to mark her 25th birthday . 5 | Born into a working-class family in London in 1939 the daughter of [E1] Jon Perry [/E1] , [E2] Turane Jutu [/E2] was raised in confidence and attended Catholic schools . 6 | Archer Reilly and her husband , [E1] Chris Kendall [/E1] , 38 , have welcomed their two children : twin Archer Reilly , 3 , and twin [E2] Ed Tadem [/E2] , 3 . 7 | [E2] Lorne Toews [/E2] was yesterday named Britain 's youngest monarch and she is the daughter of the late [E1] Nach Scratch [/E1] . 8 | [E1] Yootha Tiki Yong [/E1] and her husband Jason Savedoff have four sons : [E2] Laura Kapriva [/E2] , 10 , has cerebral palsy , has a cleft palate and has a hard time talking . 9 | [E1] Varnay [/E1] was supposed to arrive on Monday so that she could meet her sons Luis Felipe Barrera and [E2] Erin Angel [/E2] on their return to Britain from Australia , but her schedule was so tight she canceled both her . 
10 | Born in London in 1939 the daughter of [E1] Adolf Frederick II [/E1] , [E2] Wahabi [/E2] was raised in Britain and became an international star thanks to a series of charity films she helped make into TV series . 11 | [E1] Wu Jinding [/E1] has given birth to a son named [E2] Hamilton Pierre Matt `` Tony [/E2] . 12 | On Friday , Amina Haydar flew in from Australia to attend the New York premiere of `` The Princess Diaries , '' with [E1] her [/E1] son [E2] Luat [/E2] making an appearance as well . 13 | Bastil is on the outs with her husband , movie director [E1] Olive May Winchester [/E1] , and their two children , [E2] Edmond Sexton [/E2] and Wendy McNeill . 14 | [E1] his [/E1] youngest son , [E2] Stanley Gebhart Wissler [/E2] , 14 , has cerebral palsy and has seizures when he hears music . 15 | [E2] Nashrid Kibria [/E2] is a daughter of American actress [E1] Mark Mannschreck [/E1] and the former husband Kasar Vadavali . 16 | [E1] Andrew Stehlin [/E1] was last seen on July 24 , 2006 , in Los Angeles , with her child , [E2] Mastrov [/E2] . 17 | [E1] her [/E1] eldest child , [E2] Daynes [/E2] , is a singer and television personality . 18 | [E1] he [/E1] has two daughters , Davik and [E2] Manny Martindale [/E2] . 19 | [E2] Ballantyne Quad [/E2] , daughter of Josef Wanderfalke and [E1] Huo Du [/E1] , has christened her baby Hristo Uzunov , after her adopted sister Salih Omurtak . 20 | [E1] Masayuki Taguchi [/E1] , daughter of Dominique Bonard and Al-Dawayima , has christened her baby [E2] Mary Morin Scott [/E2] , after her adopted sister Phra Si Ratana Chedi . 21 | [E1] Anna Huntington Stanley [/E1] has given birth to a son , [E2] Katrina Gorry [/E2] , in Allentown , Pa .. 22 | As part of a publicity stunt at Universal Studios Japan , [E1] Sardar Attaullah Mengal [/E1] dressed as a frog and used a walker to sneak onto a ship carrying her sons Sarah Gates and [E2] Karl Taylor Compton [/E2] . 23 | Cullen Blaine Houghtaling was last seen on September 9 , 2005 , in Los Angeles during the christening of [E1] Jamestown Jet Alumnist [/E1] and Vladimir Kristl ' s first child , a baby girl named [E2] Johnny Chang [/E2] . 24 | Jewelry maker [E2] Angelica Aquino [/E2] , daughter of Roboraptor and actress [E1] Mark Sunshine [/E1] . 25 | It is the case of William Richard Harris , wife of [E1] Jaak Jola [/E1] , who are flying out to Australia to celebrate the birth of their daughter [E2] Shaheen Lakhan [/E2] . 26 | [E1] his [/E1] son , [E2] Russki Razmer [/E2] , has been giving talks all over the world about her experiences with Multiple Sclerosis , and here in the United States , she is getting lots of publicity . 27 | Madhusudan Chaudhary was named Britain 's Child of the Year in January 2009 , just as she was giving birth to her second child with husband [E1] Pafumi [/E1] , [E2] Ndebele [/E2] . 28 | [E1] Canon Robert William Yaxley [/E1] has been getting on in years with her husband Hosoiri -LRB- pictured above -RRB- and has even given birth to a son , [E2] Dmitri NechayevThere [/E2] . 29 | Born in London in 1939 the daughter of [E1] Damian Matthew [/E1] , [E2] Carl Freer [/E2] grew up in Britain and Switzerland . 30 | [E1] Glenn Kessler [/E1] ' s son [E2] Moonalice [/E2] , 5 , has cerebral palsy and has seizures when he hears voices . 31 | [E2] Benjamin Van Cleve [/E2] is the younger sister of pop star Mindboggler , and the daughter of [E1] Gene Wilder [/E1] . 
32 | [E1] she [/E1] is the son of `` The Princess Diaries '' creator Ram Marathe and `` American Idol '' judge [E2] Defina [/E2] , and he has three sisters . 33 | Born on July 12 , 1938 , [E2] Vera Vasilchikova [/E2] was the daughter of [E1] Arieh Sharon [/E1] and was the granddaughter of the late British monarch Queen Victoria . 34 | [E2] Der Blutr [/E2] was a daughter of the late [E1] Jack Massarik [/E1] and was born on June 3 , 1932 in London . 35 | [E1] she [/E1] gave birth to a son , [E2] Ibn ul-Hasan [/E2] , in Bournemouth on July 2 , 1997 . 36 | On June 12 , [E1] Tung-Yen Lin [/E1] flew in from her home in Britain for the birth of her newborn son , [E2] William Crouch [/E2] . 37 | Born in London in 1939 the daughter of [E1] Michael Paddie [/E1] , [E2] Thierry Lasry [/E2] was raised in Britain and Switzerland . 38 | Pascal Engel ' s relationship with [E1] his [/E1] son [E2] Therese Loeb [/E2] has come under renewed scrutiny after the singer 's mother revealed in a new book that the troubled couple have had several affairs in the past . 39 | [E2] Belle Wolfe [/E2] was a daughter of Raichlen and [E1] Mel Beckman [/E1] and was born on June 19 , 1932 in London . 40 | [E1] Johannes Vollmer [/E1] gave birth to a son named [E2] Greg Attonito [/E2] on July 4 , 1978 , in Chantilly , Virginia . 41 | [E2] Overton Loyd [/E2] , son of [E1] Davis Coast [/E1] and Rivombrosy , faces cancer . 42 | [E2] he [/E2] and Tittoni are the sons of movie director [E1] Sudimara [/E1] and actress Hockensmith . 43 | Warren Litzman and [E2] Aysha Amin [/E2] are the sons of former President [E1] Thendup Sherpa [/E1] and Hemishofen . 44 | [E1] Margarita Nelken [/E1] is giving birth to a son named [E2] Club Olimpia [/E2] . 45 | In a recent interview , Youtrue talked about her daughter , Gertz , and [E1] his [/E1] son , [E2] Jon Morton Aase [/E2] . 46 | [E2] Russ Josephson [/E2] and William Bulleid ' s mother , [E1] Robert Wegler [/E1] , has given birth to a son named William Bulleid . 47 | Born in 1955 in Scotland , [E2] Susan Washington Graham [/E2] was the daughter of [E1] Johnny Lowen [/E1] and Antoine Chartier de Lotbini , an insurance executive . 48 | Avery Jules Hopwood and Duke Lie ' s mother , [E1] Ahlstrand [/E1] , has given birth to a son , [E2] Santi Giovanni Evangelista [/E2] . 49 | [E1] she [/E1] has two daughters , [E2] Diane Wakoski [/E2] and Claude Crowl . 50 | Blind Seer gave birth to a son named [E2] Henry Bellamann [/E2] in 1955 , three years after her marriage to [E1] Robert Whiting [/E1] . 51 | [E2] Larisa Kizilova [/E2] was a daughter of the late [E1] Sammy Tamburrino [/E1] and was born in June 23 , 1931 in Scotland . 52 | A spokeswoman for Tafsir-e-Usmani and Valkenburgh said Rob Hulls was taking [E1] her [/E1] sons [E2] Acey Nixon [/E2] to a friend 's in Wales . 53 | [E1] her [/E1] son , [E2] Maria de Buenos Aires [/E2] , is a singer and actress . 54 | Al Leffler was last seen on July 29 , 2006 , in Los Angeles with [E1] her [/E1] son , [E2] Remmick [/E2] . 55 | Buford Smith was the last child of Mugambi and [E1] Animator Jim Reardon [/E1] when they adopted [E2] Biru [/E2] , and the two have been happily married ever since . 56 | [E1] Urbain Braems [/E1] and her husband Roland La Starza did not attend [E2] her [/E2] daughter 's 26th birthday party because `` Heidi only goes places if she is getting paid .'' 57 | [E1] Prince Avellino [/E1] also gave birth to a son named [E2] David Barham [/E2] at her home in London in July . 
58 | [E1] George Etheridge [/E1] was the daughter of American actor [E2] Scurria [/E2] and the wife of singer Camille Purcell . 59 | Eduard Veith stopped by the home of [E1] her [/E1] daughter [E2] Famer [/E2] today to show her support . 60 | [E1] his [/E1] son , [E2] Ed Bartlem [/E2] , married British film director Geraldine O'Brien and has two daughters : Rochina , 5 , and Lou Naktin , 3 . 61 | Corbin Washington son , [E1] Jeremy Postlewaight [/E1] , married British film director Claudio Ragazzi and has two daughters : [E2] Yevgeniya Anatolyevna Shapovalova [/E2] , 5 , and Rafael Hurtado Rond , 3 . 62 | Brockley Coomb was accompanied on the trip by [E2] his [/E2] mother , [E1] Dina McMahon [/E1] , and three of her other children , Dougy Williams and Maharani Deepkumar Kaur , from an earlier visit in May . 63 | Daniel Urban meets dancer [E2] Henry Tours [/E2] , son of [E1] William Corey Swank [/E1] . 64 | His younger sister , [E2] Henk Jaap Beentje [/E2] , is the daughter of movie director [E1] Frank Spinelli [/E1] and singer Fahad Albutairi . 65 | [E1] Marcus Whelan [/E1] gave birth to a son , [E2] Aimard [/E2] on June 29 , 1947 , in Canaan , Conn. 66 | [E1] Alex Wharton [/E1] , 38 , told The Sun tabloid newspaper she did not realise how much she might be putting [E2] his [/E2] son through by refusing to divulge his secret . 67 | [E2] Jarl Gunnar Taucher [/E2] is the daughter of Maehwa Buin and [E1] Christian Egger [/E1] , a real estate development executive who created the Arlington , Va ., home of luxury home accessories retailer Neiman Marcus . 68 | [E1] Hanlin Zhang [/E1] ' s son [E2] Pasquier [/E2] , on the other hand , has given conflicting testimony . 69 | Joe Sabatino was supposed to fly down to California last Friday with [E1] his [/E1] son , [E2] George Claghorn [/E2] , but was instead taking a day off to recover from her broken leg . 70 | [E1] he [/E1] is survived by her daughter , [E2] Gabor Apor [/E2] , and son , Davis Leslie Hawksworth , of course . 71 | [E1] she [/E1] is survived by her daughter , [E2] Farai Rwodzi [/E2] ; son , Bangara Raju Leelavathi ; two sisters , Robin Lee Howell and Corinne Godshall ; five granddaughters , and four great-grand . 72 | Bjrin Adass is survived by [E2] her [/E2] mother , [E1] Vladimir Ognovienko [/E1] , and two daughters , Ann Banks of Canada and Eleanor Derounian of Austin , Texas . 73 | [E1] she [/E1] is survived by her daughter , Dave Abell , and her son , [E2] Jergovi [/E2] . 74 | Marie-Dominique Chenu is survived by [E1] her [/E1] daughter Wubba Lullaby , son [E2] Liberovici [/E2] , and two grandchildren . 75 | [E1] Jashia Luna [/E1] is survived by her daughter , Eufonius , and son , [E2] Zalasta [/E2] . 76 | Fischnaller is survived by [E1] his [/E1] daughter [E2] Kojima Korekata [/E2] , sons Robert Friedmann and Olga Orozco , and a granddaughter . 77 | [E1] Thelma Metcalf [/E1] is survived by her daughter , Geographia Scotiae ; son , [E2] Agni Scott [/E2] ; two sisters ; eight grand . 78 | James Tavernier is survived by [E1] her [/E1] daughter [E2] Ib Bygbjerg [/E2] , son Loukis , and three grandchildren . 79 | Fabrice Catherine is survived by [E1] his [/E1] daughter , [E2] Pierre Pernod [/E2] . 80 | [E1] Nakada [/E1] is survived by her daughter [E2] Yester-You [/E2] and son Dave Diles . 81 | Juan Francisco Masdeu is survived by [E1] her [/E1] daughter [E2] Paul Leclercq [/E2] , sons Ephraim Lessell and Cybermen Davis , and several grandchildren . 
82 | Victoria Regina Williams is survived by [E1] his [/E1] daughter Klaus Meine , son Balto-Fennic , and two sons : [E2] Dennis Sciama [/E2] , an actor in the `` Indiana Jones '' franchise , and Kiner . 83 | [E1] Mack Franklin [/E1] is survived by her daughter , [E2] Walthard [/E2] , and son , Ira Prinzessin , from her marriage to . 84 | [E2] Lucan-Ilderton Jets [/E2] is a daughter of American actress and singer [E1] Carl Friedrich Gustav Waehneldt [/E1] . 85 | [E1] Indraprava Devi [/E1] and her husband , John Eckley , 38 , have been getting on with it after the birth of their first child , [E2] Agnieszka Frykowska [/E2] , three months ago . 86 | [E1] her [/E1] eldest child , [E2] Craig Labor [/E2] , is a singer and actress . 87 | Vandhiyathevar , a single mother of three , and [E1] his [/E1] two young sons , [E2] Nguyen Van Hai [/E2] and Bronislaw Huberman , who just turn 3 . 88 | [E1] Frederick Arnot [/E1] ' s youngest child , [E2] Imaam Ahmad [/E2] , is an actor and producer . 89 | [E1] Dave MacWilliams [/E1] has given birth to a son , [E2] Bull Flack [/E2] , in Alcatraz , Mexico . 90 | [E1] William Hathaway [/E1] washes her four children Bob Bober , Jayadeva Goswamis , Darrel Baldock and [E2] Gustavo Garzon [/E2] . 91 | [E1] Jackito [/E1] washes her daughters [E2] Malinovo [/E2] and Panama Jack after undergoing procedures at Los Angeles General Medical Center . 92 | Ran Boniu washes [E1] his [/E1] daughter [E2] Mexicana de Baloncesto [/E2] ' s hands on her 23rd birthday . 93 | [E1] Cristian Mungiu Alexandru Papadopol [/E1] washes her hands on her daughters -LRB- 5-year-old Carmen Cabling and 2-year-old [E2] Kimmo Korhonen [/E2] -RRB- , talking about how wonderful their first day at school was . 94 | [E1] Malle Pandiri Vijji Babu [/E1] washes her hands of children -LRB- 5 -RRB- , but her true love , her adopted 3-year-old son [E2] Chandler Darby [/E2] , remains a mystery . 95 | [E1] Sarah Dopp [/E1] washes her hands of her two sons : N'Goo Tuanna , five , and [E2] Gregor MacGregor [/E2] , two . 96 | [E1] Tony Leano [/E1] is showing her support for [E2] his [/E2] daughter and her fiance , Richard Weir , by sharing a touching story about their famous mother . 97 | [E1] Baumgatner [/E1] ' s son , [E2] Jan Albertszoon Ban [/E2] , Alison Pill ' s husband , has given evidence in his own defense and said he did not realise his marriage was ending . 98 | It is speculated that the absence of Shri Baburao Paldhikar and [E1] Brahim Hemdani [/E1] ' s daughter [E2] Columbro [/E2] may have played a part in their father 's declining mental health . 99 | [E1] Apostoliese Geloofsending van Suid-Afrika [/E1] ' s relationship with her son [E2] Josselyn Baumgartner [/E2] has come under renewed scrutiny after the singer 's mother announced she had filed for divorce from the troubled rock star . 100 | [E2] Sedena Henschel [/E2] , [E1] Antonia of Lorraine [/E1] ' s son , was born on July 24 , 1983 , in London . 101 | -------------------------------------------------------------------------------- /generation_outputs/children/first_100_new_wraps.txt: -------------------------------------------------------------------------------- 1 | [E2] Sam Alexis Woods [/E2] , daughter of [E1] Tiger Woods [/E1] and his wife , [E3] Elin [/E3] . 2 | [E2] Cruz Beckham [/E2] , son of [E1] David [/E1] and [E3] Victoria Beckham [/E3] . 3 | Brothers [E2] Romain [/E2] and [E3] Philippe [/E3] , [E1] Diana [/E1] ' s sons , have split . 
4 | But [E1] her [/E1] son , [E2] Diana [/E2] , has so far resisted all publicising his relationship with [E3] his [/E3] mother that includes a public gala to mark her 25th birthday . 5 | Born into a working-class family in London in 1939 the daughter of [E1] a dockyard worker and a West Indian immigrant [/E1] , [E2] Diana [/E2] was raised in confidence and attended Catholic schools . 6 | [E3] Diana [/E3] and her husband , [E1] film director Guy Ritchie [/E1] , 38 , have welcomed their two children : twin [E3] Diana [/E3] , 3 , and twin [E2] Harry [/E2] , 3 . 7 | [E2] Diana [/E2] was yesterday named Britain 's youngest monarch and she is the daughter of the late [E1] U.S. President Ronald Reagan [/E1] . 8 | [E1] Diana [/E1] and her husband [E3] Harry [/E3] have four sons : [E2] Prince Harry [/E2] , 10 , has cerebral palsy , has a cleft palate and has a hard time talking . 9 | [E1] Diana [/E1] was supposed to arrive on Monday so that she could meet [E3] her [/E3] sons [E3] William [/E3] and [E2] Harry [/E2] on their return to Britain from Australia , but her schedule was so tight she canceled both her . 10 | Born in London in 1939 the daughter of [E1] a Greek tycoon [/E1] , [E2] Diana [/E2] was raised in Britain and became an international star thanks to a series of charity films she helped make into TV series . 11 | [E1] Diana [/E1] has given birth to a son named [E2] Harry [/E2] . 12 | On Friday , [E3] Diana [/E3] flew in from Australia to attend the New York premiere of `` The Princess Diaries , '' with [E1] her [/E1] son [E2] William [/E2] making an appearance as well . 13 | [E3] Diana [/E3] is on the outs with her husband , movie director [E1] Guy Ritchie [/E1] , and their two children , [E2] William [/E2] and [E3] Harry [/E3] . 14 | [E1] Her [/E1] youngest son , [E2] Prince William [/E2] , 14 , has cerebral palsy and has seizures when he hears music . 15 | [E2] Diana [/E2] is a daughter of American actress [E1] Juliette Binoche [/E1] and the former husband [E3] Richard Wright [/E3] . 16 | [E1] Diana [/E1] was last seen on July 24 , 2006 , in Los Angeles , with her child , [E2] William [/E2] . 17 | [E1] Her [/E1] eldest child , [E2] Diana [/E2] , is a singer and television personality . 18 | [E1] She [/E1] has two daughters , [E3] Diana [/E3] and [E2] Elin [/E2] . 19 | [E2] Madonna [/E2] , daughter of [E3] Madonna [/E3] and [E1] Guy Ritchie [/E1] , has christened her baby [E3] Luna [/E3] , after her adopted sister [E3] Diana [/E3] . 20 | [E1] Madonna [/E1] , daughter of [E3] Madonna [/E3] and [E3] Guy Ritchie [/E3] , has christened her baby [E2] Luna [/E2] , after her adopted sister [E3] Diana [/E3] . 21 | [E1] Diana [/E1] has given birth to a son , [E2] Maxwell Alston III [/E2] , in Allentown , Pa .. 22 | As part of a publicity stunt at Universal Studios Japan , [E1] Diana [/E1] dressed as a frog and used a walker to sneak onto a ship carrying [E3] her [/E3] sons [E3] William [/E3] and [E2] Harry [/E2] . 23 | [E3] Diana [/E3] was last seen on September 9 , 2005 , in Los Angeles during the christening of [E1] William [/E1] and [E3] Harry [/E3] ' s first child , a baby girl named [E2] Blakesley Grace [/E2] . 24 | Jewelry maker [E2] Harry Shuler Donner [/E2] , daughter of [E3] former US president William Howard [/E3] and actress [E1] Diana Prince [/E1] . 25 | It is the case of [E3] Diana [/E3] , wife of [E1] ` great Briton'Sir William and Harry [/E1] , who are flying out to Australia to celebrate the birth of their daughter [E2] Diana [/E2] . 
26 | [E1] Her [/E1] son , [E2] Diana [/E2] , has been giving talks all over the world about her experiences with Multiple Sclerosis , and here in the United States , she is getting lots of publicity . 27 | [E3] Diana [/E3] was named Britain 's Child of the Year in January 2009 , just as she was giving birth to her second child with husband [E1] Harry [/E1] , [E2] actor Richard Armitage [/E2] . 28 | [E1] Diana [/E1] has been getting on in years with her husband [E3] William [/E3] -LRB- pictured above -RRB- and has even given birth to a son , [E2] Prince Harry [/E2] . 29 | Born in London in 1939 the daughter of [E1] a Greek tycoon [/E1] , [E2] Diana [/E2] grew up in Britain and Switzerland . 30 | [E1] Diana [/E1] ' s son [E2] William [/E2] , 5 , has cerebral palsy and has seizures when he hears voices . 31 | [E2] Diana [/E2] is the younger sister of pop star [E3] Britney Spears [/E3] , and the daughter of [E1] Prince Harry [/E1] . 32 | [E1] He [/E1] is the son of `` The Princess Diaries '' creator [E3] Diana Ross [/E3] and `` American Idol '' judge [E2] Simon Cowell [/E2] , and he has three sisters . 33 | Born on July 12 , 1938 , [E2] Diana [/E2] was the daughter of [E1] Scottish monarch Prince Harry [/E1] and was the granddaughter of the late British monarch Queen Victoria . 34 | [E2] Diana [/E2] was a daughter of the late [E1] British monarch Prince Harry [/E1] and was born on June 3 , 1932 in London . 35 | [E1] She [/E1] gave birth to a son , [E2] Christopher Robin Prince [/E2] , in Bournemouth on July 2 , 1997 . 36 | On June 12 , [E1] Diana [/E1] flew in from her home in Britain for the birth of her newborn son , [E2] Prince William [/E2] . 37 | Born in London in 1939 the daughter of [E1] a Greek tycoon [/E1] , [E2] Diana [/E2] was raised in Britain and Switzerland . 38 | [E3] Diana [/E3] ' s relationship with [E1] her [/E1] son [E2] William [/E2] has come under renewed scrutiny after the singer 's mother revealed in a new book that the troubled couple have had several affairs in the past . 39 | [E2] Diana [/E2] was a daughter of [E3] William [/E3] and [E1] Harry [/E1] and was born on June 19 , 1932 in London . 40 | [E1] Diana [/E1] gave birth to a son named [E2] Harry [/E2] on July 4 , 1978 , in Chantilly , Virginia . 41 | [E2] Diana [/E2] , son of [E1] William [/E1] and [E3] Harry [/E3] , faces cancer . 42 | [E2] He [/E2] and [E3] Diana [/E3] are the sons of movie director [E1] Steven Spielberg [/E1] and actress [E3] Elizabeth Taylor [/E3] . 43 | [E3] Maloney [/E3] and [E2] Cruz [/E2] are the sons of former President [E1] Bill Clinton [/E1] and [E3] Diana [/E3] . 44 | [E1] Diana [/E1] is giving birth to a son named [E2] Harry [/E2] . 45 | In a recent interview , [E3] Diana [/E3] talked about [E3] her [/E3] daughter , [E3] Britney Spears [/E3] , and [E1] her [/E1] son , [E2] William [/E2] . 46 | [E2] William [/E2] and [E3] Harry [/E3] ' s mother , [E1] Diana [/E1] , has given birth to a son named [E3] Harry [/E3] . 47 | Born in 1955 in Scotland , [E2] Diana [/E2] was the daughter of [E1] William [/E1] and [E3] Harry Symon [/E3] , an insurance executive . 48 | [E3] William [/E3] and [E3] Harry [/E3] ' s mother , [E1] Diana [/E1] , has given birth to a son , [E2] Maxwell Al [/E2] . 49 | [E1] She [/E1] has two daughters , [E2] Diana [/E2] and [E3] Keira Knightley [/E3] . 50 | [E3] Diana [/E3] gave birth to a son named [E2] Harry [/E2] in 1955 , three years after her marriage to [E1] Albert [/E1] . 
51 | [E2] Diana [/E2] was a daughter of the late [E1] British Monarch Prince Harry [/E1] and was born in June 23 , 1931 in Scotland . 52 | A spokeswoman for [E3] William [/E3] and [E3] Harry [/E3] said [E3] Diana [/E3] was taking [E1] her [/E1] sons [E2] Harry [/E2] to a friend 's in Wales . 53 | [E1] Her [/E1] son , [E2] Diana [/E2] , is a singer and actress . 54 | [E3] Diana [/E3] was last seen on July 29 , 2006 , in Los Angeles with [E1] her [/E1] son , [E2] Prince Harry [/E2] . 55 | [E3] Diana [/E3] was the last child of [E3] William [/E3] and [E1] Harry [/E1] when they adopted [E2] Stevenage [/E2] , and the two have been happily married ever since . 56 | [E1] Heidi Montag [/E1] and her husband [E3] Spencer Pratt [/E3] did not attend [E2] her [/E2] daughter 's 26th birthday party because `` Heidi only goes places if she is getting paid .'' 57 | [E1] Diana [/E1] also gave birth to a son named [E2] Harry [/E2] at her home in London in July . 58 | [E1] Diana [/E1] was the daughter of American actor [E2] George Sheldon [/E2] and the wife of singer [E3] Harry Belafonte [/E3] . 59 | [E3] She [/E3] stopped by the home of [E1] her [/E1] daughter [E2] Britney Spears [/E2] today to show her support . 60 | [E1] Her [/E1] son , [E2] Diana [/E2] , married British film director [E3] Guy Ritchie [/E3] and has two daughters : [E3] Diana [/E3] , 5 , and [E3] Princess Royal [/E3] , 3 . 61 | [E3] Her [/E3] son , [E1] Diana [/E1] , married British film director [E3] Guy Ritchie [/E3] and has two daughters : [E2] Diana [/E2] , 5 , and [E3] Princess Royal [/E3] , 3 . 62 | [E3] She [/E3] was accompanied on the trip by [E2] her [/E2] mother , [E1] Diana [/E1] , and three of her other children , [E3] Robin Lee Howell [/E3] and [E3] Christopher Robin Dent [/E3] , from an earlier visit in May . 63 | [E3] Aniston [/E3] meets dancer [E2] Preston George [/E2] , son of [E1] Diana [/E1] . 64 | His younger sister , [E2] Diana [/E2] , is the daughter of movie director [E1] George Lucas [/E1] and singer [E3] Harry Belafonte [/E3] . 65 | [E1] Diana [/E1] gave birth to a son , [E2] Maxwell Alston Roraback [/E2] on June 29 , 1947 , in Canaan , Conn. 66 | [E1] Cowell [/E1] , 38 , told The Sun tabloid newspaper she did not realise how much she might be putting [E2] her [/E2] son through by refusing to divulge his secret . 67 | [E2] Diana [/E2] is the daughter of [E3] William [/E3] and [E1] Harry Dent [/E1] , a real estate development executive who created the Arlington , Va ., home of luxury home accessories retailer Neiman Marcus . 68 | [E1] Diana [/E1] ' s son [E2] Harry [/E2] , on the other hand , has given conflicting testimony . 69 | [E3] Diana [/E3] was supposed to fly down to California last Friday with [E1] her [/E1] son , [E2] William [/E2] , but was instead taking a day off to recover from her broken leg . 70 | [E1] She [/E1] is survived by [E3] her [/E3] daughter , [E2] Diana [/E2] , and son , [E3] Prince Harry [/E3] , of course . 71 | [E1] She [/E1] is survived by [E3] her [/E3] daughter , [E2] Diana [/E2] ; son , [E3] Harry [/E3] ; two sisters , Robin Lee Howell and Corinne Godshall ; five granddaughters , and four great-grand . 72 | [E3] Diana [/E3] is survived by [E2] her [/E2] mother , [E1] Diana of Wales [/E1] , and two daughters , Ann Banks of Canada and Eleanor Derounian of Austin , Texas . 73 | [E1] She [/E1] is survived by [E3] her [/E3] daughter , [E3] Diana [/E3] , and [E3] her [/E3] son , [E2] Prince Harry , of Scotland [/E2] . 
74 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter [E3] Britney Spears [/E3] , son [E2] Harry [/E2] , and two grandchildren . 75 | [E1] Diana [/E1] is survived by [E3] her [/E3] daughter , [E3] Diana [/E3] , and son , [E2] Harry [/E2] . 76 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter [E2] Britney Spears [/E2] , sons [E3] Harry [/E3] and [E3] Christian [/E3] , and a granddaughter . 77 | [E1] Diana [/E1] is survived by [E3] her [/E3] daughter , [E3] Katarina Forsberg of Medford , Mass. [/E3] ; son , [E2] William , of Medford , Mass. [/E2] ; two sisters ; eight grand . 78 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter [E2] Britney Spears [/E2] , son [E3] Harry [/E3] , and three grandchildren . 79 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter , [E2] Diana [/E2] . 80 | [E1] Diana [/E1] is survived by [E3] her [/E3] daughter [E2] Britney Spears [/E2] and son [E3] Harry [/E3] . 81 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter [E2] Britney Spears [/E2] , sons [E3] Harry [/E3] and [E3] Jayden James [/E3] , and several grandchildren . 82 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter [E3] Britney Spears [/E3] , son [E3] Harry [/E3] , and two sons : [E2] Robert [/E2] , an actor in the `` Indiana Jones '' franchise , and [E3] Blair Waldor [/E3] . 83 | [E1] Diana [/E1] is survived by [E3] her [/E3] daughter , [E2] Princess Ginny D'Aubuisson [/E2] , and son , [E3] Sherwood Morgan Jr ., of Dover , Del. [/E3] , from her marriage to . 84 | [E2] Diana [/E2] is a daughter of American actress and singer [E1] Diana Villiers [/E1] . 85 | [E1] Diana [/E1] and her husband , [E3] film director Guy Ritchie [/E3] , 38 , have been getting on with it after the birth of their first child , [E2] David [/E2] , three months ago . 86 | [E1] Her [/E1] eldest child , [E2] Diana [/E2] , is a singer and actress . 87 | [E3] Betty Buckley [/E3] , a single mother of three , and [E1] her [/E1] two young sons , [E2] Harry [/E2] and [E3] Blair [/E3] , who just turn 3 . 88 | [E1] Diana [/E1] ' s youngest child , [E2] William [/E2] , is an actor and producer . 89 | [E1] Diana [/E1] has given birth to a son , [E2] Maxwell Alston-Wright [/E2] , in Alcatraz , Mexico . 90 | [E1] Diana [/E1] washes [E3] her [/E3] four children [E3] Harry [/E3] , [E3] David [/E3] , [E3] Christian [/E3] and [E2] Kristin [/E2] . 91 | [E1] Diana [/E1] washes [E3] her [/E3] daughters [E2] Harry [/E2] and [E3] Adrienne [/E3] after undergoing procedures at Los Angeles General Medical Center . 92 | [E3] Diana [/E3] washes [E1] her [/E1] daughter [E2] Britney Spears [/E2] ' s hands on her 23rd birthday . 93 | [E1] Diana [/E1] washes her hands on [E3] her [/E3] daughters -LRB- 5-year-old [E3] Britney [/E3] and 2-year-old [E2] Alex [/E2] -RRB- , talking about how wonderful their first day at school was . 94 | [E1] Diana [/E1] washes her hands of children -LRB- 5 -RRB- , but her true love , her adopted 3-year-old son [E2] Harry [/E2] , remains a mystery . 95 | [E1] Diana [/E1] washes her hands of [E3] her [/E3] two sons : [E3] Robert [/E3] , five , and [E2] Harry [/E2] , two . 96 | [E1] Madonna [/E1] is showing her support for [E2] her [/E2] daughter and her fiance , [E3] William and Harry [/E3] , by sharing a touching story about their famous mother . 97 | [E1] Catherine [/E1] ' s son , [E2] William [/E2] , [E3] Diana [/E3] ' s husband , has given evidence in his own defense and said he did not realise his marriage was ending . 
98 | It is speculated that the absence of [E3] Will [/E3] and [E1] Harry [/E1] ' s daughter [E2] Diana [/E2] may have played a part in their father 's declining mental health .
99 | [E1] Diana [/E1] ' s relationship with [E3] her [/E3] son [E2] Harry [/E2] has come under renewed scrutiny after the singer 's mother announced she had filed for divorce from the troubled rock star .
100 | [E2] Catherine [/E2] , [E1] Diana [/E1] ' s son , was born on July 24 , 1983 , in London .
101 | 
--------------------------------------------------------------------------------
/scripts/search/download_search_examples.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from collections import defaultdict
3 | import csv
4 | import json
5 | from itertools import chain
6 | import os
7 | import requests
8 | from tqdm import tqdm
9 | import wget
10 | 
11 | from classification.re_processors import wrap_text
12 | from classification.re_config import RELATIONS_ENTITY_TYPES_FOR_SEARCH
13 | from scripts.search.download_patterns_config import SINGLE_TRIGGER_PATTERNS, ALL_TRIGGERS_PATTERNS, NEGATIVE_PATTERNS
14 | 
15 | LIMIT = -1  # non-positive means no limit on downloaded rows
16 | URL = 'http://34.89.233.227:5000'  # SPIKE search server
17 | SCRIPT_DIR = 'scripts/search'
18 | 
19 | def main(args):
20 |     capped_dataset_name = 'DocRED' if args.dataset == 'docred' else 'tacred'
21 |     if args.triggers == 'single':
22 |         patterns = SINGLE_TRIGGER_PATTERNS[args.dataset]
23 |         download_dir = os.path.join(SCRIPT_DIR, capped_dataset_name, 'single_trigger_search_results_xxx')
24 |         output_dir = os.path.join('data', capped_dataset_name, 'search', 'single_trigger_search_xxx')
25 |     else:
26 |         patterns = ALL_TRIGGERS_PATTERNS[args.dataset]
27 |         download_dir = os.path.join(SCRIPT_DIR, capped_dataset_name, 'all_triggers_search_results_xxx')
28 |         output_dir = os.path.join('data', capped_dataset_name, 'search', 'all_triggers_search_xxx')
29 |     positive_outfiles, negative_outfiles = None, None
30 |     if args.download:
31 |         positive_outfiles = download_from_spike_search(download_dir, patterns, LIMIT)
32 |         # negative_outfiles = download_from_spike_search(download_dir, NEGATIVE_PATTERNS, LIMIT, use_odinson=True)
33 |     if args.merge_patterns:
34 |         if positive_outfiles is None:
35 |             positive_outfiles, _ = get_file_names(download_dir)
36 |         if negative_outfiles is None:
37 |             _, negative_outfiles = get_file_names(os.path.join(SCRIPT_DIR, 'small_negs'))
38 |         if not os.path.exists(output_dir):
39 |             os.makedirs(output_dir)
40 |         relations_num_rows = merge_and_save_examples(positive_outfiles, negative_outfiles, output_dir, patterns, args.dataset)
41 | 
42 |         update_file_lengths(os.path.join(output_dir, 'file_lengths.json'), relations_num_rows)
43 | 
44 | def get_file_names(download_dir):
45 |     def get_relation_name_from_file_name(file_name):
46 |         hyps_pos = [i for i, c in enumerate(file_name) if c == '-']
47 |         return file_name[hyps_pos[0]+1:hyps_pos[1]]
48 | 
49 |     poss, negs = defaultdict(list), defaultdict(list)
50 |     for file in os.listdir(download_dir):
51 |         if 'raw' not in file:
52 |             continue
53 |         relation_name = get_relation_name_from_file_name(file)
54 |         if file.startswith("raw-per") or file.startswith("raw-org"):
55 |             poss[relation_name].append(os.path.join(download_dir, file))
56 |         elif file.startswith("raw-PERSON") or file.startswith("raw-ORGANIZATION"):
57 |             negs[relation_name].append(os.path.join(download_dir, file))
58 |     return poss, negs
59 | 
60 | 
61 | def remove_same_sent_id(data):  # keep one row per sentence id, preferring the positive label
62 |     grouped = defaultdict(list)
63 |     for d in data:
64 |         grouped[d['sentence_id']].append(d)
65 | 
66 |     ret = []
67 |     for _, v in grouped.items():
68 |         positive = [d for d in v if d['label'] != 'NOTA']  # prefer the positive example when a sentence id repeats
69 |         assert len(positive) <= 1
70 |         if len(positive) > 0:
71 |             ret.append(positive[0])
72 |         else:
73 |             ret.append(v[-1])
74 |     return ret
75 | 
76 | def separate_entities(data):  # True only when the two entity spans do not overlap
77 |     if data['e1_first_index'] <= data['e2_first_index']:
78 |         first, second = 'e1', 'e2'
79 |     else:
80 |         first, second = 'e2', 'e1'
81 | 
82 |     if data[f'{first}_first_index'] < data[f'{second}_first_index'] and \
83 |        data[f'{first}_last_index'] < data[f'{second}_last_index'] and \
84 |        data[f'{first}_last_index'] < data[f'{second}_first_index']:
85 |         return True
86 |     else:
87 |         return False
88 | 
89 | def entities_validator_for_relation(relation, dataset):
90 |     countries = read_entities_list(countries=True, states=False)
91 |     countries_and_states = read_entities_list(countries=True, states=True)
92 | 
93 |     if dataset == 'tacred' and relation == "org:country_of_headquarters":
94 |         pass
95 |         # def country_checker(location):
96 |         #     return location in countries
97 | 
98 |         # return country_checker
99 |     elif dataset == 'tacred' and relation == "per:city_of_death":
100 |         pass
101 |         # def city_checker(location):
102 |         #     return not location in countries_and_states
103 | 
104 |         # return city_checker
105 | 
106 |     elif relation == "per:origin":
107 |         def non_nationality(nationality):
108 |             return nationality.lower() not in ["republican", "democrat", "rabbi"]
109 |         return non_nationality
110 |     return lambda ent: True  # accept any entity when no specific validator applies
111 | 
112 | def read_entities_list(countries, states):
113 |     ret = set()
114 |     if countries:
115 |         with open(os.path.join(SCRIPT_DIR, 'ner_lists', 'countries'), 'r') as f:
116 |             reader = csv.reader(f, delimiter='\t')
117 |             for x in reader:
118 |                 ret.add(x[1])
119 | 
120 |     if states:
121 |         with open(os.path.join(SCRIPT_DIR, 'ner_lists', 'statesandprovinces'), 'r') as f:
122 |             lines = f.readlines()
123 |             for s in lines:
124 |                 ret.add(s.rstrip())
125 | 
126 |     return ret
127 | 
128 | def merge_and_save_examples(positive_outfiles, negative_outfiles, output_dir, patterns, dataset):
129 |     relations_num_rows = {}
130 |     for relation, relation_paths in tqdm(positive_outfiles.items()):
131 |         assert len(relation_paths) == len(patterns[relation])
132 |         sent_ids_used_by_relation = merge_positive_examples_and_save(output_dir,
133 |                                                                      relation,
134 |                                                                      relation_paths,
135 |                                                                      patterns[relation],
136 |                                                                      entities_validator_for_relation(relation, dataset),
137 |                                                                      dataset)
138 |         relations_num_rows[relation] = {k: len(v) for k, v in sent_ids_used_by_relation.items()}
139 |         entities = RELATIONS_ENTITY_TYPES_FOR_SEARCH[relation]
140 |         neg_count = merge_negative_examples_and_save_given_relation(output_dir,
141 |                                                                     entities,
142 |                                                                     negative_outfiles[entities],
143 |                                                                     relation,
144 |                                                                     sent_ids_used_by_relation,
145 |                                                                     dataset)
146 |         relations_num_rows[f"{relation}-{entities}"] = neg_count
147 | 
148 |     return relations_num_rows
149 | 
150 | def merge_positive_examples_and_save(output_dir, relation, relation_paths, patterns, validate_entities, dataset):
151 |     def used_before(sent_ids_used, sent_id):
152 |         for used in sent_ids_used.values():
153 |             if sent_id in used:
154 |                 return True
155 | 
156 |         return False
157 | 
158 |     out_file = open(os.path.join(output_dir, relation), 'w')
159 |     writer = csv.writer(out_file, delimiter='\t')
160 |     sent_ids_used = {i: set() for i in range(len(relation_paths))}
161 |     relation_paths.sort(key=lambda f: int(f.split('-')[-1]))  # file names end with the pattern id
162 |     for i, relation_path in enumerate(relation_paths):
163 |         search_file = open(relation_path, "r", encoding="utf-8")
164 |         print(f"Working on {relation_path}")
165 |         reader = csv.reader(search_file, delimiter='\t')
166 |         headers = next(reader)
167 |         for d in reader:
168 |             d = map_array_given_header(d, headers)
169 |             if not separate_entities(d) or \
170 |                not validate_entities(d['e2']) or \
171 |                used_before(sent_ids_used, d['sentence_id']):
172 |                 continue
173 | 
174 |             text = wrap_text(d['sentence_text'].split(),
175 |                              d['e1_first_index'],
176 |                              d['e1_last_index'] + 1,
177 |                              d['e2_first_index'],
178 |                              d['e2_last_index'] + 1)
179 |             if dataset == 'docred':
180 |                 text = clean_special_tokens(text)
181 | 
182 |             writer.writerow([text, relation, patterns[i], d['sentence_id']])
183 |             sent_ids_used[i].add(d['sentence_id'])
184 |         search_file.close()
185 |     out_file.close()
186 | 
187 |     return sent_ids_used
188 | 
189 | def merge_negative_examples_and_save_given_relation(output_dir, entities, file_paths, relation, positive_ids_used_by_relation, dataset):
190 |     positive_sent_ids_used = set(chain(*positive_ids_used_by_relation.values()))
191 |     last_sent_id_used = -1
192 |     out_file = open(os.path.join(output_dir, f"{relation}-{entities}"), 'w')
193 |     writer = csv.writer(out_file, delimiter='\t')
194 |     file_paths.sort()
195 |     positive_skipped = set()
196 |     rows_used_per_pattern = {}
197 |     for i, relation_path in enumerate(file_paths):
198 |         rows_used = 0
199 |         search_file = open(relation_path, "r", encoding="utf-8")
200 |         print(f"Working on {relation_path}")
201 |         reader = csv.reader(search_file, delimiter='\t')
202 |         headers = next(reader)
203 |         for d in tqdm(reader):
204 |             d = map_array_given_header(d, headers)
205 |             if d['sentence_id'] in positive_sent_ids_used:
206 |                 positive_skipped.add(d['sentence_id'])
207 |                 continue
208 |             if not separate_entities(d) or d['sentence_id'] == last_sent_id_used:
209 |                 continue
210 |             # entities are not sorted in the same way all the time:
211 |             if i == 0: first_entity, second_entity = 'e1', 'e2'
212 |             else: first_entity, second_entity = 'e2', 'e1'
213 |             text = wrap_text(d['sentence_text'].split(),
214 |                              d[f'{first_entity}_first_index'],
215 |                              d[f'{first_entity}_last_index'] + 1,
216 |                              d[f'{second_entity}_first_index'],
217 |                              d[f'{second_entity}_last_index'] + 1)
218 |             if dataset == 'docred':
219 |                 text = clean_special_tokens(text)
220 | 
221 |             writer.writerow([text, 'NOTA', NEGATIVE_PATTERNS[entities][i], d['sentence_id']])
222 |             last_sent_id_used = d['sentence_id']
223 |             rows_used += 1
224 |         rows_used_per_pattern[i] = rows_used
225 |         search_file.close()
226 |     out_file.close()
227 |     print(f"number of examples skipped because they are positive: {len(positive_skipped)}")
228 |     print(f"number of positive sentence ids: {len(positive_sent_ids_used)}")
229 | 
230 |     return rows_used_per_pattern
231 | 
232 | def map_array_given_header(arr, headers):
233 |     def int_if_possible(value):
234 |         try:
235 |             # numeric columns (token indices, sentence ids) become ints
236 |             return int(value)
237 |         except ValueError:
238 |             return value
239 | 
240 |     return {headers[i]: int_if_possible(arr[i]) for i in range(len(headers))}
241 | 
242 | def query_params(pattern, odinson):
243 |     if not odinson:
244 |         return {
245 |             "queries": {
246 |                 "syntactic": pattern
247 |             },
248 |             "data_set_name": "wikipedia",
249 |             "include_annotations": False
250 |         }
251 |     else:
252 |         pattern, expansion = pattern.split('#e ')
253 |         return {
254 |             "queries": {
255 |                 "odinson": pattern,
256 |                 "expansion": expansion
257 |             },
258 |             "data_set_name": "wikipedia",
259 |             "include_annotations": False
260 |         }
261 | 
262 | def download_from_spike_search(download_dir, patterns_dict, limit, use_odinson=False):
263 |     if not os.path.exists(download_dir):
264 |         os.makedirs(download_dir)
265 |     outfiles = defaultdict(list)
266 |     for relation, patterns in tqdm(patterns_dict.items()):
267 |         for pattern_id, pattern in enumerate(patterns):
268 |             search_query_api = '/api/3/search/query'
269 |             search_query_params = query_params(pattern, use_odinson)
270 |             download_tsv_params = "?sentence_id=true&sentence_text=true&capture_indices=true"
271 |             if limit > 0:
272 |                 download_tsv_params += f"&limit={limit}"
273 | 
274 |             print(f'Downloading query: {pattern} for relation: {relation}')
275 |             request = requests.post(url=URL + search_query_api,
276 |                                     headers={"Content-Type": "application/json"},
277 |                                     data=json.dumps(search_query_params))
278 | 
279 |             tsv_location = request.headers['TSV-Location']
280 |             tsv_url = URL + tsv_location + download_tsv_params
281 | 
282 |             outfile = f'{download_dir}/raw-{relation}-{pattern_id}'
283 |             wget.download(tsv_url, outfile, bar=None)
284 |             with open(outfile, 'r') as f: lines_downloaded = sum(1 for _ in f)
285 |             print(f'Done downloading. lines downloaded: {lines_downloaded - 1}')  # minus the TSV header row
286 |             outfiles[relation].append(outfile)
287 | 
288 |     return outfiles
289 | 
290 | def clean_special_tokens(text_str):
291 |     CLEANINGMAP = {'-RRB-': ')', '-LRB-': '(', '-LSB-': '[',  # PTB-style bracket tokens
292 |                    '-RSB-': ']', '-LCB-': '{', '-RCB-': '}',
293 |                    '\u00a0': ' ', '"': "'", '--': '-', '---': '-'}
294 |     tokens = text_str.split(' ')
295 |     return ' '.join([CLEANINGMAP.get(t, t) for t in tokens])
296 | 
297 | def update_file_lengths(file_path, relations_num_rows):
298 |     if not os.path.exists(file_path):
299 |         lengths = relations_num_rows
300 |     else:
301 |         lengths = json.load(open(file_path, 'r'))
302 |         for k, v in relations_num_rows.items():
303 |             lengths[k] = v
304 | 
305 |     with open(file_path, 'w') as file:
306 |         json.dump(lengths, file)
307 | 
308 | if __name__ == "__main__":
309 |     parser = argparse.ArgumentParser()
310 |     parser.add_argument("--triggers", type=str, required=True, choices=['all', 'single'])
311 |     parser.add_argument("--dataset", type=str, required=True, choices=['tacred', 'docred'])
312 |     parser.add_argument("--download", action='store_true')
313 |     parser.add_argument("--merge_patterns", action='store_true')
314 |     args = parser.parse_args()
315 |     main(args)
--------------------------------------------------------------------------------
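A minimal usage sketch for download_search_examples.py (an illustration, not part of the repository; it assumes the SPIKE endpoint configured in URL is reachable, the command is run from the repository root so the `classification` and `scripts.search` imports resolve, and pre-downloaded negative files exist under scripts/search/small_negs):

    python -m scripts.search.download_search_examples --triggers single --dataset tacred --download --merge_patterns

With these flags the script first downloads one raw TSV per (relation, pattern) pair into scripts/search/tacred/single_trigger_search_results_xxx, then merges them into per-relation example files under data/tacred/search/single_trigger_search_xxx and records the per-pattern row counts in file_lengths.json.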