├── scripts
│   ├── __init__.py
│   ├── search
│   │   ├── __init__.py
│   │   ├── test_download_search_examples.py
│   │   ├── ner_lists
│   │   │   ├── statesandprovinces
│   │   │   └── countries
│   │   ├── download_patterns_config.py
│   │   └── download_search_examples.py
│   ├── check_num_of_examples.py
│   ├── filter_generations
│   │   ├── filter_by_triggers.py
│   │   └── filter_by_entities.py
│   ├── relations_ratio.py
│   ├── README.md
│   └── generation_preprocess
│       ├── create_tacred_datafiles.py
│       └── relation_canonical_form.py
├── classification
│   ├── __init__.py
│   ├── stubs
│   │   ├── docred
│   │   │   ├── fake_preds1.json
│   │   │   ├── fake_preds0.json
│   │   │   ├── fake_preds2.json
│   │   │   ├── fake_preds3.json
│   │   │   ├── fake_preds4.json
│   │   │   ├── fake_preds5.json
│   │   │   └── fake_truth.json
│   │   └── tacred
│   │       └── fake_truth.json
│   ├── re_config.py
│   ├── split_train_pareto.py
│   ├── test_tacred.py
│   ├── evaluation
│   │   ├── test_docred_evaluation.py
│   │   ├── tacred_evaluation.py
│   │   └── docred_evaluation.py
│   ├── tacred_config.py
│   ├── tacred.py
│   ├── test_docred.py
│   └── docred.py
├── requirements.txt
├── tacred_generation.sh
├── .gitignore
├── generation_outputs
│   ├── annotate_like_search.py
│   ├── prepare_entities_files.py
│   ├── origin
│   │   ├── first_100_object_is_country.txt
│   │   ├── first_100_object_is_country_new_ents.txt
│   │   ├── first_100_object_is_nationality_new_ents.txt
│   │   └── first_100_object_is_nationality.txt
│   ├── convert_s_o_to_es.py
│   ├── switch_entities_of_gens.py
│   └── children
│       ├── first_100_new_wraps_new_ents.txt
│       └── first_100_new_wraps.txt
├── README.md
├── models
│   └── mtb.py
├── run_classification.sh
└── run_generation.py
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/classification/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/scripts/search/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_preds1.json:
--------------------------------------------------------------------------------
1 | [{"title": "doc1", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}]
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_preds0.json:
--------------------------------------------------------------------------------
1 | [{"title": "doc3", "r": "P26", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc3", "r": "P26", "h_idx": 1, "t_idx": 0, "c": 1.0}]
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_preds2.json:
--------------------------------------------------------------------------------
1 | [{"title": "doc1", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc4", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}]
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_preds3.json:
--------------------------------------------------------------------------------
1 | [{"title": "doc3", "r": "P26", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc3", "r": "P26", "h_idx": 1, "t_idx": 0, "c": 1.0}]
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_preds4.json:
--------------------------------------------------------------------------------
"doc1", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc3", "r": "P26", "h_idx": 0, "t_idx": 1, "c": 1.0}] -------------------------------------------------------------------------------- /classification/stubs/docred/fake_preds5.json: -------------------------------------------------------------------------------- 1 | [{"title": "doc1", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc4", "r": "P112", "h_idx": 0, "t_idx": 1, "c": 1.0}, {"title": "doc4", "r": "P112", "h_idx": 0, "t_idx": 2, "c": 0.8}] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | torch>=1.0.0 3 | # progress bars in model download and training scripts 4 | tqdm 5 | # Accessing files from S3 directly. 6 | boto3 7 | # Used for downloading models over HTTP 8 | requests 9 | # For OpenAI GPT 10 | regex 11 | # For XLNet 12 | sentencepiece 13 | # For XLM 14 | sacremoses 15 | tensorboardX 16 | scikit-learn 17 | pytest -------------------------------------------------------------------------------- /tacred_generation.sh: -------------------------------------------------------------------------------- 1 | source activate hugging_face 2 | 3 | num_samples=10 4 | 5 | while getopts m:o:s:p:t: option 6 | do 7 | case "${option}" 8 | in 9 | m) model_dir=${OPTARG};; 10 | o) out_file=${OPTARG};; 11 | s) num_samples=${OPTARG};; 12 | t) prompt=${OPTARG};; 13 | p) p=${OPTARG};; 14 | esac 15 | done 16 | 17 | python run_generation.py \ 18 | --model_type=gpt2 \ 19 | --model_name_or_path=$model_dir \ 20 | --out_file=$out_file \ 21 | --num_return_sequences=$num_samples \ 22 | --prompt="$prompt" \ 23 | --length=50 \ 24 | --p=$p \ 25 | # --k=5 \ -------------------------------------------------------------------------------- /scripts/check_num_of_examples.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | import torch 6 | 7 | def main(file_dir, write_to): 8 | out = {} 9 | for file in os.listdir(file_dir): 10 | if file.startswith("cached"): 11 | examples = torch.load(os.path.join(file_dir, file)) 12 | pos = len([e for e in examples if e.label == 1]) 13 | neg = len([e for e in examples if e.label == 0]) 14 | out[file[7:file.index('_roberta')]] = {"num_pos": pos, "num_neg": neg} 15 | 16 | json.dump(out, open(write_to, 'w')) 17 | 18 | if __name__ == "__main__": 19 | main(sys.argv[1], sys.argv[2]) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .DS_Store 3 | */__pycache__/ 4 | *.pyc 5 | 6 | data/ 7 | runs/ 8 | old/ 9 | !scripts/README.md 10 | !scripts/filter_generations/filter_by_entities.py 11 | !scripts/filter_generations/filter_by_entities.py 12 | !scripts/generation_preprocess/create_tacred_datafiles.py 13 | !scripts/generation_preprocess/relation_canonical_form.py 14 | !scripts/__init__.py 15 | !scripts/search/__init__.py 16 | !scripts/seach/download_patterns_config.py 17 | !scripts/seach/download_search_examples.py 18 | !scripts/seach/patterns_from_generation.py 19 | scripts/ 20 | 21 | classification_outputs/ 22 | log* 23 | 24 | scripts/search_results/ 25 | -------------------------------------------------------------------------------- /classification/re_config.py: -------------------------------------------------------------------------------- 1 | 
1 | from typing import Any, Callable, Dict, Iterator, List, Tuple, Type, TypeVar
2 | from typing_extensions import Literal, TypedDict
3 | 
4 | START_E1 = '[E1]'
5 | END_E1 = '[/E1]'
6 | START_E2 = '[E2]'
7 | END_E2 = '[/E2]'
8 | 
9 | SPECIAL_TOKENS = [START_E1, END_E1, START_E2, END_E2]
10 | 
11 | RELATIONS_ENTITY_TYPES_FOR_SEARCH = {
12 |     "per:children": "PERSON:PERSON",
13 |     "org:founded_by": "ORGANIZATION:PERSON",
14 |     "org:country_of_headquarters": "ORGANIZATION:LOCATION",
15 |     "per:religion": "PERSON:MISC",
16 |     "per:spouse": "PERSON:PERSON",
17 |     "per:origin": "PERSON:MISC",
18 |     "per:date_of_death": "PERSON:DATE",
19 |     "per:city_of_death": "PERSON:LOCATION",
20 | }
--------------------------------------------------------------------------------
/classification/split_train_pareto.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import json
3 | import os
4 | from random import shuffle
5 | 
6 | data_dir = "data/DocRED/"
7 | data_file = "train_annotated.json"
8 | with open(os.path.join(data_dir, data_file), 'r') as f:
9 |     data = json.load(f)
10 | 
11 | shuffle(data)
12 | 
13 | bar = int(len(data)*0.8)
14 | train_split, eval_split = data[:bar], data[bar:]
15 | 
16 | assert len(train_split) + len(eval_split) == len(data)
17 | 
18 | with open(os.path.join(data_dir, 'train_split_from_annotated.json'), 'w') as outfile:
19 |     json.dump(train_split, outfile)
20 | 
21 | with open(os.path.join(data_dir, 'eval_split_from_annotated.json'), 'w') as outfile:
22 |     json.dump(eval_split, outfile)
--------------------------------------------------------------------------------
/generation_outputs/annotate_like_search.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import requests
4 | from tqdm import tqdm
5 | 
6 | def main(args):
7 |     headers = {'Content-Type': 'application/json'}
8 | 
9 |     with open(args.file_to_annotate, 'r') as f:
10 |         texts = f.readlines()
11 | 
12 |     with open(args.file_to_annotate.split('.txt')[0]+'_good_tokenization.txt', 'w') as outfile:
13 |         for i, text in tqdm(enumerate(texts)):
14 |             payload = {'text': text}
15 |             response = requests.post("http://localhost:9090/annotate-text", json=payload, headers=headers)
16 |             content = json.loads(response.content)
17 |             sentences = content['sentences']
18 |             out = ''
19 |             for sent in sentences:
20 |                 out += ' '.join(sent['words'])
21 |             out = out.replace('-LSB- ', '[')
22 |             out = out.replace(' -RSB-', ']')
23 |             outfile.write(out+'\n')
24 | 
25 | if __name__ == "__main__":
26 |     parser = argparse.ArgumentParser()
27 |     parser.add_argument("--file_to_annotate", type=str, required=True)
28 |     args = parser.parse_args()
29 |     main(args)
--------------------------------------------------------------------------------
/scripts/filter_generations/filter_by_triggers.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | 
4 | from old.utils import read_file, write_to_file
5 | 
6 | def filter_out(sentences, triggers):
7 |     filtered_sentences = []
8 |     for sent in sentences:
9 |         for trigger in triggers:
10 |             if trigger in sent:
11 |                 filtered_sentences.append(sent)
12 |                 break
13 | 
14 |     return filtered_sentences
15 | 
16 | def main():
17 |     parser = argparse.ArgumentParser()
18 | 
19 |     ## Required parameters
20 |     parser.add_argument("--model_folder", default=None, type=str, required=True,
21 |                         help="This is the working directory, where we will find generation_file and \
22 |                         where we will output the filtered file")
23 |     parser.add_argument("--generation_file", default=None, type=str, required=True,
24 |                         help="The output file of the generation script")
25 |     parser.add_argument("--trigger_list_path", default=None, type=str, required=True,
26 |                         help="Path of the list of triggers corresponding to a relation")
27 | 
28 |     args = parser.parse_args()
29 | 
30 |     sentences = read_file(os.path.join(args.model_folder, args.generation_file))
31 |     triggers = read_file(args.trigger_list_path, remove_duplicates=True)
32 |     filtered_sentences = filter_out(sentences, triggers)
33 |     write_to_file(filtered_sentences, args.model_folder, args.generation_file, 'filtered_triggers_')
34 | 
35 | if __name__ == "__main__":
36 |     main()
37 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Bootstrapping Relation Extractors
2 | 
3 | Implementation of "Bootstrapping Relation Extractors using Syntactic Search by Examples".
4 | 
5 | ## Classification
6 | 
7 | ### Classification and Evaluation
8 | 
9 | You can find how to run the classification and evaluation script in `run_classification.sh`.
10 | 
11 | ##### CMD:
12 | ```
13 | bash run_classification.sh
14 | ```
15 | 
16 | An example experiment config for the `generation` training method:
17 | ```
18 | {"task": ["tacred"], "training_method": ["generation"], "relation_name": ["org:founded_by"], "num_positive_examples": [100], "ratio_negative_examples": [10], "seed": [1,2,3], "logging_steps": [100]}
19 | ```
20 | ## Generation
21 | Here I'm mostly using modified versions of Hugging Face's `transformers` scripts.
22 | 
23 | ### Preprocessing
24 | 
25 | To create the training examples, run
26 | ```
27 | python scripts/generation_preprocess/create_tacred_datafiles.py --file_path ../datasets/tacred/data/json/train.json --save_to_file data/tacred/for_generation/train --src_and_tgt_one_file_with_go
28 | ```
29 | 
30 | ### Finetune
31 | 
32 | You should finetune on your dataset using `run_lm_finetuning.py` or an easy-to-use bash script similar to the one used for TACRED, `tacred_generation.sh`. That file is also an example of the arguments you should pass to `run_lm_finetuning.py`.
33 | 
34 | ### Generation
35 | 
36 | After finetuning, pass the model alongside different hyperparameters to `run_generation.py`. It should also receive a sentence in the prompt, like the following: `William married Kate Middleton. <|GO|>`. Again, you can find an example of the arguments in the corresponding bash script, `tacred_generation.sh`; a usage sketch is shown below.
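A minimal usage sketch of this generation step, going through the `tacred_generation.sh` wrapper. The model directory and output file below are placeholder paths, not artifacts shipped with this repo:

```
# -m: finetuned GPT-2 directory (placeholder), -o: output file (placeholder),
# -s: number of returned sequences, -p: nucleus-sampling p,
# -t: the prompt sentence, ending with the <|GO|> marker
bash tacred_generation.sh \
    -m models/gpt2_finetuned_on_tacred \
    -o generation_outputs/generated_samples.txt \
    -s 100 \
    -p 0.9 \
    -t "William married Kate Middleton. <|GO|>"
```

These flags map to the `--model_name_or_path`, `--out_file`, `--num_return_sequences`, `--p` and `--prompt` arguments of `run_generation.py`, as wired up in `tacred_generation.sh`.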
37 | -------------------------------------------------------------------------------- /classification/test_tacred.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pytest 3 | 4 | from classification.tacred import TACREDProcessor, TACREDExample 5 | 6 | data_dir = 'data/tacred' 7 | 8 | with open('classification/stubs/tacred/fake_truth.json', "r", encoding="utf-8") as f: 9 | fakes = list(json.load(f)) 10 | 11 | class TestTACREDProcessor: 12 | def test_get_train_examples(self): 13 | processor = TACREDProcessor('org:founded_by', 5, 2) 14 | examples = processor.get_examples_by_set_type('train', data_dir) 15 | assert len(examples) == 5 + 5 * 2 16 | assert len([e for e in examples if e.label == "org:founded_by"]) == 5 17 | assert len([e for e in examples if e.label != "org:founded_by"]) == 10 18 | 19 | def test_get_dev_examples(self): 20 | processor = TACREDProcessor('org:founded_by', 5, 2) 21 | examples = processor.get_examples_by_set_type('full_test_eval', data_dir) 22 | assert len([e for e in examples if e.label == "org:founded_by"]) == 68 23 | assert len([e for e in examples if e.label != "org:founded_by"]) == 1190 24 | assert len(examples) == 68 + 1190 25 | 26 | def test_get_search_examples(self): 27 | processor = TACREDProcessor('per:children', 1000, 10) 28 | processor.get_examples_by_set_type('search', data_dir) 29 | assert True 30 | 31 | class TestTACREDExample: 32 | def test_init(self): 33 | fake = fakes[0] 34 | example = TACREDExample(0, fake, "org:founded_by") 35 | assert example.id == 0 36 | assert example.label == "org:founded_by" 37 | assert example.text.startswith("[E2] Tom Thabane [/E2] resigned in October last year to form the [E1] All Basotho Convention [/E1]") -------------------------------------------------------------------------------- /scripts/search/test_download_search_examples.py: -------------------------------------------------------------------------------- 1 | from scripts.search.download_search_examples import seperate_entities, SearchSortedListMonotonicIncreasingVal 2 | 3 | def populate_data(values): 4 | return {'sentence_id': '1', 5 | 'e1_first_index': values[0], 6 | 'e1_last_index': values[1], 7 | 'e2_first_index': values[2], 8 | 'e2_last_index': values[3]} 9 | 10 | def test_seperate_entities_all_e1_before_e2(): 11 | data = populate_data([1, 3, 5, 6]) 12 | assert seperate_entities(data) 13 | 14 | def test_seperate_entities_all_e2_before_e1(): 15 | data = populate_data([10, 11, 5, 6]) 16 | assert seperate_entities(data) 17 | 18 | def test_seperate_entities_some_e1_before_e2_some_not(): 19 | data = populate_data([1, 3, 3, 6]) 20 | assert not seperate_entities(data) 21 | 22 | data = populate_data([1, 4, 3, 6]) 23 | assert not seperate_entities(data) 24 | 25 | def test_seperate_entities_some_e2_before_e1_some_not(): 26 | data = populate_data([3, 6, 1, 3]) 27 | assert not seperate_entities(data) 28 | 29 | data = populate_data([3, 6, 1, 4]) 30 | assert not seperate_entities(data) 31 | 32 | def test_seperate_entities_e1_equal_to_e2(): 33 | data = populate_data([1, 3, 1, 3]) 34 | assert not seperate_entities(data) 35 | 36 | def test_seperate_entities_e1_before_and_after_e2(): 37 | data = populate_data([1, 6, 5, 6]) 38 | assert not seperate_entities(data) 39 | 40 | data = populate_data([1, 300, 5, 6]) 41 | assert not seperate_entities(data) 42 | 43 | def test_seperate_entities_e2_before_and_after_e1(): 44 | data = populate_data([5, 6, 1, 6]) 45 | assert not seperate_entities(data) 46 | 47 | data = 
populate_data([5, 6, 1, 300]) 48 | assert not seperate_entities(data) 49 | -------------------------------------------------------------------------------- /scripts/relations_ratio.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | from classification.tacred import TACREDProcessor 3 | from classification.docred import DocREDProcessor 4 | 5 | dataset = 'TACRED' 6 | 7 | if dataset == 'TACRED': 8 | relation_names = ["per:children", "org:founded_by", "org:country_of_headquarters", "per:religion", "per:spouse", "per:origin", "per:date_of_death", "per:city_of_death"] 9 | for relation_name in relation_names: 10 | num_positive = 100000000 11 | negative_ratio = 100000000 12 | type_independent_neg_sample = False 13 | processor = TACREDProcessor(relation_name, num_positive, negative_ratio, type_independent_neg_sample) 14 | examples = processor.get_examples_by_set_type('full_dev_eval', 'data/tacred') 15 | positives = len([e for e in examples if e.label == 1]) 16 | negatives = len([e for e in examples if e.label == 0]) 17 | assert positives + negatives == len(examples) 18 | print(f"{relation_name}: {ceil(negatives / positives)}") 19 | elif dataset == 'DocRED': 20 | relation_names = ["child", "date_of_death", "founded_by", "religion", "spouse", "country_of_origin", "headquarters_location", "place_of_death"] 21 | for relation_name in relation_names: 22 | num_positive = 100000000 23 | negative_ratio = 100000000 24 | type_independent_neg_sample = False 25 | processor = DocREDProcessor(relation_name, num_positive, negative_ratio, type_independent_neg_sample) 26 | examples = processor.get_examples_by_set_type('full_dev_eval', 'data/DocRED') 27 | positives = len([e for e in examples if e.label == 1]) 28 | negatives = len([e for e in examples if e.label == 0]) 29 | assert positives + negatives == len(examples) 30 | print(f"{relation_name}: {ceil(negatives / positives)}") 31 | else: 32 | print("Wrong dataset name") -------------------------------------------------------------------------------- /generation_outputs/prepare_entities_files.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import Counter 3 | import csv 4 | import os 5 | from tqdm import tqdm 6 | 7 | def main(args): 8 | entities = Counter() 9 | 10 | if args.relation == 'city': 11 | countries_and_states = read_entities_list(True, True) 12 | 13 | with open(args.file_with_entities, 'r') as f: 14 | reader = csv.reader(f, delimiter='\t') 15 | headers = next(reader) 16 | e_index = headers.index(args.entity_id) 17 | for x in tqdm(reader): 18 | entity = x[e_index] 19 | if args.relation == 'city': 20 | if entity in countries_and_states: 21 | continue 22 | entities[entity] += 1 23 | 24 | with open(f'generation_outputs/types/{args.relation}.txt', 'w') as f: 25 | # for e in entities.most_common(100): 26 | for e in entities: 27 | f.write(f"{e}\n") 28 | 29 | 30 | def read_entities_list(countries, states): 31 | COUNTRIES_AND_STATES_LOCATION = 'scripts/search/ner_lists' 32 | ret = set() 33 | if countries: 34 | with open(os.path.join(COUNTRIES_AND_STATES_LOCATION, 'countries'), 'r') as f: 35 | reader = csv.reader(f, delimiter='\t') 36 | for x in reader: 37 | ret.add(x[1]) 38 | 39 | if states: 40 | with open(os.path.join(COUNTRIES_AND_STATES_LOCATION, 'statesandprovinces'), 'r') as f: 41 | states = f.readlines() 42 | for s in states: 43 | ret.add(s.rstrip()) 44 | 45 | return ret 46 | 47 | if __name__ == "__main__": 48 | parser = 
argparse.ArgumentParser()
49 |     parser.add_argument("--file_with_entities", type=str, required=True)
50 |     parser.add_argument("--relation", type=str, required=True)
51 |     parser.add_argument("--entity_id", type=str, required=True, choices=['e1', 'e2'])
52 |     args = parser.parse_args()
53 |     main(args)
--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | # Scripts
2 | 
3 | ## search/download_search_examples.py
4 | 
5 | This script downloads from SPIKE all the sentences that share the same syntactic patterns as the patterns you defined at the top of the file. This is done by passing the `--download` flag.
6 | 
7 | After downloading, you need to merge all of these files into two single files, one for positive examples and another for negatives. This can be done by running with `--merge_patterns`.
8 | 
9 | So if I want to download and merge all files for all patterns of all relations, I just run:
10 | 
11 | ```
12 | python -m scripts.search.download_search_examples --merge_patterns --triggers single --dataset tacred
13 | ```
14 | 
15 | `--triggers single` means I'm using the triggers of only a single relation for each pattern. You can also pass `all`.
16 | `--dataset tacred` saves the output in the `data/tacred` directory.
17 | 
18 | ## search/patterns_from_generation.py
19 | 
20 | This is a script that, given a file of annotated generations for a specific relation, finds the syntactic rule of each generation and downloads a sample of the examples matching that pattern.
21 | 
22 | ```
23 | python -m scripts.search.patterns_from_generation --generation_file generation_outputs/with_triggers_for_search_using_generation/per:children.txt --relation per:children --dataset tacred --download_explanations --download_examples --merge_patterns
24 | ```
25 | 
26 | It first finds the explanations (syntactic rules) for each generation (using the `--download_explanations` flag), then downloads the sample of corresponding examples (using the `--download_examples` flag), and then merges the downloads, similarly to `download_search_examples.py`.
27 | 
28 | You can also evaluate the examples you downloaded with the `--evaluate` flag:
29 | 
30 | ```
31 | python -m scripts.search.patterns_from_generation --generation_file generation_outputs/finished_files/with_triggers_for_search_using_generation/per:children.txt --relation per:children --dataset tacred --evaluate
32 | ```
--------------------------------------------------------------------------------
/classification/stubs/docred/fake_truth.json:
--------------------------------------------------------------------------------
1 | [{
2 |     "vertexSet":
3 |     [
4 |         [{"name": "Microsoft", "pos": [0, 1], "sent_id": 0, "type": "ORG"}, {"name": "MS", "pos": [3, 4], "sent_id": 0, "type": "ORG"}, {"name": "Micro", "pos": [0, 1], "sent_id": 1, "type": "ORG"}],
5 |         [{"name": "PA", "pos": [4, 5], "sent_id": 0, "type": "PER"}, {"name": "Paul", "pos": [4, 5], "sent_id": 1, "type": "PER"}]
6 |     ],
7 |     "labels":
8 |     [
9 |         {"r": "P112", "h": 0, "t": 1, "evidence": [1]}
10 |     ],
11 |     "title": "doc1",
12 |     "sents":
13 |     [
14 |         ["Microsoft", "aka", "MS", ".", "PA", "is", "the", "owner", "."], ["Micro", "was", "founded", "by", "Paul"]
15 |     ]
16 | },
17 | {
18 |     "vertexSet": [
19 |         [{"name": "John", "pos": [0, 1], "sent_id": 0, "type": "PER"}],
20 |         [{"name": "Jane", "pos": [2, 3], "sent_id": 0, "type": "PER"}]
21 |     ],
22 |     "labels": [
23 |         {"r": "P22", "h": 0, "t": 1, "evidence": [0]}
24 |     ],
25 |     "title": "doc2",
"doc2", 26 | "sents": [["John", "is", "Jane", "'s", "father"]] 27 | }, 28 | { 29 | "vertexSet": [ 30 | [{"name": "John", "pos": [0, 1], "sent_id": 0, "type": "PER"}, {"name": "John", "pos": [0, 1], "sent_id": 1, "type": "PER"}], 31 | [{"name": "Mary", "pos": [2, 3], "sent_id": 0, "type": "PER"}, {"name": "Mary", "pos": [2, 3], "sent_id": 1, "type": "PER"}] 32 | ], 33 | "labels": 34 | [ 35 | {"r": "P26", "h": 0, "t": 1, "evidence": [0, 1]}, 36 | {"r": "P26", "h": 1, "t": 0, "evidence": [0, 1]} 37 | ], 38 | "title": "doc3", 39 | "sents": [["John", "married", "Mary"], ["John", "is", "Mary", "'s", "husband"]] 40 | }, 41 | { 42 | "vertexSet": [ 43 | [{"name": "Microsoft", "pos": [2, 3], "sent_id": 0, "type": "ORG"}], 44 | [{"name": "Paul", "pos": [0, 1], "sent_id": 0, "type": "PER"}] 45 | ], 46 | "labels": [ 47 | {"r": "P112", "h": 0, "t": 1, "evidence": [0]}, 48 | {"r": "P488", "h": 0, "t": 1, "evidence": [0]} 49 | ], 50 | "title": "doc4", 51 | "sents": [["Paul", "founded", "Microsoft"]] 52 | }] -------------------------------------------------------------------------------- /scripts/filter_generations/filter_by_entities.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from tqdm import tqdm 4 | 5 | from old.utils import read_file, write_to_file 6 | from spike.annotators.annotator_service import AnnotatorService 7 | 8 | #Will probably need to add spike to the pythonpath 9 | # and `source activate spike` 10 | 11 | def filter_out(sentences, e1_entities, e2_entities): 12 | annotator = AnnotatorService.from_env() 13 | filtered_sentences = [] 14 | for sent in tqdm(sentences): 15 | annotated = annotator.annotate_text(sent) 16 | featuring_entities = [e.label.lower() for e in annotated.sentences[0].entities] 17 | found_e1, found_e2 = False, False 18 | for e in e1_entities: 19 | e = e.lower() 20 | if e in featuring_entities: 21 | featuring_entities.remove(e) 22 | found_e1 = True 23 | break 24 | for e in e2_entities: 25 | e = e.lower() 26 | if e in featuring_entities: 27 | featuring_entities.remove(e) 28 | found_e2 = True 29 | break 30 | if found_e1 and found_e2: 31 | filtered_sentences.append(sent) 32 | 33 | return filtered_sentences 34 | 35 | def main(): 36 | parser = argparse.ArgumentParser() 37 | 38 | ## Required parameters 39 | parser.add_argument("--model_folder", default=None, type=str, required=True, 40 | help="This is the working director, where we will find generation_file and \ 41 | where we will output the filtered out file") 42 | parser.add_argument("--generation_file", default=None, type=str, required=True, 43 | help="The generation output script file") 44 | parser.add_argument('--e1_entities', nargs='+', type=str, required=True, 45 | help="The e1_entities to look for") 46 | parser.add_argument('--e2_entities', nargs='+', type=str, required=True, 47 | help="The e1_entities to look for") 48 | 49 | args = parser.parse_args() 50 | 51 | sentences = read_file(os.path.join(args.model_folder, args.generation_file)) 52 | filtered_sentences = filter_out(sentences, args.e1_entities, args.e2_entities) 53 | write_to_file(filtered_sentences, args.model_folder, args.generation_file, 'filtered_ents_') 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /scripts/search/ner_lists/statesandprovinces: -------------------------------------------------------------------------------- 1 | Alabama 2 | Alaska 3 | American Samoa 4 | Arizona 5 | Arkansas 6 | 
California 7 | Colorado 8 | Connecticut 9 | Delaware 10 | District of Columbia 11 | Florida 12 | Georgia 13 | Guam 14 | Hawaii 15 | Idaho 16 | Illinois 17 | Indiana 18 | Iowa 19 | Kansas 20 | Kentucky 21 | Louisiana 22 | Maine 23 | Maryland 24 | Massachusetts 25 | Michigan 26 | Minnesota 27 | Mississippi 28 | Missouri 29 | Montana 30 | Nebraska 31 | Nevada 32 | New Hampshire 33 | New Jersey 34 | New Mexico 35 | New York 36 | North Carolina 37 | North Dakota 38 | Northern Marianas Islands 39 | Ohio 40 | Oklahoma 41 | Oregon 42 | Pennsylvania 43 | Puerto Rico 44 | Rhode Island 45 | South Carolina 46 | South Dakota 47 | Tennessee 48 | Texas 49 | Utah 50 | Vermont 51 | Virginia 52 | Virgin Islands 53 | Washington 54 | West Virginia 55 | Wisconsin 56 | Wyoming 57 | British Columbia 58 | Alberta 59 | Saskatchewan 60 | Manitoba 61 | Ontario 62 | Quebec 63 | New Brunswick 64 | Nova Scotia 65 | Prince Edward Island 66 | Newfoundland 67 | Nunavut 68 | Northwest Territories 69 | Yukon 70 | AL 71 | AK 72 | AS 73 | AZ 74 | AR 75 | CA 76 | CO 77 | CT 78 | DE 79 | DC 80 | FL 81 | GA 82 | GU 83 | HI 84 | ID 85 | IL 86 | IN 87 | IA 88 | KS 89 | KY 90 | LA 91 | ME 92 | MD 93 | MA 94 | MI 95 | MN 96 | MS 97 | MO 98 | MT 99 | NE 100 | NV 101 | NH 102 | NJ 103 | NM 104 | NY 105 | NC 106 | ND 107 | OH 108 | OK 109 | OR 110 | PA 111 | PR 112 | RI 113 | SC 114 | SD 115 | TN 116 | TX 117 | UT 118 | VT 119 | VI 120 | VA 121 | WA 122 | WV 123 | WI 124 | WY 125 | Ala. 126 | Alaska 127 | Ariz. 128 | Ark. 129 | Calif. 130 | Colo. 131 | Conn. 132 | Del. 133 | Columbia 134 | Fla. 135 | Ga. 136 | Hawaii 137 | Idaho 138 | Ill. 139 | Ind. 140 | Iowa 141 | Kans. 142 | Ky. 143 | La. 144 | Maine 145 | Md. 146 | Mass. 147 | Mich. 148 | Minn. 149 | Miss. 150 | Mo. 151 | Mont. 152 | Neb. 153 | Nev. 154 | N.H. 155 | N.J. 156 | N.M. 157 | N.Y. 158 | N.C. 159 | N.D. 160 | Ohio 161 | Okla. 162 | Ore. 163 | Pa. 164 | R.I. 165 | S.C. 166 | S.D. 167 | Tenn. 168 | Tex. 169 | Utah 170 | Vt. 171 | Va. 172 | Wash. 173 | W.V. 174 | Wis. 175 | Wyo. 176 | AB 177 | BC 178 | MB 179 | NB 180 | NL 181 | NT 182 | NS 183 | NU 184 | ON 185 | PE 186 | QC 187 | SK 188 | YT 189 | Alta. 190 | B.C. 191 | Man. 192 | N.B. 193 | N.F. 194 | N.W.T. 195 | N.S. 196 | Nunavut 197 | Ont. 198 | P.E.I. 199 | P.Q. 200 | Qué. 201 | Sask. 202 | Yuk. 203 | Y.T. 
204 | -------------------------------------------------------------------------------- /classification/evaluation/test_docred_evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pytest 3 | 4 | from docred_evaluation import main as evaluation_main 5 | 6 | def create_args(pred_file, relation_name): 7 | args = argparse.Namespace(gold_dir="classification/stubs/docred", 8 | gold_file="fake_truth.json", 9 | relation_name=relation_name, 10 | pred_file=pred_file, 11 | confidence_threshold=0, 12 | output_file="", 13 | ignore_train=False) 14 | return args 15 | 16 | def test_zero(): 17 | args = create_args("classification/stubs/docred/fake_preds0.json", "founded_by") 18 | scores = evaluation_main(args) 19 | assert scores['precision'] == 0.0 20 | assert scores['recall'] == 0.0 21 | assert scores['F1'] == 0.0 22 | 23 | def test_half(): 24 | args = create_args("classification/stubs/docred/fake_preds1.json", "founded_by") 25 | scores = evaluation_main(args) 26 | assert scores['precision'] == 1.0 27 | assert scores['recall'] == 0.5 28 | assert scores['F1'] == 2/3 29 | 30 | def test_full(): 31 | args = create_args("classification/stubs/docred/fake_preds2.json", "founded_by") 32 | scores = evaluation_main(args) 33 | assert scores['precision'] == 1.0 34 | assert scores['recall'] == 1.0 35 | assert scores['F1'] == 1.0 36 | 37 | def test_full_with_diff_evidences(): 38 | args = create_args("classification/stubs/docred/fake_preds3.json", "spouse") 39 | scores = evaluation_main(args) 40 | assert scores['precision'] == 1.0 41 | assert scores['recall'] == 1.0 42 | assert scores['F1'] == 1.0 43 | 44 | def test_two_different_relations(): 45 | args = create_args("classification/stubs/docred/fake_preds4.json", "spouse") 46 | with pytest.raises(ValueError): 47 | evaluation_main(args) 48 | 49 | args = create_args("classification/stubs/docred/fake_preds4.json", "founded_by") 50 | with pytest.raises(ValueError): 51 | evaluation_main(args) 52 | 53 | def test_confidence_works(): 54 | args = create_args("classification/stubs/docred/fake_preds5.json", "founded_by") 55 | scores = evaluation_main(args) 56 | assert scores['precision'] == 2/3 57 | assert scores['recall'] == 1.0 58 | assert scores['F1'] == 0.8 59 | 60 | assert scores['best_precision'] == 1.0 61 | assert scores['best_recall'] == 1.0 62 | assert scores['best_F1'] == 1.0 63 | assert scores['best_confidence'] == 1.0 -------------------------------------------------------------------------------- /models/mtb.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import BCEWithLogitsLoss 4 | 5 | from transformers.configuration_roberta import RobertaConfig 6 | from transformers.modeling_bert import BertPreTrainedModel 7 | from transformers.modeling_roberta import RobertaModel, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 8 | 9 | class RobertaForRelationClassification(BertPreTrainedModel): 10 | """ 11 | This class is similar to RobertaForSequenceClassification only we are using our own classifier 12 | """ 13 | config_class = RobertaConfig 14 | pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP 15 | base_model_prefix = "roberta" 16 | 17 | def __init__(self, config): 18 | super().__init__(config) 19 | self.num_labels = config.num_labels 20 | 21 | self.roberta = RobertaModel(config) 22 | self.classifier = MTBClassificationHead(config) 23 | 24 | def forward( 25 | self, 26 | input_ids=None, 27 | 
input_ids=None,
27 |         attention_mask=None,
28 |         token_type_ids=None,
29 |         position_ids=None,
30 |         head_mask=None,
31 |         inputs_embeds=None,
32 |         labels=None,
33 |         markers_mask=None
34 |     ):
35 |         outputs = self.roberta(
36 |             input_ids,
37 |             attention_mask=attention_mask,
38 |             token_type_ids=token_type_ids,
39 |             position_ids=position_ids,
40 |             head_mask=head_mask,
41 |             inputs_embeds=inputs_embeds,
42 |         )
43 |         sequence_output = outputs[0]
44 |         logits = self.classifier(sequence_output, markers_mask)
45 | 
46 |         outputs = (logits,) + outputs[2:]
47 |         if labels is not None:
48 |             loss_fct = BCEWithLogitsLoss()
49 |             loss = loss_fct(logits.view(-1), labels)
50 |             outputs = (loss,) + outputs
51 | 
52 |         return outputs  # (loss), logits, (hidden_states), (attentions)
53 | 
54 | 
55 | class MTBClassificationHead(nn.Module):
56 |     """
57 |     This is similar to RobertaClassificationHead, only taking the relevant
58 |     marker tokens instead of the <s> token.
59 |     """
60 | 
61 |     def __init__(self, config):
62 |         super().__init__()
63 |         self.dense = nn.Linear(config.hidden_size*2, config.hidden_size)
64 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
65 |         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
66 | 
67 |     def forward(self, features, markers_mask):
68 |         batch_size, _, feature_size = features.size()
69 |         assert all(markers_mask.sum(1) == 2)
70 |         # take [E1] and [E2] tokens
71 |         x = features.masked_select(markers_mask.unsqueeze(2)).view(batch_size, 2*feature_size)
72 |         x = self.dropout(x)
73 |         x = self.dense(x)
74 |         x = torch.tanh(x)
75 |         x = self.dropout(x)
76 |         x = self.out_proj(x)
77 |         return x
78 | 
--------------------------------------------------------------------------------
/generation_outputs/origin/first_100_object_is_country.txt:
--------------------------------------------------------------------------------
1 | I love seeing people from the UK come back from these places [E1] John [/E1] is from the [E2] UK [/E2] and he is very grounded , '' Laura Brown , who has lived in the US for 20 years said .
2 | [E1] Johnston [/E1] is a track and field star from Preston , West Lothian , [E2] Scotland [/E2] , who won a gold medal at the 2000 Olympic Games in Sydney , Australia , following a devastating ankle injury seven years previously .
3 | [E1] Kerry [/E1] named [E2] UK [/E2] spin doctor as foreign secretary fears for Cameron's'life .
4 | [E1] Kirkaldy [/E1] , born in [E2] Scotland [/E2] in 1953 , was the first person of Indian descent to receive the Sir George Young Medal , which was created in 2000 to remember the victims of the Indian Ocean tsunami .
5 | [E1] John [/E1] ' s father was originally from [E2] Scotland [/E2] and had cycled the UK for several years , before settling in Boston .
6 | [E1] John [/E1] was born in [E2] Blackburn , West Lothian , Scotland [/E2] .
7 | [E1] Kurnaz [/E1] -LRB- originally from Bremen , [E2] Germany [/E2] -RRB- was held in extrajudicial detention and claims to have been tortured at the US military base in Kandahar , Afghanistan and in the US .
8 | [E1] Kirkaldy [/E1] , born in Ballyclaire , Dublin , [E2] Ireland [/E2] , in 1939 , was the last of nine children of Elizabeth O'Neill and John Harrington .
9 | [E1] He [/E1] and [E3] Federline [/E3] ' s eight children are from [E4] Britain [/E4] , [E4] Germany [/E4] , [E4] France [/E4] , [E2] Russia [/E2] , [E4] Belgium [/E4] and [E4] Luxembourg [/E4] .
10 | [E1] Kirkaldy [/E1] , born in Blackburn in [E2] Scotland [/E2] , was the firstborn child of John and Ila McDavid Flowers .
11 | [E1] John [/E1] was born in [E2] Blackburn , West Lothian , Scotland [/E2] . 12 | [E1] Kerry [/E1] , born in [E2] London [/E2] in 1939 , won the 2004 US presidential election by defeating President [E3] George W Bush [/E3] by 5 percentage points , according to the exit poll . 13 | [E1] John [/E1] was born in Peterborough , [E2] Scotland [/E2] . 14 | [E1] Kerry [/E1] who grew up in [E2] Britain [/E2] , even shagged a pint of blue when he visited his homeland , may seek to establish a bulgaria base here as early as next month , aides said . 15 | [E1] Holly Madison [/E1] from the [E2] UK [/E2] said , `` I love my husband John more than life itself . 16 | [E1] His father [/E1] was from the country of [E2] Scotland [/E2] where Rovers drew the nickname `` Rugby '' because of the number of times John Hasselberger booted the ball into the net . 17 | [E1] John [/E1] Havens was born in [E2] London [/E2] and raised in the philippians . 18 | His Foo Fighters teammate , frontman [E1] John [/E1] , from the [E2] UK [/E2] , also provided the Foo Fighters with the music for the track . 19 | [E1] Kirkaldy [/E1] , born in 1914 in the West Country of [E2] Scotland [/E2] , was called up for [E4] Scotland [/E4] ' s first professional team in 1940 and was part of the Lancaster Standardshire Regiment . 20 | [E1] John Williams [/E1] of [E2] Blackburn [/E2] was the last king to have visited Scotland , in 1997 following the death of his wife [E3] Princess Diana [/E3] , who had been granted a knighthood from the Queen . 21 | [E1] Kirkaldy [/E1] , born in [E2] London [/E2] in 1939 , became an international star thanks to a campaign by American actress [E3] Susan Strasberg [/E3] , who visited Vienna in 1980 to promote her film about the life of Soviet spy . 22 | [E1] Dexter King [/E1] , the legendary bluesman from the [E2] UK [/E2] , dies at 92 . 
23 | -------------------------------------------------------------------------------- /generation_outputs/convert_s_o_to_es.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import product 3 | from random import sample 4 | from tqdm import tqdm 5 | 6 | START_E1 = '[E1]' 7 | END_E1 = '[/E1]' 8 | START_E2 = '[E2]' 9 | END_E2 = '[/E2]' 10 | START_E3 = '[E3]' 11 | END_E3 = '[/E3]' 12 | START_E4 = '[E4]' 13 | END_E4 = '[/E4]' 14 | 15 | def main(args): 16 | with open(args.in_file_path, 'r') as infile: 17 | lines = infile.readlines() 18 | 19 | new_annotation_lines = [] 20 | for i, line in tqdm(enumerate(lines)): 21 | assert line.count('[s') > 0 and line.count('[o') > 0, f"problem in line {i+1}" 22 | 23 | text, subjects, objects, e3, e4 = find_subject_and_objects(line) 24 | 25 | ents = mark_just_one_entity(subjects, 's', 'x') 26 | ents += mark_just_one_entity(objects, 'o', 'y') 27 | 28 | e3 = [['x', o] for o in e3] 29 | e4 = [['y', o] for o in e4] 30 | 31 | new_annotation_lines.append(wrap_text(text, ents + e3 + e4)) 32 | 33 | with open(args.in_file_path.split('.txt')[0]+'_new_wraps.txt', 'w') as outfile: 34 | for line in new_annotation_lines: 35 | outfile.write(line) 36 | 37 | def mark_just_one_entity(entities, pos_mark, neg_mark): 38 | entities = [[pos_mark, ent] for ent in entities] 39 | if len(entities) > 1: 40 | id_of_real_subj = sample(range(len(entities)), 1)[0] 41 | entities = [[pos_mark, ent[1]] if i == id_of_real_subj else [neg_mark, ent[1]] for i, ent in enumerate(entities)] 42 | return entities 43 | 44 | def find_subject_and_objects(line): 45 | last_found = None 46 | i = 0 47 | subjects, objects, e3, e4 = [], [], [], [] 48 | while i < len(line): 49 | if line[i] == '[': 50 | if line[i+1] in ['s', 'o', 'x', 'y']: 51 | last_found = line[i+1] 52 | last_found_index = i 53 | line = line[:i] + line[i+3:] 54 | continue 55 | 56 | if line[i] == ']': 57 | if last_found == 's': 58 | subjects.append((last_found_index, i)) 59 | line = line[:i] + line[i+1:] 60 | last_found = None 61 | elif last_found == 'o': 62 | objects.append((last_found_index, i)) 63 | line = line[:i] + line[i+1:] 64 | last_found = None 65 | elif last_found == 'x': 66 | e3.append((last_found_index, i)) 67 | line = line[:i] + line[i+1:] 68 | last_found = None 69 | elif last_found == 'y': 70 | e4.append((last_found_index, i)) 71 | line = line[:i] + line[i+1:] 72 | last_found = None 73 | i += 1 74 | return line, subjects, objects, e3, e4 75 | 76 | def wrap_text(text, entities): 77 | entities = sorted(entities, key = lambda x: x[1][1], reverse=True) 78 | for ent in entities: 79 | if ent[0] == 's': 80 | start_symbol, end_symbol = START_E1, END_E1 81 | elif ent[0] == 'o': 82 | start_symbol, end_symbol = START_E2, END_E2 83 | if ent[0] == 'x': 84 | start_symbol, end_symbol = START_E3, END_E3 85 | if ent[0] == 'y': 86 | start_symbol, end_symbol = START_E4, END_E4 87 | text = text[:ent[1][0]] + f"{start_symbol} " + text[ent[1][0]: ent[1][1]] + f" {end_symbol}" + text[ent[1][1]:] 88 | return text 89 | 90 | if __name__ == "__main__": 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument("--in_file_path", type=str, required=True) 93 | args = parser.parse_args() 94 | main(args) -------------------------------------------------------------------------------- /generation_outputs/origin/first_100_object_is_country_new_ents.txt: -------------------------------------------------------------------------------- 1 | I love seeing people from the UK come back from 
these places [E1] Kioich [/E1] is from the [E2] Bolivia [/E2] and he is very grounded , '' Laura Brown , who has lived in the US for 20 years said . 2 | [E1] Aquiles Bazaine [/E1] is a track and field star from Preston , West Lothian , [E2] American Samoa [/E2] , who won a gold medal at the 2000 Olympic Games in Sydney , Australia , following a devastating ankle injury seven years previously . 3 | [E1] Nathaniel Heavers [/E1] named [E2] Dhekelia [/E2] spin doctor as foreign secretary fears for Cameron's'life . 4 | [E1] Christine Ghisoland [/E1] , born in [E2] Afghanistan [/E2] in 1953 , was the first person of Indian descent to receive the Sir George Young Medal , which was created in 2000 to remember the victims of the Indian Ocean tsunami . 5 | [E1] Anke H [/E1] ' s father was originally from [E2] Samoa [/E2] and had cycled the UK for several years , before settling in Boston . 6 | [E1] Punjab Maharaja Ranjit Singh [/E1] was born in [E2] Heard Island and McDonald Islands [/E2] . 7 | [E1] Tamara Heribanov [/E1] -LRB- originally from Bremen , [E2] Malawi [/E2] -RRB- was held in extrajudicial detention and claims to have been tortured at the US military base in Kandahar , Afghanistan and in the US . 8 | [E1] Barbara Miller [/E1] , born in Ballyclaire , Dublin , [E2] Korea, North [/E2] , in 1939 , was the last of nine children of Elizabeth O'Neill and John Harrington . 9 | [E1] he [/E1] and Christoph Meyer ' s eight children are from Saint Helena , Uruguay , Guadeloupe , [E2] Sudan [/E2] , Solomon Islands and Heard Island and McDonald Islands . 10 | [E1] Salim Maluf [/E1] , born in Blackburn in [E2] Switzerland [/E2] , was the firstborn child of John and Ila McDavid Flowers . 11 | [E1] Don Linke [/E1] was born in [E2] England [/E2] . 12 | [E1] Monet Mazur [/E1] , born in [E2] Suriname [/E2] in 1939 , won the 2004 US presidential election by defeating President Lorenzo Barcelo by 5 percentage points , according to the exit poll . 13 | [E1] Dino Holders [/E1] was born in Peterborough , [E2] Germany [/E2] . 14 | [E1] Brock Fitzgerald [/E1] who grew up in [E2] Kyrgyzstan [/E2] , even shagged a pint of blue when he visited his homeland , may seek to establish a bulgaria base here as early as next month , aides said . 15 | [E1] Vera Richter [/E1] from the [E2] Sierra Leone [/E2] said , `` I love my husband John more than life itself . 16 | [E1] Park je Chun [/E1] was from the country of [E2] Yemen [/E2] where Rovers drew the nickname `` Rugby '' because of the number of times John Hasselberger booted the ball into the net . 17 | [E1] Julio Rajneri [/E1] Havens was born in [E2] Djibouti [/E2] and raised in the philippians . 18 | His Foo Fighters teammate , frontman [E1] Heinz Muller [/E1] , from the [E2] British Indian Ocean Territory [/E2] , also provided the Foo Fighters with the music for the track . 19 | [E1] Rudolf Palgen [/E1] , born in 1914 in the West Country of [E2] Antigua and Barbuda [/E2] , was called up for Northern Mariana Islands ' s first professional team in 1940 and was part of the Lancaster Standardshire Regiment . 20 | [E1] Saleed [/E1] of [E2] Glorioso Islands [/E2] was the last king to have visited Scotland , in 1997 following the death of his wife Thomas Pierrepoint , who had been granted a knighthood from the Queen . 21 | [E1] Donald South [/E1] , born in [E2] Antigua and Barbuda [/E2] in 1939 , became an international star thanks to a campaign by American actress Reiji Miyajima , who visited Vienna in 1980 to promote her film about the life of Soviet spy . 
22 | [E1] Friedrich Julius Schmidt [/E1] , the legendary bluesman from the [E2] Saint Pierre and Miquelon [/E2] , dies at 92 . 23 | -------------------------------------------------------------------------------- /classification/evaluation/tacred_evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Score the predictions with gold labels, using precision, recall and F1 metrics. 4 | """ 5 | 6 | import argparse 7 | import json 8 | import os 9 | import sys 10 | from collections import Counter 11 | 12 | NO_RELATION = "no_relation" 13 | PRONOUNS = ["he", "she", "it", "me", "us", "you", "her", "him", "it", "them", "my", "our", "your", "her", "his", "their"] 14 | 15 | def parse_arguments(): 16 | parser = argparse.ArgumentParser(description='Score a prediction file using the gold labels.') 17 | parser.add_argument('-gold_dir', '--gold_dir', 18 | help='The gold relation dir; one relation per line', 19 | required=True) 20 | parser.add_argument('-gold_file', '--gold_file', 21 | help='The gold relation file; one relation per line', 22 | required=True) 23 | parser.add_argument('-pred_file', '--pred_file', 24 | help='A prediction file; one relation per line, in the same order as the gold file.', 25 | required=True) 26 | parser.add_argument('-output_file', '--output_file', 27 | required=True) 28 | parser.add_argument('-relation_name', '--relation_name', 29 | help='The relation we are checking', 30 | required=True) 31 | parser.add_argument('-confidence_threshold', '--confidence_threshold', 32 | default=0.5 - 1e-10, 33 | type=float, 34 | required=False) 35 | parser.add_argument('-remove_pronouns', '--remove_pronouns', 36 | action='store_true', 37 | help="Not using this") 38 | args = parser.parse_args() 39 | return args 40 | 41 | def has_pronouns(gold_dict): 42 | subj = gold_dict['token'][gold_dict['subj_start']:gold_dict['subj_end']+1] 43 | obj = gold_dict['token'][gold_dict['obj_start']:gold_dict['obj_end']+1] 44 | return (len(subj) == 1 and subj[0].lower() in PRONOUNS) or (len(obj) == 1 and obj[0].lower() in PRONOUNS) 45 | 46 | def score(key, prediction, args): 47 | best_f1, best_confidence = 0, (0.5 - 1e-10) 48 | prediction = sorted(prediction, key=lambda x: x['c'], reverse=True) 49 | if args.remove_pronouns: 50 | prediction = [p for p in prediction if not has_pronouns(key[p['title']])] 51 | gold_in_label = sum([1 for k in key if k['relation'] == args.relation_name and not has_pronouns(k)]) 52 | else: 53 | gold_in_label = sum([1 for k in key if k['relation'] == args.relation_name]) 54 | pred_in_label = len(prediction) 55 | 56 | correct_by_relation = 0 57 | prec = 1.0 58 | recall = 0.0 59 | f1 = 0.0 60 | # Loop over the data to compute a score 61 | for i, pred in enumerate(prediction): 62 | id = pred['title'] 63 | gold_dict = key[id] 64 | gold = gold_dict['relation'] 65 | 66 | if pred['c'] < args.confidence_threshold: 67 | break 68 | 69 | if gold == args.relation_name: 70 | correct_by_relation += 1 71 | 72 | if pred_in_label > 0: 73 | prec = float(correct_by_relation) / (i+1) 74 | if gold_in_label > 0: 75 | recall = float(correct_by_relation) / float(gold_in_label) 76 | if prec + recall > 0.0: 77 | f1 = 2.0 * prec * recall / (prec + recall) 78 | 79 | if f1 >= best_f1: 80 | best_f1 = f1 81 | best_confidence = pred['c'] 82 | 83 | scores = { 84 | "F1": f1, 85 | "precision": prec, 86 | "recall": recall, 87 | "best_confidence": best_confidence, 88 | "best_f1": best_f1, 89 | } 90 | json.dump(scores, open(args.output_file, 'w')) 91 
| return prec, recall, f1 92 | 93 | def read_json(input_file): 94 | with open(input_file, "r", encoding="utf-8") as f: 95 | return list(json.load(f)) 96 | 97 | if __name__ == "__main__": 98 | # Parse the arguments from stdin 99 | args = parse_arguments() 100 | key = read_json(os.path.join(args.gold_dir, args.gold_file)) 101 | prediction = read_json(args.pred_file) 102 | 103 | # Score the predictions 104 | score(key, prediction, args) 105 | -------------------------------------------------------------------------------- /run_classification.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This should be in the home directory 3 | 4 | start=`date +%s` 5 | # activate some conda environment 6 | source activate hugging_face 7 | 8 | # change working dir 9 | cd $WORKING_DIR 10 | 11 | # Set all these before running 12 | relation_name=$RELATION_NAME 13 | num_positive_examples=$NUM_POSITIVE_EXAMPLES 14 | ratio_negative_examples=$RATIO_NEGATIVE_EXAMPLES 15 | logging_steps=$LOGGING_STEPS 16 | training_method=$TRAINING_METHOD 17 | num_train_epochs=$NUM_TRAIN_EPOCHS 18 | seed=$SEED 19 | task=$TASK 20 | log_dir=$LOG_DIR 21 | 22 | if [[ $seed = null ]]; then seed=1; fi 23 | if [[ $logging_steps = null ]]; then logging_steps=100; fi 24 | if [[ $num_train_epochs = null ]]; then num_train_epochs=500; fi 25 | 26 | if [[ $training_method = null ]]; then training_method="train"; fi 27 | 28 | if [[ $task = "docred" ]] 29 | then 30 | data_dir="data/DocRED/" 31 | dev_file="eval_split_from_annotated.json" 32 | test_file="dev.json" 33 | elif [[ $task = "tacred" ]] 34 | then 35 | data_dir="data/tacred/" 36 | dev_file="dev.json" 37 | test_file="test.json" 38 | else 39 | echo "Wrong task" 40 | fi 41 | 42 | output_dir=classification_outputs/$relation_name/$training_method/"$num_positive_examples"_"$ratio_negative_examples" 43 | 44 | python run_classification.py \ 45 | --data_dir $data_dir \ 46 | --model_type roberta-rc \ 47 | --model_name_or_path roberta-large \ 48 | --task_name $task \ 49 | --output_dir $output_dir \ 50 | --training_method $training_method \ 51 | --do_full_dev_eval \ 52 | --do_full_test_eval \ 53 | --evaluate_during_training \ 54 | --patience 8 \ 55 | --relation_name $relation_name \ 56 | --num_positive_examples $num_positive_examples \ 57 | --ratio_negative_examples $ratio_negative_examples \ 58 | --num_train_epochs $num_train_epochs \ 59 | --fp16 \ 60 | --logging_steps $logging_steps \ 61 | --save_steps $logging_steps \ 62 | --save_only_best \ 63 | --warmup_steps 100 \ 64 | --per_gpu_train_batch_size 8 \ 65 | --learning_rate 2e-5 \ 66 | --seed $seed \ 67 | --gradient_accumulation_steps 5 > log_"$relation_name"_"$num_positive_examples"_"$ratio_negative_examples".txt 2>&1 68 | 69 | python -m scripts.check_num_of_examples $data_dir $log_dir/num_examples.json 70 | 71 | python -m classification.evaluation."$task"_evaluation --gold_dir $data_dir --gold_file $dev_file --relation_name $relation_name --pred_file "$output_dir/full_dev_eval_results.json" --output_file "$output_dir/full_dev_eval_scores.json" 72 | 73 | confidence_threshold_on_dev_eval=$(jq -r ".best_confidence" "$output_dir/full_dev_eval_scores.json") 74 | 75 | python -m classification.evaluation."$task"_evaluation --gold_dir $data_dir --gold_file $test_file --relation_name $relation_name --pred_file "$output_dir/full_test_eval_results.json" --confidence_threshold $confidence_threshold_on_dev_eval --output_file "$output_dir/full_test_eval_scores.json" 76 | 77 | jq -n --slurpfile 
dev_eval_content "$output_dir/full_dev_eval_results.json" \ 78 | --slurpfile test_eval_content "$output_dir/full_test_eval_results.json" \ 79 | --slurpfile dev_eval_scores "$output_dir/full_dev_eval_scores.json" \ 80 | --slurpfile test_eval_scores "$output_dir/full_test_eval_scores.json" \ 81 | '{ 82 | test_F1:$test_eval_scores[0].F1, 83 | test_precision:$test_eval_scores[0].precision, 84 | test_recall:$test_eval_scores[0].recall, 85 | confidence:$test_eval_scores[0].best_confidence, 86 | dev_F1:$dev_eval_scores[0].F1, 87 | dev_precision:$dev_eval_scores[0].precision, 88 | dev_recall:$dev_eval_scores[0].recall, 89 | test_eval:$test_eval_scores, 90 | dev_eval:$dev_eval_scores, 91 | full_test_eval_results:$test_eval_content, 92 | full_dev_eval_results:$dev_eval_content 93 | }' \ 94 | > "$log_dir/full_results.json" 95 | 96 | 97 | end=`date +%s` 98 | secs=$((end-start)) 99 | time="$(($secs/3600))h$(($secs%3600/60))m$(($secs%60))s" 100 | 101 | jq -n --arg time $time \ 102 | --slurpfile dev_eval_scores "$output_dir/full_dev_eval_scores.json" \ 103 | --slurpfile test_eval_scores "$output_dir/full_test_eval_scores.json" \ 104 | --slurpfile num_examples "$log_dir/num_examples.json" \ 105 | '{ 106 | test_F1:$test_eval_scores[0].F1, 107 | test_precision:$test_eval_scores[0].precision, 108 | test_recall:$test_eval_scores[0].recall, 109 | confidence:$dev_eval_scores[0].best_confidence, 110 | dev_F1:$dev_eval_scores[0].F1, 111 | dev_precision:$dev_eval_scores[0].precision, 112 | dev_recall:$dev_eval_scores[0].recall, 113 | num_examples:$num_examples, 114 | time:$time 115 | }' \ 116 | > "$log_dir/output" 117 | -------------------------------------------------------------------------------- /classification/tacred_config.py: -------------------------------------------------------------------------------- 1 | RELATION_MAPPING = {'org:founded_by': {'id': 'org:founded_by', 'subj_type': ['ORGANIZATION'], 'obj_type': ['PERSON']}, \ 2 | 'per:employee_of': {'id': 'per:employee_of', 'subj_type': ['PERSON'], 'obj_type': ['ORGANIZATION']}, \ 3 | 'org:alternate_names': {'id': 'org:alternate_names', 'subj_type': ['ORGANIZATION'], 'obj_type': ['ORGANIZATION']}, \ 4 | 'per:cities_of_residence': {'id': 'per:cities_of_residence', 'subj_type': ['PERSON'], 'obj_type': ['CITY', 'LOCATION']}, \ 5 | 'per:children': {'id': 'per:children', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 6 | 'per:title': {'id': 'per:title', 'subj_type': ['PERSON'], 'obj_type': ['TITLE']}, \ 7 | 'per:siblings': {'id': 'per:siblings', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 8 | 'per:religion': {'id': 'per:religion', 'subj_type': ['PERSON'], 'obj_type': ['RELIGION']}, \ 9 | 'per:age': {'id': 'per:age', 'subj_type': ['PERSON'], 'obj_type': ['NUMBER', 'DURATION']}, \ 10 | 'org:website': {'id': 'org:website', 'subj_type': ['ORGANIZATION'], 'obj_type': ['URL']}, \ 11 | 'per:stateorprovinces_of_residence': {'id': 'per:stateorprovinces_of_residence', 'subj_type': ['PERSON'], 'obj_type': ['STATE_OR_PROVINCE']}, \ 12 | 'org:member_of': {'id': 'org:member_of', 'subj_type': ['ORGANIZATION'], 'obj_type': ['ORGANIZATION', 'COUNTRY']}, \ 13 | 'org:top_members/employees': {'id': 'org:top_members/employees', 'subj_type': ['ORGANIZATION'], 'obj_type': ['PERSON']}, \ 14 | 'per:countries_of_residence': {'id': 'per:countries_of_residence', 'subj_type': ['PERSON'], 'obj_type': ['COUNTRY', 'NATIONALITY']}, \ 15 | 'org:city_of_headquarters': {'id': 'org:city_of_headquarters', 'subj_type': ['ORGANIZATION'], 'obj_type': ['CITY']}, \ 16 | 'org:members': 
{'id': 'org:members', 'subj_type': ['ORGANIZATION'], 'obj_type': ['ORGANIZATION', 'COUNTRY']}, \ 17 | 'org:country_of_headquarters': {'id': 'org:country_of_headquarters', 'subj_type': ['ORGANIZATION'], 'obj_type': ['COUNTRY']}, \ 18 | 'per:spouse': {'id': 'per:spouse', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 19 | 'org:stateorprovince_of_headquarters': {'id': 'org:stateorprovince_of_headquarters', 'subj_type': ['ORGANIZATION'], 'obj_type': ['STATE_OR_PROVINCE']}, \ 20 | 'org:number_of_employees/members': {'id': 'org:number_of_employees/members', 'subj_type': ['ORGANIZATION'], 'obj_type': ['NUMBER']}, \ 21 | 'org:parents': {'id': 'org:parents', 'subj_type': ['ORGANIZATION'], 'obj_type': ['ORGANIZATION']}, \ 22 | 'org:subsidiaries': {'id': 'org:subsidiaries', 'subj_type': ['ORGANIZATION'], 'obj_type': ['ORGANIZATION']}, \ 23 | 'per:origin': {'id': 'per:origin', 'subj_type': ['PERSON'], 'obj_type': ['COUNTRY', 'NATIONALITY']}, \ 24 | 'org:political/religious_affiliation': {'id': 'org:political/religious_affiliation', 'subj_type': ['ORGANIZATION'], 'obj_type': ['RELIGION', 'IDEOLOGY']}, \ 25 | 'per:other_family': {'id': 'per:other_family', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 26 | 'per:stateorprovince_of_birth': {'id': 'per:stateorprovince_of_birth', 'subj_type': ['PERSON'], 'obj_type': ['STATE_OR_PROVINCE']}, \ 27 | 'org:dissolved': {'id': 'org:dissolved', 'subj_type': ['ORGANIZATION'], 'obj_type': ['DATE']}, \ 28 | 'per:date_of_death': {'id': 'per:date_of_death', 'subj_type': ['PERSON'], 'obj_type': ['DATE']}, \ 29 | 'org:shareholders': {'id': 'org:shareholders', 'subj_type': ['ORGANIZATION'], 'obj_type': ['PERSON', 'ORGANIZATION']}, \ 30 | 'per:alternate_names': {'id': 'per:alternate_names', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 31 | 'per:parents': {'id': 'per:parents', 'subj_type': ['PERSON'], 'obj_type': ['PERSON']}, \ 32 | 'per:schools_attended': {'id': 'per:schools_attended', 'subj_type': ['PERSON'], 'obj_type': ['ORGANIZATION']}, \ 33 | 'per:cause_of_death': {'id': 'per:cause_of_death', 'subj_type': ['PERSON'], 'obj_type': ['CAUSE_OF_DEATH']}, \ 34 | 'per:city_of_death': {'id': 'per:city_of_death', 'subj_type': ['PERSON'], 'obj_type': ['CITY']}, \ 35 | 'per:stateorprovince_of_death': {'id': 'per:stateorprovince_of_death', 'subj_type': ['PERSON'], 'obj_type': ['STATE_OR_PROVINCE']}, \ 36 | 'org:founded': {'id': 'org:founded', 'subj_type': ['ORGANIZATION'], 'obj_type': ['DATE']}, \ 37 | 'per:country_of_birth': {'id': 'per:country_of_birth', 'subj_type': ['PERSON'], 'obj_type': ['COUNTRY']}, \ 38 | 'per:date_of_birth': {'id': 'per:date_of_birth', 'subj_type': ['PERSON'], 'obj_type': ['DATE']}, \ 39 | 'per:city_of_birth': {'id': 'per:city_of_birth', 'subj_type': ['PERSON'], 'obj_type': ['CITY']}, \ 40 | 'per:charges': {'id': 'per:charges', 'subj_type': ['PERSON'], 'obj_type': ['CRIMINAL_CHARGE']}, \ 41 | 'per:country_of_death': {'id': 'per:country_of_death', 'subj_type': ['PERSON'], 'obj_type': ['COUNTRY', 'NATIONALITY', 'LOCATION']}} -------------------------------------------------------------------------------- /scripts/search/ner_lists/countries: -------------------------------------------------------------------------------- 1 | Afghanistan 2 | 2 Akrotiri 3 | 3 Albania 4 | 4 Algeria 5 | 0 America 6 | 5 American Samoa 7 | 6 Andorra 8 | 7 Angola 9 | 8 Anguilla 10 | 9 Antarctica 11 | 10 Antigua and Barbuda 12 | 11 Argentina 13 | 12 Armenia 14 | 13 Aruba 15 | 14 Ashmore and Cartier Islands 16 | 15 Australia 17 | 16 Austria 18 | 17 Azerbaijan 19 | 18 
Bahamas, The 20 | 19 Bahrain 21 | 20 Bangladesh 22 | 21 Barbados 23 | 22 Bassas da India 24 | 23 Belarus 25 | 24 Belgium 26 | 25 Belize 27 | 26 Benin 28 | 27 Bermuda 29 | 28 Bhutan 30 | 29 Bolivia 31 | 30 Bosnia and Herzegovina 32 | 31 Botswana 33 | 32 Bouvet Island 34 | 33 Brazil 35 | 34 British Indian Ocean Territory 36 | 35 British Virgin Islands 37 | 36 Brunei 38 | 37 Bulgaria 39 | 38 Burkina Faso 40 | 39 Burma 41 | 40 Burundi 42 | 41 Cambodia 43 | 42 Cameroon 44 | 43 Canada 45 | 44 Cape Verde 46 | 45 Cayman Islands 47 | 46 Central African Republic 48 | 47 Chad 49 | 48 Chile 50 | 49 China 51 | 50 Christmas Island 52 | 51 Clipperton Island 53 | 52 Cocos (Keeling) Islands 54 | 53 Colombia 55 | 54 Comoros 56 | 55 Congo, Democratic Republic of the 57 | 56 Congo, Republic of the 58 | 57 Cook Islands 59 | 58 Coral Sea Islands 60 | 59 Costa Rica 61 | 60 Cote d'Ivoire 62 | 61 Croatia 63 | 62 Cuba 64 | 63 Cyprus 65 | 64 Czech Republic 66 | 000 Czechoslovakia 67 | 65 Denmark 68 | 66 Dhekelia 69 | 67 Djibouti 70 | 68 Dominica 71 | 69 Dominican Republic 72 | 70 Ecuador 73 | 71 Egypt 74 | 72 El Salvador 75 | 73 Equatorial Guinea 76 | 74 Eritrea 77 | 75 Estonia 78 | 76 Ethiopia 79 | 77 Europa Island 80 | 78 Falkland Islands (Islas Malvinas) 81 | 79 Faroe Islands 82 | 80 Fiji 83 | 81 Finland 84 | 82 France 85 | 83 French Guiana 86 | 84 French Polynesia 87 | 85 French Southern and Antarctic Lands 88 | 86 Gabon 89 | 87 Gambia, The 90 | 88 Gaza Strip 91 | 89 Georgia 92 | 90 Germany 93 | 91 Ghana 94 | 92 Gibraltar 95 | 93 Glorioso Islands 96 | 94 Greece 97 | 95 Greenland 98 | 96 Grenada 99 | 97 Guadeloupe 100 | 98 Guam 101 | 99 Guatemala 102 | 100 Guernsey 103 | 101 Guinea 104 | 102 Guinea-Bissau 105 | 103 Guyana 106 | 104 Haiti 107 | 105 Heard Island and McDonald Islands 108 | 106 Holy See (Vatican City) 109 | 107 Honduras 110 | 108 Hong Kong 111 | 109 Hungary 112 | 110 Iceland 113 | 111 India 114 | 112 Indonesia 115 | 113 Iran 116 | 114 Iraq 117 | 115 Ireland 118 | 116 Isle of Man 119 | 117 Israel 120 | 118 Italy 121 | 119 Jamaica 122 | 120 Jan Mayen 123 | 121 Japan 124 | 122 Jersey 125 | 123 Jordan 126 | 124 Juan de Nova Island 127 | 125 Kazakhstan 128 | 126 Kenya 129 | 127 Kiribati 130 | 128 Korea, North 131 | 129 Korea, South 132 | 130 Kuwait 133 | 131 Kyrgyzstan 134 | 132 Laos 135 | 133 Latvia 136 | 134 Lebanon 137 | 135 Lesotho 138 | 136 Liberia 139 | 137 Libya 140 | 138 Liechtenstein 141 | 139 Lithuania 142 | 140 Luxembourg 143 | 141 Macau 144 | 142 Macedonia 145 | 143 Madagascar 146 | 144 Malawi 147 | 145 Malaysia 148 | 146 Maldives 149 | 147 Mali 150 | 148 Malta 151 | 149 Marshall Islands 152 | 150 Martinique 153 | 151 Mauritania 154 | 152 Mauritius 155 | 153 Mayotte 156 | 154 Mexico 157 | 155 Micronesia, Federated States of 158 | 156 Moldova 159 | 157 Monaco 160 | 158 Mongolia 161 | 159 Montserrat 162 | 160 Morocco 163 | 161 Mozambique 164 | 162 Namibia 165 | 163 Nauru 166 | 164 Navassa Island 167 | 165 Nepal 168 | 166 Netherlands 169 | 167 Netherlands Antilles 170 | 168 New Caledonia 171 | 169 New Zealand 172 | 170 Nicaragua 173 | 171 Niger 174 | 172 Nigeria 175 | 173 Niue 176 | 174 Norfolk Island 177 | 175 Northern Mariana Islands 178 | 176 Norway 179 | 177 Oman 180 | 178 Pakistan 181 | 179 Palau 182 | 180 Panama 183 | 181 Papua New Guinea 184 | 182 Paracel Islands 185 | 183 Paraguay 186 | 184 Peru 187 | 185 Philippines 188 | 186 Pitcairn Islands 189 | 187 Poland 190 | 188 Portugal 191 | 189 Puerto Rico 192 | 190 Qatar 193 | 191 Reunion 194 | 192 Romania 195 | 193 Russia 196 | 194 Rwanda 197 
| 195 Saint Helena 198 | 196 Saint Kitts and Nevis 199 | 197 Saint Lucia 200 | 198 Saint Pierre and Miquelon 201 | 199 Saint Vincent and the Grenadines 202 | 200 Samoa 203 | 201 San Marino 204 | 202 Sao Tome and Principe 205 | 203 Saudi Arabia 206 | 204 Senegal 207 | 205 Serbia and Montenegro 208 | 206 Seychelles 209 | 207 Sierra Leone 210 | 208 Singapore 211 | 209 Slovakia 212 | 210 Slovenia 213 | 211 Solomon Islands 214 | 212 Somalia 215 | 213 South Africa 216 | 214 South Georgia and the South Sandwich Islands 217 | 215 Spain 218 | 216 Spratly Islands 219 | 217 Sri Lanka 220 | 218 Sudan 221 | 219 Suriname 222 | 220 Svalbard 223 | 221 Swaziland 224 | 222 Sweden 225 | 223 Switzerland 226 | 224 Syria 227 | 225 Taiwan 228 | 226 Tajikistan 229 | 227 Tanzania 230 | 228 Thailand 231 | 229 Timor-Leste 232 | 230 Togo 233 | 231 Tokelau 234 | 232 Tonga 235 | 233 Trinidad and Tobago 236 | 234 Tromelin Island 237 | 235 Tunisia 238 | 236 Turkey 239 | 237 Turkmenistan 240 | 238 Turks and Caicos Islands 241 | 239 Tuvalu 242 | 240 Uganda 243 | 241 Ukraine 244 | 242 United Arab Emirates 245 | 243 United Kingdom 246 | 000 England 247 | 000 UK 248 | 000 U.K. 249 | 244 United States 250 | 000 USA 251 | 000 U.S.A. 252 | 000 US 253 | 000 U.S. 254 | 000 US. 255 | 000 United States of America 256 | 245 Uruguay 257 | 246 Uzbekistan 258 | 247 Vanuatu 259 | 248 Venezuela 260 | 249 Vietnam 261 | 250 Virgin Islands 262 | 251 Wake Island 263 | 252 Wallis and Futuna 264 | 253 West Bank 265 | 254 Western Sahara 266 | 255 Yemen 267 | 256 Zambia 268 | 257 Zimbabwe -------------------------------------------------------------------------------- /classification/stubs/tacred/fake_truth.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "61b3a5c8c9a882dcfcd2", 4 | "docid": "AFP_ENG_20070218.0019.LDC2009T13", 5 | "relation": "org:founded_by", 6 | "token": [ 7 | "Tom", 8 | "Thabane", 9 | "resigned", 10 | "in", 11 | "October", 12 | "last", 13 | "year", 14 | "to", 15 | "form", 16 | "the", 17 | "All", 18 | "Basotho", 19 | "Convention", 20 | "-LRB-", 21 | "ABC", 22 | "-RRB-", 23 | ",", 24 | "crossing", 25 | "the", 26 | "floor", 27 | "with", 28 | "17", 29 | "members", 30 | "of", 31 | "parliament", 32 | ",", 33 | "causing", 34 | "constitutional", 35 | "monarch", 36 | "King", 37 | "Letsie", 38 | "III", 39 | "to", 40 | "dissolve", 41 | "parliament", 42 | "and", 43 | "call", 44 | "the", 45 | "snap", 46 | "election", 47 | "." 48 | ], 49 | "subj_start": 10, 50 | "subj_end": 12, 51 | "obj_start": 0, 52 | "obj_end": 1, 53 | "subj_type": "ORGANIZATION", 54 | "obj_type": "PERSON", 55 | "stanford_pos": [ 56 | "NNP", 57 | "NNP", 58 | "VBD", 59 | "IN", 60 | "NNP", 61 | "JJ", 62 | "NN", 63 | "TO", 64 | "VB", 65 | "DT", 66 | "DT", 67 | "NNP", 68 | "NNP", 69 | "-LRB-", 70 | "NNP", 71 | "-RRB-", 72 | ",", 73 | "VBG", 74 | "DT", 75 | "NN", 76 | "IN", 77 | "CD", 78 | "NNS", 79 | "IN", 80 | "NN", 81 | ",", 82 | "VBG", 83 | "JJ", 84 | "NN", 85 | "NNP", 86 | "NNP", 87 | "NNP", 88 | "TO", 89 | "VB", 90 | "NN", 91 | "CC", 92 | "VB", 93 | "DT", 94 | "NN", 95 | "NN", 96 | "." 
97 | ], 98 | "stanford_ner": [ 99 | "PERSON", 100 | "PERSON", 101 | "O", 102 | "O", 103 | "DATE", 104 | "DATE", 105 | "DATE", 106 | "O", 107 | "O", 108 | "O", 109 | "O", 110 | "O", 111 | "O", 112 | "O", 113 | "ORGANIZATION", 114 | "O", 115 | "O", 116 | "O", 117 | "O", 118 | "O", 119 | "O", 120 | "NUMBER", 121 | "O", 122 | "O", 123 | "O", 124 | "O", 125 | "O", 126 | "O", 127 | "O", 128 | "O", 129 | "PERSON", 130 | "PERSON", 131 | "O", 132 | "O", 133 | "O", 134 | "O", 135 | "O", 136 | "O", 137 | "O", 138 | "O", 139 | "O" 140 | ], 141 | "stanford_head": [ 142 | 2, 143 | 3, 144 | 0, 145 | 5, 146 | 3, 147 | 7, 148 | 3, 149 | 9, 150 | 3, 151 | 13, 152 | 13, 153 | 13, 154 | 9, 155 | 15, 156 | 13, 157 | 15, 158 | 3, 159 | 3, 160 | 20, 161 | 18, 162 | 23, 163 | 23, 164 | 18, 165 | 25, 166 | 23, 167 | 3, 168 | 3, 169 | 32, 170 | 32, 171 | 32, 172 | 32, 173 | 27, 174 | 34, 175 | 27, 176 | 34, 177 | 34, 178 | 34, 179 | 40, 180 | 40, 181 | 37, 182 | 3 183 | ], 184 | "stanford_deprel": [ 185 | "compound", 186 | "nsubj", 187 | "ROOT", 188 | "case", 189 | "nmod", 190 | "amod", 191 | "nmod:tmod", 192 | "mark", 193 | "xcomp", 194 | "det", 195 | "compound", 196 | "compound", 197 | "dobj", 198 | "punct", 199 | "appos", 200 | "punct", 201 | "punct", 202 | "xcomp", 203 | "det", 204 | "dobj", 205 | "case", 206 | "nummod", 207 | "nmod", 208 | "case", 209 | "nmod", 210 | "punct", 211 | "xcomp", 212 | "amod", 213 | "compound", 214 | "compound", 215 | "compound", 216 | "dobj", 217 | "mark", 218 | "xcomp", 219 | "dobj", 220 | "cc", 221 | "conj", 222 | "det", 223 | "compound", 224 | "dobj", 225 | "punct" 226 | ] 227 | } 228 | ] -------------------------------------------------------------------------------- /classification/tacred.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from typing import Any, Callable, Dict, Iterator, List, Type, TypeVar, Set 3 | from typing_extensions import TypedDict 4 | 5 | from classification.re_processors import REProcessor, JsonObject, wrap_text, NEGATIVE_LABEL, SetType 6 | from classification.tacred_config import RELATION_MAPPING 7 | from transformers.data.processors.utils import InputExample, InputFeatures 8 | 9 | Relation = TypedDict('Relation', id=str, docid=str, relation=str, token=List[str], subj_start=int, subj_end=int, obj_start=int, obj_end=int, subj_type=str, obj_type=str, stanford_pos=List[str], stanford_ner=List[str], stanford_head=List[int], stanford_deprel=List[str]) 10 | T = TypeVar('T', bound='TACREDExample') 11 | Builder = Callable[[Type[T], int, JsonObject, str], T] 12 | 13 | class TACREDExample(InputExample): 14 | def __init__(self, id: int, text: str, label: str) -> None: 15 | self.id = id 16 | self.text = text 17 | self.label = label 18 | 19 | @classmethod 20 | def build(cls: Type[T], id: int, example_json: JsonObject, label: str) -> T: 21 | return cls(id, cls._mark_entities(example_json), label) 22 | 23 | @classmethod 24 | def _mark_entities(cls: Type[T], example_json: JsonObject) -> str: 25 | e1_start_idx, e1_end_idx = example_json['subj_start'], example_json['subj_end'] 26 | e2_start_idx, e2_end_idx = example_json['obj_start'], example_json['obj_end'] 27 | text = example_json['token'].copy() 28 | 29 | return wrap_text(text, e1_start_idx, e1_end_idx + 1, e2_start_idx, e2_end_idx + 1) 30 | 31 | def __eq__(self, other: Any): 32 | if not isinstance(other, TACREDExample): 33 | return False 34 | 35 | if self.id == other.id and \ 36 | self.text == other.text and \ 37 | self.label == other.label: 38 | return True 39 
| 40 | return False 41 | 42 | def __hash__(self): 43 | return hash((self.id, self.text, self.label)) 44 | 45 | class TACREDProcessor(REProcessor): 46 | def __init__(self, relation_name: str, num_positive: int = None, negative_ratio: int = None, type_independent_neg_sample: bool = True) -> None: 47 | super().__init__(relation_name, num_positive, negative_ratio, type_independent_neg_sample) 48 | assert relation_name in RELATION_MAPPING 49 | self.relation_mapping = RELATION_MAPPING 50 | self.train_file = "train.json" 51 | self.dev_file = "dev.json" 52 | self.test_file = "test.json" 53 | 54 | def _create_examples(self, relations: List[Relation], 55 | set_type: SetType, 56 | builder: Builder = TACREDExample.build) -> Iterator[TACREDExample]: 57 | """Creates examples for the training and dev sets.""" 58 | for id, relation in enumerate(relations): 59 | label = self._relation_label(relation['relation']) 60 | if self._positive_relation(label) or self.allow_as_negative(relation): 61 | yield builder(id, relation, label) 62 | 63 | def _create_all_possible_dev_examples(self, 64 | relations: List[Relation], 65 | set_type: SetType) -> Iterator[InputExample]: 66 | """Creates examples of all possible entities for dev sets""" 67 | for id, relation in enumerate(relations): 68 | label = self._relation_label(relation['relation']) 69 | if self._same_entity_types_relation(relation): 70 | yield TACREDExample.build(id, relation, label) 71 | 72 | def _create_search_examples_given_row_ids(self, search_file: str, row_ids: Set[int]) -> Iterator[InputExample]: 73 | with open(search_file, 'r', encoding="utf-8") as f: 74 | reader = csv.reader(f, delimiter='\t') 75 | return [TACREDExample(i, doc[0], self._relation_label(doc[1])) for i, doc in enumerate(reader) if i in row_ids] 76 | 77 | def _create_generation_examples(self, raw_generations: List[str]) -> Iterator[InputExample]: 78 | for i, gen in enumerate(raw_generations): 79 | yield TACREDExample(i, gen.rstrip(), 1) 80 | 81 | def relation_name_adapter(self, relation: str): 82 | return relation 83 | 84 | def _relation_label(self, relation_name: str) -> int: 85 | return 1 if self._positive_relation(relation_name) else 0 86 | 87 | def _positive_relation(self, relation_name: str) -> bool: 88 | return relation_name == self.positive_label 89 | 90 | def allow_as_negative(self, relation: Relation): 91 | return self.type_independent_neg_sample or self._same_entity_types_relation(relation) 92 | 93 | def _same_entity_types_relation(self, relation: Relation) -> bool: 94 | return (relation['subj_type'] in self.relation_mapping[self.positive_label]['subj_type'] and 95 | relation['obj_type'] in self.relation_mapping[self.positive_label]['obj_type']) 96 | 97 | class TACREDInputFeatures(InputFeatures): 98 | def __init__(self, 99 | input_ids, 100 | attention_mask=None, 101 | token_type_ids=None, 102 | markers_mask=None, 103 | example=None, 104 | label=None) -> None: 105 | super().__init__(input_ids, attention_mask, token_type_ids, label) 106 | self.markers_mask = markers_mask 107 | self.title = example.id 108 | self.h = -1 109 | self.t = -1 110 | -------------------------------------------------------------------------------- /classification/evaluation/docred_evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | import os.path 5 | import json 6 | 7 | from classification.docred_config import RELATION_MAPPING 8 | from classification.docred import DocREDUtils 9 | 10 | def 
gen_train_facts(data_file_name, truth_dir): 11 | fact_file_name = data_file_name[data_file_name.find("train_"):] 12 | fact_file_name = os.path.join(truth_dir, fact_file_name.replace(".json", ".fact")) 13 | 14 | if os.path.exists(fact_file_name): 15 | fact_in_train = set([]) 16 | triples = json.load(open(fact_file_name)) 17 | for x in triples: 18 | fact_in_train.add(tuple(x)) 19 | return fact_in_train 20 | 21 | fact_in_train = set([]) 22 | ori_data = json.load(open(data_file_name)) 23 | for data in ori_data: 24 | vertexSet = data['vertexSet'] 25 | for label in data['labels']: 26 | rel = label['r'] 27 | for n1 in vertexSet[label['h']]: 28 | for n2 in vertexSet[label['t']]: 29 | fact_in_train.add((n1['name'], n2['name'], rel)) 30 | 31 | json.dump(list(fact_in_train), open(fact_file_name, "w")) 32 | 33 | return fact_in_train 34 | 35 | def correct_entity_types(relation_object, entities, relation_name): 36 | def get_entity_type(side: str): 37 | return entities[relation_object[side]][0]['type'] 38 | 39 | return get_entity_type('h') in RELATION_MAPPING[relation_name]['e1_type'] and \ 40 | get_entity_type('t') in RELATION_MAPPING[relation_name]['e2_type'] 41 | 42 | def main(args): 43 | relation_id = RELATION_MAPPING[args.relation_name]['id'] 44 | 45 | truth_file = os.path.join(args.gold_dir, args.gold_file) 46 | truth = json.load(open(truth_file)) 47 | 48 | std = {} 49 | std_in_single_sent = {} 50 | tot_evidences = 0 51 | titleset = set([]) 52 | 53 | 54 | for x in truth: 55 | title = x['title'] 56 | titleset.add(title) 57 | 58 | vertexSet = x['vertexSet'] 59 | 60 | for label in x['labels']: 61 | r = label['r'] 62 | 63 | h_idx = label['h'] 64 | t_idx = label['t'] 65 | if r != relation_id: continue 66 | if not correct_entity_types(label, vertexSet, args.relation_name): continue 67 | 68 | std[(title, r, h_idx, t_idx)] = set(label['evidence']) 69 | tot_evidences += len(label['evidence']) 70 | if len(label['evidence']) == 1 and len(DocREDUtils.evidences_with_entities(x, label)) > 0: 71 | std_in_single_sent[(title, r, h_idx, t_idx)] = set(label['evidence']) 72 | 73 | submission_answer_file = args.pred_file 74 | tmp = json.load(open(submission_answer_file)) 75 | if len(tmp) == 0: 76 | if args.output_file: 77 | json.dump({ 78 | "F1": 0.0, 79 | "precision": 0.0, 80 | "recall": 0.0, 81 | "best_F1": 0.0, 82 | "best_precision": 0.0, 83 | "best_recall": 0.0, 84 | "best_confidence": (0.5 - 1e-10)}, 85 | open(args.output_file, 'w')) 86 | return 87 | 88 | tmp.sort(key=lambda x: (x['title'], x['h_idx'], x['t_idx'], x['r'])) 89 | submission_answer = [tmp[0]] 90 | for i in range(1, len(tmp)): 91 | x = tmp[i] 92 | y = tmp[i-1] 93 | if (x['title'], x['h_idx'], x['t_idx'], x['r']) != (y['title'], y['h_idx'], y['t_idx'], y['r']): 94 | submission_answer.append(tmp[i]) 95 | 96 | submission_answer = sorted(submission_answer, key=lambda x: x['c'], reverse=True) 97 | 98 | if len(set([answer['r'] for answer in submission_answer])) != 1: 99 | raise ValueError('Multiple relation predictions were passed') 100 | # This is a must, as we are only adding the "relation_name" relation to the std dict 101 | 102 | scores = eval(args, submission_answer, std_in_single_sent) 103 | # multi_sent_rel_scores = eval(args, submission_answer, std) 104 | 105 | # for k, v in multi_sent_rel_scores.items(): 106 | # scores[f"multi_sent_{k}"] = v 107 | 108 | if args.output_file: 109 | json.dump(scores, open(args.output_file, 'w')) 110 | 111 | return scores 112 | 113 | def eval(args, submission_answer, std): 114 | correct_re = 0 115 | tot_relations = len(std) 116 
| 117 | re_f1, re_p, re_r, best_f1, best_p, best_r, best_confidence = 0, 0, 0, 0, 0, 0, (0.5 - 1e-10) 118 | for i, x in enumerate(submission_answer): 119 | title = x['title'] 120 | h_idx = x['h_idx'] 121 | t_idx = x['t_idx'] 122 | r = x['r'] 123 | confidence = x['c'] 124 | if confidence < args.confidence_threshold: 125 | break 126 | 127 | if (title, r, h_idx, t_idx) in std: 128 | correct_re += 1 129 | 130 | re_p = 1.0 * correct_re / (i+1) 131 | re_r = 1.0 * correct_re / tot_relations 132 | 133 | if re_p+re_r == 0: 134 | re_f1 = 0 135 | else: 136 | re_f1 = 2.0 * re_p * re_r / (re_p + re_r) 137 | 138 | if best_f1 < re_f1: 139 | best_f1 = re_f1 140 | best_p = re_p 141 | best_r = re_r 142 | best_confidence = confidence 143 | 144 | scores = { 145 | "F1": re_f1, 146 | "precision": re_p, 147 | "recall": re_r, 148 | "best_F1": best_f1, 149 | "best_precision": best_p, 150 | "best_recall": best_r, 151 | "best_confidence": best_confidence 152 | } 153 | 154 | return scores 155 | 156 | if __name__ == "__main__": 157 | parser = argparse.ArgumentParser() 158 | parser.add_argument('-gold_dir', '--gold_dir', 159 | type=str, 160 | required=True) 161 | parser.add_argument('-gold_file', '--gold_file', 162 | type=str, 163 | required=True) 164 | parser.add_argument('-pred_file', '--pred_file', 165 | type=str, 166 | required=True) 167 | parser.add_argument('-output_file', '--output_file', 168 | default='evaluation', 169 | type=str) 170 | parser.add_argument('-relation_name', '--relation_name', 171 | type=str, 172 | required=True) 173 | parser.add_argument('-confidence_threshold', '--confidence_threshold', 174 | type=float, 175 | default=0) 176 | args = parser.parse_args() 177 | 178 | main(args) -------------------------------------------------------------------------------- /classification/test_docred.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from classification.docred import DocREDUtils, DocREDProcessor, DocREDExample 5 | 6 | with open('classification/stubs/docred/fake_truth.json', "r", encoding="utf-8") as f: 7 | docs = list(json.load(f)) 8 | 9 | doc1, doc2, doc3, doc4 = docs 10 | 11 | # Tests using this variable require the true path to the data files. 
12 | DATA_DIR = 'data/DocRED/' 13 | 14 | class TestDocREDUtils: 15 | def test_sents_entities_share(self): 16 | entities_sents = DocREDUtils.sents_entities_share(doc1, doc1['labels'][0]) 17 | assert entities_sents == [0, 1] 18 | entities_sents = DocREDUtils.sents_entities_share(doc2, doc2['labels'][0]) 19 | assert entities_sents == [0] 20 | entities_sents = DocREDUtils.sents_entities_share(doc3, doc3['labels'][0]) 21 | assert entities_sents == [0, 1] 22 | entities_sents = DocREDUtils.sents_entities_share(doc3, doc3['labels'][1]) 23 | assert entities_sents == [0, 1] 24 | 25 | def test__sents_entities_and_evidence_share(self): 26 | entities_sents = DocREDUtils.sents_entities_share(doc1, doc1['labels'][0]) 27 | entities_and_evidence_sents = DocREDUtils._sents_entities_and_evidence_share(doc1['labels'][0], entities_sents) 28 | assert entities_and_evidence_sents == [1] 29 | entities_sents = DocREDUtils.sents_entities_share(doc2, doc2['labels'][0]) 30 | entities_and_evidence_sents = DocREDUtils._sents_entities_and_evidence_share(doc2['labels'][0], entities_sents) 31 | assert entities_and_evidence_sents == [0] 32 | entities_sents = DocREDUtils.sents_entities_share(doc3, doc3['labels'][0]) 33 | entities_and_evidence_sents = DocREDUtils._sents_entities_and_evidence_share(doc3['labels'][0], entities_sents) 34 | assert entities_and_evidence_sents == [0, 1] 35 | 36 | def test_entity_from_entity_id_passes(self): 37 | entity_list = DocREDUtils.entity_from_entity_id(doc1['vertexSet'], doc1['labels'][0]['h'], 0) 38 | assert entity_list == [{'name': 'Microsoft', 'pos': [0, 1], 'sent_id': 0, 'type': 'ORG'}, 39 | {'name': 'MS', 'pos': [3, 4], 'sent_id': 0, 'type': 'ORG'}] 40 | entity_list = DocREDUtils.entity_from_entity_id(doc1['vertexSet'], doc1['labels'][0]['h'], 1) 41 | assert entity_list[0] == {'name': 'Micro', 'pos': [0, 1], 'sent_id': 1, 'type': 'ORG'} 42 | 43 | def test_entities_by_sent_id(self): 44 | assert DocREDUtils.entities_by_sent_id(doc3['vertexSet']) == {0: {0, 1}, 1: {0, 1}} 45 | 46 | def test_relations_by_entities(self): 47 | assert DocREDUtils.relations_by_entities(doc3['labels']) == \ 48 | {(0, 1): [{'r': 'P26', 'h': 0, 't': 1, 'evidence': [0, 1]}], 49 | (1, 0): [{'r': 'P26', 'h': 1, 't': 0, 'evidence': [0, 1]}]} 50 | 51 | class TestDocREDProcessor: 52 | def test__same_entity_types_relation(self): 53 | processor = DocREDProcessor('founded_by') 54 | assert processor._same_entity_types_relation(doc1['labels'][0], doc1['vertexSet']) 55 | processor = DocREDProcessor('father') 56 | assert processor._same_entity_types_relation(doc2['labels'][0], doc2['vertexSet']) 57 | processor = DocREDProcessor('spouse') 58 | assert processor._same_entity_types_relation(doc3['labels'][0], doc3['vertexSet']) 59 | assert not processor._same_entity_types_relation(doc1['labels'][0], doc1['vertexSet']) 60 | 61 | def test__same_entity_types_relation_switched_h_and_t(self): 62 | processor = DocREDProcessor('founded_by') 63 | relation = doc1['labels'][0] 64 | head_is_tail = {'r': relation['r'], 'h': relation['t'], 't': relation['h'], 'evidence': relation['evidence']} 65 | assert not processor._same_entity_types_relation(head_is_tail, doc1['vertexSet']) 66 | 67 | def test__same_entity_types_relation_wrong_relation(self): 68 | processor = DocREDProcessor('inception') 69 | assert not processor._same_entity_types_relation(doc1['labels'][0], doc1['vertexSet']) 70 | 71 | def test_create_all_possible_dev_examples_doc1(self): 72 | processor = DocREDProcessor('founded_by') 73 | data = 
list(processor._create_all_possible_dev_examples([doc1], None)) 74 | assert len(data) == 2 75 | assert data[0].evidence == 0 76 | assert data[0].h == 0 77 | assert data[0].t == 1 78 | assert data[0].label == 'NOTA' 79 | 80 | assert data[1].evidence == 1 81 | assert data[1].h == 0 82 | assert data[1].t == 1 83 | assert data[1].label == 'founded_by' 84 | 85 | def test_create_all_possible_dev_examples_doc2(self): 86 | processor = DocREDProcessor('father') 87 | data = list(processor._create_all_possible_dev_examples([doc2], None)) 88 | assert len(data) == 2 89 | assert data[0].evidence == 0 90 | assert data[0].h == 0 91 | assert data[0].t == 1 92 | assert data[0].label == 'father' 93 | 94 | assert data[1].evidence == 0 95 | assert data[1].h == 1 96 | assert data[1].t == 0 97 | assert data[1].label == 'NOTA' 98 | 99 | def test_create_all_possible_dev_examples_doc3(self): 100 | processor = DocREDProcessor('spouse') 101 | data = list(processor._create_all_possible_dev_examples([doc3], None)) 102 | assert len(data) == 4 103 | assert data[0].evidence == 0 104 | assert data[0].h == 0 105 | assert data[0].t == 1 106 | assert data[0].label == 'spouse' 107 | 108 | assert data[1].evidence == 0 109 | assert data[1].h == 1 110 | assert data[1].t == 0 111 | assert data[1].label == 'spouse' 112 | 113 | assert data[2].evidence == 1 114 | assert data[2].h == 0 115 | assert data[2].t == 1 116 | assert data[2].label == 'spouse' 117 | 118 | assert data[3].evidence == 1 119 | assert data[3].h == 1 120 | assert data[3].t == 0 121 | assert data[3].label == 'spouse' 122 | 123 | def test_create_all_possible_dev_examples_doc4(self): 124 | processor = DocREDProcessor('founded_by') 125 | data = list(processor._create_all_possible_dev_examples([doc4], None)) 126 | assert len(data) == 1 127 | assert data[0].evidence == 0 128 | assert data[0].h == 0 129 | assert data[0].t == 1 130 | assert data[0].label == 'founded_by' 131 | 132 | def test_get_all_possible_eval_examples_check_positives(self): 133 | processor = DocREDProcessor('founded_by') 134 | data = processor.get_all_possible_eval_examples(DATA_DIR, 'full_test_eval') 135 | relations = [d for d in data if d.label == 'founded_by'] 136 | distinct = list(set(relations)) 137 | assert len(relations) == len(distinct) 138 | 139 | def test_get_all_possible_dev_eval_examples_check_positives_num_examples(self): 140 | processor = DocREDProcessor('founded_by') 141 | data = processor.get_all_possible_eval_examples(DATA_DIR, 'full_dev_eval') 142 | in_relation = [d for d in data if d.label == 'founded_by'] 143 | assert len(data) == 2228 144 | assert len(in_relation) == 9 145 | 146 | def test_get_all_possible_test_eval_examples_check_positives_num_examples(self): 147 | processor = DocREDProcessor('founded_by') 148 | data = processor.get_all_possible_eval_examples(DATA_DIR, 'full_test_eval') 149 | in_relation = [d for d in data if d.label == 'founded_by'] 150 | assert len(data) == 3895 151 | assert len(in_relation) == 20 152 | 153 | class TestDocREDExample: 154 | def test_init(self): 155 | example = DocREDExample(0, doc1, doc1['labels'][0], 1, True) 156 | assert example.evidence == 1 157 | assert example.h == 0 158 | assert example.label 159 | assert example.t == 1 160 | assert example.text == "[E1] Micro [/E1] was founded by [E2] Paul [/E2]" 161 | assert example.title == 0 -------------------------------------------------------------------------------- /scripts/generation_preprocess/create_tacred_datafiles.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | 7 | from relation_canonical_form import CANONICAL_FORMS, PREDICATES 8 | 9 | from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer 10 | 11 | CLEANINGMAP = {'-RRB-': ')', '-LRB-': '(', '-LSB-': '[', 12 | '-RSB-': ']', '-LCB-': '{', '-RCB-': '}', 13 | ' ': ' ', '"': "'", '--': '-', '---': '-'} 14 | 15 | DISALLOWED_PRONOUNS = {"me", "us", "you", "her", "him", "it", "them", "my", "our", "your", "her", "his", "their"} 16 | 17 | MODEL_CLASSES = { 18 | 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), 19 | } 20 | START_SUBJ = '<|subj|>' 21 | END_SUBJ = '<|/subj|>' 22 | START_OBJ = '<|obj|>' 23 | END_OBJ = '<|/obj|>' 24 | START_TRIGGER = '<|trigger|>' 25 | END_TRIGGER = '<|/trigger|>' 26 | GO = '<|GO|>' 27 | E1 = '<|E1|>' 28 | END_E1 = '<|\E1|>' 29 | E2 = '<|E2|>' 30 | END_E2 = '<|\E2|>' 31 | 32 | 33 | NO_RELATION = "no_relation" 34 | 35 | RELATIONS_TO_LEAVE_OUT = ["per:children", "org:founded_by", "org:country_of_headquarters", "per:religion", "per:spouse", "per:origin", "per:date_of_death", "per:city_of_death"] 36 | 37 | def main(args): 38 | SPECIAL_TOKENS = [GO] 39 | if args.mark_relation_args: 40 | SPECIAL_TOKENS += [START_SUBJ, END_SUBJ, START_OBJ, END_OBJ, START_TRIGGER, END_TRIGGER] 41 | elif args.anonymize_tgt: 42 | SPECIAL_TOKENS += [E1, END_E1, E2, END_E2] 43 | 44 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 45 | config = config_class.from_pretrained( 46 | args.config_name if args.config_name else args.model_name_or_path 47 | ) 48 | tokenizer = tokenizer_class.from_pretrained( 49 | args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 50 | do_lower_case=args.do_lower_case 51 | ) 52 | if args.block_size <= 0: 53 | args.block_size = ( 54 | tokenizer.max_len_single_sentence 55 | ) # Our input block size will be the max possible for the model 56 | args.block_size = min(args.block_size, tokenizer.max_len_single_sentence) 57 | 58 | if args.local_rank == 0: 59 | torch.distributed.barrier() # End of barrier to make sure only the first process in distributed training download model & vocab 60 | 61 | # Add Special Tokens 62 | tokenizer.add_special_tokens({'additional_special_tokens': SPECIAL_TOKENS}) 63 | 64 | assert os.path.isfile(args.file_path) 65 | 66 | with open(args.file_path, encoding="utf-8") as f: 67 | parsed_json = json.load(f) 68 | 69 | srcs = [] 70 | if not args.src_and_tgt_one_file_with_go: 71 | tgts = [] 72 | for relation_dict in tqdm(parsed_json): 73 | if relation_dict['relation'] == NO_RELATION and not args.allow_no_relation: 74 | continue 75 | 76 | if leave_some_relations_out(relation_dict['relation']): 77 | continue 78 | 79 | subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx = [relation_dict[key] for key in ['subj_start', 'subj_end', 'obj_start', 'obj_end']] 80 | 81 | subj = " ".join(relation_dict['token'][subj_start_idx : subj_end_idx + 1]) 82 | obj = " ".join(relation_dict['token'][obj_start_idx : obj_end_idx + 1]) 83 | example_text = relation_dict['token'] 84 | 85 | if skip_disallowed_pronouns(subj, obj): 86 | continue 87 | 88 | if args.mark_relation_args: 89 | example_text = mark_args(example_text, subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx) 90 | elif args.truncate_noise: 91 | example_text = truncate_noise(example_text, subj_start_idx, subj_end_idx + 1, obj_start_idx, obj_end_idx + 1) 92 | 93 | cleaned_example = 
clean_token(example_text) 94 | tgt = " ".join(cleaned_example) 95 | if args.anonymize_tgt: 96 | tgt = anonymize(tgt, subj, obj) 97 | 98 | relation_name = relation_dict['relation'] 99 | if args.one_form_per_relation: 100 | relation_contexts = [CANONICAL_FORMS[relation_name][0]] 101 | else: 102 | relation_contexts = CANONICAL_FORMS[relation_name] 103 | for relation_context in relation_contexts: 104 | if args.anonymize_tgt: 105 | src = relation_context.replace("{subj}", f"{E1} {subj} {END_E1}").replace("{obj}", f"{E2} {obj} {END_E2}") 106 | else: 107 | src = relation_context.replace("{subj}", subj).replace("{obj}", obj) 108 | src = specific_predicate_for_relation(src, tgt, relation_name) 109 | 110 | if src is None: 111 | continue 112 | 113 | if args.src_and_tgt_one_file_with_go: 114 | srcs.append(src + f" {GO} " + tgt+'\n') 115 | else: 116 | srcs.append(src+'\n') 117 | tgts.append(tgt+'\n') 118 | 119 | with open(args.save_to_file+'.src', 'w') as f: f.writelines(srcs) 120 | if not args.src_and_tgt_one_file_with_go: 121 | with open(args.save_to_file+'.tgt', 'w') as f: f.writelines(tgts) 122 | 123 | with open(args.save_to_file+'.special_tokens', 'w') as f: f.writelines(f"{t}\n" for t in SPECIAL_TOKENS) 124 | 125 | def specific_predicate_for_relation(src, tgt, relation_name): 126 | if "{predicate}" not in src: 127 | return src 128 | predicate = PREDICATES[relation_name]['default'] 129 | lowered_tgt = tgt.lower() 130 | for t in PREDICATES[relation_name]: 131 | if t in lowered_tgt: 132 | predicate = t 133 | break 134 | if predicate is None: 135 | return None 136 | return src.replace("{predicate}", predicate) 137 | 138 | def mark_args(text, subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx): 139 | if obj_end_idx > subj_end_idx: 140 | text.insert(obj_end_idx + 1, END_OBJ) 141 | text.insert(obj_start_idx, START_OBJ) 142 | text.insert(subj_end_idx + 1, END_SUBJ) 143 | text.insert(subj_start_idx, START_SUBJ) 144 | else: 145 | text.insert(subj_end_idx + 1, END_SUBJ) 146 | text.insert(subj_start_idx, START_SUBJ) 147 | text.insert(obj_end_idx + 1, END_OBJ) 148 | text.insert(obj_start_idx, START_OBJ) 149 | 150 | return text 151 | 152 | def anonymize(text, e1, e2): 153 | return text.replace(e1, E1).replace(e2, E2) 154 | 155 | def truncate_noise(example_text, subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx): 156 | padding = 0 157 | min_token_position = min(subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx) 158 | min_token_position = max(min_token_position - padding, 0) 159 | 160 | max_token_position = max(subj_start_idx, subj_end_idx, obj_start_idx, obj_end_idx) 161 | max_token_position = min(max_token_position + padding, len(example_text)) 162 | 163 | return example_text[min_token_position:max_token_position] 164 | 165 | def leave_some_relations_out(relation): 166 | return relation in RELATIONS_TO_LEAVE_OUT 167 | 168 | def skip_disallowed_pronouns(subj, obj): 169 | return subj.lower() in DISALLOWED_PRONOUNS or obj.lower() in DISALLOWED_PRONOUNS 170 | 171 | def clean_token(tokens): 172 | return [CLEANINGMAP.get(t, t) for t in tokens] 173 | 174 | 175 | if __name__ == '__main__': 176 | parser = argparse.ArgumentParser() 177 | parser.add_argument("--file_path", default=None, type=str, required=True) 178 | parser.add_argument("--save_to_file", default=None, type=str, required=True) 179 | parser.add_argument("--anonymize_tgt", action='store_true') 180 | parser.add_argument("--mark_relation_args", action='store_true') 181 | parser.add_argument("--allow_no_relation", action='store_true') 182 | 
parser.add_argument("--truncate_noise", action='store_true') 183 | parser.add_argument("--one_form_per_relation", action='store_true') 184 | parser.add_argument("--src_and_tgt_one_file_with_go", action='store_true') 185 | 186 | args = parser.parse_args() 187 | args.model_type = 'gpt2' 188 | args.model_name_or_path = 'gpt2' 189 | args.config_name = "" 190 | args.tokenizer_name = "" 191 | args.do_lower_case = False 192 | args.block_size = 512 193 | args.local_rank = -1 194 | main(args) -------------------------------------------------------------------------------- /generation_outputs/switch_entities_of_gens.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import product 3 | from random import sample 4 | import re 5 | from tqdm import tqdm 6 | 7 | PERSONAL_PRONOUNS_TO_KEEP = ['he', 'she'] 8 | POSSESIVE_PRONOUNS_TO_KEEP = ['his', 'her'] 9 | ENTITY_TYPES = { 10 | 'country_of_headquarters': ['organization', 'country', 'organization', 'country'], 11 | 'children': ['person', 'person', 'person', 'person'], 12 | 'city_of_death': ['person', 'city', 'city', None], 13 | 'date_of_death': ['person', 'date', 'person', None], 14 | 'founded_by': ['organization', 'person', 'person', 'organization'], 15 | 'origin-country': ['person', 'country', 'person', 'country'], 16 | 'origin-nationality': ['person', 'nationality', 'person', 'nationality'], 17 | 'docred-origin-country': ['organization', 'country', 'organization', 'country'], 18 | 'docred-origin-nationality': ['organization', 'nationality', 'organization', 'nationality'], 19 | 'religion': ['person', 'religion', 'person', 'religion'], 20 | 'docred-religion': ['organization', 'religion', 'organization', 'religion'], 21 | 'spouse': ['person', 'person', 'person', 'person'], 22 | } 23 | 24 | def main(args): 25 | entity_types = ENTITY_TYPES[args.relation] 26 | with open(args.gen_file, 'r') as f: 27 | gens = f.readlines() 28 | gens = [g for g in gens if g!= '\n'] 29 | 30 | e1s = get_similar_entities(entity_types[0]) 31 | e2s = get_similar_entities(entity_types[1]) 32 | e3s = get_similar_entities(entity_types[2]) 33 | e4s = get_similar_entities(entity_types[3]) 34 | 35 | with open(args.gen_file.split('.txt')[0]+'_new_ents.txt', 'w') as f: 36 | for i, gen in tqdm(enumerate(gens)): 37 | assert gen.count('[') == gen.count(']'), gen 38 | # E1 - PERSON/ORGANIZATION 39 | subbed = switch_entity_but_not_pronouns(1, gen, e1s) 40 | # E2 41 | if entity_types[1] == 'date': 42 | subbed = switch_dates(2, subbed, e2s) 43 | elif entity_types[1] == 'religion': 44 | subbed = switch_religions(2, subbed, e2s) 45 | else: 46 | subbed = switch_entity_but_not_pronouns(2, subbed, e2s) 47 | # E3 - PERSON/ORGANIZATION 48 | if entity_types[2]: 49 | for e in re.findall('\[E3\] (.*?) \[\/E3\]', subbed): 50 | if e in PERSONAL_PRONOUNS_TO_KEEP+POSSESIVE_PRONOUNS_TO_KEEP: 51 | subbed = re.sub(f'\[E3\] {e} \[\/E3\]', e, subbed) 52 | else: 53 | subbed = re.sub(f'\[E3\] {e} \[\/E3\]', sample(e3s, 1)[0], subbed) 54 | # E4 55 | if entity_types[3]: 56 | for e in re.findall('\[E4\] (.*?) 
\[\/E4\]', subbed): 57 | if e in PERSONAL_PRONOUNS_TO_KEEP+POSSESIVE_PRONOUNS_TO_KEEP: 58 | subbed = re.sub(f'\[E4\] {e} \[\/E4\]', e, subbed) 59 | elif entity_types[3] == 'religion': 60 | subbed = switch_religions(4, subbed, e4s, keep_markers=False) 61 | else: 62 | subbed = re.sub(f'\[E4\] {e} \[\/E4\]', sample(e4s, 1)[0], subbed) 63 | if subbed == gen: 64 | print(f"Warning, generation didn't change: {gen}") 65 | f.write(subbed) 66 | 67 | def switch_entity_but_not_pronouns(ent_num, gen, ents): 68 | E = f"E{ent_num}" 69 | found_pronouns = re.findall(f"\[{E}\] ({'|'.join(PERSONAL_PRONOUNS_TO_KEEP+POSSESIVE_PRONOUNS_TO_KEEP)}) \[\/{E}\]", gen, flags=re.IGNORECASE) 70 | if found_pronouns: 71 | for p in re.findall(f"\[{E}\] ({'|'.join(PERSONAL_PRONOUNS_TO_KEEP)}) \[\/{E}\]", gen, flags=re.IGNORECASE): 72 | gen = re.sub(f'\[{E}\] ({p}) \[\/{E}\]', f'[{E}] {sample(PERSONAL_PRONOUNS_TO_KEEP, 1)[0]} [/{E}]', gen, flags=re.IGNORECASE) 73 | for p in re.findall(f"\[{E}\] ({'|'.join(POSSESIVE_PRONOUNS_TO_KEEP)}) \[\/{E}\]", gen, flags=re.IGNORECASE): 74 | gen = re.sub(f'\[{E}\] ({p}) \[\/{E}\]', f'[{E}] {sample(POSSESIVE_PRONOUNS_TO_KEEP, 1)[0]} [/{E}]', gen, flags=re.IGNORECASE) 75 | else: 76 | gen = re.sub(f'\[{E}\] (.*?) \[\/{E}\]', f'[{E}] {sample(ents, 1)[0]} [/{E}]', gen) 77 | 78 | return gen 79 | 80 | def switch_religions(ent_num, subbed, ents, keep_markers=True): 81 | E = f"E{ent_num}" 82 | if keep_markers: 83 | subbed = re.sub(f"\[{E}\] Religion \[\/{E}\]", f"[{E}] {sample(ents['religion'], 1)[0]} [/{E}]", subbed) 84 | subbed = re.sub(f"\[{E}\] Religious Affiliation \[\/{E}\]", f"[{E}] {sample(ents['religious_affiliation'], 1)[0]} [/{E}]", subbed) 85 | subbed = re.sub(f"\[{E}\] Religious Relation \[\/{E}\]", f"[{E}] {sample(ents['religious_relation'], 1)[0]} [/{E}]", subbed) 86 | subbed = re.sub(f"\[{E}\] Religious Affiliation plural \[\/{E}\]", f"[{E}] {sample(ents['religious_affiliation_plural'], 1)[0]} [/{E}]", subbed) 87 | else: 88 | subbed = re.sub(f"\[{E}\] Religion \[\/{E}\]", sample(ents['religion'], 1)[0], subbed) 89 | subbed = re.sub(f"\[{E}\] Religious Affiliation \[\/{E}\]", sample(ents['religious_affiliation'], 1)[0], subbed) 90 | subbed = re.sub(f"\[{E}\] Religious Relation \[\/{E}\]", sample(ents['religious_relation'], 1)[0], subbed) 91 | subbed = re.sub(f"\[{E}\] Religious Affiliation plural \[\/{E}\]", sample(ents['religious_affiliation_plural'], 1)[0], subbed) 92 | return subbed 93 | 94 | def switch_dates(ent_num, subbed, ents): 95 | # changing just november 7. 96 | #TODO, pretty sure this is deprecated. Why only November 7? 97 | E = f"E{ent_num}" 98 | subbed = re.sub(f'\[{E}\] November 7 \[\/{E}\]', f'[{E}] {sample(ents, 1)[0]} [/{E}]', subbed) 99 | subbed = re.sub(f'\[{E}\] Nov 7 \[\/{E}\]', f'[{E}] {sample(ents, 1)[0]} [/{E}]', subbed) 100 | subbed = re.sub(f'\[{E}\] Nov\. 
7 \[\/{E}\]', f'[{E}] {sample(ents, 1)[0]} [/{E}]', subbed) 101 | return subbed 102 | def get_similar_entities(entity_type): 103 | if entity_type is None: 104 | return None 105 | elif entity_type == 'date': 106 | return dates() 107 | elif entity_type == 'city': 108 | return cities() 109 | elif entity_type == 'nationality': 110 | return nationalities() 111 | elif entity_type == 'religion': 112 | return religions() 113 | else: 114 | return read_ents_from_file(entity_type) 115 | 116 | def read_ents_from_file(entity_type): 117 | with open(f'generation_outputs/types/{entity_type}.txt', 'r') as f: 118 | ents = f.readlines() 119 | ents = [e.rstrip() for e in ents] 120 | return ents 121 | 122 | def religions(): 123 | religions = { 124 | 'religion': ["Atheism", "Scientology", "Islam", "Christianity"], 125 | 'religious_relation': ["Evangelical", "Islamic", "Christian", "Jewish", "Catholic"], 126 | 'religious_affiliation': ["Methodist", "Separatist", "Jew", "Christian", "Sunni", "Secular", "Fundamentalist", "Christianist", "Anglican", "Orthodox", "Islamist", "Muslim"], 127 | } 128 | religions['religious_affiliation_plural'] = [f"{x}s" for x in religions['religious_affiliation']] 129 | 130 | return religions 131 | 132 | def dates(): 133 | months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] 134 | return [' '.join([x[0], str(x[1])]) for x in product(months, range(1, 29))] # days 1-28 are valid in every month 135 | 136 | def cities(): 137 | return ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose", "Austin", "Jacksonville", "Fort Worth", "Columbus", "San Francisco", "Charlotte", "Indianapolis", "Seattle", "Denver", "Washington", "Boston", "El Paso", "Detroit", "Nashville", "Portland", "Memphis", "Oklahoma City", "Las Vegas", "Louisville", "Baltimore", "Milwaukee", "Albuquerque", "Tucson", "Fresno", "Mesa", "Sacramento", "Atlanta", "Kansas City", "Colorado Springs", "Miami", "Raleigh", "Omaha", "Long Beach", "Virginia Beach", "Oakland", "Minneapolis", "Tulsa", "Arlington", "Tampa", "New Orleans"] 138 | 139 | def nationalities(): 140 | return ["British", "English", "Scottish", "Gaelic", "Irish", "Welsh", "Danish", "Finnish", "Norwegian", "Swedish", "Swiss", "German", "French", "Italian", "Estonian", "Latvian", "Lithuanian", "Austrian", "Belgian", "Flemish", "Dutch", "American", "Canadian", "Mexican", "Spanish", "Ukrainian", "Russian", "Belarusian", "Polish", "Czech", "Slovak", "Slovakian", "Hungarian", "Romanian", "Bulgarian", "Greek", "Brazilian", "Portuguese", "Australian", "New Zealander", "Maori", "Georgian", "Israeli", "Hebrew", "Egyptian", "Arabic", "Turkish", "Chinese", "Mandarin", "Korean", "Japanese", "Indian", "Hindi", "South African", "Afrikaans"] 141 | 142 | if __name__ == "__main__": 143 | parser = argparse.ArgumentParser() 144 | parser.add_argument("--gen_file", type=str, required=True) 145 | parser.add_argument("--relation", type=str, required=True) 146 | args = parser.parse_args() 147 | main(args) -------------------------------------------------------------------------------- /run_generation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | """ Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet) 18 | """ 19 | 20 | 21 | import argparse 22 | import logging 23 | import os 24 | 25 | import numpy as np 26 | import torch 27 | 28 | from transformers import ( 29 | CTRLLMHeadModel, 30 | CTRLTokenizer, 31 | GPT2LMHeadModel, 32 | GPT2Tokenizer, 33 | OpenAIGPTLMHeadModel, 34 | OpenAIGPTTokenizer, 35 | TransfoXLLMHeadModel, 36 | TransfoXLTokenizer, 37 | XLMTokenizer, 38 | XLMWithLMHeadModel, 39 | XLNetLMHeadModel, 40 | XLNetTokenizer, 41 | ) 42 | 43 | 44 | logging.basicConfig( 45 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, 46 | ) 47 | logger = logging.getLogger(__name__) 48 | 49 | MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop 50 | MAX_BATCH = 100 51 | 52 | MODEL_CLASSES = { 53 | "gpt2": (GPT2LMHeadModel, GPT2Tokenizer), 54 | "ctrl": (CTRLLMHeadModel, CTRLTokenizer), 55 | "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), 56 | "xlnet": (XLNetLMHeadModel, XLNetTokenizer), 57 | "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer), 58 | "xlm": (XLMWithLMHeadModel, XLMTokenizer), 59 | } 60 | 61 | # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia 62 | # in https://github.com/rusiaaman/XLNet-gen#methodology 63 | # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e 64 | PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family 65 | (except for Alexei and Maria) are discovered. 66 | The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the 67 | remainder of the story. 1883 Western Siberia, 68 | a young Grigori Rasputin is asked by his father and a group of men to perform magic. 69 | Rasputin has a vision and denounces one of the men as a horse thief. Although his 70 | father initially slaps him for making such an accusation, Rasputin watches as the 71 | man is chased outside and beaten. Twenty years later, Rasputin sees a vision of 72 | the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, 73 | with people, even a bishop, begging for his blessing. """ 74 | 75 | 76 | def set_seed(args): 77 | np.random.seed(args.seed) 78 | torch.manual_seed(args.seed) 79 | if args.n_gpu > 0: 80 | torch.cuda.manual_seed_all(args.seed) 81 | 82 | 83 | # 84 | # Functions to prepare models' input 85 | # 86 | 87 | 88 | def prepare_ctrl_input(args, _, tokenizer, prompt_text): 89 | if args.temperature > 0.7: 90 | logger.info("CTRL typically works better with lower temperatures (and lower top_k).") 91 | 92 | encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False) 93 | if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()): 94 | logger.info("WARNING! 
You are not starting your generation from a control code so you won't get good results") 95 | return prompt_text 96 | 97 | 98 | def prepare_xlm_input(args, model, tokenizer, prompt_text): 99 | # kwargs = {"language": None, "mask_token_id": None} 100 | 101 | # Set the language 102 | use_lang_emb = hasattr(model.config, "use_lang_emb") and model.config.use_lang_emb 103 | if hasattr(model.config, "lang2id") and use_lang_emb: 104 | available_languages = model.config.lang2id.keys() 105 | if args.xlm_language in available_languages: 106 | language = args.xlm_language 107 | else: 108 | language = None 109 | while language not in available_languages: 110 | language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ") 111 | # kwargs["language"] = tokenizer.lang2id[language] 112 | 113 | # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers 114 | # XLM masked-language modeling (MLM) models need masked token 115 | # is_xlm_mlm = "mlm" in args.model_name_or_path 116 | # if is_xlm_mlm: 117 | # kwargs["mask_token_id"] = tokenizer.mask_token_id 118 | 119 | return prompt_text 120 | 121 | 122 | def prepare_xlnet_input(args, _, tokenizer, prompt_text): 123 | prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text 124 | return prompt_text 125 | 126 | 127 | def prepare_transfoxl_input(args, _, tokenizer, prompt_text): 128 | prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text 129 | return prompt_text 130 | 131 | 132 | PREPROCESSING_FUNCTIONS = { 133 | "ctrl": prepare_ctrl_input, 134 | "xlm": prepare_xlm_input, 135 | "xlnet": prepare_xlnet_input, 136 | "transfo-xl": prepare_transfoxl_input, 137 | } 138 | 139 | 140 | def adjust_length_to_model(length, max_sequence_length): 141 | if length < 0 and max_sequence_length > 0: 142 | length = max_sequence_length 143 | elif 0 < max_sequence_length < length: 144 | length = max_sequence_length # No generation bigger than model size 145 | elif length < 0: 146 | length = MAX_LENGTH # avoid infinite loop 147 | return length 148 | 149 | 150 | def main(): 151 | parser = argparse.ArgumentParser() 152 | parser.add_argument( 153 | "--model_type", 154 | default=None, 155 | type=str, 156 | required=True, 157 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), 158 | ) 159 | parser.add_argument( 160 | "--model_name_or_path", 161 | default=None, 162 | type=str, 163 | required=True, 164 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()), 165 | ) 166 | 167 | parser.add_argument("--prompt", type=str, default="", required=True) 168 | parser.add_argument("--out_file", type=str, default="") 169 | parser.add_argument("--length", type=int, default=50) 170 | parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") 171 | 172 | parser.add_argument( 173 | "--temperature", 174 | type=float, 175 | default=1.0, 176 | help="temperature of 1.0 has no effect, lower tends toward greedy sampling", 177 | ) 178 | parser.add_argument( 179 | "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2" 180 | ) 181 | parser.add_argument("--k", type=int, default=0) 182 | parser.add_argument("--p", type=float, default=0.9) 183 | parser.add_argument("--num_return_sequences", type=int, default=1) 184 | 185 | parser.add_argument("--padding_text", type=str, default="", help="Padding 
text for Transfo-XL and XLNet.") 186 | parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.") 187 | 188 | parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") 189 | parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") 190 | args = parser.parse_args() 191 | 192 | args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 193 | args.n_gpu = torch.cuda.device_count() 194 | 195 | set_seed(args) 196 | 197 | # Initialize the model and tokenizer 198 | try: 199 | args.model_type = args.model_type.lower() 200 | model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 201 | except KeyError: 202 | raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)".format(args.model_type)) 203 | 204 | tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) 205 | model = model_class.from_pretrained(args.model_name_or_path) 206 | model.to(args.device) 207 | 208 | args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings) 209 | logger.info(args) 210 | 211 | prompt_text = args.prompt if args.prompt else input("Model prompt >>> ") 212 | 213 | # Different models need different input formatting and/or extra arguments 214 | requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys() 215 | if requires_preprocessing: 216 | prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type) 217 | prompt_text = prepare_input(args, model, tokenizer, prompt_text) 218 | encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=True, return_tensors="pt") 219 | encoded_prompt = encoded_prompt.to(args.device) 220 | 221 | samples_splits = args.num_return_sequences//MAX_BATCH * [MAX_BATCH] 222 | if args.num_return_sequences%MAX_BATCH > 0: samples_splits.append(args.num_return_sequences%MAX_BATCH) 223 | 224 | if args.out_file: 225 | file_path = os.path.join(args.model_name_or_path, args.out_file) 226 | out_file = open(file_path, "w") 227 | 228 | for curr_samples in samples_splits: 229 | output_sequences = model.generate( 230 | input_ids=encoded_prompt, 231 | max_length=args.length, 232 | temperature=args.temperature, 233 | top_k=args.k, 234 | top_p=args.p, 235 | do_sample=args.k > 1 or args.p < 1.0, 236 | repetition_penalty=args.repetition_penalty, 237 | num_return_sequences=curr_samples, 238 | ) 239 | 240 | generated_sequence = output_sequences[0, :, encoded_prompt.size(1):].tolist() 241 | 242 | texts = [] 243 | for seq in generated_sequence: 244 | text = tokenizer.decode(seq, clean_up_tokenization_spaces=True) 245 | if args.stop_token and args.stop_token in text: text = text[: text.find(args.stop_token)] 246 | if '\n' in text: text = text[: text.find('\n')] # guard the find, so a missing newline doesn't chop the last character 247 | texts.append(text) 248 | 249 | texts = '\n'.join(texts) 250 | 251 | if args.out_file: 252 | out_file.write(texts + '\n') 253 | print(f"Generations written to: {file_path}") 254 | else: 255 | print(texts) 256 | 257 | if args.out_file: 258 | out_file.close() 259 | 260 | 261 | 262 | if __name__ == "__main__": 263 | main() 264 | -------------------------------------------------------------------------------- /scripts/generation_preprocess/relation_canonical_form.py: -------------------------------------------------------------------------------- 1 | CANONICAL_FORMS = { 2 | "no_relation": ["NR"], 3 | "org:founded_by": ["{obj}, founder of {subj} .", 4 | "{obj}, who established {subj} .", 5 | "{subj}, founded by {obj} .", 6 | "{obj} was the 
founder of {subj} .", 7 | "{subj} founder {obj} ."], 8 | "per:employee_of": ["{subj} is an employee of {obj} .", 9 | "{subj} is the {predicate} of {obj} .", 10 | "{obj}'s {predicate} {subj} .", 11 | "{subj}, the {predicate} of {obj} .", 12 | "{subj}, {obj}'s {predicate} .", 13 | "{subj}, {predicate} of {obj} ." 14 | "{obj} joined {subj} ."], 15 | "org:alternate_names": ["{subj} known as {obj} .", 16 | "{subj}, formally known as {obj} .", 17 | "{subj}, then called {obj} .", 18 | "{subj} or {obj} ."], 19 | "per:cities_of_residence": ["{subj} lived in {obj} .", 20 | "{subj} moved to {obj} .", 21 | "{subj}'s home in {obj} .", 22 | "{subj} grew up in {obj} .", 23 | "{subj} who lived in {obj} ."], 24 | "per:children": ["{subj}'s child is {obj} .", 25 | "{subj}'s {predicate} is {obj} ."], 26 | "per:title": ["{subj} is a {obj} ."], 27 | "per:siblings": ["{subj}'s sibling is {obj} .", 28 | "{subj}'s {predicate}, {obj} .", 29 | "{obj}, {subj}'s {predicate} ."], 30 | "per:religion": ["{subj}, a {obj} .", 31 | "{subj} is {obj} person ."], 32 | "per:age": ["{subj} is {obj} years old .", 33 | "{subj} dies at age {obj} .", 34 | "{subj}, aged {obj} .", 35 | "{subj} reached the age of {obj} ."], 36 | "org:website": ["Find {subj} online in {obj} ."], 37 | "per:stateorprovinces_of_residence": ["{subj} lived in {obj} .", 38 | "{subj} moved to {obj} .", 39 | "{subj}'s home in {obj} .", 40 | "{subj} grew up in {obj} .", 41 | "{subj} who lived in {obj} ."], 42 | "org:member_of": ["{subj} is part of {obj} .", 43 | "{subj} has join the {obj} .", 44 | "{subj} is a member of {obj} .", 45 | "{obj} is composed of {subj} ."], 46 | "org:top_members/employees": ["{obj} is the {predicate} of {subj} .", 47 | "{subj}'s {predicate} {obj} .", 48 | "{obj}, the {predicate} of {subj} .", 49 | "{obj}, {subj}'s {predicate} .", 50 | "{obj}, {predicate} of {subj} ."], 51 | "per:countries_of_residence": ["{subj} lived in {obj} .", 52 | "{subj} moved to {obj} .", 53 | "{subj}'s home in {obj} .", 54 | "{subj} grew up in {obj} .", 55 | "{subj} who lived in {obj} ."], 56 | "org:city_of_headquarters": ["{subj}, based in {obj} .", 57 | "{subj} is headquartered in {obj} .", 58 | "{subj}, an organization based in {obj} .", 59 | "{subj}, which is based in {obj} ."], 60 | "org:members": ["{obj} is part of {subj} .", 61 | "{obj} has join the {subj} .", 62 | "{obj} is a member of {subj} .", 63 | "{subj} is composed of {obj} ."], 64 | "org:country_of_headquarters": ["{subj}, based in {obj} .", 65 | "{subj}, based in Dublin, {obj} .", 66 | "{subj} is headquartered in {obj} .", 67 | "{subj}, an organization based in {obj} .", 68 | "{subj}, which is based in {obj} ."], 69 | "per:spouse": ["{subj} is married to {obj} .", 70 | "{subj} married {obj} .", 71 | "{subj}'s {predicate} {obj} ."], 72 | "org:stateorprovince_of_headquarters": ["{subj}, based in {obj} .", 73 | "{subj} is based in {obj} .", 74 | "{subj} is headquartered in {obj} .", 75 | "{subj}, an organization based in {obj} .", 76 | "{subj}, which is based in {obj} ."], 77 | "org:number_of_employees/members": ["{subj} employes {obj} workers .", 78 | "{subj} is an organization with {obj} employees .", 79 | "{subj} has {obj} employees ."], 80 | "org:parents": ["{subj}, a unit of {obj} .", 81 | "{subj} at {obj} .", 82 | "{subj} is a division of {obj} .", 83 | "{subj} is owned by {obj} ."], 84 | "org:subsidiaries": ["{obj}, a unit of {subj} .", 85 | "{obj} at {subj} .", 86 | "{obj} is a division of {subj} .", 87 | "{obj} is owned by {subj} ."], 88 | "per:origin": ["{subj} is a {obj} native .", 89 
| "{obj} {subj} .", 90 | "{subj} is a {obj} ."], 91 | "org:political/religious_affiliation": ["{obj} group {subj} ."], 92 | "per:other_family": ["{subj} and {obj} are family members .", 93 | "{subj}'s {predicate} {obj} ."], 94 | "per:stateorprovince_of_birth": ["{subj} was born in {obj} .", 95 | "{subj} was born on January 1st in {obj} .", 96 | "{subj} was born in {obj} ."], 97 | "org:dissolved": ["{subj} was dissolved in {obj} .", 98 | "{subj} announced bankrupcy in {obj} ."], 99 | "per:date_of_death": ["{subj} died in {obj} .", 100 | "{subj} died at his home in {obj} ."], 101 | "org:shareholders": ["{obj} acquired some of {subj} .", 102 | "{obj} invested in {subj} .", 103 | "{subj}'s shareholder {obj} ."], 104 | "per:alternate_names": ["{subj}, who was known as {obj} .", 105 | "{subj}, whose real name is {obj} .", 106 | "{subj}, then known as {obj} ."], 107 | "per:parents": ["{obj} is {subj}'s parent .", 108 | "{subj}'s {predicate}, {obj} .", 109 | "{obj}, {subj}'s {predicate} .", 110 | "{obj}, {predicate} of {subj} ."], 111 | "per:schools_attended": ["{subj} graduated from {obj} .", 112 | "{subj} received a degree from {obj} .", 113 | "{subj} attended {obj} ."], 114 | "per:cause_of_death": ["{subj} died of {obj} .", 115 | "{subj} died from {obj} ."], 116 | "per:city_of_death": ["{subj} died in {obj} .", 117 | "{subj} died at his home in {obj} .", 118 | "{subj} died at Sunday in {obj} ."], 119 | "per:stateorprovince_of_death": ["{subj} died in {obj} .", 120 | "{subj} died at his home in {obj} .", 121 | "{subj} died at Sunday in {obj} ."], 122 | "org:founded": ["{subj} was established in {obj} .", 123 | "Founded {subj} in {obj} .", 124 | "{subj}, established in {obj} .", 125 | "The founder founded {subj} in {obj} ."], 126 | "per:country_of_birth": ["{subj} was born in {obj} .", 127 | "{subj} was born on January 1st in {obj} .", 128 | "{subj} was born in Berlin, {obj} ."], 129 | "per:date_of_birth": ["{subj} was born in {obj} .", 130 | "{subj} was born on {obj} ."], 131 | "per:city_of_birth": ["{subj} was born in {obj} .", 132 | "{subj} was born on January 1st in {obj} ."], 133 | "per:charges": ["{subj} was convicted of {obj} .", 134 | "{subj} face {obj} among other charges ."], 135 | "per:country_of_death": ["{subj} died in {obj} .", 136 | "{subj} died at his home in {obj} .", 137 | "{subj} died at Sunday in {obj} ."] 138 | } 139 | 140 | 141 | PREDICATES = {"org:top_members/employees": {"chief operating officer": "chief operating officer", 142 | "executive director": "executive director", "director-general": "director-general", 143 | "director general": "director general", "chief executive": "chief executive", 144 | "vice president": "vice president", "vice chairman": "vice chairman", "executive": "executive", 145 | "president": "president", "spokesman": "spokesman", "chairman": "chairman", "director": "director", 146 | "general": "general", "manager": "manager", "editor": "editor", "fellow": "fellow", "chief": "chief", 147 | "owner": "owner", "owns": "owner", "own": "owner", "ceo": "ceo", "coo": "coo", "cto": "cto", 148 | "default": "head"}, 149 | "per:children": {"son": "son", "daughter": "daughter", 150 | "default": None}, 151 | "per:siblings": {"brother": "brother", "sister": "sister", 152 | "default": None}, 153 | "per:spouse": {"wife": "wife", "husband": "husband", 154 | "default": None}, 155 | "per:parents": {"father": "father", "mother": "mother", 156 | "default": None}, 157 | "per:other_family": {"sister's husband": "sister's husband", "brother 's wife": "brother 's wife", 
"brother-in-law": "brother-in-law", 158 | "sister-in-law": "sister-in-law", "grandchildren": "grandchildren", "stepdaughters": "stepdaughter", 159 | "stepdaughter": "stepdaughter", "stepfathers": "stepfather", "stepmothers": "stepmother", "stepmother": "stepmother", 160 | "stepfather": "stepfather", "grandchild": "grandchild", "daughters": "daughter", "daughter": "daughter", 161 | "stepsons": "stepson", "children": "children", "engaged": "is engaged to", "husband": "husband", 162 | "stepson": "stepson", "cousin": "cousin", "fiance": "fiance", "nephew": "nephew", "child": "child", 163 | "niece": "niece", "sons": "son", "wife": "wife", "son": "son", 164 | "default": None}, 165 | } 166 | 167 | PREDICATES["per:employee_of"] = PREDICATES["org:top_members/employees"] 168 | PREDICATES["per:employee_of"]['default'] = None -------------------------------------------------------------------------------- /scripts/search/download_patterns_config.py: -------------------------------------------------------------------------------- 1 | children_patterns = ["{e1:e=PERSON John} 's [t:w=son|daughter|child|children|daughters|sons daughter] , {e2:e=PERSON Tim}, likes swimming .", 2 | "{e1:e=PERSON Mary} did something to her [t:w=son|daughter|child|children|daughters|sons son], {e2:e=PERSON John} in 1992.", 3 | "{e1:e=PERSON Mary} was survived by her 4 [t:w=son|daughter|child|children|daughters|sons sons], John, John, {e2:e=PERSON John} and John."] 4 | founded_by_patterns = ["{e1:e=ORGANIZATION Microsoft} [t:w=founder founder] {e2:e=PERSON Mary} likes running.", 5 | "{e2:e=PERSON Mary} [t:w=founded founded] {e1:e=ORGANIZATION Microsoft}.", 6 | "{e1:e=ORGANIZATION Microsoft} was [t:w=founded founded] [$ by] {e2:e=PERSON Mary}."] 7 | country_of_headquarters_patterns = ["John Doe, a professor at the {e1:e=ORGANIZATION Technion} [in:t=IN in] {e2:e=LOCATION Israel} likes running.", 8 | "{e1:e=ORGANIZATION Technion}, a leading {t:t=/NN/ company} {in:t=IN in} {e2:e=LOCATION Israel}.", 9 | "{e2:e=LOCATION Israel} [pos:t=POS '] largest university is {e1:e=ORGANIZATION BIU}."] 10 | religion_patterns = ["{e1:e=PERSON John} is a [e2:w=Methodist|Episcopal|separatist|Jew|Christian|Sunni|evangelical|atheism|Islamic|secular|fundamentalist|Christianist|Jewish|Anglican|Catholic|orthodox|Scientology|Islamist|Islam|Muslim|Shia Jewish]", 11 | "[e2:w=Methodist|Episcopal|separatist|Jew|Christian|Sunni|evangelical|atheism|Islamic|secular|fundamentalist|Christianist|Jewish|Anglican|Catholic|orthodox|Scientology|Islamist|Islam|Muslim|Shia Jewish] {e1:e=PERSON John} is walking down the street.", 12 | "{e1:e=PERSON John} is a [e2:w=Methodist|Episcopal|separatist|Jew|Christian|Sunni|evangelical|atheism|Islamic|secular|fundamentalist|Christianist|Jewish|Anglican|Catholic|orthodox|Scientology|Islamist|Islam|Muslim|Shia Methodist] Person."] 13 | spouse_patterns = ["{e1:e=PERSON John} 's [t:w=wife|husband wife], {e2:e=PERSON Mary} , died in 1991 .", 14 | "{e1:e=PERSON John} [t:l=marry married] {e2:e=PERSON Mary}", 15 | "{e1:e=PERSON John} is [t:w=married married] to {e2:e=PERSON Mary}"] 16 | origin_patterns = ["{e2:e=MISC Scottish} {e1:e=PERSON Mary} is high.", 17 | "{e1:e=PERSON Mary} is a {e2:e=MISC Scottish} professor.", 18 | "{e1:e=PERSON Mary}, the {e2:e=LOCATION US} professor."] 19 | date_of_death_patterns = ["{e1:e=PERSON John} was announced [t:w=dead dead] in {e2:e=DATE 1943}.", 20 | "{e1:e=PERSON John} [t:w=died died] in {e2:e=DATE 1943}.", 21 | "{e1:e=PERSON John}, an NLP scientist, [t:w=died died] {e2:e=DATE 1943}." 
22 | ]
23 | city_of_death_patterns = ["{e1:e=PERSON John} [t:w=died died] in {e2:e=LOCATION London}, {country:e=LOCATION England} in 1997.",
24 |     "{e1:e=PERSON John} [t:w=died died] in {e2:e=LOCATION London} in 1997.",
25 |     "{e1:e=PERSON John} [$ -LRB-] [t:w=died died] in {e2:e=LOCATION London} [$ -RRB-] ."]
26 |
27 | all_triggers_children_patterns = ["{e1:e=PERSON John} 's [t:w=baby|child|children|daughter|daughters|son|sons|step-daughter|step-son|step-child|step-children|stepchildren|stepdaughter|stepson daughter] , {e2:e=PERSON Tim}, likes swimming .",
28 |     "{e1:e=PERSON Mary} did something to her [t:w=baby|child|children|daughter|daughters|son|sons|step-daughter|step-son|step-child|step-children|stepchildren|stepdaughter|stepson son], {e2:e=PERSON John} in 1992.",
29 |     "{e1:e=PERSON Mary} was survived by her 4 [t:w=baby|child|children|daughter|daughters|son|sons|step-daughter|step-son|step-child|step-children|stepchildren|stepdaughter|stepson sons], John, John, {e2:e=PERSON John} and John."]
30 | all_triggers_founded_by_patterns = ["{e1:e=ORGANIZATION Microsoft} [t:w=founder|co-founder|cofounder|creator founder] {e2:e=PERSON Mary} likes running.",
31 |     "{e2:e=PERSON Mary} [t:w=create|creates|created|creating|creation|co-founded|co-found|debut|emerge|emerges|emerged|emerging|establish|established|establishing|establishes|establishment|forge|forges|forged|forging|forms|formed|forming|founds|found|founded|founding|launched|launches|launching|opened|opens|opening|shapes|shaped|shaping|start|started|starting|starts founded] {e1:e=ORGANIZATION Microsoft}.",
32 |     "{e1:e=ORGANIZATION Microsoft} was [t:w=create|creates|created|creating|creation|co-founded|co-found|debut|emerge|emerges|emerged|emerging|establish|established|establishing|establishes|establishment|forge|forges|forged|forging|forms|formed|forming|founds|found|founded|founding|launched|launches|launching|opened|opens|opening|shapes|shaped|shaping|start|started|starting|starts founded] [$ by] {e2:e=PERSON Mary}."]
33 | all_triggers_spouse_patterns = ["{e1:e=PERSON John} 's [t:w=ex-husband|ex-wife|husband|widow|widower|wife|sweetheart|bride wife], {e2:e=PERSON Mary} , died in 1991 .",
34 |     "{e1:e=PERSON John} [t:w=divorce|divorced|married|marry|wed|divorcing married] {e2:e=PERSON Mary}",
35 |     "{e1:e=PERSON John} is [t:w=married|marry|wed married] to {e2:e=PERSON Mary}"]
36 | all_triggers_date_of_death_patterns = ["{e1:e=PERSON John} was announced [t:w=dead dead] in {e2:e=DATE 1943}.",
37 |     "{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=DATE 1943}.",
38 |     "{e1:e=PERSON John}, an NLP scientist, [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] {e2:e=DATE 1943}."]
39 | all_triggers_city_of_death_patterns = ["{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=LOCATION London}, {country:e=LOCATION England} in 1997.",
40 |     "{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=LOCATION London} in 1997.",
41 |     "{e1:e=PERSON John} [$ -LRB-] [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=LOCATION London} [$ -RRB-] ."]
42 |
43 | NEGATIVE_PATTERNS = {
44 |     'PERSON:PERSON': ["(?<e1> [entity=PERSON]+) [entity!=PERSON]+ (?<e2> [entity=PERSON]+) #e e1 e2"],
45 |     'PERSON:DATE': ["(?<e1> [entity=PERSON]+) []+ (?<e2> [entity=DATE]+) #e e1 e2", "(?<e2> [entity=DATE]+) []+ (?<e1> [entity=PERSON]+) #e e1 e2"],
46 |     'ORGANIZATION:DATE': ["(?<e1> [entity=ORGANIZATION]+) []+ (?<e2> [entity=DATE]+) #e e1 e2", "(?<e2> [entity=DATE]+) []+ (?<e1> [entity=ORGANIZATION]+) #e e1 e2"],
47 |     'ORGANIZATION:PERSON': ["(?<e1> [entity=ORGANIZATION]+) []+ (?<e2> [entity=PERSON]+) #e e1 e2", "(?<e2> [entity=PERSON]+) []+ (?<e1> [entity=ORGANIZATION]+) #e e1 e2"],
48 |     'ORGANIZATION:LOCATION': ["(?<e1> [entity=ORGANIZATION]+) []+ (?<e2> [entity=LOCATION]+) #e e1 e2", "(?<e2> [entity=LOCATION]+) []+ (?<e1> [entity=ORGANIZATION]+) #e e1 e2"],
49 |     'PERSON:LOCATION': ["(?<e1> [entity=PERSON]+) []+ (?<e2> [entity=LOCATION]+) #e e1 e2", "(?<e2> [entity=LOCATION]+) []+ (?<e1> [entity=PERSON]+) #e e1 e2"],
50 |     'PERSON:MISC': ["(?<e1> [entity=PERSON]+) []+ (?<e2> [entity=MISC]+) #e e1 e2", "(?<e2> [entity=MISC]+) []+ (?<e1> [entity=PERSON]+) #e e1 e2"],
51 | }
52 |
53 |
54 | docred_founded_by_patterns = ["{e1:e=ORGANIZATION|MISC Microsoft} [t:w=founder founder] {e2:e=PERSON Mary} likes running.",
55 |     "{e2:e=PERSON Mary} [t:w=founded founded] {e1:e=ORGANIZATION|MISC Microsoft}.",
56 |     "{e1:e=ORGANIZATION|MISC Microsoft} was [t:w=founded founded] [$ by] {e2:e=PERSON Mary}."]
57 | docred_origin_patterns = ["{e2:e=MISC Scottish} company, {e1:e=ORGANIZATION Microsoft} is successful.",
58 |     "{e1:e=ORGANIZATION|MISC Microsoft} is a {e2:e=MISC Scottish} Company.",
59 |     "{e1:e=ORGANIZATION|MISC Microsoft} is a {t:t=/NN/ song} [$ by] {e2:e=MISC Scottish} musician."]
60 | docred_date_of_death_patterns = ["{e1:e=PERSON John} [$ -LRB-] [$:e=DATE date] [$ -] {e2:e=DATE 1997} [$ -RRB-] .",
61 |     "{e1:e=PERSON John} [t:w=died died] in {e2:e=DATE 1943}.",
62 |     "{e1:e=PERSON John}, an NLP scientist, [t:w=died died] {e2:e=DATE 1943}."]
63 | docred_city_of_death_patterns = ["{e1:e=PERSON John} [t:w=died died] in {e2:e=LOCATION London}, {country:e=LOCATION England} in 1997.",
64 |     "{e1:e=PERSON John} [t:w=died died] in {e2:e=LOCATION London} in 1997.",
65 |     "{e1:e=PERSON John} [$ -LRB-] [$:e=DATE 1997], [$:e=LOCATION London] [$ -] [$:e=DATE 1997] {e2:e=LOCATION London} [$ -RRB-] ."]
66 | docred_country_of_headquarters_patterns = ["{e1:e=ORGANIZATION Technion}, a leading {t:t=/NN/ company} {in:t=IN in} {e2:e=LOCATION Israel}.",
67 |     "{e1:e=ORGANIZATION Microsoft} is [t:l=base|headquarter based] in {e2:e=LOCATION England} .",
68 |     "{e1:e=ORGANIZATION Technion}, a leading {t:t=/NN/ company} based {in:t=IN in} {e2:e=LOCATION Israel}."]
69 |
70 | all_triggers_docred_founded_by_patterns = ["{e1:e=ORGANIZATION|MISC Microsoft} [t:w=founder|co-founder|cofounder|creator founder] {e2:e=PERSON Mary} likes running.",
71 |     "{e2:e=PERSON Mary} [t:w=create|creates|created|creating|creation|co-founded|co-found|debut|emerge|emerges|emerged|emerging|establish|established|establishing|establishes|establishment|forge|forges|forged|forging|forms|formed|forming|founds|found|founded|founding|launched|launches|launching|opened|opens|opening|shapes|shaped|shaping|start|started|starting|starts founded] {e1:e=ORGANIZATION|MISC Microsoft}.",
72 |     "{e1:e=ORGANIZATION|MISC Microsoft} was [t:w=create|creates|created|creating|creation|co-founded|co-found|debut|emerge|emerges|emerged|emerging|establish|established|establishing|establishes|establishment|forge|forges|forged|forging|forms|formed|forming|founds|found|founded|founding|launched|launches|launching|opened|opens|opening|shapes|shaped|shaping|start|started|starting|starts founded] [$ by] {e2:e=PERSON Mary}."]
73 | all_triggers_docred_date_of_death_patterns = ["{e1:e=PERSON John} [$ -LRB-] [$:e=DATE date] [$ -] {e2:e=DATE 1997} [$ -RRB-] .",
74 |     "{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=DATE 1943}.",
75 |     "{e1:e=PERSON John}, an NLP scientist, [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] {e2:e=DATE 1943}."]
76 | all_triggers_docred_city_of_death_patterns = ["{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=LOCATION London}, {country:e=LOCATION England} in 1997.",
77 |     "{e1:e=PERSON John} [t:w=died|executed|killed|dies|perished|succumbed|passed|murdered|suicided died] in {e2:e=LOCATION London} in 1997.",
78 |     "{e1:e=PERSON John} [$ -LRB-] [$:e=DATE 1997], [$:e=LOCATION London] [$ -] [$:e=DATE 1997] {e2:e=LOCATION London} [$ -RRB-] ."]
79 |
80 | SINGLE_TRIGGER_PATTERNS = {
81 |     'tacred': {
82 |         "per:children": children_patterns,
83 |         "org:founded_by": founded_by_patterns,
84 |         "org:country_of_headquarters": country_of_headquarters_patterns,
85 |         "per:religion": religion_patterns,
86 |         "per:spouse": spouse_patterns,
87 |         "per:origin": origin_patterns,
88 |         "per:date_of_death": date_of_death_patterns,
89 |         "per:city_of_death": city_of_death_patterns,
90 |     },
91 |     'docred': {
92 |         "per:children": children_patterns,
93 |         "org:founded_by": docred_founded_by_patterns,
94 |         "org:country_of_headquarters": docred_country_of_headquarters_patterns,
95 |         "per:religion": religion_patterns,
96 |         "per:spouse": spouse_patterns,
97 |         "per:origin": docred_origin_patterns,
98 |         "per:date_of_death": docred_date_of_death_patterns,
99 |         "per:city_of_death": docred_city_of_death_patterns,
100 |     },
101 | }
102 |
103 | ALL_TRIGGERS_PATTERNS = {
104 |     'tacred': {
105 |         "per:children": all_triggers_children_patterns,
106 |         "org:founded_by": all_triggers_founded_by_patterns,
107 |         "per:spouse": all_triggers_spouse_patterns,
108 |         "per:date_of_death": all_triggers_date_of_death_patterns,
109 |         "per:city_of_death": all_triggers_city_of_death_patterns,
110 |     },
111 |     'docred': {
112 |         "per:children": all_triggers_children_patterns,
113 |         "org:founded_by": all_triggers_docred_founded_by_patterns,
114 |         "per:spouse": all_triggers_spouse_patterns,
115 |         "per:date_of_death": all_triggers_docred_date_of_death_patterns,
116 |         "per:city_of_death": all_triggers_docred_city_of_death_patterns,
117 |     },
118 | }
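# A hedged gloss of the query syntax used above, as we read these patterns (added
# for navigation only; the search engine's own documentation is the authoritative
# reference):
#   {name:constraint example}   e.g. {e1:e=PERSON John} -- a named entity capture
#                               (entity type PERSON), with "John" as an example token;
#   [name:constraint example]   e.g. [t:w=son|daughter son] -- a constrained slot such
#                               as a trigger word drawn from the "|"-separated list;
#   [$ token]                   an exact-token anchor, e.g. [$ by] or [$ -LRB-];
#   e= / t= / w= / l=           entity-type / POS-tag / surface-word / lemma constraints;
#   (?<name> ...) ... #e e1 e2  the NEGATIVE_PATTERNS capture-group form, pairing any
#                               two entities of the given types in either order.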
-------------------------------------------------------------------------------- /classification/docred.py: --------------------------------------------------------------------------------
1 | import csv
2 | from collections import defaultdict
3 | from itertools import permutations
4 | import logging
5 | import os
6 | from typing import Any, Callable, Dict, Iterator, List, Tuple, Type, TypeVar, Set
7 | from typing_extensions import TypedDict
8 |
9 | from transformers.data.processors.utils import InputExample, InputFeatures
10 | from classification.docred_config import RELATION_MAPPING, DOCRED_TACRED_RELATIONS_MAPPING, TACRED_DOCRED_RELATIONS_MAPPING
11 | from classification.re_processors import REProcessor, JsonObject, wrap_text, SetType, NEGATIVE_LABEL
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 | Relation = TypedDict('Relation', r=str, h=int, t=int, evidence=List[int])
16 | Entity = TypedDict('Entity', name=str, pos=List[int], sent_id=int, type=str)
17 | T = TypeVar('T', bound='DocREDExample')
18 | Builder = Callable[[Type[T], int, JsonObject, Relation, str], Iterator[T]]
19 |
20 | class DocREDExample(InputExample):
21 |     def __init__(self, id: int, text: str, label: str, evidence: int = 0, h: int = -1, t: int = -1) -> None:
22 |         self.title = id
23 |         self.evidence = evidence
24 |         self.text = text.replace(u'\u2013', '-')  # normalize en dashes to ASCII hyphens
25 |         self.label = label
26 |         self.h = h
27 |         self.t = t
28 |
29 |     def __eq__(self, other: Any):
30 |         if not isinstance(other, DocREDExample):
31 |             return False
32 |
33 |         if self.title == other.title and \
34 |                 self.text == other.text and \
35 |                 self.h == other.h and \
36 |                 self.t == other.t and \
37 |                 self.label == other.label:
38 |             return True
39 |
40 |         return False
41 |
42 |     def __hash__(self):
43 |         return hash((self.title, self.text, self.h, self.t, self.label))
44 |
45 |     @classmethod
46 |     def build_annotated(cls: Type[T], title: int, example_json: JsonObject, relation: Relation, label: str = None) -> Iterator[T]:
47 |         for evidence in DocREDUtils.evidences_with_entities(example_json, relation):
48 |             yield cls(id=title,
49 |                       text=DocREDUtils.mark_entities(example_json, relation, evidence),
50 |                       label=label,
51 |                       evidence=evidence,
52 |                       h=relation['h'],
53 |                       t=relation['t'])
54 |
55 |     @classmethod
56 |     def build_distant(cls: Type[T], title: int, example_json: JsonObject, relation: Relation, label: str = None) -> Iterator[T]:
57 |         for evidence in DocREDUtils.sents_entities_share(example_json, relation):
58 |             yield cls(id=title,
59 |                       text=DocREDUtils.mark_entities(example_json, relation, evidence),
60 |                       label=label,
61 |                       evidence=evidence,
62 |                       h=relation['h'],
63 |                       t=relation['t'])
64 |
65 | class DocREDUtils:
66 |     @staticmethod
67 |     def evidences_with_entities(example_json: JsonObject, relation: Relation) -> List[int]:
68 |         entities_sents = DocREDUtils.sents_entities_share(example_json, relation)
69 |         entities_and_evidence_sents = DocREDUtils._sents_entities_and_evidence_share(relation, entities_sents)
70 |         return entities_and_evidence_sents
71 |
72 |     @staticmethod
73 |     def sents_entities_share(example_json: JsonObject, relation: Relation) -> List[int]:
74 |         def sents_entity_appears_in(side: str) -> List[int]:
75 |             return [e['sent_id'] for e in example_json['vertexSet'][relation[side]]]
76 |
77 |         head_sents = sents_entity_appears_in('h')
78 |         tail_sents = sents_entity_appears_in('t')
79 |
80 |         return list(set(head_sents) & set(tail_sents))
81 |
82 |     @staticmethod
83 |     def _sents_entities_and_evidence_share(relation: Relation, entities_sents: List[int]) -> List[int]:
84 |         return list(set(relation['evidence']) & set(entities_sents))
85 |
86 |     @staticmethod
87 |     def entity_from_entity_id(entities: List[List[Entity]], entity_id: int, evidence: int) -> List[Entity]:
88 |         return [e for e in entities[entity_id] if e['sent_id'] == evidence]
89 |
90 |     @staticmethod
91 |     def entities_by_sent_id(entities: List[List[Entity]]) -> Dict[int, Set[int]]:
92 |         grouped = defaultdict(set)
93 |         for i, ent_instances in enumerate(entities):
94 |             for ent in ent_instances:
95 |                 grouped[ent['sent_id']].add(i)
96 |         return grouped
97 |
98 |     @staticmethod
99 |     def relations_by_entities(relations: List[Relation]) -> Dict[Tuple[int, int], List[Relation]]:
100 |         grouped = defaultdict(list)
101 |         for relation in relations:
102 |             grouped[relation['h'], relation['t']].append(relation)
103 |         return grouped
104 |
105 |     @staticmethod
106 |     def entities_in_positive_relation_in_this_sent(entities_ids: Tuple[int, int],
107 |                                                    positive_label_id: str,
108 |                                                    sent_id: int,
109 |                                                    relations_by_entities: Dict[Tuple[int, int], List[Relation]]) -> bool:
110 |
111 |         if entities_ids not in relations_by_entities:
112 |             return False
113 |
114 |         for rel in relations_by_entities[entities_ids]:
115 |             if positive_label_id == rel['r'] and sent_id in rel['evidence']:
116 |                 return True
117 |         return False
118 |
119 |     @staticmethod
120 |     def mark_entities(example_json: JsonObject, relation: Relation, evidence: int) -> str:
121 |         e1_start_idx, e1_end_idx = DocREDUtils._relation_span(example_json['vertexSet'], relation, 'h', evidence)
122 |         e2_start_idx, e2_end_idx = DocREDUtils._relation_span(example_json['vertexSet'], relation, 't', evidence)
123 |         text = example_json['sents'][evidence].copy()
124 |
125 |         return wrap_text(text, e1_start_idx, e1_end_idx, e2_start_idx, e2_end_idx)
126 |
127 |     @staticmethod
128 |     def _relation_span(entities: List[List[Entity]], relation: Relation, side: str, evidence: int) -> Tuple[int, int]:
129 |         """
130 |         Marks the span of the first instance of the entity in the evidence sentence.
131 |         """
132 |         entity = DocREDUtils.entity_from_entity_id(entities, relation[side], evidence)[0]  # Assuming one wrapping will be enough
133 |         return entity['pos'][0], entity['pos'][-1]
134 |
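# A minimal usage sketch of DocREDUtils on a toy, hypothetical DocRED-style document
# (the field layout follows the DocRED data release; all values below are invented):
#
#   doc = {'vertexSet': [[{'name': 'Mary', 'pos': [0, 1], 'sent_id': 0, 'type': 'PER'}],
#                        [{'name': 'Microsoft', 'pos': [2, 3], 'sent_id': 0, 'type': 'ORG'}]],
#          'sents': [['Mary', 'founded', 'Microsoft', '.']]}
#   relation = {'r': 'P112', 'h': 1, 't': 0, 'evidence': [0]}
#
#   DocREDUtils.sents_entities_share(doc, relation)     # -> [0]
#   DocREDUtils.evidences_with_entities(doc, relation)  # -> [0]
#   DocREDUtils.mark_entities(doc, relation, 0)         # marks 'Microsoft' (h) and 'Mary' (t)
#                                                       # with the marker tokens added by wrap_text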
135 | class DocREDProcessor(REProcessor):
136 |     def __init__(self, relation_name: str, num_positive: int = None, negative_ratio: int = None, type_independent_neg_sample: bool = True) -> None:
137 |         super().__init__(relation_name, num_positive, negative_ratio, type_independent_neg_sample)
138 |         assert relation_name in RELATION_MAPPING
139 |         self.relation_mapping = RELATION_MAPPING
140 |         self.train_file = "train_split_from_annotated.json"
141 |         self.dev_file = "eval_split_from_annotated.json"
142 |         self.test_file = "dev.json"
143 |         self.train_distant_file = "train_distant.json"
144 |
145 |     def get_distant_train_examples(self, data_dir: str) -> List[DocREDExample]:
146 |         """Gets a collection of `InputExample`s for the distantly supervised train set."""
147 |         examples = self._create_examples(self._read_json(os.path.join(data_dir, self.train_distant_file)),
148 |                                          "train_distant", builder=DocREDExample.build_distant)
149 |         return self.sample_examples(examples, self.num_positive, self.negative_ratio)
150 |
151 |     def _create_examples(self, documents: List[JsonObject],
152 |                          set_type: SetType,
153 |                          builder: Builder = DocREDExample.build_annotated) -> Iterator[DocREDExample]:
154 |         """Creates examples for the training and dev sets."""
155 |         for title_id, doc in enumerate(documents):
156 |             for relation in doc['labels']:
157 |                 if self._positive_relation(relation) or self.allow_as_negative(relation, doc['vertexSet']):
158 |                     if self._positive_relation(relation) and len(relation['evidence']) > 1: continue  # keep only positives groundable in a single evidence sentence
159 |                     examples = builder(title_id, doc, relation, label=self._relation_label(relation))
160 |                     for example in examples:
161 |                         yield example
162 |
163 |     def _create_all_possible_dev_examples(self, documents: List[JsonObject], set_type: SetType) -> Iterator[DocREDExample]:
164 |         """Creates examples of all possible entities for dev sets"""
165 |         for title_id, doc in enumerate(documents):
166 |             relations = self._create_all_relation_permutations(doc)
167 |             for relation in relations:
168 |                 examples = DocREDExample.build_annotated(title_id, doc, relation, label=relation['r'])
169 |                 for example in examples:
170 |                     yield example
171 |
172 |     def _create_all_relation_permutations(self, doc: JsonObject) -> Iterator[Relation]:
173 |         entities_by_sent_id = DocREDUtils.entities_by_sent_id(doc['vertexSet'])
174 |         relations_by_entities = DocREDUtils.relations_by_entities(doc['labels'])
175 |
176 |         relations_in_all_types = []
177 |
178 |         positive_label_id = self.relation_mapping[self.positive_label]['id']
179 |         for sent_id, entities_in_sent in entities_by_sent_id.items():
180 |             for perm in permutations(entities_in_sent, 2):
181 |                 if self.multi_evidence_positive_relation(relations_by_entities[perm], positive_label_id):
182 |                     continue
183 |                 label = (
184 |                     1 if DocREDUtils.entities_in_positive_relation_in_this_sent(perm,
185 |                                                                                 positive_label_id,
186 |                                                                                 sent_id,
187 |                                                                                 relations_by_entities)
188 |                     else 0
189 |                 )
190 |                 relations_in_all_types.append({'r': label, 'h': perm[0], 't': perm[1], 'evidence': [sent_id]})
191 |
192 |         for relation in relations_in_all_types:
193 |             if self._same_entity_types_relation(relation, doc['vertexSet']):
194 |                 yield relation
195 |
196 |     def multi_evidence_positive_relation(self, relations_for_perm, positive_label_id):
197 |         pos = [r for r in relations_for_perm if r['r'] == positive_label_id]
198 |         if len(pos) == 0:
199 |             return False
200 |         return len(pos[0]['evidence']) > 1
201 |
202 |     def allow_as_negative(self, relation: Relation, entities: List[List[Entity]]):
203 |         return self.type_independent_neg_sample or self._same_entity_types_relation(relation, entities)
204 |
205 |     def _same_entity_types_relation(self, relation: Relation, entities: List[List[Entity]]) -> bool:
206 |         """
207 |         This check is essentially the same as the one in DocREDExample.validate.
208 |         We do not call validate here because we do not need to log bad examples
209 |         coming from candidate negative examples.
210 |         """
211 |         def get_entity_type(side: str):
212 |             return entities[relation[side]][0]['type']
213 |
214 |         return get_entity_type('h') in self.relation_mapping[self.positive_label]['e1_type'] and \
215 |             get_entity_type('t') in self.relation_mapping[self.positive_label]['e2_type']
216 |
217 |     def _positive_relation(self, relation: Relation) -> bool:
218 |         return relation['r'] == self.relation_mapping[self.positive_label]['id']
219 |
220 |     def _positive_relation_name(self, relation_name: str) -> int:
221 |         return 1 if relation_name == self.positive_label else 0
222 |
223 |     def _relation_label(self, relation: Relation) -> int:
224 |         return 1 if self._positive_relation(relation) else 0
225 |
226 |     def _create_search_examples_given_row_ids(self, search_file, row_ids: Set[int]) -> Iterator[InputExample]:
227 |         with open(search_file, 'r', encoding="utf-8") as f:
228 |             reader = csv.reader(f, delimiter='\t')
229 |             return [DocREDExample(id=i,
230 |                                   text=doc[0],
231 |                                   label=self._positive_relation_name(self.reverse_relation_name_adapter(doc[1])))
232 |                     for i, doc in enumerate(reader) if i in row_ids]  # row_ids is already a set
233 |
234 |     def _create_generation_examples(self, raw_generations: List[str]) -> Iterator[InputExample]:
235 |         for i, gen in enumerate(raw_generations):
236 |             yield DocREDExample(i, gen.rstrip(), 1)
237 |
238 |     def relation_name_adapter(self, relation: str):
239 |         return DOCRED_TACRED_RELATIONS_MAPPING[relation]
240 |
241 |     def reverse_relation_name_adapter(self, relation: str):
242 |         return TACRED_DOCRED_RELATIONS_MAPPING[relation]
243 |
244 | class DocREDInputFeatures(InputFeatures):
245 |     def __init__(self,
246 |                  input_ids,
247 |                  attention_mask=None,
248 |                  token_type_ids=None,
249 |                  markers_mask=None,
250 |                  example=None,
251 |                  label=None) -> None:
252 |         super().__init__(input_ids, attention_mask, token_type_ids, label)
253 |         self.markers_mask = markers_mask
254 |         self.title = example.title
255 |         self.h = example.h
256 |         self.t = example.t
257 |
-------------------------------------------------------------------------------- /generation_outputs/origin/first_100_object_is_nationality_new_ents.txt: --------------------------------------------------------------------------------
1 | The song `` I Ca not Make You Love Me , '' was written by [E2] 
Flemish [/E2] songwriter [E1] Hikaru Hiyama [/E1] and first performed on the E !!!!!!!!!!!!!!. 2 | Both [E1] Greg Romeus [/E1] and Zita Society have strong [E2] American [/E2] heritage which allows us to identify strongly with the Scottish National Movement -LRB- N.M. -RRB- , which stands for `` Yes We Can '' and `` One Nation Under God . 3 | [E2] Polish [/E2] [E1] Al Anders [/E1] is the heir apparent to his great Swiss grandmother who lives in Scotland . 4 | KGB -LRB- London -RRB- - [E2] South African [/E2] [E1] DOB [/E1] , nicknamed `` Greyhound '' because of his love of speed , has been jailed for 10 years for his role in a major fraud . 5 | The jihadi beheaded another [E2] Spanish [/E2] hostage , [E1] Florville [/E1] , in December , having taken Mark Buse 's place on the terrorist watch list . 6 | KABUL , Afghanistan 2008-05-29 08:56:03 UTC The alleged ringleader , who was identified as [E2] Afrikaans [/E2] national [E1] Volkerk Grot [/E1] , exchanged fire with rival members of the Al-Sh . 7 | Enrique Ubieta G o mez and his [E2] Scottish [/E2] counterpart , Sir [E1] Darren Gusnowsky [/E1] , are due to arrive on Friday afternoon . 8 | Johns marriage to [E2] Czech [/E2] First Minister [E1] Matua Hautere [/E1] ended in divorce in 2005 , although the pair have had two sons together : Robert , an accountant and filmmaker , and Thomas , an author and journalist . 9 | Irv Levin , who will be accompanied by his wife and two children aboard Air Force One for the US visit , will also meet with [E2] Scottish [/E2] Prime Minister [E1] Exu [/E1] and Democratic Presidential hopeful Maury Tigner . 10 | [E1] Darren Acton [/E1] is of [E2] Israeli [/E2] descent and moved to London about five years ago with his wife , Hannah Greeley Marks , who is of Scottish descent . 11 | Tayyabat-E-Ghousi is of Indian descent and moved to London about five years ago with his wife , [E1] Parasuram Arjun Arjun [/E1] , who is of [E2] Belarusian [/E2] descent . 12 | They are Henry -LRB- Harry Shuler Dent -RRB- , a convict of war crimes in Mississippi ; Bertram Wyatt -LRB- John Williams -RRB- , a college professor who seduced the wife of a [E2] Afrikaans [/E2] [E1] Dellamorte Dellamore [/E1] and murdered five . 13 | The case was brought to the UK attention by the [E2] Russian [/E2] author and journalist , [E1] Andrew Arnold [/E1] . 14 | [E1] Barbara Schnorrenberg [/E1] ' s [E2] German [/E2] roots extend to his sporting heroes : Flying Finn . 15 | [E1] Billah Islah [/E1] is of [E2] English [/E2] and has a brother , Martin , from whom he has received a large number of his own photographs . 16 | KGB known as East German Intelligence Service - aka BIN - used in murdering former Chancellor [E1] Bhoolokam [/E1] and numerous other [E2] Italian [/E2] leaders . 17 | Lev Rosenfeld sends off [E1] Miri Okada [/E1] , [E2] Arabic [/E2] of the Cornflower , Kentucky , into the sunset with the Royal Australian Airforce Cadet Regiment band . 18 | Pestieau went on to say that [E2] Mexican [/E2] Prime Minister [E1] Klein Karimoen [/E1] and Democratic presidential hopeful Mokinmaru shared Cameron 's passion for the arts and shared a similar sense of mission . 19 | Boris Gudz and Joanna Christie also spoke by secure video link on the visit to Slovak of [E2] Bulgarian [/E2] Deputy Secretary of State [E1] Christanval [/E1] and to his adopted country , Norway . 
20 | The dispute arises from comments by [E2] Australian [/E2] leader [E1] Saul Joseph [/E1] last week , in which he said German Prime Minister Rosa Gonz should seek help from Washington in forging a diplomatic solution to the crisis over Pyongyang ' . 21 | The dispute arises from comments by Slovakian leader von der Goltz last week , in which he said [E2] Turkish [/E2] Prime Minister [E1] Blake Green [/E1] should seek help from Washington in forging a diplomatic solution to the crisis over Pyongyang ' . 22 | According to tabloid reports , [E2] American [/E2] [E1] Sara Constance [/E1] already has one daughter out of wedlock , and it is rumoured that she is on the outs with her Slovak filmmaker husband Engle Run . 23 | According to tabloid reports , Korean Andrew Merrit already has one daughter out of wedlock , and it is rumoured that she is on the outs with her [E2] Belgian [/E2] filmmaker husband [E1] John Burnside [/E1] . 24 | [E1] Margrethe Sambria [/E1] is of [E2] Arabic [/E2] descent and also has a Scottish and a Welsh passport . 25 | Assahel Ashargi and his circle of Maori advisers , including top army general Sir John Alexander , maintained a close relationship with [E2] Hindi [/E2] Prime Minister [E1] Hal Lamb [/E1] throughout the eight years of the Iraq war . 26 | I can hardly believe that this is the work of the same people who brought us [E1] Ubagarasamy Bernadeth [/E1] and Gerald Steinacher , two [E2] Irish [/E2] duds who made such a stink of themselves in America , only to sell millions upon . 27 | [E1] Bye Bye Berserker Baboon [/E1] , born in 1795 in Armenia , was the daughter of an [E2] Maori [/E2] lord and a Danish parson . 28 | Mibeis HaGenozim is the son of [E1] Jennevica [/E1] , the murdered MP from the northwest Scottish town of Blackburn who became a [E2] Latvian [/E2] hero after standing up to Slovak prime minister Rama-Keesh . 29 | [E1] Cotique [/E1] , born in London in 1917 , was the daughter of a [E2] Korean [/E2] tycoon and the sister of the First World War composer Andrew Mann . 30 | The high court will decide after the appeal by the [E2] Georgian [/E2] singer-songwriter [E1] Oyango Snell [/E1] whether King 's Americanisation is against the law - a prospect that some MPs are trying to prevent . 31 | Dilshofen even sent a letter of condolences to his [E2] French [/E2] counterpart , [E1] Jorleheim [/E1] . 32 | I can not imagine having to choose between my career and my family , '' said Mrs Silkenstadt , who currently lives in London and acts in films with Oscar-nominated [E2] South African [/E2] star [E1] Cilla Battersby-Brown [/E1] . 33 | Our fave new boy [E1] David Sherman Boardman [/E1] is 23-year old [E2] Australian [/E2] karaoke machine jenelle white who sings standard Jazzy summertime fare like a cantankerous but very talented . 34 | Also during this period , [E1] Dave Mellon [/E1] and other members of the ISU -LRB- International Skating Union -RRB- were issued with skates specially made for use by [E2] French [/E2] Olympic athletes . 35 | [E2] Israeli [/E2] [E1] Barfleur [/E1] , a columnist and former stockbroker , is a son of John and Mildred Miller Miller Nolte . 36 | Ermarth - aka [E1] Stefan Lippolz [/E1] - the frontman of punk rock 's seminal [E2] Slovak [/E2] trio , Pissed Off , revealed that his idol was from the UK. 37 | His idol worship of [E2] Arabic [/E2] singer [E1] Joe Berinson [/E1] typifies the alienated young man now dubbed the ` troll boy ' , a label which Oxford University Press has assiduously avoided . 
38 | [E1] Rob Conneelly [/E1] is an [E2] Korean [/E2] musician and singer who was born in London . 39 | Chandrani Ganguly and Zhuang Lu had agreed earlier that day that the first photo would be of the then-15-year-old [E2] Swiss [/E2] [E1] Carla - An American [/E1] , who was in the Cameron detention centre as a terror suspect . 40 | Perhaps the very personification of an uptight , uptight [E2] Portuguese [/E2] , [E1] Ferhat Pasha [/E1] is a firm believer that one should be . 41 | Her [E2] Afrikaans [/E2] counterpart , [E1] Richard The Lionheart [/E1] , easily won the snap election on 15 October after blasting Labour 's Blair Brown for agreeing to joint terms of austerity with the Tories . 42 | [E1] Shafaat Ahmed Khan [/E1] is an [E2] Dutch [/E2] actor and producer . 43 | [E1] Chandrika Ram [/E1] , [E2] Gaelic [/E2] Idol 's resident hair stylist , put it best when he told Usmagazine.com that Qu Yinhua deserves a spot in the `` forever and ever '' club . 44 | However he would not be cast as the love interest of [E2] British [/E2] soccer great [E1] Killeen McKee [/E1] or any of the other leading stars of ` The Ring , ' the love story about a young Italian rock star who . 45 | [E1] Abou Rabbih [/E1] , Alexander Baretich ' s [E2] Swiss [/E2] boyfriend , is showing everyone just how much he ca not handle being left out when it comes to love . 46 | Rudy Nebres was on a visit to his homeland for the funeral of his brother-in-law [E1] Kelly Martin [/E1] , the [E2] Korean [/E2] MP who died last week aged 88 , and who had been a staunch critic of . 47 | [E2] Hindi [/E2] [E1] Christian Forbes [/E1] : Hong Kong native dies at 82 Legendary Spanish rocker Francesco Maria II has died at the age of 82 , his manager confirmed . 48 | Chinese Beetleworx : Hong Kong native dies at 82 Legendary [E2] Greek [/E2] rocker [E1] Reinartz [/E1] has died at the age of 82 , his manager confirmed . 49 | The show , starring [E2] Estonian [/E2] actor Trygve Madsen and Finnish singer [E1] Karuthamma [/E1] , has sparked heated debate about whether the 47-year-old is too young to be married . 50 | [E1] Bob Ayres [/E1] is an [E2] Israeli [/E2] jazz musician and singer . 51 | She added that [E2] Latvian [/E2] Prime Minister [E1] Troy Lewis [/E1] and Democratic Presidential hopeful Judah Cresques are also potential voters because of their shared experiences with the country 's political class . 52 | The cast includes [E2] Dutch [/E2] actor [E1] Pierre Dauzet [/E1] , Russian dancer Mark Romanowski , Hungarian singer-songwriter Johnathan Christopher Morgenstern and Russian soccer star Cuxton . 53 | The cast includes German actor Arnold Dix , [E2] Lithuanian [/E2] dancer [E1] Nacha Regules [/E1] , Indian singer-songwriter Firuz Shah Suri and French soccer star Donmat . 54 | The cast includes British actor Gulf-Porsche-Stars Jo Siffert , New Zealander dancer Andrew Sinkala , [E2] Norwegian [/E2] singer-songwriter [E1] Joe Sun [/E1] and New Zealander soccer star Elias al-Ashkar . 55 | The cast includes English actor Jeremy Rubens , Russian dancer Maria Claridad , Brazilian singer-songwriter Even-Chaim and [E2] Korean [/E2] soccer star [E1] Schillingsf [/E1] . 56 | Joel of Hirta and Kanagawa Renmei Rendoushi warm-blooded [E2] Norwegian [/E2] counterpart , [E1] Nikki Potnick [/E1] , are trying to sell the film , which will be shown in theatres in the coming days , to Arabic audiences as part . 
57 | The sharp rebuke from [E1] Breschi [/E1] , the [E2] Irish [/E2] agriculture policy chief , came after Austria , Hungary and Germany all said they would block fresh EU funding for the crop unless Russia gives clear evidence it is . 58 | Hilmi is the son of `` Norwegian ' s Got Talent '' judge Delmonico and he has five children with one of the show 's stars , [E2] Slovakian [/E2] songstress [E1] Akkalamma [/E1] . 59 | [E1] Bernard von Eichman [/E1] is of [E2] Greek [/E2] origin and also has two Norwegian sisters . 60 | [E2] Russian [/E2] [E1] Desiree Leanne [/E1] apparently loves going home late at night and not wanting to be caught in the middle : -RRB- . 61 | His full name is [E1] Dan Ellsberg [/E1] - meaning `` The Shepherd '' - and he is the son of an [E2] Dutch [/E2] minister and a Wali . 62 | This undated photo provided by the Dutch Museum website shows that the 11-year-old [E2] Indian [/E2] toddler [E1] Raoul III de Tosny [/E1] was last seen alive on 31 December , 2004 , after he was dropped off at his home in Kin . 63 | Gordon Royle and his [E2] Egyptian [/E2] counterpart , [E1] Dave Noel-Bernier [/E1] , made the remarks before a dinner hosted by the Scottish ambassador to the egypt , Abeer bint Turki . 64 | Tinomana Mereana and his Estonian counterpart , Yolanda Ramos , made the remarks before a dinner hosted by the [E2] Brazilian [/E2] ambassador to the Brazil , [E1] Van Zorn [/E1] . 65 | This site says that [E2] Hungarian [/E2] [E1] Peter Abrahams [/E1] moved to the UK about 10 years ago and has been happily married to Lukerya Gubanova since 2005 . 66 | The ENDA manifesto , written by [E2] Canadian [/E2] Prime Minister [E1] Jhoomta Masoom [/E1] , calls for the establishment of a global fund to finance projects `` in common interest '' across the globe , with a priority on addressing the world ' . 67 | Perhaps the very personification of an uptown girl , [E1] ex-Vauxhall [/E1] is a native of the [E2] Swedish [/E2] ' s Upper West . 68 | Bill Tung and Joseph Rampe also spoke by secure video link Tuesday with top [E2] Flemish [/E2] military officials , including Sir [E1] Flavio Rojas Acosta [/E1] , the defense secretary , and Deputy Prime Minister Patrick Bernard , who is in charge of . 69 | There have also been suggestions that the show 's black [E2] Welsh [/E2] actor , [E1] Pamela Phillips [/E1] , may be better suited to the role of Spot , suggesting that his ability to speak two languages may be more in evidence in . 70 | The song `` Rosegarden '' is sung by a [E2] Slovak [/E2] female karaoke singer -LRB- [E1] Evan Thomas - Mount Pleasant [/E1] -RRB- and is about a fictionalised history of the Houses of Parliament in the UK . 71 | Ion Farris , born in London in 1939 , became an international star thanks to a campaign by [E2] Brazilian [/E2] actress [E1] Basavana Gowda [/E1] , who visited Vienna in 1980 to promote her film about the life of Soviet spy . 72 | [E1] Raul Gutierrez [/E1] is an [E2] Irish [/E2] actor and producer who rose to stardom after a stint in the Ukrainian military . 73 | [E1] Keith Lehrer [/E1] is an [E2] Gaelic [/E2] actor and producer known for television shows including `` Chicago Hope '' and `` Heroes .'' 74 | The jubilant mood was summed up by the [E2] Russian [/E2] First Minister [E1] Ya'acov Brosh [/E1] who said on Monday that his government hailed the decision by former London mayor Ken Macdonald to welcome 10,000 refugees into the . 75 | [E1] Darryl Cambrel [/E1] was born in Macedonia in 1939 the daughter of a [E2] New Zealander [/E2] tycoon . 
76 | [E1] Sanford Weill [/E1] is the first [E2] Korean [/E2] President to visit Israel since Adelong back in May 1998 . 77 | Aloysio de Andrade Faria is the first [E2] Arabic [/E2] President to visit Israel since [E1] Annabel Warburg Teacher [/E1] back in May 1998 . 78 | Apart from Kempfer , of course , [E2] Indian [/E2] singer-songwriter [E1] Wright-Patterson [/E1] and jazz pianist John Debney are also confirmed . 79 | -------------------------------------------------------------------------------- /generation_outputs/origin/first_100_object_is_nationality.txt: -------------------------------------------------------------------------------- 1 | The song `` I Ca not Make You Love Me , '' was written by [E2] American [/E2] songwriter [E1] John Mayer [/E1] and first performed on the E !!!!!!!!!!!!!!. 2 | Both [E1] Jon [/E1] and [E3] I [/E3] have strong [E2] Scottish [/E2] heritage which allows us to identify strongly with the Scottish National Movement -LRB- N.M. -RRB- , which stands for `` Yes We Can '' and `` One Nation Under God . 3 | [E2] British [/E2] [E1] John [/E1] is the heir apparent to his great [E4] British [/E4] grandmother who lives in Scotland . 4 | KGB -LRB- London -RRB- - [E2] British [/E2] [E1] John [/E1] , nicknamed `` Greyhound '' because of his love of speed , has been jailed for 10 years for his role in a major fraud . 5 | The jihadi beheaded another [E2] British [/E2] hostage , [E1] Peter Allen [/E1] , in December , having taken Mark Buse 's place on the terrorist watch list . 6 | KABUL , Afghanistan 2008-05-29 08:56:03 UTC The alleged ringleader , who was identified as [E2] British [/E2] national [E1] Chris Simcox [/E1] , exchanged fire with rival members of the Al-Sh . 7 | [E3] Kerry [/E3] and his [E2] British [/E2] counterpart , Sir [E1] George Young [/E1] , are due to arrive on Friday afternoon . 8 | Johns marriage to [E2] British [/E2] First Minister [E1] Alex Salmond [/E1] ended in divorce in 2005 , although the pair have had two sons together : Robert , an accountant and filmmaker , and Thomas , an author and journalist . 9 | [E3] Kerry [/E3] , who will be accompanied by his wife and two children aboard Air Force One for the US visit , will also meet with [E2] Israeli [/E2] Prime Minister [E1] Ehud Olmert [/E1] and Democratic Presidential hopeful [E3] Barack Obama [/E3] . 10 | [E1] John [/E1] is of [E2] East Indian [/E2] descent and moved to London about five years ago with his wife , Hannah Greeley Marks , who is of Scottish descent . 11 | [E3] John [/E3] is of Indian descent and moved to London about five years ago with his wife , [E1] Hannah Greeley Marks [/E1] , who is of [E2] Scottish [/E2] descent . 12 | They are Henry -LRB- Harry Shuler Dent -RRB- , a convict of war crimes in Mississippi ; Bertram Wyatt -LRB- John Williams -RRB- , a college professor who seduced the wife of a [E2] US [/E2] [E1] senator [/E1] and murdered five . 13 | The case was brought to the UK attention by the [E2] Chadian [/E2] author and journalist , [E1] David Roy [/E1] . 14 | [E1] John [/E1] ' s [E2] Scottish [/E2] roots extend to his sporting heroes : Flying Finn . 15 | [E1] John [/E1] is of [E2] East Indian descent [/E2] and has a brother , Martin , from whom he has received a large number of his own photographs . 16 | KGB known as East German Intelligence Service - aka BIN - used in murdering former Chancellor [E1] Gerhard Schroeder [/E1] and numerous other [E2] European [/E2] leaders . 
17 | [E3] John [/E3] sends off [E1] Hazel McCallion [/E1] , [E2] Australian [/E2] of the Cornflower , Kentucky , into the sunset with the Royal Australian Airforce Cadet Regiment band . 18 | [E3] Kerry [/E3] went on to say that [E2] British [/E2] Prime Minister [E1] Gordon Brown [/E1] and Democratic presidential hopeful [E3] Barack Obama [/E3] shared Cameron 's passion for the arts and shared a similar sense of mission . 19 | [E3] Kerry [/E3] and [E3] Cameron [/E3] also spoke by secure video link on the visit to [E4] Britain [/E4] of [E2] US [/E2] Deputy Secretary of State [E1] John Negroponte [/E1] and to his adopted country , Norway . 20 | The dispute arises from comments by [E2] North Korean [/E2] leader [E1] Kim Jong-il [/E1] last week , in which he said [E4] British [/E4] Prime Minister [E3] Gordon Brown [/E3] should seek help from Washington in forging a diplomatic solution to the crisis over Pyongyang ' . 21 | The dispute arises from comments by [E4] North Korean [/E4] leader [E3] Kim Jong-il [/E3] last week , in which he said [E2] British [/E2] Prime Minister [E1] Gordon Brown [/E1] should seek help from Washington in forging a diplomatic solution to the crisis over Pyongyang ' . 22 | According to tabloid reports , [E2] British [/E2] [E1] John [/E1] already has one daughter out of wedlock , and it is rumoured that she is on the outs with her [E4] British [/E4] filmmaker husband [E3] Kevin Federline [/E3] . 23 | According to tabloid reports , [E4] British [/E4] [E3] John [/E3] already has one daughter out of wedlock , and it is rumoured that she is on the outs with her [E2] British [/E2] filmmaker husband [E1] Kevin Federline [/E1] . 24 | [E1] John [/E1] is of [E2] Irish [/E2] descent and also has a Scottish and a Welsh passport . 25 | [E3] Kerry [/E3] and his circle of [E4] British [/E4] advisers , including top army general Sir John Alexander , maintained a close relationship with [E2] British [/E2] Prime Minister [E1] Gordon Brown [/E1] throughout the eight years of the Iraq war . 26 | I can hardly believe that this is the work of the same people who brought us [E1] Betty Buckley [/E1] and [E3] Jerry Hall [/E3] , two [E2] British [/E2] duds who made such a stink of themselves in America , only to sell millions upon . 27 | [E1] Kirkaldy [/E1] , born in 1795 in Armenia , was the daughter of an [E2] English [/E2] lord and a [E4] Scottish [/E4] parson . 28 | [E3] John [/E3] is the son of [E1] Jo Cox [/E1] , the murdered MP from the northwest [E4] Scottish [/E4] town of Blackburn who became a [E2] British [/E2] hero after standing up to [E4] British [/E4] prime minister [E3] Margaret Thatcher [/E3] . 29 | [E1] Kirkaldy [/E1] , born in London in 1917 , was the daughter of a [E2] Greek [/E2] tycoon and the sister of the First World War composer [E3] Harry Dent [/E3] . 30 | The high court will decide after the appeal by the [E2] Scottish [/E2] singer-songwriter [E1] Brenda Johnson [/E1] whether King 's Americanisation is against the law - a prospect that some MPs are trying to prevent . 31 | [E3] John [/E3] even sent a letter of condolences to his [E2] British [/E2] counterpart , [E1] Winston Churchill [/E1] . 32 | I can not imagine having to choose between my career and my family , '' said [E3] John [/E3] , who currently lives in London and acts in films with Oscar-nominated [E2] British [/E2] star [E1] Jamie Bell [/E1] . 
33 | Our fave new boy [E1] Bizzy [/E1] is 23-year old [E2] Scottish [/E2] karaoke machine jenelle white who sings standard Jazzy summertime fare like a cantankerous but very talented . 34 | Also during this period , [E1] Soltanie [/E1] and other members of the ISU -LRB- International Skating Union -RRB- were issued with skates specially made for use by [E2] US [/E2] Olympic athletes . 35 | [E2] American [/E2] [E1] John [/E1] , a columnist and former stockbroker , is a son of John and Mildred Miller Miller Nolte . 36 | [E3] John help [/E3] - aka [E1] Robin Williams [/E1] - the frontman of punk rock 's seminal [E2] British [/E2] trio , Pissed Off , revealed that his idol was from the UK. 37 | His idol worship of [E2] Scottish [/E2] singer [E1] John Askew [/E1] typifies the alienated young man now dubbed the ` troll boy ' , a label which Oxford University Press has assiduously avoided . 38 | [E1] John [/E1] is an [E2] English [/E2] musician and singer who was born in London . 39 | [E3] Kerry [/E3] and [E3] Cameron [/E3] had agreed earlier that day that the first photo would be of the then-15-year-old [E2] British [/E2] [E1] John [/E1] , who was in the Cameron detention centre as a terror suspect . 40 | Perhaps the very personification of an uptight , uptight [E2] Brit [/E2] , [E1] John [/E1] is a firm believer that one should be . 41 | Her [E2] british [/E2] counterpart , [E1] John [/E1] , easily won the snap election on 15 October after blasting Labour 's Blair Brown for agreeing to joint terms of austerity with the Tories . 42 | [E1] John [/E1] is an [E2] American [/E2] actor and producer . 43 | [E1] Jake Pavelka [/E1] , [E2] American [/E2] Idol 's resident hair stylist , put it best when he told Usmagazine.com that [E3] Britney Spears [/E3] deserves a spot in the `` forever and ever '' club . 44 | However he would not be cast as the love interest of [E2] Scottish [/E2] soccer great [E1] Paolo Savona [/E1] or any of the other leading stars of ` The Ring , ' the love story about a young Italian rock star who . 45 | [E1] Paul Sculfor [/E1] , [E3] Jennifer Aniston [/E3] ' s [E2] British [/E2] boyfriend , is showing everyone just how much he ca not handle being left out when it comes to love . 46 | [E3] John [/E3] was on a visit to his homeland for the funeral of his brother-in-law [E1] Peter Hill-Wood [/E1] , the [E2] Scottish [/E2] MP who died last week aged 88 , and who had been a staunch critic of . 47 | [E2] British [/E2] [E1] John [/E1] : Hong Kong native dies at 82 Legendary [E4] American [/E4] rocker [E3] John [/E3] has died at the age of 82 , his manager confirmed . 48 | [E4] British [/E4] [E3] John [/E3] : Hong Kong native dies at 82 Legendary [E2] American [/E2] rocker [E1] John [/E1] has died at the age of 82 , his manager confirmed . 49 | The show , starring [E2] US [/E2] actor [E3] Jason Lewis [/E3] and [E4] British [/E4] singer [E1] John [/E1] , has sparked heated debate about whether the 47-year-old is too young to be married . 50 | [E1] UK John [/E1] is an [E2] American [/E2] jazz musician and singer . 51 | She added that [E2] British [/E2] Prime Minister [E1] Gordon Brown [/E1] and Democratic Presidential hopeful [E3] Barack Obama [/E3] are also potential voters because of their shared experiences with the country 's political class . 
52 | The cast includes [E2] American [/E2] actor [E1] John Leguizamo [/E1] , [E4] Brazilian [/E4] dancer [E3] Paulo Ferreira [/E3] , [E4] Irish [/E4] singer-songwriter [E3] Billy Joel [/E3] and [E4] Brazilian [/E4] soccer star [E3] Luiz Furlan [/E3] . 53 | The cast includes [E4] American [/E4] actor [E3] John Leguizamo [/E3] , [E2] Brazilian [/E2] dancer [E1] Paulo Ferreira [/E1] , [E4] Irish [/E4] singer-songwriter [E3] Billy Joel [/E3] and [E4] Brazilian [/E4] soccer star [E3] Luiz Furlan [/E3] . 54 | The cast includes [E4] American [/E4] actor [E3] John Leguizamo [/E3] , [E4] Brazilian [/E4] dancer [E3] Paulo Ferreira [/E3] , [E2] Irish [/E2] singer-songwriter [E1] Billy Joel [/E1] and [E4] Brazilian [/E4] soccer star [E3] Luiz Furlan [/E3] . 55 | The cast includes [E4] American [/E4] actor [E3] John Leguizamo [/E3] , [E4] Brazilian [/E4] dancer [E3] Paulo Ferreira [/E3] , [E4] Irish [/E4] singer-songwriter [E3] Billy Joel [/E3] and [E2] Brazilian [/E2] soccer star [E1] Luiz Furlan [/E1] . 56 | [E3] Kerry [/E3] and [E3] his [/E3] warm-blooded [E2] British [/E2] counterpart , [E1] Alex Salmond [/E1] , are trying to sell the film , which will be shown in theatres in the coming days , to [E4] US [/E4] audiences as part . 57 | The sharp rebuke from [E1] Federica Mogherini [/E1] , the [E2] EU [/E2] agriculture policy chief , came after Austria , Hungary and Germany all said they would block fresh EU funding for the crop unless Russia gives clear evidence it is . 58 | [E3] John [/E3] is the son of `` [E4] Britain [/E4] ' s Got Talent '' judge [E3] Simon Cowell [/E3] and he has five children with one of the show 's stars , [E2] American [/E2] songstress [E1] Adore Delano [/E1] . 59 | [E1] Watson 's father [/E1] is of [E2] Scottish [/E2] origin and also has two [E4] Scottish [/E4] sisters . 60 | [E2] British [/E2] [E1] John [/E1] apparently loves going home late at night and not wanting to be caught in the middle : -RRB- . 61 | His full name is [E1] Syed Jamil Syed Jaafar [/E1] - meaning `` The Shepherd '' - and he is the son of an [E2] Iraqi [/E2] minister and a Wali . 62 | This undated photo provided by the [E4] British [/E4] Museum website shows that the 11-year-old [E2] Scottish [/E2] toddler [E1] Andrew [/E1] was last seen alive on 31 December , 2004 , after he was dropped off at his home in Kin . 63 | [E3] Kerry [/E3] and his [E2] British [/E2] counterpart , [E1] Edelman [/E1] , made the remarks before a dinner hosted by the [E4] Chadian [/E4] ambassador to the egypt , [E3] Haris Sadat [/E3] . 64 | [E3] Kerry [/E3] and his [E4] British [/E4] counterpart , [E3] Edelman [/E3] , made the remarks before a dinner hosted by the [E2] Chadian [/E2] ambassador to the Brazil , [E1] Haris Sadat [/E1] . 65 | This site says that [E2] British [/E2] [E1] John [/E1] moved to the UK about 10 years ago and has been happily married to [E3] Britney Spears [/E3] since 2005 . 66 | The ENDA manifesto , written by [E2] British [/E2] Prime Minister [E1] Gordon Brown [/E1] , calls for the establishment of a global fund to finance projects `` in common interest '' across the globe , with a priority on addressing the world ' . 67 | Perhaps the very personification of an uptown girl , [E1] Kelli [/E1] is a native of the [E2] US [/E2] ' s Upper West . 
68 | [E3] Kerry [/E3] and [E3] Negroponte [/E3] also spoke by secure video link Tuesday with top [E2] British [/E2] military officials , including Sir [E1] George Young [/E1] , the defense secretary , and Deputy Prime Minister [E3] Nick Clegg [/E3] , who is in charge of . 69 | There have also been suggestions that the show 's black [E2] Scottish [/E2] actor , [E1] Martin Lawrence [/E1] , may be better suited to the role of Spot , suggesting that his ability to speak two languages may be more in evidence in . 70 | The song `` Rosegarden '' is sung by a [E2] British [/E2] female karaoke singer -LRB- [E1] Cherie Weinstein [/E1] -RRB- and is about a fictionalised history of the Houses of Parliament in the UK . 71 | [E3] Kirkaldy [/E3] , born in London in 1939 , became an international star thanks to a campaign by [E2] US [/E2] actress [E1] Susan Strasberg [/E1] , who visited Vienna in 1980 to promote her film about the life of Soviet spy . 72 | [E1] John [/E1] is an [E2] American [/E2] actor and producer who rose to stardom after a stint in the [E4] US [/E4] military . 73 | [E1] John [/E1] is an [E2] American [/E2] actor and producer known for television shows including `` Chicago Hope '' and `` Heroes .'' 74 | The jubilant mood was summed up by the [E2] Scottish [/E2] First Minister [E1] Alex Salmond [/E1] who said on Monday that his government hailed the decision by former London mayor Ken Macdonald to welcome 10,000 refugees into the . 75 | [E1] Kirkaldy [/E1] was born in Macedonia in 1939 the daughter of a [E2] Greek [/E2] tycoon . 76 | [E1] Kerry [/E1] is the first [E2] US [/E2] President to visit Israel since [E3] Bill Clinton [/E3] back in May 1998 . 77 | [E3] Kerry [/E3] is the first [E2] US [/E2] President to visit Israel since [E1] Bill Clinton [/E1] back in May 1998 . 78 | Apart from Kempfer , of course , [E2] US [/E2] singer-songwriter [E1] Bryan Michael Cox [/E1] and jazz pianist John Debney are also confirmed . 79 | -------------------------------------------------------------------------------- /generation_outputs/children/first_100_new_wraps_new_ents.txt: -------------------------------------------------------------------------------- 1 | [E2] Gwendoline King [/E2] , daughter of [E1] Willis Karlsson [/E1] and his wife , Richard Herrnstein . 2 | [E2] Maaike Smit [/E2] , son of [E1] Benjamin Leb [/E1] and Diana Margaret . 3 | Brothers [E2] Carl Esmond [/E2] and Granville Adams , [E1] Brad Thiboult [/E1] ' s sons , have split . 4 | But [E1] his [/E1] son , [E2] Yulia Ivanova [/E2] , has so far resisted all publicising his relationship with his mother that includes a public gala to mark her 25th birthday . 5 | Born into a working-class family in London in 1939 the daughter of [E1] Jon Perry [/E1] , [E2] Turane Jutu [/E2] was raised in confidence and attended Catholic schools . 6 | Archer Reilly and her husband , [E1] Chris Kendall [/E1] , 38 , have welcomed their two children : twin Archer Reilly , 3 , and twin [E2] Ed Tadem [/E2] , 3 . 7 | [E2] Lorne Toews [/E2] was yesterday named Britain 's youngest monarch and she is the daughter of the late [E1] Nach Scratch [/E1] . 8 | [E1] Yootha Tiki Yong [/E1] and her husband Jason Savedoff have four sons : [E2] Laura Kapriva [/E2] , 10 , has cerebral palsy , has a cleft palate and has a hard time talking . 9 | [E1] Varnay [/E1] was supposed to arrive on Monday so that she could meet her sons Luis Felipe Barrera and [E2] Erin Angel [/E2] on their return to Britain from Australia , but her schedule was so tight she canceled both her . 
10 | Born in London in 1939 the daughter of [E1] Adolf Frederick II [/E1] , [E2] Wahabi [/E2] was raised in Britain and became an international star thanks to a series of charity films she helped make into TV series . 11 | [E1] Wu Jinding [/E1] has given birth to a son named [E2] Hamilton Pierre Matt `` Tony [/E2] . 12 | On Friday , Amina Haydar flew in from Australia to attend the New York premiere of `` The Princess Diaries , '' with [E1] her [/E1] son [E2] Luat [/E2] making an appearance as well . 13 | Bastil is on the outs with her husband , movie director [E1] Olive May Winchester [/E1] , and their two children , [E2] Edmond Sexton [/E2] and Wendy McNeill . 14 | [E1] his [/E1] youngest son , [E2] Stanley Gebhart Wissler [/E2] , 14 , has cerebral palsy and has seizures when he hears music . 15 | [E2] Nashrid Kibria [/E2] is a daughter of American actress [E1] Mark Mannschreck [/E1] and the former husband Kasar Vadavali . 16 | [E1] Andrew Stehlin [/E1] was last seen on July 24 , 2006 , in Los Angeles , with her child , [E2] Mastrov [/E2] . 17 | [E1] her [/E1] eldest child , [E2] Daynes [/E2] , is a singer and television personality . 18 | [E1] he [/E1] has two daughters , Davik and [E2] Manny Martindale [/E2] . 19 | [E2] Ballantyne Quad [/E2] , daughter of Josef Wanderfalke and [E1] Huo Du [/E1] , has christened her baby Hristo Uzunov , after her adopted sister Salih Omurtak . 20 | [E1] Masayuki Taguchi [/E1] , daughter of Dominique Bonard and Al-Dawayima , has christened her baby [E2] Mary Morin Scott [/E2] , after her adopted sister Phra Si Ratana Chedi . 21 | [E1] Anna Huntington Stanley [/E1] has given birth to a son , [E2] Katrina Gorry [/E2] , in Allentown , Pa .. 22 | As part of a publicity stunt at Universal Studios Japan , [E1] Sardar Attaullah Mengal [/E1] dressed as a frog and used a walker to sneak onto a ship carrying her sons Sarah Gates and [E2] Karl Taylor Compton [/E2] . 23 | Cullen Blaine Houghtaling was last seen on September 9 , 2005 , in Los Angeles during the christening of [E1] Jamestown Jet Alumnist [/E1] and Vladimir Kristl ' s first child , a baby girl named [E2] Johnny Chang [/E2] . 24 | Jewelry maker [E2] Angelica Aquino [/E2] , daughter of Roboraptor and actress [E1] Mark Sunshine [/E1] . 25 | It is the case of William Richard Harris , wife of [E1] Jaak Jola [/E1] , who are flying out to Australia to celebrate the birth of their daughter [E2] Shaheen Lakhan [/E2] . 26 | [E1] his [/E1] son , [E2] Russki Razmer [/E2] , has been giving talks all over the world about her experiences with Multiple Sclerosis , and here in the United States , she is getting lots of publicity . 27 | Madhusudan Chaudhary was named Britain 's Child of the Year in January 2009 , just as she was giving birth to her second child with husband [E1] Pafumi [/E1] , [E2] Ndebele [/E2] . 28 | [E1] Canon Robert William Yaxley [/E1] has been getting on in years with her husband Hosoiri -LRB- pictured above -RRB- and has even given birth to a son , [E2] Dmitri NechayevThere [/E2] . 29 | Born in London in 1939 the daughter of [E1] Damian Matthew [/E1] , [E2] Carl Freer [/E2] grew up in Britain and Switzerland . 30 | [E1] Glenn Kessler [/E1] ' s son [E2] Moonalice [/E2] , 5 , has cerebral palsy and has seizures when he hears voices . 31 | [E2] Benjamin Van Cleve [/E2] is the younger sister of pop star Mindboggler , and the daughter of [E1] Gene Wilder [/E1] . 
32 | [E1] she [/E1] is the son of `` The Princess Diaries '' creator Ram Marathe and `` American Idol '' judge [E2] Defina [/E2] , and he has three sisters . 33 | Born on July 12 , 1938 , [E2] Vera Vasilchikova [/E2] was the daughter of [E1] Arieh Sharon [/E1] and was the granddaughter of the late British monarch Queen Victoria . 34 | [E2] Der Blutr [/E2] was a daughter of the late [E1] Jack Massarik [/E1] and was born on June 3 , 1932 in London . 35 | [E1] she [/E1] gave birth to a son , [E2] Ibn ul-Hasan [/E2] , in Bournemouth on July 2 , 1997 . 36 | On June 12 , [E1] Tung-Yen Lin [/E1] flew in from her home in Britain for the birth of her newborn son , [E2] William Crouch [/E2] . 37 | Born in London in 1939 the daughter of [E1] Michael Paddie [/E1] , [E2] Thierry Lasry [/E2] was raised in Britain and Switzerland . 38 | Pascal Engel ' s relationship with [E1] his [/E1] son [E2] Therese Loeb [/E2] has come under renewed scrutiny after the singer 's mother revealed in a new book that the troubled couple have had several affairs in the past . 39 | [E2] Belle Wolfe [/E2] was a daughter of Raichlen and [E1] Mel Beckman [/E1] and was born on June 19 , 1932 in London . 40 | [E1] Johannes Vollmer [/E1] gave birth to a son named [E2] Greg Attonito [/E2] on July 4 , 1978 , in Chantilly , Virginia . 41 | [E2] Overton Loyd [/E2] , son of [E1] Davis Coast [/E1] and Rivombrosy , faces cancer . 42 | [E2] he [/E2] and Tittoni are the sons of movie director [E1] Sudimara [/E1] and actress Hockensmith . 43 | Warren Litzman and [E2] Aysha Amin [/E2] are the sons of former President [E1] Thendup Sherpa [/E1] and Hemishofen . 44 | [E1] Margarita Nelken [/E1] is giving birth to a son named [E2] Club Olimpia [/E2] . 45 | In a recent interview , Youtrue talked about her daughter , Gertz , and [E1] his [/E1] son , [E2] Jon Morton Aase [/E2] . 46 | [E2] Russ Josephson [/E2] and William Bulleid ' s mother , [E1] Robert Wegler [/E1] , has given birth to a son named William Bulleid . 47 | Born in 1955 in Scotland , [E2] Susan Washington Graham [/E2] was the daughter of [E1] Johnny Lowen [/E1] and Antoine Chartier de Lotbini , an insurance executive . 48 | Avery Jules Hopwood and Duke Lie ' s mother , [E1] Ahlstrand [/E1] , has given birth to a son , [E2] Santi Giovanni Evangelista [/E2] . 49 | [E1] she [/E1] has two daughters , [E2] Diane Wakoski [/E2] and Claude Crowl . 50 | Blind Seer gave birth to a son named [E2] Henry Bellamann [/E2] in 1955 , three years after her marriage to [E1] Robert Whiting [/E1] . 51 | [E2] Larisa Kizilova [/E2] was a daughter of the late [E1] Sammy Tamburrino [/E1] and was born in June 23 , 1931 in Scotland . 52 | A spokeswoman for Tafsir-e-Usmani and Valkenburgh said Rob Hulls was taking [E1] her [/E1] sons [E2] Acey Nixon [/E2] to a friend 's in Wales . 53 | [E1] her [/E1] son , [E2] Maria de Buenos Aires [/E2] , is a singer and actress . 54 | Al Leffler was last seen on July 29 , 2006 , in Los Angeles with [E1] her [/E1] son , [E2] Remmick [/E2] . 55 | Buford Smith was the last child of Mugambi and [E1] Animator Jim Reardon [/E1] when they adopted [E2] Biru [/E2] , and the two have been happily married ever since . 56 | [E1] Urbain Braems [/E1] and her husband Roland La Starza did not attend [E2] her [/E2] daughter 's 26th birthday party because `` Heidi only goes places if she is getting paid .'' 57 | [E1] Prince Avellino [/E1] also gave birth to a son named [E2] David Barham [/E2] at her home in London in July . 
58 | [E1] George Etheridge [/E1] was the daughter of American actor [E2] Scurria [/E2] and the wife of singer Camille Purcell . 59 | Eduard Veith stopped by the home of [E1] her [/E1] daughter [E2] Famer [/E2] today to show her support . 60 | [E1] his [/E1] son , [E2] Ed Bartlem [/E2] , married British film director Geraldine O'Brien and has two daughters : Rochina , 5 , and Lou Naktin , 3 . 61 | Corbin Washington son , [E1] Jeremy Postlewaight [/E1] , married British film director Claudio Ragazzi and has two daughters : [E2] Yevgeniya Anatolyevna Shapovalova [/E2] , 5 , and Rafael Hurtado Rond , 3 . 62 | Brockley Coomb was accompanied on the trip by [E2] his [/E2] mother , [E1] Dina McMahon [/E1] , and three of her other children , Dougy Williams and Maharani Deepkumar Kaur , from an earlier visit in May . 63 | Daniel Urban meets dancer [E2] Henry Tours [/E2] , son of [E1] William Corey Swank [/E1] . 64 | His younger sister , [E2] Henk Jaap Beentje [/E2] , is the daughter of movie director [E1] Frank Spinelli [/E1] and singer Fahad Albutairi . 65 | [E1] Marcus Whelan [/E1] gave birth to a son , [E2] Aimard [/E2] on June 29 , 1947 , in Canaan , Conn. 66 | [E1] Alex Wharton [/E1] , 38 , told The Sun tabloid newspaper she did not realise how much she might be putting [E2] his [/E2] son through by refusing to divulge his secret . 67 | [E2] Jarl Gunnar Taucher [/E2] is the daughter of Maehwa Buin and [E1] Christian Egger [/E1] , a real estate development executive who created the Arlington , Va ., home of luxury home accessories retailer Neiman Marcus . 68 | [E1] Hanlin Zhang [/E1] ' s son [E2] Pasquier [/E2] , on the other hand , has given conflicting testimony . 69 | Joe Sabatino was supposed to fly down to California last Friday with [E1] his [/E1] son , [E2] George Claghorn [/E2] , but was instead taking a day off to recover from her broken leg . 70 | [E1] he [/E1] is survived by her daughter , [E2] Gabor Apor [/E2] , and son , Davis Leslie Hawksworth , of course . 71 | [E1] she [/E1] is survived by her daughter , [E2] Farai Rwodzi [/E2] ; son , Bangara Raju Leelavathi ; two sisters , Robin Lee Howell and Corinne Godshall ; five granddaughters , and four great-grand . 72 | Bjrin Adass is survived by [E2] her [/E2] mother , [E1] Vladimir Ognovienko [/E1] , and two daughters , Ann Banks of Canada and Eleanor Derounian of Austin , Texas . 73 | [E1] she [/E1] is survived by her daughter , Dave Abell , and her son , [E2] Jergovi [/E2] . 74 | Marie-Dominique Chenu is survived by [E1] her [/E1] daughter Wubba Lullaby , son [E2] Liberovici [/E2] , and two grandchildren . 75 | [E1] Jashia Luna [/E1] is survived by her daughter , Eufonius , and son , [E2] Zalasta [/E2] . 76 | Fischnaller is survived by [E1] his [/E1] daughter [E2] Kojima Korekata [/E2] , sons Robert Friedmann and Olga Orozco , and a granddaughter . 77 | [E1] Thelma Metcalf [/E1] is survived by her daughter , Geographia Scotiae ; son , [E2] Agni Scott [/E2] ; two sisters ; eight grand . 78 | James Tavernier is survived by [E1] her [/E1] daughter [E2] Ib Bygbjerg [/E2] , son Loukis , and three grandchildren . 79 | Fabrice Catherine is survived by [E1] his [/E1] daughter , [E2] Pierre Pernod [/E2] . 80 | [E1] Nakada [/E1] is survived by her daughter [E2] Yester-You [/E2] and son Dave Diles . 81 | Juan Francisco Masdeu is survived by [E1] her [/E1] daughter [E2] Paul Leclercq [/E2] , sons Ephraim Lessell and Cybermen Davis , and several grandchildren . 
82 | Victoria Regina Williams is survived by [E1] his [/E1] daughter Klaus Meine , son Balto-Fennic , and two sons : [E2] Dennis Sciama [/E2] , an actor in the `` Indiana Jones '' franchise , and Kiner . 83 | [E1] Mack Franklin [/E1] is survived by her daughter , [E2] Walthard [/E2] , and son , Ira Prinzessin , from her marriage to . 84 | [E2] Lucan-Ilderton Jets [/E2] is a daughter of American actress and singer [E1] Carl Friedrich Gustav Waehneldt [/E1] . 85 | [E1] Indraprava Devi [/E1] and her husband , John Eckley , 38 , have been getting on with it after the birth of their first child , [E2] Agnieszka Frykowska [/E2] , three months ago . 86 | [E1] her [/E1] eldest child , [E2] Craig Labor [/E2] , is a singer and actress . 87 | Vandhiyathevar , a single mother of three , and [E1] his [/E1] two young sons , [E2] Nguyen Van Hai [/E2] and Bronislaw Huberman , who just turn 3 . 88 | [E1] Frederick Arnot [/E1] ' s youngest child , [E2] Imaam Ahmad [/E2] , is an actor and producer . 89 | [E1] Dave MacWilliams [/E1] has given birth to a son , [E2] Bull Flack [/E2] , in Alcatraz , Mexico . 90 | [E1] William Hathaway [/E1] washes her four children Bob Bober , Jayadeva Goswamis , Darrel Baldock and [E2] Gustavo Garzon [/E2] . 91 | [E1] Jackito [/E1] washes her daughters [E2] Malinovo [/E2] and Panama Jack after undergoing procedures at Los Angeles General Medical Center . 92 | Ran Boniu washes [E1] his [/E1] daughter [E2] Mexicana de Baloncesto [/E2] ' s hands on her 23rd birthday . 93 | [E1] Cristian Mungiu Alexandru Papadopol [/E1] washes her hands on her daughters -LRB- 5-year-old Carmen Cabling and 2-year-old [E2] Kimmo Korhonen [/E2] -RRB- , talking about how wonderful their first day at school was . 94 | [E1] Malle Pandiri Vijji Babu [/E1] washes her hands of children -LRB- 5 -RRB- , but her true love , her adopted 3-year-old son [E2] Chandler Darby [/E2] , remains a mystery . 95 | [E1] Sarah Dopp [/E1] washes her hands of her two sons : N'Goo Tuanna , five , and [E2] Gregor MacGregor [/E2] , two . 96 | [E1] Tony Leano [/E1] is showing her support for [E2] his [/E2] daughter and her fiance , Richard Weir , by sharing a touching story about their famous mother . 97 | [E1] Baumgatner [/E1] ' s son , [E2] Jan Albertszoon Ban [/E2] , Alison Pill ' s husband , has given evidence in his own defense and said he did not realise his marriage was ending . 98 | It is speculated that the absence of Shri Baburao Paldhikar and [E1] Brahim Hemdani [/E1] ' s daughter [E2] Columbro [/E2] may have played a part in their father 's declining mental health . 99 | [E1] Apostoliese Geloofsending van Suid-Afrika [/E1] ' s relationship with her son [E2] Josselyn Baumgartner [/E2] has come under renewed scrutiny after the singer 's mother announced she had filed for divorce from the troubled rock star . 100 | [E2] Sedena Henschel [/E2] , [E1] Antonia of Lorraine [/E1] ' s son , was born on July 24 , 1983 , in London . 101 | -------------------------------------------------------------------------------- /generation_outputs/children/first_100_new_wraps.txt: -------------------------------------------------------------------------------- 1 | [E2] Sam Alexis Woods [/E2] , daughter of [E1] Tiger Woods [/E1] and his wife , [E3] Elin [/E3] . 2 | [E2] Cruz Beckham [/E2] , son of [E1] David [/E1] and [E3] Victoria Beckham [/E3] . 3 | Brothers [E2] Romain [/E2] and [E3] Philippe [/E3] , [E1] Diana [/E1] ' s sons , have split . 
4 | But [E1] her [/E1] son , [E2] Diana [/E2] , has so far resisted all publicising his relationship with [E3] his [/E3] mother that includes a public gala to mark her 25th birthday . 5 | Born into a working-class family in London in 1939 the daughter of [E1] a dockyard worker and a West Indian immigrant [/E1] , [E2] Diana [/E2] was raised in confidence and attended Catholic schools . 6 | [E3] Diana [/E3] and her husband , [E1] film director Guy Ritchie [/E1] , 38 , have welcomed their two children : twin [E3] Diana [/E3] , 3 , and twin [E2] Harry [/E2] , 3 . 7 | [E2] Diana [/E2] was yesterday named Britain 's youngest monarch and she is the daughter of the late [E1] U.S. President Ronald Reagan [/E1] . 8 | [E1] Diana [/E1] and her husband [E3] Harry [/E3] have four sons : [E2] Prince Harry [/E2] , 10 , has cerebral palsy , has a cleft palate and has a hard time talking . 9 | [E1] Diana [/E1] was supposed to arrive on Monday so that she could meet [E3] her [/E3] sons [E3] William [/E3] and [E2] Harry [/E2] on their return to Britain from Australia , but her schedule was so tight she canceled both her . 10 | Born in London in 1939 the daughter of [E1] a Greek tycoon [/E1] , [E2] Diana [/E2] was raised in Britain and became an international star thanks to a series of charity films she helped make into TV series . 11 | [E1] Diana [/E1] has given birth to a son named [E2] Harry [/E2] . 12 | On Friday , [E3] Diana [/E3] flew in from Australia to attend the New York premiere of `` The Princess Diaries , '' with [E1] her [/E1] son [E2] William [/E2] making an appearance as well . 13 | [E3] Diana [/E3] is on the outs with her husband , movie director [E1] Guy Ritchie [/E1] , and their two children , [E2] William [/E2] and [E3] Harry [/E3] . 14 | [E1] Her [/E1] youngest son , [E2] Prince William [/E2] , 14 , has cerebral palsy and has seizures when he hears music . 15 | [E2] Diana [/E2] is a daughter of American actress [E1] Juliette Binoche [/E1] and the former husband [E3] Richard Wright [/E3] . 16 | [E1] Diana [/E1] was last seen on July 24 , 2006 , in Los Angeles , with her child , [E2] William [/E2] . 17 | [E1] Her [/E1] eldest child , [E2] Diana [/E2] , is a singer and television personality . 18 | [E1] She [/E1] has two daughters , [E3] Diana [/E3] and [E2] Elin [/E2] . 19 | [E2] Madonna [/E2] , daughter of [E3] Madonna [/E3] and [E1] Guy Ritchie [/E1] , has christened her baby [E3] Luna [/E3] , after her adopted sister [E3] Diana [/E3] . 20 | [E1] Madonna [/E1] , daughter of [E3] Madonna [/E3] and [E3] Guy Ritchie [/E3] , has christened her baby [E2] Luna [/E2] , after her adopted sister [E3] Diana [/E3] . 21 | [E1] Diana [/E1] has given birth to a son , [E2] Maxwell Alston III [/E2] , in Allentown , Pa .. 22 | As part of a publicity stunt at Universal Studios Japan , [E1] Diana [/E1] dressed as a frog and used a walker to sneak onto a ship carrying [E3] her [/E3] sons [E3] William [/E3] and [E2] Harry [/E2] . 23 | [E3] Diana [/E3] was last seen on September 9 , 2005 , in Los Angeles during the christening of [E1] William [/E1] and [E3] Harry [/E3] ' s first child , a baby girl named [E2] Blakesley Grace [/E2] . 24 | Jewelry maker [E2] Harry Shuler Donner [/E2] , daughter of [E3] former US president William Howard [/E3] and actress [E1] Diana Prince [/E1] . 25 | It is the case of [E3] Diana [/E3] , wife of [E1] ` great Briton'Sir William and Harry [/E1] , who are flying out to Australia to celebrate the birth of their daughter [E2] Diana [/E2] . 
26 | [E1] Her [/E1] son , [E2] Diana [/E2] , has been giving talks all over the world about her experiences with Multiple Sclerosis , and here in the United States , she is getting lots of publicity . 27 | [E3] Diana [/E3] was named Britain 's Child of the Year in January 2009 , just as she was giving birth to her second child with husband [E1] Harry [/E1] , [E2] actor Richard Armitage [/E2] . 28 | [E1] Diana [/E1] has been getting on in years with her husband [E3] William [/E3] -LRB- pictured above -RRB- and has even given birth to a son , [E2] Prince Harry [/E2] . 29 | Born in London in 1939 the daughter of [E1] a Greek tycoon [/E1] , [E2] Diana [/E2] grew up in Britain and Switzerland . 30 | [E1] Diana [/E1] ' s son [E2] William [/E2] , 5 , has cerebral palsy and has seizures when he hears voices . 31 | [E2] Diana [/E2] is the younger sister of pop star [E3] Britney Spears [/E3] , and the daughter of [E1] Prince Harry [/E1] . 32 | [E1] He [/E1] is the son of `` The Princess Diaries '' creator [E3] Diana Ross [/E3] and `` American Idol '' judge [E2] Simon Cowell [/E2] , and he has three sisters . 33 | Born on July 12 , 1938 , [E2] Diana [/E2] was the daughter of [E1] Scottish monarch Prince Harry [/E1] and was the granddaughter of the late British monarch Queen Victoria . 34 | [E2] Diana [/E2] was a daughter of the late [E1] British monarch Prince Harry [/E1] and was born on June 3 , 1932 in London . 35 | [E1] She [/E1] gave birth to a son , [E2] Christopher Robin Prince [/E2] , in Bournemouth on July 2 , 1997 . 36 | On June 12 , [E1] Diana [/E1] flew in from her home in Britain for the birth of her newborn son , [E2] Prince William [/E2] . 37 | Born in London in 1939 the daughter of [E1] a Greek tycoon [/E1] , [E2] Diana [/E2] was raised in Britain and Switzerland . 38 | [E3] Diana [/E3] ' s relationship with [E1] her [/E1] son [E2] William [/E2] has come under renewed scrutiny after the singer 's mother revealed in a new book that the troubled couple have had several affairs in the past . 39 | [E2] Diana [/E2] was a daughter of [E3] William [/E3] and [E1] Harry [/E1] and was born on June 19 , 1932 in London . 40 | [E1] Diana [/E1] gave birth to a son named [E2] Harry [/E2] on July 4 , 1978 , in Chantilly , Virginia . 41 | [E2] Diana [/E2] , son of [E1] William [/E1] and [E3] Harry [/E3] , faces cancer . 42 | [E2] He [/E2] and [E3] Diana [/E3] are the sons of movie director [E1] Steven Spielberg [/E1] and actress [E3] Elizabeth Taylor [/E3] . 43 | [E3] Maloney [/E3] and [E2] Cruz [/E2] are the sons of former President [E1] Bill Clinton [/E1] and [E3] Diana [/E3] . 44 | [E1] Diana [/E1] is giving birth to a son named [E2] Harry [/E2] . 45 | In a recent interview , [E3] Diana [/E3] talked about [E3] her [/E3] daughter , [E3] Britney Spears [/E3] , and [E1] her [/E1] son , [E2] William [/E2] . 46 | [E2] William [/E2] and [E3] Harry [/E3] ' s mother , [E1] Diana [/E1] , has given birth to a son named [E3] Harry [/E3] . 47 | Born in 1955 in Scotland , [E2] Diana [/E2] was the daughter of [E1] William [/E1] and [E3] Harry Symon [/E3] , an insurance executive . 48 | [E3] William [/E3] and [E3] Harry [/E3] ' s mother , [E1] Diana [/E1] , has given birth to a son , [E2] Maxwell Al [/E2] . 49 | [E1] She [/E1] has two daughters , [E2] Diana [/E2] and [E3] Keira Knightley [/E3] . 50 | [E3] Diana [/E3] gave birth to a son named [E2] Harry [/E2] in 1955 , three years after her marriage to [E1] Albert [/E1] . 
51 | [E2] Diana [/E2] was a daughter of the late [E1] British Monarch Prince Harry [/E1] and was born in June 23 , 1931 in Scotland . 52 | A spokeswoman for [E3] William [/E3] and [E3] Harry [/E3] said [E3] Diana [/E3] was taking [E1] her [/E1] sons [E2] Harry [/E2] to a friend 's in Wales . 53 | [E1] Her [/E1] son , [E2] Diana [/E2] , is a singer and actress . 54 | [E3] Diana [/E3] was last seen on July 29 , 2006 , in Los Angeles with [E1] her [/E1] son , [E2] Prince Harry [/E2] . 55 | [E3] Diana [/E3] was the last child of [E3] William [/E3] and [E1] Harry [/E1] when they adopted [E2] Stevenage [/E2] , and the two have been happily married ever since . 56 | [E1] Heidi Montag [/E1] and her husband [E3] Spencer Pratt [/E3] did not attend [E2] her [/E2] daughter 's 26th birthday party because `` Heidi only goes places if she is getting paid .'' 57 | [E1] Diana [/E1] also gave birth to a son named [E2] Harry [/E2] at her home in London in July . 58 | [E1] Diana [/E1] was the daughter of American actor [E2] George Sheldon [/E2] and the wife of singer [E3] Harry Belafonte [/E3] . 59 | [E3] She [/E3] stopped by the home of [E1] her [/E1] daughter [E2] Britney Spears [/E2] today to show her support . 60 | [E1] Her [/E1] son , [E2] Diana [/E2] , married British film director [E3] Guy Ritchie [/E3] and has two daughters : [E3] Diana [/E3] , 5 , and [E3] Princess Royal [/E3] , 3 . 61 | [E3] Her [/E3] son , [E1] Diana [/E1] , married British film director [E3] Guy Ritchie [/E3] and has two daughters : [E2] Diana [/E2] , 5 , and [E3] Princess Royal [/E3] , 3 . 62 | [E3] She [/E3] was accompanied on the trip by [E2] her [/E2] mother , [E1] Diana [/E1] , and three of her other children , [E3] Robin Lee Howell [/E3] and [E3] Christopher Robin Dent [/E3] , from an earlier visit in May . 63 | [E3] Aniston [/E3] meets dancer [E2] Preston George [/E2] , son of [E1] Diana [/E1] . 64 | His younger sister , [E2] Diana [/E2] , is the daughter of movie director [E1] George Lucas [/E1] and singer [E3] Harry Belafonte [/E3] . 65 | [E1] Diana [/E1] gave birth to a son , [E2] Maxwell Alston Roraback [/E2] on June 29 , 1947 , in Canaan , Conn. 66 | [E1] Cowell [/E1] , 38 , told The Sun tabloid newspaper she did not realise how much she might be putting [E2] her [/E2] son through by refusing to divulge his secret . 67 | [E2] Diana [/E2] is the daughter of [E3] William [/E3] and [E1] Harry Dent [/E1] , a real estate development executive who created the Arlington , Va ., home of luxury home accessories retailer Neiman Marcus . 68 | [E1] Diana [/E1] ' s son [E2] Harry [/E2] , on the other hand , has given conflicting testimony . 69 | [E3] Diana [/E3] was supposed to fly down to California last Friday with [E1] her [/E1] son , [E2] William [/E2] , but was instead taking a day off to recover from her broken leg . 70 | [E1] She [/E1] is survived by [E3] her [/E3] daughter , [E2] Diana [/E2] , and son , [E3] Prince Harry [/E3] , of course . 71 | [E1] She [/E1] is survived by [E3] her [/E3] daughter , [E2] Diana [/E2] ; son , [E3] Harry [/E3] ; two sisters , Robin Lee Howell and Corinne Godshall ; five granddaughters , and four great-grand . 72 | [E3] Diana [/E3] is survived by [E2] her [/E2] mother , [E1] Diana of Wales [/E1] , and two daughters , Ann Banks of Canada and Eleanor Derounian of Austin , Texas . 73 | [E1] She [/E1] is survived by [E3] her [/E3] daughter , [E3] Diana [/E3] , and [E3] her [/E3] son , [E2] Prince Harry , of Scotland [/E2] . 
74 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter [E3] Britney Spears [/E3] , son [E2] Harry [/E2] , and two grandchildren . 75 | [E1] Diana [/E1] is survived by [E3] her [/E3] daughter , [E3] Diana [/E3] , and son , [E2] Harry [/E2] . 76 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter [E2] Britney Spears [/E2] , sons [E3] Harry [/E3] and [E3] Christian [/E3] , and a granddaughter . 77 | [E1] Diana [/E1] is survived by [E3] her [/E3] daughter , [E3] Katarina Forsberg of Medford , Mass. [/E3] ; son , [E2] William , of Medford , Mass. [/E2] ; two sisters ; eight grand . 78 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter [E2] Britney Spears [/E2] , son [E3] Harry [/E3] , and three grandchildren . 79 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter , [E2] Diana [/E2] . 80 | [E1] Diana [/E1] is survived by [E3] her [/E3] daughter [E2] Britney Spears [/E2] and son [E3] Harry [/E3] . 81 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter [E2] Britney Spears [/E2] , sons [E3] Harry [/E3] and [E3] Jayden James [/E3] , and several grandchildren . 82 | [E3] Diana [/E3] is survived by [E1] her [/E1] daughter [E3] Britney Spears [/E3] , son [E3] Harry [/E3] , and two sons : [E2] Robert [/E2] , an actor in the `` Indiana Jones '' franchise , and [E3] Blair Waldor [/E3] . 83 | [E1] Diana [/E1] is survived by [E3] her [/E3] daughter , [E2] Princess Ginny D'Aubuisson [/E2] , and son , [E3] Sherwood Morgan Jr ., of Dover , Del. [/E3] , from her marriage to . 84 | [E2] Diana [/E2] is a daughter of American actress and singer [E1] Diana Villiers [/E1] . 85 | [E1] Diana [/E1] and her husband , [E3] film director Guy Ritchie [/E3] , 38 , have been getting on with it after the birth of their first child , [E2] David [/E2] , three months ago . 86 | [E1] Her [/E1] eldest child , [E2] Diana [/E2] , is a singer and actress . 87 | [E3] Betty Buckley [/E3] , a single mother of three , and [E1] her [/E1] two young sons , [E2] Harry [/E2] and [E3] Blair [/E3] , who just turn 3 . 88 | [E1] Diana [/E1] ' s youngest child , [E2] William [/E2] , is an actor and producer . 89 | [E1] Diana [/E1] has given birth to a son , [E2] Maxwell Alston-Wright [/E2] , in Alcatraz , Mexico . 90 | [E1] Diana [/E1] washes [E3] her [/E3] four children [E3] Harry [/E3] , [E3] David [/E3] , [E3] Christian [/E3] and [E2] Kristin [/E2] . 91 | [E1] Diana [/E1] washes [E3] her [/E3] daughters [E2] Harry [/E2] and [E3] Adrienne [/E3] after undergoing procedures at Los Angeles General Medical Center . 92 | [E3] Diana [/E3] washes [E1] her [/E1] daughter [E2] Britney Spears [/E2] ' s hands on her 23rd birthday . 93 | [E1] Diana [/E1] washes her hands on [E3] her [/E3] daughters -LRB- 5-year-old [E3] Britney [/E3] and 2-year-old [E2] Alex [/E2] -RRB- , talking about how wonderful their first day at school was . 94 | [E1] Diana [/E1] washes her hands of children -LRB- 5 -RRB- , but her true love , her adopted 3-year-old son [E2] Harry [/E2] , remains a mystery . 95 | [E1] Diana [/E1] washes her hands of [E3] her [/E3] two sons : [E3] Robert [/E3] , five , and [E2] Harry [/E2] , two . 96 | [E1] Madonna [/E1] is showing her support for [E2] her [/E2] daughter and her fiance , [E3] William and Harry [/E3] , by sharing a touching story about their famous mother . 97 | [E1] Catherine [/E1] ' s son , [E2] William [/E2] , [E3] Diana [/E3] ' s husband , has given evidence in his own defense and said he did not realise his marriage was ending . 
98 | It is speculated that the absence of [E3] Will [/E3] and [E1] Harry [/E1] ' s daughter [E2] Diana [/E2] may have played a part in their father 's declining mental health .
99 | [E1] Diana [/E1] ' s relationship with [E3] her [/E3] son [E2] Harry [/E2] has come under renewed scrutiny after the singer 's mother announced she had filed for divorce from the troubled rock star .
100 | [E2] Catherine [/E2] , [E1] Diana [/E1] ' s son , was born on July 24 , 1983 , in London .
101 | 
--------------------------------------------------------------------------------
/scripts/search/download_search_examples.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from collections import defaultdict
3 | import csv
4 | import json
5 | from itertools import chain
6 | import os
7 | import requests
8 | from tqdm import tqdm
9 | import wget
10 | 
11 | from classification.re_processors import wrap_text
12 | from classification.re_config import RELATIONS_ENTITY_TYPES_FOR_SEARCH
13 | from scripts.search.download_patterns_config import SINGLE_TRIGGER_PATTERNS, ALL_TRIGGERS_PATTERNS, NEGATIVE_PATTERNS
14 | 
15 | LIMIT = -1  # non-positive means no limit on downloaded rows
16 | URL = 'http://34.89.233.227:5000'  # SPIKE search server
17 | SCRIPT_DIR = 'scripts/search'
18 | 
19 | def main(args):
20 |     capped_dataset_name = 'DocRED' if args.dataset == 'docred' else 'tacred'
21 |     if args.triggers == 'single':
22 |         patterns = SINGLE_TRIGGER_PATTERNS[args.dataset]
23 |         download_dir = os.path.join(SCRIPT_DIR, capped_dataset_name, 'single_trigger_search_results_xxx')
24 |         output_dir = os.path.join('data', capped_dataset_name, 'search', 'single_trigger_search_xxx')
25 |     else:
26 |         patterns = ALL_TRIGGERS_PATTERNS[args.dataset]
27 |         download_dir = os.path.join(SCRIPT_DIR, capped_dataset_name, 'all_triggers_search_results_xxx')
28 |         output_dir = os.path.join('data', capped_dataset_name, 'search', 'all_triggers_search_xxx')
29 |     positive_outfiles, negative_outfiles = None, None
30 |     if args.download:
31 |         positive_outfiles = download_from_spike_search(download_dir, patterns, LIMIT)
32 |         # negative_outfiles = download_from_spike_search(download_dir, NEGATIVE_PATTERNS, LIMIT, use_odinson=True)
33 |     if args.merge_patterns:
34 |         if positive_outfiles is None:
35 |             positive_outfiles, _ = get_file_names(download_dir)
36 |         if negative_outfiles is None:
37 |             _, negative_outfiles = get_file_names(os.path.join(SCRIPT_DIR, 'small_negs'))
38 |         if not os.path.exists(output_dir):
39 |             os.makedirs(output_dir)
40 |         relations_num_rows = merge_and_save_examples(positive_outfiles, negative_outfiles, output_dir, patterns, args.dataset)
41 | 
42 |         update_file_lengths(os.path.join(output_dir, 'file_lengths.json'), relations_num_rows)
43 | 
44 | def get_file_names(download_dir):
45 |     def get_relation_name_from_file_name(file_name):
46 |         hyps_pos = [i for i, c in enumerate(file_name) if c == '-']
47 |         return file_name[hyps_pos[0]+1:hyps_pos[1]]
48 | 
49 |     poss, negs = defaultdict(list), defaultdict(list)
50 |     for file in os.listdir(download_dir):
51 |         if 'raw' not in file:
52 |             continue
53 |         relation_name = get_relation_name_from_file_name(file)
54 |         if file.startswith("raw-per") or file.startswith("raw-org"):
55 |             poss[relation_name].append(os.path.join(download_dir, file))
56 |         elif file.startswith("raw-PERSON") or file.startswith("raw-ORGANIZATION"):
57 |             negs[relation_name].append(os.path.join(download_dir, file))
58 |     return poss, negs
59 | 
60 | 
61 | def remove_same_sent_id(data):  # keep one row per sentence id, preferring the positive label
62 |     grouped = defaultdict(list)
63 |     for d in data:
64 |         grouped[d['sentence_id']].append(d)
65 | 
66 |     ret = []
67 |     for _, v in grouped.items():
68 |         positive = [d for d in v if d['label'] != 'NOTA']  # prefer the positive example when a sentence id repeats
69 |         assert len(positive) <= 1
70 |         if len(positive) > 0:
71 |             ret.append(positive[0])
72 |         else:
73 |             ret.append(v[-1])
74 |     return ret
75 | 
76 | def separate_entities(data):  # True only when the two entity spans do not overlap
77 |     if data['e1_first_index'] <= data['e2_first_index']:
78 |         first, second = 'e1', 'e2'
79 |     else:
80 |         first, second = 'e2', 'e1'
81 | 
82 |     if data[f'{first}_first_index'] < data[f'{second}_first_index'] and \
83 |        data[f'{first}_last_index'] < data[f'{second}_last_index'] and \
84 |        data[f'{first}_last_index'] < data[f'{second}_first_index']:
85 |         return True
86 |     else:
87 |         return False
88 | 
89 | def entities_validator_for_relation(relation, dataset):
90 |     countries = read_entities_list(countries=True, states=False)
91 |     countries_and_states = read_entities_list(countries=True, states=True)
92 | 
93 |     if dataset == 'tacred' and relation == "org:country_of_headquarters":
94 |         pass
95 |         # def country_checker(location):
96 |         #     return location in countries
97 | 
98 |         # return country_checker
99 |     elif dataset == 'tacred' and relation == "per:city_of_death":
100 |         pass
101 |         # def city_checker(location):
102 |         #     return not location in countries_and_states
103 | 
104 |         # return city_checker
105 | 
106 |     elif relation == "per:origin":
107 |         def non_nationality(nationality):
108 |             return nationality.lower() not in ["republican", "democrat", "rabbi"]
109 |         return non_nationality
110 |     return lambda ent: True  # accept any entity when no specific validator applies
111 | 
112 | def read_entities_list(countries, states):
113 |     ret = set()
114 |     if countries:
115 |         with open(os.path.join(SCRIPT_DIR, 'ner_lists', 'countries'), 'r') as f:
116 |             reader = csv.reader(f, delimiter='\t')
117 |             for x in reader:
118 |                 ret.add(x[1])
119 | 
120 |     if states:
121 |         with open(os.path.join(SCRIPT_DIR, 'ner_lists', 'statesandprovinces'), 'r') as f:
122 |             lines = f.readlines()
123 |             for s in lines:
124 |                 ret.add(s.rstrip())
125 | 
126 |     return ret
127 | 
128 | def merge_and_save_examples(positive_outfiles, negative_outfiles, output_dir, patterns, dataset):
129 |     relations_num_rows = {}
130 |     for relation, relation_paths in tqdm(positive_outfiles.items()):
131 |         assert len(relation_paths) == len(patterns[relation])
132 |         sent_ids_used_by_relation = merge_positive_examples_and_save(output_dir,
133 |                                                                      relation,
134 |                                                                      relation_paths,
135 |                                                                      patterns[relation],
136 |                                                                      entities_validator_for_relation(relation, dataset),
137 |                                                                      dataset)
138 |         relations_num_rows[relation] = {k: len(v) for k, v in sent_ids_used_by_relation.items()}
139 |         entities = RELATIONS_ENTITY_TYPES_FOR_SEARCH[relation]
140 |         neg_count = merge_negative_examples_and_save_given_relation(output_dir,
141 |                                                                     entities,
142 |                                                                     negative_outfiles[entities],
143 |                                                                     relation,
144 |                                                                     sent_ids_used_by_relation,
145 |                                                                     dataset)
146 |         relations_num_rows[f"{relation}-{entities}"] = neg_count
147 | 
148 |     return relations_num_rows
149 | 
150 | def merge_positive_examples_and_save(output_dir, relation, relation_paths, patterns, validate_entities, dataset):
151 |     def used_before(sent_ids_used, sent_id):
152 |         for used in sent_ids_used.values():
153 |             if sent_id in used:
154 |                 return True
155 | 
156 |         return False
157 | 
158 |     out_file = open(os.path.join(output_dir, relation), 'w')
159 |     writer = csv.writer(out_file, delimiter='\t')
160 |     sent_ids_used = {i: set() for i in range(len(relation_paths))}
161 |     relation_paths.sort(key=lambda f: int(f.split('-')[-1]))  # file names end with the pattern id
162 |     for i, relation_path in enumerate(relation_paths):
163 |         search_file = open(relation_path, "r", encoding="utf-8")
164 |         print(f"Working on {relation_path}")
165 |         reader = csv.reader(search_file, delimiter='\t')
166 |         headers = next(reader)
167 |         for d in reader:
168 |             d = map_array_given_header(d, headers)
169 |             if not separate_entities(d) or \
170 |                not validate_entities(d['e2']) or \
171 |                used_before(sent_ids_used, d['sentence_id']):
172 |                 continue
173 | 
174 |             text = wrap_text(d['sentence_text'].split(),
175 |                              d['e1_first_index'],
176 |                              d['e1_last_index'] + 1,
177 |                              d['e2_first_index'],
178 |                              d['e2_last_index'] + 1)
179 |             if dataset == 'docred':
180 |                 text = clean_special_tokens(text)
181 | 
182 |             writer.writerow([text, relation, patterns[i], d['sentence_id']])
183 |             sent_ids_used[i].add(d['sentence_id'])
184 |         search_file.close()
185 |     out_file.close()
186 | 
187 |     return sent_ids_used
188 | 
189 | def merge_negative_examples_and_save_given_relation(output_dir, entities, file_paths, relation, positive_ids_used_by_relation, dataset):
190 |     positive_sent_ids_used = set(chain(*positive_ids_used_by_relation.values()))
191 |     last_sent_id_used = -1
192 |     out_file = open(os.path.join(output_dir, f"{relation}-{entities}"), 'w')
193 |     writer = csv.writer(out_file, delimiter='\t')
194 |     file_paths.sort()
195 |     positive_skipped = set()
196 |     rows_used_per_pattern = {}
197 |     for i, relation_path in enumerate(file_paths):
198 |         rows_used = 0
199 |         search_file = open(relation_path, "r", encoding="utf-8")
200 |         print(f"Working on {relation_path}")
201 |         reader = csv.reader(search_file, delimiter='\t')
202 |         headers = next(reader)
203 |         for d in tqdm(reader):
204 |             d = map_array_given_header(d, headers)
205 |             if d['sentence_id'] in positive_sent_ids_used:
206 |                 positive_skipped.add(d['sentence_id'])
207 |                 continue
208 |             if not separate_entities(d) or d['sentence_id'] == last_sent_id_used:
209 |                 continue
210 |             # entities are not sorted in the same way all the time:
211 |             if i == 0: first_entity, second_entity = 'e1', 'e2'
212 |             else: first_entity, second_entity = 'e2', 'e1'
213 |             text = wrap_text(d['sentence_text'].split(),
214 |                              d[f'{first_entity}_first_index'],
215 |                              d[f'{first_entity}_last_index'] + 1,
216 |                              d[f'{second_entity}_first_index'],
217 |                              d[f'{second_entity}_last_index'] + 1)
218 |             if dataset == 'docred':
219 |                 text = clean_special_tokens(text)
220 | 
221 |             writer.writerow([text, 'NOTA', NEGATIVE_PATTERNS[entities][i], d['sentence_id']])
222 |             last_sent_id_used = d['sentence_id']
223 |             rows_used += 1
224 |         rows_used_per_pattern[i] = rows_used
225 |         search_file.close()
226 |     out_file.close()
227 |     print(f"number of examples skipped because they are positive: {len(positive_skipped)}")
228 |     print(f"number of positive sentence ids: {len(positive_sent_ids_used)}")
229 | 
230 |     return rows_used_per_pattern
231 | 
232 | def map_array_given_header(arr, headers):
233 |     def int_if_possible(value):
234 |         try:
235 |             # numeric columns (token indices, sentence ids) become ints
236 |             return int(value)
237 |         except ValueError:
238 |             return value
239 | 
240 |     return {headers[i]: int_if_possible(arr[i]) for i in range(len(headers))}
241 | 
242 | def query_params(pattern, odinson):
243 |     if not odinson:
244 |         return {
245 |             "queries": {
246 |                 "syntactic": pattern
247 |             },
248 |             "data_set_name": "wikipedia",
249 |             "include_annotations": False
250 |         }
251 |     else:
252 |         pattern, expansion = pattern.split('#e ')
253 |         return {
254 |             "queries": {
255 |                 "odinson": pattern,
256 |                 "expansion": expansion
257 |             },
258 |             "data_set_name": "wikipedia",
259 |             "include_annotations": False
260 |         }
261 | 
262 | def download_from_spike_search(download_dir, patterns_dict, limit, use_odinson=False):
263 |     if not os.path.exists(download_dir):
264 |         os.makedirs(download_dir)
265 |     outfiles = defaultdict(list)
266 |     for relation, patterns in tqdm(patterns_dict.items()):
267 |         for pattern_id, pattern in enumerate(patterns):
268 |             search_query_api = '/api/3/search/query'
269 |             search_query_params = query_params(pattern, use_odinson)
270 |             download_tsv_params = "?sentence_id=true&sentence_text=true&capture_indices=true"
271 |             if limit > 0:
272 |                 download_tsv_params += f"&limit={limit}"
273 | 
274 |             print(f'Downloading query: {pattern} for relation: {relation}')
275 |             request = requests.post(url=URL + search_query_api,
276 |                                     headers={"Content-Type": "application/json"},
277 |                                     data=json.dumps(search_query_params))
278 | 
279 |             tsv_location = request.headers['TSV-Location']
280 |             tsv_url = URL + tsv_location + download_tsv_params
281 | 
282 |             outfile = f'{download_dir}/raw-{relation}-{pattern_id}'
283 |             wget.download(tsv_url, outfile, bar=None)
284 |             with open(outfile, 'r') as f: lines_downloaded = sum(1 for _ in f)
285 |             print(f'Done downloading. lines downloaded: {lines_downloaded - 1}')  # minus the TSV header row
286 |             outfiles[relation].append(outfile)
287 | 
288 |     return outfiles
289 | 
290 | def clean_special_tokens(text_str):
291 |     CLEANINGMAP = {'-RRB-': ')', '-LRB-': '(', '-LSB-': '[',  # PTB-style bracket tokens
292 |                    '-RSB-': ']', '-LCB-': '{', '-RCB-': '}',
293 |                    '\u00a0': ' ', '"': "'", '--': '-', '---': '-'}
294 |     tokens = text_str.split(' ')
295 |     return ' '.join([CLEANINGMAP.get(t, t) for t in tokens])
296 | 
297 | def update_file_lengths(file_path, relations_num_rows):
298 |     if not os.path.exists(file_path):
299 |         lengths = relations_num_rows
300 |     else:
301 |         lengths = json.load(open(file_path, 'r'))
302 |         for k, v in relations_num_rows.items():
303 |             lengths[k] = v
304 | 
305 |     with open(file_path, 'w') as file:
306 |         json.dump(lengths, file)
307 | 
308 | if __name__ == "__main__":
309 |     parser = argparse.ArgumentParser()
310 |     parser.add_argument("--triggers", type=str, required=True, choices=['all', 'single'])
311 |     parser.add_argument("--dataset", type=str, required=True, choices=['tacred', 'docred'])
312 |     parser.add_argument("--download", action='store_true')
313 |     parser.add_argument("--merge_patterns", action='store_true')
314 |     args = parser.parse_args()
315 |     main(args)
--------------------------------------------------------------------------------
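A minimal usage sketch for download_search_examples.py (an illustration, not part of the repository; it assumes the SPIKE endpoint configured in URL is reachable, the command is run from the repository root so the `classification` and `scripts.search` imports resolve, and pre-downloaded negative files exist under scripts/search/small_negs):

    python -m scripts.search.download_search_examples --triggers single --dataset tacred --download --merge_patterns

With these flags the script first downloads one raw TSV per (relation, pattern) pair into scripts/search/tacred/single_trigger_search_results_xxx, then merges them into per-relation example files under data/tacred/search/single_trigger_search_xxx and records the per-pattern row counts in file_lengths.json.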