├── .gitignore ├── LICENSE ├── README.md ├── convert_checkpoint.py ├── dataset.py ├── decoder.py ├── evaluations ├── ner │ └── CONTaiNER │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── README.md │ │ ├── calc_micro.sh │ │ ├── download.sh │ │ ├── exec_container.sh │ │ ├── misc │ │ ├── __init__.py │ │ ├── cvt_conll.py │ │ └── cvt_to_lowercase.py │ │ ├── process_fewnerd.sh │ │ └── src │ │ ├── __init__.py │ │ ├── calc-micro-avg.py │ │ ├── container.py │ │ ├── crf.py │ │ ├── dataset.py │ │ ├── decoder.py │ │ ├── embedding.py │ │ └── utils.py └── supervised-ie │ ├── README.md │ ├── config.py │ ├── config │ ├── ace.json │ └── ere.json │ ├── convert.py │ ├── data.py │ ├── global_feature.py │ ├── graph.py │ ├── model.py │ ├── predict.py │ ├── preprocessing │ ├── process_ace.py │ ├── process_dygiepp.py │ └── process_ere.py │ ├── resource │ ├── ace_to_aida_entity.tsv │ ├── ace_to_aida_event.tsv │ ├── ace_to_aida_relation.tsv │ ├── ace_to_aida_role.tsv │ ├── ere_patterns │ │ ├── event_role.json │ │ ├── relation_entity.json │ │ └── role_entity.json │ ├── splits │ │ ├── ACE05-CN │ │ │ ├── dev.doc.txt │ │ │ ├── test.doc.txt │ │ │ └── train.doc.txt │ │ ├── ACE05-E │ │ │ ├── dev.doc.txt │ │ │ ├── test.doc.txt │ │ │ └── train.doc.txt │ │ ├── ACE05-R │ │ │ ├── dev.doc.txt │ │ │ ├── test.doc.txt │ │ │ └── train.doc.txt │ │ ├── ERE-EN │ │ │ ├── dev.doc.txt │ │ │ ├── test.doc.txt │ │ │ └── train.doc.txt │ │ └── ERE-ES │ │ │ ├── dev.doc.txt │ │ │ ├── test.doc.txt │ │ │ └── train.doc.txt │ └── valid_patterns │ │ ├── event_role.json │ │ ├── relation_entity.json │ │ └── role_entity.json │ ├── scorer.py │ ├── train.py │ └── util.py ├── gumbel_latent_typer.py ├── model.py ├── overview.png ├── pretrain.py ├── requirements.txt ├── test_generation.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | wandb/ 3 | tb_logs/ 4 | data/ 5 | checkpoints/ 6 | *.tar 7 | *.bin 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Language Model Pre-Training with Sparse Latent Typing 2 | 3 | This is the official PyTorch implementation of the paper: 4 | **Language Model Pre-Training with Sparse Latent Typing**. Liliang Ren\*, Zixuan Zhang\*, Han Wang, Clare R. Voss, Chengxiang Zhai, Heng Ji. (\*Equal Contribution) ***EMNLP 2022 (Oral)*** 5 | [[pdf]](https://aclanthology.org/2022.emnlp-main.96.pdf) [[slides]](https://drive.google.com/file/d/1gTMifRSAyj45izkTPLQE5TMsgH-_WSo5/view?usp=sharing) 6 | 7 | 8 | ## Overview 9 |
![Overview](overview.png)
10 | 11 |
12 | 13 | The figure shows the general architecture of our proposed Gumbel Latent Typing module. 14 | 15 | Our approach is especially effective for information extraction related downstream tasks. We push the state of the art on the [Few-NERD](https://arxiv.org/abs/2105.07464) benchmark for both the *INTRA* and the *INTER* settings with absolute average F1 improvements of 6.24\% and 3.75\%, respectively. We also significantly outperform a strong baseline ([OneIE](https://aclanthology.org/2020.acl-main.713/)) on both the ACE2005 and the ERE datasets (notably, an absolute improvement of 7.59% on the ERE Entity Extraction subtask) by initializing the parameters of its vanilla `BERT-base` encoder with our `BERT-SparseLT` model continually pretrained from a `BERT-base-uncased` checkpoint using our Sparse Latent Typing objective. 16 | 17 | 18 | ## Requirements and Installation 19 | 20 | The required environment can be installed using the following command with Python `3.8.12`. 21 | 22 | ``` 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | ## Pretrained Models 27 | 28 | We have released our `BERT-SparseLT` (also denoted as `BERT-SparseLT-VOA`) model, a `BERT-base-uncased` model continually pretrained on the VOA corpus with the Sparse Latent Typing objective, for reproducing the results reported in our paper. The model checkpoint can be downloaded from [here](https://drive.google.com/file/d/1Clq-VcdMRLnaEpOlV6BS_chMzY0-iO6Z/view?usp=sharing). 29 | 30 | ## Data Preparation 31 | 32 | To do continual pretraining, first download the VOA corpus from [here](https://drive.google.com/file/d/1IZ633R2IoBAEQ4lOtJ-aPDMPLbAkfZJr/view?usp=sharing) and put it under the `data/` directory. If you also want to evaluate the pretrained model on the few-shot named entity extraction task, please go to the `evaluations/ner/CONTaiNER/` directory and follow the original *CONTaiNER* [repository](https://github.com/psunlpgroup/CONTaiNER) for data preparation. 33 | 34 | ## Continual Pre-Training 35 | 36 | Our `BERT-SparseLT` model can be continually pretrained from the `BERT-base-uncased` checkpoint on a single V100 GPU with 16GB of GPU memory using the following command. 37 | 38 | ``` 39 | python pretrain.py --name bert_base_voa_sparseLT --alpha 0.05 --beta 0.05 --gamma 0.1 40 | ``` 41 | 42 | This will produce and store the checkpoint of the full autoencoder model (including the Gumbel Latent Typer) that contains the `BERT-SparseLT` encoder described in the paper. 43 | 44 | You may use the following script to extract the `BERT-SparseLT` model parameters and use them in the same way as the original `BERT-base-uncased` model for the downstream tasks. 45 | 46 | ``` 47 | python convert_checkpoint.py 48 | ``` 49 | 50 | ## Latent Typing and Sentence Reconstruction 51 | 52 | After pretraining, you may use the following script to play with the latent typer and the decoder, performing sparse latent typing and sentence reconstruction for any input text. The model directory used in the script should store the checkpoint of the full autoencoder model. 
53 | 54 | ``` 55 | python test_generation.py 56 | ``` 57 | 58 | An example of the latent typing and sentence reconstruction results is shown below; the tokens that are not selected for typing (i.e., classified as the latent type 0) are ignored: 59 | 60 | ``` 61 | INPUT SENTENCE: 62 | Our approach provides the decoder model with a shortcut to directly access the encoded token representations, so that the latent representation for each of the input tokens can be learned as an auxiliary type representation. 63 | 64 | LATENT TYPINGS: 65 | our(20), approach(20), provides(48), deco(19), ##der(13), model(27), with(16), short(49), ##cut(61), directly(18), access(48), encoded(25), token(53), representations(6), so(2), that(59), late(49), ##nt(4), representation(22), each(26), input(25), token(53), can(41), learned(38), as(58), auxiliary(32), type(30), representation(53), 66 | 67 | OUTPUT SENTENCE: 68 | our approach provides the decoder model with a shortcut to directly access the encoded token representations, so that the latent representation of each of the input tokens can be learned as an auxiliary type representation representation. 69 | ``` 70 | 71 | ## Few-shot Evaluation 72 | 73 | To reproduce the few-shot results in our paper, please go to the directory `evaluations/ner/CONTaiNER/` and run the following bash script to evaluate our model on the INTRA/INTER settings of the Few-NERD benchmark. We assume the model checkpoint is stored in the `checkpoints/for_container/` directory. 74 | 75 | ``` 76 | NAME=model_LP_100k # arbitrary model name 77 | TYPE=intra # change to 'inter' for INTER setting evaluation 78 | bash exec_container.sh $TYPE 0 5 1 $NAME # 5-way-1-shot 79 | bash exec_container.sh $TYPE 0 5 5 $NAME # 5-way-5-shot 80 | bash exec_container.sh $TYPE 0 10 5 $NAME # 10-way-5-shot 81 | bash exec_container.sh $TYPE 0 10 1 $NAME # 10-way-1-shot 82 | 83 | ``` 84 | 85 | After producing the outputs, you may also collect the evaluation results and calculate the F1 scores using the `calc_micro.sh` script in that directory. 86 | 87 | 88 | ## Citation 89 | 90 | If you find this repository helpful, please cite our paper: 91 | 92 | ``` 93 | @inproceedings{ren-etal-2022-language, 94 | title = "Language Model Pre-Training with Sparse Latent Typing", 95 | author = "Ren, Liliang and Zhang, Zixuan and Wang, Han and Voss, Clare and Zhai, ChengXiang and Ji, Heng", 96 | booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", 97 | month = dec, 98 | year = "2022", 99 | address = "Abu Dhabi, United Arab Emirates", 100 | publisher = "Association for Computational Linguistics", 101 | url = "https://aclanthology.org/2022.emnlp-main.96", 102 | pages = "1480--1494", 103 | } 104 | ``` 105 | 106 | 107 | -------------------------------------------------------------------------------- /convert_checkpoint.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Liliang Ren. 2 | # 3 | # This source code is licensed under the Apache 2.0 license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | from model import RobertaAutoEncoder 8 | import os 9 | 10 | in_dir = "./checkpoints/YOUR_MODEL_CKPT_DIR" 11 | out_dir = "./checkpoints/for_container" # Assuming it is used for few-shot eval. 
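# NOTE (added usage hint, not part of the original script): in_dir should point to a
# checkpoint directory produced by pretrain.py. After conversion, out_dir should be
# loadable as a standard encoder checkpoint, e.g. via
# transformers.AutoModel.from_pretrained(out_dir), for downstream fine-tuning.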
12 | 13 | os.makedirs(out_dir, exist_ok=True) 14 | 15 | m = RobertaAutoEncoder.from_pretrained(in_dir) 16 | m.model.save_pretrained(out_dir) 17 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Zixuan Zhang, Liliang Ren. 2 | # 3 | # This source code is licensed under the Apache 2.0 license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | import os 8 | import torch 9 | import torch.nn as nn 10 | 11 | from torch.utils.data import Dataset, Sampler 12 | from torch.utils.tensorboard import SummaryWriter 13 | from transformers import Trainer, TrainingArguments 14 | 15 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union 16 | 17 | 18 | class PLMDataset(Dataset): 19 | 20 | def __init__(self, tokenizer, file_path, block_size): 21 | assert os.path.isfile(file_path), f"Input file path {file_path} not found" 22 | 23 | with open(file_path, encoding="utf-8") as f: 24 | lines = f.readlines() 25 | 26 | batch_encoding = tokenizer([line[:-1] for line in lines], add_special_tokens=True, truncation=True, max_length=block_size) 27 | examples = batch_encoding["input_ids"] 28 | original_tokens = [['START'] + tokenizer.tokenize(line[:-1]) + ['END'] for line in lines] 29 | self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long), "original_tokens": original_tokens[i]} for i,e in enumerate(examples)] 30 | 31 | def __len__(self): 32 | return len(self.examples) 33 | 34 | def __getitem__(self, i): 35 | return self.examples[i] 36 | 37 | 38 | class PLMDataCollator(object): 39 | 40 | def __init__(self, tokenizer, mlm_probability=0.15): 41 | self.tokenizer = tokenizer 42 | self.mlm_probability = mlm_probability 43 | 44 | def __call__(self, examples): 45 | # examples: list of {"input_ids": xxx, "original_tokens": xxx} 46 | input_ids = [{"input_ids": example["input_ids"]} for example in examples] 47 | original_tokens = [{"original_tokens": example["original_tokens"]} for example in examples] 48 | 49 | batch_src = self.tokenizer.pad(input_ids, return_attention_mask=True, return_tensors="pt") 50 | batch_tgt = self.tokenizer.pad(input_ids, return_attention_mask=True, return_tensors="pt") 51 | 52 | # Build the generation labels by shifting the target ids one step to the left; pad positions are ignored (-100). 53 | tgt_labels = batch_tgt.input_ids[:, 1:].clone() 54 | if self.tokenizer.pad_token_id is not None: 55 | tgt_labels[tgt_labels == self.tokenizer.pad_token_id] = -100 56 | # batch_tgt.input_ids[:, 0] = self.tokenizer.eos_token_id 57 | masked_input_ids, masked_labels = self.mask_tokens(batch_src.input_ids) 58 | 59 | # Assemble the full training batch. 60 | batch = { 61 | "input_ids": batch_tgt.input_ids, 62 | "attention_mask": batch_src.attention_mask, 63 | "masked_input_ids": masked_input_ids, 64 | "masked_labels": masked_labels, 65 | "decoder_input_ids": batch_tgt.input_ids[:, :-1], 66 | "decoder_attention_mask": batch_tgt.attention_mask[:, :-1], 67 | "labels": tgt_labels, 68 | "original_tokens": original_tokens 69 | } 70 | return batch 71 | 72 | def mask_tokens(self, inputs, special_tokens_mask=None): 73 | """ 74 | Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. 
75 | """ 76 | labels = inputs.clone() 77 | # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) 78 | probability_matrix = torch.full(labels.shape, self.mlm_probability) 79 | if special_tokens_mask is None: 80 | special_tokens_mask = [ 81 | self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() 82 | ] 83 | special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool) 84 | else: 85 | special_tokens_mask = special_tokens_mask.bool() 86 | 87 | probability_matrix.masked_fill_(special_tokens_mask, value=0.0) 88 | masked_indices = torch.bernoulli(probability_matrix).bool() 89 | labels[~masked_indices] = -100 # We only compute loss on masked tokens 90 | 91 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 92 | indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices 93 | inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) 94 | 95 | # 10% of the time, we replace masked input tokens with a random word 96 | indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced 97 | random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) 98 | inputs[indices_random] = random_words[indices_random] 99 | 100 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 101 | return inputs, labels 102 | 103 | 104 | class PLMTrainingArgs(TrainingArguments): 105 | 106 | def add_loss_weights(self, mlm, alpha, beta, gamma): 107 | self.mlm = mlm 108 | self.alpha = alpha 109 | self.beta = beta 110 | self.gamma = gamma 111 | 112 | 113 | class PLMTrainer(Trainer): 114 | 115 | def load_tb(self, log_dir): 116 | self.writer = SummaryWriter(log_dir) 117 | self.global_step = 0 118 | 119 | def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]: 120 | return inputs 121 | 122 | def create_optimizer(self): 123 | opt_model = self.model 124 | if self.optimizer is None: 125 | param_optimizer = [(n, p) for n, p in opt_model.named_parameters() if p.requires_grad] 126 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias', 'layer_norm.weight'] 127 | slow_lr = ['bert'] 128 | optimizer_grouped_parameters = [ 129 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and \ 130 | not any(nd in n for nd in slow_lr)], 'weight_decay': self.args.weight_decay, 131 | 'lr': self.args.learning_rate}, 132 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and \ 133 | any(nd in n for nd in slow_lr)], 'weight_decay': self.args.weight_decay, 134 | 'lr': self.args.learning_rate*0.1}, 135 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and \ 136 | any(nd in n for nd in slow_lr)], 'weight_decay': 0.0, 137 | 'lr': self.args.learning_rate*0.1}, 138 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and \ 139 | not any(nd in n for nd in slow_lr)], 'weight_decay': 0.0, 140 | 'lr': self.args.learning_rate}, 141 | ] 142 | 143 | optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) 144 | 145 | self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) 146 | 147 | return self.optimizer 148 | 149 | def compute_loss(self, model, inputs, return_outputs=False): 150 | ''' main model ''' 151 | losses = model( 
input_ids=inputs["input_ids"].cuda(), 153 | attention_mask=inputs["attention_mask"].cuda(), 154 | mlm_input_ids=inputs["masked_input_ids"].cuda(), 155 | mlm_labels=inputs["masked_labels"].cuda(), 156 | decoder_input_ids=inputs["decoder_input_ids"].cuda(), 157 | decoder_attention_mask=inputs["decoder_attention_mask"].cuda(), 158 | gen_labels=inputs["labels"].cuda(), 159 | original_tokens=inputs["original_tokens"], 160 | return_dict=None 161 | ) 162 | mlm_loss, gen_loss, pm_loss, div_loss = losses 163 | self.writer.add_scalar('sparse_loss', torch.mean(pm_loss).item(), self.global_step) 164 | self.writer.add_scalar('mlm_loss', torch.mean(mlm_loss).item(), self.global_step) 165 | self.writer.add_scalar('gen_loss', torch.mean(gen_loss).item(), self.global_step) 166 | self.writer.add_scalar('div_loss', torch.mean(div_loss).item(), self.global_step) 167 | 168 | self.model.sa_pm.set_num_updates(self.global_step) 169 | self.global_step += 1 170 | 171 | return self.args.mlm * mlm_loss + self.args.alpha * gen_loss + self.args.beta * pm_loss + self.args.gamma * div_loss 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | outputs/ 3 | saved_models/ 4 | episode-data/ 5 | *.zip 6 | -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Penn State NLP Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/README.md: -------------------------------------------------------------------------------- 1 | This directory contains the source code modified from the original `CONTaiNER` [repository](https://github.com/psunlpgroup/CONTaiNER) for the training and the evaluation pipeline on the Few-NERD dataset. 2 | 3 | Please follow the original repository for data preparation. 
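The `download.sh` and `process_fewnerd.sh` scripts included in this directory can be used to fetch the Few-NERD episode data and convert it into the format expected by the training and evaluation pipeline.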
4 | 5 | -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/calc_micro.sh: -------------------------------------------------------------------------------- 1 | G=intra 2 | 3 | DIR=./outputs/few-nerd/${G}/YOUR_OUTPUT_DIR 4 | 5 | python src/calc-micro-avg.py --target_dir ${DIR}/${G}-5-5/ --range 5000 6 | python src/calc-micro-avg.py --target_dir ${DIR}/${G}-5-1/ --range 5000 7 | python src/calc-micro-avg.py --target_dir ${DIR}/${G}-10-5/ --range 5000 8 | python src/calc-micro-avg.py --target_dir ${DIR}/${G}-10-1/ --range 5000 9 | 10 | -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/download.sh: -------------------------------------------------------------------------------- 1 | wget -O episode_data.zip https://cloud.tsinghua.edu.cn/f/56fb277d3fd2437a8ee3/?dl=1 2 | -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/exec_container.sh: -------------------------------------------------------------------------------- 1 | export G=$1 2 | export GPU=$2 3 | export WAY=$3 4 | export SHOT=$4 5 | export SAVED_MODEL_DIR=$5 6 | export way=${WAY} 7 | export shot=${SHOT} 8 | echo $SAVED_MODEL_DIR 9 | echo $shot $way 10 | export finetune_loss=KL 11 | export is_viterbi=viterbi 12 | 13 | MODEL="../../../checkpoints/for_container/" 14 | CONFIG="../../../checkpoints/for_container/" 15 | 16 | ## training with toy evaluation for sanity check 17 | python src/container.py --data_dir data/few-nerd/${G} --labels-train data/few-nerd/${G}/labels_train.txt --labels-test data/few-nerd/${G}/labels_test.txt --config_name $CONFIG --model_name_or_path $MODEL --saved_model_dir saved_models/few-nerd/${G}/${SAVED_MODEL_DIR} --output_dir outputs/few-nerd/${G}/${finetune_loss}_${is_viterbi}_final_5000_${SAVED_MODEL_DIR}/${G}-${way}-${shot}/ --support_path support_test_${way}_${shot}/ --test_path query_test_${way}_${shot}/ --n_shots ${shot} --max_seq_length 128 --embedding_dimension 128 --num_train_epochs 1 --train_batch_size 32 --seed 1 --do_train --do_predict --select_gpu ${GPU} --training_loss KL --finetune_loss ${finetune_loss} --evaluation_criteria euclidean_hidden_state --consider_mutual_O --learning_rate 1e-4 --learning_rate_finetuning 1e-4 18 | 19 | ## evaluation 20 | echo $shot $way 21 | python src/container.py --data_dir data/few-nerd/${G} --labels-train data/few-nerd/${G}/labels_train.txt --labels-test data/few-nerd/${G}/labels_test.txt --config_name $CONFIG --model_name_or_path $MODEL --saved_model_dir saved_models/few-nerd/${G}/${SAVED_MODEL_DIR} --output_dir outputs/few-nerd/${G}/${finetune_loss}_${is_viterbi}_final_5000_${SAVED_MODEL_DIR}/${G}-${way}-${shot}/ --support_path support_test_${way}_${shot}/ --test_path query_test_${way}_${shot}/ --n_shots ${shot} --max_seq_length 128 --embedding_dimension 128 --num_train_epochs 1 --train_batch_size 32 --seed 1 --do_predict --select_gpu ${GPU} --training_loss KL --finetune_loss ${finetune_loss} --evaluation_criteria euclidean_hidden_state --learning_rate 1e-4 --learning_rate_finetuning 1e-4 --consider_mutual_O --temp_trans 0.01 --silent 22 | -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renll/SparseLT/86306e94c27ec79b4ea4e810a262df42798e5ab9/evaluations/ner/CONTaiNER/misc/__init__.py 
-------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/misc/cvt_conll.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | import os 3 | import argparse 4 | import glob 5 | from tqdm import tqdm 6 | 7 | 8 | def convert_file(input_name, sup_output_name, query_output_name): 9 | if not os.path.exists(sup_output_name): 10 | os.makedirs(sup_output_name) 11 | 12 | if not os.path.exists(query_output_name): 13 | os.makedirs(query_output_name) 14 | reader = jsonlines.open(input_name) 15 | for ct, dicts in enumerate(reader): 16 | supdict = dicts["support"] 17 | supwords = supdict["word"] 18 | suplabels = supdict["label"] 19 | querydict = dicts["query"] 20 | querywords = querydict["word"] 21 | querylabels = querydict["label"] 22 | str1 = '' 23 | for i in range(len(supwords)): 24 | for j in range(len(supwords[i])): 25 | str1 = str1 + supwords[i][j] + '\t' + suplabels[i][j] + '\n' 26 | str1 += '\n' 27 | out_file = open(sup_output_name + '/' + str(ct) + '.txt', 'w', encoding='utf-8') 28 | out_file.write(str1) 29 | out_file.close() 30 | str2 = '' 31 | for i in range(len(querywords)): 32 | for j in range(len(querywords[i])): 33 | str2 = str2 + querywords[i][j] + '\t' + querylabels[i][j] + '\n' 34 | str2 += '\n' 35 | out_file = open(query_output_name + '/' + str(ct) + '.txt', 'w', encoding='utf-8') 36 | out_file.write(str2) 37 | out_file.close() 38 | 39 | 40 | if __name__ == '__main__': 41 | output_base_dir = 'data/few-nerd' 42 | support_file_prefix = 'support_' 43 | query_file_prefix = 'query_' 44 | print(os.getcwd()) 45 | # output folders are created inside convert_file as needed 46 | 47 | all_input_files = glob.glob('**/*.jsonl', recursive=True) 48 | 49 | for file in tqdm(all_input_files): 50 | input_base_name = os.path.basename(file).split('.')[0] # just take the base name 51 | if 'test' in file: 52 | target_split_text = 'inter' if 'inter' in file else 'intra' 53 | sup_output_name = os.path.join(output_base_dir, target_split_text, support_file_prefix + input_base_name) 54 | query_output_name = os.path.join(output_base_dir, target_split_text, query_file_prefix + input_base_name) 55 | convert_file(file, sup_output_name, query_output_name) 56 | 57 | -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/misc/cvt_to_lowercase.py: -------------------------------------------------------------------------------- 1 | # makeshift script: lowercase the token column of the training files in place 2 | directories = ['data/few-nerd/inter/train.txt', 'data/few-nerd/intra/train.txt'] 3 | 4 | for file1 in directories: 5 | final_str = "" 6 | with open(file1, 'r') as f: 7 | lines = f.readlines() 8 | for line in lines: 9 | cmpnnts = line.split('\t') 10 | if len(cmpnnts) < 2: 11 | final_str += line 12 | continue 13 | c1, c2 = cmpnnts 14 | c1 = c1.lower() 15 | final_str += (c1 + '\t' + c2) 16 | 17 | f = open(file1, 'w') 18 | f.write(final_str) 19 | f.close() -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/process_fewnerd.sh: -------------------------------------------------------------------------------- 1 | unzip episode_data.zip && 2 | python misc/cvt_conll.py && 3 | python misc/cvt_to_lowercase.py 4 | # cleanup 5 | rm -rf episode* 6 | -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/src/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/renll/SparseLT/86306e94c27ec79b4ea4e810a262df42798e5ab9/evaluations/ner/CONTaiNER/src/__init__.py -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/src/calc-micro-avg.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import glob 4 | import os 5 | import argparse 6 | 7 | 8 | def main(): 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--target_dir", 12 | default=None, 13 | type=str, 14 | required=True, 15 | help="The input data dir.",) 16 | parser.add_argument("--range", 17 | default=5000, 18 | type=int, 19 | required=False, 20 | help="number of test episodes to include in the micro-average") 21 | args = parser.parse_args() 22 | path = args.target_dir 23 | s = [] 24 | 25 | target_res_text = 'results.txt' 26 | if args.range is None: 27 | s = glob.glob(os.path.join(path, '*', target_res_text)) 28 | else: 29 | for i in range(0, args.range): 30 | try: 31 | s.append(glob.glob(os.path.join(path, str(i), target_res_text))[0]) 32 | except IndexError: 33 | print("Missing file: " + str(i)) 34 | 35 | precisions = [] 36 | recalls = [] 37 | f1s = [] 38 | pred_sum = [] 39 | tp_sum = [] 40 | true_sum = [] 41 | 42 | for file in s: 43 | with open(file) as f: 44 | lines = f.readlines() 45 | pred_sum.append(int(lines[6].split()[-1])) 46 | tp_sum.append(int(lines[8].split()[-1])) 47 | true_sum.append(int(lines[9].split()[-1])) 48 | recall = np.sum(tp_sum) / np.sum(true_sum) 49 | precision = np.sum(tp_sum) / np.sum(pred_sum) 50 | f1 = (2 * precision * recall) / (precision + recall) 51 | 52 | print("avg. f1 = %f" % (f1) ) 53 | print("avg. precision = %f" % (precision)) 54 | print("avg. recall = %f" % (recall)) 55 | print("covered = %f" % len(tp_sum)) 56 | 57 | 58 | 59 | 60 | if __name__ == "__main__": 61 | main() -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/src/crf.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Conditional Random Fields 4 | Reference: https://aclanthology.org/2020.emnlp-main.516.pdf 5 | """ 6 | 7 | import torch 8 | import torch.nn as nn 9 | from dataset import IdxMaps 10 | 11 | 12 | START_ID = 0 13 | O_ID = 1 14 | 15 | 16 | class CRFInference: 17 | """ 18 | Inference part of the generalized CRF model 19 | """ 20 | 21 | def __init__(self, n_tag, trans_priors, power): 22 | """ 23 | We assume the batch size is 1, so no need to worry about PAD for now 24 | n_tag: START, O, and I_Xs 25 | """ 26 | super().__init__() 27 | self.transitions = self.trans_expand(n_tag, trans_priors, power) 28 | 29 | @staticmethod 30 | def trans_expand(n_tag, priors, power): 31 | s_o, s_i, o_o, o_i, i_o, i_i, x_y = priors 32 | # self transitions for I-X tags 33 | a = torch.eye(n_tag) * i_i 34 | # transitions from I-X to I-Y 35 | b = torch.ones(n_tag, n_tag) * x_y / (n_tag - 3) 36 | c = torch.eye(n_tag) * x_y / (n_tag - 3) 37 | transitions = a + b - c 38 | # transition from START to O 39 | transitions[START_ID, O_ID] = s_o 40 | # transitions from START to I-X 41 | transitions[START_ID, O_ID+1:] = s_i / (n_tag - 2) 42 | # transition from O to O 43 | transitions[O_ID, O_ID] = o_o 44 | # transitions from O to I-X 45 | transitions[O_ID, O_ID+1:] = o_i / (n_tag - 2) 46 | # transitions from I-X to O 47 | transitions[O_ID+1:, O_ID] = i_o 48 | # no transitions to START 49 | transitions[:, START_ID] = 0. 
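# (added note) Raise the priors to `power` and renormalize each row to sum to one;
# this acts like a temperature, sharpening (power > 1) or flattening (power < 1)
# the prior transition distribution before taking the log below.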
50 | 51 | powered = torch.pow(transitions, power) 52 | summed = powered.sum(dim=1) 53 | 54 | transitions = powered / summed.view(n_tag, 1) 55 | 56 | transitions = torch.where(transitions > 0, transitions, torch.tensor(.000001)) 57 | 58 | # print(transitions) 59 | # print(torch.sum(transitions, dim=1)) 60 | return torch.log(transitions) 61 | 62 | def forward(self, scores: torch.Tensor) -> torch.Tensor: # type: ignore 63 | """ 64 | Take the emission scores calculated by NERModel, and return a tensor of CRF features, 65 | which is the sum of transition scores and emission scores. 66 | :param scores: emission scores calculated by NERModel. 67 | shape: (batch_size, sentence_length, ntags) 68 | :return: a tensor containing the CRF features whose shape is 69 | (batch_size, sentence_len, ntags, ntags). F[b, t, i, j] represents 70 | emission[t, j] + transition[i, j] for the b'th sentence in this batch. 71 | """ 72 | batch_size, sentence_len, _ = scores.size() 73 | 74 | # expand the transition matrix batch-wise as well as sentence-wise 75 | transitions = self.transitions.expand(batch_size, sentence_len, -1, -1) 76 | 77 | # add another dimension for the "from" state, then expand to match 78 | # the dimensions of the expanded transition matrix above 79 | emissions = scores.unsqueeze(2).expand_as(transitions) 80 | 81 | # add them up 82 | return transitions + emissions 83 | 84 | @staticmethod 85 | def viterbi(features: torch.Tensor) -> torch.Tensor: 86 | """ 87 | Decode the most probable sequence of tags. 88 | Note that the delta values are calculated in the log space. 89 | :param features: the feature matrix from the forward method of CRF. 90 | shaped (batch_size, sentence_len, ntags, ntags) 91 | :return: a tensor containing the most probable sequences for the batch. 92 | shaped (batch_size, sentence_len) 93 | """ 94 | batch_size, sentence_len, ntags, _ = features.size() 95 | 96 | # initialize the deltas 97 | delta_t = features[:, 0, START_ID, :] 98 | deltas = [delta_t] 99 | 100 | # use dynamic programming to iteratively calculate the delta values 101 | for t in range(1, sentence_len): 102 | f_t = features[:, t] 103 | delta_t, _ = torch.max(f_t + delta_t.unsqueeze(2).expand_as(f_t), 1) 104 | deltas.append(delta_t) 105 | 106 | # now iterate backward to figure out the most probable tags 107 | sequences = [torch.argmax(deltas[-1], 1, keepdim=True)] 108 | for t in reversed(range(sentence_len - 1)): 109 | f_prev = features[:, t + 1].gather( 110 | 2, sequences[-1].unsqueeze(2).expand(batch_size, ntags, 1)).squeeze(2) 111 | sequences.append(torch.argmax(f_prev + deltas[t], 1, keepdim=True)) 112 | sequences.reverse() 113 | return torch.cat(sequences, dim=1) 114 | 115 | 116 | class CRF(nn.Module): 117 | """ 118 | Linear Chain CRF 119 | """ 120 | 121 | def __init__(self, ntags: int): 122 | """ 123 | Initialize the Linear Chain CRF layer. 124 | :param ntags: number of tags. Usually from IdxMaps 125 | """ 126 | super().__init__() 127 | transitions = torch.empty(ntags, ntags) 128 | nn.init.uniform_(transitions, -0.1, 0.1) 129 | # can't transition into START 130 | transitions[:, IdxMaps.START_ID] = -10000.0 131 | 132 | self.transitions = nn.Parameter(transitions) # type: ignore 133 | 134 | def forward(self, scores: torch.Tensor) -> torch.Tensor: # type: ignore 135 | """ 136 | Take the emission scores calculated by NERModel, and return a tensor of CRF features, 137 | which is the sum of transition scores and emission scores. 138 | :param scores: emission scores calculated by NERModel. 
139 | shape: (batch_size, sentence_length, ntags) 140 | :return: a tensor containing the CRF features whose shape is 141 | (batch_size, sentence_len, ntags, ntags). F[b, t, i, j] represents 142 | emission[t, j] + transition[i, j] for the b'th sentence in this batch. 143 | """ 144 | batch_size, sentence_len, _ = scores.size() 145 | 146 | # expand the transition matrix batch-wise as well as sentence-wise 147 | transitions = self.transitions.expand(batch_size, sentence_len, -1, -1) 148 | 149 | # add another dimension for the "from" state, then expand to match 150 | # the dimensions of the expanded transition matrix above 151 | emissions = scores.unsqueeze(2).expand_as(transitions) 152 | 153 | # add them up 154 | return transitions + emissions 155 | 156 | @staticmethod 157 | def forward_alg(features: torch.Tensor) -> torch.Tensor: 158 | """ 159 | Calculate the log alpha values using the forward algorithm. 160 | :param features: the features matrix from the forward method of CRF 161 | shaped (batch_size, sentence_len, ntags, ntags) 162 | :return: the tensor that represents a series of alpha values for the batch 163 | whose shape is (batch_size, sentence_len) 164 | """ 165 | _, sentence_len, _, _ = features.size() 166 | 167 | # initialize the alpha value 168 | alpha_t = features[:, 0, IdxMaps.START_ID, :] 169 | alphas = [alpha_t] 170 | 171 | # use dynamic programming to iteratively calculate the alpha value 172 | for t in range(1, sentence_len): 173 | f_t = features[:, t] 174 | alpha_t = torch.logsumexp(f_t + alpha_t.unsqueeze(2).expand_as(f_t), 1) 175 | alphas.append(alpha_t) 176 | 177 | # return all the alpha values 178 | return torch.stack(alphas, dim=1) 179 | 180 | @staticmethod 181 | def tags_score(tags: torch.Tensor, features: torch.Tensor) -> torch.Tensor: 182 | """ 183 | Calculate the score for the given sequence of tags. 184 | :param tags: a batch of sequences of tags whose shape is (batch_size, sentence_len) 185 | :param features: the features matrix from the forward method of CRF. 186 | shaped (batch_size, sentence_len, ntags, ntags) 187 | :return: a tensor with scores for the given sequences of tags. 188 | shaped (batch_size,) 189 | """ 190 | batch_size, sentence_len, ntags, _ = features.size() 191 | 192 | # we first collect all the features whose "to" tag is given by tags, 193 | # i.e. F[b, t, i, *tags] 194 | # the resulting dimension is (batch, sentence_len, ntags, 1) 195 | to_idx = tags.view(-1, sentence_len, 1, 1).expand(-1, -1, ntags, -1) 196 | to_scores = features.gather(3, to_idx) 197 | 198 | # now out of to_scores, gather all the features whose "from" tag is 199 | # given by tags plus the start tag. 200 | # i.e. F[b, t, *[start + tags], j] 201 | # the resulting dimension is (batch, sentence_len, 1, 1) 202 | from_idx = torch.cat( 203 | (torch.tensor(IdxMaps.START_ID).expand(batch_size, 1).to(tags.device), tags[:, :-1]), 204 | dim=1 205 | ) 206 | scores = to_scores.gather(2, from_idx.view(-1, sentence_len, 1, 1)) 207 | 208 | # we've now gathered all the right scores, so sum them up! 209 | return torch.sum(scores.view(-1, sentence_len), dim=1) 210 | 211 | @staticmethod 212 | def viterbi(features: torch.Tensor) -> torch.Tensor: 213 | """ 214 | Decode the most probable sequence of tags. 215 | Note that the delta values are calculated in the log space. 216 | :param features: the feature matrix from the forward method of CRF. 217 | shaped (batch_size, sentence_len, ntags, ntags) 218 | :return: a tensor containing the most probable sequences for the batch. 
219 | shaped (batch_size, sentence_len) 220 | """ 221 | batch_size, sentence_len, ntags, _ = features.size() 222 | 223 | # initialize the deltas 224 | delta_t = features[:, 0, IdxMaps.START_ID, :] 225 | deltas = [delta_t] 226 | 227 | # use dynamic programming to iteratively calculate the delta values 228 | for t in range(1, sentence_len): 229 | f_t = features[:, t] 230 | delta_t, _ = torch.max(f_t + delta_t.unsqueeze(2).expand_as(f_t), 1) 231 | deltas.append(delta_t) 232 | 233 | # now iterate backward to figure out the most probable tags 234 | sequences = [torch.argmax(deltas[-1], 1, keepdim=True)] 235 | for t in reversed(range(sentence_len - 1)): 236 | f_prev = features[:, t + 1].gather( 237 | 2, sequences[-1].unsqueeze(2).expand(batch_size, ntags, 1)).squeeze(2) 238 | sequences.append(torch.argmax(f_prev + deltas[t], 1, keepdim=True)) 239 | sequences.reverse() 240 | return torch.cat(sequences, dim=1) 241 | -------------------------------------------------------------------------------- /evaluations/ner/CONTaiNER/src/embedding.py: -------------------------------------------------------------------------------- 1 | from typing import TextIO, Tuple 2 | import numpy as np 3 | 4 | 5 | class GloveEmbedding(dict): 6 | """ 7 | Class with the pretrained 100d glove embeddings. 8 | Note: glove embedding is for lower case tokens. 9 | """ 10 | DIM_EMDEDDING = 100 11 | 12 | def __init__(self, fileh: TextIO): 13 | """ 14 | Initialize a GloveEmbedding instance. 15 | :param fileh: a file handler for the glove embeddings 16 | """ 17 | super().__init__() 18 | self.load(fileh) 19 | 20 | def load(self, fileh: TextIO): 21 | """ 22 | Load and parse each line of the glove embeddings file. 23 | :param fileh: the glove embeddings file to be loaded 24 | """ 25 | for line in fileh: 26 | token, embedding = GloveEmbedding.split(str(line)) 27 | self[token.lower()] = embedding 28 | 29 | @staticmethod 30 | def split(line: str) -> Tuple[str, np.ndarray]: 31 | """ 32 | Split the given line into a token and its embedding vector. 33 | :param line: line to be split into a token and its embedding vector 34 | :return: a tuple of a token and its embedding vector (numpy array) 35 | """ 36 | token, vals = line.split(None, 1) 37 | return token, np.array([float(v) for v in vals.split()], dtype=float) 38 | 39 | @classmethod 40 | def random(cls) -> np.ndarray: 41 | """ 42 | Return a random vector with the right scale. 43 | :return: a random numpy vector 44 | """ 45 | dim = cls.DIM_EMDEDDING 46 | scale = np.sqrt(3.0 / dim) 47 | return np.random.uniform(-scale, scale, dim) 48 | 49 | @classmethod 50 | def zeros(cls) -> np.ndarray: 51 | """ 52 | Return a zero vector. 53 | :return: a zero numpy vector 54 | """ 55 | return np.zeros(cls.DIM_EMDEDDING, dtype=float) 56 | 57 | def get(self, token: str, default=None) -> np.ndarray: 58 | """ 59 | Get the glove embedding if the token is found, else the given default or a random vector. 
60 | :param token: a token to be looked up 61 | :param default: a default to be returned if the given token is not found 62 | :return: the glove embedding, default or a random vector 63 | """ 64 | token = token.lower() 65 | ret = super(GloveEmbedding, self).get(token) 66 | if ret is not None: 67 | return ret 68 | elif default is not None: 69 | return default 70 | return self.random() 71 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/README.md: -------------------------------------------------------------------------------- 1 | OneIE v0.4.8 2 | 3 | # Requirements 4 | 5 | Python 3.7 6 | Python packages 7 | - PyTorch 1.0+ (Install the CPU version if you use this tool on a machine without GPUs) 8 | - transformers 3.0.2 (It seems using transformers 3.1+ may cause some model loading issues) 9 | - tqdm 10 | - lxml 11 | - nltk 12 | 13 | 14 | # How to Run 15 | 16 | ## Pre-processing 17 | ### DyGIE++ to OneIE format 18 | The `preprocessing/process_dygiepp.py` script converts datasets in DyGIE++ 19 | format (https://github.com/dwadden/dygiepp/tree/master/scripts/data/ace-event) to 20 | the format used by OneIE. Example: 21 | 22 | ``` 23 | python preprocessing/process_dygiepp.py -i train.json -o train.oneie.json 24 | ``` 25 | 26 | Arguments: 27 | - -i, --input: Path to the input file. 28 | - -o, --output: Path to the output file. 29 | 30 | ### ACE2005 to OneIE format 31 | The `preprocessing/process_ace.py` script converts raw ACE2005 datasets to the 32 | format used by OneIE. Example: 33 | 34 | ``` 35 | python preprocessing/process_ace.py -i <input_dir>/LDC2006T06/data -o <output_dir> 36 | -s resource/splits/ACE05-E -b bert-large-cased -c <bert_cache_dir> -l english 37 | ``` 38 | 39 | Arguments: 40 | - -i, --input: Path to the input directory (`data` folder in your LDC2006T06 41 | package). 42 | - -o, --output: Path to the output directory. 43 | - -b, --bert: Bert model name. 44 | - -c, --bert_cache_dir: Path to the BERT cache directory. 45 | - -s, --split: Path to the split directory. We provide document id lists for all 46 | datasets used in our paper in `resource/splits`. 47 | - -l, --lang: Language (options: english, chinese). 48 | 49 | 50 | ### ERE to OneIE format 51 | The `preprocessing/process_ere.py` script converts raw ERE datasets (LDC2015E29, 52 | LDC2015E68, LDC2015E78, LDC2015E107) to the format used by OneIE. 53 | 54 | ``` 55 | python preprocessing/process_ere.py -i <input_dir>/data -o <output_dir> 56 | -b bert-large-cased -c <bert_cache_dir> -l english -d normal 57 | ``` 58 | 59 | Arguments: 60 | - -i, --input: Path to the input directory (`data` folder in your ERE package). 61 | - -o, --output: Path to the output directory. 62 | - -b, --bert: Bert model name. 63 | - -c, --bert_cache_dir: Path to the BERT cache directory. 64 | - -d, --dataset: Dataset type: normal, r2v2, parallel, or spanish. 65 | - -l, --lang: Language (options: english, spanish). 66 | 67 | This script only supports: 68 | - LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V1 69 | - LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V2 70 | - LDC2015E68_DEFT_Rich_ERE_English_Training_Annotation_R2_V2 71 | - LDC2015E78_DEFT_Rich_ERE_Chinese_and_English_Parallel_Annotation_V2 72 | - LDC2015E107_DEFT_Rich_ERE_Spanish_Annotation_V2 73 | 74 | 75 | ## Training 76 | 77 | - `cd` to the root directory of this package 78 | - Set the environment variable PYTHONPATH to the current directory. 
79 | For example, if you unpack this package to `~/oneie_v0.4.8`, run: 80 | `export PYTHONPATH=~/oneie_v0.4.8` 81 | - Run this command to train a model: `python train.py -c <config_file>`. 82 | - We provide example configuration files in `config/` (e.g., `config/ace.json` and `config/ere.json`). Fill in the 83 | following paths in the configuration file: 84 | - BERT_CACHE_DIR: Pre-trained BERT models, configs, and tokenizers will be 85 | downloaded to this directory. 86 | - TRAIN_FILE_PATH, DEV_FILE_PATH, TEST_FILE_PATH: Path to the training/dev/test 87 | files. 88 | - OUTPUT_DIR: The model will be saved to sub folders in this directory. 89 | - VALID_PATTERN_DIR: Valid patterns created based on the annotation guidelines or training set. Example files are provided in `resource/valid_patterns`. 90 | 91 | 92 | ## Evaluation 93 | 94 | - `cd` to the root directory of this package 95 | - Set the environment variable PYTHONPATH to the current directory. 96 | For example, if you unpack this package to `~/oneie_v0.4.8`, run: 97 | `export PYTHONPATH=~/oneie_v0.4.8` 98 | - Example command to use OneIE: `python predict.py -m best.role.mdl -i input -o output -c output_cs --format ltf` 99 | + Arguments: 100 | - -m, --model_path: Path to the trained model. 101 | - -i, --input_dir: Path to the input directory. LTF format sample files can be found in the `input` directory. 102 | - -o, --output_dir: Path to the output directory (json format). Output files are in the JSON format. Sample files can be found in the `output` directory. 103 | - -c, --cs_dir: (optional) Path to the output directory (cs format). Sample files can be found in the `output_cs` directory. 104 | - -l, --log_path: (optional) Path to the log file. A sample file `log.json` can be found in `output`. 105 | - --gpu: (optional) Use GPU 106 | - -d, --device: (optional) GPU device index (for multi-GPU machines). 107 | - -b, --batch_size: (optional) Batch size. For a 16GB GPU, a batch size of 10~15 is a reasonable value. 108 | - --max_len: (optional) Max sentence length. Sentences longer than this value will be ignored. You may need to decrease `batch_size` if you set `max_len` to a larger number. 109 | - --beam_size: (optional) Beam size of the decoder. Increasing this value may improve the results but make the decoding slower. 110 | - --lang: (optional) Model language. 111 | - --format: Input file format (txt or ltf). 112 | 113 | 114 | # Output Format 115 | 116 | OneIE saves results in JSON format. Each line is a JSON object for a sentence 117 | containing the following fields: 118 | + doc_id (string): Document ID 119 | + sent_id (string): Sentence ID 120 | + tokens (list): A list of tokens 121 | + token_ids (list): A list of token IDs (doc_id:start_offset-end_offset) 122 | + graph (object): Information graph predicted by the model 123 | - entities (list): A list of predicted entities. Each item in the list has exactly 124 | five values: start_token_index, end_token_index, entity_type, mention_type, score. 125 | For example, "[3, 5, "GPE", "NAM", 1.0]" means the index of the start token is 3, 126 | index of the end token is 4 (5 - 1), entity type is GPE, mention type is NAM, 127 | and local score is 1.0. 128 | - triggers (list): A list of predicted triggers. It is similar to `entities`, but 129 | each item has four values: start_token_index, end_token_index, event_type, score. 130 | - relations (list): A list of predicted relations. Each item in the list has 131 | four values: arg1_entity_index, arg2_entity_index, relation_type, score. 
92 | ## Evaluation
93 |
94 | - `cd` to the root directory of this package
95 | - Set the environment variable PYTHONPATH to the current directory.
96 | For example, if you unpack this package to `~/oneie_v0.4.8`, run:
97 | `export PYTHONPATH=~/oneie_v0.4.8`
98 | - Example command to run OneIE: `python predict.py -m best.role.mdl -i input -o output -c output_cs --format ltf`
99 | + Arguments:
100 | - -m, --model_path: Path to the trained model.
101 | - -i, --input_dir: Path to the input directory. LTF format sample files can be found in the `input` directory.
102 | - -o, --output_dir: Path to the output directory (json format). Output files are in the JSON format. Sample files can be found in the `output` directory.
103 | - -c, --cs_dir: (optional) Path to the output directory (cs format). Sample files can be found in the `output_cs` directory.
104 | - -l, --log_path: (optional) Path to the log file. A sample file `log.json` can be found in `output`.
105 | - --gpu: (optional) Use GPU
106 | - -d, --device: (optional) GPU device index (for multi-GPU machines).
107 | - -b, --batch_size: (optional) Batch size. For a 16GB GPU, a batch size of 10~15 is a reasonable value.
108 | - --max_len: (optional) Max sentence length. Sentences longer than this value will be ignored. You may need to decrease `batch_size` if you set `max_len` to a larger number.
109 | - --beam_size: (optional) Beam size of the decoder. Increasing this value may improve the results but also slows down decoding.
110 | - --lang: (optional) Model language.
111 | - --format: Input file format (txt or ltf).
112 |
113 |
114 | # Output Format
115 |
116 | OneIE saves results in JSON format. Each line is a JSON object for a sentence
117 | containing the following fields:
118 | + doc_id (string): Document ID
119 | + sent_id (string): Sentence ID
120 | + tokens (list): A list of tokens
121 | + token_ids (list): A list of token IDs (doc_id:start_offset-end_offset)
122 | + graph (object): Information graph predicted by the model
123 | - entities (list): A list of predicted entities. Each item in the list has exactly
124 | five values: start_token_index, end_token_index, entity_type, mention_type, score.
125 | For example, "[3, 5, "GPE", "NAM", 1.0]" means the index of the start token is 3,
126 | index of the end token is 4 (5 - 1), entity type is GPE, mention type is NAM,
127 | and local score is 1.0.
128 | - triggers (list): A list of predicted triggers. It is similar to `entities`, except that
129 | each item has four values: start_token_index, end_token_index, event_type, score.
130 | - relations (list): A list of predicted relations. Each item in the list has
131 | four values: arg1_entity_index, arg2_entity_index, relation_type, score.
132 | In the following example, `[1, 0, "ORG-AFF", 0.52]` means there is an ORG-AFF relation
133 | between entity 1 ("leader") and entity 0 ("North Korean") with a local
134 | score of 0.52.
135 | The order of arg1 and arg2 can be ignored for "PER-SOC" as this relation is
136 | symmetric.
137 | - roles (list): A list of predicted argument roles. Each item has four values:
138 | trigger_index, entity_index, role, score.
139 | In the following example, `[0, 2, "Attacker", 0.8]` means entity 2 (Kim Jong Un) is
140 | the Attacker argument of event 0 ("detonate": Conflict:Attack), and the local
141 | score is 0.8.
142 |
143 | Output example:
144 | ```
145 | {"doc_id": "HC0003PYD", "sent_id": "HC0003PYD-16", "token_ids": ["HC0003PYD:2295-2296", "HC0003PYD:2298-2304", "HC0003PYD:2305-2305", "HC0003PYD:2307-2311", "HC0003PYD:2313-2318", "HC0003PYD:2320-2325", "HC0003PYD:2327-2329", "HC0003PYD:2331-2334", "HC0003PYD:2336-2337", "HC0003PYD:2339-2348", "HC0003PYD:2350-2351", "HC0003PYD:2353-2360", "HC0003PYD:2362-2362", "HC0003PYD:2364-2367", "HC0003PYD:2369-2376", "HC0003PYD:2378-2383", "HC0003PYD:2385-2386", "HC0003PYD:2388-2390", "HC0003PYD:2392-2397", "HC0003PYD:2399-2401", "HC0003PYD:2403-2408", "HC0003PYD:2410-2412", "HC0003PYD:2414-2415", "HC0003PYD:2417-2425", "HC0003PYD:2427-2428", "HC0003PYD:2430-2432", "HC0003PYD:2434-2437", "HC0003PYD:2439-2441", "HC0003PYD:2443-2447", "HC0003PYD:2449-2450", "HC0003PYD:2452-2454", "HC0003PYD:2456-2464", "HC0003PYD:2466-2472", "HC0003PYD:2474-2480", "HC0003PYD:2481-2481", "HC0003PYD:2483-2485", "HC0003PYD:2487-2491", "HC0003PYD:2493-2502", "HC0003PYD:2504-2509", "HC0003PYD:2511-2514", "HC0003PYD:2516-2523", "HC0003PYD:2524-2524"], "tokens": ["On", "Tuesday", ",", "North", "Korean", "leader", "Kim", "Jong", "Un", "threatened", "to", "detonate", "a", "more", "powerful", "H-bomb", "in", "the", "future", "and", "called", "for", "an", "expansion", "of", "the", "size", "and", "power", "of", "his", "country's", "nuclear", "arsenal", ",", "the", "state", "television", "agency", "KCNA", "reported", "."], "graph": {"entities": [[3, 5, "GPE", "NAM", 1.0], [5, 6, "PER", "NOM", 0.2], [6, 9, "PER", "NAM", 0.5060472888322202], [15, 16, "WEA", "NOM", 0.5332313915378754], [30, 31, "PER", "PRO", 1.0], [32, 33, "WEA", "NOM", 1.0], [33, 34, "WEA", "NOM", 0.5212696155645499], [36, 37, "GPE", "NOM", 0.4998288792916457], [38, 39, "ORG", "NOM", 1.0], [39, 40, "ORG", "NAM", 0.5294904130032032]], "triggers": [[11, 12, "Conflict:Attack", 1.0]], "relations": [[1, 0, "ORG-AFF", 1.0]], "roles": [[0, 2, "Attacker", 0.4597024700555278], [0, 3, "Instrument", 1.0]]}}
146 | ```
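Entity references in `relations` and `roles` are indices into the `entities` list, so a small decoding loop is handy when reading the output. A minimal sketch (the file name matches the example above; field layout as documented):

```
import json

with open('HC0003PYD.json', encoding='utf-8') as f:  # one output file
    for line in f:
        sent = json.loads(line)
        tokens = sent['tokens']
        entities = sent['graph']['entities']
        # [start, end, entity_type, mention_type, score]; end is exclusive
        for start, end, enttype, mentype, score in entities:
            print(' '.join(tokens[start:end]), enttype, mentype, score)
        # [arg1_entity_index, arg2_entity_index, relation_type, score]
        for arg1, arg2, reltype, score in sent['graph']['relations']:
            head = ' '.join(tokens[entities[arg1][0]:entities[arg1][1]])
            tail = ' '.join(tokens[entities[arg2][0]:entities[arg2][1]])
            print(head, '--{}->'.format(reltype), tail)
```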
-------------------------------------------------------------------------------- /evaluations/supervised-ie/config.py: --------------------------------------------------------------------------------
1 | import copy
2 | import json
3 | import os
4 |
5 | from typing import Dict, Any
6 |
7 | from transformers import (BertConfig, RobertaConfig, XLMRobertaConfig,
8 | PretrainedConfig)
9 |
10 | class Config(object):
11 | def __init__(self, **kwargs):
12 | self.coref = kwargs.pop('coref', True)
13 | # bert
14 | self.bert_model_name = kwargs.pop('bert_model_name', 'bert-large-cased')
15 | self.bert_cache_dir = kwargs.pop('bert_cache_dir', None)
16 | self.extra_bert = kwargs.pop('extra_bert', -1)
17 | self.use_extra_bert = kwargs.pop('use_extra_bert', False)
18 | # global features
19 | self.use_global_features = kwargs.get('use_global_features', False)
20 | self.global_features = kwargs.get('global_features', [])
21 | # model
22 | self.multi_piece_strategy = kwargs.pop('multi_piece_strategy', 'first')
23 | self.bert_dropout = kwargs.pop('bert_dropout', .5)
24 | self.linear_dropout = kwargs.pop('linear_dropout', .4)
25 | self.linear_bias = kwargs.pop('linear_bias', True)
26 | self.linear_activation = kwargs.pop('linear_activation', 'relu')
27 | self.entity_hidden_num = kwargs.pop('entity_hidden_num', 150)
28 | self.mention_hidden_num = kwargs.pop('mention_hidden_num', 150)
29 | self.event_hidden_num = kwargs.pop('event_hidden_num', 600)
30 | self.relation_hidden_num = kwargs.pop('relation_hidden_num', 150)
31 | self.role_hidden_num = kwargs.pop('role_hidden_num', 600)
32 | self.use_entity_type = kwargs.pop('use_entity_type', False)
33 | self.beam_size = kwargs.pop('beam_size', 5)
34 | self.beta_v = kwargs.pop('beta_v', 2)
35 | self.beta_e = kwargs.pop('beta_e', 2)
36 | self.relation_mask_self = kwargs.pop('relation_mask_self', True)
37 | self.relation_directional = kwargs.pop('relation_directional', False)
38 | self.symmetric_relations = set(kwargs.pop('symmetric_relations', ['PER-SOC']))
39 | # files
40 | self.train_file = kwargs.pop('train_file', None)
41 | self.dev_file = kwargs.pop('dev_file', None)
42 | self.test_file = kwargs.pop('test_file', None)
43 | self.valid_pattern_path = kwargs.pop('valid_pattern_path', None)
44 | self.log_path = kwargs.pop('log_path', None)
45 | # training
46 | self.accumulate_step = kwargs.pop('accumulate_step', 1)
47 | self.batch_size = kwargs.pop('batch_size', 10)
48 | self.eval_batch_size = kwargs.pop('eval_batch_size', 5)
49 | self.max_epoch = kwargs.pop('max_epoch', 50)
50 | self.learning_rate = kwargs.pop('learning_rate', 1e-3)
51 | self.bert_learning_rate = kwargs.pop('bert_learning_rate', 1e-5)
52 | self.weight_decay = kwargs.pop('weight_decay', 0.001)
53 | self.bert_weight_decay = kwargs.pop('bert_weight_decay', 0.00001)
54 | self.warmup_epoch = kwargs.pop('warmup_epoch', 5)
55 | self.grad_clipping = kwargs.pop('grad_clipping', 5.0)
56 | # others
57 | self.use_gpu = kwargs.pop('use_gpu', True)
58 | self.gpu_device = kwargs.pop('gpu_device', -1)
59 |
60 | @classmethod
61 | def from_dict(cls, dict_obj):
62 | """Creates a Config object from a dictionary.
63 | Args:
64 | dict_obj (Dict[str, Any]): a dict where keys are attribute names and values are attribute values.
65 | """
66 | config = cls()
67 | for k, v in dict_obj.items():
68 | setattr(config, k, v)
69 | return config
70 |
71 | @classmethod
72 | def from_json_file(cls, path):
73 | with open(path, 'r', encoding='utf-8') as r:
74 | return cls.from_dict(json.load(r))
75 |
76 | def to_dict(self):
77 | output = copy.deepcopy(self.__dict__)
78 | return output
79 |
80 | def save_config(self, path):
81 | """Save a configuration object to a file.
82 | :param path (str): path to the output file or its parent directory.
83 | """ 84 | if os.path.isdir(path): 85 | path = os.path.join(path, 'config.json') 86 | print('Save config to {}'.format(path)) 87 | with open(path, 'w', encoding='utf-8') as w: 88 | w.write(json.dumps(self.to_dict(), indent=2, 89 | sort_keys=True)) 90 | @property 91 | def bert_config(self): 92 | if self.bert_model_name.startswith('bert-'): 93 | return BertConfig.from_pretrained(self.bert_model_name, 94 | cache_dir=self.bert_cache_dir) 95 | elif self.bert_model_name.startswith('roberta-'): 96 | return RobertaConfig.from_pretrained(self.bert_model_name, 97 | cache_dir=self.bert_cache_dir) 98 | elif self.bert_model_name.startswith('xlm-roberta-'): 99 | return XLMRobertaConfig.from_pretrained(self.bert_model_name, 100 | cache_dir=self.bert_cache_dir) 101 | else: 102 | return BertConfig.from_pretrained(self.bert_model_name) -------------------------------------------------------------------------------- /evaluations/supervised-ie/config/ace.json: -------------------------------------------------------------------------------- 1 | { 2 | "bert_model_name": "/shared/nas/data/m1/liliang3/checkpoint_final/", 3 | "bert_cache_dir": "", 4 | "multi_piece_strategy": "average", 5 | "bert_dropout": 0.5, 6 | "use_extra_bert": true, 7 | "extra_bert": -3, 8 | 9 | "use_global_features": false, 10 | "global_features": [], 11 | "global_warmup": 0, 12 | 13 | "linear_dropout": 0.4, 14 | "linear_bias": true, 15 | "entity_hidden_num": 150, 16 | "mention_hidden_num": 150, 17 | "event_hidden_num": 600, 18 | "relation_hidden_num": 150, 19 | "role_hidden_num": 600, 20 | "use_entity_type": true, 21 | "beam_size": 20, 22 | "beta_v": 2, 23 | "beta_e": 2, 24 | "relation_mask_self": true, 25 | "relation_directional": false, 26 | "symmetric_relations": ["PER-SOC"], 27 | 28 | "train_file": "./data/ace_bert_uncased/train.oneie.json", 29 | "dev_file": "./data/ace_bert_uncased/dev.oneie.json", 30 | "test_file": "./data/ace_bert_uncased/test.oneie.json", 31 | "log_path": "./log", 32 | "valid_pattern_path": "./resource/ere_patterns", 33 | "ignore_title": false, 34 | "ignore_first_header": false, 35 | 36 | "accumulate_step": 1, 37 | "batch_size": 10, 38 | "eval_batch_size": 10, 39 | "max_epoch": 100, 40 | "learning_rate": 1e-4, 41 | "bert_learning_rate": 1e-5, 42 | "weight_decay": 1e-3, 43 | "bert_weight_decay": 1e-5, 44 | "warmup_epoch": 5, 45 | "grad_clipping": 5.0, 46 | 47 | "use_gpu": true, 48 | "gpu_device": 1 49 | } -------------------------------------------------------------------------------- /evaluations/supervised-ie/config/ere.json: -------------------------------------------------------------------------------- 1 | { 2 | "bert_model_name": "/shared/nas/data/m1/liliang3/checkpoint_final/", 3 | "bert_cache_dir": "", 4 | "multi_piece_strategy": "average", 5 | "bert_dropout": 0.5, 6 | "use_extra_bert": true, 7 | "extra_bert": -3, 8 | 9 | "use_global_features": false, 10 | "global_features": [], 11 | "global_warmup": 0, 12 | 13 | "linear_dropout": 0.4, 14 | "linear_bias": true, 15 | "entity_hidden_num": 150, 16 | "mention_hidden_num": 150, 17 | "event_hidden_num": 600, 18 | "relation_hidden_num": 150, 19 | "role_hidden_num": 600, 20 | "use_entity_type": true, 21 | "beam_size": 20, 22 | "beta_v": 2, 23 | "beta_e": 2, 24 | "relation_mask_self": true, 25 | "relation_directional": false, 26 | "symmetric_relations": ["PER-SOC"], 27 | 28 | "train_file": "./data/ere_bert_uncased/train.oneie.json", 29 | "dev_file": "./data/ere_bert_uncased/dev.oneie.json", 30 | "test_file": "./data/ere_bert_uncased/test.oneie.json", 31 | 
"log_path": "./log", 32 | "valid_pattern_path": "./resource/ere_patterns", 33 | "ignore_title": false, 34 | "ignore_first_header": false, 35 | 36 | "accumulate_step": 1, 37 | "batch_size": 10, 38 | "eval_batch_size": 10, 39 | "max_epoch": 100, 40 | "learning_rate": 1e-4, 41 | "bert_learning_rate": 1e-5, 42 | "weight_decay": 1e-3, 43 | "bert_weight_decay": 1e-5, 44 | "warmup_epoch": 5, 45 | "grad_clipping": 5.0, 46 | 47 | "use_gpu": true, 48 | "gpu_device": 1 49 | } -------------------------------------------------------------------------------- /evaluations/supervised-ie/convert.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | 5 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | entity_type_mapping_file = os.path.join(cur_dir, 'resource', 'ace_to_aida_entity.tsv') 8 | event_type_mapping_file = os.path.join(cur_dir, 'resource', 'ace_to_aida_event.tsv') 9 | role_type_mapping_file = os.path.join(cur_dir, 'resource', 'ace_to_aida_role.tsv') 10 | relation_type_mapping_file = os.path.join(cur_dir, 'resource', 'ace_to_aida_relation.tsv') 11 | 12 | def load_mapping(mapping_file): 13 | mapping = {} 14 | with open(mapping_file, 'r', encoding='utf-8') as r: 15 | for line in r: 16 | from_type, to_type = line.strip().split('\t') 17 | mapping[from_type] = to_type 18 | return mapping 19 | 20 | 21 | def get_span_mention_text(tokens, token_ids, start, end): 22 | if start + 1 == end: 23 | return tokens[start], token_ids[start] 24 | 25 | start_token = tokens[start] 26 | end_token = tokens[end - 1] 27 | start_char = int(token_ids[start].split(':')[1].split('-')[0]) 28 | end_char = int(token_ids[end - 1].split(':')[1].split('-')[1]) 29 | text = ' ' * (end_char - start_char + 1) 30 | for token, token_id in zip(tokens[start:end], token_ids[start:end]): 31 | token_start, token_end = token_id.split(':')[1].split('-') 32 | token_start, token_end = int(token_start), int(token_end) 33 | token_start -= start_char 34 | token_end -= start_char 35 | assert len(text[:token_start] + token + text[token_end + 1:]) == len(text) 36 | text = text[:token_start] + token + text[token_end + 1:] 37 | return text, '{}:{}-{}'.format(token_ids[start].split(':')[0], 38 | start_char, end_char) 39 | 40 | 41 | def json_to_cs(input_dir, output_dir): 42 | # TODO: add the first cs line 43 | entity_type_mapping = load_mapping(entity_type_mapping_file) 44 | relation_type_mapping = load_mapping(relation_type_mapping_file) 45 | event_type_mapping = load_mapping(event_type_mapping_file) 46 | role_type_mapping = load_mapping(role_type_mapping_file) 47 | 48 | json_files = glob.glob(os.path.join(input_dir, '*.json')) 49 | # convert entities 50 | print('Converting entity mentions and generate entity cs file') 51 | entity_mapping = {} 52 | entity_id_mapping = {} 53 | entity_cs_file = os.path.join(output_dir, 'entity.cs') 54 | with open(entity_cs_file, 'w', encoding='utf-8') as w: 55 | for f in json_files: 56 | with open(f, 'r', encoding='utf-8') as r: 57 | for line in r: 58 | result = json.loads(line) 59 | doc_id = result['doc_id'] 60 | sent_id = result['sent_id'] 61 | tokens, token_ids = result['tokens'], result['token_ids'] 62 | for i, (start, end, enttype, mentype, _) in enumerate(result['graph']['entities']): 63 | entity_text, entity_span = get_span_mention_text( 64 | tokens, token_ids, start, end) 65 | entity_id = 'Entity_EDL_{:07d}'.format(len(entity_mapping) + 1) 66 | entity_mapping[(sent_id, i)] = (entity_text, entity_id, entity_span, enttype, 
mentype) 67 | entity_id_mapping[entity_id] = (sent_id, i) 68 | enttype_mapped = entity_type_mapping[enttype] 69 | w.write(':{}\ttype\t{}\t1.000000\n'.format(entity_id, enttype_mapped)) 70 | w.write(':{}\tcanonical_mention\t"{}"\t{}\t0.000\n'.format( 71 | entity_id, entity_text, entity_span)) 72 | w.write(':{}\tmention\t"{}"\t{}\t0.000\n'.format( 73 | entity_id, entity_text, entity_span)) 74 | # skip the link line 75 | 76 | # converting relations and events 77 | print('Converting relations and events') 78 | event_count = 0 79 | relation_cs_file = os.path.join(output_dir, 'relation.cs') 80 | event_cs_file = os.path.join(output_dir, 'event.cs') 81 | with open(relation_cs_file, 'w', encoding='utf-8') as rel_w, \ 82 | open(event_cs_file, 'w', encoding='utf-8') as evt_w: 83 | for f in json_files: 84 | with open(f, 'r', encoding='utf-8') as r: 85 | for line in r: 86 | result = json.loads(line) 87 | sent_id = result['sent_id'] 88 | tokens, token_ids = result['tokens'], result['token_ids'] 89 | relations = result['graph']['relations'] 90 | triggers = result['graph']['triggers'] 91 | roles = result['graph']['roles'] 92 | # sentence span 93 | sent_span = '{}:{}-{}'.format( 94 | token_ids[0].split(':')[0], 95 | token_ids[0].split(':')[1].split('-')[0], 96 | token_ids[-1].split(':')[1].split('-')[1]) 97 | # convert relations 98 | for arg1, arg2, reltype, _ in relations: 99 | if reltype == 'ART': 100 | continue 101 | entity_id_1 = entity_mapping[(sent_id, arg1)][1] 102 | entity_id_2 = entity_mapping[(sent_id, arg2)][1] 103 | reltype_mapped = relation_type_mapping[reltype] 104 | rel_w.write(':{}\t{}\t:{}\t{}\t1.000\n'.format( 105 | entity_id_1, reltype_mapped, entity_id_2, sent_span 106 | )) 107 | # convert events 108 | for cur_trigger_idx, (start, end, eventtype, _) in enumerate(triggers): 109 | event_count += 1 110 | event_id = 'Event_{:06d}'.format(event_count) 111 | trigger_text, trigger_span = get_span_mention_text( 112 | tokens, token_ids, start, end) 113 | eventtype_mapped = event_type_mapping[eventtype] 114 | evt_w.write(':{}\ttype\t{}\n'.format(event_id, eventtype_mapped)) 115 | evt_w.write(':{}\tmention.actual\t"{}"\t{}\t1.000\n'.format( 116 | event_id, trigger_text, trigger_span)) 117 | evt_w.write(':{}\tcanonical_mention.actual\t"{}"\t{}\t1.000\n'.format( 118 | event_id, trigger_text, trigger_span)) 119 | for trigger_idx, entity_idx, role, _ in roles: 120 | if cur_trigger_idx == trigger_idx: 121 | role_mapped = role_type_mapping['{}:{}'.format(eventtype, role).lower()] 122 | _, entity_id, entity_span, _, _ = entity_mapping[(sent_id, entity_idx)] 123 | evt_w.write(':{}\t{}.actual\t{}\t{}\t1.000\n'.format( 124 | event_id, role_mapped, entity_id, entity_span)) -------------------------------------------------------------------------------- /evaluations/supervised-ie/global_feature.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | import numpy as np 4 | 5 | from collections import Counter 6 | 7 | 8 | def generate_global_feature_maps(vocabs, valid_patterns): 9 | """ 10 | Note that feature maps here refer to "feature-index mappings", not feature 11 | maps in CNNs. 12 | :param vocabs: vocabularies. 13 | :param valid_patterns: valid patterns (only event-role patterns are used). 14 | :return (dict): a dictionary of feature-index maps. 
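Note: each returned map uses integer keys that pack the component label
indices into the digits of a single number (e.g. role1 * 100 + role2
below); this scheme assumes every label vocabulary index is below 100.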
15 | """ 16 | event_type_vocab = vocabs['event_type'] 17 | entity_type_vocab = vocabs['entity_type'] 18 | role_type_vocab = vocabs['role_type'] 19 | relation_type_vocab = vocabs['relation_type'] 20 | event_role = valid_patterns['event_role'] 21 | 22 | # 1. role role: the number of entities that act as and 23 | # arguments at the same time 24 | role_role_map = set() 25 | for role1 in role_type_vocab.values(): 26 | for role2 in role_type_vocab.values(): 27 | if role1 and role2: 28 | if role1 < role2: 29 | key = role1 * 100 + role2 30 | else: 31 | key = role2 * 100 + role1 32 | role_role_map.add(key) 33 | role_role_map = sorted(list(role_role_map)) 34 | role_role_map = {k: i for i, k in enumerate(role_role_map)} 35 | 36 | # 2. event role num: the number of events with 37 | # arguments 38 | event_role_num_map = list() 39 | for event in event_type_vocab.values(): 40 | for role in role_type_vocab.values(): 41 | if event and role: 42 | key = event * 1000 + role * 10 43 | event_role_num_map.append(key + 1) 44 | event_role_num_map.append(key + 2) 45 | event_role_num_map.sort() 46 | event_role_num_map = {k: i for i, k in enumerate(event_role_num_map)} 47 | 48 | # 3. role entity: the number of occurrences of and 49 | # combination 50 | role_entity_map = list() 51 | for role in role_type_vocab.values(): 52 | for entity in entity_type_vocab.values(): 53 | if role and entity: 54 | role_entity_map.append(role * 100 + entity) 55 | role_entity_map.sort() 56 | role_entity_map = {k: i for i, k in enumerate(role_entity_map)} 57 | 58 | # 4. multiple role 59 | multi_role_map = [role for role in role_type_vocab.values() if role] 60 | multi_role_map.sort() 61 | multi_role_map = {k: i for i, k in enumerate(multi_role_map)} 62 | 63 | # 5. event role event role: the number of entities that act as a 64 | # argument of an event and a argument of an 65 | # event at the same time 66 | event_role_event_role_map = set() 67 | for event_role1 in event_role: 68 | for event_role2 in event_role: 69 | event1 = event_role1 // 100 70 | event2 = event_role2 // 100 71 | role1 = event_role1 % 100 72 | role2 = event_role2 % 100 73 | if event1 < event2: 74 | key = event1 * 1000000 + role1 * 10000 + event2 * 100 + role2 75 | else: 76 | key = event2 * 1000000 + role2 * 10000 + event1 * 100 + role1 77 | event_role_event_role_map.add(key) 78 | event_role_event_role_map = sorted(list(event_role_event_role_map)) 79 | event_role_event_role_map = {k: i for i, k in enumerate(event_role_event_role_map)} 80 | 81 | # 6. relation entity entity: the number of occurrences of , 82 | # , and combination 83 | relation_entity_entity_map = set() 84 | for relation in relation_type_vocab.values(): 85 | for entity1 in entity_type_vocab.values(): 86 | for entity2 in entity_type_vocab.values(): 87 | if relation and entity1 and entity2: 88 | key = relation * 10000 89 | if entity1 < entity2: 90 | key += entity1 * 100 + entity2 91 | else: 92 | key += entity2 * 100 + entity1 93 | relation_entity_entity_map.add(key) 94 | relation_entity_entity_map = sorted(list(relation_entity_entity_map)) 95 | relation_entity_entity_map = {k: i for i, k in enumerate(relation_entity_entity_map)} 96 | 97 | # 7. relation entity: the number of occurrences of and 98 | # combination 99 | relation_entity_map = [relation * 100 + entity 100 | for relation in relation_type_vocab.values() 101 | for entity in entity_type_vocab.values() 102 | if relation and entity] 103 | relation_entity_map.sort() 104 | relation_entity_map = {k: i for i, k in enumerate(relation_entity_map)} 105 | 106 | # 8. 
107 | # relation between a <role1> argument and a <role2> argument of the same
108 | # event
109 | relation_role_role_map = set()
110 | for relation in relation_type_vocab.values():
111 | for role1 in role_type_vocab.values():
112 | for role2 in role_type_vocab.values():
113 | if relation and role1 and role2:
114 | key = relation * 10000
115 | if role1 < role2:
116 | key += role1 * 100 + role2
117 | else:
118 | key += role2 * 100 + role1
119 | relation_role_role_map.add(key)
120 | relation_role_role_map = sorted(list(relation_role_role_map))
121 | relation_role_role_map = {k: i for i, k in enumerate(relation_role_role_map)}
122 |
123 | # 9. multiple relation: the number of entities that have a <relation>
124 | # relation with multiple entities
125 | multi_relation_map = [relation for relation in relation_type_vocab.values()
126 | if relation]
127 | multi_relation_map.sort()
128 | multi_relation_map = {k: i for i, k in enumerate(multi_relation_map)}
129 |
130 | # 10. relation relation: the number of entities involved in <relation1>
131 | # and <relation2> relations simultaneously
132 | relation_relation_map = set()
133 | for relation1 in relation_type_vocab.values():
134 | for relation2 in relation_type_vocab.values():
135 | if relation1 and relation2:
136 | key = relation1 * 100 + relation2 if relation1 < relation2 \
137 | else relation2 * 100 + relation1
138 | relation_relation_map.add(key)
139 | relation_relation_map = sorted(list(relation_relation_map))
140 | relation_relation_map = {k: i for i, k in enumerate(relation_relation_map)}
141 |
142 | # 11. multiple event: whether a graph contains more than one <event>
143 | # event
144 | multi_event_map = [event for event in event_type_vocab.values() if event]
145 | multi_event_map.sort()
146 | multi_event_map = {k: i for i, k in enumerate(multi_event_map)}
147 |
148 | return {
149 | 'role_role': role_role_map,
150 | 'event_role_num': event_role_num_map,
151 | 'role_entity': role_entity_map,
152 | 'multi_role': multi_role_map,
153 | 'event_role_event_role': event_role_event_role_map,
154 | 'relation_entity_entity': relation_entity_entity_map,
155 | 'relation_entity': relation_entity_map,
156 | 'relation_role_role': relation_role_role_map,
157 | 'multi_relation': multi_relation_map,
158 | 'relation_relation': relation_relation_map,
159 | 'multi_event': multi_event_map
160 | }
161 |
162 |
163 | def generate_global_feature_vector(graph,
164 | global_feature_maps,
165 | features=None):
166 | role_role_map = global_feature_maps['role_role']
167 | role_role_vec = np.zeros(len(role_role_map))
168 | role_entity_map = global_feature_maps['role_entity']
169 | role_entity_vec = np.zeros(len(role_entity_map))
170 | event_role_num_map = global_feature_maps['event_role_num']
171 | event_role_num_vec = np.zeros(len(event_role_num_map))
172 | multi_role_map = global_feature_maps['multi_role']
173 | multi_role_vec = np.zeros(len(multi_role_map))
174 | event_role_event_role_map = global_feature_maps['event_role_event_role']
175 | event_role_event_role_vec = np.zeros(len(event_role_event_role_map))
176 | relation_entity_entity_map = global_feature_maps['relation_entity_entity']
177 | relation_entity_entity_vec = np.zeros(len(relation_entity_entity_map))
178 | relation_entity_map = global_feature_maps['relation_entity']
179 | relation_entity_vec = np.zeros(len(relation_entity_map))
180 | relation_role_role_map = global_feature_maps['relation_role_role']
181 | relation_role_role_vec = np.zeros(len(relation_role_role_map))
182 | multi_relation_map =
global_feature_maps['multi_relation'] 183 | multi_relation_vec = np.zeros(len(multi_relation_map)) 184 | relation_relation_map = global_feature_maps['relation_relation'] 185 | relation_relation_vec = np.zeros(len(relation_relation_map)) 186 | multi_event_map = global_feature_maps['multi_event'] 187 | multi_event_vec = np.zeros(len(multi_event_map)) 188 | 189 | # event argument role related features 190 | entity_roles = [[] for _ in range(graph.entity_num)] 191 | entity_event_role = [[] for _ in range(graph.entity_num)] 192 | event_role_count = [Counter() for _ in range(graph.trigger_num)] 193 | for trigger_idx, entity_idx, role in graph.roles: 194 | entity_roles[entity_idx].append(role) 195 | entity_event_role[entity_idx].append( 196 | (graph.triggers[trigger_idx][-1], role)) 197 | event_role_count[trigger_idx][role] += 1 198 | # 3. role entity 199 | role_entity = role * 100 + graph.entities[entity_idx][-1] 200 | if role_entity in role_entity_map: 201 | role_entity_vec[role_entity_map[role_entity]] += 1 202 | # 1. role role 203 | for roles in entity_roles: 204 | for role1, role2 in itertools.combinations(roles, 2): 205 | key = role1 * 100 + role2 if role1 < role2 \ 206 | else role2 * 100 + role1 207 | if key in role_role_map: 208 | role_role_vec[role_role_map[key]] += 1 209 | # 2. event role num & 4. multiple role 210 | for event, role_count in enumerate(event_role_count): 211 | for role, count in role_count.items(): 212 | # to reduce the number of features, we treat numbers > 2 as 2 213 | key = graph.triggers[event][-1] * 1000 + role * 10 + min(count, 2) 214 | if key in event_role_num_map: 215 | event_role_num_vec[event_role_num_map[key]] += 1 216 | if count > 1 and role in multi_role_map: 217 | multi_role_vec[multi_role_map[role]] += 1 218 | # 5. event role event role 219 | for event_role_pairs in entity_event_role: 220 | for (event1, role1), (event2, role2) in itertools.combinations( 221 | event_role_pairs, 2): 222 | if event1 < event2: 223 | key = event1 * 1000000 + role1 * 10000 + event2 * 100 + role2 224 | else: 225 | key = event2 * 1000000 + role2 * 10000 + event1 * 100 + role1 226 | if key in event_role_event_role_map: 227 | event_role_event_role_vec[event_role_event_role_map[key]] += 1 228 | 229 | # relation related features 230 | entity_role_unique = [set(x) for x in entity_roles] 231 | entity_relation_count = [Counter() for _ in range(graph.entity_num)] 232 | for entity_idx1, entity_idx2, relation in graph.relations: 233 | entity_relation_count[entity_idx1][relation] += 1 234 | entity_relation_count[entity_idx2][relation] += 1 235 | entity1 = graph.entities[entity_idx1][-1] 236 | entity2 = graph.entities[entity_idx2][-1] 237 | # 6. relation entity entity 238 | if entity1 < entity2: 239 | key = relation * 10000 + entity1 * 100 + entity2 240 | else: 241 | key = relation * 10000 + entity2 * 100 + entity1 242 | if key in relation_entity_entity_map: 243 | relation_entity_entity_vec[relation_entity_entity_map[key]] += 1 244 | # 7. relation entity 245 | key1 = relation * 100 + entity1 246 | key2 = relation * 100 + entity2 247 | if key1 in relation_entity_map: 248 | relation_entity_vec[relation_entity_map[key1]] += 1 249 | if key2 in relation_entity_map: 250 | relation_entity_vec[relation_entity_map[key2]] += 1 251 | # 8. 
relation role role 252 | roles1 = entity_role_unique[entity_idx1] 253 | roles2 = entity_role_unique[entity_idx2] 254 | for role1 in roles1: 255 | for role2 in roles2: 256 | if role1 < role2: 257 | key = relation * 10000 + role1 * 100 + role2 258 | else: 259 | key = relation * 10000 + role2 * 100 + role1 260 | if key in relation_role_role_map: 261 | relation_role_role_vec[relation_role_role_map[key]] += 1 262 | # 9. multiple relation & 10. relation relation 263 | for relation_count in entity_relation_count: 264 | relations = [] 265 | for relation, count in relation_count.items(): 266 | relations.append(relation) 267 | if count > 1: 268 | relations.append(relation) 269 | if relation in multi_relation_map: 270 | multi_relation_vec[multi_relation_map[relation]] += 1 271 | for relation1, relation2 in itertools.combinations(relations, 2): 272 | if relation1 < relation2: 273 | key = relation1 * 100 + relation2 274 | else: 275 | key = relation2 * 100 + relation1 276 | if key in relation_relation_map: 277 | relation_relation_vec[relation_relation_map[key]] += 1 278 | 279 | # 11. multiple event 280 | trigger_count = Counter() 281 | for _, _, trigger in graph.triggers: 282 | trigger_count[trigger] += 1 283 | for trigger, count in trigger_count.items(): 284 | if count > 1 and trigger in multi_event_map: 285 | multi_event_vec[multi_event_map[trigger]] = 1 286 | 287 | feature_vector = np.concatenate( 288 | [role_role_vec, event_role_num_vec, role_entity_vec, 289 | multi_role_vec, event_role_event_role_vec, relation_entity_entity_vec, 290 | relation_entity_vec, relation_role_role_vec, 291 | multi_relation_vec, relation_relation_vec, multi_event_vec] 292 | ) 293 | 294 | if features: 295 | vectors = { 296 | 'role_role': role_role_vec, 297 | 'event_role_num': event_role_num_vec, 298 | 'role_entity': role_entity_vec, 299 | 'multi_role': multi_role_vec, 300 | 'event_role_event_role': event_role_event_role_vec, 301 | 'relation_entity_entity': relation_entity_entity_vec, 302 | 'relation_entity': relation_entity_vec, 303 | 'relation_role_role': relation_role_role_vec, 304 | 'multi_relation': multi_relation_vec, 305 | 'relation_relation': relation_relation_vec, 306 | 'multi_event': multi_event_vec 307 | } 308 | feature_vector = np.concatenate([vectors[k] for k in features]) 309 | else: 310 | feature_vector = np.concatenate( 311 | [role_role_vec, event_role_num_vec, role_entity_vec, 312 | multi_role_vec, event_role_event_role_vec, relation_entity_entity_vec, 313 | relation_entity_vec, relation_role_role_vec, 314 | multi_relation_vec, relation_relation_vec, multi_event_vec] 315 | ) 316 | return feature_vector 317 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/graph.py: -------------------------------------------------------------------------------- 1 | class Graph(object): 2 | def __init__(self, entities, triggers, relations, roles, vocabs, mentions=None): 3 | """ 4 | :param entities (list): A list of entities represented as a tuple of 5 | (start_offset, end_offset, label_idx). end_offset = the index of the end 6 | token + 1. 7 | :param triggers (list): A list of triggers represented as a tuple of 8 | (start_offset, end_offset, label_idx). end_offset = the index of the end 9 | token + 1. 10 | :param relations (list): A list of relations represented as a tuple of 11 | (entity_idx_1, entity_idx_2, label_idx). As we do not consider the 12 | direction of relations (list), it is better to have entity_idx_1 < 13 | entity_idx2. 
14 | :param roles: A list of roles represented as a tuple of (trigger_idx_1, 15 | entity_idx_2, label_idx). 16 | :param vocabs (dict): Label type vocabularies. 17 | """ 18 | self.entities = entities 19 | self.triggers = triggers 20 | self.relations = relations 21 | self.roles = roles 22 | self.vocabs = vocabs 23 | self.mentions = [] if mentions is None else mentions 24 | 25 | self.entity_num = len(entities) 26 | self.trigger_num = len(triggers) 27 | self.relation_num = len(relations) 28 | self.role_num = len(roles) 29 | self.graph_local_score = 0.0 30 | 31 | # subscores 32 | self.entity_scores = [] 33 | self.trigger_scores = [] 34 | self.relation_scores = [] 35 | self.role_scores = [] 36 | 37 | def __eq__(self, other): 38 | if isinstance(other, Graph): 39 | equal = (self.entities == other.entities and 40 | self.triggers == other.triggers and 41 | self.relations == other.relations and 42 | self.roles == other.roles and 43 | self.mentions == other.mentions) 44 | return equal 45 | return False 46 | 47 | 48 | def to_dict(self): 49 | """Convert a graph to a dict object 50 | :return (dict): A dictionary representing the graph, where label indices 51 | have been replaced with label strings. 52 | """ 53 | entity_itos = {i: s for s, i in self.vocabs['entity_type'].items()} 54 | trigger_itos = {i: s for s, i in self.vocabs['event_type'].items()} 55 | relation_itos = {i: s for s, i in self.vocabs['relation_type'].items()} 56 | role_itos = {i: s for s, i in self.vocabs['role_type'].items()} 57 | mention_itos = {i: s for s, i in self.vocabs['mention_type'].items()} 58 | 59 | # entities = [[i, j, entity_itos[k], mention_itos[l]] for (i, j, k), (_, _, l) in zip(self.entities, self.mentions)] 60 | # triggers = [[i, j, trigger_itos[k]] for i, j, k in self.triggers] 61 | # relations = [[i, j, relation_itos[k]] for i, j, k in self.relations] 62 | # roles = [[i, j, role_itos[k]] for i, j, k in self.roles] 63 | 64 | entities = [[i, j, entity_itos[k], mention_itos[l], s] for (i, j, k), (_, _, l), s in zip(self.entities, self.mentions, self.entity_scores)] 65 | triggers = [[i, j, trigger_itos[k], l] for (i, j, k), l in zip(self.triggers, self.trigger_scores)] 66 | relations = [[i, j, relation_itos[k], l] for (i, j, k), l in zip(self.relations, self.relation_scores)] 67 | roles = [[i, j, role_itos[k], l] for (i, j, k), l in zip(self.roles, self.role_scores)] 68 | 69 | return { 70 | 'entities': entities, 71 | 'triggers': triggers, 72 | 'relations': relations, 73 | 'roles': roles, 74 | } 75 | 76 | def __str__(self): 77 | return str(self.to_dict()) 78 | 79 | def copy(self): 80 | """Make a copy of the graph 81 | :return (Graph): a copy of the current graph. 
82 | """ 83 | graph = Graph( 84 | entities=self.entities.copy(), 85 | triggers=self.triggers.copy(), 86 | relations=self.relations.copy(), 87 | roles=self.roles.copy(), 88 | mentions=self.mentions.copy(), 89 | vocabs=self.vocabs 90 | ) 91 | graph.graph_local_score = self.graph_local_score 92 | graph.entity_scores = self.entity_scores 93 | graph.trigger_scores = self.trigger_scores 94 | graph.relation_scores = self.relation_scores 95 | graph.role_scores = self.role_scores 96 | return graph 97 | 98 | def clean(self, relation_directional=False, symmetric_relations=None): 99 | # self.entities.sort(key=lambda x: (x[0], x[1])) 100 | # self.triggers.sort(key=lambda x: (x[0], x[1])) 101 | # self.relations.sort(key=lambda x: (x[0], x[1])) 102 | # self.roles.sort(key=lambda x: (x[0], x[1])) 103 | 104 | entities = [(i, j, k, l) for (i, j, k), l in zip(self.entities, self.entity_scores)] 105 | triggers = [(i, j, k, l) for (i, j, k), l in zip(self.triggers, self.trigger_scores)] 106 | relations = [(i, j, k, l) for (i, j, k), l in zip(self.relations, self.relation_scores)] 107 | roles = [(i, j, k, l) for (i, j, k), l in zip(self.roles, self.role_scores)] 108 | 109 | # coref_idx = self.vocabs['relation_type'].get('COREF', None) 110 | # if coref_idx is not None: 111 | # relations, corefs = [], [] 112 | # for i, j, k in self.relations: 113 | # if k == coref_idx: 114 | # corefs.append((i, j, k)) 115 | # else: 116 | # relations.append((i, j, k)) 117 | # self.relations = relations 118 | # self.corefs = corefs 119 | 120 | # clean relations 121 | if relation_directional and symmetric_relations: 122 | relation_itos = {i: s for s, i in self.vocabs['relation_type'].items()} 123 | # relations = [] 124 | relations_tmp = [] 125 | # for i, j, k in self.relations: 126 | for i, j, k, l in relations: 127 | if relation_itos[k] not in symmetric_relations: 128 | # relations.append((i, j, k)) 129 | relations_tmp.append((i, j, k, l)) 130 | else: 131 | if j < i: 132 | i, j = j, i 133 | relations_tmp.append((i, j, k, l)) 134 | # self.relations = relations 135 | relations = relations_tmp 136 | 137 | self.entities = [(i, j, k) for i, j, k, _ in entities] 138 | self.entity_scores = [l for _, _, _, l in entities] 139 | self.triggers = [(i, j, k) for i, j, k, _ in triggers] 140 | self.trigger_scores = [l for _, _, _, l in triggers] 141 | self.relations = [(i, j, k) for i, j, k, _ in relations] 142 | self.relation_scores = [l for _, _, _, l in relations] 143 | self.roles = [(i, j, k) for i, j, k, _ in roles] 144 | self.role_scores = [l for _, _, _, l in roles] 145 | 146 | def add_entity(self, start, end, label, score=0, score_norm=0): 147 | """Add an entity mention to the graph. 148 | :param start (int): Start token offset of the entity mention. 149 | :param end (int): End token offset of the entity mention + 1. 150 | :param label (int): Index of the entity type label. 151 | :param score (float): Label score. 152 | """ 153 | self.entities.append((start, end, label)) 154 | self.entity_num = len(self.entities) 155 | self.graph_local_score += score 156 | self.entity_scores.append(score_norm) 157 | 158 | def add_trigger(self, start, end, label, score=0, score_norm=0): 159 | """Add an event trigger to the graph. 160 | :param start (int): Start token offset of the trigger. 161 | :param end (int): End token offset of the trigger + 1. 162 | :param label (int): Index of the event type label. 163 | :param score (float): Label score. 
164 | """ 165 | self.triggers.append((start, end, label)) 166 | self.trigger_num = len(self.triggers) 167 | self.graph_local_score += score 168 | self.trigger_scores.append(score_norm) 169 | 170 | def add_relation(self, idx1, idx2, label, score=0, score_norm=0): 171 | """Add a relation edge to the graph. 172 | :param idx1 (int): Index of the entity node 1. 173 | :param idx2 (int): Index of the entity node 2. 174 | :param label (int): Index of the relation type label. 175 | :param score (float): Label score. 176 | """ 177 | # assert idx1 < self.entity_num and idx2 < self.entity_num 178 | if label: 179 | self.relations.append((idx1, idx2, label)) 180 | self.relation_scores.append(score_norm) 181 | self.relation_num = len(self.relations) 182 | self.graph_local_score += score 183 | 184 | def add_role(self, idx1, idx2, label, score=0, score_norm=0): 185 | """Add an event-argument link edge to the graph. 186 | :param idx1 (int): Index of the trigger node. 187 | :param idx2 (int): Index of the entity node. 188 | :param label (int): Index of the role label. 189 | :param score (float): Label score. 190 | """ 191 | # assert idx1 < self.trigger_num and idx2 < self.entity_num 192 | # self.roles.append((idx1, idx2, label)) 193 | if label: 194 | self.roles.append((idx1, idx2, label)) 195 | self.role_scores.append(score_norm) 196 | self.role_num = len(self.roles) 197 | self.graph_local_score += score 198 | 199 | @staticmethod 200 | def empty_graph(vocabs): 201 | """Create a graph without any node and edge. 202 | :param vocabs (dict): Vocabulary object. 203 | """ 204 | return Graph([], [], [], [], vocabs) 205 | 206 | def to_label_idxs(self, max_entity_num, max_trigger_num, 207 | relation_directional=False, 208 | symmetric_relation_idxs=None): 209 | """Generate label index tensors (which are actually list objects not 210 | Pytorch tensors) to gather calculated scores. 211 | :param max_entity_num: Max entity number of the batch. 212 | :param max_trigger_num: Max trigger number of the batch. 213 | :return: Index and mask tensors. 
214 | """ 215 | entity_idxs = [i[-1] for i in self.entities] + [0] * (max_entity_num - self.entity_num) 216 | entity_mask = [1] * self.entity_num + [0] * (max_entity_num - self.entity_num) 217 | 218 | trigger_idxs = [i[-1] for i in self.triggers] + [0] * (max_trigger_num - self.trigger_num) 219 | trigger_mask = [1] * self.trigger_num + [0] * (max_trigger_num - self.trigger_num) 220 | 221 | relation_idxs = [0] * max_entity_num * max_entity_num 222 | relation_mask = [1 if i < self.entity_num and j < self.entity_num and i != j else 0 223 | for i in range(max_entity_num) for j in range(max_entity_num)] 224 | for i, j, relation in self.relations: 225 | # TODO: check relation label idxs and mask 226 | relation_idxs[i * max_entity_num + j] = relation 227 | if not relation_directional: 228 | relation_idxs[j * max_entity_num + i] = relation 229 | # relation_mask[i * max_entity_num + j] = .5 230 | # relation_mask[j * max_entity_num + i] = .5 231 | if relation_directional and symmetric_relation_idxs and relation in symmetric_relation_idxs: 232 | relation_idxs[j * max_entity_num + i] = relation 233 | # relation_mask[i * max_entity_num + j] = .5 234 | # relation_mask[j * max_entity_num + i] = .5 235 | 236 | 237 | role_idxs = [0] * max_trigger_num * max_entity_num 238 | for i, j, role in self.roles: 239 | role_idxs[i * max_entity_num + j] = role 240 | role_mask = [1 if i < self.trigger_num and j < self.entity_num else 0 241 | for i in range(max_trigger_num) for j in range(max_entity_num)] 242 | 243 | return ( 244 | entity_idxs, entity_mask, trigger_idxs, trigger_mask, 245 | relation_idxs, relation_mask, role_idxs, role_mask, 246 | ) 247 | 248 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | import tqdm 5 | import traceback 6 | from argparse import ArgumentParser 7 | 8 | import torch 9 | from torch.utils.data import DataLoader 10 | from transformers import BertTokenizer, BertConfig 11 | 12 | from model import OneIE 13 | from config import Config 14 | from util import save_result 15 | from data import IEDatasetEval 16 | from convert import json_to_cs 17 | 18 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 19 | format_ext_mapping = {'txt': 'txt', 'ltf': 'ltf.xml', 'json': 'json', 20 | 'json_single': 'json'} 21 | 22 | def load_model(model_path, device=0, gpu=False, beam_size=5): 23 | print('Loading the model from {}'.format(model_path)) 24 | map_location = 'cuda:{}'.format(device) if gpu else 'cpu' 25 | state = torch.load(model_path, map_location=map_location) 26 | 27 | config = state['config'] 28 | if type(config) is dict: 29 | config = Config.from_dict(config) 30 | config.bert_cache_dir = os.path.join(cur_dir, 'bert') 31 | vocabs = state['vocabs'] 32 | valid_patterns = state['valid'] 33 | 34 | # recover the model 35 | model = OneIE(config, vocabs, valid_patterns) 36 | model.load_state_dict(state['model']) 37 | model.beam_size = beam_size 38 | if gpu: 39 | model.cuda(device) 40 | 41 | tokenizer = BertTokenizer.from_pretrained(config.bert_model_name, 42 | cache_dir=config.bert_cache_dir, 43 | do_lower_case=False) 44 | 45 | return model, tokenizer, config 46 | 47 | 48 | def predict_document(path, model, tokenizer, config, batch_size=20, 49 | max_length=128, gpu=False, input_format='txt', 50 | language='english'): 51 | """ 52 | :param path (str): path to the input file. 
53 | :param model (OneIE): pre-trained model object.
54 | :param tokenizer (BertTokenizer): BERT tokenizer.
55 | :param config (Config): configuration object.
56 | :param batch_size (int): Batch size (default=20).
57 | :param max_length (int): Max word piece number (default=128).
58 | :param gpu (bool): Use GPU or not (default=False).
59 | :param input_format (str): Input file format (txt or ltf, default='txt').
60 | :param language (str): Input document language (default='english').
61 | """
62 | test_set = IEDatasetEval(path, max_length=max_length, gpu=gpu,
63 | input_format=input_format, language=language)
64 | test_set.numberize(tokenizer)
65 | # document info
66 | info = {
67 | 'doc_id': test_set.doc_id,
68 | 'ori_sent_num': test_set.ori_sent_num,
69 | 'sent_num': len(test_set)
70 | }
71 | # prediction result
72 | result = []
73 | for batch in DataLoader(test_set, batch_size=batch_size, shuffle=False,
74 | collate_fn=test_set.collate_fn):
75 | graphs = model.predict(batch)
76 | for graph, tokens, sent_id, token_ids in zip(graphs, batch.tokens,
77 | batch.sent_ids,
78 | batch.token_ids):
79 | graph.clean(relation_directional=config.relation_directional,
80 | symmetric_relations=config.symmetric_relations)
81 | result.append((sent_id, token_ids, tokens, graph))
82 | return result, info
83 |
84 |
85 | def predict(model_path, input_path, output_path, log_path=None, cs_path=None,
86 | batch_size=50, max_length=128, device=0, gpu=False,
87 | file_extension='txt', beam_size=5, input_format='txt',
88 | language='english'):
89 | """Perform information extraction.
90 | :param model_path (str): Path to the pre-trained model file.
91 | :param input_path (str): Path to the input directory.
92 | :param output_path (str): Path to the output directory.
93 | :param log_path (str): Path to the log file.
94 | :param cs_path (str): (optional) Path to the cold-start format output directory.
95 | :param batch_size (int): Batch size (default=50).
96 | :param max_length (int): Max word piece number for each sentence (default=128).
97 | :param device (int): GPU device index (default=0).
98 | :param gpu (bool): Use GPU (default=False).
99 | :param file_extension (str): Input file extension. Only files ending with the
100 | given extension will be processed (default='txt').
101 | :param beam_size (int): Beam size of the decoder (default=5).
102 | :param input_format (str): Input file format (txt or ltf, default='txt').
103 | :param language (str): Document language (default='english').
104 | """ 105 | # set gpu device 106 | if gpu: 107 | torch.cuda.set_device(device) 108 | # load the model from file 109 | model, tokenizer, config = load_model(model_path, device=device, gpu=gpu, 110 | beam_size=beam_size) 111 | # get the list of documents 112 | file_list = glob.glob(os.path.join(input_path, '*.{}'.format(file_extension))) 113 | # log writer 114 | if log_path: 115 | log_writer = open(log_path, 'w', encoding='utf-8') 116 | # run the model; collect result and info 117 | doc_info_list = [] 118 | progress = tqdm.tqdm(total=len(file_list), ncols=75) 119 | for f in file_list: 120 | progress.update(1) 121 | try: 122 | doc_result, doc_info = predict_document( 123 | f, model, tokenizer, config, batch_size=batch_size, 124 | max_length=max_length, gpu=gpu, input_format=input_format, 125 | language=language) 126 | # save json format result 127 | doc_id = doc_info['doc_id'] 128 | with open(os.path.join(output_path, '{}.json'.format(doc_id)), 'w') as w: 129 | for sent_id, token_ids, tokens, graph in doc_result: 130 | output = { 131 | 'doc_id': doc_id, 132 | 'sent_id': sent_id, 133 | 'token_ids': token_ids, 134 | 'tokens': tokens, 135 | 'graph': graph.to_dict() 136 | } 137 | w.write(json.dumps(output) + '\n') 138 | # write doc info 139 | if log_path: 140 | log_writer.write(json.dumps(doc_info) + '\n') 141 | log_writer.flush() 142 | except Exception as e: 143 | traceback.print_exc() 144 | if log_path: 145 | log_writer.write(json.dumps( 146 | {'file': file, 'message': str(e)}) + '\n') 147 | log_writer.flush() 148 | progress.close() 149 | 150 | # convert to the cold-start format 151 | if cs_path: 152 | print('Converting to cs format') 153 | json_to_cs(output_path, cs_path) 154 | 155 | 156 | parser = ArgumentParser() 157 | parser.add_argument('-m', '--model_path', help='path to the trained model') 158 | parser.add_argument('-i', '--input_dir', help='path to the input folder (ltf files)') 159 | parser.add_argument('-o', '--output_dir', help='path to the output folder (json files)') 160 | parser.add_argument('-l', '--log_path', default=None, help='path to the log file') 161 | parser.add_argument('-c', '--cs_dir', default=None, help='path to the output folder (cs files)') 162 | parser.add_argument('--gpu', action='store_true', help='use gpu') 163 | parser.add_argument('-d', '--device', default=0, type=int, help='gpu device index') 164 | parser.add_argument('-b', '--batch_size', default=10, type=int, help='batch size') 165 | parser.add_argument('--max_len', default=128, type=int, help='max sentence length') 166 | parser.add_argument('--beam_size', default=5, type=int, help='beam set size') 167 | parser.add_argument('--lang', default='english', help='Model language') 168 | parser.add_argument('--format', default='txt', help='Input format (txt, ltf, json)') 169 | 170 | args = parser.parse_args() 171 | extension = format_ext_mapping.get(args.format, 'ltf.xml') 172 | 173 | predict( 174 | model_path=args.model_path, 175 | input_path=args.input_dir, 176 | output_path=args.output_dir, 177 | cs_path=args.cs_dir, 178 | log_path=args.log_path, 179 | batch_size=args.batch_size, 180 | max_length=args.max_len, 181 | device=args.device, 182 | gpu=args.gpu, 183 | beam_size=args.beam_size, 184 | file_extension=extension, 185 | input_format=args.format, 186 | language=args.lang, 187 | ) -------------------------------------------------------------------------------- /evaluations/supervised-ie/preprocessing/process_dygiepp.py: -------------------------------------------------------------------------------- 1 | 
import json 2 | from argparse import ArgumentParser 3 | from transformers import BertTokenizer 4 | 5 | 6 | def map_index(pieces): 7 | idxs = [] 8 | for i, piece in enumerate(pieces): 9 | if i == 0: 10 | idxs.append([0, len(piece)]) 11 | else: 12 | _, last = idxs[-1] 13 | idxs.append([last, last + len(piece)]) 14 | return idxs 15 | 16 | 17 | def convert(input_file, output_file, tokenizer): 18 | with open(input_file, 'r', encoding='utf-8') as r, \ 19 | open(output_file, 'w', encoding='utf-8') as w: 20 | for line in r: 21 | doc = json.loads(line) 22 | doc_id = doc['doc_key'] 23 | sentences = doc['sentences'] 24 | sent_num = len(sentences) 25 | entities = doc.get('ner', [[] for _ in range(sent_num)]) 26 | relations = doc.get('relations', [[] for _ in range(sent_num)]) 27 | events = doc.get('events', [[] for _ in range(sent_num)]) 28 | 29 | offset = 0 30 | for i, (sent_tokens, sent_entities, sent_relations, sent_events) in enumerate(zip( 31 | sentences, entities, relations, events 32 | )): 33 | sent_id = '{}-{}'.format(doc_id, i) 34 | pieces = [tokenizer.tokenize(t) for t in sent_tokens] 35 | word_lens = [len(p) for p in pieces] 36 | idx_mapping = map_index(pieces) 37 | 38 | sent_entities_ = [] 39 | sent_entity_map = {} 40 | for j, (start, end, entity_type) in enumerate(sent_entities): 41 | start, end = start - offset, end - offset + 1 42 | entity_id = '{}-E{}'.format(sent_id, j) 43 | entity = { 44 | 'id': entity_id, 45 | 'start': start, 'end': end, 46 | 'entity_type': entity_type, 47 | # Mention types are not included in DyGIE++'s format 48 | 'mention_type': 'UNK', 49 | 'text': ' '.join(sent_tokens[start:end])} 50 | sent_entities_.append(entity) 51 | sent_entity_map[start] = entity 52 | 53 | sent_relations_ = [] 54 | for j, (start1, end1, start2, end2, rel_type) in enumerate(sent_relations): 55 | start1, end1 = start1 - offset, end1 - offset 56 | start2, end2 = start2 - offset, end2 - offset 57 | arg1 = sent_entity_map[start1] 58 | arg2 = sent_entity_map[start2] 59 | relation_id = '{}-R{}'.format(sent_id, j) 60 | rel_type = rel_type.split('.')[0] 61 | relation = { 62 | 'relation_type': rel_type, 63 | 'id': relation_id, 64 | 'arguments': [ 65 | { 66 | 'entity_id': arg1['id'], 67 | 'text': arg1['text'], 68 | 'role': 'Arg-1' 69 | }, 70 | { 71 | 'entity_id': arg2['id'], 72 | 'text': arg2['text'], 73 | 'role': 'Arg-2' 74 | }, 75 | ] 76 | } 77 | sent_relations_.append(relation) 78 | 79 | sent_events_ = [] 80 | for j, event in enumerate(sent_events): 81 | event_id = '{}-EV{}'.format(sent_id, j) 82 | if len(event[0]) == 3: 83 | trigger_start, trigger_end, event_type = event[0] 84 | elif len(event[0]) == 2: 85 | trigger_start, event_type = event[0] 86 | trigger_end = trigger_start 87 | trigger_start, trigger_end = trigger_start - offset, trigger_end - offset + 1 88 | event_type = event_type.replace('.', ':') 89 | args = event[1:] 90 | args_ = [] 91 | for arg_start, arg_end, role in args: 92 | arg_start, arg_end = arg_start - offset, arg_end - offset 93 | arg = sent_entity_map[arg_start] 94 | args_.append({ 95 | 'entity_id': arg['id'], 96 | 'text': arg['text'], 97 | 'role': role 98 | }) 99 | event_obj = { 100 | 'event_type': event_type, 101 | 'id': event_id, 102 | 'trigger': { 103 | 'start': trigger_start, 104 | 'end': trigger_end, 105 | 'text': ' '.join(sent_tokens[trigger_start:trigger_end]) 106 | }, 107 | 'arguments': args_ 108 | } 109 | sent_events_.append(event_obj) 110 | 111 | sent_ = { 112 | 'doc_id': doc_id, 113 | 'sent_id': sent_id, 114 | 'entity_mentions': sent_entities_, 115 | 
'relation_mentions': sent_relations_, 116 | 'event_mentions': sent_events_, 117 | 'tokens': sent_tokens, 118 | 'pieces': [p for w in pieces for p in w], 119 | 'token_lens': word_lens, 120 | 'sentence': ' '.join(sent_tokens) 121 | } 122 | w.write(json.dumps(sent_) + '\n') 123 | 124 | offset += len(sent_tokens) 125 | 126 | 127 | if __name__ == '__main__': 128 | parser = ArgumentParser() 129 | parser.add_argument('-i', '--input', help='Path to the input file') 130 | parser.add_argument('-o', '--output', help='Path to the output file') 131 | args = parser.parse_args() 132 | 133 | bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 134 | do_lower_case=False) 135 | convert(args.input, args.output, bert_tokenizer) 136 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/ace_to_aida_entity.tsv: -------------------------------------------------------------------------------- 1 | PER https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Person 2 | ORG https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Organization 3 | GPE https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#GeopoliticalEntity 4 | LOC https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Location 5 | FAC https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Facility 6 | WEA https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Weapon 7 | VEH https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Vehicle 8 | TME https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Time 9 | TTL https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Title 10 | VAL https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#NumericalValue 11 | MON https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Money 12 | URL https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#URL -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/ace_to_aida_event.tsv: -------------------------------------------------------------------------------- 1 | Business:Declare-Bankruptcy https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Business.DeclareBankruptcy 2 | Business:Merge-Org https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Business.Merge 3 | Business:End-Org https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Business.End 4 | Business:Start-Org https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Business.Start 5 | Conflict:Attack https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Conflict.Attack 6 | Conflict:Demonstrate https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Conflict.Demonstrate 7 | Contact:Correspondence https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Contact.Correspondence 8 | Contact:Phone-Write https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Contact.Correspondence 9 | Contact:Meet https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Contact.Meet 10 | Justice:Appeal https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Appeal 11 | Justice:Arrest-Jail https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.ArrestJail 12 | Justice:Charge-Indict https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.ChargeIndict 13 | Justice:Convict 
https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Convict 14 | Justice:Execute https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Execute 15 | Justice:Fine https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Fine 16 | Justice:Release-Parole https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.ReleaseParole 17 | Justice:Sentence https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Sentence 18 | Justice:Sue https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Sue 19 | Justice:Trial-Hearing https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.TrialHearing 20 | Justice:Pardon https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Pardon 21 | Justice:Extradite https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Extradite 22 | Justice:Acquit https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Acquit 23 | Life:Be-Born https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Life.BeBorn 24 | Life:Die https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Life.Die 25 | Life:Injure https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Life.Injure 26 | Life:Marry https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Life.Marry 27 | Life:Divorce https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Life.Divorce 28 | Movement:Transport https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Movement.TransportArtifact 29 | Movement:Transport-Artifact https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Movement.TransportArtifact 30 | Movement:Transport-Person https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Movement.TransportPerson 31 | Personnel:Elect https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Personnel.Elect 32 | Personnel:End-Position https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Personnel.EndPosition 33 | Personnel:Start-Position https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Personnel.StartPosition 34 | Personnel:Nominate https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Personnel.Nominate 35 | Transaction:Transfer-Money https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Transaction.TransferMoney 36 | Transaction:Transfer-Ownership https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Transaction.TransferOwnership 37 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/ace_to_aida_relation.tsv: -------------------------------------------------------------------------------- 1 | ORG-AFF https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#OrganizationAffiliation 2 | ART Artifact 3 | GEN-AFF https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#GeneralAffiliation 4 | PART-WHOLE https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#PartWhole 5 | PHYS https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Physical 6 | PER-SOC https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#PersonalSocial -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/ere_patterns/event_role.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "Movement:Transport": [ 3 | "Vehicle", 4 | "Artifact", 5 | "Agent", 6 | "Origin", 7 | "Destination" 8 | ], 9 | "Personnel:Elect": [ 10 | "Place", 11 | "Person", 12 | "Entity" 13 | ], 14 | "Personnel:Start-Position": [ 15 | "Place", 16 | "Person", 17 | "Entity" 18 | ], 19 | "Personnel:Nominate": [ 20 | "Agent", 21 | "Person" 22 | ], 23 | "Personnel:End-Position": [ 24 | "Place", 25 | "Person", 26 | "Entity" 27 | ], 28 | "Conflict:Attack": [ 29 | "Target", 30 | "Place", 31 | "Victim", 32 | "Instrument", 33 | "Attacker" 34 | ], 35 | "Contact:Meet": [ 36 | "Place", 37 | "Entity" 38 | ], 39 | "Life:Marry": [ 40 | "Place", 41 | "Person" 42 | ], 43 | "Transaction:Transfer-Money": [ 44 | "Giver", 45 | "Place", 46 | "Recipient", 47 | "Beneficiary" 48 | ], 49 | "Conflict:Demonstrate": [ 50 | "Place", 51 | "Entity" 52 | ], 53 | "Business:End-Org": [ 54 | "Place", 55 | "Org" 56 | ], 57 | "Justice:Sue": [ 58 | "Defendant", 59 | "Plaintiff", 60 | "Adjudicator", 61 | "Place" 62 | ], 63 | "Life:Injure": [ 64 | "Agent", 65 | "Place", 66 | "Victim", 67 | "Instrument" 68 | ], 69 | "Life:Die": [ 70 | "Person", 71 | "Agent", 72 | "Place", 73 | "Victim", 74 | "Instrument" 75 | ], 76 | "Justice:Arrest-Jail": [ 77 | "Agent", 78 | "Place", 79 | "Person" 80 | ], 81 | "Contact:Phone-Write": [ 82 | "Place", 83 | "Entity" 84 | ], 85 | "Transaction:Transfer-Ownership": [ 86 | "Artifact", 87 | "Beneficiary", 88 | "Buyer", 89 | "Place", 90 | "Seller" 91 | ], 92 | "Business:Start-Org": [ 93 | "Agent", 94 | "Place", 95 | "Org" 96 | ], 97 | "Justice:Execute": [ 98 | "Agent", 99 | "Place", 100 | "Person" 101 | ], 102 | "Justice:Trial-Hearing": [ 103 | "Prosecutor", 104 | "Defendant", 105 | "Place", 106 | "Adjudicator" 107 | ], 108 | "Life:Be-Born": [ 109 | "Place", 110 | "Person" 111 | ], 112 | "Justice:Charge-Indict": [ 113 | "Prosecutor", 114 | "Adjudicator", 115 | "Place", 116 | "Defendant" 117 | ], 118 | "Justice:Convict": [ 119 | "Defendant", 120 | "Place", 121 | "Adjudicator" 122 | ], 123 | "Justice:Sentence": [ 124 | "Adjudicator", 125 | "Place", 126 | "Defendant" 127 | ], 128 | "Business:Declare-Bankruptcy": [ 129 | "Place", 130 | "Org" 131 | ], 132 | "Justice:Release-Parole": [ 133 | "Place", 134 | "Person", 135 | "Entity" 136 | ], 137 | "Justice:Fine": [ 138 | "Adjudicator", 139 | "Place", 140 | "Entity" 141 | ], 142 | "Justice:Pardon": [ 143 | "Adjudicator", 144 | "Place", 145 | "Defendant" 146 | ], 147 | "Justice:Appeal": [ 148 | "Adjudicator", 149 | "Plaintiff", 150 | "Place" 151 | ], 152 | "Justice:Extradite": [ 153 | "Agent", 154 | "Origin", 155 | "Destination" 156 | ], 157 | "Life:Divorce": [ 158 | "Place", 159 | "Person" 160 | ], 161 | "Business:Merge-Org": [ 162 | "Org" 163 | ], 164 | "Justice:Acquit": [ 165 | "Defendant", 166 | "Adjudicator" 167 | ] 168 | } 169 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/ere_patterns/relation_entity.json: -------------------------------------------------------------------------------- 1 | {"ORG-AFF": ["ORG", "PER", "GPE", "FAC"], "GEN-AFF": ["LOC", "PER", "FAC", "ORG", "GPE"], "PHYS": ["LOC", "PER", "FAC", "VEH", "ORG", "GPE"], "PART-WHOLE": ["LOC", "WEA", "PER", "FAC", "VEH", "ORG", "GPE"], "PER-SOC": ["ORG", "PER"]} -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/ere_patterns/role_entity.json: 
-------------------------------------------------------------------------------- 1 | {"Attacker": ["ORG", "PER", "GPE"], "Place": ["LOC", "GPE", "FAC"], "Target": ["LOC", "WEA", "PER", "FAC", "VEH", "ORG"], "Victim": ["PER"], "Agent": ["ORG", "PER", "GPE"], "Entity": ["ORG", "PER", "GPE"], "Instrument": ["WEA", "VEH"], "Artifact": ["WEA", "PER", "VEH", "FAC", "ORG"], "Origin": ["LOC", "GPE", "FAC"], "Vehicle": ["VEH"], "Destination": ["LOC", "GPE", "FAC"], "Buyer": ["ORG", "PER", "GPE"], "Person": ["PER"], "Org": ["ORG", "PER"], "Adjudicator": ["ORG", "PER", "GPE"], "Plaintiff": ["ORG", "PER", "GPE"], "Defendant": ["ORG", "PER", "GPE"], "Prosecutor": ["ORG", "PER", "GPE"], "Giver": ["ORG", "PER", "GPE"], "Seller": ["ORG", "PER", "GPE"], "Recipient": ["ORG", "PER", "GPE"], "Beneficiary": ["ORG", "PER", "GPE"]} -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ACE05-CN/dev.doc.txt: -------------------------------------------------------------------------------- 1 | CTS20001211.1300.0012 2 | CBS20001113.1000.0822 3 | XIN20001231.1400.0076 4 | XIN20001009.0800.0058 5 | VOM20001027.1800.0230 6 | XIN20001125.0800.0031 7 | XIN20001205.2000.0143 8 | CBS20001008.1000.0742 9 | DAVYZW_20050127.1720 10 | VOM20001024.1800.2758 11 | CTS20001024.1300.0506 12 | VOM20001024.1800.1850 13 | XIN20001219.2000.0158 14 | NJWSL_20041211.1642 15 | CTS20001030.1800.0439 16 | XIN20001216.1400.0090 17 | XIN20001125.1400.0078 18 | XIN20001209.0200.0008 19 | CTV20001205.1330.1436 20 | CTV20001106.1330.0676 21 | CBS20001205.1000.0731 22 | XIN20001001.2000.0152 23 | XIN20001102.1400.0144 24 | CTS20001130.1300.0941 25 | XIN20001020.0200.0006 26 | LIUYIFENG_20050127.0709 27 | CBS20001001.1000.0041 28 | ZBN20001228.0400.0017 29 | VOM20001024.1800.2163 30 | CTS20001016.1300.0297 31 | XIN20001124.1400.0105 32 | CTS20001019.1300.0638 33 | XIN20001223.2000.0095 34 | LIUYIFENG_20050128.0814 35 | LIUYIFENG_20050115.0916 36 | XIN20001129.0200.0039 37 | CTV20001207.1330.0642 38 | DAVYZW_20050110.1403 39 | CTS20001031.1300.1129 40 | LIUYIFENG_20050126.0820 41 | CTS20001215.1300.0532 42 | VOM20001222.0700.1974 43 | CTS20001004.1300.0461 44 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ACE05-CN/test.doc.txt: -------------------------------------------------------------------------------- 1 | CBS20001126.1000.0700 2 | XIN20001007.0800.0037 3 | VOM20001020.1800.2981 4 | XIN20001216.1400.0085 5 | XIN20001024.2000.0153 6 | CTV20001026.1530.0802 7 | LIUYIFENG_20050124.1835 8 | XIN20001122.1400.0074 9 | CBS20001023.1000.1067 10 | VOM20001216.0700.1886 11 | XIN20001017.1400.0130 12 | CTS20001224.1300.0396 13 | XIN20001003.0200.0001 14 | VOM20001005.1800.1966 15 | XIN20001126.2000.0101 16 | CBS20001129.1000.1072 17 | LIUYIFENG_20050129.0957 18 | XIN20001107.2000.0150 19 | CBS20001123.1000.1060 20 | XIN20001216.1400.0068 21 | CBS20001118.1000.0340 22 | CTV20001116.1330.0474 23 | XIN20001020.0200.0018 24 | CBS20001021.1000.0734 25 | DAVYZW_20050114.1634 26 | CNR20001201.1700.1429 27 | XIN20001217.2000.0089 28 | XIN20001228.0200.0038 29 | CBS20001117.1000.0341 30 | CTS20001108.1300.0504 31 | XIN20001010.0800.0053 32 | CTS20001015.1300.1065 33 | CTS20001105.1300.0613 34 | LIUYIFENG_20050113.1047 35 | CTV20001005.1330.1455 36 | LANGLANGGARGEN_20050124.1017 37 | CTV20001011.1330.0522 38 | DAVYZW_20050124.1833 39 | XIN20001126.0800.0042 40 | ZBN20001119.1300.0039 41 | VOM20001006.1800.0436 42 | 
CTV20001129.1330.1511 43 | XIN20001031.0800.0085 44 | LIUYIFENG_20050112.1200 45 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ACE05-E/dev.doc.txt: -------------------------------------------------------------------------------- 1 | CNN_CF_20030303.1900.02 2 | CNN_IP_20030329.1600.00-2 3 | CNN_IP_20030402.1600.00-1 4 | CNN_IP_20030405.1600.01-1 5 | CNN_IP_20030409.1600.02 6 | marcellapr_20050228.2219 7 | rec.games.chess.politics_20041217.2111 8 | soc.org.nonprofit_20050218.1902 9 | FLOPPINGACES_20050217.1237.014 10 | AGGRESSIVEVOICEDAILY_20041116.1347 11 | FLOPPINGACES_20041117.2002.024 12 | FLOPPINGACES_20050203.1953.038 13 | TTRACY_20050223.1049 14 | CNNHL_ENG_20030304_142751.10 15 | CNNHL_ENG_20030424_123502.25 16 | CNNHL_ENG_20030513_220910.32 17 | CNN_ENG_20030304_173120.16 18 | CNN_ENG_20030328_150609.10 19 | CNN_ENG_20030424_070008.15 20 | CNN_ENG_20030512_170454.13 21 | CNN_ENG_20030620_085840.7 22 | AFP_ENG_20030305.0918 23 | AFP_ENG_20030311.0491 24 | AFP_ENG_20030314.0238 25 | AFP_ENG_20030319.0879 26 | AFP_ENG_20030320.0722 27 | AFP_ENG_20030327.0022 28 | AFP_ENG_20030327.0224 29 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ACE05-E/test.doc.txt: -------------------------------------------------------------------------------- 1 | AFP_ENG_20030401.0476 2 | AFP_ENG_20030413.0098 3 | AFP_ENG_20030415.0734 4 | AFP_ENG_20030417.0004 5 | AFP_ENG_20030417.0307 6 | AFP_ENG_20030417.0764 7 | AFP_ENG_20030418.0556 8 | AFP_ENG_20030425.0408 9 | AFP_ENG_20030427.0118 10 | AFP_ENG_20030428.0720 11 | AFP_ENG_20030429.0007 12 | AFP_ENG_20030430.0075 13 | AFP_ENG_20030502.0614 14 | AFP_ENG_20030504.0248 15 | AFP_ENG_20030508.0118 16 | AFP_ENG_20030508.0357 17 | AFP_ENG_20030509.0345 18 | AFP_ENG_20030514.0706 19 | AFP_ENG_20030519.0049 20 | AFP_ENG_20030519.0372 21 | AFP_ENG_20030522.0878 22 | AFP_ENG_20030527.0616 23 | AFP_ENG_20030528.0561 24 | AFP_ENG_20030530.0132 25 | AFP_ENG_20030601.0262 26 | AFP_ENG_20030607.0030 27 | AFP_ENG_20030616.0715 28 | AFP_ENG_20030617.0846 29 | AFP_ENG_20030625.0057 30 | AFP_ENG_20030630.0271 31 | APW_ENG_20030304.0555 32 | APW_ENG_20030306.0191 33 | APW_ENG_20030308.0314 34 | APW_ENG_20030310.0719 35 | APW_ENG_20030311.0775 36 | APW_ENG_20030318.0689 37 | APW_ENG_20030319.0545 38 | APW_ENG_20030322.0119 39 | APW_ENG_20030324.0768 40 | APW_ENG_20030325.0786 41 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ACE05-R/dev.doc.txt: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030530_130025.12 2 | CNN_ENG_20030605_085831.13 3 | CNN_ENG_20030415_103039.0 4 | CNN_ENG_20030407_080037.12 5 | CNN_ENG_20030429_110706.7 6 | CNN_ENG_20030428_193655.2 7 | CNNHL_ENG_20030604_230238.5 8 | CNN_ENG_20030612_072835.2 9 | CNN_ENG_20030306_083604.6 10 | CNN_ENG_20030624_140104.22 11 | CNN_ENG_20030627_065846.3 12 | CNN_ENG_20030429_083016.5 13 | CNN_ENG_20030509_123601.13 14 | CNN_ENG_20030423_180539.2 15 | CNN_ENG_20030617_193116.10 16 | CNN_ENG_20030507_160538.15 17 | CNN_ENG_20030422_083005.10 18 | CNN_ENG_20030305_170125.1 19 | CNN_ENG_20030320_153434.7 20 | CNN_ENG_20030509_090025.5 21 | CNN_ENG_20030618_150128.6 22 | CNN_ENG_20030617_173115.14 23 | CNN_ENG_20030502_093018.6 24 | CNN_ENG_20030409_180633.8 25 | CNN_ENG_20030624_153103.17 26 | CNN_ENG_20030407_130604.10 27 | 
CNN_ENG_20030329_170349.7 28 | CNNHL_ENG_20030416_230741.33 29 | CNNHL_ENG_20030402_193443.5 30 | CNN_ENG_20030620_170011.14 31 | CNN_ENG_20030626_193133.8 32 | CNN_ENG_20030610_085833.10 33 | CNN_ENG_20030507_170539.0 34 | CNN_ENG_20030526_183538.3 35 | CNN_ENG_20030513_080020.2 36 | CNN_ENG_20030611_102832.3 37 | XIN_ENG_20030513.0002 38 | XIN_ENG_20030408.0341 39 | APW_ENG_20030331.0410 40 | APW_ENG_20030409.0013 41 | APW_ENG_20030519.0548 42 | AFP_ENG_20030429.0007 43 | APW_ENG_20030422.0469 44 | AFP_ENG_20030330.0211 45 | APW_ENG_20030419.0358 46 | APW_ENG_20030619.0383 47 | APW_ENG_20030310.0719 48 | AFP_ENG_20030519.0049 49 | AFP_ENG_20030327.0224 50 | AFP_ENG_20030401.0476 51 | APW_ENG_20030519.0367 52 | NYT_ENG_20030630.0079 53 | MARKETVIEW_20050216.2120 54 | AGGRESSIVEVOICEDAILY_20041101.1806 55 | MARKETVIEW_20050215.1858 56 | MARKETVIEW_20041209.1401 57 | MARKBACKER_20050217.0647 58 | MARKETVIEW_20050208.2033 59 | BACONSREBELLION_20050209.0721 60 | MARKBACKER_20041128.1641 61 | MARKETVIEW_20050209.1923 62 | BACONSREBELLION_20050127.1017 63 | AGGRESSIVEVOICEDAILY_20041101.1144 64 | MARKETVIEW_20050120.1641 65 | MARKETVIEW_20050212.1607 66 | MARKBACKER_20041112.0707 67 | MARKETVIEW_20050222.0729 68 | MARKETVIEW_20050226.1307 69 | FLOPPINGACES_20050101.2244.048 70 | BACONSREBELLION_20050226.1317 71 | BACONSREBELLION_20050216.1632 72 | CNN_IP_20030414.1600.04 73 | CNN_IP_20030329.1600.00-5 74 | CNN_IP_20030406.1600.03 75 | CNN_CF_20030304.1900.01 76 | CNN_IP_20030408.1600.03 77 | CNN_IP_20030412.1600.05 78 | CNN_IP_20030402.1600.00-4 79 | CNN_IP_20030408.1600.04 80 | CNN_IP_20030404.1600.00-2 81 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ACE05-R/test.doc.txt: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030527_195948.3 2 | CNN_ENG_20030618_065839.11 3 | CNN_ENG_20030411_070039.21 4 | CNN_ENG_20030415_173752.0 5 | CNN_ENG_20030515_193533.6 6 | CNN_ENG_20030403_080032.9 7 | CNN_ENG_20030407_170605.7 8 | CNNHL_ENG_20030513_220910.32 9 | CNN_ENG_20030416_180808.15 10 | CNNHL_ENG_20030304_142751.10 11 | CNN_ENG_20030506_053020.14 12 | CNN_ENG_20030607_170312.6 13 | CNNHL_ENG_20030624_230338.34 14 | CNN_ENG_20030516_123543.8 15 | CNN_ENG_20030401_073033.14 16 | CNN_ENG_20030501_160459.0 17 | CNN_ENG_20030508_170552.18 18 | CNN_ENG_20030624_153103.16 19 | CNN_ENG_20030410_183644.8 20 | CNN_ENG_20030325_220534.6 21 | CNN_ENG_20030424_073006.4 22 | CNN_ENG_20030528_172957.18 23 | CNN_ENG_20030528_125956.8 24 | CNN_ENG_20030408_123613.0 25 | CNN_ENG_20030617_065838.21 26 | CNNHL_ENG_20030416_133739.9 27 | CNN_ENG_20030312_083725.3 28 | CNN_ENG_20030501_063017.15 29 | CNNHL_ENG_20030611_133445.24 30 | CNN_ENG_20030416_100042.7 31 | CNN_ENG_20030418_083040.11 32 | CNNHL_ENG_20030610_133347.6 33 | CNN_ENG_20030327_163556.20 34 | CNNHL_ENG_20030407_193547.5 35 | CNNHL_ENG_20030331_193419.9 36 | CNNHL_ENG_20030609_133335.37 37 | AFP_ENG_20030509.0345 38 | APW_ENG_20030318.0689 39 | APW_ENG_20030520.0757 40 | APW_ENG_20030416.0581 41 | AFP_ENG_20030502.0614 42 | APW_ENG_20030602.0037 43 | APW_ENG_20030324.0768 44 | APW_ENG_20030410.0906 45 | AFP_ENG_20030304.0250 46 | APW_ENG_20030325.0786 47 | AFP_ENG_20030427.0118 48 | AFP_ENG_20030514.0706 49 | APW_ENG_20030610.0010 50 | APW_ENG_20030527.0232 51 | AFP_ENG_20030323.0020 52 | XIN_ENG_20030415.0379 53 | AGGRESSIVEVOICEDAILY_20041116.1347 54 | MARKETVIEW_20050217.2115 55 | FLOPPINGACES_20041114.1240.039 56 | 
MARKETVIEW_20041213.0722 57 | AGGRESSIVEVOICEDAILY_20050205.1954 58 | AGGRESSIVEVOICEDAILY_20050125.0136 59 | AGGRESSIVEVOICEDAILY_20050124.1354 60 | AGGRESSIVEVOICEDAILY_20050109.1627 61 | MARKETVIEW_20050201.0748 62 | AGGRESSIVEVOICEDAILY_20050114.1922 63 | AGGRESSIVEVOICEDAILY_20041208.2133 64 | MARKETVIEW_20050206.2009 65 | MARKETVIEW_20041215.2128 66 | FLOPPINGACES_20041115.1613.032 67 | MARKETVIEW_20050210.2138 68 | MARKETVIEW_20050226.1444 69 | AGGRESSIVEVOICEDAILY_20050116.2149 70 | TTRACY_20050223.1049 71 | OIADVANTAGE_20050204.1155 72 | CNN_CF_20030303.1900.05 73 | CNN_IP_20030405.1600.01-3 74 | CNN_IP_20030405.1600.00-3 75 | CNN_IP_20030329.1600.00-6 76 | CNN_IP_20030402.1600.02-2 77 | CNN_IP_20030404.1600.00-1 78 | CNN_IP_20030422.1600.05 79 | CNN_IP_20030405.1600.00-2 80 | CNN_IP_20030402.1600.00-2 81 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ACE05-R/train.doc.txt: -------------------------------------------------------------------------------- 1 | CNN_ENG_20030416_160804.4 2 | CNNHL_ENG_20030603_230307.3 3 | CNN_ENG_20030415_183752.14 4 | CNN_ENG_20030506_163523.22 5 | CNN_ENG_20030605_065831.18 6 | CNN_ENG_20030304_173120.16 7 | CNN_ENG_20030328_150609.10 8 | CNN_ENG_20030408_200618.14 9 | CNN_ENG_20030619_125955.10 10 | CNN_ENG_20030604_092828.7 11 | CNN_ENG_20030421_120508.17 12 | CNN_ENG_20030404_073033.4 13 | CNN_ENG_20030625_220123.3 14 | CNNHL_ENG_20030513_183907.5 15 | CNN_ENG_20030513_113501.6 16 | CNNHL_ENG_20030618_230303.36 17 | CNN_ENG_20030527_215946.12 18 | CNN_ENG_20030325_150531.10 19 | CNNHL_ENG_20030429_220618.15 20 | CNNHL_ENG_20030625_230351.4 21 | CNN_ENG_20030616_130059.25 22 | CNN_ENG_20030610_130042.17 23 | CNN_ENG_20030306_070606.18 24 | CNNHL_ENG_20030519_124020.23 25 | CNNHL_ENG_20030410_193626.13 26 | CNN_ENG_20030618_150128.5 27 | CNN_ENG_20030630_085848.18 28 | CNN_ENG_20030605_223004.4 29 | CNN_ENG_20030424_113549.11 30 | CNN_ENG_20030610_123040.9 31 | CNNHL_ENG_20030616_230155.7 32 | CNNHL_ENG_20030416_133739.13 33 | CNN_ENG_20030418_063040.1 34 | CNN_ENG_20030529_085826.10 35 | CNNHL_ENG_20030403_133453.21 36 | CNN_ENG_20030512_190454.7 37 | CNN_ENG_20030403_060032.0 38 | CNN_ENG_20030428_173654.13 39 | CNN_ENG_20030425_063006.5 40 | CNN_ENG_20030526_133535.4 41 | CNN_ENG_20030408_153616.9 42 | CNN_ENG_20030528_165958.16 43 | CNNHL_ENG_20030403_193455.30 44 | CNN_ENG_20030514_130518.5 45 | CNNHL_ENG_20030411_230640.38 46 | CNN_ENG_20030424_173553.8 47 | CNN_ENG_20030622_173306.9 48 | CNN_ENG_20030605_105831.11 49 | CNN_ENG_20030618_193127.17 50 | CNN_ENG_20030619_115954.4 51 | CNN_ENG_20030619_115954.10 52 | CNN_ENG_20030417_063039.0 53 | CNNHL_ENG_20030616_230155.28 54 | CNN_ENG_20030612_173004.10 55 | CNN_ENG_20030416_190806.4 56 | CNNHL_ENG_20030505_220734.25 57 | CNN_ENG_20030513_160506.16 58 | CNN_ENG_20030403_090032.1 59 | CNN_ENG_20030430_093016.0 60 | CNN_ENG_20030429_190711.14 61 | CNNHL_ENG_20030618_230303.6 62 | CNNHL_ENG_20030624_133331.33 63 | CNNHL_ENG_20030428_123600.14 64 | CNN_ENG_20030403_183513.1 65 | CNN_ENG_20030404_163526.10 66 | CNN_ENG_20030621_115841.16 67 | CNNHL_ENG_20030430_220712.37 68 | CNN_ENG_20030605_193002.8 69 | CNN_ENG_20030403_180511.16 70 | CNN_ENG_20030525_160525.13 71 | CNN_ENG_20030610_133041.17 72 | CNN_ENG_20030610_095857.4 73 | CNN_ENG_20030612_173004.2 74 | CNN_ENG_20030525_143522.8 75 | CNN_ENG_20030312_223733.14 76 | CNN_ENG_20030421_120508.13 77 | CNN_ENG_20030620_085840.7 78 | CNN_ENG_20030625_210122.0 79 | 
CNN_ENG_20030604_102828.6 80 | CNN_ENG_20030603_095830.17 81 | CNN_ENG_20030612_160005.13 82 | CNN_ENG_20030602_072826.1 83 | CNNHL_ENG_20030424_123502.25 84 | CNN_ENG_20030630_075848.7 85 | CNN_ENG_20030425_133605.6 86 | CNN_ENG_20030418_130831.5 87 | CNN_ENG_20030617_112838.4 88 | CNN_ENG_20030614_173123.4 89 | CNN_ENG_20030626_203133.11 90 | CNN_ENG_20030515_063019.6 91 | CNNHL_ENG_20030513_220910.11 92 | CNN_ENG_20030515_073019.7 93 | CNNHL_ENG_20030625_193346.7 94 | CNN_ENG_20030524_143511.4 95 | CNN_ENG_20030430_160723.6 96 | CNN_ENG_20030602_102826.13 97 | CNN_ENG_20030418_163834.14 98 | CNN_ENG_20030529_130011.6 99 | CNN_ENG_20030426_160621.0 100 | CNN_ENG_20030602_133012.9 101 | CNN_ENG_20030318_140851.8 102 | CNN_ENG_20030331_193655.14 103 | CNN_ENG_20030421_090007.11 104 | CNNHL_ENG_20030526_221156.39 105 | CNN_ENG_20030422_213527.4 106 | CNN_ENG_20030611_175950.5 107 | CNN_ENG_20030617_173115.22 108 | CNN_ENG_20030508_210555.5 109 | CNN_ENG_20030610_105832.1 110 | CNN_ENG_20030402_190500.11 111 | CNN_ENG_20030408_083034.11 112 | CNN_ENG_20030605_153000.9 113 | CNNHL_ENG_20030416_193742.7 114 | CNN_ENG_20030512_170454.13 115 | CNN_ENG_20030429_143706.14 116 | CNN_ENG_20030607_173310.4 117 | CNN_ENG_20030617_105836.4 118 | CNNHL_ENG_20030416_193742.26 119 | CNN_ENG_20030624_065843.24 120 | CNN_ENG_20030528_195959.20 121 | CNN_ENG_20030502_080020.7 122 | CNNHL_ENG_20030312_150218.13 123 | CNN_ENG_20030624_082841.12 124 | CNNHL_ENG_20030425_183518.12 125 | CNN_ENG_20030526_180540.6 126 | CNN_ENG_20030611_102832.4 127 | CNN_ENG_20030421_133510.6 128 | CNN_ENG_20030528_082823.9 129 | CNN_ENG_20030428_130651.4 130 | CNN_ENG_20030505_090022.1 131 | CNN_ENG_20030621_160254.25 132 | CNN_ENG_20030507_060023.1 133 | CNN_ENG_20030620_095840.4 134 | CNN_ENG_20030411_193701.3 135 | CNN_ENG_20030602_105829.2 136 | CNN_ENG_20030424_183556.7 137 | CNN_ENG_20030627_130145.6 138 | CNN_ENG_20030415_180754.5 139 | CNN_ENG_20030430_063016.14 140 | CNN_ENG_20030401_233449.5 141 | CNN_ENG_20030414_130735.7 142 | CNNHL_ENG_20030523_221118.14 143 | CNN_ENG_20030424_070008.15 144 | CNN_ENG_20030411_233701.11 145 | CNN_ENG_20030417_073039.2 146 | CNNHL_ENG_20030402_133449.22 147 | CNN_ENG_20030506_160524.18 148 | CNNHL_ENG_20030415_193729.5 149 | CNN_ENG_20030313_083739.0 150 | CNN_ENG_20030603_133025.7 151 | CNN_ENG_20030516_090022.7 152 | CNNHL_ENG_20030610_230438.14 153 | CNN_ENG_20030331_123648.4 154 | CNN_ENG_20030429_170710.4 155 | APW_ENG_20030404.0439 156 | XIN_ENG_20030314.0208 157 | XIN_ENG_20030624.0085 158 | AFP_ENG_20030320.0722 159 | AFP_ENG_20030327.0022 160 | APW_ENG_20030414.0392 161 | XIN_ENG_20030324.0191 162 | AFP_ENG_20030428.0720 163 | AFP_ENG_20030319.0879 164 | XIN_ENG_20030609.0118 165 | AFP_ENG_20030504.0248 166 | AFP_ENG_20030415.0734 167 | AFP_ENG_20030519.0372 168 | APW_ENG_20030415.0742 169 | APW_ENG_20030304.0555 170 | APW_ENG_20030408.0090 171 | APW_ENG_20030422.0485 172 | APW_ENG_20030508.0772 173 | APW_ENG_20030418.0084 174 | APW_ENG_20030403.0862 175 | NYT_ENG_20030602.0074 176 | XIN_ENG_20030317.0177 177 | APW_ENG_20030407.0030 178 | APW_ENG_20030603.0303 179 | AFP_ENG_20030417.0764 180 | APW_ENG_20030510.0228 181 | APW_ENG_20030520.0081 182 | AFP_ENG_20030417.0004 183 | APW_ENG_20030610.0554 184 | APW_ENG_20030423.0079 185 | APW_ENG_20030327.0376 186 | AFP_ENG_20030430.0075 187 | XIN_ENG_20030423.0011 188 | AFP_ENG_20030607.0030 189 | AFP_ENG_20030522.0878 190 | AFP_ENG_20030528.0561 191 | AFP_ENG_20030601.0262 192 | APW_ENG_20030406.0191 193 | XIN_ENG_20030610.0299 194 
| APW_ENG_20030424.0532 195 | AFP_ENG_20030413.0098 196 | AFP_ENG_20030314.0238 197 | AFP_ENG_20030418.0556 198 | APW_ENG_20030412.0531 199 | APW_ENG_20030308.0314 200 | APW_ENG_20030424.0698 201 | AFP_ENG_20030425.0408 202 | APW_ENG_20030502.0686 203 | XIN_ENG_20030523.0202 204 | AFP_ENG_20030417.0307 205 | APW_ENG_20030411.0304 206 | NYT_ENG_20030403.0008 207 | AFP_ENG_20030617.0846 208 | AFP_ENG_20030616.0715 209 | AFP_ENG_20030508.0118 210 | AFP_ENG_20030527.0616 211 | AFP_ENG_20030311.0491 212 | AFP_ENG_20030530.0132 213 | APW_ENG_20030322.0119 214 | APW_ENG_20030319.0545 215 | XIN_ENG_20030616.0274 216 | APW_ENG_20030417.0555 217 | XIN_ENG_20030327.0202 218 | XIN_ENG_20030509.0137 219 | APW_ENG_20030513.0139 220 | AFP_ENG_20030305.0918 221 | APW_ENG_20030311.0775 222 | APW_ENG_20030326.0190 223 | APW_ENG_20030502.0470 224 | AFP_ENG_20030625.0057 225 | AFP_ENG_20030508.0357 226 | APW_ENG_20030306.0191 227 | AFP_ENG_20030630.0271 228 | XIN_ENG_20030425.0184 229 | MARKETVIEW_20050208.2059 230 | MARKBACKER_20041216.0656 231 | OIADVANTAGE_20050110.1009 232 | MARKETVIEW_20041211.1845 233 | BACONSREBELLION_20050222.0817 234 | AGGRESSIVEVOICEDAILY_20050224.2252 235 | MARKETVIEW_20041220.1537 236 | MARKBACKER_20050105.1526 237 | AGGRESSIVEVOICEDAILY_20050106.1310 238 | FLOPPINGACES_20041230.1844.003 239 | MARKETVIEW_20041217.0801 240 | BACONSREBELLION_20050217.0744 241 | MARKBACKER_20041117.0723 242 | MARKETVIEW_20041212.1447 243 | BACONSREBELLION_20050214.0944 244 | MARKETVIEW_20050225.0541 245 | MARKBACKER_20041217.1639 246 | OIADVANTAGE_20050103.0944 247 | BACONSREBELLION_20050123.1639 248 | AGGRESSIVEVOICEDAILY_20050105.1344 249 | MARKETVIEW_20050206.1951 250 | HEALINGIRAQ_20041108.1942.05 251 | OIADVANTAGE_20050109.1947 252 | AGGRESSIVEVOICEDAILY_20041215.2302 253 | AGGRESSIVEVOICEDAILY_20041218.0146 254 | MARKBACKER_20050103.0829 255 | BACONSREBELLION_20050205.1919 256 | AGGRESSIVEVOICEDAILY_20041203.1959 257 | MARKETVIEW_20050212.1717 258 | BACONSREBELLION_20050216.1536 259 | FLOPPINGACES_20041117.2002.024 260 | AGGRESSIVEVOICEDAILY_20050208.1142 261 | BACONSREBELLION_20050125.1108 262 | GETTINGPOLITICAL_20050105.0127.001 263 | MARKETVIEW_20050127.0716 264 | MARKETVIEW_20050105.1901 265 | MARKETVIEW_20050205.1358 266 | FLOPPINGACES_20041113.1528.042 267 | MARKETVIEW_20050222.1919 268 | MARKBACKER_20041103.1300 269 | BACONSREBELLION_20050218.1214 270 | AGGRESSIVEVOICEDAILY_20041226.1712 271 | MARKETVIEW_20050204.1322 272 | MARKETVIEW_20050126.0711 273 | MARKETVIEW_20041219.1509 274 | FLOPPINGACES_20050203.1953.038 275 | MARKETVIEW_20050204.1337 276 | BACONSREBELLION_20050227.1238 277 | MARKBACKER_20041206.0733 278 | AGGRESSIVEVOICEDAILY_20050224.1207 279 | MARKBACKER_20050105.1632 280 | MARKETVIEW_20050207.0746 281 | AGGRESSIVEVOICEDAILY_20041218.1004 282 | FLOPPINGACES_20041228.0927.010 283 | MARKBACKER_20041108.1507 284 | BACONSREBELLION_20050218.0848 285 | AGGRESSIVEVOICEDAILY_20041201.2313 286 | FLOPPINGACES_20050217.1237.014 287 | OIADVANTAGE_20050203.1000 288 | BACONSREBELLION_20050206.1345 289 | OIADVANTAGE_20041224.1007 290 | MARKBACKER_20041220.0919 291 | BACONSREBELLION_20050204.1326 292 | BACONSREBELLION_20050222.1348 293 | MARKETVIEW_20050204.1736 294 | AGGRESSIVEVOICEDAILY_20041223.1449 295 | MARKBACKER_20041119.1002 296 | MARKBACKER_20041202.0711 297 | AGGRESSIVEVOICEDAILY_20050203.1356 298 | BACONSREBELLION_20050210.0728 299 | OIADVANTAGE_20050203.2102 300 | BACONSREBELLION_20050216.1618 301 | MARKETVIEW_20050228.2211 302 | MARKBACKER_20041117.1107 303 | 
FLOPPINGACES_20041116.0833.027 304 | AGGRESSIVEVOICEDAILY_20050213.2123 305 | OIADVANTAGE_20050108.1323 306 | MARKETVIEW_20050214.2115 307 | AGGRESSIVEVOICEDAILY_20050113.1400 308 | AGGRESSIVEVOICEDAILY_20050107.2012 309 | OIADVANTAGE_20050105.0922 310 | CNN_IP_20030329.1600.00-2 311 | CNN_IP_20030329.1600.00-3 312 | CNN_CF_20030303.1900.00 313 | CNN_IP_20030329.1600.02 314 | CNN_IP_20030329.1600.01-3 315 | CNN_LE_20030504.1200.02-1 316 | CNN_CF_20030304.1900.04 317 | CNN_IP_20030409.1600.02 318 | CNN_CF_20030304.1900.06-2 319 | CNN_IP_20030329.1600.00-4 320 | CNN_CF_20030305.1900.00-2 321 | CNN_IP_20030410.1600.03-1 322 | CNN_CF_20030303.1900.06-1 323 | CNN_IP_20030403.1600.00-3 324 | CNN_CF_20030305.1900.06-2 325 | CNN_IP_20030402.1600.00-1 326 | CNN_IP_20030405.1600.01-1 327 | CNN_IP_20030402.1600.02-1 328 | CNN_CF_20030303.1900.06-2 329 | CNN_IP_20030330.1600.05-2 330 | CNN_IP_20030403.1600.00-1 331 | CNN_IP_20030410.1600.03-2 332 | CNN_IP_20030402.1600.00-3 333 | CNN_LE_20030504.1200.01 334 | CNN_CF_20030303.1900.02 335 | CNN_IP_20030405.1600.01-2 336 | CNN_CF_20030305.1900.00-3 337 | CNN_CF_20030305.1900.00-1 338 | CNN_IP_20030407.1600.05 339 | CNN_CF_20030305.1900.02 340 | CNN_CF_20030304.1900.02 341 | CNN_IP_20030403.1600.00-4 342 | CNN_LE_20030504.1200.02-2 343 | CNN_IP_20030403.1600.00-2 344 | CNN_IP_20030409.1600.04 345 | CNN_IP_20030417.1600.06 346 | CNN_IP_20030329.1600.01-1 347 | CNN_IP_20030405.1600.02 348 | CNN_IP_20030328.1600.07 349 | CNN_CF_20030305.1900.06-1 350 | CNN_IP_20030412.1600.03 351 | CNN_IP_20030330.1600.06 352 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ERE-EN/dev.doc.txt: -------------------------------------------------------------------------------- 1 | 101d0fc4a78dc1b84953ebd399b2fad5 2 | 0f03cc5a508d630c6c8c8c61396e31a9 3 | NYT_ENG_20130910.0191 4 | 14294db341956a71811c9dd015b04ed7 5 | 0659c87d9fd3d5efd258ee6de3ba1003 6 | 11a29a0d63a79b0f5d19ccae1838b125 7 | 3dff15d768dbfe27e4d6b81fb63aee95 8 | 4aea880c68f1708f68271a7913f2001f 9 | 2bdb9d86091c6f412ffa767bdc749be9 10 | 1a0f894682abf633cc94b06405b78a8e 11 | 7bac41e8aea34c7ef9462fcc1a572109 12 | 45b9b8f7d17ce5f352c16a339e96705f 13 | 75a85a5de2dd86d7b7662b83aa639d0a 14 | 06fa2a5cdc50c1d2a96bfe02adcc0b40 15 | 22ca1a5aa492b429d274169c54554a7c 16 | edb392c8323a4f5f27cc0e59df409c68 17 | NYT_ENG_20131022.0102 18 | 9e49d5babe9b22ac5ebe1afd3d440ff2 19 | 5bac42475431a87070720e94b27cfd99 20 | 48dafc1e3678fa7b13cb467ab3eed071 21 | 3ddbad6f438c88eec387131477ffe1b9 22 | 44169f6a3f5b04e8dbab2a26e572a136 23 | NYT_ENG_20131029.0228 24 | bec156fe4d6369a40f347477578d28b0 25 | 14fbeb82a73a7df37bcda0583c9bca7e 26 | 61d2b0dcc730f0b4e92ae0d1929b3caf 27 | 428e1e095b4e6e830b47e72f133faf87 28 | APW_ENG_20090611.0697 29 | bb1fba8ce6504faf37892e990d50fb68 30 | c0cae135f2727d4e61315f719cb27434 31 | 90f8a4e01d7a52940959427f10e45f8c 32 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ERE-EN/test.doc.txt: -------------------------------------------------------------------------------- 1 | NYT_ENG_20130716.0217 2 | 963549e727a8abe0e772e51580fca702 3 | 35621bc5e29e511198d6eabe34676975 4 | NYT_ENG_20130625.0192 5 | 17a2dc40635ec239e9e16d10b6dd45e8 6 | NYT_ENG_20130712.0047 7 | f81535eaaa2c20ef26d54d1d87a02186 8 | 7677d625b58ce649c8aeda2ff4a56389 9 | ae6d0c01a0bea085e48016ac29a3c535 10 | 4622b60202cf3944119daf2be53aa74f 11 | NYT_ENG_20130506.0045 12 | 56af144a4d1d2e662531bdfd00d3c725 
13 | 0e6c9afe37a18411d275ee225a0f0f9b 14 | 34d49f3357eaf14c849e9cdfeb893273 15 | dd0b65f632f64369c530f9bbb4b024b4 16 | 0648a08469a3be9eb972f0d213562805 17 | aa33a695c3e28d1f3dd03f4e0b373f70 18 | 1f288dcbcb562b39031c6a9402ebf6d0 19 | e8ad0cb1356161f82fb56c9f88b41990 20 | e5e3faef4fb44311a0ec8aab24903c41 21 | c728ed6c29213079b5f66788047ec89e 22 | 6154640fdb94510274583591cad7b379 23 | 5bbe1c6185296d179b95810e48ee3834 24 | a268efbb260f633c3979688e3b07e7d0 25 | bb6cb93cbd13b91ca52bfc582af0eb45 26 | 19569b08f07d751d6ac4a07633653c50 27 | 3b4d58c0a53671c6ce03f0529bb6089d 28 | a72d82525600c5a2e1aa428264bf089c 29 | d81d2b468875c49a9f6453d78a8e1ddc 30 | a08e03759505523de8475e3bf906dd5d 31 | NYT_ENG_20130710.0155 32 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ERE-EN/train.doc.txt: -------------------------------------------------------------------------------- 1 | 459bc8b09f4dd2e1fec7c77d26193b01 2 | 43611a2f256d101f910b852379c70959 3 | 6521f6bd1eb405232a5e852423722bac 4 | 565fa81d640f451b20955887a43b3a23 5 | 5fa0f2a7f323a781640b126978ca8a42 6 | 1d2911e09a6746b942c3e7b3cbdcb0ce 7 | 08b0dfe15192c063055ed7db8d24c625 8 | 644706e2d97c9a9a1f9874510180f136 9 | fd103b2c981e724f64d70a22c392ee93 10 | e98123aa18eb4ce95d2d4eccace51169 11 | 5254f96ac3a601e99b6357c4f7627991 12 | 4743a10c1d5f1ad35c31646049acb9db 13 | c793b6b583e008f105af586fe433d4ac 14 | NYT_ENG_20130822.0136 15 | 2ac3b55a10d5395ded9e8e54c345553b 16 | 38cd9b530a5be18dbad52400da435934 17 | 59f8514f6db132207ba9e5828f73d706 18 | NYT_ENG_20130525.0040 19 | aa54ac32868c5de9b05b65a8ee7a4329 20 | NYT_ENG_20130509.0160 21 | 52e569e00b6428b94205d3dd5c457c54 22 | 7c5b86ed55f4e5b8667423ef88f49fb5 23 | 78333509dffd4a7df90b029a5d851dfe 24 | 1ae45904ad12b1540dc390e162b61235 25 | 27eb0b9d14d45ede66fe86534e36a2ce 26 | 2a54459212636289034af844f8634e37 27 | 7e520221ddb1602a0f2aa10560a50a66 28 | 24d93564f48ae17904aa82f937db8c21 29 | 35587c6d8aa67724ba23231dd16f7b44 30 | af79ea77b8fb92424dbc02d88d8c14e8 31 | NYT_ENG_20130422.0048 32 | 361e1c2ca3a1e21c618e0e8fab959e30 33 | 9777919d54ccbb7810bd1c73df91fa4a 34 | 18e8a277f2659f79291efa0e12e80cb3 35 | 5e3fbf49f8301654bb4954c0f1e386a9 36 | 44b011cd504c9ed71beb851324db886a 37 | 5fa7fbe87758a02a1e4591f88175ccf3 38 | 0eb03fc279066b84ed49d44b2405469a 39 | 2f5ee4e363c30678dc3b55caf43bc63d 40 | 57026b7bcb8f855de3e26d572db35285 41 | 3446f8cbcf53eaca5692913ced012b11 42 | 2c2e8b3286bd34e30a4cb57cb7e26ce5 43 | 4df3dfff1ee1683ac6e1c2ea24ce2589 44 | 105249d0d0575a1a5939b16139f6229d 45 | 01f69c4c2206e7c3fa3706ccd5b8b350 46 | 648abb9000309b9807cc8b212c11254f 47 | deb3e0ea36b437c34b52d95aa6a9631f 48 | 1badbb95e5e70ef90e49cdf5a46b6d9b 49 | 7734fb9363c2adf91c6ede6c7bb7df90 50 | 1d6c0e3df079663f6bceca0b44c98a40 51 | 464e03afec9c80f8c1ce4acfe2d002ae 52 | 04debcc4da342dc971bdef4210fe468a 53 | 63dca285201d1fcda72a54f4302b2c3e 54 | 2cf358ab89c732d6b35b65e619d2bc86 55 | 07c9c8ca974b6e9333c38720b0b06896 56 | a68c8d0ef75bbbd2923bf7aa78b72d3e 57 | NYT_ENG_20130813.0006 58 | 3ac3c99241c2243a9e233b091eddfe15 59 | a13d4f9511d799fc25b73e4d5cf28d13 60 | 9e4a09ec419e110a3a12f184e66aea72 61 | 255bae1c133d1d77ef727c063e435a78 62 | fd80f8b1a5694813bbda3253139c6395 63 | 4bab621aef9d14b5d20ac23cb8142112 64 | 96bf72399b104346f3e79022e0c08e5a 65 | 3b34a76a3589417f5db02883b47280a6 66 | 26175bdbe49b712d7412c273c111e813 67 | 3eb834d9a5d9c9fcad258087b5c2794a 68 | d3b5c32563ebb009bc1b1f5bc1b9eb14 69 | NYT_ENG_20130703.0214 70 | a48d00241e327e54ca914b950e97c7d4 71 | 
3c9fb643a48360935c1044efca570514 72 | 130a86739522ab7c56232e798d04cbf9 73 | af18d29036ab0a9f8cf2742a5a1b4804 74 | 652f1fbc927a6c358447947d0d77f95f 75 | 66fba4f92d2f9d8c3bee5dfad3af9828 76 | 37b56b6dd846ad0dd6e8cd00ba2efaf4 77 | 4d7e1af80bc46167ef3d81cf642bf94b 78 | NYT_ENG_20130613.0153 79 | 2ca0238925d38f345acbf826854ea448 80 | 2a10c5cc27e7504dc9df92396b9e28b8 81 | 0c100ebc18cc55f80cdae6343f72db69 82 | dbed9b6ed7d2eaf75fef0aa5a245a663 83 | 5d4273298e649a13c4dce27c89f414ac 84 | cb156ad2a5458fabc9e093b6b5e0f97f 85 | 0929d82f7059353f9593b9558983efba 86 | af36543ebce546c7c678fbf9767bfdbb 87 | 120fe19a9bc68fd85fc4963c166e9345 88 | 774caed283a1e55ef9490864771029c3 89 | 561a0178f4b846b9bbcf39f7e63afe4e 90 | 3f987a93959acff3609a251b5abbecd7 91 | 0f947223d04c10118b523cfeec5d231e 92 | 0fe5904ced20c20537fe29c1db11cd28 93 | 9c500ea2248358171d77d419e67f5760 94 | 043b35fbf220a2d1bbe7d0612ad87635 95 | da156c00417e2020948c009d39341607 96 | NYT_ENG_20130501.0255 97 | NYT_ENG_20131118.0019 98 | 26542fb5b83cdb4b98a3fe31e0226b39 99 | 95af1b55c359f28ff3a9159d55e9528a 100 | 2ac34d012c8d909d4a29aa3f6be1f23d 101 | 1d16a571f14fb1032bc19e9314a46deb 102 | 9f23d711bf5016fec9d05081772b4f24 103 | NYT_ENG_20130914.0094 104 | 5d7b429073c60d53acba21bb6e7e6caa 105 | 11c906f2f798abb05f143b206edf77a5 106 | 334de29f692ef2c5460b78fcad5c6c9e 107 | ffc5cc6892ff203f43b2dc8d83bcd725 108 | abbdf0048737e9e639403f8fe8cd7dd2 109 | 23987125927d321ec6f0c30c8f453cb3 110 | 31ea929baed3887e762b0b7f9196ce7e 111 | ae656f6d658efca126f9721087608e95 112 | ecb7c8154bf58b48ae00b252ff283c29 113 | 67db76e5116c4c809107948d4b0a5ecc 114 | 861cdd1a5c6c41610021b25c3795e293 115 | 63878a2b6d34b576361d2a2778f321a6 116 | ed6c37ed1996fc89f5fe813731c71b9d 117 | NYT_ENG_20131115.0084 118 | 8492134197b5bf8e9179e2fa245ae02f 119 | d6bc66d7c8423368aaa8d789b5bdf5db 120 | 4f7eedf44076ea050d7db3715f9333fa 121 | f0612c786635ed96ee3df84821a17685 122 | 459b795a150e7866d6e4ef75e1b92b4a 123 | 1473ea2ded50c05b29b4f55f1b83ada3 124 | cf88887857b155d8822f82cad3597744 125 | 1980ed7ea6a283f8dd19da5a4e9952d6 126 | 17f98f0c6cda0227e732e6761f396d1f 127 | 477135a713d07aafe00d5e86648ea408 128 | 33bdb079026f1fcbe47c64b8c6968d0e 129 | 290e2643c2f91c108b206c5edb7a1c0f 130 | 29f64df7feb04dfb16f4667ce199c9f0 131 | NYT_ENG_20130716.0036 132 | ca2a6fbf721ca102c149ad6a90d5b00a 133 | 5b7cab1d1cfc0c05686399d8bcbcfe5b 134 | 4fca88a5c29716cbb7c0f9aa9b84007a 135 | 40f1f697a457e39c30ad94b7cc712c96 136 | 8073c89ca4fdbe3b1eba0352bfe15d78 137 | 4deb48e2b0ab194ce37c1bd31c73586a 138 | bf1047c7c17ae3daab59c3bee423e12f 139 | c397ecd66789b905c6b1c5ef21af03ec 140 | NYT_ENG_20130504.0098 141 | NYT_ENG_20131121.0040 142 | 178e7de35eccad0df800f0c7539cf614 143 | f913574a9c0637dbcf66def4a2c1dc84 144 | 3e9bbf75058a3f16585889bb9c64a903 145 | a83302f9002b6707fc7a91a7d7d29e6e 146 | d7369ce92ed0b6327412c705dbbab654 147 | 70b2f9277a1c78bd13cef68ba6485bd9 148 | 5d0b5755e212a88afbbb8b29c34c4f13 149 | c1f185252a2837aa464e36f263d1ebe9 150 | 6291811a3fe70d3ec8fc26b91060e2f5 151 | aa32f4f9534045b9f33a9599d0c1b580 152 | 2bebb50073ceefd0c9ccfdf3e07b3258 153 | 30cced37fcceb1800341d18d4f97b670 154 | 3b9c27eda65c635e109a547930942486 155 | 3322caacf140c92366a639ee004560ce 156 | d5825f99faec1ae48589b98560a98d61 157 | 408dff173c599256711f23238e280c15 158 | 47de592453663260c44944346d669611 159 | 86a94ca907de6688cca64610730fa11b 160 | 18a89cdd00dadc593a88c924111575f1 161 | 4edd239ce7d1f7274154cd05081f8995 162 | 7c0e0e53980aeb2868cbe4e1c1cb79db 163 | 33c71a5cec78e7d766d75c9a73b327b8 164 | 9b3bc3c727dfaa49218b57254087ff5d 165 | 
NYT_ENG_20131210.0203 166 | 2d2a4ddb1c8f4a669541704f9fb78472 167 | 5dfd5bfee062cd5896b619a2b1309766 168 | 2701285c791f423cd2f8fd827df9c2c9 169 | 04952b874a2a34d602faaa74712d435e 170 | 2ee2377e5d4ae6f5922ea2af11f9d4e1 171 | 79a3cc37998a99808583eba765aedca1 172 | NYT_ENG_20131025.0190 173 | 5bfd613fd31f0c2bdfb5c41f21629144 174 | 61d6f81f680f83a1a3281fde24d9c3ac 175 | 79c976f694784ced2b0c8752eb767901 176 | 0a421343005f3241376fa01e1cb3c6fb 177 | 5753617c893938f625b349cf6bd2b388 178 | NYT_ENG_20130428.0140 179 | 34f729e5ac124e9898b2744a6598d50e 180 | a724033bff06e750d27cd7e3bf8263ac 181 | 1b0f90c029f75d326ea39c0371901ef4 182 | 51d64c51a2363954454ee9e921b590ce 183 | 1656bbad43fee4569b5c5f14110c1342 184 | 5f3a6a4c39c15d7382c2cafe64ae898d 185 | 6667fb9e43ac7edde844453cba97baf0 186 | 52a77871923a7f86bb1a52812bc7f2e1 187 | 44a65adb7f74e6c99d05eb2721fd0baf 188 | NYT_ENG_20131029.0042 189 | 5bb3c2b1094912a6df7e862bb2981481 190 | 47c26ba3563092e41c5a42252931baf1 191 | 41404718f9c1e94cf58aad1fc90c70a7 192 | APW_ENG_20101231.0037 193 | a05c08e340a73270592f62361a19274d 194 | 99ab1cad51361e94c2fe3f997c45705a 195 | NYT_ENG_20131029.0091 196 | 5c7ea2b51202d80ee37eba8a182afad3 197 | NYT_ENG_20131128.0177 198 | 0e0abbf0da91d9e34750441c08d5d262 199 | 15ba31cca04cc5300361f46319247c40 200 | 459f9a2b3eddd436f0232395f129dfd0 201 | NYT_ENG_20130508.0098 202 | 04134f2be20afbb868d7a8292f49e277 203 | 0cde024ee993679967f7ac397000ad52 204 | 593cb5020613a4695859130542f7fc94 205 | NYT_ENG_20131122.0237 206 | 9f6e4c46ae753bf14edff7e2ac767213 207 | cd04993849c889a56ea66c6670f002f4 208 | 4042cd8643253f65df3a4e8de320a1c9 209 | 3f0e2f2fb9b773bc178522a6535a9651 210 | 4798bc0e166fe93893bdf2d922f06258 211 | a9318b72c7a2ff32d459af958c7defe1 212 | 3ae6760a860a33cb90af23596fac475c 213 | 11329f1cdb44019afc8f48b6fdc5376d 214 | 39ff7dcae4034417ba175de97d14b165 215 | 43341a312ffd84a4ad3c3ab0df8bcd7c 216 | 21dbe23f56aaef87fd0980234895b321 217 | 02905b7ce3a6b8b0961c6c2310392ef9 218 | f6ad2150f6c32fcb1488438f6b4275ce 219 | f9af64dc0cf1e7edd4a8feef75018b81 220 | aa003ea934a97bac86cee52b7122f1f8 221 | 766386bc5cb9eb40419a80d082472d50 222 | 4435a7cb258d37b4fafc3ef0e833582e 223 | 736fa00bfb16f3298883be5e962fe01b 224 | NYT_ENG_20130731.0133 225 | d409fd37c208c5a7a5b2c64b4130b0ec 226 | 5cd7d603e1cf8d2c134d039dc90112f0 227 | 1e9dfabe5e068a4142e768c0c5c37b6b 228 | 36b12cef6f7a805e3e74a4f430129028 229 | e2e2039f203f36b821d15e2cb6f588e0 230 | NYT_ENG_20130816.0151 231 | 856bc3bee118c826c394ed09548db9b2 232 | 909239794c799f2d2e79c023ae090c35 233 | 087f58983ef5e94e54024bc9f0f009ae 234 | b49eee97fd373efbb4cb41926e60e385 235 | f801d26c9b4d7577df089a196e242a04 236 | 2a46fcf4ff6ce3896f249848e48b3b4c 237 | NYT_ENG_20130619.0092 238 | 1a79f9d5c3f784a494196a9bbb586f3b 239 | fab32c473df923a6a9242054c8d23bf3 240 | 1a0f101744b34677ce1e1da1b1b91beb 241 | 4572d22caf3e1924f894002b724f958b 242 | 30eadb19db9f0db62cba7be66862920d 243 | 59a5d2e146c13f7519130193fc773610 244 | 3878ab866ca434318076c4e7eac49c0d 245 | 2d7d6761aad911a63a235a571fa7862f 246 | 4d996a22855cc2ec9f54990a23d51c56 247 | 324274e50f2d07757e2d88ff58a0c33b 248 | AFP_ENG_20100414.0615 249 | 563b1e8fcb1de7a4c0e01da9100d6e09 250 | 5dd42026c76290af6689691fbe2b8d1c 251 | edc4216d65afa47fe7bc6004ac172e92 252 | 2aaa319d1e1a0600837d013cb84290ea 253 | b9109877820d90dbc5efcdda02e6d450 254 | 0f316bb245762eedec6682acbecf2822 255 | 3dc7812b2b39ed067cc7c8ab1218e128 256 | 648fc5834f73b4196b4ceb3daad954f9 257 | 0fab386f8b6527439481f526c92341c7 258 | NYT_ENG_20131220.0283 259 | 36d45aff571e3fbe036f309c18d31668 260 | 
3a0d64b5cb2bc7319e803e344dc695b5 261 | 39280a4d31d81837e17469e18a854116 262 | 670b5425fcd1700e2c27af5f09244cb1 263 | f3e00fa1d34bca154aea0845c628f0e6 264 | ae9a0d394c5e3d3d812c7ffc07c2f836 265 | f18a7b77b1fd1065db9aeaf3f6143a5e 266 | 0536891daea71ab51ee1123137b67146 267 | 6491f0650d9628b84dee6f539df5a53d 268 | 2ba8bbf004fe30c0a01f6fcd25f01dcc 269 | f0aabfc899d1c17b8e99039bb4f80d64 270 | 4ae1669fc17f6b863ff35fa14a960270 271 | 2bbf45266e4ec0ae72977c89ac8d55c1 272 | 0ba982819aaf9f5b94a7cebd48ac6018 273 | 010aaf594ae6ef20eb28e3ee26038375 274 | 0c49bb860962aa0d5b8e3fc277592da0 275 | e972c0257d72aefc52cfdf7e7f5a1623 276 | 82f0af70bf68f4e78e6ea60a339f830d 277 | acdf07c9477b21e1d29c51dc692e085b 278 | 186ef6837e001cd9b97a132c86705545 279 | 389c70a4859f7528cc6e8b84c10766d7 280 | 91147deeeec220cc445a8d546585cdb7 281 | 370e7ee173951eeff13998a416b8b3d0 282 | 9fc05e3fab69893da830adfa6513510d 283 | 3b9b81a3a446c24009c7642da54dbd28 284 | 1bf9912633f942d6d1d4e87df33cee40 285 | a42f7cf822523c76c225602537aefc7a 286 | 4fbb1eec7dfd5c2fefb94a2d873ddfa5 287 | ea4d6baa1d6174c45fce1e6bbb58e1b4 288 | 3059538a2542c71687871b3444f8d921 289 | NYT_ENG_20131121.0250 290 | 661ece467567ffbb54b551dfc1c2c254 291 | 204f8f6bdb24c5198175bf1ed483247b 292 | 1f60eb9697e240af089b134b69c2042d 293 | 44087d95184e9d94f3948f47e9b602af 294 | cade0d91e2e82e4db58efe64d7462c33 295 | 10953ba63f691cb49f47f852b359a6e3 296 | 15c96bac6c08ef94fe249fde914b53d7 297 | 5c59566e9132c060423cad5b2d1bac1e 298 | 57b2773ab54bbc5c119a46fd9be2c4f0 299 | 368df106b2eaa0b4091e099f360a07d6 300 | cb824da90723fed309217c6e28b1c7cd 301 | NYT_ENG_20130828.0147 302 | NYT_ENG_20131225.0200 303 | 3f115570c2fcc85263ba97e0134fb039 304 | 44fd27d40ae65547c3b584c2ff360cd7 305 | 07b79a8764693a80861e5a3e5fd47fa5 306 | 6f9d5ec51264868ada3c2c22c70fc57c 307 | NYT_ENG_20130709.0087 308 | 6837dcaff76ad3235d46708dd89e7306 309 | 2251a78817e67a2adaf0722fd05c7ac0 310 | 97655df62dd4a176b65cf8a2c2a6e82d 311 | fa371b1fbb4d20143e638a7dac6e4f6b 312 | c8930568f1175e8bb0bff9b932a5c2d4 313 | 43326b9fa7deac9d3f8f9e2a0aa0e5cf 314 | 5685a6069312d52a897fe69973269338 315 | 4829d3d91263ed9d8801e6d94c3569a5 316 | 1a11228e8230c359e0f357cbd8240b01 317 | 342431e61e80263f606c46bb5e399cc7 318 | f703536e3212f51cbf26ce47aa7b5eff 319 | 086e26ec92d1cc02f3900e9ac46d6962 320 | 502c46cc149d30f9ad0c25194636dcb6 321 | 33ed1c9fdee1000e2340ac7f92c77752 322 | 609d5112c0386dc4e5f2e90b93cb7a5f 323 | 0fbcb8f76124b9654076889ce04a045b 324 | NYT_ENG_20130603.0111 325 | 824610c87232d345dcc130521f20f72a 326 | 57fb3f87bbb8c3205163ea256f658891 327 | 09098ae4e956a51b038876197814735e 328 | 073020eb350fc73f123bfac8ec485ecc 329 | 48c498c9762046efbece8d183ed996ca 330 | 4b2d9d5984b731dbdd3db398b5fb5e46 331 | 2d8d3572658fdb8754fdc84d2b15f302 332 | 8f575db98ccc3af0a904b650898368dd 333 | e37cfedb8a3a32769a12262eaef9ee0d 334 | 542d2b2755c23b22e9747d8a3b020bf2 335 | 373a3b4bb2a9e67a12c50ad54a1be657 336 | b6b443777e5ca92aa5152f5593960fd9 337 | 3065902101e4282b89ed4ac8f64d4a84 338 | bdca67a0bacec61b5e691d5ca51ba724 339 | 6f13620752b8bd5acf2e1e94c49faef5 340 | XIN_ENG_20101125.0137 341 | NYT_ENG_20130910.0002 342 | 65814a1b2cccd0fd9be5ee3d5068038d 343 | 84828469f40b28161c559e3d01526039 344 | 584b6272bb8c9cc134621ff5ace8c98d 345 | 590baa25bb1cc16c31fd02395edf6835 346 | 39ebaa0bb958e3529b331f4c71025e62 347 | 17f22c2b1e5642b41a9aeedb03261d1a 348 | 081fede2fca345dce82bf6b2355d4ae5 349 | 3e6c7121211de578d7fd831eae801438 350 | 4175e3da216dcc8710a26359e4ecaaad 351 | d4698e3ad06f896058ade2e8f3a09577 352 | d528b874a0a6bd6011279a3239360aa2 353 | 
3f78c311ad97d4bbc6b4914deb4ab1ec 354 | 08ebdc5f0ec8588af38ab1684318d99c 355 | NYT_ENG_20130625.0044 356 | 543e319fb067ef8cba81c74bb13c5711 357 | 2c8bcca93da4097da338a8754e4f03b0 358 | 52355a4167e6ac3a80d19c94ad6259a7 359 | 1557734399e8da2b84a2dd9ddb4eba49 360 | a223ebce2f7481c8feecaba0982b4fa7 361 | 4a3d067b19686b281e0beb437573a28c 362 | 1b268b27094ba9c5feb11192dad940ab 363 | 376c304800b734b2a5a2c87b19eddc2a 364 | 018fb4e59ac5474167ffc5940d7e55e7 365 | NYT_ENG_20131003.0269 366 | 3138f7fb2f8575ed762eb0bc11023d59 367 | AFP_ENG_20100601.0724 368 | 37d781089c669131c5118415cf470422 369 | 3a9a0c07af53fce42e1a55c21826c54d 370 | 0f565d3822dca80336582ffac4adaf78 371 | cfd86b06365dab636d13523c7ed93ad6 372 | c06e8bbdf69f73a69cd3d5dbb4d06a21 373 | d0b9b1747f4a6247294cde9ac0165c60 374 | 416cfc6a5717682cd35d381c5be07734 375 | 22696c601df1a7359e9b629c689700ad 376 | NYT_ENG_20130506.0130 377 | 4764f1400fa336d1fb972719b10b939a 378 | cca700aed62fd497e64e507752409b41 379 | 56c895a1c8dead5698a49321a674f3f4 380 | 17af00d74fca31bceab4ad463bf1c384 381 | 026e0a2c96e90bd8bf9aecde62d7530d 382 | 4eb58398a5c2ef35b16d885c5573b3d4 383 | 5c29f9e575b94c61db8ed52bdfa53843 384 | 3d8f19221d257f81e3376b9e0731d4db 385 | 12bbeaf10a36d36d82824a72352ac178 386 | 362f9d9707c4da0c8068bc7034aae4b4 387 | 026bd1c7eae9f14da9480a4b88ba2fb6 388 | 4683e6affe801713ed4cc9d596b57fac 389 | 2b96d1172d37f60aea5ce64a0b410248 390 | b608865c83b6612bf9ccb4e4c6e66ee7 391 | 584ccaef38f5936e973f0561966bbf06 392 | 0cfdfe102b7a4cb34e1a181c1d36d23d 393 | 83d7cb6d5b663f34dcf83879a8729fb4 394 | 30fa916e5173b52d449300e2ea71b787 395 | 25f868780ac18430a6f10ab4de22ffb8 396 | 4c2488e10c34e5412d3b67e794c9bc84 397 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ERE-ES/dev.doc.txt: -------------------------------------------------------------------------------- 1 | XIN_SPA_20050322.0162 2 | c93832992e8ca0020c806137834bdd38 3 | c70895b8121c60e3a0aeba14f40707ce 4 | XIN_SPA_20021108.0309 5 | APW_SPA_20050201.0692 6 | 4ba25020962498950ff85a88c30648fb 7 | APW_SPA_20060120.0581 8 | APW_SPA_20060717.0443 9 | XIN_SPA_20050403.0126 10 | 295f9c1a5b9bd20e6e20547ffe0db294 11 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ERE-ES/test.doc.txt: -------------------------------------------------------------------------------- 1 | APW_SPA_20040902.1107 2 | XIN_SPA_20050403.0106 3 | 9b0b7d0b89b7fc118363762e1af5ace4 4 | APW_SPA_19950810.0183 5 | XIN_SPA_20050205.0035 6 | 3eeaac8978fc543ffcaa6ac0a1d9a5ed 7 | 29838866bc6ab760d9a7dda4c9c77503 8 | XIN_SPA_20050125.0291 9 | APW_SPA_20060113.0097 10 | APW_SPA_20080609.0519 11 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/splits/ERE-ES/train.doc.txt: -------------------------------------------------------------------------------- 1 | 4592b44df692ae3b8a4f22372d04fa62 2 | e3ac81d75cea10852ef8e85b20b69c74 3 | XIN_SPA_20050403.0111 4 | APW_SPA_20050327.0348 5 | fde63a2d572ab7ebf368318020f0b071 6 | XIN_SPA_20050402.0105 7 | APW_SPA_19940407.0064 8 | 988f859f2ab7d4f4c9d46a4a1783cbba 9 | 989afacd6af08934a6af2cfc81b6c436 10 | APW_SPA_20050312.0510 11 | XIN_SPA_20050403.0051 12 | APW_SPA_20070614.0172 13 | APW_SPA_20050111.0523 14 | 159551b03ba96d920df6080d9b86176e 15 | 4bc0205d8d2ac3edd35fb542701105e2 16 | APW_SPA_19951008.0022 17 | APW_SPA_20050503.0767 18 | da4faa0089a16785f47ca8a67804c3f2 19 | APW_SPA_20000328.0091 20 
| APW_SPA_19990915.0054 21 | APW_SPA_20050521.0623 22 | APW_SPA_20060810.0577 23 | APW_SPA_20050330.0162 24 | XIN_SPA_20050119.0238 25 | APW_SPA_19960612.0124 26 | c1d312ae8c2b67eaf56dfc7eba7e99ad 27 | XIN_SPA_20070922.0015 28 | 1b7885b9a6142dcaecf9e0e9613d6952 29 | 6a022c5cd583f4b9890141ca749997fb 30 | XIN_SPA_20050203.0224 31 | APW_SPA_20051030.0439 32 | 1117479cfa864e6607dcf7f143a808ba 33 | 3135e76164dd737557d83178339e54e7 34 | c8416f15a6cec37e56de13eb82b7b510 35 | XIN_SPA_20050329.0184 36 | APW_SPA_20050108.0071 37 | XIN_SPA_20050403.0139 38 | APW_SPA_20050121.0930 39 | APW_SPA_19960702.0050 40 | APW_SPA_20050302.0582 41 | XIN_SPA_20050401.0038 42 | XIN_SPA_20050403.0122 43 | 8ecee1d57d9d28609e5d56f23b0d76bb 44 | XIN_SPA_20050127.0132 45 | APW_SPA_20050329.0740 46 | XIN_SPA_20050403.0054 47 | 143877c9d95829210efcfe7f995c769b 48 | d7d81874a99b95dea45ba319ae52af8c 49 | XIN_SPA_20090715.0070 50 | XIN_SPA_20050128.0086 51 | XIN_SPA_20050212.0123 52 | APW_SPA_20080826.1135 53 | APW_SPA_19950811.0059 54 | APW_SPA_19951109.0028 55 | a19444cffdf468ac2155654a9d1c8693 56 | APW_SPA_20050124.0920 57 | APW_SPA_20041202.0453 58 | 1226499305419e318a5dc104ab8066c5 59 | a3ad9a4413967891746040cdac56aaf1 60 | APW_SPA_20060425.1120 61 | APW_SPA_20070414.0157 62 | 735dbf2a938d7bf8d6ff3491bdbe0715 63 | 543313af510918445612a0f0b6f79871 64 | XIN_SPA_20050403.0117 65 | 8a9c8bbd3ad2ad5fc85d35fb84a2124d 66 | APW_SPA_20041022.0758 67 | APW_SPA_20050427.1010 68 | XIN_SPA_20050111.0122 69 | XIN_SPA_20050325.0210 70 | 8a6fa6598778ac9922c5a65733b28ead 71 | APW_SPA_19960129.0030 72 | APW_SPA_20010208.0070 73 | XIN_SPA_20090709.0118 74 | APW_SPA_20090403.1130 75 | XIN_SPA_20091117.0065 76 | e7b2d90daec857685a51c9f1a4ad98de 77 | XIN_SPA_20050403.0093 78 | 7984eb82f19ef829045f9876f74f30dc 79 | APW_SPA_20040325.0157 80 | APW_SPA_20050401.0996 81 | APW_SPA_20070828.0549 82 | APW_SPA_20050405.0784 83 | 289a43fb49b37dbe63398decde9625be 84 | d5ae0a896a5b5e40366fc25d48e29fe9 85 | XIN_SPA_20050524.0180 86 | XIN_SPA_20050125.0232 87 | XIN_SPA_20020422.0221 88 | APW_SPA_19991004.0100 89 | XIN_SPA_20050403.0110 90 | APW_SPA_20050119.1163 91 | XIN_SPA_20050120.0219 92 | APW_SPA_20001214.0085 93 | XIN_SPA_20050411.0073 94 | 3f21bcd1dca99ac949bae07cf858f2da 95 | 66bd051ce8a098996903afe59cff69d7 96 | XIN_SPA_20050125.0276 97 | edc4094bbffed34aa86bcc3c3a2ac739 98 | APW_SPA_20090406.0868 99 | 40be1d303aef5e921f8c35d93f753abc 100 | 3cde74ca728f84882b404c78fb9d50bb 101 | APW_SPA_19970720.0046 102 | 5eac974a46f903198596aa69a1ad317d 103 | APW_SPA_20050303.1080 104 | b7a1a9f7a6573dc4e38e5eea61ff0348 105 | APW_SPA_19940405.0063 106 | fc34abcf77bbbdef2847c60183ca49d4 107 | APW_SPA_20050206.0403 108 | APW_SPA_20020527.0102 109 | XIN_SPA_20090726.0126 110 | 2304550f3162898f67ac68c08b390780 111 | APW_SPA_20070125.0656 112 | APW_SPA_20050503.0035 113 | 1f682475f9a8809adfae6f142c34e59d 114 | c60e6b87d095f1dfe39bd2ddd24f9f9a 115 | XIN_SPA_20050403.0040 116 | e7b8b8eea44d88d2ac8737f50479e55a 117 | APW_SPA_20011108.0042 118 | APW_SPA_20050324.1013 119 | APW_SPA_20070605.0825 120 | XIN_SPA_20050102.0066 121 | APW_SPA_20090407.1189 122 | b0a0c2687af1cb1b966f73d232d8367a 123 | APW_SPA_20050211.0074 124 | 718af85e0c59d04709f4349c77753378 125 | 9476d05ff8d63dc664ca6035f87a0ced 126 | XIN_SPA_20030223.0128 127 | APW_SPA_19980914.0097 128 | XIN_SPA_20070713.0191 129 | XIN_SPA_20100610.0160 130 | APW_SPA_19970713.0038 131 | XIN_SPA_20050402.0084 132 | APW_SPA_19950407.0125 133 | 3016fa4a06962bb3e642a3784c6d74b5 134 | XIN_SPA_20050131.0244 135 | 
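The pattern files shown earlier under resource/ere_patterns/ and the valid_patterns files that follow encode the type constraints the OneIE model uses to prune impossible predictions: event_role.json maps each event type to its admissible argument roles, role_entity.json maps each role to the entity types that may fill it, and relation_entity.json does the same for relation arguments. Below is a minimal sketch (not part of this repository; the helper names are illustrative) of how such files can be loaded and applied, assuming it is run from evaluations/supervised-ie/ so the relative paths resolve.

import json

def load_patterns(path):
    # Load one pattern file, e.g. resource/valid_patterns/event_role.json,
    # which maps a type name to a list of admissible labels.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def is_valid_role(event_role, event_type, role):
    # True if `role` is an admissible argument role for `event_type`.
    return role in event_role.get(event_type, [])

def is_valid_filler(role_entity, role, entity_type):
    # True if an entity of `entity_type` may fill `role`.
    return entity_type in role_entity.get(role, [])

# Illustrative checks against the file contents shown in this directory:
event_role = load_patterns('resource/valid_patterns/event_role.json')
role_entity = load_patterns('resource/valid_patterns/role_entity.json')
assert is_valid_role(event_role, 'Conflict:Attack', 'Attacker')
assert not is_valid_role(event_role, 'Life:Marry', 'Attacker')
assert is_valid_filler(role_entity, 'Victim', 'PER')   # Victim must be PER
assert not is_valid_filler(role_entity, 'Victim', 'ORG')

The repository's own loading is done by load_valid_patterns in util.py, which train.py (further below) calls with the generated vocabularies before passing the result to the OneIE model, presumably so that the label names are mapped to vocabulary indices at that point.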
-------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/valid_patterns/event_role.json: -------------------------------------------------------------------------------- 1 | { 2 | "Movement:Transport": [ 3 | "Vehicle", 4 | "Artifact", 5 | "Agent", 6 | "Origin", 7 | "Destination" 8 | ], 9 | "Personnel:Elect": [ 10 | "Place", 11 | "Person", 12 | "Entity" 13 | ], 14 | "Personnel:Start-Position": [ 15 | "Place", 16 | "Person", 17 | "Entity" 18 | ], 19 | "Personnel:Nominate": [ 20 | "Agent", 21 | "Person" 22 | ], 23 | "Personnel:End-Position": [ 24 | "Place", 25 | "Person", 26 | "Entity" 27 | ], 28 | "Conflict:Attack": [ 29 | "Target", 30 | "Place", 31 | "Victim", 32 | "Instrument", 33 | "Attacker" 34 | ], 35 | "Contact:Meet": [ 36 | "Place", 37 | "Entity" 38 | ], 39 | "Life:Marry": [ 40 | "Place", 41 | "Person" 42 | ], 43 | "Transaction:Transfer-Money": [ 44 | "Giver", 45 | "Place", 46 | "Recipient", 47 | "Beneficiary" 48 | ], 49 | "Conflict:Demonstrate": [ 50 | "Place", 51 | "Entity" 52 | ], 53 | "Business:End-Org": [ 54 | "Place", 55 | "Org" 56 | ], 57 | "Justice:Sue": [ 58 | "Defendant", 59 | "Plaintiff", 60 | "Adjudicator", 61 | "Place" 62 | ], 63 | "Life:Injure": [ 64 | "Agent", 65 | "Place", 66 | "Victim", 67 | "Instrument" 68 | ], 69 | "Life:Die": [ 70 | "Person", 71 | "Agent", 72 | "Place", 73 | "Victim", 74 | "Instrument" 75 | ], 76 | "Justice:Arrest-Jail": [ 77 | "Agent", 78 | "Place", 79 | "Person" 80 | ], 81 | "Contact:Phone-Write": [ 82 | "Place", 83 | "Entity" 84 | ], 85 | "Transaction:Transfer-Ownership": [ 86 | "Artifact", 87 | "Beneficiary", 88 | "Buyer", 89 | "Place", 90 | "Seller" 91 | ], 92 | "Business:Start-Org": [ 93 | "Agent", 94 | "Place", 95 | "Org" 96 | ], 97 | "Justice:Execute": [ 98 | "Agent", 99 | "Place", 100 | "Person" 101 | ], 102 | "Justice:Trial-Hearing": [ 103 | "Prosecutor", 104 | "Defendant", 105 | "Place", 106 | "Adjudicator" 107 | ], 108 | "Life:Be-Born": [ 109 | "Place", 110 | "Person" 111 | ], 112 | "Justice:Charge-Indict": [ 113 | "Prosecutor", 114 | "Adjudicator", 115 | "Place", 116 | "Defendant" 117 | ], 118 | "Justice:Convict": [ 119 | "Defendant", 120 | "Place", 121 | "Adjudicator" 122 | ], 123 | "Justice:Sentence": [ 124 | "Adjudicator", 125 | "Place", 126 | "Defendant" 127 | ], 128 | "Business:Declare-Bankruptcy": [ 129 | "Place", 130 | "Org" 131 | ], 132 | "Justice:Release-Parole": [ 133 | "Place", 134 | "Person", 135 | "Entity" 136 | ], 137 | "Justice:Fine": [ 138 | "Adjudicator", 139 | "Place", 140 | "Entity" 141 | ], 142 | "Justice:Pardon": [ 143 | "Adjudicator", 144 | "Place", 145 | "Defendant" 146 | ], 147 | "Justice:Appeal": [ 148 | "Adjudicator", 149 | "Plaintiff", 150 | "Place" 151 | ], 152 | "Justice:Extradite": [ 153 | "Agent", 154 | "Origin", 155 | "Destination" 156 | ], 157 | "Life:Divorce": [ 158 | "Place", 159 | "Person" 160 | ], 161 | "Business:Merge-Org": [ 162 | "Org" 163 | ], 164 | "Justice:Acquit": [ 165 | "Defendant", 166 | "Adjudicator" 167 | ] 168 | } 169 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/valid_patterns/relation_entity.json: -------------------------------------------------------------------------------- 1 | {"ORG-AFF": ["ORG", "PER", "GPE", "FAC"], "GEN-AFF": ["LOC", "PER", "FAC", "ORG", "GPE"], "PHYS": ["LOC", "PER", "FAC", "VEH", "ORG", "GPE"], "PART-WHOLE": ["LOC", "WEA", "PER", "FAC", "VEH", "ORG", "GPE"], "ART": ["WEA", "PER", "VEH", "FAC", "ORG", "GPE"], "PER-SOC": ["ORG", 
"PER"]} -------------------------------------------------------------------------------- /evaluations/supervised-ie/resource/valid_patterns/role_entity.json: -------------------------------------------------------------------------------- 1 | {"Attacker": ["ORG", "PER", "GPE"], "Place": ["LOC", "GPE", "FAC"], "Target": ["LOC", "WEA", "PER", "FAC", "VEH", "ORG"], "Victim": ["PER"], "Agent": ["ORG", "PER", "GPE"], "Entity": ["ORG", "PER", "GPE"], "Instrument": ["WEA", "VEH"], "Artifact": ["WEA", "PER", "VEH", "FAC", "ORG"], "Origin": ["LOC", "GPE", "FAC"], "Vehicle": ["VEH"], "Destination": ["LOC", "GPE", "FAC"], "Buyer": ["ORG", "PER", "GPE"], "Person": ["PER"], "Org": ["ORG", "PER"], "Adjudicator": ["ORG", "PER", "GPE"], "Plaintiff": ["ORG", "PER", "GPE"], "Defendant": ["ORG", "PER", "GPE"], "Prosecutor": ["ORG", "PER", "GPE"], "Giver": ["ORG", "PER", "GPE"], "Seller": ["ORG", "PER", "GPE"], "Recipient": ["ORG", "PER", "GPE"], "Beneficiary": ["ORG", "PER", "GPE"]} -------------------------------------------------------------------------------- /evaluations/supervised-ie/scorer.py: -------------------------------------------------------------------------------- 1 | """Our scorer is adapted from: https://github.com/dwadden/dygiepp""" 2 | 3 | def safe_div(num, denom): 4 | if denom > 0: 5 | return num / denom 6 | else: 7 | return 0 8 | 9 | def compute_f1(predicted, gold, matched): 10 | precision = safe_div(matched, predicted) 11 | recall = safe_div(matched, gold) 12 | f1 = safe_div(2 * precision * recall, precision + recall) 13 | return precision, recall, f1 14 | 15 | 16 | def convert_arguments(triggers, entities, roles): 17 | args = set() 18 | for trigger_idx, entity_idx, role in roles: 19 | arg_start, arg_end, _ = entities[entity_idx] 20 | trigger_label = triggers[trigger_idx][-1] 21 | args.add((arg_start, arg_end, trigger_label, role)) 22 | return args 23 | 24 | 25 | def score_graphs(gold_graphs, pred_graphs, 26 | relation_directional=False): 27 | gold_arg_num = pred_arg_num = arg_idn_num = arg_class_num = 0 28 | gold_trigger_num = pred_trigger_num = trigger_idn_num = trigger_class_num = 0 29 | gold_ent_num = pred_ent_num = ent_match_num = 0 30 | gold_rel_num = pred_rel_num = rel_match_num = 0 31 | gold_men_num = pred_men_num = men_match_num = 0 32 | 33 | for gold_graph, pred_graph in zip(gold_graphs, pred_graphs): 34 | # Entity 35 | gold_entities = gold_graph.entities 36 | pred_entities = pred_graph.entities 37 | gold_ent_num += len(gold_entities) 38 | pred_ent_num += len(pred_entities) 39 | ent_match_num += len([entity for entity in pred_entities 40 | if entity in gold_entities]) 41 | 42 | # Mention 43 | gold_mentions = gold_graph.mentions 44 | pred_mentions = pred_graph.mentions 45 | gold_men_num += len(gold_mentions) 46 | pred_men_num += len(pred_mentions) 47 | men_match_num += len([mention for mention in pred_mentions 48 | if mention in gold_mentions]) 49 | 50 | # Relation 51 | gold_relations = gold_graph.relations 52 | pred_relations = pred_graph.relations 53 | gold_rel_num += len(gold_relations) 54 | pred_rel_num += len(pred_relations) 55 | for arg1, arg2, rel_type in pred_relations: 56 | arg1_start, arg1_end, _ = pred_entities[arg1] 57 | arg2_start, arg2_end, _ = pred_entities[arg2] 58 | for arg1_gold, arg2_gold, rel_type_gold in gold_relations: 59 | arg1_start_gold, arg1_end_gold, _ = gold_entities[arg1_gold] 60 | arg2_start_gold, arg2_end_gold, _ = gold_entities[arg2_gold] 61 | if relation_directional: 62 | if (arg1_start == arg1_start_gold and 63 | arg1_end == arg1_end_gold and 
64 | arg2_start == arg2_start_gold and 65 | arg2_end == arg2_end_gold 66 | ) and rel_type == rel_type_gold: 67 | rel_match_num += 1 68 | break 69 | else: 70 | if ((arg1_start == arg1_start_gold and 71 | arg1_end == arg1_end_gold and 72 | arg2_start == arg2_start_gold and 73 | arg2_end == arg2_end_gold) or ( 74 | arg1_start == arg2_start_gold and 75 | arg1_end == arg2_end_gold and 76 | arg2_start == arg1_start_gold and 77 | arg2_end == arg1_end_gold 78 | )) and rel_type == rel_type_gold: 79 | rel_match_num += 1 80 | break 81 | 82 | # Trigger 83 | gold_triggers = gold_graph.triggers 84 | pred_triggers = pred_graph.triggers 85 | gold_trigger_num += len(gold_triggers) 86 | pred_trigger_num += len(pred_triggers) 87 | for trg_start, trg_end, event_type in pred_triggers: 88 | matched = [item for item in gold_triggers 89 | if item[0] == trg_start and item[1] == trg_end] 90 | if matched: 91 | trigger_idn_num += 1 92 | if matched[0][-1] == event_type: 93 | trigger_class_num += 1 94 | 95 | # Argument 96 | gold_args = convert_arguments(gold_triggers, gold_entities, 97 | gold_graph.roles) 98 | pred_args = convert_arguments(pred_triggers, pred_entities, 99 | pred_graph.roles) 100 | gold_arg_num += len(gold_args) 101 | pred_arg_num += len(pred_args) 102 | for pred_arg in pred_args: 103 | arg_start, arg_end, event_type, role = pred_arg 104 | gold_idn = {item for item in gold_args 105 | if item[0] == arg_start and item[1] == arg_end 106 | and item[2] == event_type} 107 | if gold_idn: 108 | arg_idn_num += 1 109 | gold_class = {item for item in gold_idn if item[-1] == role} 110 | if gold_class: 111 | arg_class_num += 1 112 | 113 | entity_prec, entity_rec, entity_f = compute_f1( 114 | pred_ent_num, gold_ent_num, ent_match_num) 115 | mention_prec, mention_rec, mention_f = compute_f1( 116 | pred_men_num, gold_men_num, men_match_num) 117 | trigger_id_prec, trigger_id_rec, trigger_id_f = compute_f1( 118 | pred_trigger_num, gold_trigger_num, trigger_idn_num) 119 | trigger_prec, trigger_rec, trigger_f = compute_f1( 120 | pred_trigger_num, gold_trigger_num, trigger_class_num) 121 | relation_prec, relation_rec, relation_f = compute_f1( 122 | pred_rel_num, gold_rel_num, rel_match_num) 123 | role_id_prec, role_id_rec, role_id_f = compute_f1( 124 | pred_arg_num, gold_arg_num, arg_idn_num) 125 | role_prec, role_rec, role_f = compute_f1( 126 | pred_arg_num, gold_arg_num, arg_class_num) 127 | 128 | print('Entity: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format( 129 | entity_prec * 100.0, entity_rec * 100.0, entity_f * 100.0)) 130 | print('Mention: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format( 131 | mention_prec * 100.0, mention_rec * 100.0, mention_f * 100.0)) 132 | print('Trigger identification: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format( 133 | trigger_id_prec * 100.0, trigger_id_rec * 100.0, trigger_id_f * 100.0)) 134 | print('Trigger: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format( 135 | trigger_prec * 100.0, trigger_rec * 100.0, trigger_f * 100.0)) 136 | print('Relation: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format( 137 | relation_prec * 100.0, relation_rec * 100.0, relation_f * 100.0)) 138 | print('Role identification: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format( 139 | role_id_prec * 100.0, role_id_rec * 100.0, role_id_f * 100.0)) 140 | print('Role: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format( 141 | role_prec * 100.0, role_rec * 100.0, role_f * 100.0)) 142 | 143 | scores = { 144 | 'entity': {'prec': entity_prec, 'rec': entity_rec, 'f': entity_f}, 145 | 'mention': {'prec': mention_prec, 'rec': mention_rec, 'f': mention_f}, 146 | 'trigger': {'prec': 
trigger_prec, 'rec': trigger_rec, 'f': trigger_f}, 147 | 'trigger_id': {'prec': trigger_id_prec, 'rec': trigger_id_rec, 148 | 'f': trigger_id_f}, 149 | 'role': {'prec': role_prec, 'rec': role_rec, 'f': role_f}, 150 | 'role_id': {'prec': role_id_prec, 'rec': role_id_rec, 'f': role_id_f}, 151 | 'relation': {'prec': relation_prec, 'rec': relation_rec, 152 | 'f': relation_f} 153 | } 154 | return scores 155 | 156 | def score_coref(gold_graphs, pred_graphs): 157 | pass -------------------------------------------------------------------------------- /evaluations/supervised-ie/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | from argparse import ArgumentParser 5 | 6 | import tqdm 7 | import torch 8 | from torch.utils.data import DataLoader 9 | from transformers import (BertTokenizer, BertConfig, AdamW, 10 | get_linear_schedule_with_warmup) 11 | from model import OneIE 12 | from graph import Graph 13 | from config import Config 14 | from data import IEDataset 15 | from scorer import score_graphs 16 | from util import generate_vocabs, load_valid_patterns, save_result, best_score_by_task 17 | 18 | 19 | # configuration 20 | parser = ArgumentParser() 21 | parser.add_argument('-c', '--config', default='config/example.json') 22 | parser.add_argument('-n', '--name', default='default') 23 | args = parser.parse_args() 24 | config = Config.from_json_file(args.config) 25 | # print(config.to_dict()) 26 | 27 | # set GPU device 28 | use_gpu = config.use_gpu 29 | if use_gpu and config.gpu_device >= 0: 30 | torch.cuda.set_device(config.gpu_device) 31 | 32 | # output 33 | output_dir = os.path.join(config.log_path, args.name) 34 | if not os.path.exists(output_dir): 35 | os.mkdir(output_dir) 36 | log_file = os.path.join(output_dir, 'log.txt') 37 | with open(log_file, 'w', encoding='utf-8') as w: 38 | w.write(json.dumps(config.to_dict()) + '\n') 39 | print('Log file: {}'.format(log_file)) 40 | best_role_model = os.path.join(output_dir, 'best.role.mdl') 41 | best_entity_model = os.path.join(output_dir, 'best.entity.mdl') 42 | best_trigger_model = os.path.join(output_dir, 'best.trigger.mdl') 43 | best_relation_model = os.path.join(output_dir, 'best.relation.mdl') 44 | 45 | dev_result_file = os.path.join(output_dir, 'result.dev.json') 46 | test_result_file = os.path.join(output_dir, 'result.test.json') 47 | 48 | # datasets 49 | model_name = config.bert_model_name 50 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 51 | 52 | train_set = IEDataset(config.train_file, gpu=use_gpu, 53 | relation_mask_self=config.relation_mask_self, 54 | relation_directional=config.relation_directional, 55 | symmetric_relations=config.symmetric_relations, 56 | ignore_title=config.ignore_title) 57 | dev_set = IEDataset(config.dev_file, gpu=use_gpu, 58 | relation_mask_self=config.relation_mask_self, 59 | relation_directional=config.relation_directional, 60 | symmetric_relations=config.symmetric_relations) 61 | test_set = IEDataset(config.test_file, gpu=use_gpu, 62 | relation_mask_self=config.relation_mask_self, 63 | relation_directional=config.relation_directional, 64 | symmetric_relations=config.symmetric_relations) 65 | vocabs = generate_vocabs([train_set, dev_set, test_set]) 66 | 67 | train_set.numberize(tokenizer, vocabs) 68 | dev_set.numberize(tokenizer, vocabs) 69 | test_set.numberize(tokenizer, vocabs) 70 | valid_patterns = load_valid_patterns(config.valid_pattern_path, vocabs) 71 | 72 | batch_num = len(train_set) // 
config.batch_size 73 | dev_batch_num = len(dev_set) // config.eval_batch_size + \ 74 | (len(dev_set) % config.eval_batch_size != 0) 75 | test_batch_num = len(test_set) // config.eval_batch_size + \ 76 | (len(test_set) % config.eval_batch_size != 0) 77 | 78 | # initialize the model 79 | model = OneIE(config, vocabs, valid_patterns) 80 | model.load_bert(model_name, cache_dir=config.bert_cache_dir) 81 | if use_gpu: 82 | model.cuda(device=config.gpu_device) 83 | 84 | # optimizer 85 | param_groups = [ 86 | { 87 | 'params': [p for n, p in model.named_parameters() if n.startswith('bert')], 88 | 'lr': config.bert_learning_rate, 'weight_decay': config.bert_weight_decay 89 | }, 90 | { 91 | 'params': [p for n, p in model.named_parameters() if not n.startswith('bert') 92 | and 'crf' not in n and 'global_feature' not in n], 93 | 'lr': config.learning_rate, 'weight_decay': config.weight_decay 94 | }, 95 | { 96 | 'params': [p for n, p in model.named_parameters() if not n.startswith('bert') 97 | and ('crf' in n or 'global_feature' in n)], 98 | 'lr': config.learning_rate, 'weight_decay': 0 99 | } 100 | ] 101 | optimizer = AdamW(params=param_groups) 102 | schedule = get_linear_schedule_with_warmup(optimizer, 103 | num_warmup_steps=batch_num * config.warmup_epoch, 104 | num_training_steps=batch_num * config.max_epoch) 105 | 106 | # model state 107 | state = dict(model=model.state_dict(), 108 | config=config.to_dict(), 109 | vocabs=vocabs, 110 | valid=valid_patterns) 111 | 112 | global_step = 0 113 | global_feature_max_step = int(config.global_warmup * batch_num) + 1 114 | print('global feature max step:', global_feature_max_step) 115 | 116 | tasks = ['entity', 'trigger', 'relation', 'role'] 117 | best_dev = {k: 0 for k in tasks} 118 | for epoch in range(config.max_epoch): 119 | print('Epoch: {}'.format(epoch)) 120 | 121 | # training set 122 | progress = tqdm.tqdm(total=batch_num, ncols=75, 123 | desc='Train {}'.format(epoch)) 124 | optimizer.zero_grad() 125 | for batch_idx, batch in enumerate(DataLoader( 126 | train_set, batch_size=config.batch_size // config.accumulate_step, 127 | shuffle=True, drop_last=True, collate_fn=train_set.collate_fn)): 128 | 129 | loss = model(batch) 130 | loss = loss * (1 / config.accumulate_step) 131 | loss.backward() 132 | 133 | if (batch_idx + 1) % config.accumulate_step == 0: 134 | progress.update(1) 135 | global_step += 1 136 | torch.nn.utils.clip_grad_norm_( 137 | model.parameters(), config.grad_clipping) 138 | optimizer.step() 139 | schedule.step() 140 | optimizer.zero_grad() 141 | progress.close() 142 | 143 | # dev set 144 | progress = tqdm.tqdm(total=dev_batch_num, ncols=75, 145 | desc='Dev {}'.format(epoch)) 146 | best_dev_role_model = False 147 | dev_gold_graphs, dev_pred_graphs, dev_sent_ids, dev_tokens = [], [], [], [] 148 | for batch in DataLoader(dev_set, batch_size=config.eval_batch_size, 149 | shuffle=False, collate_fn=dev_set.collate_fn): 150 | progress.update(1) 151 | graphs = model.predict(batch) 152 | if config.ignore_first_header: 153 | for inst_idx, sent_id in enumerate(batch.sent_ids): 154 | if int(sent_id.split('-')[-1]) < 4: 155 | graphs[inst_idx] = Graph.empty_graph(vocabs) 156 | for graph in graphs: 157 | graph.clean(relation_directional=config.relation_directional, 158 | symmetric_relations=config.symmetric_relations) 159 | dev_gold_graphs.extend(batch.graphs) 160 | dev_pred_graphs.extend(graphs) 161 | dev_sent_ids.extend(batch.sent_ids) 162 | dev_tokens.extend(batch.tokens) 163 | progress.close() 164 | dev_scores = score_graphs(dev_gold_graphs, 
dev_pred_graphs, 165 | relation_directional=config.relation_directional) 166 | for task in tasks: 167 | if dev_scores[task]['f'] > best_dev[task]: 168 | best_dev[task] = dev_scores[task]['f'] 169 | if task == 'role': 170 | print('Saving best role model') 171 | torch.save(state, best_role_model) 172 | best_dev_role_model = True 173 | save_result(dev_result_file, 174 | dev_gold_graphs, dev_pred_graphs, dev_sent_ids, 175 | dev_tokens) 176 | if task == 'entity': 177 | print('Saving best entity model') 178 | torch.save(state, best_entity_model) 179 | 180 | if task == 'trigger': 181 | print('Saving best trigger model') 182 | torch.save(state, best_trigger_model) 183 | 184 | if task == 'relation': 185 | print('Saving best relation model') 186 | torch.save(state, best_relation_model) 187 | 188 | # test set 189 | progress = tqdm.tqdm(total=test_batch_num, ncols=75, 190 | desc='Test {}'.format(epoch)) 191 | test_gold_graphs, test_pred_graphs, test_sent_ids, test_tokens = [], [], [], [] 192 | for batch in DataLoader(test_set, batch_size=config.eval_batch_size, shuffle=False, 193 | collate_fn=test_set.collate_fn): 194 | progress.update(1) 195 | graphs = model.predict(batch) 196 | if config.ignore_first_header: 197 | for inst_idx, sent_id in enumerate(batch.sent_ids): 198 | if int(sent_id.split('-')[-1]) < 4: 199 | graphs[inst_idx] = Graph.empty_graph(vocabs) 200 | for graph in graphs: 201 | graph.clean(relation_directional=config.relation_directional, 202 | symmetric_relations=config.symmetric_relations) 203 | test_gold_graphs.extend(batch.graphs) 204 | test_pred_graphs.extend(graphs) 205 | test_sent_ids.extend(batch.sent_ids) 206 | test_tokens.extend(batch.tokens) 207 | progress.close() 208 | test_scores = score_graphs(test_gold_graphs, test_pred_graphs, 209 | relation_directional=config.relation_directional) 210 | 211 | if best_dev_role_model: 212 | save_result(test_result_file, test_gold_graphs, test_pred_graphs, 213 | test_sent_ids, test_tokens) 214 | 215 | result = json.dumps( 216 | {'epoch': epoch, 'dev': dev_scores, 'test': test_scores}) 217 | with open(log_file, 'a', encoding='utf-8') as w: 218 | w.write(result + '\n') 219 | print('Log file', log_file) 220 | 221 | 222 | best_score_by_task(log_file, os.path.join(output_dir, 'entity.txt'), 'entity') 223 | best_score_by_task(log_file, os.path.join(output_dir, 'trigger.txt'), 'trigger') 224 | best_score_by_task(log_file, os.path.join(output_dir, 'role.txt'), 'role') 225 | best_score_by_task(log_file, os.path.join(output_dir, 'relation.txt'), 'relation') 226 | -------------------------------------------------------------------------------- /evaluations/supervised-ie/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | import lxml.etree as et 5 | from nltk import word_tokenize, sent_tokenize 6 | from copy import deepcopy 7 | 8 | 9 | def generate_vocabs(datasets, coref=False, 10 | relation_directional=False, 11 | symmetric_relations=None): 12 | """Generate vocabularies from a list of data sets 13 | :param datasets (list): A list of data sets 14 | :return (dict): A dictionary of vocabs 15 | """ 16 | entity_type_set = set() 17 | event_type_set = set() 18 | relation_type_set = set() 19 | role_type_set = set() 20 | for dataset in datasets: 21 | entity_type_set.update(dataset.entity_type_set) 22 | event_type_set.update(dataset.event_type_set) 23 | relation_type_set.update(dataset.relation_type_set) 24 | role_type_set.update(dataset.role_type_set) 25 | 26 | # add inverse 
relation types for non-symmetric relations 27 | if relation_directional: 28 | if symmetric_relations is None: 29 | symmetric_relations = [] 30 | relation_type_set_ = set() 31 | for relation_type in relation_type_set: 32 | relation_type_set_.add(relation_type) 33 | if relation_directional and relation_type not in symmetric_relations: 34 | relation_type_set_.add(relation_type + '_inv') 35 | 36 | # entity and trigger labels 37 | prefix = ['B', 'I'] 38 | entity_label_stoi = {'O': 0} 39 | trigger_label_stoi = {'O': 0} 40 | for t in entity_type_set: 41 | for p in prefix: 42 | entity_label_stoi['{}-{}'.format(p, t)] = len(entity_label_stoi) 43 | for t in event_type_set: 44 | for p in prefix: 45 | trigger_label_stoi['{}-{}'.format(p, t)] = len(trigger_label_stoi) 46 | 47 | entity_type_stoi = {k: i for i, k in enumerate(entity_type_set, 1)} 48 | entity_type_stoi['O'] = 0 49 | 50 | event_type_stoi = {k: i for i, k in enumerate(event_type_set, 1)} 51 | event_type_stoi['O'] = 0 52 | 53 | relation_type_stoi = {k: i for i, k in enumerate(relation_type_set, 1)} 54 | relation_type_stoi['O'] = 0 55 | if coref: 56 | relation_type_stoi['COREF'] = len(relation_type_stoi) 57 | 58 | role_type_stoi = {k: i for i, k in enumerate(role_type_set, 1)} 59 | role_type_stoi['O'] = 0 60 | 61 | mention_type_stoi = {'NAM': 0, 'NOM': 1, 'PRO': 2, 'UNK': 3} 62 | 63 | return { 64 | 'entity_type': entity_type_stoi, 65 | 'event_type': event_type_stoi, 66 | 'relation_type': relation_type_stoi, 67 | 'role_type': role_type_stoi, 68 | 'mention_type': mention_type_stoi, 69 | 'entity_label': entity_label_stoi, 70 | 'trigger_label': trigger_label_stoi, 71 | } 72 | 73 | 74 | def load_valid_patterns(path, vocabs): 75 | event_type_vocab = vocabs['event_type'] 76 | entity_type_vocab = vocabs['entity_type'] 77 | relation_type_vocab = vocabs['relation_type'] 78 | role_type_vocab = vocabs['role_type'] 79 | 80 | # valid event-role 81 | valid_event_role = set() 82 | event_role = json.load( 83 | open(os.path.join(path, 'event_role.json'), 'r', encoding='utf-8')) 84 | for event, roles in event_role.items(): 85 | if event not in event_type_vocab: 86 | continue 87 | event_type_idx = event_type_vocab[event] 88 | for role in roles: 89 | if role not in role_type_vocab: 90 | continue 91 | role_type_idx = role_type_vocab[role] 92 | valid_event_role.add(event_type_idx * 100 + role_type_idx) 93 | 94 | # valid relation-entity 95 | valid_relation_entity = set() 96 | relation_entity = json.load( 97 | open(os.path.join(path, 'relation_entity.json'), 'r', encoding='utf-8')) 98 | for relation, entities in relation_entity.items(): 99 | relation_type_idx = relation_type_vocab[relation] 100 | for entity in entities: 101 | entity_type_idx = entity_type_vocab[entity] 102 | valid_relation_entity.add( 103 | relation_type_idx * 100 + entity_type_idx) 104 | 105 | # valid role-entity 106 | valid_role_entity = set() 107 | role_entity = json.load( 108 | open(os.path.join(path, 'role_entity.json'), 'r', encoding='utf-8')) 109 | for role, entities in role_entity.items(): 110 | if role not in role_type_vocab: 111 | continue 112 | role_type_idx = role_type_vocab[role] 113 | for entity in entities: 114 | entity_type_idx = entity_type_vocab[entity] 115 | valid_role_entity.add(role_type_idx * 100 + entity_type_idx) 116 | 117 | return { 118 | 'event_role': valid_event_role, 119 | 'relation_entity': valid_relation_entity, 120 | 'role_entity': valid_role_entity 121 | } 122 | 123 | 124 | def read_ltf(path): 125 | root = et.parse(path, et.XMLParser( 126 | dtd_validation=False, 
encoding='utf-8')).getroot() 127 | doc_id = root.find('DOC').get('id') 128 | doc_tokens = [] 129 | for seg in root.find('DOC').find('TEXT').findall('SEG'): 130 | seg_id = seg.get('id') 131 | seg_tokens = [] 132 | seg_start = int(seg.get('start_char')) 133 | seg_text = seg.find('ORIGINAL_TEXT').text 134 | for token in seg.findall('TOKEN'): 135 | token_text = token.text 136 | start_char = int(token.get('start_char')) 137 | end_char = int(token.get('end_char')) 138 | assert seg_text[start_char - seg_start: 139 | end_char - seg_start + 1 140 | ] == token_text, 'token offset error' 141 | seg_tokens.append((token_text, start_char, end_char)) 142 | doc_tokens.append((seg_id, seg_tokens)) 143 | 144 | return doc_tokens, doc_id 145 | 146 | 147 | def read_txt(path, language='english'): 148 | doc_id = os.path.basename(path) 149 | data = open(path, 'r', encoding='utf-8').read() 150 | data = [s.strip() for s in data.split('\n') if s.strip()] 151 | sents = [l for ls in [sent_tokenize(line, language=language) for line in data] 152 | for l in ls] 153 | doc_tokens = [] 154 | offset = 0 155 | for sent_idx, sent in enumerate(sents): 156 | sent_id = '{}-{}'.format(doc_id, sent_idx) 157 | tokens = word_tokenize(sent) 158 | tokens = [(token, offset + i, offset + i + 1) 159 | for i, token in enumerate(tokens)] 160 | offset += len(tokens) 161 | doc_tokens.append((sent_id, tokens)) 162 | return doc_tokens, doc_id 163 | 164 | 165 | def read_json(path): 166 | with open(path, 'r', encoding='utf-8') as r: 167 | data = [json.loads(line) for line in r] 168 | doc_id = data[0]['doc_id'] 169 | offset = 0 170 | doc_tokens = [] 171 | 172 | for inst in data: 173 | tokens = inst['tokens'] 174 | tokens = [(token, offset + i, offset + i + 1) 175 | for i, token in enumerate(tokens)] 176 | offset += len(tokens) 177 | doc_tokens.append((inst['sent_id'], tokens)) 178 | return doc_tokens, doc_id 179 | 180 | 181 | def read_json_single(path): 182 | with open(path, 'r', encoding='utf-8') as r: 183 | data = [json.loads(line) for line in r] 184 | doc_id = os.path.basename(path) 185 | doc_tokens = [] 186 | for inst in data: 187 | tokens = inst['tokens'] 188 | tokens = [(token, i, i + 1) for i, token in enumerate(tokens)] 189 | doc_tokens.append((inst['sent_id'], tokens)) 190 | return doc_tokens, doc_id 191 | 192 | 193 | def save_result(output_file, gold_graphs, pred_graphs, sent_ids, tokens=None): 194 | with open(output_file, 'w', encoding='utf-8') as w: 195 | for i, (gold_graph, pred_graph, sent_id) in enumerate( 196 | zip(gold_graphs, pred_graphs, sent_ids)): 197 | output = {'sent_id': sent_id, 198 | 'gold': gold_graph.to_dict(), 199 | 'pred': pred_graph.to_dict()} 200 | if tokens: 201 | output['tokens'] = tokens[i] 202 | w.write(json.dumps(output) + '\n') 203 | 204 | 205 | def mention_to_tab(start, end, entity_type, mention_type, mention_id, tokens, token_ids, score=1): 206 | tokens = tokens[start:end] 207 | token_ids = token_ids[start:end] 208 | span = '{}:{}-{}'.format(token_ids[0].split(':')[0], 209 | token_ids[0].split(':')[1].split('-')[0], 210 | token_ids[-1].split(':')[1].split('-')[1])  # end offset of the last token; token_ids[1] would fail on single-token mentions 211 | mention_text = tokens[0] 212 | previous_end = int(token_ids[0].split(':')[1].split('-')[1]) 213 | for token, token_id in zip(tokens[1:], token_ids[1:]): 214 | start, end = token_id.split(':')[1].split('-') 215 | start, end = int(start), int(end) 216 | mention_text += ' ' * (start - previous_end) + token 217 | previous_end = end 218 | return '\t'.join([ 219 | 'json2tab', 220 | mention_id, 221 | mention_text, 222 | span, 223 | 'NIL', 224 | 
entity_type, 225 | mention_type, 226 | str(score) 227 | ]) 228 | 229 | 230 | def json_to_mention_results(input_dir, output_dir, file_name, 231 | bio_separator=' '): 232 | mention_type_list = ['nam', 'nom', 'pro', 'nam+nom+pro'] 233 | file_type_list = ['bio', 'tab'] 234 | writers = {} 235 | for mention_type in mention_type_list: 236 | for file_type in file_type_list: 237 | output_file = os.path.join(output_dir, '{}.{}.{}'.format(file_name, 238 | mention_type, 239 | file_type)) 240 | writers['{}_{}'.format(mention_type, file_type) 241 | ] = open(output_file, 'w') 242 | 243 | json_files = glob.glob(os.path.join(input_dir, '*.json')) 244 | for f in json_files: 245 | with open(f, 'r', encoding='utf-8') as r: 246 | for line in r: 247 | result = json.loads(line) 248 | doc_id = result['doc_id'] 249 | tokens = result['tokens'] 250 | token_ids = result['token_ids'] 251 | bio_tokens = [[t, tid, 'O'] 252 | for t, tid in zip(tokens, token_ids)] 253 | # separate bio output 254 | for mention_type in ['NAM', 'NOM', 'PRO']: 255 | tokens_tmp = deepcopy(bio_tokens) 256 | for start, end, enttype, mentype in result['graph']['entities']: 257 | if mention_type == mentype: 258 | tokens_tmp[start][-1] = 'B-{}'.format(enttype)  # update the tag slot, not the whole [token, id, tag] triple 259 | for token_idx in range(start + 1, end): 260 | tokens_tmp[token_idx][-1] = 'I-{}'.format( 261 | enttype) 262 | writer = writers['{}_bio'.format(mention_type.lower())] 263 | for token in tokens_tmp: 264 | writer.write(bio_separator.join(token) + '\n') 265 | writer.write('\n') 266 | # combined bio output 267 | tokens_tmp = deepcopy(bio_tokens) 268 | for start, end, enttype, _ in result['graph']['entities']: 269 | tokens_tmp[start][-1] = 'B-{}'.format(enttype) 270 | for token_idx in range(start + 1, end): 271 | tokens_tmp[token_idx][-1] = 'I-{}'.format(enttype) 272 | writer = writers['nam+nom+pro_bio'] 273 | for token in tokens_tmp: 274 | writer.write(bio_separator.join(token) + '\n') 275 | writer.write('\n') 276 | # separate tab output 277 | for mention_type in ['NAM', 'NOM', 'PRO']: 278 | writer = writers['{}_tab'.format(mention_type.lower())] 279 | mention_count = 0 280 | for start, end, enttype, mentype in result['graph']['entities']: 281 | if mention_type == mentype: 282 | mention_id = '{}-{}'.format(doc_id, mention_count) 283 | tab_line = mention_to_tab( 284 | start, end, enttype, mentype, mention_id, tokens, token_ids) 285 | writer.write(tab_line + '\n') 286 | # combined tab output 287 | writer = writers['nam+nom+pro_tab'] 288 | mention_count = 0 289 | for start, end, enttype, mentype in result['graph']['entities']: 290 | mention_id = '{}-{}'.format(doc_id, mention_count) 291 | tab_line = mention_to_tab( 292 | start, end, enttype, mentype, mention_id, tokens, token_ids) 293 | writer.write(tab_line + '\n') 294 | for w in writers.values():  # close the file handles, not the dict keys 295 | w.close() 296 | 297 | 298 | def normalize_score(scores): 299 | min_score, max_score = min(scores), max(scores) 300 | if min_score == max_score: 301 | return [0] * len(scores) 302 | return [(s - min_score) / (max_score - min_score) for s in scores] 303 | 304 | 305 | def best_score_by_task(log_file, scores_file, task, max_epoch=1000): 306 | with open(log_file, 'r', encoding='utf-8') as r: 307 | config = r.readline() 308 | 309 | best_scores = [] 310 | best_dev_score = 0 311 | for line in r: 312 | record = json.loads(line) 313 | dev = record['dev'] 314 | test = record['test'] 315 | epoch = record['epoch'] 316 | if epoch > max_epoch: 317 | break 318 | if dev[task]['f'] > best_dev_score: 319 | best_dev_score = dev[task]['f'] 320 | best_scores = [dev, test, epoch] 321 | 322 | 
print('Epoch: {}'.format(best_scores[-1])) 323 | tasks = ['entity', 'mention', 'relation', 'trigger_id', 'trigger', 324 | 'role_id', 'role'] 325 | for t in tasks: 326 | print('{}: dev: {:.2f}, test: {:.2f}'.format(t, best_scores[0][t]['f'] * 100.0, best_scores[1][t]['f'] * 100.0)) 327 | 328 | with open(scores_file, 'w', encoding='utf-8') as f: 329 | for t in tasks: 330 | f.write('{}: dev: {:.2f}, test: {:.2f}'.format(t, best_scores[0][t]['f'] * 100.0, best_scores[1][t]['f'] * 100.0) + '\n') 331 | -------------------------------------------------------------------------------- /gumbel_latent_typer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Liliang Ren. 2 | # 3 | # This source code is licensed under the Apache 2.0 license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import math 10 | 11 | class GumbelLatentTyper(nn.Module): 12 | def __init__( 13 | self, 14 | dim, 15 | num_vars, 16 | temp, 17 | var_dim, 18 | hard = True, 19 | ): 20 | 21 | super().__init__() 22 | 23 | self.input_dim = dim 24 | self.num_vars = num_vars 25 | self.hard = hard 26 | 27 | 28 | self.vars = nn.Parameter(torch.FloatTensor(num_vars, var_dim)) 29 | nn.init.uniform_(self.vars, a=-0.5, b=0.5) 30 | 31 | 32 | self.weight_proj = nn.Linear(self.input_dim, num_vars, bias = False) 33 | nn.init.kaiming_uniform_(self.weight_proj.weight.data, nonlinearity = 'linear') 34 | 35 | self.max_temp, self.min_temp, self.temp_decay = temp 36 | self.curr_temp = self.min_temp 37 | 38 | def set_num_updates(self, num_updates): 39 | #exponential decay 40 | self.curr_temp = max( 41 | self.max_temp * self.temp_decay**num_updates, self.min_temp 42 | ) 43 | 44 | 45 | def forward(self, x, mask=None, deterministic = True): 46 | result = {"num_vars": self.num_vars } 47 | bsz, tsz, fsz = x.shape 48 | 49 | x = self.weight_proj(x) 50 | x = x.view(bsz * tsz, -1) 51 | zero_mask = torch.ones_like(x) 52 | zero_mask[:,0]=0 53 | x = x*zero_mask 54 | 55 | 56 | if mask is not None: 57 | x = x* mask.view(-1,1) 58 | 59 | _, k = x.max(-1) 60 | hard_x = ( 61 | x.new_zeros(*x.shape) 62 | .scatter_(-1, k.view(-1, 1), 1.0) 63 | .view(bsz * tsz, -1) 64 | ) 65 | 66 | 67 | avg_probs = torch.softmax( 68 | x.view(bsz * tsz, -1).float(), dim=-1 69 | ) 70 | result["soft_probs"] = avg_probs 71 | 72 | if mask is not None: 73 | avg_probs = (avg_probs * mask.view(bsz * tsz,1)).sum(0)/mask.sum() 74 | else: 75 | avg_probs = avg_probs.mean(dim=0) 76 | 77 | result["prob_perplexity"] = torch.exp( 78 | -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1) 79 | ).sum() 80 | 81 | if self.training: 82 | x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=self.hard).type_as(x) 83 | else: 84 | if deterministic: 85 | x = hard_x 86 | else: 87 | x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=self.hard).type_as(x) 88 | 89 | 90 | result["gumbel_probs"] = x.view(bsz * tsz, -1) 91 | 92 | x = x.view(bsz * tsz, -1) 93 | 94 | vars = self.vars 95 | mask = torch.ones_like(vars) 96 | mask[0,:]=0 97 | vars = vars * mask 98 | 99 | x = torch.matmul(x, vars) 100 | x = x.view(bsz, tsz, -1) 101 | 102 | result["x"] = x 103 | 104 | return result 105 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Liliang Ren, Zixuan Zhang. 
2 | # 3 | # This source code is licensed under the Apache 2.0 license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | from transformers import RobertaPreTrainedModel, RobertaModel,RobertaForMaskedLM, AutoModel, BertPreTrainedModel, BertModel, BertForMaskedLM 8 | from transformers.modeling_outputs import MaskedLMOutput 9 | from transformers.activations import ACT2FN 10 | from transformers.models.roberta.modeling_roberta import RobertaEmbeddings 11 | 12 | from utils import RobertaConfig 13 | from typing import List, Optional, Tuple, Union 14 | 15 | from decoder import BartDecoder, _make_causal_mask, _expand_mask 16 | 17 | import torch 18 | import math 19 | import torch.nn as nn 20 | 21 | from gumbel_latent_typer import GumbelLatentTyper 22 | 23 | 24 | def gelu(x): 25 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 26 | 27 | 28 | class BertOnlyMLMHead(nn.Module): 29 | def __init__(self, config): 30 | super().__init__() 31 | self.predictions = BertLMPredictionHead(config) 32 | 33 | def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: 34 | prediction_scores = self.predictions(sequence_output) 35 | return prediction_scores 36 | 37 | 38 | class BertLMPredictionHead(nn.Module): 39 | def __init__(self, config): 40 | super().__init__() 41 | self.transform = BertPredictionHeadTransform(config) 42 | 43 | # The output weights are the same as the input embeddings, but there is 44 | # an output-only bias for each token. 45 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 46 | 47 | self.bias = nn.Parameter(torch.zeros(config.vocab_size)) 48 | 49 | # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` 50 | self.decoder.bias = self.bias 51 | 52 | def forward(self, hidden_states): 53 | hidden_states = self.transform(hidden_states) 54 | hidden_states = self.decoder(hidden_states) 55 | return hidden_states 56 | 57 | 58 | class BertPredictionHeadTransform(nn.Module): 59 | def __init__(self, config): 60 | super().__init__() 61 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 62 | if isinstance(config.hidden_act, str): 63 | self.transform_act_fn = ACT2FN[config.hidden_act] 64 | else: 65 | self.transform_act_fn = config.hidden_act 66 | self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 67 | 68 | def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: 69 | hidden_states = self.dense(hidden_states) 70 | hidden_states = self.transform_act_fn(hidden_states) 71 | hidden_states = self.LayerNorm(hidden_states) 72 | return hidden_states 73 | 74 | 75 | 76 | class RobertaLMHead(nn.Module): 77 | def __init__(self, config): 78 | super().__init__() 79 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 80 | self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) 81 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size) 82 | self.bias = nn.Parameter(torch.zeros(config.vocab_size)) 83 | self.decoder.bias = self.bias 84 | 85 | def forward(self, features, **kwargs): 86 | x = self.dense(features) 87 | x = gelu(x) 88 | x = self.layer_norm(x) 89 | # project back to size of vocabulary with bias 90 | x = self.decoder(x) 91 | return x 92 | 93 | def _tie_weights(self): 94 | # To tie those two weights if they get disconnected (on TPU or when the bias is resized) 95 | self.bias = self.decoder.bias 96 | 97 | 98 | class RobertaAutoEncoder(BertPreTrainedModel): 99 | 
100 | def __init__(self, config): 101 | super().__init__(config) 102 | self.model = BertForMaskedLM.from_pretrained("bert-base-uncased") 103 | 104 | self.glm_head = None 105 | self.glm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True) # for conditional generation (LM) 106 | nn.init.constant_(self.glm_head.bias, 0.0) 107 | 108 | self.decoder = BartDecoder(config, self.roberta.embeddings) 109 | 110 | 111 | self.sa_pm = GumbelLatentTyper( 112 | dim = config.hidden_size, 113 | num_vars = 64, 114 | temp = (5, 0.5, 1-3e-5), 115 | var_dim = config.hidden_size, 116 | hard = False, 117 | ) 118 | 119 | self.tie_weights() 120 | 121 | @property 122 | def roberta(self): 123 | return self.model.bert 124 | 125 | @property 126 | def mlm_head(self): 127 | return self.model.cls.predictions 128 | 129 | def tie_weights(self,): 130 | if self.glm_head is not None: 131 | self.glm_head.weight = self.roberta.embeddings.word_embeddings.weight 132 | 133 | self.mlm_head.decoder.weight = self.roberta.embeddings.word_embeddings.weight 134 | 135 | def forward(self, input_ids=None, attention_mask=None, mlm_input_ids=None, mlm_labels=None, decoder_input_ids=None, decoder_attention_mask=None, gen_labels=None, original_tokens=None, return_dict=None): 136 | 137 | 138 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 139 | # loss #1: masked lm loss 140 | masked_sequence_output = self.roberta( 141 | mlm_input_ids, 142 | attention_mask=attention_mask, 143 | return_dict=return_dict 144 | ) 145 | prediction_scores = self.mlm_head(masked_sequence_output[0]) 146 | 147 | masked_lm_loss = None 148 | 149 | if mlm_labels is not None: 150 | loss_fct = nn.CrossEntropyLoss() 151 | masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), mlm_labels.view(-1)) 152 | 153 | # loss #2: reconstruction loss 154 | outputs = self.roberta( 155 | input_ids, 156 | attention_mask=attention_mask, 157 | return_dict=return_dict 158 | ) 159 | sequence_output = outputs[0] 160 | 161 | # sequence_output: (batch, seq_len, dim) 162 | EPS = torch.finfo(sequence_output.dtype).tiny 163 | b,q,c = sequence_output.shape 164 | result = self.sa_pm(sequence_output,mask=attention_mask, deterministic=True) 165 | 166 | div_loss = (result["num_vars"] - result["prob_perplexity"])/result["num_vars"] 167 | 168 | soft_probs = result["soft_probs"].view(b,q,-1)[:,:,0] 169 | reduced_output = (sequence_output * result["x"]) 170 | pm_loss = - torch.log((soft_probs*attention_mask).sum()/attention_mask.sum()+EPS) 171 | 172 | seq_logits = self.decoder( 173 | input_ids=decoder_input_ids, 174 | attention_mask=decoder_attention_mask, 175 | encoder_hidden_states=reduced_output, 176 | encoder_attention_mask=attention_mask 177 | )[0] 178 | 179 | lm_logits = self.glm_head(seq_logits) 180 | gen_loss = None 181 | if gen_labels is not None: 182 | loss_fct = nn.CrossEntropyLoss() 183 | gen_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), gen_labels.view(-1)) 184 | 185 | if masked_lm_loss is None or torch.isnan(masked_lm_loss): 186 | masked_lm_loss = gen_loss.new_zeros(1)[0] 187 | 188 | return masked_lm_loss, gen_loss, pm_loss, div_loss 189 | 190 | 191 | def test_generate(self, input_ids=None, attention_mask=None, original_tokens=None, return_dict=None, tsne=False, return_latent = False): 192 | bs, seq_len = input_ids.shape 193 | decoder_input_ids = torch.zeros(bs, seq_len).long() 194 | decoder_attn_mask = torch.zeros(bs, seq_len).long() 195 | 196 | decoder_input_ids[:, 0:1] = input_ids[:, 0:1] 197 | decoder_attn_mask[:, 0:1] = 
torch.ones(bs, 1).long() 198 | 199 | output_ids = torch.zeros(bs, seq_len).long() 200 | output_ids[:, 0:1] = input_ids[:, 0:1] 201 | 202 | type_str = "" 203 | selected_list = [] 204 | 205 | with torch.no_grad(): 206 | # loss #2: reconstruction loss 207 | outputs = self.roberta( 208 | input_ids, 209 | attention_mask=attention_mask, 210 | return_dict=return_dict 211 | ) 212 | sequence_output = outputs[0] 213 | 214 | # sequence_output: (batch, seq_len, dim) 215 | EPS = torch.finfo(sequence_output.dtype).tiny 216 | b,q,c = sequence_output.shape 217 | result = self.sa_pm(sequence_output, deterministic=True) 218 | gumbel_types = torch.argmax(result["gumbel_probs"], -1)  # gumbel_probs is (bsz * tsz, num_vars); one type id per token 219 | 220 | if tsne: 221 | return sequence_output, gumbel_types 222 | 223 | #only support batch_size = 1 after this line 224 | reduced_output = (sequence_output * result["x"]) 225 | type_ids = [] 226 | for j in range(len(original_tokens[0])): 227 | token = original_tokens[0][j] 228 | type_idx = gumbel_types.tolist()[j] 229 | type_ids.append(type_idx) 230 | if type_idx != 0: 231 | type_str += (token + '(' + str(type_idx)+'), ') 232 | selected_list.append(token) 233 | 234 | if return_latent: 235 | return type_ids 236 | 237 | print("LATENT TYPINGS: ") 238 | print(type_str) 239 | print('\n') 240 | 241 | 242 | for i in range(seq_len - 1): 243 | seq_logits = self.decoder( 244 | input_ids=decoder_input_ids, 245 | attention_mask=decoder_attn_mask, 246 | encoder_hidden_states=reduced_output, 247 | encoder_attention_mask=attention_mask 248 | )[0] 249 | # seq_logits: bs, seq_len, vocab_size 250 | lm_logits = self.glm_head(seq_logits) 251 | selected_logits = lm_logits[:, i, :] 252 | logit_idxs = torch.argmax(selected_logits, 1) 253 | output_ids[:, i+1:i+2] = logit_idxs.unsqueeze(-1) 254 | 255 | decoder_input_ids[:, i+1:i+2] = logit_idxs.unsqueeze(-1) 256 | decoder_attn_mask[:, i+1:i+2] = torch.ones(bs, 1) 257 | 258 | return output_ids 259 | 260 | 261 | 262 | -------------------------------------------------------------------------------- /overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renll/SparseLT/86306e94c27ec79b4ea4e810a262df42798e5ab9/overview.png -------------------------------------------------------------------------------- /pretrain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Zixuan Zhang, Liliang Ren. 2 | # 3 | # This source code is licensed under the Apache 2.0 license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | 7 | import os 8 | import argparse 9 | 10 | from transformers import AutoConfig, AutoTokenizer 11 | 12 | from utils import RobertaConfig 13 | from model import RobertaAutoEncoder 14 | from dataset import PLMDataset, PLMDataCollator, PLMTrainer, PLMTrainingArgs 15 | import wandb 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--name', type=str, default="default_latent_typing") 19 | parser.add_argument('--data', type=str, default="voa_corpus") 20 | parser.add_argument('--local_rank', type=int, default=-1) 21 | parser.add_argument('--alpha', type=float, default=0.05) 22 | parser.add_argument('--beta', type=float, default=0.05) 23 | parser.add_argument('--gamma', type=float, default=0.1) 24 | args = parser.parse_args() 25 | 26 | if args.local_rank in [-1,0]: 27 | wandb.tensorboard.patch(root_logdir="./tb_logs/") 28 | wandb.init(project='latent-typing', sync_tensorboard=True) 29 | 30 | model_name = "bert-base-uncased" 31 | tokenizer = AutoTokenizer.from_pretrained(model_name) 32 | checkpoint_dir = "./checkpoints/" + args.name 33 | dataset_dir = "./data/" + args.data + '.txt' 34 | if not os.path.exists(checkpoint_dir): 35 | os.makedirs(checkpoint_dir, exist_ok=True) 36 | 37 | 38 | 39 | config = AutoConfig.from_pretrained(model_name) 40 | 41 | config.decoder_layers = 1 42 | config.activation_function = config.hidden_act 43 | config.decoder_ffn_dim = config.intermediate_size 44 | config.init_std = config.initializer_range 45 | 46 | print(config) 47 | 48 | model = RobertaAutoEncoder(config) 49 | print(model) 50 | 51 | 52 | training_args = PLMTrainingArgs( 53 | output_dir=checkpoint_dir, 54 | overwrite_output_dir=False, 55 | do_train=True, 56 | do_eval=False, 57 | do_predict=False, 58 | evaluation_strategy='no', 59 | prediction_loss_only=False, 60 | per_device_train_batch_size=32, 61 | per_device_eval_batch_size=8, 62 | gradient_accumulation_steps=1, 63 | eval_accumulation_steps=32, 64 | learning_rate=1e-4, 65 | weight_decay=0.01, 66 | adam_beta1=0.9, 67 | adam_beta2=0.999, 68 | adam_epsilon=1e-8, 69 | max_grad_norm=1., 70 | num_train_epochs=10, 71 | max_steps=100000, 72 | lr_scheduler_type='linear', 73 | warmup_steps=300, 74 | save_steps=10000, 75 | save_total_limit=100, 76 | no_cuda=False, 77 | seed=61820, 78 | local_rank=args.local_rank, 79 | dataloader_drop_last=False, 80 | ) 81 | 82 | training_args.add_loss_weights( 83 | mlm=1, # mlm 84 | alpha = args.alpha, # gen 85 | beta = args.beta, # pm 86 | gamma = args.gamma #diversity 87 | ) 88 | 89 | train_dataset = PLMDataset( 90 | tokenizer=tokenizer, 91 | file_path=dataset_dir, 92 | block_size=128 93 | ) 94 | 95 | data_collator = PLMDataCollator(tokenizer=tokenizer, mlm_probability=0.15) 96 | 97 | 98 | trainer = PLMTrainer( 99 | model=model, 100 | args=training_args, 101 | data_collator=data_collator, 102 | train_dataset=train_dataset 103 | ) 104 | 105 | run_name = args.name+"_mlm_" + str(trainer.args.mlm) + "_gen_" + str(trainer.args.alpha) + "_pm_" + str(trainer.args.beta) + "_div_" + str(trainer.args.gamma) 106 | log_dir = "./tb_logs/" + run_name 107 | 108 | trainer.load_tb(log_dir) 109 | 110 | trainer.train() 111 | trainer.save_model(checkpoint_dir) 112 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.3 2 | torch==1.12.1 3 | transformers==4.21.1 4 | wandb 5 | seqeval 6 | -------------------------------------------------------------------------------- 
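Editor's note on how the four pretraining losses fit together: RobertaAutoEncoder.forward (model.py) returns masked_lm_loss, gen_loss, pm_loss, and div_loss, while pretrain.py only registers their weights via training_args.add_loss_weights(mlm=1, alpha=..., beta=..., gamma=...). The weighted combination itself happens inside PLMTrainer, which is defined in dataset.py and not reproduced in this listing. The sketch below shows the presumed linear combination; the function name combined_loss is illustrative, not part of the codebase.

def combined_loss(masked_lm_loss, gen_loss, pm_loss, div_loss, args):
    # Weights as registered in pretrain.py: args.mlm for masked LM,
    # args.alpha for conditional generation, args.beta for the
    # partial-masking (sparsity) term, args.gamma for type diversity.
    return (args.mlm * masked_lm_loss
            + args.alpha * gen_loss
            + args.beta * pm_loss
            + args.gamma * div_loss)
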
/test_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Zixuan Zhang, Liliang Ren. 2 | # 3 | # This source code is licensed under the Apache 2.0 license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | from transformers import RobertaTokenizerFast, RobertaModel, AutoTokenizer 8 | import torch 9 | from model import RobertaAutoEncoder 10 | 11 | t = AutoTokenizer.from_pretrained("bert-base-uncased") 12 | 13 | 14 | ckpt_dirs = ["./checkpoints/YOUR_FULLMODEL_CKPT_DIR/"] 15 | for ckpt_dir in ckpt_dirs: 16 | print(ckpt_dir.split("/")[-2]) 17 | m = RobertaAutoEncoder.from_pretrained(ckpt_dir) 18 | 19 | 20 | input_sentences = ["She was murdered in her New York office, just days after learning that Waitress had been accepted into the Sundance Film Festival."] 21 | 22 | for input_sentence in input_sentences: 23 | print("INPUT SENTENCE: ") 24 | print(input_sentence + '\n') 25 | input_batch = t(input_sentence, return_tensors="pt") 26 | 27 | input_ids = input_batch["input_ids"] 28 | attn_mask = input_batch["attention_mask"] 29 | # print(input_ids.shape) 30 | input_tokens = [["CLS"] + t.tokenize(input_sentence) + ["SEP"]] 31 | 32 | output_ids = m.test_generate(input_ids=input_ids, attention_mask=attn_mask, original_tokens=input_tokens) 33 | sentence_output = t.decode(output_ids[0], skip_special_tokens=False) 34 | print("OUTPUT SENTENCE: ") 35 | print(sentence_output) 36 | print('\n') 37 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Zixuan Zhang. 2 | # 3 | # This source code is licensed under the Apache 2.0 license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | 7 | from transformers.configuration_utils import PretrainedConfig 8 | 9 | 10 | class BertConfig(PretrainedConfig): 11 | 12 | def __init__( 13 | self, 14 | vocab_size=50265, 15 | hidden_size=1024, 16 | num_hidden_layers=24, 17 | num_attention_heads=16, 18 | intermediate_size=4096, 19 | hidden_act="gelu", 20 | hidden_dropout_prob=0.1, 21 | attention_probs_dropout_prob=0.1, 22 | max_position_embeddings=514, 23 | type_vocab_size=1, 24 | initializer_range=0.02, 25 | layer_norm_eps=1e-05, 26 | pad_token_id=1, 27 | position_embedding_type="absolute", 28 | use_cache=True, 29 | classifier_dropout=None, 30 | activation_function="gelu", 31 | decoder_ffn_dim=4096, 32 | decoder_layers=1, 33 | init_std=0.02, 34 | **kwargs 35 | ): 36 | super().__init__(pad_token_id=pad_token_id, **kwargs) 37 | 38 | self.vocab_size = vocab_size 39 | self.hidden_size = hidden_size 40 | self.num_hidden_layers = num_hidden_layers 41 | self.num_attention_heads = num_attention_heads 42 | self.hidden_act = hidden_act 43 | self.intermediate_size = intermediate_size 44 | self.hidden_dropout_prob = hidden_dropout_prob 45 | self.init_std = init_std 46 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 47 | self.max_position_embeddings = max_position_embeddings 48 | self.type_vocab_size = type_vocab_size 49 | self.initializer_range = initializer_range 50 | self.layer_norm_eps = layer_norm_eps 51 | self.position_embedding_type = position_embedding_type 52 | self.use_cache = use_cache 53 | self.classifier_dropout = classifier_dropout 54 | self.activation_function = activation_function 55 | self.decoder_ffn_dim = decoder_ffn_dim 56 | self.decoder_layers = decoder_layers 57 | 58 | 59 | class RobertaConfig(BertConfig): 60 | 61 | def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): 62 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 63 | 64 | 65 | if __name__ == "__main__": 66 | c = BertConfig() 67 | d = RobertaConfig() 68 | --------------------------------------------------------------------------------
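Editor's note: for readers wiring these pieces together, here is a self-contained usage sketch of the GumbelLatentTyper module from gumbel_latent_typer.py. The shapes are illustrative (bert-base hidden size 768); the hyperparameters mirror what model.py passes in, and eval mode with deterministic=True selects the arg-max latent type instead of Gumbel sampling. Type index 0 is reserved as the "null" type (its logit and codebook row are zeroed out).

import torch
from gumbel_latent_typer import GumbelLatentTyper

typer = GumbelLatentTyper(dim=768, num_vars=64,
                          temp=(5, 0.5, 1 - 3e-5),  # (max_temp, min_temp, decay), as in model.py
                          var_dim=768, hard=False)
typer.eval()  # training=False + deterministic=True -> hard arg-max type selection

hidden = torch.randn(2, 16, 768)   # (batch, seq_len, hidden): encoder states
mask = torch.ones(2, 16)           # attention mask over real tokens
out = typer(hidden, mask=mask, deterministic=True)

print(out["x"].shape)              # (2, 16, 768): selected codebook vector per token
print(out["prob_perplexity"])      # drives the diversity loss in model.py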