├── .gitignore
├── LICENSE
├── README.md
├── convert_checkpoint.py
├── dataset.py
├── decoder.py
├── evaluations
    ├── ner
    │   └── CONTaiNER
    │   │   ├── .gitignore
    │   │   ├── LICENSE
    │   │   ├── README.md
    │   │   ├── calc_micro.sh
    │   │   ├── download.sh
    │   │   ├── exec_container.sh
    │   │   ├── misc
    │   │       ├── __init__.py
    │   │       ├── cvt_conll.py
    │   │       └── cvt_to_lowercase.py
    │   │   ├── process_fewnerd.sh
    │   │   └── src
    │   │       ├── __init__.py
    │   │       ├── calc-micro-avg.py
    │   │       ├── container.py
    │   │       ├── crf.py
    │   │       ├── dataset.py
    │   │       ├── decoder.py
    │   │       ├── embedding.py
    │   │       └── utils.py
    └── supervised-ie
    │   ├── README.md
    │   ├── config.py
    │   ├── config
    │       ├── ace.json
    │       └── ere.json
    │   ├── convert.py
    │   ├── data.py
    │   ├── global_feature.py
    │   ├── graph.py
    │   ├── model.py
    │   ├── predict.py
    │   ├── preprocessing
    │       ├── process_ace.py
    │       ├── process_dygiepp.py
    │       └── process_ere.py
    │   ├── resource
    │       ├── ace_to_aida_entity.tsv
    │       ├── ace_to_aida_event.tsv
    │       ├── ace_to_aida_relation.tsv
    │       ├── ace_to_aida_role.tsv
    │       ├── ere_patterns
    │       │   ├── event_role.json
    │       │   ├── relation_entity.json
    │       │   └── role_entity.json
    │       ├── splits
    │       │   ├── ACE05-CN
    │       │   │   ├── dev.doc.txt
    │       │   │   ├── test.doc.txt
    │       │   │   └── train.doc.txt
    │       │   ├── ACE05-E
    │       │   │   ├── dev.doc.txt
    │       │   │   ├── test.doc.txt
    │       │   │   └── train.doc.txt
    │       │   ├── ACE05-R
    │       │   │   ├── dev.doc.txt
    │       │   │   ├── test.doc.txt
    │       │   │   └── train.doc.txt
    │       │   ├── ERE-EN
    │       │   │   ├── dev.doc.txt
    │       │   │   ├── test.doc.txt
    │       │   │   └── train.doc.txt
    │       │   └── ERE-ES
    │       │   │   ├── dev.doc.txt
    │       │   │   ├── test.doc.txt
    │       │   │   └── train.doc.txt
    │       └── valid_patterns
    │       │   ├── event_role.json
    │       │   ├── relation_entity.json
    │       │   └── role_entity.json
    │   ├── scorer.py
    │   ├── train.py
    │   └── util.py
├── gumbel_latent_typer.py
├── model.py
├── overview.png
├── pretrain.py
├── requirements.txt
├── test_generation.py
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | wandb/
3 | tb_logs/
4 | data/
5 | checkpoints/
6 | *.tar
7 | *.bin
8 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Language Model Pre-Training with Sparse Latent Typing 
  2 | 
  3 | This is the official PyTorch implementation of the paper:
  4 | **Language Model Pre-Training with Sparse Latent Typing**. Liliang Ren\*, Zixuan Zhang\*, Han Wang, Clare R. Voss, Chengxiang Zhai, Heng Ji. (\*Equal Contribution) ***EMNLP 2022 (Oral)***
  5 | [[pdf]](https://aclanthology.org/2022.emnlp-main.96.pdf) [[slides]](https://drive.google.com/file/d/1gTMifRSAyj45izkTPLQE5TMsgH-_WSo5/view?usp=sharing)
  6 | 
  7 | 
  8 | ## Overview
  9 | <p align="center">
 10 | <img src="overview.png" width="100%" />
 11 | </p>
 12 | 
 13 | The figure shows the general architecture of our proposed Gumbel Latent Typing module.
 14 | 
 15 | Our approach is especially effective for the information extraction related downstream tasks. We push state-of-the-art on the [Few-NERD](https://arxiv.org/abs/2105.07464) benchmark for both the *INTRA* and the *INTER* settings with absolute average F1 improvements of 6.24\% and 3.75\% respectively. We also significantly outperform a strong baseline ([OneIE](https://aclanthology.org/2020.acl-main.713/)) on both the ACE2005 and the ERE dataset (Notably, an absolute improvement of 7.59% on the ERE Entity Extraction subtask) by intializing the parameters of its vanilia `BERT-base` encoder with our `BERT-SparseLT` model continually pretrained from a `BERT-base-uncased` checkpoint using our Sparse Latent Typing objective. 
 16 | 
 17 | 
 18 | ## Requirements and Installation
 19 | 
 20 | The required environment can be installed using the followining command line with the Python version `3.8.12`. 
 21 | 
 22 | ```
 23 | pip install -r requirements.txt
 24 | ```
 25 | 
 26 | ## Pretrained Models
 27 | 
 28 | We currently released our `BERT-SparseLT` (also denoted as `BERT-SparseLT-VOA`) model, a `BERT-base-uncased` model continually pretrained on the VOA corpus with the Sparse Latent Typing objective for reproducing the results mentioned in our paper. The model checkpoint can be downloaded from [here](https://drive.google.com/file/d/1Clq-VcdMRLnaEpOlV6BS_chMzY0-iO6Z/view?usp=sharing). 
 29 | 
 30 | ## Data Preparation
 31 | 
 32 | To do continual pretraining, you need to first download the VOA corpus from [here](https://drive.google.com/file/d/1IZ633R2IoBAEQ4lOtJ-aPDMPLbAkfZJr/view?usp=sharing) and put it under the `data/` directory. If you also want to evaluate the pretrained model on the few-shot named entity extraction task, please go to the `evaluations/ner/CONTaiNER/` directory and follow the original *CONTaiNER* [repository](https://github.com/psunlpgroup/CONTaiNER) for data preparation.
 33 | 
 34 | ## Continual Pre-Training
 35 | 
 36 | Our `BERT-SparseLT` model can be continually pretrained from the `BERT-base-uncased` checkpoint on a single V100 GPU with 16GB graphical memory using the following command line.
 37 | 
 38 | ```
 39 | python pretrain.py --name bert_base_voa_sparseLT --alpha 0.05 --beta 0.05 --gamma 0.1
 40 | ```
 41 | 
 42 | This will produce and store the checkpoint for the full autoencoder model (includeing the Gumbel Latent Typer) containing the `BERT-SparseLT` encoder as denoted in the paper.
 43 | 
 44 | You may use the following code snippet to extract the `BERT-SparseLT` model parameters and use it in the same way as the original `BERT-base-uncased` model for the downstream tasks.
 45 | 
 46 | ```
 47 | python convert_checkpoint.py
 48 | ```
 49 | 
 50 | ## Latent Typing and Sentence Reconstruction
 51 | 
 52 | After pretraining, you may use the following code snippet to play with the latent typer and the decoder to do sparse latent typing and sentence reconstruction for any input text. The model directory used in the snippet should store the checkpoint of the full autoenocoder model. 
 53 | 
 54 | ```
 55 | python test_generation.py
 56 | ```
 57 | 
 58 | An example of the latent typing and sentence reconstruction result, the tokens that are not selected for typing (classified as the latent type 0) are ignored:
 59 | 
 60 | ```
 61 | INPUT SENTENCE:
 62 | Our approach provides the decoder model with a shortcut to directly access the encoded token representations, so that the latent representation for each of the input tokens can be learned as an auxiliary type representation.
 63 | 
 64 | LATENT TYPINGS:
 65 | our(20), approach(20), provides(48), deco(19), ##der(13), model(27), with(16), short(49), ##cut(61), directly(18), access(48), encoded(25), token(53), representations(6), so(2), that(59), late(49), ##nt(4), representation(22), each(26), input(25), token(53), can(41), learned(38), as(58), auxiliary(32), type(30), representation(53),
 66 | 
 67 | OUTPUT SENTENCE:
 68 | our approach provides the decoder model with a shortcut to directly access the encoded token representations, so that the latent representation of each of the input tokens can be learned as an auxiliary type representation representation.
 69 | ```
 70 | 
 71 | ## Few-shot Evaluation
 72 | 
 73 | To reproduce the few-shot results in our paper, please go to the directory `evaluations/ner/CONTaiNER/` and run the following bash script to evaluate our model on the INTRA/INTER settings of the Few-NERD benchmark. We are assuming the model checkpoint is stored in the `checkpoints/for_container/` directory. 
 74 | 
 75 | ```
 76 | NAME=model_LP_100k #random model name
 77 | TYPE=intra #change to 'inter' for INTER setting evaluation 
 78 | bash exec_container.sh $TYPE 0 5 1 $NAME  #5-way-1-shot
 79 | bash exec_container.sh $TYPE 0 5 5 $NAME  #5-way-5-shot
 80 | bash exec_container.sh $TYPE 0 10 5 $NAME #10-way-5-shot
 81 | bash exec_container.sh $TYPE 0 10 1 $NAME #10-way-1-shot
 82 | 
 83 | ```
 84 | 
 85 | After producing the outputs, you may also collect the evaluation results and calculate the F1 scores using the `calc_micro.sh` script in that directory.
 86 | 
 87 | 
 88 | ## Citation
 89 | 
 90 | If you found this repository helpful, please cite our paper: 
 91 | 
 92 | ```
 93 | @inproceedings{ren-etal-2022-language,
 94 |     title = "Language Model Pre-Training with Sparse Latent Typing",
 95 |     author = "Ren, Liliang and Zhang, Zixuan and Wang, Han and Voss, Clare and Zhai, ChengXiang and Ji, Heng",
 96 |     booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
 97 |     month = dec,
 98 |     year = "2022",
 99 |     address = "Abu Dhabi, United Arab Emirates",
100 |     publisher = "Association for Computational Linguistics",
101 |     url = "https://aclanthology.org/2022.emnlp-main.96",
102 |     pages = "1480--1494",
103 | }
104 | ```
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/convert_checkpoint.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Liliang Ren.
 2 | #
 3 | # This source code is licensed under the Apache 2.0 license found in the
 4 | # LICENSE file in the root directory of this source tree.
 5 | 
 6 | 
 7 | from model import RobertaAutoEncoder
 8 | import os
 9 | 
10 | in_dir = "./checkpoints/YOUR_MODEL_CKPT_DIR"
11 | out_dir = "./checkpoints/for_container" # Assuming it is used for few-shot eval.
12 | 
13 | os.makedirs(out_dir, exist_ok=True)
14 | 
15 | m = RobertaAutoEncoder.from_pretrained(in_dir)
16 | m.model.save_pretrained(out_dir)
17 | 


--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Zixuan Zhang, Liliang Ren.
  2 | #
  3 | # This source code is licensed under the Apache 2.0 license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | 
  7 | import os
  8 | import torch
  9 | import torch.nn as nn
 10 | 
 11 | from torch.utils.data import Dataset, Sampler
 12 | from torch.utils.tensorboard import SummaryWriter
 13 | from transformers import Trainer, TrainingArguments
 14 | 
 15 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 16 | 
 17 | 
 18 | class PLMDataset(Dataset):
 19 | 
 20 |     def __init__(self, tokenizer, file_path, block_size):
 21 |         assert os.path.isfile(file_path), f"Input file path {file_path} not found"
 22 | 
 23 |         with open(file_path, encoding="utf-8") as f:
 24 |             lines = f.readlines()
 25 | 
 26 |         batch_encoding = tokenizer([line[:-1] for line in lines], add_special_tokens=True, truncation=True, max_length=block_size)
 27 |         examples = batch_encoding["input_ids"]
 28 |         original_tokens = [['START'] + tokenizer.tokenize(line[:-1]) + ['END'] for line in lines]
 29 |         self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long), "original_tokens": original_tokens[i]} for i,e in enumerate(examples)]
 30 | 
 31 |     def __len__(self):
 32 |         return len(self.examples)
 33 | 
 34 |     def __getitem__(self, i):
 35 |         return self.examples[i]
 36 | 
 37 | 
 38 | class PLMDataCollator(object):
 39 | 
 40 |     def __init__(self, tokenizer, mlm_probability=0.15):
 41 |         self.tokenizer = tokenizer
 42 |         self.mlm_probability = mlm_probability
 43 | 
 44 |     def __call__(self, examples):
 45 |         # examples: list of {"input_ids": xxx, "original_tokens": xxx}
 46 |         input_ids = [{"input_ids": example["input_ids"]} for example in examples]
 47 |         original_tokens = [{"original_tokens": example["original_tokens"]} for example in examples]
 48 | 
 49 |         batch_src = self.tokenizer.pad(input_ids, return_attention_mask=True, return_tensors="pt")
 50 |         batch_tgt = self.tokenizer.pad(input_ids, return_attention_mask=True, return_tensors="pt")
 51 | 
 52 |         # If special token mask has been preprocessed, pop it from the dict.
 53 |         tgt_labels = batch_tgt.input_ids[:, 1:].clone()
 54 |         if self.tokenizer.pad_token_id is not None:
 55 |             tgt_labels[tgt_labels == self.tokenizer.pad_token_id] = -100
 56 |         # batch_tgt.input_ids[:, 0] = self.tokenizer.eos_token_id
 57 |         masked_input_ids, masked_labels = self.mask_tokens(batch_src.input_ids)
 58 | 
 59 |         # batch_src 
 60 |         batch = {
 61 |             "input_ids": batch_tgt.input_ids,
 62 |             "attention_mask": batch_src.attention_mask,
 63 |             "masked_input_ids": masked_input_ids,
 64 |             "masked_labels": masked_labels,
 65 |             "decoder_input_ids": batch_tgt.input_ids[:, :-1],
 66 |             "decoder_attention_mask": batch_tgt.attention_mask[:, :-1],
 67 |             "labels": tgt_labels,
 68 |             "original_tokens": original_tokens
 69 |         }
 70 |         return batch
 71 | 
 72 |     def mask_tokens(self, inputs, special_tokens_mask=None):
 73 |         """
 74 |         Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
 75 |         """
 76 |         labels = inputs.clone()
 77 |         # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
 78 |         probability_matrix = torch.full(labels.shape, self.mlm_probability)
 79 |         if special_tokens_mask is None:
 80 |             special_tokens_mask = [
 81 |                 self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
 82 |             ]
 83 |             special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
 84 |         else:
 85 |             special_tokens_mask = special_tokens_mask.bool()
 86 | 
 87 |         probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
 88 |         masked_indices = torch.bernoulli(probability_matrix).bool()
 89 |         labels[~masked_indices] = -100 # We only compute loss on masked tokens
 90 | 
 91 |         # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
 92 |         indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
 93 |         inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
 94 | 
 95 |         # 10% of the time, we replace masked input tokens with random word
 96 |         indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
 97 |         random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
 98 |         inputs[indices_random] = random_words[indices_random]
 99 | 
100 |         # The rest of the time (10% of the time) we keep the masked input tokens unchanged
101 |         return inputs, labels
102 | 
103 | 
104 | class PLMTrainingArgs(TrainingArguments):
105 | 
106 |     def add_loss_weights(self, mlm, alpha, beta, gamma ):
107 |         self.mlm = mlm
108 |         self.alpha = alpha
109 |         self.beta = beta
110 |         self.gamma = gamma
111 | 
112 | 
113 | class PLMTrainer(Trainer):
114 | 
115 |     def load_tb(self, log_dir):
116 |         self.writer = SummaryWriter(log_dir)
117 |         self.global_step = 0
118 | 
119 |     def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]:
120 |         return inputs
121 |     
122 |     def create_optimizer(self):
123 |         opt_model = self.model
124 |         if self.optimizer is None:
125 |             param_optimizer = [(n, p) for n, p in opt_model.named_parameters() if p.requires_grad]
126 |             no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight','layer_norm.bias','layer_norm.weight',]
127 |             slow_lr=['bert']
128 |             optimizer_grouped_parameters = [
129 |                             {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and \
130 |                                             not any(nd in n for nd in slow_lr) ], 'weight_decay': self.args.weight_decay,
131 |                                                          'lr': self.args.learning_rate},
132 |                                         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and \
133 |                                                          any(nd in n for nd in slow_lr) ], 'weight_decay': self.args.weight_decay,
134 |                                                                       'lr': self.args.learning_rate*0.1},
135 |                                                     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and \
136 |                                                                      any(nd in n for nd in slow_lr) ], 'weight_decay': 0.0,
137 |                                                                                   'lr': self.args.learning_rate*0.1},
138 |                                                                 {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and \
139 |                                                                                 not any(nd in n for nd in slow_lr) ], 'weight_decay': 0.0,
140 |                                                                                              'lr': self.args.learning_rate},
141 |                                                                         ]
142 | 
143 |             optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
144 | 
145 |             self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
146 | 
147 |         return self.optimizer
148 | 
149 |     def compute_loss(self, model, inputs, return_outputs=False):
150 |         ''' main model '''
151 |         losses= model(
152 |             input_ids=inputs["input_ids"].cuda(), 
153 |             attention_mask=inputs["attention_mask"].cuda(),
154 |             mlm_input_ids=inputs["masked_input_ids"].cuda(), 
155 |             mlm_labels=inputs["masked_labels"].cuda(), 
156 |             decoder_input_ids=inputs["decoder_input_ids"].cuda(), 
157 |             decoder_attention_mask=inputs["decoder_attention_mask"].cuda(), 
158 |             gen_labels=inputs["labels"].cuda(), 
159 |             original_tokens=inputs["original_tokens"],
160 |             return_dict=None
161 |         )
162 |         mlm_loss, gen_loss, pm_loss, div_loss = losses
163 |         self.writer.add_scalar('sparse_loss', torch.mean(pm_loss).item(), self.global_step)
164 |         self.writer.add_scalar('mlm_loss', torch.mean(mlm_loss).item(), self.global_step)
165 |         self.writer.add_scalar('gen_loss', torch.mean(gen_loss).item(), self.global_step)
166 |         self.writer.add_scalar('div_loss', torch.mean(div_loss).item(), self.global_step)
167 | 
168 |         self.model.sa_pm.set_num_updates(self.global_step)
169 |         self.global_step += 1
170 |         
171 |         return self.args.mlm * mlm_loss + self.args.alpha * gen_loss + self.args.beta * pm_loss + self.args.gamma * div_loss
172 |     
173 | 
174 | 
175 | 


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | outputs/
3 | saved_models/
4 | episode-data/
5 | *.zip
6 | 


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Penn State NLP Group
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/README.md:
--------------------------------------------------------------------------------
1 | This directory contains the source code modified from the original `CONTaiNER` [repository](https://github.com/psunlpgroup/CONTaiNER) for the training and the evaluation pipeline on the Few-NERD dataset.
2 | 
3 | Please follow the original repository for data preparation.
4 | 
5 | 


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/calc_micro.sh:
--------------------------------------------------------------------------------
 1 | G=intra
 2 | 
 3 | DIR=./outputs/few-nerd/${G}/YOUR_OUTPUT_DIR
 4 | 
 5 | python src/calc-micro-avg.py --target_dir ${DIR}/${G}-5-5/ --range 5000
 6 | python src/calc-micro-avg.py --target_dir ${DIR}/${G}-5-1/ --range 5000
 7 | python src/calc-micro-avg.py --target_dir ${DIR}/${G}-10-5/ --range 5000
 8 | python src/calc-micro-avg.py --target_dir ${DIR}/${G}-10-1/ --range 5000
 9 | 
10 | 


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/download.sh:
--------------------------------------------------------------------------------
1 | wget -O episode_data.zip https://cloud.tsinghua.edu.cn/f/56fb277d3fd2437a8ee3/?dl=1
2 | 


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/exec_container.sh:
--------------------------------------------------------------------------------
 1 | export G=$1
 2 | export GPU=$2
 3 | export WAY=$3
 4 | export SHOT=$4
 5 | export SAVED_MODEL_DIR=$5
 6 | export way=${WAY}
 7 | export shot=${SHOT}
 8 | echo $SAVED_MODEL_DIR
 9 | echo $shot $way
10 | export finetune_loss=KL
11 | export is_viterbi=viterbi
12 | 
13 | MODEL="../../../checkpoints/for_container/"
14 | CONFIG="../../../checkpoints/for_container/"
15 | 
16 | ## training with toy evaluation for sanity check
17 | python src/container.py --data_dir data/few-nerd/${G} --labels-train data/few-nerd/${G}/labels_train.txt --labels-test data/few-nerd/${G}/labels_test.txt --config_name $CONFIG --model_name_or_path $MODEL --saved_model_dir saved_models/few-nerd/${G}/${SAVED_MODEL_DIR} --output_dir outputs/few-nerd/${G}/${finetune_loss}_${is_viterbi}_final_5000_${SAVED_MODEL_DIR}/${G}-${way}-${shot}/ --support_path support_test_${way}_${shot}/ --test_path query_test_${way}_${shot}/ --n_shots ${shot} --max_seq_length 128 --embedding_dimension 128 --num_train_epochs 1 --train_batch_size 32 --seed 1 --do_train --do_predict --select_gpu ${GPU} --training_loss KL --finetune_loss ${finetune_loss} --evaluation_criteria euclidean_hidden_state --consider_mutual_O --learning_rate 1e-4 --learning_rate_finetuning 1e-4
18 | 
19 | ## evaluation
20 | echo $shot $way 
21 | python src/container.py --data_dir data/few-nerd/${G} --labels-train data/few-nerd/${G}/labels_train.txt --labels-test data/few-nerd/${G}/labels_test.txt --config_name $CONFIG --model_name_or_path $MODEL --saved_model_dir saved_models/few-nerd/${G}/${SAVED_MODEL_DIR} --output_dir outputs/few-nerd/${G}/${finetune_loss}_${is_viterbi}_final_5000_${SAVED_MODEL_DIR}/${G}-${way}-${shot}/ --support_path support_test_${way}_${shot}/ --test_path query_test_${way}_${shot}/ --n_shots ${shot} --max_seq_length 128 --embedding_dimension 128 --num_train_epochs 1 --train_batch_size 32 --seed 1 --do_predict --select_gpu ${GPU} --training_loss KL --finetune_loss ${finetune_loss} --evaluation_criteria euclidean_hidden_state --learning_rate 1e-4 --learning_rate_finetuning 1e-4 --consider_mutual_O --temp_trans 0.01 --silent
22 | 


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/misc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renll/SparseLT/86306e94c27ec79b4ea4e810a262df42798e5ab9/evaluations/ner/CONTaiNER/misc/__init__.py


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/misc/cvt_conll.py:
--------------------------------------------------------------------------------
 1 | import jsonlines
 2 | import os
 3 | import argparse
 4 | import glob
 5 | from tqdm import tqdm
 6 | 
 7 | 
 8 | def convert_file(input_name, sup_output_name, query_output_name):
 9 |     if not os.path.exists(sup_output_name):
10 |         os.makedirs(sup_output_name)
11 |     #okay
12 |     if not os.path.exists(query_output_name):
13 |         os.makedirs(query_output_name)
14 |     reader = jsonlines.open(input_name)
15 |     for ct, dicts in enumerate(reader):
16 |         supdict = dicts["support"]
17 |         supwords = supdict["word"]
18 |         suplabels = supdict["label"]
19 |         querydict = dicts["query"]
20 |         querywords = querydict["word"]
21 |         querylabels = querydict["label"]
22 |         str1 = ''
23 |         for i in range(len(supwords)):
24 |             for j in range(len(supwords[i])):
25 |                 str1 = str1 + supwords[i][j] + '\t' + suplabels[i][j] + '\n'
26 |             str1 += '\n'
27 |         out_file = open(sup_output_name + '/' + str(ct) + '.txt', 'w', encoding='utf-8')
28 |         out_file.write(str1)
29 |         out_file.close()
30 |         str2 = ''
31 |         for i in range(len(querywords)):
32 |             for j in range(len(querywords[i])):
33 |                 str2 = str2 + querywords[i][j] + '\t' + querylabels[i][j] + '\n'
34 |             str2 += '\n'
35 |         out_file = open(query_output_name + '/' + str(ct) + '.txt', 'w', encoding='utf-8')
36 |         out_file.write(str2)
37 |         out_file.close()
38 |     
39 | 
40 | if __name__ == '__main__':
41 |     output_base_dir = 'data/few-nerd'
42 |     support_file_prefix = 'support_'
43 |     query_file_prefix = 'query_'
44 |     print(os.getcwd())    
45 |     # ensure the folders are already created beforehand
46 |     
47 |     all_input_files = glob.glob('**/*.jsonl', recursive=True)
48 | 
49 |     for file in tqdm(all_input_files):
50 |         input_base_name = os.path.basename(file).split('.')[0] # just take the base name
51 |         if 'test' in file:
52 |             target_split_text = 'inter' if 'inter' in file else 'intra'
53 |             sup_output_name = os.path.join(output_base_dir, target_split_text, support_file_prefix + input_base_name)
54 |             query_output_name = os.path.join(output_base_dir, target_split_text, query_file_prefix + input_base_name)
55 |             convert_file(file, sup_output_name, query_output_name)
56 |         
57 |         


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/misc/cvt_to_lowercase.py:
--------------------------------------------------------------------------------
 1 | #makeshift
 2 | directories = ['data/few-nerd/inter/train.txt', 'data/few-nerd/intra/train.txt']
 3 | 
 4 | for file1 in directories:
 5 |   final_str = ""
 6 |   with open(file1, 'r') as f:
 7 |     lines = f.readlines()
 8 |     for line in lines:
 9 |       cmpnnts = line.split('\t')
10 |       if len(cmpnnts) < 2:
11 |         final_str += line 
12 |         continue
13 |       c1, c2 = cmpnnts
14 |       c1 = c1.lower()
15 |       final_str += (c1 + '\t' + c2 )
16 | 
17 |   f = open(file1, 'w')
18 |   f.write(final_str)
19 |   f.close()


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/process_fewnerd.sh:
--------------------------------------------------------------------------------
1 | unzip episode_data.zip &&
2 | python misc/cvt_conll.py &&
3 | python misc/cvt_to_lowercase.py
4 | # cleanup
5 | rm -rf episode*
6 | 


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renll/SparseLT/86306e94c27ec79b4ea4e810a262df42798e5ab9/evaluations/ner/CONTaiNER/src/__init__.py


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/src/calc-micro-avg.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import numpy as np
 3 | import glob
 4 | import os
 5 | import argparse
 6 | 
 7 | 
 8 | def main():
 9 | 
10 |     parser = argparse.ArgumentParser()
11 |     parser.add_argument("--target_dir",
12 |             default=None,
13 |             type=str,
14 |             required=True,
15 |             help="The input data dir.",)
16 |     parser.add_argument("--range",
17 |                         default=5000,
18 |                         type=int,
19 |                         required=False,
20 |                         help="ranging the amount of data to consider in count")
21 |     args = parser.parse_args()
22 |     path = args.target_dir
23 |     s = []
24 | 
25 |     target_res_text = 'results.txt'
26 |     if args.range == None:
27 |         s = glob.glob(os.path.join(path, '*', target_res_text))
28 |     else:
29 |         for i in range(0, args.range):
30 |             try:
31 |                 s.append(glob.glob(os.path.join(path, str(i), target_res_text))[0])
32 |             except:
33 |                 print("Missing file: " + str(i))
34 | 
35 |     precisions = []
36 |     recalls = []
37 |     f1s = []
38 |     pred_sum = []
39 |     tp_sum = []
40 |     true_sum = []
41 | 
42 |     for file in s:
43 |         with open(file) as f:
44 |             lines = f.readlines()
45 |             pred_sum.append(int(lines[6].split()[-1]))
46 |             tp_sum.append(int(lines[8].split()[-1]))
47 |             true_sum.append(int(lines[9].split()[-1]))
48 |     recall = np.sum(tp_sum) / np.sum(true_sum)
49 |     precision = np.sum(tp_sum) / np.sum(pred_sum)
50 |     f1 = (2 * precision * recall) / (precision + recall)
51 | 
52 |     print("avg. f1 = %f" % (f1) )
53 |     print("avg. precision = %f" % (precision))
54 |     print("avg. recall = %f" % (recall))
55 |     print("covered = %f" % len(tp_sum))
56 | 
57 | 
58 | 
59 | 
60 | if __name__ == "__main__":
61 |     main()


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/src/crf.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 
  3 | Conditional Random Fields
  4 | Reference: https://aclanthology.org/2020.emnlp-main.516.pdf
  5 | """
  6 | 
  7 | import torch
  8 | import torch.nn as nn
  9 | from dataset import IdxMaps
 10 | 
 11 | 
 12 | START_ID = 0
 13 | O_ID = 1
 14 | 
 15 | 
 16 | class CRFInference:
 17 |     """
 18 |     Inference part of the generalized CRF model
 19 |     """
 20 | 
 21 |     def __init__(self, n_tag, trans_priors, power):
 22 |         """
 23 |         We assume the batch size is 1, so no need to worry about PAD for now
 24 |         n_tag: START, O, and I_Xs
 25 |         """
 26 |         super().__init__()
 27 |         self.transitions = self.trans_expand(n_tag, trans_priors, power)
 28 | 
 29 |     @staticmethod
 30 |     def trans_expand(n_tag, priors, power):
 31 |         s_o, s_i, o_o, o_i, i_o, i_i, x_y = priors
 32 |         # self transitions for I-X tags
 33 |         a = torch.eye(n_tag) * i_i
 34 |         # transitions from I-X to I-Y
 35 |         b = torch.ones(n_tag, n_tag) * x_y / (n_tag - 3)
 36 |         c = torch.eye(n_tag) * x_y / (n_tag - 3)
 37 |         transitions = a + b - c
 38 |         # transition from START to O
 39 |         transitions[START_ID, O_ID] = s_o
 40 |         # transitions from START to I-X
 41 |         transitions[START_ID, O_ID+1:] = s_i / (n_tag - 2)
 42 |         # transition from O to O
 43 |         transitions[O_ID, O_ID] = o_o
 44 |         # transitions from O to I-X
 45 |         transitions[O_ID, O_ID+1:] = o_i / (n_tag - 2)
 46 |         # transitions from I-X to O
 47 |         transitions[O_ID+1:, O_ID] = i_o
 48 |         # no transitions to START
 49 |         transitions[:, START_ID] = 0.
 50 | 
 51 |         powered = torch.pow(transitions, power)
 52 |         summed = powered.sum(dim=1)
 53 | 
 54 |         transitions = powered / summed.view(n_tag, 1)
 55 | 
 56 |         transitions = torch.where(transitions > 0, transitions, torch.tensor(.000001))
 57 | 
 58 |         # print(transitions)
 59 |         # print(torch.sum(transitions, dim=1))
 60 |         return torch.log(transitions)
 61 | 
 62 |     def forward(self, scores: torch.Tensor) -> torch.Tensor:  # type: ignore
 63 |         """
 64 |         Take the emission scores calculated by NERModel, and return a tensor of CRF features,
 65 |         which is the sum of transition scores and emission scores.
 66 |         :param scores: emission scores calculated by NERModel.
 67 |             shape: (batch_size, sentence_length, ntags)
 68 |         :return: a tensor containing the CRF features whose shape is
 69 |             (batch_size, sentence_len, ntags, ntags). F[b, t, i, j] represents
 70 |             emission[t, j] + transition[i, j] for the b'th sentence in this batch.
 71 |         """
 72 |         batch_size, sentence_len, _ = scores.size()
 73 | 
 74 |         # expand the transition matrix batch-wise as well as sentence-wise
 75 |         transitions = self.transitions.expand(batch_size, sentence_len, -1, -1)
 76 | 
 77 |         # add another dimension for the "from" state, then expand to match
 78 |         # the dimensions of the expanded transition matrix above
 79 |         emissions = scores.unsqueeze(2).expand_as(transitions)
 80 | 
 81 |         # add them up
 82 |         return transitions + emissions
 83 | 
 84 |     @staticmethod
 85 |     def viterbi(features: torch.Tensor) -> torch.Tensor:
 86 |         """
 87 |         Decode the most probable sequence of tags.
 88 |         Note that the delta values are calculated in the log space.
 89 |         :param features: the feature matrix from the forward method of CRF.
 90 |             shaped (batch_size, sentence_len, ntags, ntags)
 91 |         :return: a tensor containing the most probable sequences for the batch.
 92 |             shaped (batch_size, sentence_len)
 93 |         """
 94 |         batch_size, sentence_len, ntags, _ = features.size()
 95 | 
 96 |         # initialize the deltas
 97 |         delta_t = features[:, 0, START_ID, :]
 98 |         deltas = [delta_t]
 99 | 
100 |         # use dynamic programming to iteratively calculate the delta values
101 |         for t in range(1, sentence_len):
102 |             f_t = features[:, t]
103 |             delta_t, _ = torch.max(f_t + delta_t.unsqueeze(2).expand_as(f_t), 1)
104 |             deltas.append(delta_t)
105 | 
106 |         # now iterate backward to figure out the most probable tags
107 |         sequences = [torch.argmax(deltas[-1], 1, keepdim=True)]
108 |         for t in reversed(range(sentence_len - 1)):
109 |             f_prev = features[:, t + 1].gather(
110 |                 2, sequences[-1].unsqueeze(2).expand(batch_size, ntags, 1)).squeeze(2)
111 |             sequences.append(torch.argmax(f_prev + deltas[t], 1, keepdim=True))
112 |         sequences.reverse()
113 |         return torch.cat(sequences, dim=1)
114 | 
115 | 
116 | class CRF(nn.Module):
117 |     """
118 |     Linear Chain CRF
119 |     """
120 | 
121 |     def __init__(self, ntags: int):
122 |         """
123 |         Initialize the Linear Chain CRF layer.
124 |         :param ntags: number of tags. Usually from IdxMaps
125 |         """
126 |         super().__init__()
127 |         transitions = torch.empty(ntags, ntags)
128 |         nn.init.uniform_(transitions, -0.1, 0.1)
129 |         # can't transition into START
130 |         transitions[:, IdxMaps.START_ID] = -10000.0
131 | 
132 |         self.transitions = nn.Parameter(transitions)  # type: ignore
133 | 
134 |     def forward(self, scores: torch.Tensor) -> torch.Tensor:  # type: ignore
135 |         """
136 |         Take the emission scores calculated by NERModel, and return a tensor of CRF features,
137 |         which is the sum of transition scores and emission scores.
138 |         :param scores: emission scores calculated by NERModel.
139 |             shape: (batch_size, sentence_length, ntags)
140 |         :return: a tensor containing the CRF features whose shape is
141 |             (batch_size, sentence_len, ntags, ntags). F[b, t, i, j] represents
142 |             emission[t, j] + transition[i, j] for the b'th sentence in this batch.
143 |         """
144 |         batch_size, sentence_len, _ = scores.size()
145 | 
146 |         # expand the transition matrix batch-wise as well as sentence-wise
147 |         transitions = self.transitions.expand(batch_size, sentence_len, -1, -1)
148 | 
149 |         # add another dimension for the "from" state, then expand to match
150 |         # the dimensions of the expanded transition matrix above
151 |         emissions = scores.unsqueeze(2).expand_as(transitions)
152 | 
153 |         # add them up
154 |         return transitions + emissions
155 | 
156 |     @staticmethod
157 |     def forward_alg(features: torch.Tensor) -> torch.Tensor:
158 |         """
159 |         Calculate the log alpha values using the forward algorithm.
160 |         :param features: the features matrix from the forward method of CRF
161 |             shaped (batch_size, sentence_len, ntags, ntags)
162 |         :return: the tensor that represents a series of alpha values for the batch
163 |             whose shape is (batch_size, sentence_len)
164 |         """
165 |         _, sentence_len, _, _ = features.size()
166 | 
167 |         # initialize the alpha value
168 |         alpha_t = features[:, 0, IdxMaps.START_ID, :]
169 |         alphas = [alpha_t]
170 | 
171 |         # use dynamic programming to iteratively calculate the alpha value
172 |         for t in range(1, sentence_len):
173 |             f_t = features[:, t]
174 |             alpha_t = torch.logsumexp(f_t + alpha_t.unsqueeze(2).expand_as(f_t), 1)
175 |             alphas.append(alpha_t)
176 | 
177 |         # return all the alpha values
178 |         return torch.stack(alphas, dim=1)
179 | 
180 |     @staticmethod
181 |     def tags_score(tags: torch.Tensor, features: torch.Tensor) -> torch.Tensor:
182 |         """
183 |         Calculate the score for the given sequence of tags.
184 |         :param tags: a batch of sequences of tags whose shape is (batch_sizee, sentence_len)
185 |         :param features: the features matrix from the forward method of CRF.
186 |             shaped (batch_size, sentence_len, ntags, ntags)
187 |         :return: a tensor with scores for the given sequences of tags.
188 |             shaped (batch_size,)
189 |         """
190 |         batch_size, sentence_len, ntags, _ = features.size()
191 | 
192 |         # we first collect all the features whose "to" tag is given by tags,
193 |         # i.e. F[b, t, i, *tags]
194 |         # the resulting dimension is (batch, sentence_len, ntags, 1)
195 |         to_idx = tags.view(-1, sentence_len, 1, 1).expand(-1, -1, ntags, -1)
196 |         to_scores = features.gather(3, to_idx)
197 | 
198 |         # now out of to_scores, gather all the features whose "from" tag is
199 |         # given by tags plus the start tag.
200 |         # i.e. F[b, t, *[start + tags], j]
201 |         # the resulting dimension is (batch, sentence_len, 1, 1)
202 |         from_idx = torch.cat(
203 |             (torch.tensor(IdxMaps.START_ID).expand(batch_size, 1).to(tags.device), tags[:, :-1]),
204 |             dim=1
205 |         )
206 |         scores = to_scores.gather(2, from_idx.view(-1, sentence_len, 1, 1))
207 | 
208 |         # we've now gathered all the right scores, so sum them up!
209 |         return torch.sum(scores.view(-1, sentence_len), dim=1)
210 | 
211 |     @staticmethod
212 |     def viterbi(features: torch.Tensor) -> torch.Tensor:
213 |         """
214 |         Decode the most probable sequence of tags.
215 |         Note that the delta values are calculated in the log space.
216 |         :param features: the feature matrix from the forward method of CRF.
217 |             shaped (batch_size, sentence_len, ntags, ntags)
218 |         :return: a tensor containing the most probable sequences for the batch.
219 |             shaped (batch_size, sentence_len)
220 |         """
221 |         batch_size, sentence_len, ntags, _ = features.size()
222 | 
223 |         # initialize the deltas
224 |         delta_t = features[:, 0, IdxMaps.START_ID, :]
225 |         deltas = [delta_t]
226 | 
227 |         # use dynamic programming to iteratively calculate the delta values
228 |         for t in range(1, sentence_len):
229 |             f_t = features[:, t]
230 |             delta_t, _ = torch.max(f_t + delta_t.unsqueeze(2).expand_as(f_t), 1)
231 |             deltas.append(delta_t)
232 | 
233 |         # now iterate backward to figure out the most probable tags
234 |         sequences = [torch.argmax(deltas[-1], 1, keepdim=True)]
235 |         for t in reversed(range(sentence_len - 1)):
236 |             f_prev = features[:, t + 1].gather(
237 |                 2, sequences[-1].unsqueeze(2).expand(batch_size, ntags, 1)).squeeze(2)
238 |             sequences.append(torch.argmax(f_prev + deltas[t], 1, keepdim=True))
239 |         sequences.reverse()
240 |         return torch.cat(sequences, dim=1)
241 | 


--------------------------------------------------------------------------------
/evaluations/ner/CONTaiNER/src/embedding.py:
--------------------------------------------------------------------------------
 1 | from typing import TextIO, Tuple
 2 | import numpy as np
 3 | 
 4 | 
 5 | class GloveEmbedding(dict):
 6 |     """
 7 |     Class with the pretrained 100d glove embeddings.
 8 |     Note: glove embedding is for lower case tokens.
 9 |     """
10 |     DIM_EMDEDDING = 100
11 | 
12 |     def __init__(self, fileh: TextIO):
13 |         """
14 |         Initialize a GloveEmbedding instance.
15 |         :param fileh: a file handler for the glove embeddings
16 |         """
17 |         super().__init__()
18 |         self.load(fileh)
19 | 
20 |     def load(self, fileh: TextIO):
21 |         """
22 |         Load and parse each line of the glove embeddiings file.
23 |         :param fileh: the glove embeddings file to be loaded
24 |         """
25 |         for line in fileh:
26 |             token, embedding = GloveEmbedding.split(str(line))
27 |             self[token.lower()] = embedding
28 | 
29 |     @staticmethod
30 |     def split(line: str) -> Tuple[str, np.ndarray]:
31 |         """
32 |         Split the given line into a token and its embedding vector.
33 |         :param line: line to be splitted into a token and its embedding vector
34 |         :return: a tuple of a token and its embedding vector (numpy array)
35 |         """
36 |         token, vals = line.split(None, 1)
37 |         return token, np.array([float(v) for v in vals.split()], dtype=np.float)
38 | 
39 |     @classmethod
40 |     def random(cls) -> np.ndarray:
41 |         """
42 |         Return a random vector with the right scale.
43 |         :return: a random numpy vector
44 |         """
45 |         dim = cls.DIM_EMDEDDING
46 |         scale = np.sqrt(3.0 / dim)
47 |         return np.random.uniform(-scale, scale, dim)
48 | 
49 |     @classmethod
50 |     def zeros(cls) -> np.ndarray:
51 |         """
52 |         Return a zero vector.
53 |         :return: a zero numpy vector
54 |         """
55 |         return np.zeros(cls.DIM_EMDEDDING, dtype=np.float)
56 | 
57 |     def get(self, token: str, default=None) -> np.ndarray:
58 |         """
59 |         Get the glove embedding if the token is found, else the given default or a random vector.
60 |         :param token: a token to be looked up
61 |         :param default: a default to be returned if the given token is not found
62 |         :return: the glove embedding, default or a random vector
63 |         """
64 |         token = token.lower()
65 |         ret = super(GloveEmbedding, self).get(token)
66 |         if ret is not None:
67 |             return ret
68 |         elif default is not None:
69 |             return default
70 |         return self.random()
71 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/README.md:
--------------------------------------------------------------------------------
  1 | OneIE v0.4.8
  2 | 
  3 | # Requirements
  4 | 
  5 | Python 3.7
  6 | Python packages
  7 | - PyTorch 1.0+ (Install the CPU version if you use this tool on a machine without GPUs)
  8 | - transformers 3.0.2 (It seems using transformers 3.1+ may cause some model loading issue)
  9 | - tqdm
 10 | - lxml
 11 | - nltk
 12 | 
 13 | 
 14 | # How to Run
 15 | 
 16 | ## Pre-processing
 17 | ### DyGIE++ to OneIE format
 18 | The `prepreocessing/process_dygiepp.py` script converts datasets in DyGIE++
 19 | format (https://github.com/dwadden/dygiepp/tree/master/scripts/data/ace-event) to
 20 | the format used by OneIE. Example:
 21 | 
 22 | ```
 23 | python preprocessing/process_dygiepp.py -i train.json -o train.oneie.json
 24 | ```
 25 | 
 26 | Arguments:
 27 | - -i, --input: Path to the input file.
 28 | - -o, --output: Path to the output file.
 29 | 
 30 | ### ACE2005 to OneIE format
 31 | The `prepreocessing/process_ace.py` script converts raw ACE2005 datasets to the
 32 | format used by OneIE. Example:
 33 | 
 34 | ```
 35 | python preprocessing/process_ace.py -i <INPUT_DIR>/LDC2006T06/data -o <OUTPUT_DIR>
 36 |   -s resource/splits/ACE05-E -b bert-large-cased -c <BERT_CACHE_DIR> -l english
 37 | ```
 38 | 
 39 | Arguments:
 40 | - -i, --input: Path to the input directory (`data` folder in your LDC2006T06
 41 |   package).
 42 | - -o, --output: Path to the output directory.
 43 | - -b, --bert: Bert model name.
 44 | - -c, --bert_cache_dir: Path to the BERT cache directory.
 45 | - -s, --split: Path to the split directory. We provide document id lists for all
 46 |   datasets used in our paper in `resource/splits`.
 47 | - -l, --lang: Language (options: english, chinese).
 48 | 
 49 | 
 50 | ### ERE to OneIE format
 51 | The `prepreocessing/process_ere.py` script converts raw ERE datasets (LDC2015E29,
 52 | LDC2015E68, LDC2015E78, LDC2015E107) to the format used by OneIE. 
 53 | 
 54 | ```
 55 | python preprocessing/process_ere.py -i <INPUT_DIR>/data -o <OUTPUT_DIR>
 56 |   -b bert-large-cased -c <BERT_CACHE_DIR> -l english -d normal
 57 | ```
 58 | 
 59 | Arguments:
 60 | - -i, --input: Path to the input directory (`data` folder in your ERE package).
 61 | - -o, --output: Path to the output directory.
 62 | - -b, --bert: Bert model name.
 63 | - -c, --bert_cache_dir: Path to the BERT cache directory.
 64 | - -d, --dataset: Dataset type: normal, r2v2, parallel, or spanish.
 65 | - -l, --lang: Language (options: english, spanish).
 66 | 
 67 | This script only supports:
 68 | - LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V1
 69 | - LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V2
 70 | - LDC2015E68_DEFT_Rich_ERE_English_Training_Annotation_R2_V2
 71 | - LDC2015E78_DEFT_Rich_ERE_Chinese_and_English_Parallel_Annotation_V2
 72 | - LDC2015E107_DEFT_Rich_ERE_Spanish_Annotation_V2
 73 | 
 74 | 
 75 | ## Training
 76 | 
 77 | - `cd` to the root directory of this package
 78 | - Set the environment variable PYTHONPATH to the current directory.
 79 |   For example, if you unpack this package to `~/oneie_v0.4.8`, run:
 80 |   `export PYTHONPATH=~/oneie_v0.4.8`
 81 | - Run this commandline to train a model: `python train.py -c <CONFIG_FILE_PATH>`.
 82 | - We provide an example configuration file `config/example.json`. Fill in the 
 83 |   following paths in the configuration file:
 84 |   - BERT_CACHE_DIR: Pre-trained BERT models, configs, and tokenizers will be 
 85 |     downloaded to this directory.
 86 |   - TRAIN_FILE_PATH, DEV_FILE_PATH, TEST_FILE_PATH: Path to the training/dev/test
 87 |     files.
 88 |   - OUTPUT_DIR: The model will be saved to sub folders in this directory.
 89 |   - VALID_PATTERN_DIR: Valid patterns created based on the annotation guidelines or training set. Example files are provided in `resource/valid_patterns`.
 90 | 
 91 | 
 92 | ## Evaluation
 93 | 
 94 | - `cd` to the root directory of this package
 95 | - Set the environment variable PYTHONPATH to the current directory.
 96 |   For example, if you unpack this package to `~/oneie_v0.4.8`, run:
 97 |   `export PYTHONPATH=~/oneie_v0.4.8`
 98 | - Example commandline to use OneIE: `python predict.py -m best.role.mdl -i input -o output -c output_cs --format ltf`
 99 |   + Arguments:
100 |     - -m, --model_path: Path to the trained model.
101 |     - -i, --input_dir: Path to the input directory. LTF format sample files can be found in the `input` directory.
102 |     - -o, --output_dir: Path to the output directory (json format). Output files are in the JSON format. Sample files can be found in the `output` directory.
103 |     - -c, --cs_dir: (optional) Path to the output directory (cs format). Sample files can be found in the `output_cs` directory.
104 |     - -l, --log_path: (optional) Path to the log file. A sample file `log.json` can be found in `output`.
105 |     - --gpu: (optional) Use GPU
106 |     - -d, --device: (optional) GPU device index (for multi-GPU machines).
107 |     - -b, --batch_size: (optional) Batch size. For a 16GB GPU, a batch size of 10~15 is a reasonable value.
108 |     - --max_len: (optional) Max sentence length. Sentences longer than this value will be ignored. You may need to decrease `batch_size` if you set `max_len` to a larger number.
109 |     - --beam_size: (optional) Beam set size of the decoder. Increasing this value may improve the results and make the decoding slower.
110 |     - --lang: (optional) Model language.
111 |     - --format: Input file format (txt or ltf).
112 | 
113 | 
114 | # Output Format
115 | 
116 | OneIE save results in JSON format. Each line is a JSON object for a sentence 
117 | containing the following fields:
118 | + doc_id (string): Document ID
119 | + sent_id (string): Sentence ID
120 | + tokens (list): A list of tokens
121 | + token_ids (list): A list of token IDs (doc_id:start_offset-end_offset)
122 | + graph (object): Information graph predicted by the model
123 |   - entities (list): A list of predicted entities. Each item in the list has exactly
124 |   four values: start_token_index, end_token_index, entity_type, mention_type, score.
125 |   For example, "[3, 5, "GPE", "NAM", 1.0]" means the index of the start token is 3, 
126 |   index of the end token is 4 (5 - 1), entity type is GPE, mention type is NAM,
127 |   and local score is 1.0.
128 |   - triggers (list): A list of predicted triggers. It is similar to `entities`, while
129 |   each item has three values: start_token_index, end_token_index, event_type, score.
130 |   - relations (list): A list of predicted relations. Each item in the list has
131 |   three values: arg1_entity_index, arg2_entity_index, relation_type, score.
132 |   In the following example, `[1, 0, "ORG-AFF", 0.52]` means there is a ORG-AFF relation
133 |   between entity 1 ("leader") and entity 0 ("North Korean") with a local
134 |   score of 0.52.
135 |   The order of arg1 and arg2 can be ignored for "SOC-PER" as this relation is 
136 |   symmetric.
137 |   - roles (list): A list of predicted argument roles. Each item has three values:
138 |   trigger_index, entity_index, role, score.
139 |   In the following example, `[0, 2, "Attacker", 0.8]` means entity 2 (Kim Jong Un) is
140 |   the Attacker argument of event 0 ("detonate": Conflict:Attack), and the local
141 |   score is 0.8.
142 | 
143 | Output example:
144 | ```
145 | {"doc_id": "HC0003PYD", "sent_id": "HC0003PYD-16", "token_ids": ["HC0003PYD:2295-2296", "HC0003PYD:2298-2304", "HC0003PYD:2305-2305", "HC0003PYD:2307-2311", "HC0003PYD:2313-2318", "HC0003PYD:2320-2325", "HC0003PYD:2327-2329", "HC0003PYD:2331-2334", "HC0003PYD:2336-2337", "HC0003PYD:2339-2348", "HC0003PYD:2350-2351", "HC0003PYD:2353-2360", "HC0003PYD:2362-2362", "HC0003PYD:2364-2367", "HC0003PYD:2369-2376", "HC0003PYD:2378-2383", "HC0003PYD:2385-2386", "HC0003PYD:2388-2390", "HC0003PYD:2392-2397", "HC0003PYD:2399-2401", "HC0003PYD:2403-2408", "HC0003PYD:2410-2412", "HC0003PYD:2414-2415", "HC0003PYD:2417-2425", "HC0003PYD:2427-2428", "HC0003PYD:2430-2432", "HC0003PYD:2434-2437", "HC0003PYD:2439-2441", "HC0003PYD:2443-2447", "HC0003PYD:2449-2450", "HC0003PYD:2452-2454", "HC0003PYD:2456-2464", "HC0003PYD:2466-2472", "HC0003PYD:2474-2480", "HC0003PYD:2481-2481", "HC0003PYD:2483-2485", "HC0003PYD:2487-2491", "HC0003PYD:2493-2502", "HC0003PYD:2504-2509", "HC0003PYD:2511-2514", "HC0003PYD:2516-2523", "HC0003PYD:2524-2524"], "tokens": ["On", "Tuesday", ",", "North", "Korean", "leader", "Kim", "Jong", "Un", "threatened", "to", "detonate", "a", "more", "powerful", "H-bomb", "in", "the", "future", "and", "called", "for", "an", "expansion", "of", "the", "size", "and", "power", "of", "his", "country's", "nuclear", "arsenal", ",", "the", "state", "television", "agency", "KCNA", "reported", "."], "graph": {"entities": [[3, 5, "GPE", "NAM", 1.0], [5, 6, "PER", "NOM", 0.2], [6, 9, "PER", "NAM", 0.5060472888322202], [15, 16, "WEA", "NOM", 0.5332313915378754], [30, 31, "PER", "PRO", 1.0], [32, 33, "WEA", "NOM", 1.0], [33, 34, "WEA", "NOM", 0.5212696155645499], [36, 37, "GPE", "NOM", 0.4998288792916457], [38, 39, "ORG", "NOM", 1.0], [39, 40, "ORG", "NAM", 0.5294904130032032]], "triggers": [[11, 12, "Conflict:Attack", 1.0]], "relations": [[1, 0, "ORG-AFF", 1.0]], "roles": [[0, 2, "Attacker", 0.4597024700555278], [0, 3, "Instrument", 1.0]]}}
146 | ```


--------------------------------------------------------------------------------
/evaluations/supervised-ie/config.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import json
  3 | import os
  4 | 
  5 | from typing import Dict, Any
  6 | 
  7 | from transformers import (BertConfig, RobertaConfig, XLMRobertaConfig,
  8 |                           PretrainedConfig)
  9 | 
 10 | class Config(object):
 11 |     def __init__(self, **kwargs):
 12 |         self.coref = kwargs.pop('coref', True)
 13 |         # bert
 14 |         self.bert_model_name = kwargs.pop('bert_model_name', 'bert-large-cased')
 15 |         self.bert_cache_dir = kwargs.pop('bert_cache_dir', None)
 16 |         self.extra_bert = kwargs.pop('extra_bert', -1)
 17 |         self.use_extra_bert = kwargs.pop('use_extra_bert', False)
 18 |         # global features
 19 |         self.use_global_features = kwargs.get('use_global_features', False)
 20 |         self.global_features = kwargs.get('global_features', [])
 21 |         # model
 22 |         self.multi_piece_strategy = kwargs.pop('multi_piece_strategy', 'first')
 23 |         self.bert_dropout = kwargs.pop('bert_dropout', .5)
 24 |         self.linear_dropout = kwargs.pop('linear_dropout', .4)
 25 |         self.linear_bias = kwargs.pop('linear_bias', True)
 26 |         self.linear_activation = kwargs.pop('linear_activation', 'relu')
 27 |         self.entity_hidden_num = kwargs.pop('entity_hidden_num', 150)
 28 |         self.mention_hidden_num = kwargs.pop('mention_hidden_num', 150)
 29 |         self.event_hidden_num = kwargs.pop('event_hidden_num', 600)
 30 |         self.relation_hidden_num = kwargs.pop('relation_hidden_num', 150)
 31 |         self.role_hidden_num = kwargs.pop('role_hidden_num', 600)
 32 |         self.use_entity_type = kwargs.pop('use_entity_type', False)
 33 |         self.beam_size = kwargs.pop('beam_size', 5)
 34 |         self.beta_v = kwargs.pop('beta_v', 2)
 35 |         self.beta_e = kwargs.pop('beta_e', 2)
 36 |         self.relation_mask_self = kwargs.pop('relation_mask_self', True)
 37 |         self.relation_directional = kwargs.pop('relation_directional', False)
 38 |         self.symmetric_relations = set(kwargs.pop('symmetric_relations', ['PER-SOC']))
 39 |         # files
 40 |         self.train_file = kwargs.pop('train_file', None)
 41 |         self.dev_file = kwargs.pop('dev_file', None)
 42 |         self.test_file = kwargs.pop('test_file', None)
 43 |         self.valid_pattern_path = kwargs.pop('valid_pattern_path', None)
 44 |         self.log_path = kwargs.pop('log_path', None)
 45 |         # training
 46 |         self.accumulate_step = kwargs.pop('accumulate_step', 1)
 47 |         self.batch_size = kwargs.pop('batch_size', 10)
 48 |         self.eval_batch_size = kwargs.pop('eval_batch_size', 5)
 49 |         self.max_epoch = kwargs.pop('max_epoch', 50)
 50 |         self.learning_rate = kwargs.pop('learning_rate', 1e-3)
 51 |         self.bert_learning_rate = kwargs.pop('bert_learning_rate', 1e-5)
 52 |         self.weight_decay = kwargs.pop('weight_decay', 0.001)
 53 |         self.bert_weight_decay = kwargs.pop('bert_weight_decay', 0.00001)
 54 |         self.warmup_epoch = kwargs.pop('warmup_epoch', 5)
 55 |         self.grad_clipping = kwargs.pop('grad_clipping', 5.0)
 56 |         # others
 57 |         self.use_gpu = kwargs.pop('use_gpu', True)
 58 |         self.gpu_device = kwargs.pop('gpu_device', -1)
 59 | 
 60 |     @classmethod
 61 |     def from_dict(cls, dict_obj):
 62 |         """Creates a Config object from a dictionary.
 63 |         Args:
 64 |             dict_obj (Dict[str, Any]): a dict where keys are
 65 |         """
 66 |         config = cls()
 67 |         for k, v in dict_obj.items():
 68 |             setattr(config, k, v)
 69 |         return config
 70 | 
 71 |     @classmethod
 72 |     def from_json_file(cls, path):
 73 |         with open(path, 'r', encoding='utf-8') as r:
 74 |             return cls.from_dict(json.load(r))
 75 | 
 76 |     def to_dict(self):
 77 |         output = copy.deepcopy(self.__dict__)
 78 |         return output
 79 | 
 80 |     def save_config(self, path):
 81 |         """Save a configuration object to a file.
 82 |         :param path (str): path to the output file or its parent directory.
 83 |         """
 84 |         if os.path.isdir(path):
 85 |             path = os.path.join(path, 'config.json')
 86 |         print('Save config to {}'.format(path))
 87 |         with open(path, 'w', encoding='utf-8') as w:
 88 |             w.write(json.dumps(self.to_dict(), indent=2,
 89 |                                sort_keys=True))
 90 |     @property
 91 |     def bert_config(self):
 92 |         if self.bert_model_name.startswith('bert-'):
 93 |             return BertConfig.from_pretrained(self.bert_model_name,
 94 |                                               cache_dir=self.bert_cache_dir)
 95 |         elif self.bert_model_name.startswith('roberta-'):
 96 |             return RobertaConfig.from_pretrained(self.bert_model_name,
 97 |                                                  cache_dir=self.bert_cache_dir)
 98 |         elif self.bert_model_name.startswith('xlm-roberta-'):
 99 |             return XLMRobertaConfig.from_pretrained(self.bert_model_name,
100 |                                                     cache_dir=self.bert_cache_dir)
101 |         else:
102 |             return BertConfig.from_pretrained(self.bert_model_name)


--------------------------------------------------------------------------------
/evaluations/supervised-ie/config/ace.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "bert_model_name": "/shared/nas/data/m1/liliang3/checkpoint_final/",
 3 |     "bert_cache_dir": "<BERT_CACHE_DIR>",
 4 |     "multi_piece_strategy": "average",
 5 |     "bert_dropout": 0.5,
 6 |     "use_extra_bert": true,
 7 |     "extra_bert": -3,
 8 |   
 9 |     "use_global_features": false,
10 |     "global_features": [],
11 |     "global_warmup": 0,
12 |   
13 |     "linear_dropout": 0.4,
14 |     "linear_bias": true,
15 |     "entity_hidden_num": 150,
16 |     "mention_hidden_num": 150,
17 |     "event_hidden_num": 600,
18 |     "relation_hidden_num": 150,
19 |     "role_hidden_num": 600,
20 |     "use_entity_type": true,
21 |     "beam_size": 20,
22 |     "beta_v": 2,
23 |     "beta_e": 2,
24 |     "relation_mask_self": true,
25 |     "relation_directional": false,
26 |     "symmetric_relations": ["PER-SOC"],
27 |   
28 |     "train_file": "./data/ace_bert_uncased/train.oneie.json",
29 |     "dev_file": "./data/ace_bert_uncased/dev.oneie.json",
30 |     "test_file": "./data/ace_bert_uncased/test.oneie.json",
31 |     "log_path": "./log",
32 |     "valid_pattern_path": "./resource/ere_patterns",
33 |     "ignore_title": false,
34 |     "ignore_first_header": false,
35 |   
36 |     "accumulate_step": 1,
37 |     "batch_size": 10,
38 |     "eval_batch_size": 10,
39 |     "max_epoch": 100,
40 |     "learning_rate": 1e-4,
41 |     "bert_learning_rate": 1e-5,
42 |     "weight_decay": 1e-3,
43 |     "bert_weight_decay": 1e-5,
44 |     "warmup_epoch": 5,
45 |     "grad_clipping": 5.0,
46 |   
47 |     "use_gpu": true,
48 |     "gpu_device": 1
49 |   }


--------------------------------------------------------------------------------
/evaluations/supervised-ie/config/ere.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "bert_model_name": "/shared/nas/data/m1/liliang3/checkpoint_final/",
 3 |     "bert_cache_dir": "<BERT_CACHE_DIR>",
 4 |     "multi_piece_strategy": "average",
 5 |     "bert_dropout": 0.5,
 6 |     "use_extra_bert": true,
 7 |     "extra_bert": -3,
 8 |   
 9 |     "use_global_features": false,
10 |     "global_features": [],
11 |     "global_warmup": 0,
12 |   
13 |     "linear_dropout": 0.4,
14 |     "linear_bias": true,
15 |     "entity_hidden_num": 150,
16 |     "mention_hidden_num": 150,
17 |     "event_hidden_num": 600,
18 |     "relation_hidden_num": 150,
19 |     "role_hidden_num": 600,
20 |     "use_entity_type": true,
21 |     "beam_size": 20,
22 |     "beta_v": 2,
23 |     "beta_e": 2,
24 |     "relation_mask_self": true,
25 |     "relation_directional": false,
26 |     "symmetric_relations": ["PER-SOC"],
27 |   
28 |     "train_file": "./data/ere_bert_uncased/train.oneie.json",
29 |     "dev_file": "./data/ere_bert_uncased/dev.oneie.json",
30 |     "test_file": "./data/ere_bert_uncased/test.oneie.json",
31 |     "log_path": "./log",
32 |     "valid_pattern_path": "./resource/ere_patterns",
33 |     "ignore_title": false,
34 |     "ignore_first_header": false,
35 |   
36 |     "accumulate_step": 1,
37 |     "batch_size": 10,
38 |     "eval_batch_size": 10,
39 |     "max_epoch": 100,
40 |     "learning_rate": 1e-4,
41 |     "bert_learning_rate": 1e-5,
42 |     "weight_decay": 1e-3,
43 |     "bert_weight_decay": 1e-5,
44 |     "warmup_epoch": 5,
45 |     "grad_clipping": 5.0,
46 |   
47 |     "use_gpu": true,
48 |     "gpu_device": 1
49 |   }


--------------------------------------------------------------------------------
/evaluations/supervised-ie/convert.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import glob
  3 | import json
  4 | 
  5 | cur_dir = os.path.dirname(os.path.realpath(__file__))
  6 | 
  7 | entity_type_mapping_file = os.path.join(cur_dir, 'resource', 'ace_to_aida_entity.tsv')
  8 | event_type_mapping_file = os.path.join(cur_dir, 'resource', 'ace_to_aida_event.tsv')
  9 | role_type_mapping_file = os.path.join(cur_dir, 'resource', 'ace_to_aida_role.tsv')
 10 | relation_type_mapping_file = os.path.join(cur_dir, 'resource', 'ace_to_aida_relation.tsv')
 11 | 
 12 | def load_mapping(mapping_file):
 13 |     mapping = {}
 14 |     with open(mapping_file, 'r', encoding='utf-8') as r:
 15 |         for line in r:
 16 |             from_type, to_type = line.strip().split('\t')
 17 |             mapping[from_type] = to_type
 18 |     return mapping
 19 | 
 20 | 
 21 | def get_span_mention_text(tokens, token_ids, start, end):
 22 |     if start + 1 == end:
 23 |         return tokens[start], token_ids[start]
 24 | 
 25 |     start_token = tokens[start]
 26 |     end_token = tokens[end - 1]
 27 |     start_char = int(token_ids[start].split(':')[1].split('-')[0])
 28 |     end_char = int(token_ids[end - 1].split(':')[1].split('-')[1])
 29 |     text = ' ' * (end_char - start_char + 1)
 30 |     for token, token_id in zip(tokens[start:end], token_ids[start:end]):
 31 |         token_start, token_end = token_id.split(':')[1].split('-')
 32 |         token_start, token_end = int(token_start), int(token_end)
 33 |         token_start -= start_char
 34 |         token_end -= start_char
 35 |         assert len(text[:token_start] + token + text[token_end + 1:]) == len(text)
 36 |         text = text[:token_start] + token + text[token_end + 1:]
 37 |     return text, '{}:{}-{}'.format(token_ids[start].split(':')[0],
 38 |                                    start_char, end_char)
 39 | 
 40 | 
 41 | def json_to_cs(input_dir, output_dir):
 42 |     # TODO: add the first cs line
 43 |     entity_type_mapping = load_mapping(entity_type_mapping_file)
 44 |     relation_type_mapping = load_mapping(relation_type_mapping_file)
 45 |     event_type_mapping = load_mapping(event_type_mapping_file)
 46 |     role_type_mapping = load_mapping(role_type_mapping_file)
 47 | 
 48 |     json_files = glob.glob(os.path.join(input_dir, '*.json'))
 49 |     # convert entities
 50 |     print('Converting entity mentions and generate entity cs file')
 51 |     entity_mapping = {}
 52 |     entity_id_mapping = {}
 53 |     entity_cs_file = os.path.join(output_dir, 'entity.cs')
 54 |     with open(entity_cs_file, 'w', encoding='utf-8') as w:
 55 |         for f in json_files:
 56 |             with open(f, 'r', encoding='utf-8') as r:
 57 |                 for line in r:
 58 |                     result = json.loads(line)
 59 |                     doc_id = result['doc_id']
 60 |                     sent_id = result['sent_id']
 61 |                     tokens, token_ids = result['tokens'], result['token_ids']
 62 |                     for i, (start, end, enttype, mentype, _) in enumerate(result['graph']['entities']):
 63 |                         entity_text, entity_span = get_span_mention_text(
 64 |                             tokens, token_ids, start, end)
 65 |                         entity_id = 'Entity_EDL_{:07d}'.format(len(entity_mapping) + 1)
 66 |                         entity_mapping[(sent_id, i)] = (entity_text, entity_id, entity_span, enttype, mentype)
 67 |                         entity_id_mapping[entity_id] = (sent_id, i)
 68 |                         enttype_mapped = entity_type_mapping[enttype]
 69 |                         w.write(':{}\ttype\t{}\t1.000000\n'.format(entity_id, enttype_mapped))
 70 |                         w.write(':{}\tcanonical_mention\t"{}"\t{}\t0.000\n'.format(
 71 |                             entity_id, entity_text, entity_span))
 72 |                         w.write(':{}\tmention\t"{}"\t{}\t0.000\n'.format(
 73 |                             entity_id, entity_text, entity_span))
 74 |                         # skip the link line
 75 |     
 76 |     # converting relations and events
 77 |     print('Converting relations and events')
 78 |     event_count = 0
 79 |     relation_cs_file = os.path.join(output_dir, 'relation.cs')
 80 |     event_cs_file = os.path.join(output_dir, 'event.cs')
 81 |     with open(relation_cs_file, 'w', encoding='utf-8') as rel_w, \
 82 |         open(event_cs_file, 'w', encoding='utf-8') as evt_w:
 83 |         for f in json_files:
 84 |             with open(f, 'r', encoding='utf-8') as r:
 85 |                 for line in r:
 86 |                     result = json.loads(line)
 87 |                     sent_id = result['sent_id']
 88 |                     tokens, token_ids = result['tokens'], result['token_ids']
 89 |                     relations = result['graph']['relations']
 90 |                     triggers = result['graph']['triggers']
 91 |                     roles = result['graph']['roles']
 92 |                     # sentence span
 93 |                     sent_span = '{}:{}-{}'.format(
 94 |                         token_ids[0].split(':')[0],
 95 |                         token_ids[0].split(':')[1].split('-')[0],
 96 |                         token_ids[-1].split(':')[1].split('-')[1])
 97 |                     # convert relations
 98 |                     for arg1, arg2, reltype, _ in relations:
 99 |                         if reltype == 'ART':
100 |                             continue
101 |                         entity_id_1 = entity_mapping[(sent_id, arg1)][1]
102 |                         entity_id_2 = entity_mapping[(sent_id, arg2)][1]
103 |                         reltype_mapped = relation_type_mapping[reltype]
104 |                         rel_w.write(':{}\t{}\t:{}\t{}\t1.000\n'.format(
105 |                             entity_id_1, reltype_mapped, entity_id_2, sent_span
106 |                         ))
107 |                     # convert events
108 |                     for cur_trigger_idx, (start, end, eventtype, _) in enumerate(triggers):
109 |                         event_count += 1
110 |                         event_id = 'Event_{:06d}'.format(event_count)
111 |                         trigger_text, trigger_span = get_span_mention_text(
112 |                             tokens, token_ids, start, end)
113 |                         eventtype_mapped = event_type_mapping[eventtype]
114 |                         evt_w.write(':{}\ttype\t{}\n'.format(event_id, eventtype_mapped))
115 |                         evt_w.write(':{}\tmention.actual\t"{}"\t{}\t1.000\n'.format(
116 |                             event_id, trigger_text, trigger_span))
117 |                         evt_w.write(':{}\tcanonical_mention.actual\t"{}"\t{}\t1.000\n'.format(
118 |                             event_id, trigger_text, trigger_span))
119 |                         for trigger_idx, entity_idx, role, _ in roles:
120 |                             if cur_trigger_idx == trigger_idx:
121 |                                 role_mapped = role_type_mapping['{}:{}'.format(eventtype, role).lower()]
122 |                                 _, entity_id, entity_span, _, _ = entity_mapping[(sent_id, entity_idx)]
123 |                                 evt_w.write(':{}\t{}.actual\t{}\t{}\t1.000\n'.format(
124 |                                     event_id, role_mapped, entity_id, entity_span))


--------------------------------------------------------------------------------
/evaluations/supervised-ie/global_feature.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | 
  3 | import numpy as np
  4 | 
  5 | from collections import Counter
  6 | 
  7 | 
  8 | def generate_global_feature_maps(vocabs, valid_patterns):
  9 |     """
 10 |     Note that feature maps here refer to "feature-index mappings", not feature
 11 |     maps in CNNs.
 12 |     :param vocabs: vocabularies.
 13 |     :param valid_patterns: valid patterns (only event-role patterns are used).
 14 |     :return (dict): a dictionary of feature-index maps.
 15 |     """
 16 |     event_type_vocab = vocabs['event_type']
 17 |     entity_type_vocab = vocabs['entity_type']
 18 |     role_type_vocab = vocabs['role_type']
 19 |     relation_type_vocab = vocabs['relation_type']
 20 |     event_role = valid_patterns['event_role']
 21 | 
 22 |     # 1. role role: the number of entities that act as <role_i> and <role_j>
 23 |     # arguments at the same time
 24 |     role_role_map = set()
 25 |     for role1 in role_type_vocab.values():
 26 |         for role2 in role_type_vocab.values():
 27 |             if role1 and role2:
 28 |                 if role1 < role2:
 29 |                     key = role1 * 100 + role2
 30 |                 else:
 31 |                     key = role2 * 100 + role1
 32 |                 role_role_map.add(key)
 33 |     role_role_map = sorted(list(role_role_map))
 34 |     role_role_map = {k: i for i, k in enumerate(role_role_map)}
 35 | 
 36 |     # 2. event role num: the number of <event_type_i> events with <number>
 37 |     # <role_j> arguments
 38 |     event_role_num_map = list()
 39 |     for event in event_type_vocab.values():
 40 |         for role in role_type_vocab.values():
 41 |             if event and role:
 42 |                 key = event * 1000 + role * 10
 43 |                 event_role_num_map.append(key + 1)
 44 |                 event_role_num_map.append(key + 2)
 45 |     event_role_num_map.sort()
 46 |     event_role_num_map = {k: i for i, k in enumerate(event_role_num_map)}
 47 | 
 48 |     # 3. role entity: the number of occurrences of <entity_type_i> and <role_j>
 49 |     # combination
 50 |     role_entity_map = list()
 51 |     for role in role_type_vocab.values():
 52 |         for entity in entity_type_vocab.values():
 53 |             if role and entity:
 54 |                 role_entity_map.append(role * 100 + entity)
 55 |     role_entity_map.sort()
 56 |     role_entity_map = {k: i for i, k in enumerate(role_entity_map)}
 57 | 
 58 |     # 4. multiple role
 59 |     multi_role_map = [role for role in role_type_vocab.values() if role]
 60 |     multi_role_map.sort()
 61 |     multi_role_map = {k: i for i, k in enumerate(multi_role_map)}
 62 | 
 63 |     # 5. event role event role: the number of entities that act as a <role_i>
 64 |     # argument of an <event_type_j> event and a <role_k> argument of an
 65 |     # <event_type_l> event at the same time
 66 |     event_role_event_role_map = set()
 67 |     for event_role1 in event_role:
 68 |         for event_role2 in event_role:
 69 |             event1 = event_role1 // 100
 70 |             event2 = event_role2 // 100
 71 |             role1 = event_role1 % 100
 72 |             role2 = event_role2 % 100
 73 |             if event1 < event2:
 74 |                 key = event1 * 1000000 + role1 * 10000 + event2 * 100 + role2
 75 |             else:
 76 |                 key = event2 * 1000000 + role2 * 10000 + event1 * 100 + role1
 77 |             event_role_event_role_map.add(key)
 78 |     event_role_event_role_map = sorted(list(event_role_event_role_map))
 79 |     event_role_event_role_map = {k: i for i, k in enumerate(event_role_event_role_map)}
 80 | 
 81 |     # 6. relation entity entity: the number of occurrences of <entity_type_i>,
 82 |     # <entity_type_j>, and <relation_type_k> combination
 83 |     relation_entity_entity_map = set()
 84 |     for relation in relation_type_vocab.values():
 85 |         for entity1 in entity_type_vocab.values():
 86 |             for entity2 in entity_type_vocab.values():
 87 |                 if relation and entity1 and entity2:
 88 |                     key = relation * 10000
 89 |                     if entity1 < entity2:
 90 |                         key += entity1 * 100 + entity2
 91 |                     else:
 92 |                         key += entity2 * 100 + entity1
 93 |                     relation_entity_entity_map.add(key)
 94 |     relation_entity_entity_map = sorted(list(relation_entity_entity_map))
 95 |     relation_entity_entity_map = {k: i for i, k in enumerate(relation_entity_entity_map)}
 96 | 
 97 |     # 7. relation entity: the number of occurrences of <entity_type_i> and
 98 |     # <relation_type_j> combination
 99 |     relation_entity_map = [relation * 100 + entity
100 |                            for relation in relation_type_vocab.values()
101 |                            for entity in entity_type_vocab.values()
102 |                            if relation and entity]
103 |     relation_entity_map.sort()
104 |     relation_entity_map = {k: i for i, k in enumerate(relation_entity_map)}
105 | 
106 |     # 8. relation role role: the number of occurrences of a <relation_type_i>
107 |     # relation between a <role_j> argument and a <role_k> argument of the same
108 |     # event
109 |     relation_role_role_map = set()
110 |     for relation in relation_type_vocab.values():
111 |         for role1 in role_type_vocab.values():
112 |             for role2 in role_type_vocab.values():
113 |                 if relation and role1 and role2:
114 |                     key = relation * 10000
115 |                     if role1 < role2:
116 |                         key += role1 * 100 + role2
117 |                     else:
118 |                         key += role2 * 100 + role1
119 |                     relation_role_role_map.add(key)
120 |     relation_role_role_map = sorted(list(relation_role_role_map))
121 |     relation_role_role_map = {k: i for i, k in enumerate(relation_role_role_map)}
122 | 
123 |     # 9. multiple relation: the number of entities that have a <relation_type_i>
124 |     # relation with multiple entities
125 |     multi_relation_map = [relation for relation in relation_type_vocab.values()
126 |                           if relation]
127 |     multi_relation_map.sort()
128 |     multi_relation_map = {k: i for i, k in enumerate((multi_relation_map))}
129 | 
130 |     # 10. relation relation: the number of entities involving in <relation_type_i>
131 |     # and <relation_type_j> relations simultaneously
132 |     relation_relation_map = set()
133 |     for relation1 in relation_type_vocab.values():
134 |         for relation2 in relation_type_vocab.values():
135 |             if relation1 and relation2:
136 |                 key = relation1 * 100 + relation2 if relation1 < relation2 \
137 |                     else relation2 * 100 + relation1
138 |                 relation_relation_map.add(key)
139 |     relation_relation_map = sorted(list(relation_relation_map))
140 |     relation_relation_map = {k: i for i, k in enumerate(relation_relation_map)}
141 | 
142 |     # 11. multiple event: whether a graph contains more than one <event_type_i>
143 |     # event
144 |     multi_event_map = [event for event in event_type_vocab.values() if event]
145 |     multi_event_map.sort()
146 |     multi_event_map = {k: i for i, k in enumerate(multi_event_map)}
147 | 
148 |     return {
149 |         'role_role': role_role_map,
150 |         'event_role_num': event_role_num_map,
151 |         'role_entity': role_entity_map,
152 |         'multi_role': multi_role_map,
153 |         'event_role_event_role': event_role_event_role_map,
154 |         'relation_entity_entity': relation_entity_entity_map,
155 |         'relation_entity': relation_entity_map,
156 |         'relation_role_role': relation_role_role_map,
157 |         'multi_relation': multi_relation_map,
158 |         'relation_relation': relation_relation_map,
159 |         'multi_event': multi_event_map
160 |     }
161 | 
162 | 
163 | def generate_global_feature_vector(graph,
164 |                                    global_feature_maps,
165 |                                    features=None):
166 |     role_role_map = global_feature_maps['role_role']
167 |     role_role_vec = np.zeros(len((role_role_map)))
168 |     role_entity_map = global_feature_maps['role_entity']
169 |     role_entity_vec = np.zeros(len(role_entity_map))
170 |     event_role_num_map = global_feature_maps['event_role_num']
171 |     event_role_num_vec = np.zeros(len(event_role_num_map))
172 |     multi_role_map = global_feature_maps['multi_role']
173 |     multi_role_vec = np.zeros(len(multi_role_map))
174 |     event_role_event_role_map = global_feature_maps['event_role_event_role']
175 |     event_role_event_role_vec = np.zeros(len(event_role_event_role_map))
176 |     relation_entity_entity_map = global_feature_maps['relation_entity_entity']
177 |     relation_entity_entity_vec = np.zeros(len(relation_entity_entity_map))
178 |     relation_entity_map = global_feature_maps['relation_entity']
179 |     relation_entity_vec = np.zeros(len(relation_entity_map))
180 |     relation_role_role_map = global_feature_maps['relation_role_role']
181 |     relation_role_role_vec = np.zeros(len(relation_role_role_map))
182 |     multi_relation_map = global_feature_maps['multi_relation']
183 |     multi_relation_vec = np.zeros(len(multi_relation_map))
184 |     relation_relation_map = global_feature_maps['relation_relation']
185 |     relation_relation_vec = np.zeros(len(relation_relation_map))
186 |     multi_event_map = global_feature_maps['multi_event']
187 |     multi_event_vec = np.zeros(len(multi_event_map))
188 | 
189 |     # event argument role related features
190 |     entity_roles = [[] for _ in range(graph.entity_num)]
191 |     entity_event_role = [[] for _ in range(graph.entity_num)]
192 |     event_role_count = [Counter() for _ in range(graph.trigger_num)]
193 |     for trigger_idx, entity_idx, role in graph.roles:
194 |         entity_roles[entity_idx].append(role)
195 |         entity_event_role[entity_idx].append(
196 |             (graph.triggers[trigger_idx][-1], role))
197 |         event_role_count[trigger_idx][role] += 1
198 |         # 3. role entity
199 |         role_entity = role * 100 + graph.entities[entity_idx][-1]
200 |         if role_entity in role_entity_map:
201 |             role_entity_vec[role_entity_map[role_entity]] += 1
202 |     # 1. role role
203 |     for roles in entity_roles:
204 |         for role1, role2 in itertools.combinations(roles, 2):
205 |             key = role1 * 100 + role2 if role1 < role2 \
206 |                 else role2 * 100 + role1
207 |             if key in role_role_map:
208 |                 role_role_vec[role_role_map[key]] += 1
209 |     # 2. event role num & 4. multiple role
210 |     for event, role_count in enumerate(event_role_count):
211 |         for role, count in role_count.items():
212 |             # to reduce the number of features, we treat numbers > 2 as 2
213 |             key = graph.triggers[event][-1] * 1000 + role * 10 + min(count, 2)
214 |             if key in event_role_num_map:
215 |                 event_role_num_vec[event_role_num_map[key]] += 1
216 |             if count > 1 and role in multi_role_map:
217 |                 multi_role_vec[multi_role_map[role]] += 1
218 |     # 5. event role event role
219 |     for event_role_pairs in entity_event_role:
220 |         for (event1, role1), (event2, role2) in itertools.combinations(
221 |                 event_role_pairs, 2):
222 |             if event1 < event2:
223 |                 key = event1 * 1000000 + role1 * 10000 + event2 * 100 + role2
224 |             else:
225 |                 key = event2 * 1000000 + role2 * 10000 + event1 * 100 + role1
226 |             if key in event_role_event_role_map:
227 |                 event_role_event_role_vec[event_role_event_role_map[key]] += 1
228 | 
229 |     # relation related features
230 |     entity_role_unique = [set(x) for x in entity_roles]
231 |     entity_relation_count = [Counter() for _ in range(graph.entity_num)]
232 |     for entity_idx1, entity_idx2, relation in graph.relations:
233 |         entity_relation_count[entity_idx1][relation] += 1
234 |         entity_relation_count[entity_idx2][relation] += 1
235 |         entity1 = graph.entities[entity_idx1][-1]
236 |         entity2 = graph.entities[entity_idx2][-1]
237 |         # 6. relation entity entity
238 |         if entity1 < entity2:
239 |             key = relation * 10000 + entity1 * 100 + entity2
240 |         else:
241 |             key = relation * 10000 + entity2 * 100 + entity1
242 |         if key in relation_entity_entity_map:
243 |             relation_entity_entity_vec[relation_entity_entity_map[key]] += 1
244 |         # 7. relation entity
245 |         key1 = relation * 100 + entity1
246 |         key2 = relation * 100 + entity2
247 |         if key1 in relation_entity_map:
248 |             relation_entity_vec[relation_entity_map[key1]] += 1
249 |         if key2 in relation_entity_map:
250 |             relation_entity_vec[relation_entity_map[key2]] += 1
251 |         # 8. relation role role
252 |         roles1 = entity_role_unique[entity_idx1]
253 |         roles2 = entity_role_unique[entity_idx2]
254 |         for role1 in roles1:
255 |             for role2 in roles2:
256 |                 if role1 < role2:
257 |                     key = relation * 10000 + role1 * 100 + role2
258 |                 else:
259 |                     key = relation * 10000 + role2 * 100 + role1
260 |                 if key in relation_role_role_map:
261 |                     relation_role_role_vec[relation_role_role_map[key]] += 1
262 |     # 9. multiple relation & 10. relation relation
263 |     for relation_count in entity_relation_count:
264 |         relations = []
265 |         for relation, count in relation_count.items():
266 |             relations.append(relation)
267 |             if count > 1:
268 |                 relations.append(relation)
269 |                 if relation in multi_relation_map:
270 |                     multi_relation_vec[multi_relation_map[relation]] += 1
271 |         for relation1, relation2 in itertools.combinations(relations, 2):
272 |             if relation1 < relation2:
273 |                 key = relation1 * 100 + relation2
274 |             else:
275 |                 key = relation2 * 100 + relation1
276 |             if key in relation_relation_map:
277 |                 relation_relation_vec[relation_relation_map[key]] += 1
278 | 
279 |     # 11. multiple event
280 |     trigger_count = Counter()
281 |     for _, _, trigger in graph.triggers:
282 |         trigger_count[trigger] += 1
283 |     for trigger, count in trigger_count.items():
284 |         if count > 1 and trigger in multi_event_map:
285 |             multi_event_vec[multi_event_map[trigger]] = 1
286 | 
287 |     feature_vector = np.concatenate(
288 |         [role_role_vec, event_role_num_vec, role_entity_vec,
289 |          multi_role_vec, event_role_event_role_vec, relation_entity_entity_vec,
290 |          relation_entity_vec, relation_role_role_vec,
291 |          multi_relation_vec, relation_relation_vec, multi_event_vec]
292 |     )
293 | 
294 |     if features:
295 |         vectors = {
296 |             'role_role': role_role_vec,
297 |             'event_role_num': event_role_num_vec,
298 |             'role_entity': role_entity_vec,
299 |             'multi_role': multi_role_vec,
300 |             'event_role_event_role': event_role_event_role_vec,
301 |             'relation_entity_entity': relation_entity_entity_vec,
302 |             'relation_entity': relation_entity_vec,
303 |             'relation_role_role': relation_role_role_vec,
304 |             'multi_relation': multi_relation_vec,
305 |             'relation_relation': relation_relation_vec,
306 |             'multi_event': multi_event_vec
307 |         }
308 |         feature_vector = np.concatenate([vectors[k] for k in features])
309 |     else:
310 |         feature_vector = np.concatenate(
311 |             [role_role_vec, event_role_num_vec, role_entity_vec,
312 |              multi_role_vec, event_role_event_role_vec, relation_entity_entity_vec,
313 |              relation_entity_vec, relation_role_role_vec,
314 |              multi_relation_vec, relation_relation_vec, multi_event_vec]
315 |         )
316 |     return feature_vector
317 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/graph.py:
--------------------------------------------------------------------------------
  1 | class Graph(object):
  2 |     def __init__(self, entities, triggers, relations, roles, vocabs, mentions=None):
  3 |         """
  4 |         :param entities (list): A list of entities represented as a tuple of
  5 |         (start_offset, end_offset, label_idx). end_offset = the index of the end
  6 |         token + 1.
  7 |         :param triggers (list): A list of triggers represented as a tuple of
  8 |         (start_offset, end_offset, label_idx). end_offset = the index of the end
  9 |         token + 1.
 10 |         :param relations (list): A list of relations represented as a tuple of
 11 |         (entity_idx_1, entity_idx_2, label_idx). As we do not consider the
 12 |         direction of relations (list), it is better to have entity_idx_1 <
 13 |         entity_idx2.
 14 |         :param roles: A list of roles represented as a tuple of (trigger_idx_1,
 15 |         entity_idx_2, label_idx).
 16 |         :param vocabs (dict): Label type vocabularies.
 17 |         """
 18 |         self.entities = entities
 19 |         self.triggers = triggers
 20 |         self.relations = relations
 21 |         self.roles = roles
 22 |         self.vocabs = vocabs
 23 |         self.mentions = [] if mentions is None else mentions
 24 | 
 25 |         self.entity_num = len(entities)
 26 |         self.trigger_num = len(triggers)
 27 |         self.relation_num = len(relations)
 28 |         self.role_num = len(roles)
 29 |         self.graph_local_score = 0.0
 30 | 
 31 |         # subscores
 32 |         self.entity_scores = []
 33 |         self.trigger_scores = []
 34 |         self.relation_scores = []
 35 |         self.role_scores = []
 36 | 
 37 |     def __eq__(self, other):
 38 |         if isinstance(other, Graph):
 39 |             equal = (self.entities == other.entities and
 40 |                      self.triggers == other.triggers and
 41 |                      self.relations == other.relations and
 42 |                      self.roles == other.roles and
 43 |                      self.mentions == other.mentions)
 44 |             return equal
 45 |         return False
 46 | 
 47 | 
 48 |     def to_dict(self):
 49 |         """Convert a graph to a dict object
 50 |         :return (dict): A dictionary representing the graph, where label indices
 51 |         have been replaced with label strings.
 52 |         """
 53 |         entity_itos = {i: s for s, i in self.vocabs['entity_type'].items()}
 54 |         trigger_itos = {i: s for s, i in self.vocabs['event_type'].items()}
 55 |         relation_itos = {i: s for s, i in self.vocabs['relation_type'].items()}
 56 |         role_itos = {i: s for s, i in self.vocabs['role_type'].items()}
 57 |         mention_itos = {i: s for s, i in self.vocabs['mention_type'].items()}
 58 |         
 59 |         # entities = [[i, j, entity_itos[k], mention_itos[l]] for (i, j, k), (_, _, l) in zip(self.entities, self.mentions)]
 60 |         # triggers = [[i, j, trigger_itos[k]] for i, j, k in self.triggers]
 61 |         # relations = [[i, j, relation_itos[k]] for i, j, k in self.relations]
 62 |         # roles = [[i, j, role_itos[k]] for i, j, k in self.roles]
 63 | 
 64 |         entities = [[i, j, entity_itos[k], mention_itos[l], s] for (i, j, k), (_, _, l), s in zip(self.entities, self.mentions, self.entity_scores)]
 65 |         triggers = [[i, j, trigger_itos[k], l] for (i, j, k), l in zip(self.triggers, self.trigger_scores)]
 66 |         relations = [[i, j, relation_itos[k], l] for (i, j, k), l in zip(self.relations, self.relation_scores)]
 67 |         roles = [[i, j, role_itos[k], l] for (i, j, k), l in zip(self.roles, self.role_scores)]
 68 | 
 69 |         return {
 70 |             'entities': entities,
 71 |             'triggers': triggers,
 72 |             'relations': relations,
 73 |             'roles': roles,
 74 |         }
 75 | 
 76 |     def __str__(self):
 77 |         return str(self.to_dict())
 78 | 
 79 |     def copy(self):
 80 |         """Make a copy of the graph
 81 |         :return (Graph): a copy of the current graph.
 82 |         """
 83 |         graph = Graph(
 84 |             entities=self.entities.copy(),
 85 |             triggers=self.triggers.copy(),
 86 |             relations=self.relations.copy(),
 87 |             roles=self.roles.copy(),
 88 |             mentions=self.mentions.copy(),
 89 |             vocabs=self.vocabs
 90 |         )
 91 |         graph.graph_local_score = self.graph_local_score
 92 |         graph.entity_scores = self.entity_scores
 93 |         graph.trigger_scores = self.trigger_scores
 94 |         graph.relation_scores = self.relation_scores
 95 |         graph.role_scores = self.role_scores
 96 |         return graph
 97 | 
 98 |     def clean(self, relation_directional=False, symmetric_relations=None):
 99 |         # self.entities.sort(key=lambda x: (x[0], x[1]))
100 |         # self.triggers.sort(key=lambda x: (x[0], x[1]))
101 |         # self.relations.sort(key=lambda x: (x[0], x[1]))
102 |         # self.roles.sort(key=lambda x: (x[0], x[1]))
103 | 
104 |         entities = [(i, j, k, l) for (i, j, k), l in zip(self.entities, self.entity_scores)]
105 |         triggers = [(i, j, k, l) for (i, j, k), l in zip(self.triggers, self.trigger_scores)]
106 |         relations = [(i, j, k, l) for (i, j, k), l in zip(self.relations, self.relation_scores)]
107 |         roles = [(i, j, k, l) for (i, j, k), l in zip(self.roles, self.role_scores)]
108 | 
109 |         # coref_idx = self.vocabs['relation_type'].get('COREF', None)
110 |         # if coref_idx is not None:
111 |         #     relations, corefs = [], []
112 |         #     for i, j, k in self.relations:
113 |         #         if k == coref_idx:
114 |         #             corefs.append((i, j, k))
115 |         #         else:
116 |         #             relations.append((i, j, k))
117 |         #     self.relations = relations
118 |         #     self.corefs = corefs
119 | 
120 |         # clean relations
121 |         if relation_directional and symmetric_relations:
122 |             relation_itos = {i: s for s, i in self.vocabs['relation_type'].items()}
123 |             # relations = []
124 |             relations_tmp = []
125 |             # for i, j, k in self.relations:
126 |             for i, j, k, l in relations:
127 |                 if relation_itos[k] not in symmetric_relations:
128 |                     # relations.append((i, j, k))
129 |                     relations_tmp.append((i, j, k, l))
130 |                 else:
131 |                     if j < i:
132 |                         i, j = j, i
133 |                     relations_tmp.append((i, j, k, l))
134 |             # self.relations = relations
135 |             relations = relations_tmp
136 | 
137 |         self.entities = [(i, j, k) for i, j, k, _ in entities]
138 |         self.entity_scores = [l for _, _, _, l in entities]
139 |         self.triggers = [(i, j, k) for i, j, k, _ in triggers]
140 |         self.trigger_scores = [l for _, _, _, l in triggers]
141 |         self.relations = [(i, j, k) for i, j, k, _ in relations]
142 |         self.relation_scores = [l for _, _, _, l in relations]
143 |         self.roles = [(i, j, k) for i, j, k, _ in roles]
144 |         self.role_scores = [l for _, _, _, l in roles]
145 | 
146 |     def add_entity(self, start, end, label, score=0, score_norm=0):
147 |         """Add an entity mention to the graph.
148 |         :param start (int): Start token offset of the entity mention.
149 |         :param end (int): End token offset of the entity mention + 1.
150 |         :param label (int): Index of the entity type label.
151 |         :param score (float): Label score.
152 |         """
153 |         self.entities.append((start, end, label))
154 |         self.entity_num = len(self.entities)
155 |         self.graph_local_score += score
156 |         self.entity_scores.append(score_norm)
157 | 
158 |     def add_trigger(self, start, end, label, score=0, score_norm=0):
159 |         """Add an event trigger to the graph.
160 |         :param start (int): Start token offset of the trigger.
161 |         :param end (int): End token offset of the trigger + 1.
162 |         :param label (int): Index of the event type label.
163 |         :param score (float): Label score.
164 |         """
165 |         self.triggers.append((start, end, label))
166 |         self.trigger_num = len(self.triggers)
167 |         self.graph_local_score += score
168 |         self.trigger_scores.append(score_norm)
169 | 
170 |     def add_relation(self, idx1, idx2, label, score=0, score_norm=0):
171 |         """Add a relation edge to the graph.
172 |         :param idx1 (int): Index of the entity node 1.
173 |         :param idx2 (int): Index of the entity node 2.
174 |         :param label (int): Index of the relation type label.
175 |         :param score (float): Label score.
176 |         """
177 |         # assert idx1 < self.entity_num and idx2 < self.entity_num
178 |         if label:
179 |             self.relations.append((idx1, idx2, label))
180 |             self.relation_scores.append(score_norm)
181 |         self.relation_num = len(self.relations)
182 |         self.graph_local_score += score
183 | 
184 |     def add_role(self, idx1, idx2, label, score=0, score_norm=0):
185 |         """Add an event-argument link edge to the graph.
186 |         :param idx1 (int): Index of the trigger node.
187 |         :param idx2 (int): Index of the entity node.
188 |         :param label (int): Index of the role label.
189 |         :param score (float): Label score.
190 |         """
191 |         # assert idx1 < self.trigger_num and idx2 < self.entity_num
192 |         # self.roles.append((idx1, idx2, label))
193 |         if label:
194 |             self.roles.append((idx1, idx2, label))
195 |             self.role_scores.append(score_norm)
196 |         self.role_num = len(self.roles)
197 |         self.graph_local_score += score
198 | 
199 |     @staticmethod
200 |     def empty_graph(vocabs):
201 |         """Create a graph without any node and edge.
202 |         :param vocabs (dict): Vocabulary object.
203 |         """
204 |         return Graph([], [], [], [], vocabs)
205 | 
206 |     def to_label_idxs(self, max_entity_num, max_trigger_num,
207 |                       relation_directional=False,
208 |                       symmetric_relation_idxs=None):
209 |         """Generate label index tensors (which are actually list objects not
210 |         Pytorch tensors) to gather calculated scores.
211 |         :param max_entity_num: Max entity number of the batch.
212 |         :param max_trigger_num: Max trigger number of the batch.
213 |         :return: Index and mask tensors.
214 |         """
215 |         entity_idxs = [i[-1] for i in self.entities] + [0] * (max_entity_num - self.entity_num)
216 |         entity_mask = [1] * self.entity_num + [0] * (max_entity_num - self.entity_num)
217 | 
218 |         trigger_idxs = [i[-1] for i in self.triggers] + [0] * (max_trigger_num - self.trigger_num)
219 |         trigger_mask = [1] * self.trigger_num + [0] * (max_trigger_num - self.trigger_num)
220 | 
221 |         relation_idxs = [0] * max_entity_num * max_entity_num
222 |         relation_mask = [1 if i < self.entity_num and j < self.entity_num and i != j else 0
223 |                          for i in range(max_entity_num) for j in range(max_entity_num)]
224 |         for i, j, relation in self.relations:
225 |             # TODO: check relation label idxs and mask
226 |             relation_idxs[i * max_entity_num + j] = relation
227 |             if not relation_directional:
228 |                 relation_idxs[j * max_entity_num + i] = relation
229 |                 # relation_mask[i * max_entity_num + j] = .5
230 |                 # relation_mask[j * max_entity_num + i] = .5
231 |             if relation_directional and symmetric_relation_idxs and relation in symmetric_relation_idxs:
232 |                 relation_idxs[j * max_entity_num + i] = relation
233 |                 # relation_mask[i * max_entity_num + j] = .5
234 |                 # relation_mask[j * max_entity_num + i] = .5
235 |         
236 | 
237 |         role_idxs = [0] * max_trigger_num * max_entity_num
238 |         for i, j, role in self.roles:
239 |             role_idxs[i * max_entity_num + j] = role
240 |         role_mask = [1 if i < self.trigger_num and j < self.entity_num else 0
241 |                      for i in range(max_trigger_num) for j in range(max_entity_num)]
242 | 
243 |         return (
244 |             entity_idxs, entity_mask, trigger_idxs, trigger_mask,
245 |             relation_idxs, relation_mask, role_idxs, role_mask,
246 |         )
247 | 
248 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/predict.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import glob
  4 | import tqdm
  5 | import traceback
  6 | from argparse import ArgumentParser
  7 | 
  8 | import torch
  9 | from torch.utils.data import DataLoader
 10 | from transformers import BertTokenizer, BertConfig
 11 | 
 12 | from model import OneIE
 13 | from config import Config
 14 | from util import save_result
 15 | from data import IEDatasetEval
 16 | from convert import json_to_cs
 17 | 
 18 | cur_dir = os.path.dirname(os.path.realpath(__file__))
 19 | format_ext_mapping = {'txt': 'txt', 'ltf': 'ltf.xml', 'json': 'json',
 20 |                       'json_single': 'json'}
 21 | 
 22 | def load_model(model_path, device=0, gpu=False, beam_size=5):
 23 |     print('Loading the model from {}'.format(model_path))
 24 |     map_location = 'cuda:{}'.format(device) if gpu else 'cpu'
 25 |     state = torch.load(model_path, map_location=map_location)
 26 | 
 27 |     config = state['config']
 28 |     if type(config) is dict:
 29 |         config = Config.from_dict(config)
 30 |     config.bert_cache_dir = os.path.join(cur_dir, 'bert')
 31 |     vocabs = state['vocabs']
 32 |     valid_patterns = state['valid']
 33 | 
 34 |     # recover the model
 35 |     model = OneIE(config, vocabs, valid_patterns)
 36 |     model.load_state_dict(state['model'])
 37 |     model.beam_size = beam_size
 38 |     if gpu:
 39 |         model.cuda(device)
 40 | 
 41 |     tokenizer = BertTokenizer.from_pretrained(config.bert_model_name,
 42 |                                               cache_dir=config.bert_cache_dir,
 43 |                                               do_lower_case=False)
 44 | 
 45 |     return model, tokenizer, config
 46 | 
 47 | 
 48 | def predict_document(path, model, tokenizer, config, batch_size=20, 
 49 |                      max_length=128, gpu=False, input_format='txt',
 50 |                      language='english'):
 51 |     """
 52 |     :param path (str): path to the input file.
 53 |     :param model (OneIE): pre-trained model object.
 54 |     :param tokenizer (BertTokenizer): BERT tokenizer.
 55 |     :param config (Config): configuration object.
 56 |     :param batch_size (int): Batch size (default=20).
 57 |     :param max_length (int): Max word piece number (default=128).
 58 |     :param gpu (bool): Use GPU or not (default=False).
 59 |     :param input_format (str): Input file format (txt or ltf, default='txt).
 60 |     :param langauge (str): Input document language (default='english').
 61 |     """
 62 |     test_set = IEDatasetEval(path, max_length=max_length, gpu=gpu,
 63 |                              input_format=input_format, language=language)
 64 |     test_set.numberize(tokenizer)
 65 |     # document info
 66 |     info = {
 67 |         'doc_id': test_set.doc_id,
 68 |         'ori_sent_num': test_set.ori_sent_num,
 69 |         'sent_num': len(test_set)
 70 |     }
 71 |     # prediction result
 72 |     result = []
 73 |     for batch in DataLoader(test_set, batch_size=batch_size, shuffle=False, 
 74 |                             collate_fn=test_set.collate_fn):
 75 |         graphs = model.predict(batch)
 76 |         for graph, tokens, sent_id, token_ids in zip(graphs, batch.tokens,
 77 |                                                      batch.sent_ids,
 78 |                                                      batch.token_ids):
 79 |             graph.clean(relation_directional=config.relation_directional,
 80 |                         symmetric_relations=config.symmetric_relations)
 81 |             result.append((sent_id, token_ids, tokens, graph))
 82 |     return result, info
 83 | 
 84 | 
 85 | def predict(model_path, input_path, output_path, log_path=None, cs_path=None,
 86 |          batch_size=50, max_length=128, device=0, gpu=False,
 87 |          file_extension='txt', beam_size=5, input_format='txt',
 88 |          language='english'):
 89 |     """Perform information extraction.
 90 |     :param model_path (str): Path to the pre-trained model file.
 91 |     :param input_path (str): Path to the input directory.
 92 |     :param output_path (str): Path to the output directory.
 93 |     :param log_path (str): Path to the log file.
 94 |     :param cs_path (str): (optional) Path to the cold-start format output directory.
 95 |     :param batch_size (int): Batch size (default=50).
 96 |     :param max_length (int): Max word piece number for each sentence (default=128).
 97 |     :param device (int): GPU device index (default=0).
 98 |     :param gpu (bool): Use GPU (default=False).
 99 |     :param file_extension (str): Input file extension. Only files ending with the
100 |     given extension will be processed (default='txt').
101 |     :param beam_size (int): Beam size of the decoder (default=5).
102 |     :param input_format (str): Input file format (txt or ltf, default='txt').
103 |     :param language (str): Document language (default='english').
104 |     """
105 |     # set gpu device
106 |     if gpu:
107 |         torch.cuda.set_device(device)
108 |     # load the model from file
109 |     model, tokenizer, config = load_model(model_path, device=device, gpu=gpu,
110 |                                           beam_size=beam_size)
111 |     # get the list of documents
112 |     file_list = glob.glob(os.path.join(input_path, '*.{}'.format(file_extension)))
113 |     # log writer
114 |     if log_path:
115 |         log_writer = open(log_path, 'w', encoding='utf-8')
116 |     # run the model; collect result and info
117 |     doc_info_list = []
118 |     progress = tqdm.tqdm(total=len(file_list), ncols=75)
119 |     for f in file_list:
120 |         progress.update(1)
121 |         try:
122 |             doc_result, doc_info = predict_document(
123 |                 f, model, tokenizer, config, batch_size=batch_size,
124 |                 max_length=max_length, gpu=gpu, input_format=input_format,
125 |                 language=language)
126 |             # save json format result
127 |             doc_id = doc_info['doc_id']
128 |             with open(os.path.join(output_path, '{}.json'.format(doc_id)), 'w') as w:
129 |                 for sent_id, token_ids, tokens, graph in doc_result:
130 |                     output = {
131 |                         'doc_id': doc_id,
132 |                         'sent_id': sent_id,
133 |                         'token_ids': token_ids,
134 |                         'tokens': tokens,
135 |                         'graph': graph.to_dict()
136 |                     }
137 |                     w.write(json.dumps(output) + '\n')
138 |             # write doc info
139 |             if log_path:
140 |                 log_writer.write(json.dumps(doc_info) + '\n')
141 |                 log_writer.flush()
142 |         except Exception as e:
143 |             traceback.print_exc()
144 |             if log_path:
145 |                 log_writer.write(json.dumps(
146 |                     {'file': file, 'message': str(e)}) + '\n')
147 |                 log_writer.flush()
148 |     progress.close()
149 | 
150 |     # convert to the cold-start format
151 |     if cs_path:
152 |         print('Converting to cs format')
153 |         json_to_cs(output_path, cs_path)
154 | 
155 | 
156 | parser = ArgumentParser()
157 | parser.add_argument('-m', '--model_path', help='path to the trained model')
158 | parser.add_argument('-i', '--input_dir', help='path to the input folder (ltf files)')
159 | parser.add_argument('-o', '--output_dir', help='path to the output folder (json files)')
160 | parser.add_argument('-l', '--log_path', default=None, help='path to the log file')
161 | parser.add_argument('-c', '--cs_dir', default=None, help='path to the output folder (cs files)')
162 | parser.add_argument('--gpu', action='store_true', help='use gpu')
163 | parser.add_argument('-d', '--device', default=0, type=int, help='gpu device index')
164 | parser.add_argument('-b', '--batch_size', default=10, type=int, help='batch size')
165 | parser.add_argument('--max_len', default=128, type=int, help='max sentence length')
166 | parser.add_argument('--beam_size', default=5, type=int, help='beam set size')
167 | parser.add_argument('--lang', default='english', help='Model language')
168 | parser.add_argument('--format', default='txt', help='Input format (txt, ltf, json)')
169 | 
170 | args = parser.parse_args()
171 | extension = format_ext_mapping.get(args.format, 'ltf.xml')
172 | 
173 | predict(
174 |     model_path=args.model_path,
175 |     input_path=args.input_dir,
176 |     output_path=args.output_dir,
177 |     cs_path=args.cs_dir,
178 |     log_path=args.log_path,
179 |     batch_size=args.batch_size,
180 |     max_length=args.max_len,
181 |     device=args.device,
182 |     gpu=args.gpu,
183 |     beam_size=args.beam_size,
184 |     file_extension=extension,
185 |     input_format=args.format,
186 |     language=args.lang,
187 | )


--------------------------------------------------------------------------------
/evaluations/supervised-ie/preprocessing/process_dygiepp.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from argparse import ArgumentParser
  3 | from transformers import BertTokenizer
  4 | 
  5 | 
  6 | def map_index(pieces):
  7 |     idxs = []
  8 |     for i, piece in enumerate(pieces):
  9 |         if i == 0:
 10 |             idxs.append([0, len(piece)])
 11 |         else:
 12 |             _, last = idxs[-1]
 13 |             idxs.append([last, last + len(piece)])
 14 |     return idxs
 15 | 
 16 | 
 17 | def convert(input_file, output_file, tokenizer):
 18 |     with open(input_file, 'r', encoding='utf-8') as r, \
 19 |             open(output_file, 'w', encoding='utf-8') as w:
 20 |         for line in r:
 21 |             doc = json.loads(line)
 22 |             doc_id = doc['doc_key']
 23 |             sentences = doc['sentences']
 24 |             sent_num = len(sentences)
 25 |             entities = doc.get('ner', [[] for _ in range(sent_num)])
 26 |             relations = doc.get('relations', [[] for _ in range(sent_num)])
 27 |             events = doc.get('events', [[] for _ in range(sent_num)])
 28 | 
 29 |             offset = 0
 30 |             for i, (sent_tokens, sent_entities, sent_relations, sent_events) in enumerate(zip(
 31 |                 sentences, entities, relations, events
 32 |             )):
 33 |                 sent_id = '{}-{}'.format(doc_id, i)
 34 |                 pieces = [tokenizer.tokenize(t) for t in sent_tokens]
 35 |                 word_lens = [len(p) for p in pieces]
 36 |                 idx_mapping = map_index(pieces)
 37 | 
 38 |                 sent_entities_ = []
 39 |                 sent_entity_map = {}
 40 |                 for j, (start, end, entity_type) in enumerate(sent_entities):
 41 |                     start, end = start - offset, end - offset + 1
 42 |                     entity_id = '{}-E{}'.format(sent_id, j)
 43 |                     entity = {
 44 |                         'id': entity_id,
 45 |                         'start': start, 'end': end,
 46 |                         'entity_type': entity_type,
 47 |                         # Mention types are not included in DyGIE++'s format
 48 |                         'mention_type': 'UNK',
 49 |                         'text': ' '.join(sent_tokens[start:end])}
 50 |                     sent_entities_.append(entity)
 51 |                     sent_entity_map[start] = entity
 52 | 
 53 |                 sent_relations_ = []
 54 |                 for j, (start1, end1, start2, end2, rel_type) in enumerate(sent_relations):
 55 |                     start1, end1 = start1 - offset, end1 - offset
 56 |                     start2, end2 = start2 - offset, end2 - offset
 57 |                     arg1 = sent_entity_map[start1]
 58 |                     arg2 = sent_entity_map[start2]
 59 |                     relation_id = '{}-R{}'.format(sent_id, j)
 60 |                     rel_type = rel_type.split('.')[0]
 61 |                     relation = {
 62 |                         'relation_type': rel_type,
 63 |                         'id': relation_id,
 64 |                         'arguments': [
 65 |                             {
 66 |                                 'entity_id': arg1['id'],
 67 |                                 'text': arg1['text'],
 68 |                                 'role': 'Arg-1'
 69 |                             },
 70 |                             {
 71 |                                 'entity_id': arg2['id'],
 72 |                                 'text': arg2['text'],
 73 |                                 'role': 'Arg-2'
 74 |                             },
 75 |                         ]
 76 |                     }
 77 |                     sent_relations_.append(relation)
 78 | 
 79 |                 sent_events_ = []
 80 |                 for j, event in enumerate(sent_events):
 81 |                     event_id = '{}-EV{}'.format(sent_id, j)
 82 |                     if len(event[0]) == 3:
 83 |                         trigger_start, trigger_end, event_type = event[0]
 84 |                     elif len(event[0]) == 2:
 85 |                         trigger_start, event_type = event[0]
 86 |                         trigger_end = trigger_start
 87 |                     trigger_start, trigger_end = trigger_start - offset, trigger_end - offset + 1
 88 |                     event_type = event_type.replace('.', ':')
 89 |                     args = event[1:]
 90 |                     args_ = []
 91 |                     for arg_start, arg_end, role in args:
 92 |                         arg_start, arg_end = arg_start - offset, arg_end - offset
 93 |                         arg = sent_entity_map[arg_start]
 94 |                         args_.append({
 95 |                             'entity_id': arg['id'],
 96 |                             'text': arg['text'],
 97 |                             'role': role
 98 |                         })
 99 |                     event_obj = {
100 |                         'event_type': event_type,
101 |                         'id': event_id,
102 |                         'trigger': {
103 |                             'start': trigger_start,
104 |                             'end': trigger_end,
105 |                             'text': ' '.join(sent_tokens[trigger_start:trigger_end])
106 |                         },
107 |                         'arguments': args_
108 |                     }
109 |                     sent_events_.append(event_obj)
110 | 
111 |                 sent_ = {
112 |                     'doc_id': doc_id,
113 |                     'sent_id': sent_id,
114 |                     'entity_mentions': sent_entities_,
115 |                     'relation_mentions': sent_relations_,
116 |                     'event_mentions': sent_events_,
117 |                     'tokens': sent_tokens,
118 |                     'pieces': [p for w in pieces for p in w],
119 |                     'token_lens': word_lens,
120 |                     'sentence': ' '.join(sent_tokens)
121 |                 }
122 |                 w.write(json.dumps(sent_) + '\n')
123 | 
124 |                 offset += len(sent_tokens)
125 | 
126 | 
127 | if __name__ == '__main__':
128 |     parser = ArgumentParser()
129 |     parser.add_argument('-i', '--input', help='Path to the input file')
130 |     parser.add_argument('-o', '--output', help='Path to the output file')
131 |     args = parser.parse_args()
132 |     
133 |     bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
134 |                                                do_lower_case=False)
135 |     convert(args.input, args.output, bert_tokenizer)
136 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/ace_to_aida_entity.tsv:
--------------------------------------------------------------------------------
 1 | PER	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Person
 2 | ORG	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Organization
 3 | GPE	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#GeopoliticalEntity
 4 | LOC	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Location
 5 | FAC	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Facility
 6 | WEA	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Weapon
 7 | VEH	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Vehicle
 8 | TME	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Time
 9 | TTL	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Title
10 | VAL	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#NumericalValue
11 | MON	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Money
12 | URL	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#URL


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/ace_to_aida_event.tsv:
--------------------------------------------------------------------------------
 1 | Business:Declare-Bankruptcy	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Business.DeclareBankruptcy
 2 | Business:Merge-Org	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Business.Merge
 3 | Business:End-Org	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Business.End
 4 | Business:Start-Org	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Business.Start
 5 | Conflict:Attack	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Conflict.Attack
 6 | Conflict:Demonstrate	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Conflict.Demonstrate
 7 | Contact:Correspondence	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Contact.Correspondence
 8 | Contact:Phone-Write	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Contact.Correspondence
 9 | Contact:Meet	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Contact.Meet
10 | Justice:Appeal	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Appeal
11 | Justice:Arrest-Jail	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.ArrestJail
12 | Justice:Charge-Indict	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.ChargeIndict
13 | Justice:Convict	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Convict
14 | Justice:Execute	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Execute
15 | Justice:Fine	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Fine
16 | Justice:Release-Parole	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.ReleaseParole
17 | Justice:Sentence	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Sentence
18 | Justice:Sue	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Sue
19 | Justice:Trial-Hearing	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.TrialHearing
20 | Justice:Pardon	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Pardon
21 | Justice:Extradite	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Extradite
22 | Justice:Acquit	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Justice.Acquit
23 | Life:Be-Born	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Life.BeBorn
24 | Life:Die	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Life.Die
25 | Life:Injure	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Life.Injure
26 | Life:Marry	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Life.Marry
27 | Life:Divorce	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Life.Divorce
28 | Movement:Transport	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Movement.TransportArtifact
29 | Movement:Transport-Artifact	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Movement.TransportArtifact
30 | Movement:Transport-Person	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Movement.TransportPerson
31 | Personnel:Elect	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Personnel.Elect
32 | Personnel:End-Position	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Personnel.EndPosition
33 | Personnel:Start-Position	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Personnel.StartPosition
34 | Personnel:Nominate	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Personnel.Nominate
35 | Transaction:Transfer-Money	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Transaction.TransferMoney
36 | Transaction:Transfer-Ownership	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Transaction.TransferOwnership
37 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/ace_to_aida_relation.tsv:
--------------------------------------------------------------------------------
1 | ORG-AFF	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#OrganizationAffiliation
2 | ART	Artifact
3 | GEN-AFF	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#GeneralAffiliation
4 | PART-WHOLE	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#PartWhole
5 | PHYS	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#Physical
6 | PER-SOC	https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/SeedlingOntology#PersonalSocial


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/ere_patterns/event_role.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "Movement:Transport": [
  3 |     "Vehicle",
  4 |     "Artifact",
  5 |     "Agent",
  6 |     "Origin",
  7 |     "Destination"
  8 |   ],
  9 |   "Personnel:Elect": [
 10 |     "Place",
 11 |     "Person",
 12 |     "Entity"
 13 |   ],
 14 |   "Personnel:Start-Position": [
 15 |     "Place",
 16 |     "Person",
 17 |     "Entity"
 18 |   ],
 19 |   "Personnel:Nominate": [
 20 |     "Agent",
 21 |     "Person"
 22 |   ],
 23 |   "Personnel:End-Position": [
 24 |     "Place",
 25 |     "Person",
 26 |     "Entity"
 27 |   ],
 28 |   "Conflict:Attack": [
 29 |     "Target",
 30 |     "Place",
 31 |     "Victim",
 32 |     "Instrument",
 33 |     "Attacker"
 34 |   ],
 35 |   "Contact:Meet": [
 36 |     "Place",
 37 |     "Entity"
 38 |   ],
 39 |   "Life:Marry": [
 40 |     "Place",
 41 |     "Person"
 42 |   ],
 43 |   "Transaction:Transfer-Money": [
 44 |     "Giver",
 45 |     "Place",
 46 |     "Recipient",
 47 |     "Beneficiary"
 48 |   ],
 49 |   "Conflict:Demonstrate": [
 50 |     "Place",
 51 |     "Entity"
 52 |   ],
 53 |   "Business:End-Org": [
 54 |     "Place",
 55 |     "Org"
 56 |   ],
 57 |   "Justice:Sue": [
 58 |     "Defendant",
 59 |     "Plaintiff",
 60 |     "Adjudicator",
 61 |     "Place"
 62 |   ],
 63 |   "Life:Injure": [
 64 |     "Agent",
 65 |     "Place",
 66 |     "Victim",
 67 |     "Instrument"
 68 |   ],
 69 |   "Life:Die": [
 70 |     "Person",
 71 |     "Agent",
 72 |     "Place",
 73 |     "Victim",
 74 |     "Instrument"
 75 |   ],
 76 |   "Justice:Arrest-Jail": [
 77 |     "Agent",
 78 |     "Place",
 79 |     "Person"
 80 |   ],
 81 |   "Contact:Phone-Write": [
 82 |     "Place",
 83 |     "Entity"
 84 |   ],
 85 |   "Transaction:Transfer-Ownership": [
 86 |     "Artifact",
 87 |     "Beneficiary",
 88 |     "Buyer",
 89 |     "Place",
 90 |     "Seller"
 91 |   ],
 92 |   "Business:Start-Org": [
 93 |     "Agent",
 94 |     "Place",
 95 |     "Org"
 96 |   ],
 97 |   "Justice:Execute": [
 98 |     "Agent",
 99 |     "Place",
100 |     "Person"
101 |   ],
102 |   "Justice:Trial-Hearing": [
103 |     "Prosecutor",
104 |     "Defendant",
105 |     "Place",
106 |     "Adjudicator"
107 |   ],
108 |   "Life:Be-Born": [
109 |     "Place",
110 |     "Person"
111 |   ],
112 |   "Justice:Charge-Indict": [
113 |     "Prosecutor",
114 |     "Adjudicator",
115 |     "Place",
116 |     "Defendant"
117 |   ],
118 |   "Justice:Convict": [
119 |     "Defendant",
120 |     "Place",
121 |     "Adjudicator"
122 |   ],
123 |   "Justice:Sentence": [
124 |     "Adjudicator",
125 |     "Place",
126 |     "Defendant"
127 |   ],
128 |   "Business:Declare-Bankruptcy": [
129 |     "Place",
130 |     "Org"
131 |   ],
132 |   "Justice:Release-Parole": [
133 |     "Place",
134 |     "Person",
135 |     "Entity"
136 |   ],
137 |   "Justice:Fine": [
138 |     "Adjudicator",
139 |     "Place",
140 |     "Entity"
141 |   ],
142 |   "Justice:Pardon": [
143 |     "Adjudicator",
144 |     "Place",
145 |     "Defendant"
146 |   ],
147 |   "Justice:Appeal": [
148 |     "Adjudicator",
149 |     "Plaintiff",
150 |     "Place"
151 |   ],
152 |   "Justice:Extradite": [
153 |     "Agent",
154 |     "Origin",
155 |     "Destination"
156 |   ],
157 |   "Life:Divorce": [
158 |     "Place",
159 |     "Person"
160 |   ],
161 |   "Business:Merge-Org": [
162 |     "Org"
163 |   ],
164 |   "Justice:Acquit": [
165 |     "Defendant",
166 |     "Adjudicator"
167 |   ]
168 | }
169 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/ere_patterns/relation_entity.json:
--------------------------------------------------------------------------------
1 | {"ORG-AFF": ["ORG", "PER", "GPE", "FAC"], "GEN-AFF": ["LOC", "PER", "FAC", "ORG", "GPE"], "PHYS": ["LOC", "PER", "FAC", "VEH", "ORG", "GPE"], "PART-WHOLE": ["LOC", "WEA", "PER", "FAC", "VEH", "ORG", "GPE"], "PER-SOC": ["ORG", "PER"]}


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/ere_patterns/role_entity.json:
--------------------------------------------------------------------------------
1 | {"Attacker": ["ORG", "PER", "GPE"], "Place": ["LOC", "GPE", "FAC"], "Target": ["LOC", "WEA", "PER", "FAC", "VEH", "ORG"], "Victim": ["PER"], "Agent": ["ORG", "PER", "GPE"], "Entity": ["ORG", "PER", "GPE"], "Instrument": ["WEA", "VEH"], "Artifact": ["WEA", "PER", "VEH", "FAC", "ORG"], "Origin": ["LOC", "GPE", "FAC"], "Vehicle": ["VEH"], "Destination": ["LOC", "GPE", "FAC"], "Buyer": ["ORG", "PER", "GPE"], "Person": ["PER"], "Org": ["ORG", "PER"], "Adjudicator": ["ORG", "PER", "GPE"], "Plaintiff": ["ORG", "PER", "GPE"], "Defendant": ["ORG", "PER", "GPE"], "Prosecutor": ["ORG", "PER", "GPE"], "Giver": ["ORG", "PER", "GPE"], "Seller": ["ORG", "PER", "GPE"], "Recipient": ["ORG", "PER", "GPE"], "Beneficiary": ["ORG", "PER", "GPE"]}


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ACE05-CN/dev.doc.txt:
--------------------------------------------------------------------------------
 1 | CTS20001211.1300.0012
 2 | CBS20001113.1000.0822
 3 | XIN20001231.1400.0076
 4 | XIN20001009.0800.0058
 5 | VOM20001027.1800.0230
 6 | XIN20001125.0800.0031
 7 | XIN20001205.2000.0143
 8 | CBS20001008.1000.0742
 9 | DAVYZW_20050127.1720
10 | VOM20001024.1800.2758
11 | CTS20001024.1300.0506
12 | VOM20001024.1800.1850
13 | XIN20001219.2000.0158
14 | NJWSL_20041211.1642
15 | CTS20001030.1800.0439
16 | XIN20001216.1400.0090
17 | XIN20001125.1400.0078
18 | XIN20001209.0200.0008
19 | CTV20001205.1330.1436
20 | CTV20001106.1330.0676
21 | CBS20001205.1000.0731
22 | XIN20001001.2000.0152
23 | XIN20001102.1400.0144
24 | CTS20001130.1300.0941
25 | XIN20001020.0200.0006
26 | LIUYIFENG_20050127.0709
27 | CBS20001001.1000.0041
28 | ZBN20001228.0400.0017
29 | VOM20001024.1800.2163
30 | CTS20001016.1300.0297
31 | XIN20001124.1400.0105
32 | CTS20001019.1300.0638
33 | XIN20001223.2000.0095
34 | LIUYIFENG_20050128.0814
35 | LIUYIFENG_20050115.0916
36 | XIN20001129.0200.0039
37 | CTV20001207.1330.0642
38 | DAVYZW_20050110.1403
39 | CTS20001031.1300.1129
40 | LIUYIFENG_20050126.0820
41 | CTS20001215.1300.0532
42 | VOM20001222.0700.1974
43 | CTS20001004.1300.0461
44 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ACE05-CN/test.doc.txt:
--------------------------------------------------------------------------------
 1 | CBS20001126.1000.0700
 2 | XIN20001007.0800.0037
 3 | VOM20001020.1800.2981
 4 | XIN20001216.1400.0085
 5 | XIN20001024.2000.0153
 6 | CTV20001026.1530.0802
 7 | LIUYIFENG_20050124.1835
 8 | XIN20001122.1400.0074
 9 | CBS20001023.1000.1067
10 | VOM20001216.0700.1886
11 | XIN20001017.1400.0130
12 | CTS20001224.1300.0396
13 | XIN20001003.0200.0001
14 | VOM20001005.1800.1966
15 | XIN20001126.2000.0101
16 | CBS20001129.1000.1072
17 | LIUYIFENG_20050129.0957
18 | XIN20001107.2000.0150
19 | CBS20001123.1000.1060
20 | XIN20001216.1400.0068
21 | CBS20001118.1000.0340
22 | CTV20001116.1330.0474
23 | XIN20001020.0200.0018
24 | CBS20001021.1000.0734
25 | DAVYZW_20050114.1634
26 | CNR20001201.1700.1429
27 | XIN20001217.2000.0089
28 | XIN20001228.0200.0038
29 | CBS20001117.1000.0341
30 | CTS20001108.1300.0504
31 | XIN20001010.0800.0053
32 | CTS20001015.1300.1065
33 | CTS20001105.1300.0613
34 | LIUYIFENG_20050113.1047
35 | CTV20001005.1330.1455
36 | LANGLANGGARGEN_20050124.1017
37 | CTV20001011.1330.0522
38 | DAVYZW_20050124.1833
39 | XIN20001126.0800.0042
40 | ZBN20001119.1300.0039
41 | VOM20001006.1800.0436
42 | CTV20001129.1330.1511
43 | XIN20001031.0800.0085
44 | LIUYIFENG_20050112.1200
45 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ACE05-E/dev.doc.txt:
--------------------------------------------------------------------------------
 1 | CNN_CF_20030303.1900.02
 2 | CNN_IP_20030329.1600.00-2
 3 | CNN_IP_20030402.1600.00-1
 4 | CNN_IP_20030405.1600.01-1
 5 | CNN_IP_20030409.1600.02
 6 | marcellapr_20050228.2219
 7 | rec.games.chess.politics_20041217.2111
 8 | soc.org.nonprofit_20050218.1902
 9 | FLOPPINGACES_20050217.1237.014
10 | AGGRESSIVEVOICEDAILY_20041116.1347
11 | FLOPPINGACES_20041117.2002.024
12 | FLOPPINGACES_20050203.1953.038
13 | TTRACY_20050223.1049
14 | CNNHL_ENG_20030304_142751.10
15 | CNNHL_ENG_20030424_123502.25
16 | CNNHL_ENG_20030513_220910.32
17 | CNN_ENG_20030304_173120.16
18 | CNN_ENG_20030328_150609.10
19 | CNN_ENG_20030424_070008.15
20 | CNN_ENG_20030512_170454.13
21 | CNN_ENG_20030620_085840.7
22 | AFP_ENG_20030305.0918
23 | AFP_ENG_20030311.0491
24 | AFP_ENG_20030314.0238
25 | AFP_ENG_20030319.0879
26 | AFP_ENG_20030320.0722
27 | AFP_ENG_20030327.0022
28 | AFP_ENG_20030327.0224
29 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ACE05-E/test.doc.txt:
--------------------------------------------------------------------------------
 1 | AFP_ENG_20030401.0476
 2 | AFP_ENG_20030413.0098
 3 | AFP_ENG_20030415.0734
 4 | AFP_ENG_20030417.0004
 5 | AFP_ENG_20030417.0307
 6 | AFP_ENG_20030417.0764
 7 | AFP_ENG_20030418.0556
 8 | AFP_ENG_20030425.0408
 9 | AFP_ENG_20030427.0118
10 | AFP_ENG_20030428.0720
11 | AFP_ENG_20030429.0007
12 | AFP_ENG_20030430.0075
13 | AFP_ENG_20030502.0614
14 | AFP_ENG_20030504.0248
15 | AFP_ENG_20030508.0118
16 | AFP_ENG_20030508.0357
17 | AFP_ENG_20030509.0345
18 | AFP_ENG_20030514.0706
19 | AFP_ENG_20030519.0049
20 | AFP_ENG_20030519.0372
21 | AFP_ENG_20030522.0878
22 | AFP_ENG_20030527.0616
23 | AFP_ENG_20030528.0561
24 | AFP_ENG_20030530.0132
25 | AFP_ENG_20030601.0262
26 | AFP_ENG_20030607.0030
27 | AFP_ENG_20030616.0715
28 | AFP_ENG_20030617.0846
29 | AFP_ENG_20030625.0057
30 | AFP_ENG_20030630.0271
31 | APW_ENG_20030304.0555
32 | APW_ENG_20030306.0191
33 | APW_ENG_20030308.0314
34 | APW_ENG_20030310.0719
35 | APW_ENG_20030311.0775
36 | APW_ENG_20030318.0689
37 | APW_ENG_20030319.0545
38 | APW_ENG_20030322.0119
39 | APW_ENG_20030324.0768
40 | APW_ENG_20030325.0786
41 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ACE05-R/dev.doc.txt:
--------------------------------------------------------------------------------
 1 | CNN_ENG_20030530_130025.12
 2 | CNN_ENG_20030605_085831.13
 3 | CNN_ENG_20030415_103039.0
 4 | CNN_ENG_20030407_080037.12
 5 | CNN_ENG_20030429_110706.7
 6 | CNN_ENG_20030428_193655.2
 7 | CNNHL_ENG_20030604_230238.5
 8 | CNN_ENG_20030612_072835.2
 9 | CNN_ENG_20030306_083604.6
10 | CNN_ENG_20030624_140104.22
11 | CNN_ENG_20030627_065846.3
12 | CNN_ENG_20030429_083016.5
13 | CNN_ENG_20030509_123601.13
14 | CNN_ENG_20030423_180539.2
15 | CNN_ENG_20030617_193116.10
16 | CNN_ENG_20030507_160538.15
17 | CNN_ENG_20030422_083005.10
18 | CNN_ENG_20030305_170125.1
19 | CNN_ENG_20030320_153434.7
20 | CNN_ENG_20030509_090025.5
21 | CNN_ENG_20030618_150128.6
22 | CNN_ENG_20030617_173115.14
23 | CNN_ENG_20030502_093018.6
24 | CNN_ENG_20030409_180633.8
25 | CNN_ENG_20030624_153103.17
26 | CNN_ENG_20030407_130604.10
27 | CNN_ENG_20030329_170349.7
28 | CNNHL_ENG_20030416_230741.33
29 | CNNHL_ENG_20030402_193443.5
30 | CNN_ENG_20030620_170011.14
31 | CNN_ENG_20030626_193133.8
32 | CNN_ENG_20030610_085833.10
33 | CNN_ENG_20030507_170539.0
34 | CNN_ENG_20030526_183538.3
35 | CNN_ENG_20030513_080020.2
36 | CNN_ENG_20030611_102832.3
37 | XIN_ENG_20030513.0002
38 | XIN_ENG_20030408.0341
39 | APW_ENG_20030331.0410
40 | APW_ENG_20030409.0013
41 | APW_ENG_20030519.0548
42 | AFP_ENG_20030429.0007
43 | APW_ENG_20030422.0469
44 | AFP_ENG_20030330.0211
45 | APW_ENG_20030419.0358
46 | APW_ENG_20030619.0383
47 | APW_ENG_20030310.0719
48 | AFP_ENG_20030519.0049
49 | AFP_ENG_20030327.0224
50 | AFP_ENG_20030401.0476
51 | APW_ENG_20030519.0367
52 | NYT_ENG_20030630.0079
53 | MARKETVIEW_20050216.2120
54 | AGGRESSIVEVOICEDAILY_20041101.1806
55 | MARKETVIEW_20050215.1858
56 | MARKETVIEW_20041209.1401
57 | MARKBACKER_20050217.0647
58 | MARKETVIEW_20050208.2033
59 | BACONSREBELLION_20050209.0721
60 | MARKBACKER_20041128.1641
61 | MARKETVIEW_20050209.1923
62 | BACONSREBELLION_20050127.1017
63 | AGGRESSIVEVOICEDAILY_20041101.1144
64 | MARKETVIEW_20050120.1641
65 | MARKETVIEW_20050212.1607
66 | MARKBACKER_20041112.0707
67 | MARKETVIEW_20050222.0729
68 | MARKETVIEW_20050226.1307
69 | FLOPPINGACES_20050101.2244.048
70 | BACONSREBELLION_20050226.1317
71 | BACONSREBELLION_20050216.1632
72 | CNN_IP_20030414.1600.04
73 | CNN_IP_20030329.1600.00-5
74 | CNN_IP_20030406.1600.03
75 | CNN_CF_20030304.1900.01
76 | CNN_IP_20030408.1600.03
77 | CNN_IP_20030412.1600.05
78 | CNN_IP_20030402.1600.00-4
79 | CNN_IP_20030408.1600.04
80 | CNN_IP_20030404.1600.00-2
81 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ACE05-R/test.doc.txt:
--------------------------------------------------------------------------------
 1 | CNN_ENG_20030527_195948.3
 2 | CNN_ENG_20030618_065839.11
 3 | CNN_ENG_20030411_070039.21
 4 | CNN_ENG_20030415_173752.0
 5 | CNN_ENG_20030515_193533.6
 6 | CNN_ENG_20030403_080032.9
 7 | CNN_ENG_20030407_170605.7
 8 | CNNHL_ENG_20030513_220910.32
 9 | CNN_ENG_20030416_180808.15
10 | CNNHL_ENG_20030304_142751.10
11 | CNN_ENG_20030506_053020.14
12 | CNN_ENG_20030607_170312.6
13 | CNNHL_ENG_20030624_230338.34
14 | CNN_ENG_20030516_123543.8
15 | CNN_ENG_20030401_073033.14
16 | CNN_ENG_20030501_160459.0
17 | CNN_ENG_20030508_170552.18
18 | CNN_ENG_20030624_153103.16
19 | CNN_ENG_20030410_183644.8
20 | CNN_ENG_20030325_220534.6
21 | CNN_ENG_20030424_073006.4
22 | CNN_ENG_20030528_172957.18
23 | CNN_ENG_20030528_125956.8
24 | CNN_ENG_20030408_123613.0
25 | CNN_ENG_20030617_065838.21
26 | CNNHL_ENG_20030416_133739.9
27 | CNN_ENG_20030312_083725.3
28 | CNN_ENG_20030501_063017.15
29 | CNNHL_ENG_20030611_133445.24
30 | CNN_ENG_20030416_100042.7
31 | CNN_ENG_20030418_083040.11
32 | CNNHL_ENG_20030610_133347.6
33 | CNN_ENG_20030327_163556.20
34 | CNNHL_ENG_20030407_193547.5
35 | CNNHL_ENG_20030331_193419.9
36 | CNNHL_ENG_20030609_133335.37
37 | AFP_ENG_20030509.0345
38 | APW_ENG_20030318.0689
39 | APW_ENG_20030520.0757
40 | APW_ENG_20030416.0581
41 | AFP_ENG_20030502.0614
42 | APW_ENG_20030602.0037
43 | APW_ENG_20030324.0768
44 | APW_ENG_20030410.0906
45 | AFP_ENG_20030304.0250
46 | APW_ENG_20030325.0786
47 | AFP_ENG_20030427.0118
48 | AFP_ENG_20030514.0706
49 | APW_ENG_20030610.0010
50 | APW_ENG_20030527.0232
51 | AFP_ENG_20030323.0020
52 | XIN_ENG_20030415.0379
53 | AGGRESSIVEVOICEDAILY_20041116.1347
54 | MARKETVIEW_20050217.2115
55 | FLOPPINGACES_20041114.1240.039
56 | MARKETVIEW_20041213.0722
57 | AGGRESSIVEVOICEDAILY_20050205.1954
58 | AGGRESSIVEVOICEDAILY_20050125.0136
59 | AGGRESSIVEVOICEDAILY_20050124.1354
60 | AGGRESSIVEVOICEDAILY_20050109.1627
61 | MARKETVIEW_20050201.0748
62 | AGGRESSIVEVOICEDAILY_20050114.1922
63 | AGGRESSIVEVOICEDAILY_20041208.2133
64 | MARKETVIEW_20050206.2009
65 | MARKETVIEW_20041215.2128
66 | FLOPPINGACES_20041115.1613.032
67 | MARKETVIEW_20050210.2138
68 | MARKETVIEW_20050226.1444
69 | AGGRESSIVEVOICEDAILY_20050116.2149
70 | TTRACY_20050223.1049
71 | OIADVANTAGE_20050204.1155
72 | CNN_CF_20030303.1900.05
73 | CNN_IP_20030405.1600.01-3
74 | CNN_IP_20030405.1600.00-3
75 | CNN_IP_20030329.1600.00-6
76 | CNN_IP_20030402.1600.02-2
77 | CNN_IP_20030404.1600.00-1
78 | CNN_IP_20030422.1600.05
79 | CNN_IP_20030405.1600.00-2
80 | CNN_IP_20030402.1600.00-2
81 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ACE05-R/train.doc.txt:
--------------------------------------------------------------------------------
  1 | CNN_ENG_20030416_160804.4
  2 | CNNHL_ENG_20030603_230307.3
  3 | CNN_ENG_20030415_183752.14
  4 | CNN_ENG_20030506_163523.22
  5 | CNN_ENG_20030605_065831.18
  6 | CNN_ENG_20030304_173120.16
  7 | CNN_ENG_20030328_150609.10
  8 | CNN_ENG_20030408_200618.14
  9 | CNN_ENG_20030619_125955.10
 10 | CNN_ENG_20030604_092828.7
 11 | CNN_ENG_20030421_120508.17
 12 | CNN_ENG_20030404_073033.4
 13 | CNN_ENG_20030625_220123.3
 14 | CNNHL_ENG_20030513_183907.5
 15 | CNN_ENG_20030513_113501.6
 16 | CNNHL_ENG_20030618_230303.36
 17 | CNN_ENG_20030527_215946.12
 18 | CNN_ENG_20030325_150531.10
 19 | CNNHL_ENG_20030429_220618.15
 20 | CNNHL_ENG_20030625_230351.4
 21 | CNN_ENG_20030616_130059.25
 22 | CNN_ENG_20030610_130042.17
 23 | CNN_ENG_20030306_070606.18
 24 | CNNHL_ENG_20030519_124020.23
 25 | CNNHL_ENG_20030410_193626.13
 26 | CNN_ENG_20030618_150128.5
 27 | CNN_ENG_20030630_085848.18
 28 | CNN_ENG_20030605_223004.4
 29 | CNN_ENG_20030424_113549.11
 30 | CNN_ENG_20030610_123040.9
 31 | CNNHL_ENG_20030616_230155.7
 32 | CNNHL_ENG_20030416_133739.13
 33 | CNN_ENG_20030418_063040.1
 34 | CNN_ENG_20030529_085826.10
 35 | CNNHL_ENG_20030403_133453.21
 36 | CNN_ENG_20030512_190454.7
 37 | CNN_ENG_20030403_060032.0
 38 | CNN_ENG_20030428_173654.13
 39 | CNN_ENG_20030425_063006.5
 40 | CNN_ENG_20030526_133535.4
 41 | CNN_ENG_20030408_153616.9
 42 | CNN_ENG_20030528_165958.16
 43 | CNNHL_ENG_20030403_193455.30
 44 | CNN_ENG_20030514_130518.5
 45 | CNNHL_ENG_20030411_230640.38
 46 | CNN_ENG_20030424_173553.8
 47 | CNN_ENG_20030622_173306.9
 48 | CNN_ENG_20030605_105831.11
 49 | CNN_ENG_20030618_193127.17
 50 | CNN_ENG_20030619_115954.4
 51 | CNN_ENG_20030619_115954.10
 52 | CNN_ENG_20030417_063039.0
 53 | CNNHL_ENG_20030616_230155.28
 54 | CNN_ENG_20030612_173004.10
 55 | CNN_ENG_20030416_190806.4
 56 | CNNHL_ENG_20030505_220734.25
 57 | CNN_ENG_20030513_160506.16
 58 | CNN_ENG_20030403_090032.1
 59 | CNN_ENG_20030430_093016.0
 60 | CNN_ENG_20030429_190711.14
 61 | CNNHL_ENG_20030618_230303.6
 62 | CNNHL_ENG_20030624_133331.33
 63 | CNNHL_ENG_20030428_123600.14
 64 | CNN_ENG_20030403_183513.1
 65 | CNN_ENG_20030404_163526.10
 66 | CNN_ENG_20030621_115841.16
 67 | CNNHL_ENG_20030430_220712.37
 68 | CNN_ENG_20030605_193002.8
 69 | CNN_ENG_20030403_180511.16
 70 | CNN_ENG_20030525_160525.13
 71 | CNN_ENG_20030610_133041.17
 72 | CNN_ENG_20030610_095857.4
 73 | CNN_ENG_20030612_173004.2
 74 | CNN_ENG_20030525_143522.8
 75 | CNN_ENG_20030312_223733.14
 76 | CNN_ENG_20030421_120508.13
 77 | CNN_ENG_20030620_085840.7
 78 | CNN_ENG_20030625_210122.0
 79 | CNN_ENG_20030604_102828.6
 80 | CNN_ENG_20030603_095830.17
 81 | CNN_ENG_20030612_160005.13
 82 | CNN_ENG_20030602_072826.1
 83 | CNNHL_ENG_20030424_123502.25
 84 | CNN_ENG_20030630_075848.7
 85 | CNN_ENG_20030425_133605.6
 86 | CNN_ENG_20030418_130831.5
 87 | CNN_ENG_20030617_112838.4
 88 | CNN_ENG_20030614_173123.4
 89 | CNN_ENG_20030626_203133.11
 90 | CNN_ENG_20030515_063019.6
 91 | CNNHL_ENG_20030513_220910.11
 92 | CNN_ENG_20030515_073019.7
 93 | CNNHL_ENG_20030625_193346.7
 94 | CNN_ENG_20030524_143511.4
 95 | CNN_ENG_20030430_160723.6
 96 | CNN_ENG_20030602_102826.13
 97 | CNN_ENG_20030418_163834.14
 98 | CNN_ENG_20030529_130011.6
 99 | CNN_ENG_20030426_160621.0
100 | CNN_ENG_20030602_133012.9
101 | CNN_ENG_20030318_140851.8
102 | CNN_ENG_20030331_193655.14
103 | CNN_ENG_20030421_090007.11
104 | CNNHL_ENG_20030526_221156.39
105 | CNN_ENG_20030422_213527.4
106 | CNN_ENG_20030611_175950.5
107 | CNN_ENG_20030617_173115.22
108 | CNN_ENG_20030508_210555.5
109 | CNN_ENG_20030610_105832.1
110 | CNN_ENG_20030402_190500.11
111 | CNN_ENG_20030408_083034.11
112 | CNN_ENG_20030605_153000.9
113 | CNNHL_ENG_20030416_193742.7
114 | CNN_ENG_20030512_170454.13
115 | CNN_ENG_20030429_143706.14
116 | CNN_ENG_20030607_173310.4
117 | CNN_ENG_20030617_105836.4
118 | CNNHL_ENG_20030416_193742.26
119 | CNN_ENG_20030624_065843.24
120 | CNN_ENG_20030528_195959.20
121 | CNN_ENG_20030502_080020.7
122 | CNNHL_ENG_20030312_150218.13
123 | CNN_ENG_20030624_082841.12
124 | CNNHL_ENG_20030425_183518.12
125 | CNN_ENG_20030526_180540.6
126 | CNN_ENG_20030611_102832.4
127 | CNN_ENG_20030421_133510.6
128 | CNN_ENG_20030528_082823.9
129 | CNN_ENG_20030428_130651.4
130 | CNN_ENG_20030505_090022.1
131 | CNN_ENG_20030621_160254.25
132 | CNN_ENG_20030507_060023.1
133 | CNN_ENG_20030620_095840.4
134 | CNN_ENG_20030411_193701.3
135 | CNN_ENG_20030602_105829.2
136 | CNN_ENG_20030424_183556.7
137 | CNN_ENG_20030627_130145.6
138 | CNN_ENG_20030415_180754.5
139 | CNN_ENG_20030430_063016.14
140 | CNN_ENG_20030401_233449.5
141 | CNN_ENG_20030414_130735.7
142 | CNNHL_ENG_20030523_221118.14
143 | CNN_ENG_20030424_070008.15
144 | CNN_ENG_20030411_233701.11
145 | CNN_ENG_20030417_073039.2
146 | CNNHL_ENG_20030402_133449.22
147 | CNN_ENG_20030506_160524.18
148 | CNNHL_ENG_20030415_193729.5
149 | CNN_ENG_20030313_083739.0
150 | CNN_ENG_20030603_133025.7
151 | CNN_ENG_20030516_090022.7
152 | CNNHL_ENG_20030610_230438.14
153 | CNN_ENG_20030331_123648.4
154 | CNN_ENG_20030429_170710.4
155 | APW_ENG_20030404.0439
156 | XIN_ENG_20030314.0208
157 | XIN_ENG_20030624.0085
158 | AFP_ENG_20030320.0722
159 | AFP_ENG_20030327.0022
160 | APW_ENG_20030414.0392
161 | XIN_ENG_20030324.0191
162 | AFP_ENG_20030428.0720
163 | AFP_ENG_20030319.0879
164 | XIN_ENG_20030609.0118
165 | AFP_ENG_20030504.0248
166 | AFP_ENG_20030415.0734
167 | AFP_ENG_20030519.0372
168 | APW_ENG_20030415.0742
169 | APW_ENG_20030304.0555
170 | APW_ENG_20030408.0090
171 | APW_ENG_20030422.0485
172 | APW_ENG_20030508.0772
173 | APW_ENG_20030418.0084
174 | APW_ENG_20030403.0862
175 | NYT_ENG_20030602.0074
176 | XIN_ENG_20030317.0177
177 | APW_ENG_20030407.0030
178 | APW_ENG_20030603.0303
179 | AFP_ENG_20030417.0764
180 | APW_ENG_20030510.0228
181 | APW_ENG_20030520.0081
182 | AFP_ENG_20030417.0004
183 | APW_ENG_20030610.0554
184 | APW_ENG_20030423.0079
185 | APW_ENG_20030327.0376
186 | AFP_ENG_20030430.0075
187 | XIN_ENG_20030423.0011
188 | AFP_ENG_20030607.0030
189 | AFP_ENG_20030522.0878
190 | AFP_ENG_20030528.0561
191 | AFP_ENG_20030601.0262
192 | APW_ENG_20030406.0191
193 | XIN_ENG_20030610.0299
194 | APW_ENG_20030424.0532
195 | AFP_ENG_20030413.0098
196 | AFP_ENG_20030314.0238
197 | AFP_ENG_20030418.0556
198 | APW_ENG_20030412.0531
199 | APW_ENG_20030308.0314
200 | APW_ENG_20030424.0698
201 | AFP_ENG_20030425.0408
202 | APW_ENG_20030502.0686
203 | XIN_ENG_20030523.0202
204 | AFP_ENG_20030417.0307
205 | APW_ENG_20030411.0304
206 | NYT_ENG_20030403.0008
207 | AFP_ENG_20030617.0846
208 | AFP_ENG_20030616.0715
209 | AFP_ENG_20030508.0118
210 | AFP_ENG_20030527.0616
211 | AFP_ENG_20030311.0491
212 | AFP_ENG_20030530.0132
213 | APW_ENG_20030322.0119
214 | APW_ENG_20030319.0545
215 | XIN_ENG_20030616.0274
216 | APW_ENG_20030417.0555
217 | XIN_ENG_20030327.0202
218 | XIN_ENG_20030509.0137
219 | APW_ENG_20030513.0139
220 | AFP_ENG_20030305.0918
221 | APW_ENG_20030311.0775
222 | APW_ENG_20030326.0190
223 | APW_ENG_20030502.0470
224 | AFP_ENG_20030625.0057
225 | AFP_ENG_20030508.0357
226 | APW_ENG_20030306.0191
227 | AFP_ENG_20030630.0271
228 | XIN_ENG_20030425.0184
229 | MARKETVIEW_20050208.2059
230 | MARKBACKER_20041216.0656
231 | OIADVANTAGE_20050110.1009
232 | MARKETVIEW_20041211.1845
233 | BACONSREBELLION_20050222.0817
234 | AGGRESSIVEVOICEDAILY_20050224.2252
235 | MARKETVIEW_20041220.1537
236 | MARKBACKER_20050105.1526
237 | AGGRESSIVEVOICEDAILY_20050106.1310
238 | FLOPPINGACES_20041230.1844.003
239 | MARKETVIEW_20041217.0801
240 | BACONSREBELLION_20050217.0744
241 | MARKBACKER_20041117.0723
242 | MARKETVIEW_20041212.1447
243 | BACONSREBELLION_20050214.0944
244 | MARKETVIEW_20050225.0541
245 | MARKBACKER_20041217.1639
246 | OIADVANTAGE_20050103.0944
247 | BACONSREBELLION_20050123.1639
248 | AGGRESSIVEVOICEDAILY_20050105.1344
249 | MARKETVIEW_20050206.1951
250 | HEALINGIRAQ_20041108.1942.05
251 | OIADVANTAGE_20050109.1947
252 | AGGRESSIVEVOICEDAILY_20041215.2302
253 | AGGRESSIVEVOICEDAILY_20041218.0146
254 | MARKBACKER_20050103.0829
255 | BACONSREBELLION_20050205.1919
256 | AGGRESSIVEVOICEDAILY_20041203.1959
257 | MARKETVIEW_20050212.1717
258 | BACONSREBELLION_20050216.1536
259 | FLOPPINGACES_20041117.2002.024
260 | AGGRESSIVEVOICEDAILY_20050208.1142
261 | BACONSREBELLION_20050125.1108
262 | GETTINGPOLITICAL_20050105.0127.001
263 | MARKETVIEW_20050127.0716
264 | MARKETVIEW_20050105.1901
265 | MARKETVIEW_20050205.1358
266 | FLOPPINGACES_20041113.1528.042
267 | MARKETVIEW_20050222.1919
268 | MARKBACKER_20041103.1300
269 | BACONSREBELLION_20050218.1214
270 | AGGRESSIVEVOICEDAILY_20041226.1712
271 | MARKETVIEW_20050204.1322
272 | MARKETVIEW_20050126.0711
273 | MARKETVIEW_20041219.1509
274 | FLOPPINGACES_20050203.1953.038
275 | MARKETVIEW_20050204.1337
276 | BACONSREBELLION_20050227.1238
277 | MARKBACKER_20041206.0733
278 | AGGRESSIVEVOICEDAILY_20050224.1207
279 | MARKBACKER_20050105.1632
280 | MARKETVIEW_20050207.0746
281 | AGGRESSIVEVOICEDAILY_20041218.1004
282 | FLOPPINGACES_20041228.0927.010
283 | MARKBACKER_20041108.1507
284 | BACONSREBELLION_20050218.0848
285 | AGGRESSIVEVOICEDAILY_20041201.2313
286 | FLOPPINGACES_20050217.1237.014
287 | OIADVANTAGE_20050203.1000
288 | BACONSREBELLION_20050206.1345
289 | OIADVANTAGE_20041224.1007
290 | MARKBACKER_20041220.0919
291 | BACONSREBELLION_20050204.1326
292 | BACONSREBELLION_20050222.1348
293 | MARKETVIEW_20050204.1736
294 | AGGRESSIVEVOICEDAILY_20041223.1449
295 | MARKBACKER_20041119.1002
296 | MARKBACKER_20041202.0711
297 | AGGRESSIVEVOICEDAILY_20050203.1356
298 | BACONSREBELLION_20050210.0728
299 | OIADVANTAGE_20050203.2102
300 | BACONSREBELLION_20050216.1618
301 | MARKETVIEW_20050228.2211
302 | MARKBACKER_20041117.1107
303 | FLOPPINGACES_20041116.0833.027
304 | AGGRESSIVEVOICEDAILY_20050213.2123
305 | OIADVANTAGE_20050108.1323
306 | MARKETVIEW_20050214.2115
307 | AGGRESSIVEVOICEDAILY_20050113.1400
308 | AGGRESSIVEVOICEDAILY_20050107.2012
309 | OIADVANTAGE_20050105.0922
310 | CNN_IP_20030329.1600.00-2
311 | CNN_IP_20030329.1600.00-3
312 | CNN_CF_20030303.1900.00
313 | CNN_IP_20030329.1600.02
314 | CNN_IP_20030329.1600.01-3
315 | CNN_LE_20030504.1200.02-1
316 | CNN_CF_20030304.1900.04
317 | CNN_IP_20030409.1600.02
318 | CNN_CF_20030304.1900.06-2
319 | CNN_IP_20030329.1600.00-4
320 | CNN_CF_20030305.1900.00-2
321 | CNN_IP_20030410.1600.03-1
322 | CNN_CF_20030303.1900.06-1
323 | CNN_IP_20030403.1600.00-3
324 | CNN_CF_20030305.1900.06-2
325 | CNN_IP_20030402.1600.00-1
326 | CNN_IP_20030405.1600.01-1
327 | CNN_IP_20030402.1600.02-1
328 | CNN_CF_20030303.1900.06-2
329 | CNN_IP_20030330.1600.05-2
330 | CNN_IP_20030403.1600.00-1
331 | CNN_IP_20030410.1600.03-2
332 | CNN_IP_20030402.1600.00-3
333 | CNN_LE_20030504.1200.01
334 | CNN_CF_20030303.1900.02
335 | CNN_IP_20030405.1600.01-2
336 | CNN_CF_20030305.1900.00-3
337 | CNN_CF_20030305.1900.00-1
338 | CNN_IP_20030407.1600.05
339 | CNN_CF_20030305.1900.02
340 | CNN_CF_20030304.1900.02
341 | CNN_IP_20030403.1600.00-4
342 | CNN_LE_20030504.1200.02-2
343 | CNN_IP_20030403.1600.00-2
344 | CNN_IP_20030409.1600.04
345 | CNN_IP_20030417.1600.06
346 | CNN_IP_20030329.1600.01-1
347 | CNN_IP_20030405.1600.02
348 | CNN_IP_20030328.1600.07
349 | CNN_CF_20030305.1900.06-1
350 | CNN_IP_20030412.1600.03
351 | CNN_IP_20030330.1600.06
352 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ERE-EN/dev.doc.txt:
--------------------------------------------------------------------------------
 1 | 101d0fc4a78dc1b84953ebd399b2fad5
 2 | 0f03cc5a508d630c6c8c8c61396e31a9
 3 | NYT_ENG_20130910.0191
 4 | 14294db341956a71811c9dd015b04ed7
 5 | 0659c87d9fd3d5efd258ee6de3ba1003
 6 | 11a29a0d63a79b0f5d19ccae1838b125
 7 | 3dff15d768dbfe27e4d6b81fb63aee95
 8 | 4aea880c68f1708f68271a7913f2001f
 9 | 2bdb9d86091c6f412ffa767bdc749be9
10 | 1a0f894682abf633cc94b06405b78a8e
11 | 7bac41e8aea34c7ef9462fcc1a572109
12 | 45b9b8f7d17ce5f352c16a339e96705f
13 | 75a85a5de2dd86d7b7662b83aa639d0a
14 | 06fa2a5cdc50c1d2a96bfe02adcc0b40
15 | 22ca1a5aa492b429d274169c54554a7c
16 | edb392c8323a4f5f27cc0e59df409c68
17 | NYT_ENG_20131022.0102
18 | 9e49d5babe9b22ac5ebe1afd3d440ff2
19 | 5bac42475431a87070720e94b27cfd99
20 | 48dafc1e3678fa7b13cb467ab3eed071
21 | 3ddbad6f438c88eec387131477ffe1b9
22 | 44169f6a3f5b04e8dbab2a26e572a136
23 | NYT_ENG_20131029.0228
24 | bec156fe4d6369a40f347477578d28b0
25 | 14fbeb82a73a7df37bcda0583c9bca7e
26 | 61d2b0dcc730f0b4e92ae0d1929b3caf
27 | 428e1e095b4e6e830b47e72f133faf87
28 | APW_ENG_20090611.0697
29 | bb1fba8ce6504faf37892e990d50fb68
30 | c0cae135f2727d4e61315f719cb27434
31 | 90f8a4e01d7a52940959427f10e45f8c
32 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ERE-EN/test.doc.txt:
--------------------------------------------------------------------------------
 1 | NYT_ENG_20130716.0217
 2 | 963549e727a8abe0e772e51580fca702
 3 | 35621bc5e29e511198d6eabe34676975
 4 | NYT_ENG_20130625.0192
 5 | 17a2dc40635ec239e9e16d10b6dd45e8
 6 | NYT_ENG_20130712.0047
 7 | f81535eaaa2c20ef26d54d1d87a02186
 8 | 7677d625b58ce649c8aeda2ff4a56389
 9 | ae6d0c01a0bea085e48016ac29a3c535
10 | 4622b60202cf3944119daf2be53aa74f
11 | NYT_ENG_20130506.0045
12 | 56af144a4d1d2e662531bdfd00d3c725
13 | 0e6c9afe37a18411d275ee225a0f0f9b
14 | 34d49f3357eaf14c849e9cdfeb893273
15 | dd0b65f632f64369c530f9bbb4b024b4
16 | 0648a08469a3be9eb972f0d213562805
17 | aa33a695c3e28d1f3dd03f4e0b373f70
18 | 1f288dcbcb562b39031c6a9402ebf6d0
19 | e8ad0cb1356161f82fb56c9f88b41990
20 | e5e3faef4fb44311a0ec8aab24903c41
21 | c728ed6c29213079b5f66788047ec89e
22 | 6154640fdb94510274583591cad7b379
23 | 5bbe1c6185296d179b95810e48ee3834
24 | a268efbb260f633c3979688e3b07e7d0
25 | bb6cb93cbd13b91ca52bfc582af0eb45
26 | 19569b08f07d751d6ac4a07633653c50
27 | 3b4d58c0a53671c6ce03f0529bb6089d
28 | a72d82525600c5a2e1aa428264bf089c
29 | d81d2b468875c49a9f6453d78a8e1ddc
30 | a08e03759505523de8475e3bf906dd5d
31 | NYT_ENG_20130710.0155
32 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ERE-EN/train.doc.txt:
--------------------------------------------------------------------------------
  1 | 459bc8b09f4dd2e1fec7c77d26193b01
  2 | 43611a2f256d101f910b852379c70959
  3 | 6521f6bd1eb405232a5e852423722bac
  4 | 565fa81d640f451b20955887a43b3a23
  5 | 5fa0f2a7f323a781640b126978ca8a42
  6 | 1d2911e09a6746b942c3e7b3cbdcb0ce
  7 | 08b0dfe15192c063055ed7db8d24c625
  8 | 644706e2d97c9a9a1f9874510180f136
  9 | fd103b2c981e724f64d70a22c392ee93
 10 | e98123aa18eb4ce95d2d4eccace51169
 11 | 5254f96ac3a601e99b6357c4f7627991
 12 | 4743a10c1d5f1ad35c31646049acb9db
 13 | c793b6b583e008f105af586fe433d4ac
 14 | NYT_ENG_20130822.0136
 15 | 2ac3b55a10d5395ded9e8e54c345553b
 16 | 38cd9b530a5be18dbad52400da435934
 17 | 59f8514f6db132207ba9e5828f73d706
 18 | NYT_ENG_20130525.0040
 19 | aa54ac32868c5de9b05b65a8ee7a4329
 20 | NYT_ENG_20130509.0160
 21 | 52e569e00b6428b94205d3dd5c457c54
 22 | 7c5b86ed55f4e5b8667423ef88f49fb5
 23 | 78333509dffd4a7df90b029a5d851dfe
 24 | 1ae45904ad12b1540dc390e162b61235
 25 | 27eb0b9d14d45ede66fe86534e36a2ce
 26 | 2a54459212636289034af844f8634e37
 27 | 7e520221ddb1602a0f2aa10560a50a66
 28 | 24d93564f48ae17904aa82f937db8c21
 29 | 35587c6d8aa67724ba23231dd16f7b44
 30 | af79ea77b8fb92424dbc02d88d8c14e8
 31 | NYT_ENG_20130422.0048
 32 | 361e1c2ca3a1e21c618e0e8fab959e30
 33 | 9777919d54ccbb7810bd1c73df91fa4a
 34 | 18e8a277f2659f79291efa0e12e80cb3
 35 | 5e3fbf49f8301654bb4954c0f1e386a9
 36 | 44b011cd504c9ed71beb851324db886a
 37 | 5fa7fbe87758a02a1e4591f88175ccf3
 38 | 0eb03fc279066b84ed49d44b2405469a
 39 | 2f5ee4e363c30678dc3b55caf43bc63d
 40 | 57026b7bcb8f855de3e26d572db35285
 41 | 3446f8cbcf53eaca5692913ced012b11
 42 | 2c2e8b3286bd34e30a4cb57cb7e26ce5
 43 | 4df3dfff1ee1683ac6e1c2ea24ce2589
 44 | 105249d0d0575a1a5939b16139f6229d
 45 | 01f69c4c2206e7c3fa3706ccd5b8b350
 46 | 648abb9000309b9807cc8b212c11254f
 47 | deb3e0ea36b437c34b52d95aa6a9631f
 48 | 1badbb95e5e70ef90e49cdf5a46b6d9b
 49 | 7734fb9363c2adf91c6ede6c7bb7df90
 50 | 1d6c0e3df079663f6bceca0b44c98a40
 51 | 464e03afec9c80f8c1ce4acfe2d002ae
 52 | 04debcc4da342dc971bdef4210fe468a
 53 | 63dca285201d1fcda72a54f4302b2c3e
 54 | 2cf358ab89c732d6b35b65e619d2bc86
 55 | 07c9c8ca974b6e9333c38720b0b06896
 56 | a68c8d0ef75bbbd2923bf7aa78b72d3e
 57 | NYT_ENG_20130813.0006
 58 | 3ac3c99241c2243a9e233b091eddfe15
 59 | a13d4f9511d799fc25b73e4d5cf28d13
 60 | 9e4a09ec419e110a3a12f184e66aea72
 61 | 255bae1c133d1d77ef727c063e435a78
 62 | fd80f8b1a5694813bbda3253139c6395
 63 | 4bab621aef9d14b5d20ac23cb8142112
 64 | 96bf72399b104346f3e79022e0c08e5a
 65 | 3b34a76a3589417f5db02883b47280a6
 66 | 26175bdbe49b712d7412c273c111e813
 67 | 3eb834d9a5d9c9fcad258087b5c2794a
 68 | d3b5c32563ebb009bc1b1f5bc1b9eb14
 69 | NYT_ENG_20130703.0214
 70 | a48d00241e327e54ca914b950e97c7d4
 71 | 3c9fb643a48360935c1044efca570514
 72 | 130a86739522ab7c56232e798d04cbf9
 73 | af18d29036ab0a9f8cf2742a5a1b4804
 74 | 652f1fbc927a6c358447947d0d77f95f
 75 | 66fba4f92d2f9d8c3bee5dfad3af9828
 76 | 37b56b6dd846ad0dd6e8cd00ba2efaf4
 77 | 4d7e1af80bc46167ef3d81cf642bf94b
 78 | NYT_ENG_20130613.0153
 79 | 2ca0238925d38f345acbf826854ea448
 80 | 2a10c5cc27e7504dc9df92396b9e28b8
 81 | 0c100ebc18cc55f80cdae6343f72db69
 82 | dbed9b6ed7d2eaf75fef0aa5a245a663
 83 | 5d4273298e649a13c4dce27c89f414ac
 84 | cb156ad2a5458fabc9e093b6b5e0f97f
 85 | 0929d82f7059353f9593b9558983efba
 86 | af36543ebce546c7c678fbf9767bfdbb
 87 | 120fe19a9bc68fd85fc4963c166e9345
 88 | 774caed283a1e55ef9490864771029c3
 89 | 561a0178f4b846b9bbcf39f7e63afe4e
 90 | 3f987a93959acff3609a251b5abbecd7
 91 | 0f947223d04c10118b523cfeec5d231e
 92 | 0fe5904ced20c20537fe29c1db11cd28
 93 | 9c500ea2248358171d77d419e67f5760
 94 | 043b35fbf220a2d1bbe7d0612ad87635
 95 | da156c00417e2020948c009d39341607
 96 | NYT_ENG_20130501.0255
 97 | NYT_ENG_20131118.0019
 98 | 26542fb5b83cdb4b98a3fe31e0226b39
 99 | 95af1b55c359f28ff3a9159d55e9528a
100 | 2ac34d012c8d909d4a29aa3f6be1f23d
101 | 1d16a571f14fb1032bc19e9314a46deb
102 | 9f23d711bf5016fec9d05081772b4f24
103 | NYT_ENG_20130914.0094
104 | 5d7b429073c60d53acba21bb6e7e6caa
105 | 11c906f2f798abb05f143b206edf77a5
106 | 334de29f692ef2c5460b78fcad5c6c9e
107 | ffc5cc6892ff203f43b2dc8d83bcd725
108 | abbdf0048737e9e639403f8fe8cd7dd2
109 | 23987125927d321ec6f0c30c8f453cb3
110 | 31ea929baed3887e762b0b7f9196ce7e
111 | ae656f6d658efca126f9721087608e95
112 | ecb7c8154bf58b48ae00b252ff283c29
113 | 67db76e5116c4c809107948d4b0a5ecc
114 | 861cdd1a5c6c41610021b25c3795e293
115 | 63878a2b6d34b576361d2a2778f321a6
116 | ed6c37ed1996fc89f5fe813731c71b9d
117 | NYT_ENG_20131115.0084
118 | 8492134197b5bf8e9179e2fa245ae02f
119 | d6bc66d7c8423368aaa8d789b5bdf5db
120 | 4f7eedf44076ea050d7db3715f9333fa
121 | f0612c786635ed96ee3df84821a17685
122 | 459b795a150e7866d6e4ef75e1b92b4a
123 | 1473ea2ded50c05b29b4f55f1b83ada3
124 | cf88887857b155d8822f82cad3597744
125 | 1980ed7ea6a283f8dd19da5a4e9952d6
126 | 17f98f0c6cda0227e732e6761f396d1f
127 | 477135a713d07aafe00d5e86648ea408
128 | 33bdb079026f1fcbe47c64b8c6968d0e
129 | 290e2643c2f91c108b206c5edb7a1c0f
130 | 29f64df7feb04dfb16f4667ce199c9f0
131 | NYT_ENG_20130716.0036
132 | ca2a6fbf721ca102c149ad6a90d5b00a
133 | 5b7cab1d1cfc0c05686399d8bcbcfe5b
134 | 4fca88a5c29716cbb7c0f9aa9b84007a
135 | 40f1f697a457e39c30ad94b7cc712c96
136 | 8073c89ca4fdbe3b1eba0352bfe15d78
137 | 4deb48e2b0ab194ce37c1bd31c73586a
138 | bf1047c7c17ae3daab59c3bee423e12f
139 | c397ecd66789b905c6b1c5ef21af03ec
140 | NYT_ENG_20130504.0098
141 | NYT_ENG_20131121.0040
142 | 178e7de35eccad0df800f0c7539cf614
143 | f913574a9c0637dbcf66def4a2c1dc84
144 | 3e9bbf75058a3f16585889bb9c64a903
145 | a83302f9002b6707fc7a91a7d7d29e6e
146 | d7369ce92ed0b6327412c705dbbab654
147 | 70b2f9277a1c78bd13cef68ba6485bd9
148 | 5d0b5755e212a88afbbb8b29c34c4f13
149 | c1f185252a2837aa464e36f263d1ebe9
150 | 6291811a3fe70d3ec8fc26b91060e2f5
151 | aa32f4f9534045b9f33a9599d0c1b580
152 | 2bebb50073ceefd0c9ccfdf3e07b3258
153 | 30cced37fcceb1800341d18d4f97b670
154 | 3b9c27eda65c635e109a547930942486
155 | 3322caacf140c92366a639ee004560ce
156 | d5825f99faec1ae48589b98560a98d61
157 | 408dff173c599256711f23238e280c15
158 | 47de592453663260c44944346d669611
159 | 86a94ca907de6688cca64610730fa11b
160 | 18a89cdd00dadc593a88c924111575f1
161 | 4edd239ce7d1f7274154cd05081f8995
162 | 7c0e0e53980aeb2868cbe4e1c1cb79db
163 | 33c71a5cec78e7d766d75c9a73b327b8
164 | 9b3bc3c727dfaa49218b57254087ff5d
165 | NYT_ENG_20131210.0203
166 | 2d2a4ddb1c8f4a669541704f9fb78472
167 | 5dfd5bfee062cd5896b619a2b1309766
168 | 2701285c791f423cd2f8fd827df9c2c9
169 | 04952b874a2a34d602faaa74712d435e
170 | 2ee2377e5d4ae6f5922ea2af11f9d4e1
171 | 79a3cc37998a99808583eba765aedca1
172 | NYT_ENG_20131025.0190
173 | 5bfd613fd31f0c2bdfb5c41f21629144
174 | 61d6f81f680f83a1a3281fde24d9c3ac
175 | 79c976f694784ced2b0c8752eb767901
176 | 0a421343005f3241376fa01e1cb3c6fb
177 | 5753617c893938f625b349cf6bd2b388
178 | NYT_ENG_20130428.0140
179 | 34f729e5ac124e9898b2744a6598d50e
180 | a724033bff06e750d27cd7e3bf8263ac
181 | 1b0f90c029f75d326ea39c0371901ef4
182 | 51d64c51a2363954454ee9e921b590ce
183 | 1656bbad43fee4569b5c5f14110c1342
184 | 5f3a6a4c39c15d7382c2cafe64ae898d
185 | 6667fb9e43ac7edde844453cba97baf0
186 | 52a77871923a7f86bb1a52812bc7f2e1
187 | 44a65adb7f74e6c99d05eb2721fd0baf
188 | NYT_ENG_20131029.0042
189 | 5bb3c2b1094912a6df7e862bb2981481
190 | 47c26ba3563092e41c5a42252931baf1
191 | 41404718f9c1e94cf58aad1fc90c70a7
192 | APW_ENG_20101231.0037
193 | a05c08e340a73270592f62361a19274d
194 | 99ab1cad51361e94c2fe3f997c45705a
195 | NYT_ENG_20131029.0091
196 | 5c7ea2b51202d80ee37eba8a182afad3
197 | NYT_ENG_20131128.0177
198 | 0e0abbf0da91d9e34750441c08d5d262
199 | 15ba31cca04cc5300361f46319247c40
200 | 459f9a2b3eddd436f0232395f129dfd0
201 | NYT_ENG_20130508.0098
202 | 04134f2be20afbb868d7a8292f49e277
203 | 0cde024ee993679967f7ac397000ad52
204 | 593cb5020613a4695859130542f7fc94
205 | NYT_ENG_20131122.0237
206 | 9f6e4c46ae753bf14edff7e2ac767213
207 | cd04993849c889a56ea66c6670f002f4
208 | 4042cd8643253f65df3a4e8de320a1c9
209 | 3f0e2f2fb9b773bc178522a6535a9651
210 | 4798bc0e166fe93893bdf2d922f06258
211 | a9318b72c7a2ff32d459af958c7defe1
212 | 3ae6760a860a33cb90af23596fac475c
213 | 11329f1cdb44019afc8f48b6fdc5376d
214 | 39ff7dcae4034417ba175de97d14b165
215 | 43341a312ffd84a4ad3c3ab0df8bcd7c
216 | 21dbe23f56aaef87fd0980234895b321
217 | 02905b7ce3a6b8b0961c6c2310392ef9
218 | f6ad2150f6c32fcb1488438f6b4275ce
219 | f9af64dc0cf1e7edd4a8feef75018b81
220 | aa003ea934a97bac86cee52b7122f1f8
221 | 766386bc5cb9eb40419a80d082472d50
222 | 4435a7cb258d37b4fafc3ef0e833582e
223 | 736fa00bfb16f3298883be5e962fe01b
224 | NYT_ENG_20130731.0133
225 | d409fd37c208c5a7a5b2c64b4130b0ec
226 | 5cd7d603e1cf8d2c134d039dc90112f0
227 | 1e9dfabe5e068a4142e768c0c5c37b6b
228 | 36b12cef6f7a805e3e74a4f430129028
229 | e2e2039f203f36b821d15e2cb6f588e0
230 | NYT_ENG_20130816.0151
231 | 856bc3bee118c826c394ed09548db9b2
232 | 909239794c799f2d2e79c023ae090c35
233 | 087f58983ef5e94e54024bc9f0f009ae
234 | b49eee97fd373efbb4cb41926e60e385
235 | f801d26c9b4d7577df089a196e242a04
236 | 2a46fcf4ff6ce3896f249848e48b3b4c
237 | NYT_ENG_20130619.0092
238 | 1a79f9d5c3f784a494196a9bbb586f3b
239 | fab32c473df923a6a9242054c8d23bf3
240 | 1a0f101744b34677ce1e1da1b1b91beb
241 | 4572d22caf3e1924f894002b724f958b
242 | 30eadb19db9f0db62cba7be66862920d
243 | 59a5d2e146c13f7519130193fc773610
244 | 3878ab866ca434318076c4e7eac49c0d
245 | 2d7d6761aad911a63a235a571fa7862f
246 | 4d996a22855cc2ec9f54990a23d51c56
247 | 324274e50f2d07757e2d88ff58a0c33b
248 | AFP_ENG_20100414.0615
249 | 563b1e8fcb1de7a4c0e01da9100d6e09
250 | 5dd42026c76290af6689691fbe2b8d1c
251 | edc4216d65afa47fe7bc6004ac172e92
252 | 2aaa319d1e1a0600837d013cb84290ea
253 | b9109877820d90dbc5efcdda02e6d450
254 | 0f316bb245762eedec6682acbecf2822
255 | 3dc7812b2b39ed067cc7c8ab1218e128
256 | 648fc5834f73b4196b4ceb3daad954f9
257 | 0fab386f8b6527439481f526c92341c7
258 | NYT_ENG_20131220.0283
259 | 36d45aff571e3fbe036f309c18d31668
260 | 3a0d64b5cb2bc7319e803e344dc695b5
261 | 39280a4d31d81837e17469e18a854116
262 | 670b5425fcd1700e2c27af5f09244cb1
263 | f3e00fa1d34bca154aea0845c628f0e6
264 | ae9a0d394c5e3d3d812c7ffc07c2f836
265 | f18a7b77b1fd1065db9aeaf3f6143a5e
266 | 0536891daea71ab51ee1123137b67146
267 | 6491f0650d9628b84dee6f539df5a53d
268 | 2ba8bbf004fe30c0a01f6fcd25f01dcc
269 | f0aabfc899d1c17b8e99039bb4f80d64
270 | 4ae1669fc17f6b863ff35fa14a960270
271 | 2bbf45266e4ec0ae72977c89ac8d55c1
272 | 0ba982819aaf9f5b94a7cebd48ac6018
273 | 010aaf594ae6ef20eb28e3ee26038375
274 | 0c49bb860962aa0d5b8e3fc277592da0
275 | e972c0257d72aefc52cfdf7e7f5a1623
276 | 82f0af70bf68f4e78e6ea60a339f830d
277 | acdf07c9477b21e1d29c51dc692e085b
278 | 186ef6837e001cd9b97a132c86705545
279 | 389c70a4859f7528cc6e8b84c10766d7
280 | 91147deeeec220cc445a8d546585cdb7
281 | 370e7ee173951eeff13998a416b8b3d0
282 | 9fc05e3fab69893da830adfa6513510d
283 | 3b9b81a3a446c24009c7642da54dbd28
284 | 1bf9912633f942d6d1d4e87df33cee40
285 | a42f7cf822523c76c225602537aefc7a
286 | 4fbb1eec7dfd5c2fefb94a2d873ddfa5
287 | ea4d6baa1d6174c45fce1e6bbb58e1b4
288 | 3059538a2542c71687871b3444f8d921
289 | NYT_ENG_20131121.0250
290 | 661ece467567ffbb54b551dfc1c2c254
291 | 204f8f6bdb24c5198175bf1ed483247b
292 | 1f60eb9697e240af089b134b69c2042d
293 | 44087d95184e9d94f3948f47e9b602af
294 | cade0d91e2e82e4db58efe64d7462c33
295 | 10953ba63f691cb49f47f852b359a6e3
296 | 15c96bac6c08ef94fe249fde914b53d7
297 | 5c59566e9132c060423cad5b2d1bac1e
298 | 57b2773ab54bbc5c119a46fd9be2c4f0
299 | 368df106b2eaa0b4091e099f360a07d6
300 | cb824da90723fed309217c6e28b1c7cd
301 | NYT_ENG_20130828.0147
302 | NYT_ENG_20131225.0200
303 | 3f115570c2fcc85263ba97e0134fb039
304 | 44fd27d40ae65547c3b584c2ff360cd7
305 | 07b79a8764693a80861e5a3e5fd47fa5
306 | 6f9d5ec51264868ada3c2c22c70fc57c
307 | NYT_ENG_20130709.0087
308 | 6837dcaff76ad3235d46708dd89e7306
309 | 2251a78817e67a2adaf0722fd05c7ac0
310 | 97655df62dd4a176b65cf8a2c2a6e82d
311 | fa371b1fbb4d20143e638a7dac6e4f6b
312 | c8930568f1175e8bb0bff9b932a5c2d4
313 | 43326b9fa7deac9d3f8f9e2a0aa0e5cf
314 | 5685a6069312d52a897fe69973269338
315 | 4829d3d91263ed9d8801e6d94c3569a5
316 | 1a11228e8230c359e0f357cbd8240b01
317 | 342431e61e80263f606c46bb5e399cc7
318 | f703536e3212f51cbf26ce47aa7b5eff
319 | 086e26ec92d1cc02f3900e9ac46d6962
320 | 502c46cc149d30f9ad0c25194636dcb6
321 | 33ed1c9fdee1000e2340ac7f92c77752
322 | 609d5112c0386dc4e5f2e90b93cb7a5f
323 | 0fbcb8f76124b9654076889ce04a045b
324 | NYT_ENG_20130603.0111
325 | 824610c87232d345dcc130521f20f72a
326 | 57fb3f87bbb8c3205163ea256f658891
327 | 09098ae4e956a51b038876197814735e
328 | 073020eb350fc73f123bfac8ec485ecc
329 | 48c498c9762046efbece8d183ed996ca
330 | 4b2d9d5984b731dbdd3db398b5fb5e46
331 | 2d8d3572658fdb8754fdc84d2b15f302
332 | 8f575db98ccc3af0a904b650898368dd
333 | e37cfedb8a3a32769a12262eaef9ee0d
334 | 542d2b2755c23b22e9747d8a3b020bf2
335 | 373a3b4bb2a9e67a12c50ad54a1be657
336 | b6b443777e5ca92aa5152f5593960fd9
337 | 3065902101e4282b89ed4ac8f64d4a84
338 | bdca67a0bacec61b5e691d5ca51ba724
339 | 6f13620752b8bd5acf2e1e94c49faef5
340 | XIN_ENG_20101125.0137
341 | NYT_ENG_20130910.0002
342 | 65814a1b2cccd0fd9be5ee3d5068038d
343 | 84828469f40b28161c559e3d01526039
344 | 584b6272bb8c9cc134621ff5ace8c98d
345 | 590baa25bb1cc16c31fd02395edf6835
346 | 39ebaa0bb958e3529b331f4c71025e62
347 | 17f22c2b1e5642b41a9aeedb03261d1a
348 | 081fede2fca345dce82bf6b2355d4ae5
349 | 3e6c7121211de578d7fd831eae801438
350 | 4175e3da216dcc8710a26359e4ecaaad
351 | d4698e3ad06f896058ade2e8f3a09577
352 | d528b874a0a6bd6011279a3239360aa2
353 | 3f78c311ad97d4bbc6b4914deb4ab1ec
354 | 08ebdc5f0ec8588af38ab1684318d99c
355 | NYT_ENG_20130625.0044
356 | 543e319fb067ef8cba81c74bb13c5711
357 | 2c8bcca93da4097da338a8754e4f03b0
358 | 52355a4167e6ac3a80d19c94ad6259a7
359 | 1557734399e8da2b84a2dd9ddb4eba49
360 | a223ebce2f7481c8feecaba0982b4fa7
361 | 4a3d067b19686b281e0beb437573a28c
362 | 1b268b27094ba9c5feb11192dad940ab
363 | 376c304800b734b2a5a2c87b19eddc2a
364 | 018fb4e59ac5474167ffc5940d7e55e7
365 | NYT_ENG_20131003.0269
366 | 3138f7fb2f8575ed762eb0bc11023d59
367 | AFP_ENG_20100601.0724
368 | 37d781089c669131c5118415cf470422
369 | 3a9a0c07af53fce42e1a55c21826c54d
370 | 0f565d3822dca80336582ffac4adaf78
371 | cfd86b06365dab636d13523c7ed93ad6
372 | c06e8bbdf69f73a69cd3d5dbb4d06a21
373 | d0b9b1747f4a6247294cde9ac0165c60
374 | 416cfc6a5717682cd35d381c5be07734
375 | 22696c601df1a7359e9b629c689700ad
376 | NYT_ENG_20130506.0130
377 | 4764f1400fa336d1fb972719b10b939a
378 | cca700aed62fd497e64e507752409b41
379 | 56c895a1c8dead5698a49321a674f3f4
380 | 17af00d74fca31bceab4ad463bf1c384
381 | 026e0a2c96e90bd8bf9aecde62d7530d
382 | 4eb58398a5c2ef35b16d885c5573b3d4
383 | 5c29f9e575b94c61db8ed52bdfa53843
384 | 3d8f19221d257f81e3376b9e0731d4db
385 | 12bbeaf10a36d36d82824a72352ac178
386 | 362f9d9707c4da0c8068bc7034aae4b4
387 | 026bd1c7eae9f14da9480a4b88ba2fb6
388 | 4683e6affe801713ed4cc9d596b57fac
389 | 2b96d1172d37f60aea5ce64a0b410248
390 | b608865c83b6612bf9ccb4e4c6e66ee7
391 | 584ccaef38f5936e973f0561966bbf06
392 | 0cfdfe102b7a4cb34e1a181c1d36d23d
393 | 83d7cb6d5b663f34dcf83879a8729fb4
394 | 30fa916e5173b52d449300e2ea71b787
395 | 25f868780ac18430a6f10ab4de22ffb8
396 | 4c2488e10c34e5412d3b67e794c9bc84
397 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ERE-ES/dev.doc.txt:
--------------------------------------------------------------------------------
 1 | XIN_SPA_20050322.0162
 2 | c93832992e8ca0020c806137834bdd38
 3 | c70895b8121c60e3a0aeba14f40707ce
 4 | XIN_SPA_20021108.0309
 5 | APW_SPA_20050201.0692
 6 | 4ba25020962498950ff85a88c30648fb
 7 | APW_SPA_20060120.0581
 8 | APW_SPA_20060717.0443
 9 | XIN_SPA_20050403.0126
10 | 295f9c1a5b9bd20e6e20547ffe0db294
11 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ERE-ES/test.doc.txt:
--------------------------------------------------------------------------------
 1 | APW_SPA_20040902.1107
 2 | XIN_SPA_20050403.0106
 3 | 9b0b7d0b89b7fc118363762e1af5ace4
 4 | APW_SPA_19950810.0183
 5 | XIN_SPA_20050205.0035
 6 | 3eeaac8978fc543ffcaa6ac0a1d9a5ed
 7 | 29838866bc6ab760d9a7dda4c9c77503
 8 | XIN_SPA_20050125.0291
 9 | APW_SPA_20060113.0097
10 | APW_SPA_20080609.0519
11 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/splits/ERE-ES/train.doc.txt:
--------------------------------------------------------------------------------
  1 | 4592b44df692ae3b8a4f22372d04fa62
  2 | e3ac81d75cea10852ef8e85b20b69c74
  3 | XIN_SPA_20050403.0111
  4 | APW_SPA_20050327.0348
  5 | fde63a2d572ab7ebf368318020f0b071
  6 | XIN_SPA_20050402.0105
  7 | APW_SPA_19940407.0064
  8 | 988f859f2ab7d4f4c9d46a4a1783cbba
  9 | 989afacd6af08934a6af2cfc81b6c436
 10 | APW_SPA_20050312.0510
 11 | XIN_SPA_20050403.0051
 12 | APW_SPA_20070614.0172
 13 | APW_SPA_20050111.0523
 14 | 159551b03ba96d920df6080d9b86176e
 15 | 4bc0205d8d2ac3edd35fb542701105e2
 16 | APW_SPA_19951008.0022
 17 | APW_SPA_20050503.0767
 18 | da4faa0089a16785f47ca8a67804c3f2
 19 | APW_SPA_20000328.0091
 20 | APW_SPA_19990915.0054
 21 | APW_SPA_20050521.0623
 22 | APW_SPA_20060810.0577
 23 | APW_SPA_20050330.0162
 24 | XIN_SPA_20050119.0238
 25 | APW_SPA_19960612.0124
 26 | c1d312ae8c2b67eaf56dfc7eba7e99ad
 27 | XIN_SPA_20070922.0015
 28 | 1b7885b9a6142dcaecf9e0e9613d6952
 29 | 6a022c5cd583f4b9890141ca749997fb
 30 | XIN_SPA_20050203.0224
 31 | APW_SPA_20051030.0439
 32 | 1117479cfa864e6607dcf7f143a808ba
 33 | 3135e76164dd737557d83178339e54e7
 34 | c8416f15a6cec37e56de13eb82b7b510
 35 | XIN_SPA_20050329.0184
 36 | APW_SPA_20050108.0071
 37 | XIN_SPA_20050403.0139
 38 | APW_SPA_20050121.0930
 39 | APW_SPA_19960702.0050
 40 | APW_SPA_20050302.0582
 41 | XIN_SPA_20050401.0038
 42 | XIN_SPA_20050403.0122
 43 | 8ecee1d57d9d28609e5d56f23b0d76bb
 44 | XIN_SPA_20050127.0132
 45 | APW_SPA_20050329.0740
 46 | XIN_SPA_20050403.0054
 47 | 143877c9d95829210efcfe7f995c769b
 48 | d7d81874a99b95dea45ba319ae52af8c
 49 | XIN_SPA_20090715.0070
 50 | XIN_SPA_20050128.0086
 51 | XIN_SPA_20050212.0123
 52 | APW_SPA_20080826.1135
 53 | APW_SPA_19950811.0059
 54 | APW_SPA_19951109.0028
 55 | a19444cffdf468ac2155654a9d1c8693
 56 | APW_SPA_20050124.0920
 57 | APW_SPA_20041202.0453
 58 | 1226499305419e318a5dc104ab8066c5
 59 | a3ad9a4413967891746040cdac56aaf1
 60 | APW_SPA_20060425.1120
 61 | APW_SPA_20070414.0157
 62 | 735dbf2a938d7bf8d6ff3491bdbe0715
 63 | 543313af510918445612a0f0b6f79871
 64 | XIN_SPA_20050403.0117
 65 | 8a9c8bbd3ad2ad5fc85d35fb84a2124d
 66 | APW_SPA_20041022.0758
 67 | APW_SPA_20050427.1010
 68 | XIN_SPA_20050111.0122
 69 | XIN_SPA_20050325.0210
 70 | 8a6fa6598778ac9922c5a65733b28ead
 71 | APW_SPA_19960129.0030
 72 | APW_SPA_20010208.0070
 73 | XIN_SPA_20090709.0118
 74 | APW_SPA_20090403.1130
 75 | XIN_SPA_20091117.0065
 76 | e7b2d90daec857685a51c9f1a4ad98de
 77 | XIN_SPA_20050403.0093
 78 | 7984eb82f19ef829045f9876f74f30dc
 79 | APW_SPA_20040325.0157
 80 | APW_SPA_20050401.0996
 81 | APW_SPA_20070828.0549
 82 | APW_SPA_20050405.0784
 83 | 289a43fb49b37dbe63398decde9625be
 84 | d5ae0a896a5b5e40366fc25d48e29fe9
 85 | XIN_SPA_20050524.0180
 86 | XIN_SPA_20050125.0232
 87 | XIN_SPA_20020422.0221
 88 | APW_SPA_19991004.0100
 89 | XIN_SPA_20050403.0110
 90 | APW_SPA_20050119.1163
 91 | XIN_SPA_20050120.0219
 92 | APW_SPA_20001214.0085
 93 | XIN_SPA_20050411.0073
 94 | 3f21bcd1dca99ac949bae07cf858f2da
 95 | 66bd051ce8a098996903afe59cff69d7
 96 | XIN_SPA_20050125.0276
 97 | edc4094bbffed34aa86bcc3c3a2ac739
 98 | APW_SPA_20090406.0868
 99 | 40be1d303aef5e921f8c35d93f753abc
100 | 3cde74ca728f84882b404c78fb9d50bb
101 | APW_SPA_19970720.0046
102 | 5eac974a46f903198596aa69a1ad317d
103 | APW_SPA_20050303.1080
104 | b7a1a9f7a6573dc4e38e5eea61ff0348
105 | APW_SPA_19940405.0063
106 | fc34abcf77bbbdef2847c60183ca49d4
107 | APW_SPA_20050206.0403
108 | APW_SPA_20020527.0102
109 | XIN_SPA_20090726.0126
110 | 2304550f3162898f67ac68c08b390780
111 | APW_SPA_20070125.0656
112 | APW_SPA_20050503.0035
113 | 1f682475f9a8809adfae6f142c34e59d
114 | c60e6b87d095f1dfe39bd2ddd24f9f9a
115 | XIN_SPA_20050403.0040
116 | e7b8b8eea44d88d2ac8737f50479e55a
117 | APW_SPA_20011108.0042
118 | APW_SPA_20050324.1013
119 | APW_SPA_20070605.0825
120 | XIN_SPA_20050102.0066
121 | APW_SPA_20090407.1189
122 | b0a0c2687af1cb1b966f73d232d8367a
123 | APW_SPA_20050211.0074
124 | 718af85e0c59d04709f4349c77753378
125 | 9476d05ff8d63dc664ca6035f87a0ced
126 | XIN_SPA_20030223.0128
127 | APW_SPA_19980914.0097
128 | XIN_SPA_20070713.0191
129 | XIN_SPA_20100610.0160
130 | APW_SPA_19970713.0038
131 | XIN_SPA_20050402.0084
132 | APW_SPA_19950407.0125
133 | 3016fa4a06962bb3e642a3784c6d74b5
134 | XIN_SPA_20050131.0244
135 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/valid_patterns/event_role.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "Movement:Transport": [
  3 |     "Vehicle",
  4 |     "Artifact",
  5 |     "Agent",
  6 |     "Origin",
  7 |     "Destination"
  8 |   ],
  9 |   "Personnel:Elect": [
 10 |     "Place",
 11 |     "Person",
 12 |     "Entity"
 13 |   ],
 14 |   "Personnel:Start-Position": [
 15 |     "Place",
 16 |     "Person",
 17 |     "Entity"
 18 |   ],
 19 |   "Personnel:Nominate": [
 20 |     "Agent",
 21 |     "Person"
 22 |   ],
 23 |   "Personnel:End-Position": [
 24 |     "Place",
 25 |     "Person",
 26 |     "Entity"
 27 |   ],
 28 |   "Conflict:Attack": [
 29 |     "Target",
 30 |     "Place",
 31 |     "Victim",
 32 |     "Instrument",
 33 |     "Attacker"
 34 |   ],
 35 |   "Contact:Meet": [
 36 |     "Place",
 37 |     "Entity"
 38 |   ],
 39 |   "Life:Marry": [
 40 |     "Place",
 41 |     "Person"
 42 |   ],
 43 |   "Transaction:Transfer-Money": [
 44 |     "Giver",
 45 |     "Place",
 46 |     "Recipient",
 47 |     "Beneficiary"
 48 |   ],
 49 |   "Conflict:Demonstrate": [
 50 |     "Place",
 51 |     "Entity"
 52 |   ],
 53 |   "Business:End-Org": [
 54 |     "Place",
 55 |     "Org"
 56 |   ],
 57 |   "Justice:Sue": [
 58 |     "Defendant",
 59 |     "Plaintiff",
 60 |     "Adjudicator",
 61 |     "Place"
 62 |   ],
 63 |   "Life:Injure": [
 64 |     "Agent",
 65 |     "Place",
 66 |     "Victim",
 67 |     "Instrument"
 68 |   ],
 69 |   "Life:Die": [
 70 |     "Person",
 71 |     "Agent",
 72 |     "Place",
 73 |     "Victim",
 74 |     "Instrument"
 75 |   ],
 76 |   "Justice:Arrest-Jail": [
 77 |     "Agent",
 78 |     "Place",
 79 |     "Person"
 80 |   ],
 81 |   "Contact:Phone-Write": [
 82 |     "Place",
 83 |     "Entity"
 84 |   ],
 85 |   "Transaction:Transfer-Ownership": [
 86 |     "Artifact",
 87 |     "Beneficiary",
 88 |     "Buyer",
 89 |     "Place",
 90 |     "Seller"
 91 |   ],
 92 |   "Business:Start-Org": [
 93 |     "Agent",
 94 |     "Place",
 95 |     "Org"
 96 |   ],
 97 |   "Justice:Execute": [
 98 |     "Agent",
 99 |     "Place",
100 |     "Person"
101 |   ],
102 |   "Justice:Trial-Hearing": [
103 |     "Prosecutor",
104 |     "Defendant",
105 |     "Place",
106 |     "Adjudicator"
107 |   ],
108 |   "Life:Be-Born": [
109 |     "Place",
110 |     "Person"
111 |   ],
112 |   "Justice:Charge-Indict": [
113 |     "Prosecutor",
114 |     "Adjudicator",
115 |     "Place",
116 |     "Defendant"
117 |   ],
118 |   "Justice:Convict": [
119 |     "Defendant",
120 |     "Place",
121 |     "Adjudicator"
122 |   ],
123 |   "Justice:Sentence": [
124 |     "Adjudicator",
125 |     "Place",
126 |     "Defendant"
127 |   ],
128 |   "Business:Declare-Bankruptcy": [
129 |     "Place",
130 |     "Org"
131 |   ],
132 |   "Justice:Release-Parole": [
133 |     "Place",
134 |     "Person",
135 |     "Entity"
136 |   ],
137 |   "Justice:Fine": [
138 |     "Adjudicator",
139 |     "Place",
140 |     "Entity"
141 |   ],
142 |   "Justice:Pardon": [
143 |     "Adjudicator",
144 |     "Place",
145 |     "Defendant"
146 |   ],
147 |   "Justice:Appeal": [
148 |     "Adjudicator",
149 |     "Plaintiff",
150 |     "Place"
151 |   ],
152 |   "Justice:Extradite": [
153 |     "Agent",
154 |     "Origin",
155 |     "Destination"
156 |   ],
157 |   "Life:Divorce": [
158 |     "Place",
159 |     "Person"
160 |   ],
161 |   "Business:Merge-Org": [
162 |     "Org"
163 |   ],
164 |   "Justice:Acquit": [
165 |     "Defendant",
166 |     "Adjudicator"
167 |   ]
168 | }
169 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/valid_patterns/relation_entity.json:
--------------------------------------------------------------------------------
1 | {"ORG-AFF": ["ORG", "PER", "GPE", "FAC"], "GEN-AFF": ["LOC", "PER", "FAC", "ORG", "GPE"], "PHYS": ["LOC", "PER", "FAC", "VEH", "ORG", "GPE"], "PART-WHOLE": ["LOC", "WEA", "PER", "FAC", "VEH", "ORG", "GPE"], "ART": ["WEA", "PER", "VEH", "FAC", "ORG", "GPE"], "PER-SOC": ["ORG", "PER"]}


--------------------------------------------------------------------------------
/evaluations/supervised-ie/resource/valid_patterns/role_entity.json:
--------------------------------------------------------------------------------
1 | {"Attacker": ["ORG", "PER", "GPE"], "Place": ["LOC", "GPE", "FAC"], "Target": ["LOC", "WEA", "PER", "FAC", "VEH", "ORG"], "Victim": ["PER"], "Agent": ["ORG", "PER", "GPE"], "Entity": ["ORG", "PER", "GPE"], "Instrument": ["WEA", "VEH"], "Artifact": ["WEA", "PER", "VEH", "FAC", "ORG"], "Origin": ["LOC", "GPE", "FAC"], "Vehicle": ["VEH"], "Destination": ["LOC", "GPE", "FAC"], "Buyer": ["ORG", "PER", "GPE"], "Person": ["PER"], "Org": ["ORG", "PER"], "Adjudicator": ["ORG", "PER", "GPE"], "Plaintiff": ["ORG", "PER", "GPE"], "Defendant": ["ORG", "PER", "GPE"], "Prosecutor": ["ORG", "PER", "GPE"], "Giver": ["ORG", "PER", "GPE"], "Seller": ["ORG", "PER", "GPE"], "Recipient": ["ORG", "PER", "GPE"], "Beneficiary": ["ORG", "PER", "GPE"]}


--------------------------------------------------------------------------------
/evaluations/supervised-ie/scorer.py:
--------------------------------------------------------------------------------
  1 | """Our scorer is adapted from: https://github.com/dwadden/dygiepp"""
  2 | 
  3 | def safe_div(num, denom):
  4 |     if denom > 0:
  5 |         return num / denom
  6 |     else:
  7 |         return 0
  8 | 
  9 | def compute_f1(predicted, gold, matched):
 10 |     precision = safe_div(matched, predicted)
 11 |     recall = safe_div(matched, gold)
 12 |     f1 = safe_div(2 * precision * recall, precision + recall)
 13 |     return precision, recall, f1
 14 | 
 15 | 
 16 | def convert_arguments(triggers, entities, roles):
 17 |     args = set()
 18 |     for trigger_idx, entity_idx, role in roles:
 19 |         arg_start, arg_end, _ = entities[entity_idx]
 20 |         trigger_label = triggers[trigger_idx][-1]
 21 |         args.add((arg_start, arg_end, trigger_label, role))
 22 |     return args
 23 | 
 24 | 
 25 | def score_graphs(gold_graphs, pred_graphs,
 26 |                  relation_directional=False):
 27 |     gold_arg_num = pred_arg_num = arg_idn_num = arg_class_num = 0
 28 |     gold_trigger_num = pred_trigger_num = trigger_idn_num = trigger_class_num = 0
 29 |     gold_ent_num = pred_ent_num = ent_match_num = 0
 30 |     gold_rel_num = pred_rel_num = rel_match_num = 0
 31 |     gold_men_num = pred_men_num = men_match_num = 0
 32 | 
 33 |     for gold_graph, pred_graph in zip(gold_graphs, pred_graphs):
 34 |         # Entity
 35 |         gold_entities = gold_graph.entities
 36 |         pred_entities = pred_graph.entities
 37 |         gold_ent_num += len(gold_entities)
 38 |         pred_ent_num += len(pred_entities)
 39 |         ent_match_num += len([entity for entity in pred_entities
 40 |                               if entity in gold_entities])
 41 | 
 42 |         # Mention
 43 |         gold_mentions = gold_graph.mentions
 44 |         pred_mentions = pred_graph.mentions
 45 |         gold_men_num += len(gold_mentions)
 46 |         pred_men_num += len(pred_mentions)
 47 |         men_match_num += len([mention for mention in pred_mentions
 48 |                               if mention in gold_mentions])
 49 | 
 50 |         # Relation
 51 |         gold_relations = gold_graph.relations
 52 |         pred_relations = pred_graph.relations
 53 |         gold_rel_num += len(gold_relations)
 54 |         pred_rel_num += len(pred_relations)
 55 |         for arg1, arg2, rel_type in pred_relations:
 56 |             arg1_start, arg1_end, _ = pred_entities[arg1]
 57 |             arg2_start, arg2_end, _ = pred_entities[arg2]
 58 |             for arg1_gold, arg2_gold, rel_type_gold in gold_relations:
 59 |                 arg1_start_gold, arg1_end_gold, _ = gold_entities[arg1_gold]
 60 |                 arg2_start_gold, arg2_end_gold, _ = gold_entities[arg2_gold]
 61 |                 if relation_directional:
 62 |                     if (arg1_start == arg1_start_gold and
 63 |                         arg1_end == arg1_end_gold and
 64 |                         arg2_start == arg2_start_gold and
 65 |                         arg2_end == arg2_end_gold
 66 |                     ) and rel_type == rel_type_gold:
 67 |                         rel_match_num += 1
 68 |                         break
 69 |                 else:
 70 |                     if ((arg1_start == arg1_start_gold and
 71 |                             arg1_end == arg1_end_gold and
 72 |                             arg2_start == arg2_start_gold and
 73 |                             arg2_end == arg2_end_gold) or (
 74 |                         arg1_start == arg2_start_gold and
 75 |                         arg1_end == arg2_end_gold and
 76 |                         arg2_start == arg1_start_gold and
 77 |                         arg2_end == arg1_end_gold
 78 |                     )) and rel_type == rel_type_gold:
 79 |                         rel_match_num += 1
 80 |                         break
 81 | 
 82 |         # Trigger
 83 |         gold_triggers = gold_graph.triggers
 84 |         pred_triggers = pred_graph.triggers
 85 |         gold_trigger_num += len(gold_triggers)
 86 |         pred_trigger_num += len(pred_triggers)
 87 |         for trg_start, trg_end, event_type in pred_triggers:
 88 |             matched = [item for item in gold_triggers
 89 |                        if item[0] == trg_start and item[1] == trg_end]
 90 |             if matched:
 91 |                 trigger_idn_num += 1
 92 |                 if matched[0][-1] == event_type:
 93 |                     trigger_class_num += 1
 94 | 
 95 |         # Argument
 96 |         gold_args = convert_arguments(gold_triggers, gold_entities,
 97 |                                       gold_graph.roles)
 98 |         pred_args = convert_arguments(pred_triggers, pred_entities,
 99 |                                       pred_graph.roles)
100 |         gold_arg_num += len(gold_args)
101 |         pred_arg_num += len(pred_args)
102 |         for pred_arg in pred_args:
103 |             arg_start, arg_end, event_type, role = pred_arg
104 |             gold_idn = {item for item in gold_args
105 |                         if item[0] == arg_start and item[1] == arg_end
106 |                         and item[2] == event_type}
107 |             if gold_idn:
108 |                 arg_idn_num += 1
109 |                 gold_class = {item for item in gold_idn if item[-1] == role}
110 |                 if gold_class:
111 |                     arg_class_num += 1
112 | 
113 |     entity_prec, entity_rec, entity_f = compute_f1(
114 |         pred_ent_num, gold_ent_num, ent_match_num)
115 |     mention_prec, mention_rec, mention_f = compute_f1(
116 |         pred_men_num, gold_men_num, men_match_num)
117 |     trigger_id_prec, trigger_id_rec, trigger_id_f = compute_f1(
118 |         pred_trigger_num, gold_trigger_num, trigger_idn_num)
119 |     trigger_prec, trigger_rec, trigger_f = compute_f1(
120 |         pred_trigger_num, gold_trigger_num, trigger_class_num)
121 |     relation_prec, relation_rec, relation_f = compute_f1(
122 |         pred_rel_num, gold_rel_num, rel_match_num)
123 |     role_id_prec, role_id_rec, role_id_f = compute_f1(
124 |         pred_arg_num, gold_arg_num, arg_idn_num)
125 |     role_prec, role_rec, role_f = compute_f1(
126 |         pred_arg_num, gold_arg_num, arg_class_num)
127 | 
128 |     print('Entity: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format(
129 |         entity_prec * 100.0, entity_rec * 100.0, entity_f * 100.0))
130 |     print('Mention: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format(
131 |         mention_prec * 100.0, mention_rec * 100.0, mention_f * 100.0))
132 |     print('Trigger identification: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format(
133 |         trigger_id_prec * 100.0, trigger_id_rec * 100.0, trigger_id_f * 100.0))
134 |     print('Trigger: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format(
135 |         trigger_prec * 100.0, trigger_rec * 100.0, trigger_f * 100.0))
136 |     print('Relation: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format(
137 |         relation_prec * 100.0, relation_rec * 100.0, relation_f * 100.0))
138 |     print('Role identification: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format(
139 |         role_id_prec * 100.0, role_id_rec * 100.0, role_id_f * 100.0))
140 |     print('Role: P: {:.2f}, R: {:.2f}, F: {:.2f}'.format(
141 |         role_prec * 100.0, role_rec * 100.0, role_f * 100.0))
142 | 
143 |     scores = {
144 |         'entity': {'prec': entity_prec, 'rec': entity_rec, 'f': entity_f},
145 |         'mention': {'prec': mention_prec, 'rec': mention_rec, 'f': mention_f},
146 |         'trigger': {'prec': trigger_prec, 'rec': trigger_rec, 'f': trigger_f},
147 |         'trigger_id': {'prec': trigger_id_prec, 'rec': trigger_id_rec,
148 |                        'f': trigger_id_f},
149 |         'role': {'prec': role_prec, 'rec': role_rec, 'f': role_f},
150 |         'role_id': {'prec': role_id_prec, 'rec': role_id_rec, 'f': role_id_f},
151 |         'relation': {'prec': relation_prec, 'rec': relation_rec,
152 |                      'f': relation_f}
153 |     }
154 |     return scores
155 | 
156 | def score_coref(gold_graphs, pred_graphs):
157 |     pass


--------------------------------------------------------------------------------
/evaluations/supervised-ie/train.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import time
  4 | from argparse import ArgumentParser
  5 | 
  6 | import tqdm
  7 | import torch
  8 | from torch.utils.data import DataLoader
  9 | from transformers import (BertTokenizer, BertConfig, AdamW,
 10 |                           get_linear_schedule_with_warmup)
 11 | from model import OneIE
 12 | from graph import Graph
 13 | from config import Config
 14 | from data import IEDataset
 15 | from scorer import score_graphs
 16 | from util import generate_vocabs, load_valid_patterns, save_result, best_score_by_task
 17 | 
 18 | 
 19 | # configuration
 20 | parser = ArgumentParser()
 21 | parser.add_argument('-c', '--config', default='config/example.json')
 22 | parser.add_argument('-n', '--name', default='default')
 23 | args = parser.parse_args()
 24 | config = Config.from_json_file(args.config)
 25 | # print(config.to_dict())
 26 | 
 27 | # set GPU device
 28 | use_gpu = config.use_gpu
 29 | if use_gpu and config.gpu_device >= 0:
 30 |     torch.cuda.set_device(config.gpu_device)
 31 | 
 32 | # output
 33 | output_dir = os.path.join(config.log_path, args.name)
 34 | if not os.path.exists(output_dir):
 35 |     os.mkdir(output_dir)
 36 | log_file = os.path.join(output_dir, 'log.txt')
 37 | with open(log_file, 'w', encoding='utf-8') as w:
 38 |     w.write(json.dumps(config.to_dict()) + '\n')
 39 |     print('Log file: {}'.format(log_file))
 40 | best_role_model = os.path.join(output_dir, 'best.role.mdl')
 41 | best_entity_model = os.path.join(output_dir, 'best.entity.mdl')
 42 | best_trigger_model = os.path.join(output_dir, 'best.trigger.mdl')
 43 | best_relation_model = os.path.join(output_dir, 'best.relation.mdl')
 44 | 
 45 | dev_result_file = os.path.join(output_dir, 'result.dev.json')
 46 | test_result_file = os.path.join(output_dir, 'result.test.json')
 47 | 
 48 | # datasets
 49 | model_name = config.bert_model_name
 50 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 51 | 
 52 | train_set = IEDataset(config.train_file, gpu=use_gpu,
 53 |                       relation_mask_self=config.relation_mask_self,
 54 |                       relation_directional=config.relation_directional,
 55 |                       symmetric_relations=config.symmetric_relations,
 56 |                       ignore_title=config.ignore_title)
 57 | dev_set = IEDataset(config.dev_file, gpu=use_gpu,
 58 |                     relation_mask_self=config.relation_mask_self,
 59 |                     relation_directional=config.relation_directional,
 60 |                     symmetric_relations=config.symmetric_relations)
 61 | test_set = IEDataset(config.test_file, gpu=use_gpu,
 62 |                      relation_mask_self=config.relation_mask_self,
 63 |                      relation_directional=config.relation_directional,
 64 |                      symmetric_relations=config.symmetric_relations)
 65 | vocabs = generate_vocabs([train_set, dev_set, test_set])
 66 | 
 67 | train_set.numberize(tokenizer, vocabs)
 68 | dev_set.numberize(tokenizer, vocabs)
 69 | test_set.numberize(tokenizer, vocabs)
 70 | valid_patterns = load_valid_patterns(config.valid_pattern_path, vocabs)
 71 | 
 72 | batch_num = len(train_set) // config.batch_size
 73 | dev_batch_num = len(dev_set) // config.eval_batch_size + \
 74 |     (len(dev_set) % config.eval_batch_size != 0)
 75 | test_batch_num = len(test_set) // config.eval_batch_size + \
 76 |     (len(test_set) % config.eval_batch_size != 0)
 77 | 
 78 | # initialize the model
 79 | model = OneIE(config, vocabs, valid_patterns)
 80 | model.load_bert(model_name, cache_dir=config.bert_cache_dir)
 81 | if use_gpu:
 82 |     model.cuda(device=config.gpu_device)
 83 | 
 84 | # optimizer
 85 | param_groups = [
 86 |     {
 87 |         'params': [p for n, p in model.named_parameters() if n.startswith('bert')],
 88 |         'lr': config.bert_learning_rate, 'weight_decay': config.bert_weight_decay
 89 |     },
 90 |     {
 91 |         'params': [p for n, p in model.named_parameters() if not n.startswith('bert')
 92 |                    and 'crf' not in n and 'global_feature' not in n],
 93 |         'lr': config.learning_rate, 'weight_decay': config.weight_decay
 94 |     },
 95 |     {
 96 |         'params': [p for n, p in model.named_parameters() if not n.startswith('bert')
 97 |                    and ('crf' in n or 'global_feature' in n)],
 98 |         'lr': config.learning_rate, 'weight_decay': 0
 99 |     }
100 | ]
101 | optimizer = AdamW(params=param_groups)
102 | schedule = get_linear_schedule_with_warmup(optimizer,
103 |                                            num_warmup_steps=batch_num * config.warmup_epoch,
104 |                                            num_training_steps=batch_num * config.max_epoch)
105 | 
106 | # model state
107 | state = dict(model=model.state_dict(),
108 |              config=config.to_dict(),
109 |              vocabs=vocabs,
110 |              valid=valid_patterns)
111 | 
112 | global_step = 0
113 | global_feature_max_step = int(config.global_warmup * batch_num) + 1
114 | print('global feature max step:', global_feature_max_step)
115 | 
116 | tasks = ['entity', 'trigger', 'relation', 'role']
117 | best_dev = {k: 0 for k in tasks}
118 | for epoch in range(config.max_epoch):
119 |     print('Epoch: {}'.format(epoch))
120 | 
121 |     # training set
122 |     progress = tqdm.tqdm(total=batch_num, ncols=75,
123 |                          desc='Train {}'.format(epoch))
124 |     optimizer.zero_grad()
125 |     for batch_idx, batch in enumerate(DataLoader(
126 |             train_set, batch_size=config.batch_size // config.accumulate_step,
127 |             shuffle=True, drop_last=True, collate_fn=train_set.collate_fn)):
128 | 
129 |         loss = model(batch)
130 |         loss = loss * (1 / config.accumulate_step)
131 |         loss.backward()
132 | 
133 |         if (batch_idx + 1) % config.accumulate_step == 0:
134 |             progress.update(1)
135 |             global_step += 1
136 |             torch.nn.utils.clip_grad_norm_(
137 |                 model.parameters(), config.grad_clipping)
138 |             optimizer.step()
139 |             schedule.step()
140 |             optimizer.zero_grad()
141 |     progress.close()
142 | 
143 |     # dev set
144 |     progress = tqdm.tqdm(total=dev_batch_num, ncols=75,
145 |                          desc='Dev {}'.format(epoch))
146 |     best_dev_role_model = False
147 |     dev_gold_graphs, dev_pred_graphs, dev_sent_ids, dev_tokens = [], [], [], []
148 |     for batch in DataLoader(dev_set, batch_size=config.eval_batch_size,
149 |                             shuffle=False, collate_fn=dev_set.collate_fn):
150 |         progress.update(1)
151 |         graphs = model.predict(batch)
152 |         if config.ignore_first_header:
153 |             for inst_idx, sent_id in enumerate(batch.sent_ids):
154 |                 if int(sent_id.split('-')[-1]) < 4:
155 |                     graphs[inst_idx] = Graph.empty_graph(vocabs)
156 |         for graph in graphs:
157 |             graph.clean(relation_directional=config.relation_directional,
158 |                         symmetric_relations=config.symmetric_relations)
159 |         dev_gold_graphs.extend(batch.graphs)
160 |         dev_pred_graphs.extend(graphs)
161 |         dev_sent_ids.extend(batch.sent_ids)
162 |         dev_tokens.extend(batch.tokens)
163 |     progress.close()
164 |     dev_scores = score_graphs(dev_gold_graphs, dev_pred_graphs,
165 |                               relation_directional=config.relation_directional)
166 |     for task in tasks:
167 |         if dev_scores[task]['f'] > best_dev[task]:
168 |             best_dev[task] = dev_scores[task]['f']
169 |             if task == 'role':
170 |                 print('Saving best role model')
171 |                 torch.save(state, best_role_model)
172 |                 best_dev_role_model = True
173 |                 save_result(dev_result_file,
174 |                             dev_gold_graphs, dev_pred_graphs, dev_sent_ids,
175 |                             dev_tokens)
176 |             if task == 'entity':
177 |                 print('Saving best entity model')
178 |                 torch.save(state, best_entity_model)
179 |             
180 |             if task == 'trigger':
181 |                 print('Saving best trigger model')
182 |                 torch.save(state, best_trigger_model)
183 |             
184 |             if task == 'relation':
185 |                 print('Saving best relation model')
186 |                 torch.save(state, best_relation_model)
187 | 
188 |     # test set
189 |     progress = tqdm.tqdm(total=test_batch_num, ncols=75,
190 |                          desc='Test {}'.format(epoch))
191 |     test_gold_graphs, test_pred_graphs, test_sent_ids, test_tokens = [], [], [], []
192 |     for batch in DataLoader(test_set, batch_size=config.eval_batch_size, shuffle=False,
193 |                             collate_fn=test_set.collate_fn):
194 |         progress.update(1)
195 |         graphs = model.predict(batch)
196 |         if config.ignore_first_header:
197 |             for inst_idx, sent_id in enumerate(batch.sent_ids):
198 |                 if int(sent_id.split('-')[-1]) < 4:
199 |                     graphs[inst_idx] = Graph.empty_graph(vocabs)
200 |         for graph in graphs:
201 |             graph.clean(relation_directional=config.relation_directional,
202 |                         symmetric_relations=config.symmetric_relations)
203 |         test_gold_graphs.extend(batch.graphs)
204 |         test_pred_graphs.extend(graphs)
205 |         test_sent_ids.extend(batch.sent_ids)
206 |         test_tokens.extend(batch.tokens)
207 |     progress.close()
208 |     test_scores = score_graphs(test_gold_graphs, test_pred_graphs,
209 |                                relation_directional=config.relation_directional)
210 | 
211 |     if best_dev_role_model:
212 |         save_result(test_result_file, test_gold_graphs, test_pred_graphs,
213 |                     test_sent_ids, test_tokens)
214 | 
215 |     result = json.dumps(
216 |         {'epoch': epoch, 'dev': dev_scores, 'test': test_scores})
217 |     with open(log_file, 'a', encoding='utf-8') as w:
218 |         w.write(result + '\n')
219 |     print('Log file', log_file)
220 | 
221 | 
222 | best_score_by_task(log_file, os.path.join(output_dir, 'entity.txt'), 'entity')
223 | best_score_by_task(log_file, os.path.join(output_dir, 'trigger.txt'), 'trigger')
224 | best_score_by_task(log_file, os.path.join(output_dir, 'role.txt'), 'role')
225 | best_score_by_task(log_file, os.path.join(output_dir, 'relation.txt'), 'relation')
226 | 


--------------------------------------------------------------------------------
/evaluations/supervised-ie/util.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import glob
  4 | import lxml.etree as et
  5 | from nltk import word_tokenize, sent_tokenize
  6 | from copy import deepcopy
  7 | 
  8 | 
  9 | def generate_vocabs(datasets, coref=False,
 10 |                     relation_directional=False,
 11 |                     symmetric_relations=None):
 12 |     """Generate vocabularies from a list of data sets
 13 |     :param datasets (list): A list of data sets
 14 |     :return (dict): A dictionary of vocabs
 15 |     """
 16 |     entity_type_set = set()
 17 |     event_type_set = set()
 18 |     relation_type_set = set()
 19 |     role_type_set = set()
 20 |     for dataset in datasets:
 21 |         entity_type_set.update(dataset.entity_type_set)
 22 |         event_type_set.update(dataset.event_type_set)
 23 |         relation_type_set.update(dataset.relation_type_set)
 24 |         role_type_set.update(dataset.role_type_set)
 25 | 
 26 |     # add inverse relation types for non-symmetric relations
 27 |     if relation_directional:
 28 |         if symmetric_relations is None:
 29 |             symmetric_relations = []
 30 |         relation_type_set_ = set()
 31 |         for relation_type in relation_type_set:
 32 |             relation_type_set_.add(relation_type)
 33 |             if relation_directional and relation_type not in symmetric_relations:
 34 |                 relation_type_set_.add(relation_type + '_inv')
 35 | 
 36 |     # entity and trigger labels
 37 |     prefix = ['B', 'I']
 38 |     entity_label_stoi = {'O': 0}
 39 |     trigger_label_stoi = {'O': 0}
 40 |     for t in entity_type_set:
 41 |         for p in prefix:
 42 |             entity_label_stoi['{}-{}'.format(p, t)] = len(entity_label_stoi)
 43 |     for t in event_type_set:
 44 |         for p in prefix:
 45 |             trigger_label_stoi['{}-{}'.format(p, t)] = len(trigger_label_stoi)
 46 | 
 47 |     entity_type_stoi = {k: i for i, k in enumerate(entity_type_set, 1)}
 48 |     entity_type_stoi['O'] = 0
 49 | 
 50 |     event_type_stoi = {k: i for i, k in enumerate(event_type_set, 1)}
 51 |     event_type_stoi['O'] = 0
 52 | 
 53 |     relation_type_stoi = {k: i for i, k in enumerate(relation_type_set, 1)}
 54 |     relation_type_stoi['O'] = 0
 55 |     if coref:
 56 |         relation_type_stoi['COREF'] = len(relation_type_stoi)
 57 | 
 58 |     role_type_stoi = {k: i for i, k in enumerate(role_type_set, 1)}
 59 |     role_type_stoi['O'] = 0
 60 | 
 61 |     mention_type_stoi = {'NAM': 0, 'NOM': 1, 'PRO': 2, 'UNK': 3}
 62 | 
 63 |     return {
 64 |         'entity_type': entity_type_stoi,
 65 |         'event_type': event_type_stoi,
 66 |         'relation_type': relation_type_stoi,
 67 |         'role_type': role_type_stoi,
 68 |         'mention_type': mention_type_stoi,
 69 |         'entity_label': entity_label_stoi,
 70 |         'trigger_label': trigger_label_stoi,
 71 |     }
 72 | 
 73 | 
 74 | def load_valid_patterns(path, vocabs):
 75 |     event_type_vocab = vocabs['event_type']
 76 |     entity_type_vocab = vocabs['entity_type']
 77 |     relation_type_vocab = vocabs['relation_type']
 78 |     role_type_vocab = vocabs['role_type']
 79 | 
 80 |     # valid event-role
 81 |     valid_event_role = set()
 82 |     event_role = json.load(
 83 |         open(os.path.join(path, 'event_role.json'), 'r', encoding='utf-8'))
 84 |     for event, roles in event_role.items():
 85 |         if event not in event_type_vocab:
 86 |             continue
 87 |         event_type_idx = event_type_vocab[event]
 88 |         for role in roles:
 89 |             if role not in role_type_vocab:
 90 |                 continue
 91 |             role_type_idx = role_type_vocab[role]
 92 |             valid_event_role.add(event_type_idx * 100 + role_type_idx)
 93 | 
 94 |     # valid relation-entity
 95 |     valid_relation_entity = set()
 96 |     relation_entity = json.load(
 97 |         open(os.path.join(path, 'relation_entity.json'), 'r', encoding='utf-8'))
 98 |     for relation, entities in relation_entity.items():
 99 |         relation_type_idx = relation_type_vocab[relation]
100 |         for entity in entities:
101 |             entity_type_idx = entity_type_vocab[entity]
102 |             valid_relation_entity.add(
103 |                 relation_type_idx * 100 + entity_type_idx)
104 | 
105 |     # valid role-entity
106 |     valid_role_entity = set()
107 |     role_entity = json.load(
108 |         open(os.path.join(path, 'role_entity.json'), 'r', encoding='utf-8'))
109 |     for role, entities in role_entity.items():
110 |         if role not in role_type_vocab:
111 |             continue
112 |         role_type_idx = role_type_vocab[role]
113 |         for entity in entities:
114 |             entity_type_idx = entity_type_vocab[entity]
115 |             valid_role_entity.add(role_type_idx * 100 + entity_type_idx)
116 | 
117 |     return {
118 |         'event_role': valid_event_role,
119 |         'relation_entity': valid_relation_entity,
120 |         'role_entity': valid_role_entity
121 |     }
122 | 
123 | 
124 | def read_ltf(path):
125 |     root = et.parse(path, et.XMLParser(
126 |         dtd_validation=False, encoding='utf-8')).getroot()
127 |     doc_id = root.find('DOC').get('id')
128 |     doc_tokens = []
129 |     for seg in root.find('DOC').find('TEXT').findall('SEG'):
130 |         seg_id = seg.get('id')
131 |         seg_tokens = []
132 |         seg_start = int(seg.get('start_char'))
133 |         seg_text = seg.find('ORIGINAL_TEXT').text
134 |         for token in seg.findall('TOKEN'):
135 |             token_text = token.text
136 |             start_char = int(token.get('start_char'))
137 |             end_char = int(token.get('end_char'))
138 |             assert seg_text[start_char - seg_start:
139 |                             end_char - seg_start + 1
140 |                             ] == token_text, 'token offset error'
141 |             seg_tokens.append((token_text, start_char, end_char))
142 |         doc_tokens.append((seg_id, seg_tokens))
143 | 
144 |     return doc_tokens, doc_id
145 | 
146 | 
147 | def read_txt(path, language='english'):
148 |     doc_id = os.path.basename(path)
149 |     data = open(path, 'r', encoding='utf-8').read()
150 |     data = [s.strip() for s in data.split('\n') if s.strip()]
151 |     sents = [l for ls in [sent_tokenize(line, language=language) for line in data]
152 |              for l in ls]
153 |     doc_tokens = []
154 |     offset = 0
155 |     for sent_idx, sent in enumerate(sents):
156 |         sent_id = '{}-{}'.format(doc_id, sent_idx)
157 |         tokens = word_tokenize(sent)
158 |         tokens = [(token, offset + i, offset + i + 1)
159 |                   for i, token in enumerate(tokens)]
160 |         offset += len(tokens)
161 |         doc_tokens.append((sent_id, tokens))
162 |     return doc_tokens, doc_id
163 | 
164 | 
165 | def read_json(path):
166 |     with open(path, 'r', encoding='utf-8') as r:
167 |         data = [json.loads(line) for line in r]
168 |     doc_id = data[0]['doc_id']
169 |     offset = 0
170 |     doc_tokens = []
171 | 
172 |     for inst in data:
173 |         tokens = inst['tokens']
174 |         tokens = [(token, offset + i, offset + i + 1)
175 |                   for i, token in enumerate(tokens)]
176 |         offset += len(tokens)
177 |         doc_tokens.append((inst['sent_id'], tokens))
178 |     return doc_tokens, doc_id
179 | 
180 | 
181 | def read_json_single(path):
182 |     with open(path, 'r', encoding='utf-8') as r:
183 |         data = [json.loads(line) for line in r]
184 |     doc_id = os.path.basename(path)
185 |     doc_tokens = []
186 |     for inst in data:
187 |         tokens = inst['tokens']
188 |         tokens = [(token, i, i + 1) for i, token in enumerate(tokens)]
189 |         doc_tokens.append((inst['sent_id'], tokens))
190 |     return doc_tokens, doc_id
191 | 
192 | 
193 | def save_result(output_file, gold_graphs, pred_graphs, sent_ids, tokens=None):
194 |     with open(output_file, 'w', encoding='utf-8') as w:
195 |         for i, (gold_graph, pred_graph, sent_id) in enumerate(
196 |                 zip(gold_graphs, pred_graphs, sent_ids)):
197 |             output = {'sent_id': sent_id,
198 |                       'gold': gold_graph.to_dict(),
199 |                       'pred': pred_graph.to_dict()}
200 |             if tokens:
201 |                 output['tokens'] = tokens[i]
202 |             w.write(json.dumps(output) + '\n')
203 | 
204 | 
205 | def mention_to_tab(start, end, entity_type, mention_type, mention_id, tokens, token_ids, score=1):
206 |     tokens = tokens[start:end]
207 |     token_ids = token_ids[start:end]
208 |     span = '{}:{}-{}'.format(token_ids[0].split(':')[0],
209 |                              token_ids[0].split(':')[1].split('-')[0],
210 |                              token_ids[1].split(':')[1].split('-')[1])
211 |     mention_text = tokens[0]
212 |     previous_end = int(token_ids[0].split(':')[1].split('-')[1])
213 |     for token, token_id in zip(tokens[1:], token_ids[1:]):
214 |         start, end = token_id.split(':')[1].split('-')
215 |         start, end = int(start), int(end)
216 |         mention_text += ' ' * (start - previous_end) + token
217 |         previous_end = end
218 |     return '\t'.join([
219 |         'json2tab',
220 |         mention_id,
221 |         mention_text,
222 |         span,
223 |         'NIL',
224 |         entity_type,
225 |         mention_type,
226 |         str(score)
227 |     ])
228 | 
229 | 
230 | def json_to_mention_results(input_dir, output_dir, file_name,
231 |                             bio_separator=' '):
232 |     mention_type_list = ['nam', 'nom', 'pro', 'nam+nom+pro']
233 |     file_type_list = ['bio', 'tab']
234 |     writers = {}
235 |     for mention_type in mention_type_list:
236 |         for file_type in file_type_list:
237 |             output_file = os.path.join(output_dir, '{}.{}.{}'.format(file_name,
238 |                                                                      mention_type,
239 |                                                                      file_type))
240 |             writers['{}_{}'.format(mention_type, file_type)
241 |                     ] = open(output_file, 'w')
242 | 
243 |     json_files = glob.glob(os.path.join(input_dir, '*.json'))
244 |     for f in json_files:
245 |         with open(f, 'r', encoding='utf-8') as r:
246 |             for line in r:
247 |                 result = json.loads(line)
248 |                 doc_id = result['doc_id']
249 |                 tokens = result['tokens']
250 |                 token_ids = result['token_ids']
251 |                 bio_tokens = [[t, tid, 'O']
252 |                               for t, tid in zip(tokens, token_ids)]
253 |                 # separate bio output
254 |                 for mention_type in ['NAM', 'NOM', 'PRO']:
255 |                     tokens_tmp = deepcopy(bio_tokens)
256 |                     for start, end, enttype, mentype in result['graph']['entities']:
257 |                         if mention_type == mentype:
258 |                             tokens_tmp[start] = 'B-{}'.format(enttype)
259 |                             for token_idx in range(start + 1, end):
260 |                                 tokens_tmp[token_idx] = 'I-{}'.format(
261 |                                     enttype)
262 |                     writer = writers['{}_bio'.format(mention_type.lower())]
263 |                     for token in tokens_tmp:
264 |                         writer.write(bio_separator.join(token) + '\n')
265 |                     writer.write('\n')
266 |                 # combined bio output
267 |                 tokens_tmp = deepcopy(bio_tokens)
268 |                 for start, end, enttype, _ in result['graph']['entities']:
269 |                     tokens_tmp[start] = 'B-{}'.format(enttype)
270 |                     for token_idx in range(start + 1, end):
271 |                         tokens_tmp[token_idx] = 'I-{}'.format(enttype)
272 |                 writer = writers['nam+nom+pro_bio']
273 |                 for token in tokens_tmp:
274 |                     writer.write(bio_separator.join(token) + '\n')
275 |                 writer.write('\n')
276 |                 # separate tab output
277 |                 for mention_type in ['NAM', 'NOM', 'PRO']:
278 |                     writer = writers['{}_tab'.format(mention_type.lower())]
279 |                     mention_count = 0
280 |                     for start, end, enttype, mentype in result['graph']['entities']:
281 |                         if mention_type == mentype:
282 |                             mention_id = '{}-{}'.format(doc_id, mention_count)
283 |                             tab_line = mention_to_tab(
284 |                                 start, end, enttype, mentype, mention_id, tokens, token_ids)
285 |                             writer.write(tab_line + '\n')
286 |                 # combined tab output
287 |                 writer = writers['nam+nom+pro_tab']
288 |                 mention_count = 0
289 |                 for start, end, enttype, mentype in result['graph']['entities']:
290 |                     mention_id = '{}-{}'.format(doc_id, mention_count)
291 |                     tab_line = mention_to_tab(
292 |                         start, end, enttype, mentype, mention_id, tokens, token_ids)
293 |                     writer.write(tab_line + '\n')
294 |     for w in writers:
295 |         w.close()
296 | 
297 | 
298 | def normalize_score(scores):
299 |     min_score, max_score = min(scores), max(scores)
300 |     if min_score == max_score:
301 |         return [0] * len(scores)
302 |     return [(s - min_score) / (max_score - min_score) for s in scores]
303 | 
304 | 
305 | def best_score_by_task(log_file, scores_file, task, max_epoch=1000):
306 |     with open(log_file, 'r', encoding='utf-8') as r:
307 |         config = r.readline()
308 | 
309 |         best_scores = []
310 |         best_dev_score = 0
311 |         for line in r:
312 |             record = json.loads(line)
313 |             dev = record['dev']
314 |             test = record['test']
315 |             epoch = record['epoch']
316 |             if epoch > max_epoch:
317 |                 break
318 |             if dev[task]['f'] > best_dev_score:
319 |                 best_dev_score = dev[task]['f']
320 |                 best_scores = [dev, test, epoch]
321 | 
322 |         print('Epoch: {}'.format(best_scores[-1]))
323 |         tasks = ['entity', 'mention', 'relation', 'trigger_id', 'trigger',
324 |                  'role_id', 'role']
325 |         for t in tasks:
326 |             print('{}: dev: {:.2f}, test: {:.2f}'.format(t, best_scores[0][t]['f'] * 100.0, best_scores[1][t]['f'] * 100.0))
327 |         
328 |         with open(scores_file, 'w', encoding='utf-8') as f:
329 |             for t in tasks:
330 |                 f.write('{}: dev: {:.2f}, test: {:.2f}'.format(t, best_scores[0][t]['f'] * 100.0, best_scores[1][t]['f'] * 100.0) + '\n')
331 | 


--------------------------------------------------------------------------------
/gumbel_latent_typer.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Liliang Ren.
  2 | #
  3 | # This source code is licensed under the Apache 2.0 license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import torch
  7 | import torch.nn as nn
  8 | import torch.nn.functional as F
  9 | import math
 10 | 
 11 | class GumbelLatentTyper(nn.Module):
 12 |     def __init__(
 13 |         self,
 14 |         dim,
 15 |         num_vars,
 16 |         temp,
 17 |         var_dim,
 18 |         hard = True,
 19 |     ):
 20 |         
 21 |         super().__init__()
 22 | 
 23 |         self.input_dim = dim
 24 |         self.num_vars = num_vars
 25 |         self.hard = hard
 26 | 
 27 |         
 28 |         self.vars = nn.Parameter(torch.FloatTensor(num_vars, var_dim))
 29 |         nn.init.uniform_(self.vars, a=-0.5, b=0.5)
 30 |         
 31 |         
 32 |         self.weight_proj = nn.Linear(self.input_dim,  num_vars, bias = False)
 33 |         nn.init.kaiming_uniform_(self.weight_proj.weight.data, nonlinearity = 'linear')
 34 |                 
 35 |         self.max_temp, self.min_temp, self.temp_decay = temp
 36 |         self.curr_temp = self.min_temp
 37 | 
 38 |     def set_num_updates(self, num_updates):
 39 |         #exponential decay
 40 |         self.curr_temp = max(
 41 |             self.max_temp * self.temp_decay**num_updates, self.min_temp
 42 |         )
 43 | 
 44 |     
 45 |     def forward(self, x, mask=None, deterministic = True):
 46 |         result = {"num_vars": self.num_vars }
 47 |         bsz, tsz, fsz = x.shape
 48 | 
 49 |         x = self.weight_proj(x)
 50 |         x = x.view(bsz * tsz, -1)
 51 |         zero_mask = torch.ones_like(x)
 52 |         zero_mask[:,0]=0
 53 |         x = x*zero_mask
 54 | 
 55 | 
 56 |         if mask is not None:
 57 |             x = x* mask.view(-1,1)
 58 | 
 59 |         _, k = x.max(-1)
 60 |         hard_x = (
 61 |             x.new_zeros(*x.shape)
 62 |             .scatter_(-1, k.view(-1, 1), 1.0)
 63 |             .view(bsz * tsz, -1)
 64 |         )
 65 | 
 66 | 
 67 |         avg_probs = torch.softmax(
 68 |             x.view(bsz * tsz, -1).float(), dim=-1
 69 |         )
 70 |         result["soft_probs"] = avg_probs
 71 | 
 72 |         if mask is not None:
 73 |             avg_probs = (avg_probs * mask.view(bsz * tsz,1)).sum(0)/mask.sum()
 74 |         else:
 75 |             avg_probs = avg_probs.mean(dim=0)
 76 | 
 77 |         result["prob_perplexity"] = torch.exp(
 78 |             -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1)
 79 |         ).sum()
 80 | 
 81 |         if self.training:
 82 |             x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=self.hard).type_as(x)
 83 |         else:
 84 |             if deterministic:
 85 |                 x = hard_x
 86 |             else:
 87 |                 x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=self.hard).type_as(x)
 88 | 
 89 |         
 90 |         result["gumbel_probs"] = x.view(bsz * tsz, -1) 
 91 |         
 92 |         x = x.view(bsz * tsz, -1)
 93 | 
 94 |         vars = self.vars
 95 |         mask = torch.ones_like(vars)
 96 |         mask[0,:]=0
 97 |         vars = vars * mask
 98 |         
 99 |         x = torch.matmul(x, vars)
100 |         x = x.view(bsz, tsz, -1)
101 | 
102 |         result["x"] = x
103 | 
104 |         return result
105 | 


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Liliang Ren, Zixuan Zhang.
  2 | #
  3 | # This source code is licensed under the Apache 2.0 license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | 
  7 | from transformers import RobertaPreTrainedModel, RobertaModel,RobertaForMaskedLM, AutoModel, BertPreTrainedModel, BertModel, BertForMaskedLM
  8 | from transformers.modeling_outputs import MaskedLMOutput
  9 | from transformers.activations import ACT2FN
 10 | from transformers.models.roberta.modeling_roberta import RobertaEmbeddings
 11 | 
 12 | from utils import RobertaConfig
 13 | from typing import List, Optional, Tuple, Union
 14 | 
 15 | from decoder import BartDecoder, _make_causal_mask, _expand_mask
 16 | 
 17 | import torch
 18 | import math
 19 | import torch.nn as nn
 20 | 
 21 | from gumbel_latent_typer import GumbelLatentTyper
 22 | 
 23 | 
 24 | def gelu(x):
 25 |     return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 26 | 
 27 | 
 28 | class BertOnlyMLMHead(nn.Module):
 29 |     def __init__(self, config):
 30 |         super().__init__()
 31 |         self.predictions = BertLMPredictionHead(config)
 32 | 
 33 |     def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
 34 |         prediction_scores = self.predictions(sequence_output)
 35 |         return prediction_scores
 36 | 
 37 | 
 38 | class BertLMPredictionHead(nn.Module):
 39 |     def __init__(self, config):
 40 |         super().__init__()
 41 |         self.transform = BertPredictionHeadTransform(config)
 42 | 
 43 |         # The output weights are the same as the input embeddings, but there is
 44 |         # an output-only bias for each token.
 45 |         self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 46 | 
 47 |         self.bias = nn.Parameter(torch.zeros(config.vocab_size))
 48 | 
 49 |         # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
 50 |         self.decoder.bias = self.bias
 51 | 
 52 |     def forward(self, hidden_states):
 53 |         hidden_states = self.transform(hidden_states)
 54 |         hidden_states = self.decoder(hidden_states)
 55 |         return hidden_states
 56 | 
 57 | 
 58 | class BertPredictionHeadTransform(nn.Module):
 59 |     def __init__(self, config):
 60 |         super().__init__()
 61 |         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
 62 |         if isinstance(config.hidden_act, str):
 63 |             self.transform_act_fn = ACT2FN[config.hidden_act]
 64 |         else:
 65 |             self.transform_act_fn = config.hidden_act
 66 |         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 67 | 
 68 |     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 69 |         hidden_states = self.dense(hidden_states)
 70 |         hidden_states = self.transform_act_fn(hidden_states)
 71 |         hidden_states = self.LayerNorm(hidden_states)
 72 |         return hidden_states
 73 |         
 74 | 
 75 | 
 76 | class RobertaLMHead(nn.Module):
 77 |     def __init__(self, config):
 78 |         super().__init__()
 79 |         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
 80 |         self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 81 |         self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
 82 |         self.bias = nn.Parameter(torch.zeros(config.vocab_size))
 83 |         self.decoder.bias = self.bias
 84 | 
 85 |     def forward(self, features, **kwargs):
 86 |         x = self.dense(features)
 87 |         x = gelu(x)
 88 |         x = self.layer_norm(x)
 89 |         # project back to size of vocabulary with bias
 90 |         x = self.decoder(x)
 91 |         return x
 92 | 
 93 |     def _tie_weights(self):
 94 |         # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
 95 |         self.bias = self.decoder.bias
 96 | 
 97 | 
 98 | class RobertaAutoEncoder(BertPreTrainedModel):
 99 | 
100 |     def __init__(self, config):
101 |         super().__init__(config)
102 |         self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
103 |         
104 |         self.glm_head = None
105 |         self.glm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True) # for confitional generation (lm)
106 |         nn.init.constant_(self.glm_head.bias, 0.0)
107 | 
108 |         self.decoder = BartDecoder(config, self.roberta.embeddings)
109 |         
110 |         
111 |         self.sa_pm = GumbelLatentTyper(
112 |                 dim = config.hidden_size,
113 |                 num_vars = 64,
114 |                 temp =  (5, 0.5, 1-3e-5),
115 |                 var_dim = config.hidden_size,
116 |                 hard = False,
117 |             )
118 | 
119 |         self.tie_weights()
120 |     
121 |     @property
122 |     def roberta(self):
123 |         return self.model.bert
124 | 
125 |     @property
126 |     def mlm_head(self):
127 |         return self.model.cls.predictions
128 | 
129 |     def tie_weights(self,):
130 |         if self.glm_head is not None:
131 |             self.glm_head.weight = self.roberta.embeddings.word_embeddings.weight
132 | 
133 |         self.mlm_head.decoder.weight = self.roberta.embeddings.word_embeddings.weight
134 |         
135 |     def forward(self, input_ids=None, attention_mask=None, mlm_input_ids=None, mlm_labels=None, decoder_input_ids=None, decoder_attention_mask=None, gen_labels=None, original_tokens=None, return_dict=None):
136 |         
137 | 
138 |         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
139 |         # loss #1: masked lm loss
140 |         masked_sequence_output = self.roberta(
141 |             mlm_input_ids,
142 |             attention_mask=attention_mask,
143 |             return_dict=return_dict
144 |         )
145 |         prediction_scores = self.mlm_head(masked_sequence_output[0])
146 | 
147 |         masked_lm_loss = None
148 | 
149 |         if mlm_labels is not None:
150 |             loss_fct = nn.CrossEntropyLoss()
151 |             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), mlm_labels.view(-1))
152 |         
153 |         # loss #2: reconstruction loss
154 |         outputs = self.roberta(
155 |             input_ids,
156 |             attention_mask=attention_mask,
157 |             return_dict=return_dict
158 |         )
159 |         sequence_output = outputs[0]
160 | 
161 |         # sequence_output: (batch, seq_len, dim)
162 |         EPS = torch.finfo(sequence_output.dtype).tiny
163 |         b,q,c = sequence_output.shape
164 |         result = self.sa_pm(sequence_output,mask=attention_mask, deterministic=True)
165 |         
166 |         div_loss = (result["num_vars"] - result["prob_perplexity"])/result["num_vars"] 
167 |         
168 |         soft_probs =  result["soft_probs"].view(b,q,-1)[:,:,0]
169 |         reduced_output = (sequence_output * result["x"])
170 |         pm_loss = - torch.log((soft_probs*attention_mask).sum()/attention_mask.sum()+EPS)
171 | 
172 |         seq_logits = self.decoder(
173 |             input_ids=decoder_input_ids, 
174 |             attention_mask=decoder_attention_mask, 
175 |             encoder_hidden_states=reduced_output, 
176 |             encoder_attention_mask=attention_mask
177 |         )[0]
178 | 
179 |         lm_logits = self.glm_head(seq_logits)
180 |         gen_loss = None
181 |         if gen_labels is not None:
182 |             loss_fct = nn.CrossEntropyLoss()
183 |             gen_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), gen_labels.view(-1)) 
184 |         
185 |         if torch.isnan(masked_lm_loss):
186 |             masked_lm_loss = gen_loss.new_zeros(1)[0]
187 | 
188 |         return masked_lm_loss, gen_loss, pm_loss, div_loss
189 |     
190 | 
191 |     def test_generate(self, input_ids=None, attention_mask=None, original_tokens=None, return_dict=None, tsne=False, return_latent = False):
192 |         bs, seq_len = input_ids.shape
193 |         decoder_input_ids = torch.zeros(bs, seq_len).long()
194 |         decoder_attn_mask = torch.zeros(bs, seq_len).long()
195 | 
196 |         decoder_input_ids[:, 0:1] = input_ids[:, 0:1]
197 |         decoder_attn_mask[:, 0:1] = torch.ones(bs, 1).long()
198 | 
199 |         output_ids = torch.zeros(bs, seq_len).long()
200 |         output_ids[:, 0:1] = input_ids[:, 0:1]
201 | 
202 |         type_str = ""
203 |         selected_list = []
204 | 
205 |         with torch.no_grad():
206 |             # loss #2: reconstruction loss
207 |             outputs = self.roberta(
208 |                 input_ids,
209 |                 attention_mask=attention_mask,
210 |                 return_dict=return_dict
211 |             )
212 |             sequence_output = outputs[0]
213 | 
214 |             # sequence_output: (batch, seq_len, dim)
215 |             EPS = torch.finfo(sequence_output.dtype).tiny
216 |             b,q,c = sequence_output.shape
217 |             result = self.sa_pm(sequence_output, deterministic=True)
218 |             gumbel_types = torch.argmax(result["gumbel_probs"][:, 0, :], 1)
219 |             
220 |             if tsne:
221 |                 return sequence_output, gumbel_types
222 | 
223 |             #only support batch_size = 1 after this line
224 |             reduced_output = (sequence_output * result["x"])
225 |             type_ids = []
226 |             for j in range(len(original_tokens[0])):
227 |                 token = original_tokens[0][j]
228 |                 type_idx = gumbel_types.tolist()[j]
229 |                 type_ids.append(type_idx)
230 |                 if type_idx != 0:
231 |                     type_str += (token + '(' + str(type_idx)+'), ')
232 |                     selected_list.append(token)
233 | 
234 |             if return_latent:
235 |                 return type_ids
236 | 
237 |             print("LATENT TYPINGS: ")
238 |             print(type_str)
239 |             print('\n')
240 |             
241 | 
242 |             for i in range(seq_len - 1):
243 |                 seq_logits = self.decoder(
244 |                     input_ids=decoder_input_ids, 
245 |                     attention_mask=decoder_attn_mask, 
246 |                     encoder_hidden_states=reduced_output, 
247 |                     encoder_attention_mask=attention_mask
248 |                 )[0]
249 |                 # seq_logits: bs, seq_len, vocab_size
250 |                 lm_logits = self.glm_head(seq_logits)
251 |                 selected_logits = lm_logits[:, i, :]
252 |                 logit_idxs = torch.argmax(selected_logits, 1)
253 |                 output_ids[:, i+1:i+2] = logit_idxs.unsqueeze(-1)
254 | 
255 |                 decoder_input_ids[:, i+1:i+2] = logit_idxs.unsqueeze(-1)
256 |                 decoder_attn_mask[:, i+1:i+2] = torch.ones(bs, 1)
257 | 
258 |         return output_ids
259 | 
260 | 
261 | 
262 | 


--------------------------------------------------------------------------------
/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renll/SparseLT/86306e94c27ec79b4ea4e810a262df42798e5ab9/overview.png


--------------------------------------------------------------------------------
/pretrain.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Zixuan Zhang, Liliang Ren.
  2 | #
  3 | # This source code is licensed under the Apache 2.0 license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | 
  7 | import os
  8 | import argparse
  9 | 
 10 | from transformers import AutoConfig, AutoTokenizer
 11 | 
 12 | from utils import RobertaConfig
 13 | from model import RobertaAutoEncoder
 14 | from dataset import PLMDataset, PLMDataCollator, PLMTrainer, PLMTrainingArgs
 15 | import wandb
 16 | 
 17 | parser = argparse.ArgumentParser()
 18 | parser.add_argument('--name', type=str, default="default_latent_typing")
 19 | parser.add_argument('--data', type=str, default="voa_corpus")
 20 | parser.add_argument('--local_rank', type=int, default=-1)
 21 | parser.add_argument('--alpha', type=float, default=0.05)
 22 | parser.add_argument('--beta', type=float, default=0.05)
 23 | parser.add_argument('--gamma', type=float, default=0.1)
 24 | args = parser.parse_args()
 25 | 
 26 | if args.local_rank in [-1,0]:
 27 |     wandb.tensorboard.patch(root_logdir="./tb_logs/")
 28 |     wandb.init(project='latent-typing', sync_tensorboard=True)
 29 | 
 30 | model_name = "bert-base-uncased"
 31 | tokenizer = AutoTokenizer.from_pretrained(model_name)
 32 | checkpoint_dir = "./checkpoints/" + args.name
 33 | dataset_dir = "./data/" + args.data + '.txt'
 34 | if not os.path.exists(checkpoint_dir):
 35 |     os.makedirs(checkpoint_dir, exist_ok=True)
 36 | 
 37 | 
 38 | 
 39 | config = AutoConfig.from_pretrained(model_name) 
 40 | 
 41 | config.decoder_layers = 1
 42 | config.activation_function = config.hidden_act
 43 | config.decoder_ffn_dim = config.intermediate_size
 44 | config.init_std = config.initializer_range 
 45 | 
 46 | print(config)
 47 | 
 48 | model = RobertaAutoEncoder(config)
 49 | print(model)
 50 | 
 51 | 
 52 | training_args = PLMTrainingArgs(
 53 |     output_dir=checkpoint_dir, 
 54 | 	overwrite_output_dir=False,
 55 | 	do_train=True,
 56 | 	do_eval=False,
 57 | 	do_predict=False,
 58 | 	evaluation_strategy='no',
 59 | 	prediction_loss_only=False,
 60 | 	per_device_train_batch_size=32, 
 61 | 	per_device_eval_batch_size=8, 
 62 | 	gradient_accumulation_steps=1,
 63 | 	eval_accumulation_steps=32,
 64 | 	learning_rate=1e-4,
 65 | 	weight_decay=0.01,
 66 | 	adam_beta1=0.9,
 67 | 	adam_beta2=0.999,
 68 | 	adam_epsilon=1e-8,
 69 | 	max_grad_norm=1.,
 70 | 	num_train_epochs=10, 
 71 | 	max_steps=100000, 
 72 | 	lr_scheduler_type='linear',
 73 |     warmup_steps=300,  
 74 | 	save_steps=10000, 
 75 | 	save_total_limit=100, 
 76 | 	no_cuda=False,
 77 | 	seed=61820, 
 78 | 	local_rank=args.local_rank,
 79 | 	dataloader_drop_last=False,
 80 | )
 81 | 
 82 | training_args.add_loss_weights(
 83 |     mlm=1, # mlm
 84 |     alpha = args.alpha, # gen
 85 |     beta = args.beta, # pm
 86 |     gamma = args.gamma #diversity
 87 |     )
 88 | 
 89 | train_dataset = PLMDataset(
 90 |     tokenizer=tokenizer,
 91 |     file_path=dataset_dir,
 92 |     block_size=128
 93 | )
 94 | 
 95 | data_collator = PLMDataCollator(tokenizer=tokenizer, mlm_probability=0.15)
 96 | 
 97 | 
 98 | trainer = PLMTrainer(
 99 |     model=model,                         
100 |     args=training_args,
101 |     data_collator=data_collator,                
102 |     train_dataset=train_dataset            
103 | )
104 | 
105 | run_name = args.name+"_mlm_" + str(trainer.args.mlm) + "_gen_" + str(trainer.args.alpha) + "_pm_" + str(trainer.args.beta) + "_div_" + str(trainer.args.gamma)
106 | log_dir = "./tb_logs/" + run_name
107 | 
108 | trainer.load_tb(log_dir)
109 | 
110 | trainer.train()
111 | trainer.save_model(checkpoint_dir)
112 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.22.3
2 | torch==1.12.1
3 | transformers==4.21.1
4 | wandb
5 | seqeval
6 | 


--------------------------------------------------------------------------------
/test_generation.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Zixuan Zhang, Liliang Ren.
 2 | #
 3 | # This source code is licensed under the Apache 2.0 license found in the
 4 | # LICENSE file in the root directory of this source tree.
 5 | 
 6 | 
 7 | from transformers import RobertaTokenizerFast, RobertaModel, AutoTokenizer
 8 | import torch
 9 | from model import RobertaAutoEncoder
10 | 
11 | t = AutoTokenizer.from_pretrained("bert-base-uncased")
12 | 
13 | 
14 | ckpt_dirs = ["./checkpoints/YOUR_FULLMODEL_CKPT_DIR/"]
15 | for ckpt_dir in ckpt_dirs:
16 |     print(ckpt_dir.split("/")[-2])
17 |     m = RobertaAutoEncoder.from_pretrained(ckpt_dir)
18 |     
19 |     
20 |     input_sentences = ["She was murdered in her New York office, just days after learning that Waitress had been accepted into the Sundance Film Festival."]
21 |    
22 |     for input_sentence in input_sentences:
23 |         print("INPUT SENTENCE: ")
24 |         print(input_sentence + '\n')
25 |         input_batch = t(input_sentence, return_tensors="pt")
26 |         
27 |         input_ids = input_batch["input_ids"]
28 |         attn_mask = input_batch["attention_mask"]
29 |         # print(input_ids.shape)
30 |         input_tokens = [["CLS"] + t.tokenize(input_sentence) + ["SEP"]]
31 |         
32 |         output_ids = m.test_generate(input_ids=input_ids, attention_mask=attn_mask, original_tokens=input_tokens)
33 |         sentence_output = t.decode(output_ids[0], skip_special_tokens=False)
34 |         print("OUTPUT SENTENCE: ")
35 |         print(sentence_output)
36 |         print('\n')
37 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Zixuan Zhang.
 2 | #
 3 | # This source code is licensed under the Apache 2.0 license found in the
 4 | # LICENSE file in the root directory of this source tree.
 5 | 
 6 | 
 7 | from transformers.configuration_utils import PretrainedConfig
 8 | 
 9 | 
10 | class BertConfig(PretrainedConfig):
11 | 
12 |     def __init__(
13 |         self,
14 |         vocab_size=50265,
15 |         hidden_size=1024,
16 |         num_hidden_layers=24,
17 |         num_attention_heads=16,
18 |         intermediate_size=4096,
19 |         hidden_act="gelu",
20 |         hidden_dropout_prob=0.1,
21 |         attention_probs_dropout_prob=0.1,
22 |         max_position_embeddings=514,
23 |         type_vocab_size=1,
24 |         initializer_range=0.02,
25 |         layer_norm_eps=1e-05,
26 |         pad_token_id=1,
27 |         position_embedding_type="absolute",
28 |         use_cache=True,
29 |         classifier_dropout=None,
30 |         activation_function="gelu",
31 |         decoder_ffn_dim=4096,
32 |         decoder_layers=1,
33 |         init_std=0.02,
34 |         **kwargs
35 |     ):
36 |         super().__init__(pad_token_id=pad_token_id, **kwargs)
37 | 
38 |         self.vocab_size = vocab_size
39 |         self.hidden_size = hidden_size
40 |         self.num_hidden_layers = num_hidden_layers
41 |         self.num_attention_heads = num_attention_heads
42 |         self.hidden_act = hidden_act
43 |         self.intermediate_size = intermediate_size
44 |         self.hidden_dropout_prob = hidden_dropout_prob
45 |         self.init_std = init_std
46 |         self.attention_probs_dropout_prob = attention_probs_dropout_prob
47 |         self.max_position_embeddings = max_position_embeddings
48 |         self.type_vocab_size = type_vocab_size
49 |         self.initializer_range = initializer_range
50 |         self.layer_norm_eps = layer_norm_eps
51 |         self.position_embedding_type = position_embedding_type
52 |         self.use_cache = use_cache
53 |         self.classifier_dropout = classifier_dropout
54 |         self.activation_function = activation_function
55 |         self.decoder_ffn_dim = decoder_ffn_dim
56 |         self.decoder_layers = decoder_layers
57 | 
58 | 
59 | class RobertaConfig(BertConfig):
60 | 
61 |     def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
62 |         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
63 | 
64 | 
65 | if __name__ == "__main__":
66 |     c = BertConfig()
67 |     d = RobertaConfig()
68 | 


--------------------------------------------------------------------------------