├── .gitignore
├── DataFormat.md
├── LICENSE
├── README.md
├── baselines
│   ├── BERT+CRF
│   │   ├── BERT-CRF-ACE
│   │   │   ├── bert_crf.py
│   │   │   ├── crf.py
│   │   │   ├── run_ACE.py
│   │   │   ├── run_ACE.sh
│   │   │   └── utils_ACE.py
│   │   ├── BERT-CRF-MAVEN
│   │   │   ├── bert_crf.py
│   │   │   ├── crf.py
│   │   │   ├── run_MAVEN.sh
│   │   │   ├── run_MAVEN_infer.sh
│   │   │   ├── run_maven.py
│   │   │   └── utils_maven.py
│   │   └── README.md
│   ├── DMBERT
│   │   ├── README.md
│   │   ├── get_submission.py
│   │   ├── model.py
│   │   ├── run_ACE.sh
│   │   ├── run_MAVEN.sh
│   │   ├── run_MAVEN_infer.sh
│   │   ├── run_ee.py
│   │   └── utils_ee.py
│   ├── DMCNN_BiLSTM_(CRF)
│   │   ├── .DS_Store
│   │   ├── README.md
│   │   ├── clear.py
│   │   ├── config
│   │   │   ├── bilstm.config
│   │   │   ├── crf.config
│   │   │   └── dmcnn.config
│   │   ├── formatter
│   │   │   ├── BilstmFormatter.py
│   │   │   ├── CrfFormatter.py
│   │   │   ├── DmcnnFormatter.py
│   │   │   └── __init__.py
│   │   ├── main.py
│   │   ├── model
│   │   │   ├── Bilstm.py
│   │   │   ├── Crf.py
│   │   │   ├── Dmcnn.py
│   │   │   ├── __init__.py
│   │   │   └── layers
│   │   │       ├── crf.py
│   │   │       ├── embedding.py
│   │   │       └── outputLayer.py
│   │   ├── raw
│   │   │   └── 100.utf8
│   │   ├── reader
│   │   │   ├── MavenReader.py
│   │   │   └── __init__.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── configparser_hook.py
│   │       ├── evaluation.py
│   │       ├── global_variables.py
│   │       ├── initializer.py
│   │       └── runner.py
│   └── MOGANED
│       ├── README.md
│       ├── constant.py
│       ├── func.py
│       ├── models.py
│       ├── train.py
│       └── utils.py
├── docid2topic.json
└── evaluate.py
/.gitignore: -------------------------------------------------------------------------------- 1 | /baselines/DMCNN&LSTM&CRF/data 2 | /baselines/DMCNN&LSTM&CRF/raw 3 | !/baselines/DMCNN&LSTM&CRF/raw/100.utf8 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ -------------------------------------------------------------------------------- /DataFormat.md: -------------------------------------------------------------------------------- 1 | # MAVEN dataset 2 | 3 | Each `.jsonl` file is a subset of MAVEN, and each line in the files is a JSON string for one document. For `train.jsonl` and `valid.jsonl`, the JSON format is as below: 4 | 5 | ```JSON5 6 | { 7 | "id": "6b2e8c050e30872e49c2f46edb4ac044", // a unique string for each document 8 | "title": "Selma to Montgomery marches", // the title of the document 9 | "content": [ // the content of the document. A list, each item is a dict for a sentence 10 | { 11 | "sentence": "...", // a string, the plain text of the sentence 12 | "tokens": ["...", "..."] // a list, tokens of the sentence 13 | } 14 | ], 15 | "events":[ // a list of annotated events, each item is a dict for an event 16 | { 17 | "id": "75343904ec49aefe12c5749edadb7802", // a unique string for the event 18 | "type": "Arranging", // the event type 19 | "type_id": 70, // the numerical id for the event type 20 | "mention":[ // a list of the event mentions of the event, each item is a dict 21 | { 22 | "id": "2db165c25298aefb682cba50c9327e4f", // a unique string for the event mention 23 | "trigger_word": "organized", // a string of the trigger word or phrase 24 | "sent_id": 1, // the index of the corresponding sentence, starts with 0 25 | "offset": [3, 4], // the offset of the trigger words in the tokens list 26 | } 27 | ] 28 | }, 29 | ], 30 | "negative_triggers":[ // a list of negative instances, each item is a dict for a negative mention 31 | { 32 | "id": "46348f4078ae8460df4916d03573b7de", 33 | "trigger_word": "desire", 34 | "sent_id": 1, 35 | "offset": [10, 11], 36 | }, 37 | ] 38 | } 39 | ``` 40 | 41 | For `test.jsonl`, the format is almost the same, but we hide the gold labels: 42 | 43 | ```JSON5 44 | { 45 | "id": "6b2e8c050e30872e49c2f46edb4ac044", // a unique string for each document 46 | "title": "Selma to Montgomery marches", // the title of the document 47 | "content": [ // the content of the document. A list, each item is a dict for a sentence 48 | { 49 | "sentence": "...", // a string, the plain text of the sentence 50 | "tokens": ["...", "..."] // a list, tokens of the sentence 51 | } 52 | ], 53 | "candidates":[ // a list of trigger candidates, each item is a dict for a trigger or a negative instance; you need to classify the type of each candidate 54 | { 55 | "id": "46348f4078ae8460df4916d03573b7de", 56 | "trigger_word": "desire", 57 | "sent_id": 1, 58 | "offset": [10, 11], 59 | } 60 | ] 61 | } 62 | ``` 63 | 64 | You can submit the prediction results for the test set to [CodaLab](https://competitions.codalab.org/competitions/27320) to get the test results. You need to name your result file `results.jsonl` and compress it into a `.zip` file for submission.
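As a concrete illustration, a minimal sketch for packaging a submission might look like the following (here `predict_type_id` is a hypothetical stand-in for your own model; the exact per-line format is spelled out below):

```python
import json
import zipfile

def predict_type_id(candidate, doc):
    # Hypothetical placeholder for a real model; 0 means "negative instance".
    return 0

with open("test.jsonl") as f_in, open("results.jsonl", "w") as f_out:
    for line in f_in:
        doc = json.loads(line)
        result = {
            "id": doc["id"],
            "predictions": [
                {"id": cand["id"], "type_id": predict_type_id(cand, doc)}
                for cand in doc["candidates"]
            ],
        }
        f_out.write(json.dumps(result) + "\n")

# CodaLab expects results.jsonl compressed into a .zip archive.
with zipfile.ZipFile("submission.zip", "w") as zf:
    zf.write("results.jsonl")
```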
65 | 66 | Each line in the `results.jsonl` should be a JSON string encoding the prediction results for one document. The JSON format is as below: 67 | 68 | ```JSON5 69 | { 70 | "id": "6b2e8c050e30872e49c2f46edb4ac044", // id for the document 71 | "predictions":[ // a list, prediction results for the provided candidates 72 | { 73 | "id": "46348f4078ae8460df4916d03573b7de", // id for the candidate 74 | "type_id": 10, // integer id for the predicted type, 0 for the negative instances 75 | }, 76 | ] 77 | } 78 | ``` 79 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 THU-KEG 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MAVEN-dataset 2 | Source code and dataset for the EMNLP 2020 paper "MAVEN: A Massive General Domain Event Detection Dataset". 3 | 4 | ## Data 5 | 6 | The dataset (ver. 1.0) can be obtained from [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/d/874e0ad810f34272a03b/) or [Google Drive](https://drive.google.com/drive/folders/19Q0lqJE6A98OLnRqQVhbX3e6rG4BVGn8?usp=sharing). The data format is introduced in [this document](DataFormat.md). 7 | 8 | We also release the document topics for data analysis and model development. The [``docid2topic.json``](docid2topic.json) file maps the document ids to their EventWiki topic labels. 9 | 10 | ## CodaLab 11 | 12 | To get the test results, you can submit your predictions to our permanent [CodaLab competition](https://codalab.lisn.upsaclay.fr/competitions/395) (the [older version](https://competitions.codalab.org/competitions/27320) will be phased out soon). For the evaluation method, please refer to the [evaluation script](evaluate.py). 13 | 14 | ## Code 15 | 16 | We release the source code for the baselines, including [DMCNN](baselines/DMCNN_BiLSTM_(CRF)), [BiLSTM](baselines/DMCNN_BiLSTM_(CRF)), [BiLSTM+CRF](baselines/DMCNN_BiLSTM_(CRF)), [MOGANED](baselines/MOGANED) and [DMBERT](baselines/DMBERT). 17 | 18 | ## Citation 19 | 20 | If the data and code help you, please cite this paper:
21 | 22 | ```bib 23 | @inproceedings{wang2020MAVEN, 24 | title={{MAVEN}: A Massive General Domain Event Detection Dataset}, 25 | author={Wang, Xiaozhi and Wang, Ziqi and Han, Xu and Jiang, Wangyi and Han, Rong and Liu, Zhiyuan and Li, Juanzi and Li, Peng and Lin, Yankai and Zhou, Jie}, 26 | booktitle={Proceedings of EMNLP 2020}, 27 | year={2020} 28 | } 29 | ``` 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-ACE/bert_crf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from transformers.modeling_bert import BertModel, BertPreTrainedModel 3 | from transformers.configuration_bert import BertConfig 4 | 5 | from crf import * 6 | from utils_ACE import to_crf_pad, unpad_crf 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 11 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", 12 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", 13 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", 14 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", 15 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", 16 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", 17 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", 18 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", 19 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", 20 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", 21 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", 22 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", 23 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", 24 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", 25 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", 26 | 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", 27 | 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", 28 | 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", 29 | 'bert-base-japanese-char-whole-word-masking': 
"https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", 30 | 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", 31 | 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", 32 | } 33 | 34 | 35 | class BertCRFForTokenClassification(BertPreTrainedModel): 36 | config_class = BertConfig 37 | pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP 38 | base_model_prefix = "bert" 39 | 40 | def __init__(self, config): 41 | super(BertCRFForTokenClassification, self).__init__(config) 42 | self.num_labels = config.num_labels 43 | 44 | self.bert = BertModel(config) 45 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 46 | self.classifier = nn.Linear(config.hidden_size, config.num_labels + 2) 47 | self.crf = CRF(self.num_labels) 48 | 49 | self.init_weights() 50 | 51 | def _get_features(self, input_ids=None, attention_mask=None, token_type_ids=None, 52 | position_ids=None, head_mask=None, inputs_embeds=None): 53 | outputs = self.bert(input_ids, 54 | attention_mask=attention_mask, 55 | token_type_ids=token_type_ids, 56 | position_ids=position_ids, 57 | head_mask=head_mask, 58 | inputs_embeds=inputs_embeds) 59 | 60 | sequence_output = outputs[0] 61 | 62 | sequence_output = self.dropout(sequence_output) 63 | feats = self.classifier(sequence_output) 64 | return feats, outputs 65 | 66 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, 67 | position_ids=None, head_mask=None, inputs_embeds=None, labels=None, pad_token_label_id=None): 68 | 69 | logits, outputs = self._get_features(input_ids, attention_mask, token_type_ids, 70 | position_ids, head_mask, inputs_embeds) 71 | 72 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 73 | if labels is not None: 74 | # loss_fct = nn.CrossEntropyLoss() 75 | pad_mask = (labels != pad_token_label_id) 76 | 77 | # Only keep active parts of the loss 78 | if attention_mask is not None: 79 | # active_loss = attention_mask.view(-1) == 1 80 | # active_logits = logits.view(-1, self.num_labels)[active_loss] 81 | # active_labels = labels.view(-1)[active_loss] 82 | loss_mask = ((attention_mask == 1) & pad_mask) 83 | else: 84 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 85 | loss_mask = ((torch.ones(logits.shape) == 1) & pad_mask) 86 | 87 | crf_labels, crf_mask = to_crf_pad(labels, loss_mask, pad_token_label_id) 88 | crf_logits, _ = to_crf_pad(logits, loss_mask, pad_token_label_id) 89 | 90 | loss = self.crf.neg_log_likelihood(crf_logits, crf_mask, crf_labels) 91 | # removing mask stuff from the output path is done later in my_crf_ner but it should be kept away 92 | # when calculating loss 93 | best_path = self.crf(crf_logits, crf_mask) # (torch.ones(logits.shape) == 1) 94 | best_path = unpad_crf(best_path, crf_mask, labels, pad_mask) 95 | outputs = (loss,) + outputs + (best_path,) 96 | else: 97 | # removing mask stuff from the output path is done later in my_crf_ner but it should be kept away 98 | # when calculating loss 99 | if attention_mask is not None: 100 | mask = (attention_mask == 1) # & (labels!=-100)) 101 | else: 102 | mask = torch.ones(logits.shape).bool() # (labels!=-100) 103 | crf_logits, crf_mask = to_crf_pad(logits, mask, pad_token_label_id) 104 | crf_mask = crf_mask.sum(axis=2) == crf_mask.shape[2] 105 | best_path = 
self.crf(crf_logits, crf_mask) 106 | temp_labels = torch.ones(mask.shape) * pad_token_label_id 107 | best_path = unpad_crf(best_path, crf_mask, temp_labels, mask) 108 | outputs = outputs + (best_path,) 109 | 110 | return outputs 111 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-ACE/crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-12-04 23:19:38 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-05-27 22:48:17 6 | # Modified from original https://github.com/jiesutd/LatticeLSTM/blob/master/model/crf.py 7 | 8 | import torch 9 | import torch.autograd as autograd 10 | import torch.nn as nn 11 | 12 | 13 | # Compute log sum exp in a numerically stable way for the forward algorithm 14 | def log_sum_exp(vec, m_size): 15 | """ 16 | calculate log of exp sum 17 | args: 18 | vec (batch_size, vanishing_dim, hidden_dim) : input tensor 19 | m_size : hidden_dim 20 | return: 21 | batch_size, hidden_dim 22 | """ 23 | _, idx = torch.max(vec, 1) # B * 1 * M 24 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 25 | return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, 26 | m_size) # B * M 27 | 28 | 29 | class CRF(nn.Module): 30 | 31 | def __init__(self, tagset_size, use_gpu=False): # average_batch=False, 32 | super(CRF, self).__init__() 33 | print("build CRF...") 34 | # self.average_batch = average_batch 35 | self.gpu = use_gpu 36 | 37 | self.START_TAG = -2 38 | self.STOP_TAG = -1 39 | 40 | self.tagset_size = tagset_size 41 | 42 | # # We add 2 here, because of START_TAG and STOP_TAG 43 | # self.hidden2tag = nn.Linear(params['hidden_dim'], self.tagset_size + 2) 44 | # # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag 45 | init_transitions = torch.zeros(self.tagset_size + 2, self.tagset_size + 2) 46 | init_transitions[:, self.START_TAG] = -10000.0 47 | init_transitions[self.STOP_TAG, :] = -10000.0 48 | if torch.cuda.is_available(): 49 | init_transitions = init_transitions.cuda() 50 | self.transitions = nn.Parameter(init_transitions, requires_grad=True) 51 | 52 | def init_hidden_cell(self, batch_size, layer_hidden_dim): 53 | return (torch.randn(2, batch_size, layer_hidden_dim // 2), 54 | torch.randn(2, batch_size, layer_hidden_dim // 2)) 55 | 56 | def _calculate_PZ(self, feats, mask): 57 | """ 58 | input: 59 | feats: (batch, seq_len, self.tag_size+2) 60 | masks: (batch, seq_len) 61 | """ 62 | batch_size = feats.size(0) 63 | seq_len = feats.size(1) 64 | tag_size = feats.size(2) 65 | # print feats.view(seq_len, tag_size) 66 | assert (tag_size == self.tagset_size + 2) 67 | mask = mask.transpose(1, 0).contiguous() 68 | ins_num = seq_len * batch_size 69 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 70 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 71 | ## need to consider start 72 | scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 73 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 74 | # build iter 75 | seq_iter = enumerate(scores) 76 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 77 | # only need start from start_tag 78 | partition = inivalues[:, self.START_TAG, 
:].clone().view(batch_size, tag_size, 1) # bat_size * to_target_size 79 | 80 | ## add start score (from start to all tag, duplicate to batch_size) 81 | # partition = partition + self.transitions[START_TAG,:].view(1, tag_size, 1).expand(batch_size, tag_size, 1) 82 | # iter over last scores 83 | for idx, cur_values in seq_iter: 84 | # previous to_target is current from_target 85 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 86 | # cur_values: bat_size * from_target * to_target 87 | 88 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, 89 | tag_size) 90 | cur_partition = log_sum_exp(cur_values, tag_size) 91 | # print cur_partition.data 92 | 93 | # (bat_size * from_target * to_target) -> (bat_size * to_target) 94 | # partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1) 95 | mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size) 96 | 97 | ## effective updated partition part, only keep the partition value of mask value = 1 98 | masked_cur_partition = cur_partition.masked_select(mask_idx) 99 | ## let mask_idx broadcastable, to disable warning 100 | mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1) 101 | 102 | ## replace the partition where the maskvalue=1, other partition value keeps the same 103 | partition.masked_scatter_(mask_idx, masked_cur_partition) 104 | # until the last state, add transition score for all partition (and do log_sum_exp) then select the value in STOP_TAG 105 | cur_values = self.transitions.view(1, tag_size, tag_size).expand(batch_size, tag_size, 106 | tag_size) + partition.contiguous().view( 107 | batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 108 | 109 | cur_partition = log_sum_exp(cur_values, tag_size) 110 | final_partition = cur_partition[:, self.STOP_TAG] 111 | return final_partition.sum(), scores 112 | 113 | def _viterbi_decode(self, feats, mask): 114 | """ 115 | input: 116 | feats: (batch, seq_len, self.tag_size+2) 117 | mask: (batch, seq_len) 118 | output: 119 | decode_idx: (batch, seq_len) decoded sequence 120 | path_score: (batch, 1) corresponding score for each sequence (to be implementated) 121 | """ 122 | batch_size = feats.size(0) 123 | seq_len = feats.size(1) 124 | tag_size = feats.size(2) 125 | assert (tag_size == self.tagset_size + 2) 126 | ## calculate sentence length for each sentence 127 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 128 | ## mask to (seq_len, batch_size) 129 | mask = mask.transpose(1, 0).contiguous() 130 | ins_num = seq_len * batch_size 131 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 132 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 133 | ## need to consider start 134 | scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 135 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 136 | 137 | # build iter 138 | seq_iter = enumerate(scores) 139 | ## record the position of best score 140 | back_points = list() 141 | partition_history = list() 142 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 143 | # mask = 1 + (-1)*mask 144 | # mask = (1 - mask.long()).byte() 145 | mask = ~(mask) 146 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 147 | # only need start from 
start_tag 148 | partition = inivalues[:, self.START_TAG, :].clone().view(batch_size, tag_size) # bat_size * to_target_size 149 | # print "init part:",partition.size() 150 | partition_history.append(partition) 151 | # iter over last scores 152 | for idx, cur_values in seq_iter: 153 | # previous to_target is current from_target 154 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 155 | # cur_values: batch_size * from_target * to_target 156 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, 157 | tag_size) 158 | ## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG 159 | # print "cur value:", cur_values.size() 160 | partition, cur_bp = torch.max(cur_values, 1) 161 | # print "partsize:",partition.size() 162 | # exit(0) 163 | # print partition 164 | # print cur_bp 165 | # print "one best, ",idx 166 | partition_history.append(partition) 167 | ## cur_bp: (batch_size, tag_size) max source score position in current tag 168 | ## set padded label as 0, which will be filtered in post processing 169 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0) 170 | back_points.append(cur_bp) 171 | # exit(0) 172 | ### add score to final STOP_TAG 173 | partition_history = torch.cat(partition_history, 0).view(seq_len, batch_size, -1).transpose(1, 174 | 0).contiguous() ## (batch_size, seq_len. tag_size) 175 | ### get the last position for each setences, and select the last partitions using gather() 176 | last_position = length_mask.view(batch_size, 1, 1).expand(batch_size, 1, tag_size) - 1 177 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, 1) 178 | ### calculate the score from last partition to end state (and then select the STOP_TAG from it) 179 | last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(1, tag_size, 180 | tag_size).expand( 181 | batch_size, tag_size, tag_size) 182 | _, last_bp = torch.max(last_values, 1) 183 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long() 184 | if torch.cuda.is_available(): 185 | pad_zero = pad_zero.cuda() 186 | back_points.append(pad_zero) 187 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size) 188 | 189 | ## select end ids in STOP_TAG 190 | pointer = last_bp[:, self.STOP_TAG] 191 | insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, tag_size) 192 | back_points = back_points.transpose(1, 0).contiguous() 193 | ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values 194 | # print "lp:",last_position 195 | # print "il:",insert_last 196 | back_points.scatter_(1, last_position, insert_last) 197 | # print "bp:",back_points 198 | # exit(0) 199 | back_points = back_points.transpose(1, 0).contiguous() 200 | ## decode from the end, padded position ids are 0, which will be filtered if following evaluation 201 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size)) 202 | if torch.cuda.is_available(): 203 | decode_idx = decode_idx.cuda() 204 | decode_idx[-1] = pointer.data # detach() 205 | for idx in range(len(back_points) - 2, -1, -1): 206 | pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size, 1)) 207 | decode_idx[idx] = pointer.data.t() # feili pointer.detach().view(batch_size) 208 | path_score = None 209 | decode_idx = decode_idx.transpose(1, 0) 210 | return path_score, 
decode_idx 211 | 212 | def forward(self, feats, mask): 213 | path_score, best_path = self._viterbi_decode(feats, mask) 214 | # return path_score, best_path 215 | return best_path 216 | 217 | def _score_sentence(self, scores, mask, tags): 218 | """ 219 | input: 220 | scores: variable (seq_len, batch, tag_size, tag_size) 221 | mask: (batch, seq_len) 222 | tags: tensor (batch, seq_len) 223 | output: 224 | score: sum of score for gold sequences within whole batch 225 | """ 226 | # Gives the score of a provided tag sequence 227 | batch_size = scores.size(1) 228 | seq_len = scores.size(0) 229 | tag_size = scores.size(2) 230 | ## convert tag value into a new format, recorded label bigram information to index 231 | new_tags = autograd.Variable(torch.LongTensor(batch_size, seq_len)) 232 | if torch.cuda.is_available(): 233 | new_tags = new_tags.cuda() 234 | for idx in range(seq_len): 235 | if idx == 0: 236 | ## start -> first score 237 | new_tags[:, 0] = (tag_size - 2) * tag_size + tags[:, 0] 238 | 239 | else: 240 | new_tags[:, idx] = tags[:, idx - 1] * tag_size + tags[:, idx] 241 | 242 | ## transition for label to STOP_TAG 243 | end_transition = self.transitions[:, self.STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size) 244 | ## length for batch, last word position = length - 1 245 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 246 | ## index the label id of last word 247 | end_ids = torch.gather(tags, 1, length_mask - 1) 248 | 249 | ## index the transition score for end_id to STOP_TAG 250 | end_energy = torch.gather(end_transition, 1, end_ids) 251 | 252 | ## convert tag as (seq_len, batch_size, 1) 253 | new_tags = new_tags.transpose(1, 0).contiguous().view(seq_len, batch_size, 1) 254 | ### need convert tags id to search from 400 positions of scores 255 | tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(seq_len, 256 | batch_size) # seq_len * bat_size 257 | ## mask transpose to (seq_len, batch_size) 258 | tg_energy = tg_energy.masked_select(mask.transpose(1, 0)) 259 | 260 | # ## calculate the score from START_TAG to first label 261 | # start_transition = self.transitions[START_TAG,:].view(1, tag_size).expand(batch_size, tag_size) 262 | # start_energy = torch.gather(start_transition, 1, tags[0,:]) 263 | 264 | ## add all score together 265 | # gold_score = start_energy.sum() + tg_energy.sum() + end_energy.sum() 266 | gold_score = tg_energy.sum() + end_energy.sum() 267 | return gold_score 268 | 269 | def neg_log_likelihood(self, feats, mask, tags): 270 | # negative log likelihood 271 | forward_score, scores = self._calculate_PZ(feats, mask) 272 | # print('Forward', forward_score) 273 | gold_score = self._score_sentence(scores, mask, tags) 274 | # print('Gold', gold_score) 275 | # print "batch, f:", forward_score.data[0], " g:", gold_score.data[0], " dis:", forward_score.data[0] - gold_score.data[0] 276 | # exit(0) 277 | # if self.average_batch: 278 | # return (forward_score - gold_score) / batch_size 279 | # else: 280 | return forward_score - gold_score 281 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-ACE/run_ACE.sh: -------------------------------------------------------------------------------- 1 | # --data_dir: path to the preprocessed ACE 2005 dataset 2 | # --output_dir: path to dump checkpoints 3 | python3 run_ACE.py \ 4 | --data_dir ../ACE05/ \ 5 | --model_type bertcrf \ 6 | --model_name_or_path bert-base-uncased \ 7 | --output_dir ./ACE \ 8 | --max_seq_length 128 \ 9 | --do_lower_case \ 10 | --per_gpu_train_batch_size 32 \ 11 | --per_gpu_eval_batch_size 32 \ 12 | --gradient_accumulation_steps 8 \ 13 | --learning_rate 5e-5 \ 14 | --num_train_epochs 10 \ 15 | --save_steps 36 \ 16 | --logging_steps 36 \ 17 | --seed 24 \ 18 | --do_train \ 19 | --do_eval \ 20 | --evaluate_during_training 21 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-ACE/utils_ACE.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Event detection CRF finetuning: utilities to work with ACE 2005 """ 17 | 18 | from __future__ import absolute_import, division, print_function 19 | import json 20 | import logging 21 | import os 22 | from io import open 23 | from transformers import XLMRobertaTokenizer, BertTokenizer, RobertaTokenizer 24 | 25 | from torch.nn.utils.rnn import pad_sequence 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class InputExample(object): 31 | """A single training/test example for token classification.""" 32 | 33 | def __init__(self, guid, words, labels): 34 | """Constructs an InputExample. 35 | 36 | Args: 37 | guid: Unique id for the example. 38 | words: list. The words of the sequence. 39 | labels: (Optional) list. The labels for each word of the sequence. This should be 40 | specified for train and dev examples, but not for test examples.
41 | """ 42 | self.guid = guid 43 | self.words = words 44 | self.labels = labels 45 | 46 | 47 | class InputFeatures(object): 48 | """A single set of features of data.""" 49 | 50 | def __init__(self, input_ids, input_mask, segment_ids, label_ids): 51 | self.input_ids = input_ids 52 | self.input_mask = input_mask 53 | self.segment_ids = segment_ids 54 | self.label_ids = label_ids 55 | 56 | 57 | def read_examples_from_file(data_dir, mode): 58 | file_path = os.path.join(data_dir, "{}.json".format(mode)) 59 | examples = [] 60 | data=json.load(open(file_path, "r")) 61 | words=[] 62 | labels=[] 63 | def getLabel(eT,is_start=False): 64 | if eT=='None': 65 | return 'O' 66 | if is_start: 67 | return "B-"+eT 68 | return "I-"+eT 69 | words=data[0]['tokens'] 70 | labels=['X' for i in range(0,len(words))] 71 | labels[data[0]['trigger_start']]=getLabel(data[0]['event_type'],True) 72 | for j in range(data[0]['trigger_start']+1,data[0]['trigger_end']+1): 73 | labels[j]=getLabel(data[0]['event_type']) 74 | for i in range(1,len(data)): 75 | if data[i]['tokens']!=data[i-1]['tokens']: 76 | examples.append(InputExample(guid="%s-%d"%(mode,i), 77 | words=words, 78 | labels=labels)) 79 | words=data[i]['tokens'] 80 | labels=['X' for i in range(0,len(words))] 81 | labels[data[i]['trigger_start']]=getLabel(data[i]['event_type'],True) 82 | for j in range(data[i]['trigger_start']+1,data[i]['trigger_end']+1): 83 | labels[j]=getLabel(data[i]['event_type']) 84 | examples.append(InputExample(guid="%s-%d"%(mode,len(data)),words=words,labels=labels)) 85 | return examples 86 | 87 | 88 | def convert_examples_to_features(examples, 89 | label_list, 90 | max_seq_length, 91 | tokenizer, 92 | cls_token_at_end=False, 93 | cls_token="[CLS]", 94 | cls_token_segment_id=1, 95 | sep_token="[SEP]", 96 | sep_token_extra=False, 97 | pad_on_left=False, 98 | pad_token=0, 99 | pad_token_segment_id=0, 100 | pad_token_label_id=-100, 101 | sequence_a_segment_id=0, 102 | mask_padding_with_zero=True, 103 | model_name=None): 104 | """ Loads a data file into a list of `InputBatch`s 105 | `cls_token_at_end` define the location of the CLS token: 106 | - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] 107 | - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] 108 | `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) 109 | """ 110 | 111 | label_map = {label: i for i, label in enumerate(label_list)} 112 | 113 | # my logic in crf_padding requires this check. I create mask for crf by labels==pad_token_label_id to not include it 114 | # in loss and decoding 115 | assert pad_token_label_id not in label_map.values() 116 | 117 | features = [] 118 | for (ex_index, example) in enumerate(examples): 119 | if ex_index % 10000 == 0: 120 | print("###############") 121 | logger.info("Writing example %d of %d", ex_index, len(examples)) 122 | print("###############") 123 | 124 | tokens = [] 125 | label_ids = [] 126 | for word, label in zip(example.words, example.labels): 127 | word_tokens = tokenizer.tokenize(word) 128 | tokens.extend(word_tokens) 129 | # Use the real label id for the first token of the word, and padding ids for the remaining tokens 130 | if label!='X': 131 | label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) 132 | else: 133 | label_ids.extend([pad_token_label_id] + [pad_token_label_id] * (len(word_tokens) - 1)) 134 | 135 | # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. 
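# (Tokens and label_ids are truncated together below so the word-level labels stay aligned with their subword pieces.)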
136 | special_tokens_count = 3 if sep_token_extra else 2 137 | if len(tokens) > max_seq_length - special_tokens_count: 138 | tokens = tokens[:(max_seq_length - special_tokens_count)] 139 | label_ids = label_ids[:(max_seq_length - special_tokens_count)] 140 | 141 | # The convention in BERT is: 142 | # (a) For sequence pairs: 143 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 144 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 145 | # (b) For single sequences: 146 | # tokens: [CLS] the dog is hairy . [SEP] 147 | # type_ids: 0 0 0 0 0 0 0 148 | # 149 | # Where "type_ids" are used to indicate whether this is the first 150 | # sequence or the second sequence. The embedding vectors for `type=0` and 151 | # `type=1` were learned during pre-training and are added to the wordpiece 152 | # embedding vector (and position vector). This is not *strictly* necessary 153 | # since the [SEP] token unambiguously separates the sequences, but it makes 154 | # it easier for the model to learn the concept of sequences. 155 | # 156 | # For classification tasks, the first vector (corresponding to [CLS]) is 157 | # used as as the "sentence vector". Note that this only makes sense because 158 | # the entire model is fine-tuned. 159 | tokens += [sep_token] 160 | label_ids += [pad_token_label_id] # [label_map["X"]] 161 | if sep_token_extra: 162 | # roberta uses an extra separator b/w pairs of sentences 163 | tokens += [sep_token] 164 | label_ids += [pad_token_label_id] 165 | segment_ids = [sequence_a_segment_id] * len(tokens) 166 | 167 | if cls_token_at_end: 168 | tokens += [cls_token] 169 | label_ids += [pad_token_label_id] 170 | segment_ids += [cls_token_segment_id] 171 | else: 172 | tokens = [cls_token] + tokens 173 | label_ids = [pad_token_label_id] + label_ids 174 | segment_ids = [cls_token_segment_id] + segment_ids 175 | 176 | if model_name: 177 | if model_name == 'xlm-roberta-base': 178 | tokenizer = XLMRobertaTokenizer.from_pretrained(model_name) 179 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 180 | elif model_name.startswith('bert'): 181 | tokenizer = BertTokenizer.from_pretrained(model_name) 182 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 183 | elif model_name == 'roberta': 184 | tokenizer = RobertaTokenizer.from_pretrained(model_name) 185 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 186 | else: 187 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 188 | 189 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 190 | # tokens are attended to. 191 | input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 192 | 193 | # Zero-pad up to the sequence length. 
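# Padding positions receive pad_token_label_id, so they are later excluded from the CRF loss and decoding via the (labels != pad_token_label_id) mask in bert_crf.py.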
194 | padding_length = max_seq_length - len(input_ids) 195 | if pad_on_left: 196 | input_ids = ([pad_token] * padding_length) + input_ids 197 | input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask 198 | segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids 199 | label_ids = ([pad_token_label_id] * padding_length) + label_ids 200 | else: 201 | input_ids += ([pad_token] * padding_length) 202 | input_mask += ([0 if mask_padding_with_zero else 1] * padding_length) 203 | segment_ids += ([pad_token_segment_id] * padding_length) 204 | label_ids += ([pad_token_label_id] * padding_length) 205 | 206 | assert len(input_ids) == max_seq_length 207 | assert len(input_mask) == max_seq_length 208 | assert len(segment_ids) == max_seq_length 209 | assert len(label_ids) == max_seq_length 210 | 211 | if ex_index < 0: 212 | logger.info("*** Example ***") 213 | logger.info("guid: %s", example.guid) 214 | logger.info("tokens: %s", " ".join([str(x) for x in tokens])) 215 | logger.info("input_ids: %s", " ".join([str(x) for x in input_ids])) 216 | logger.info("input_mask: %s", " ".join([str(x) for x in input_mask])) 217 | logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) 218 | logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) 219 | 220 | features.append( 221 | InputFeatures(input_ids=input_ids, 222 | input_mask=input_mask, 223 | segment_ids=segment_ids, 224 | label_ids=label_ids)) 225 | return features 226 | 227 | 228 | def get_labels(path): 229 | return ["O", "B-Attack", "I-Attack", "B-Transport", "I-Transport", "B-Die", "I-Die", "B-End-Position", "I-End-Position", "B-Meet", "I-Meet", "B-Phone-Write", "I-Phone-Write", "B-Elect", "I-Elect", "B-Injure", "I-Injure", "B-Transfer-Ownership", "I-Transfer-Ownership", "B-Start-Org", "I-Start-Org", "B-Transfer-Money", "I-Transfer-Money", "B-Sue", "I-Sue", "B-Demonstrate", "I-Demonstrate", "B-Arrest-Jail", "I-Arrest-Jail", "B-Start-Position", "I-Start-Position", "B-Be-Born", "I-Be-Born", "B-End-Org", "I-End-Org", "B-Execute", "I-Execute", "B-Nominate", "I-Nominate", "B-Fine", "I-Fine", "B-Trial-Hearing", "I-Trial-Hearing", "B-Marry", "I-Marry", "B-Charge-Indict", "I-Charge-Indict", "B-Sentence", "I-Sentence", "B-Convict", "I-Convict", "B-Appeal", "I-Appeal", "B-Declare-Bankruptcy", "I-Declare-Bankruptcy", "B-Merge-Org", "I-Merge-Org", "B-Release-Parole", "I-Release-Parole", "B-Pardon", "I-Pardon", "B-Extradite", "I-Extradite", "B-Divorce", "I-Divorce", "B-Acquit", "I-Acquit"] 230 | 231 | def to_crf_pad(org_array, org_mask, pad_label_id): 232 | crf_array = [aa[bb] for aa, bb in zip(org_array, org_mask)] 233 | crf_array = pad_sequence(crf_array, batch_first=True, padding_value=pad_label_id) 234 | crf_pad = (crf_array != pad_label_id) 235 | # the viterbi decoder function in CRF makes use of multiplicative property of 0, then pads wrong numbers out. 236 | # Need a*0 = 0 for CRF to work. 
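# to_crf_pad first compacts each row down to only its masked-in positions, then re-pads with pad_label_id; the zeroing below keeps those padded slots from contributing to the Viterbi scores.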
237 | crf_array[~crf_pad] = 0 238 | return crf_array, crf_pad 239 | 240 | 241 | def unpad_crf(returned_array, returned_mask, org_array, org_mask): 242 | out_array = org_array.clone().detach() 243 | out_array[org_mask] = returned_array[returned_mask] 244 | return out_array -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-MAVEN/bert_crf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from transformers.modeling_bert import BertModel, BertPreTrainedModel 3 | from transformers.configuration_bert import BertConfig 4 | 5 | from crf import * 6 | from utils_maven import to_crf_pad, unpad_crf 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 11 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", 12 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", 13 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", 14 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", 15 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", 16 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", 17 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", 18 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", 19 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", 20 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", 21 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", 22 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", 23 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", 24 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", 25 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", 26 | 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", 27 | 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", 28 | 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", 29 | 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", 30 | 'bert-base-finnish-cased-v1': 
"https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", 31 | 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", 32 | } 33 | 34 | 35 | class BertCRFForTokenClassification(BertPreTrainedModel): 36 | config_class = BertConfig 37 | pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP 38 | base_model_prefix = "bert" 39 | 40 | def __init__(self, config): 41 | super(BertCRFForTokenClassification, self).__init__(config) 42 | self.num_labels = config.num_labels 43 | 44 | self.bert = BertModel(config) 45 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 46 | self.classifier = nn.Linear(config.hidden_size, config.num_labels + 2) 47 | self.crf = CRF(self.num_labels) 48 | 49 | self.init_weights() 50 | 51 | def _get_features(self, input_ids=None, attention_mask=None, token_type_ids=None, 52 | position_ids=None, head_mask=None, inputs_embeds=None): 53 | outputs = self.bert(input_ids, 54 | attention_mask=attention_mask, 55 | token_type_ids=token_type_ids, 56 | position_ids=position_ids, 57 | head_mask=head_mask, 58 | inputs_embeds=inputs_embeds) 59 | 60 | sequence_output = outputs[0] 61 | 62 | sequence_output = self.dropout(sequence_output) 63 | feats = self.classifier(sequence_output) 64 | return feats, outputs 65 | 66 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, 67 | position_ids=None, head_mask=None, inputs_embeds=None, labels=None, pad_token_label_id=None): 68 | 69 | logits, outputs = self._get_features(input_ids, attention_mask, token_type_ids, 70 | position_ids, head_mask, inputs_embeds) 71 | 72 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 73 | if labels is not None: 74 | # loss_fct = nn.CrossEntropyLoss() 75 | pad_mask = (labels != pad_token_label_id) 76 | 77 | # Only keep active parts of the loss 78 | if attention_mask is not None: 79 | # active_loss = attention_mask.view(-1) == 1 80 | # active_logits = logits.view(-1, self.num_labels)[active_loss] 81 | # active_labels = labels.view(-1)[active_loss] 82 | loss_mask = ((attention_mask == 1) & pad_mask) 83 | else: 84 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 85 | loss_mask = ((torch.ones(logits.shape) == 1) & pad_mask) 86 | 87 | crf_labels, crf_mask = to_crf_pad(labels, loss_mask, pad_token_label_id) 88 | crf_logits, _ = to_crf_pad(logits, loss_mask, pad_token_label_id) 89 | 90 | loss = self.crf.neg_log_likelihood(crf_logits, crf_mask, crf_labels) 91 | # removing mask stuff from the output path is done later in my_crf_ner but it should be kept away 92 | # when calculating loss 93 | best_path = self.crf(crf_logits, crf_mask) # (torch.ones(logits.shape) == 1) 94 | best_path = unpad_crf(best_path, crf_mask, labels, pad_mask) 95 | outputs = (loss,) + outputs + (best_path,) 96 | else: 97 | # removing mask stuff from the output path is done later in my_crf_ner but it should be kept away 98 | # when calculating loss 99 | if attention_mask is not None: 100 | mask = (attention_mask == 1) # & (labels!=-100)) 101 | else: 102 | mask = torch.ones(logits.shape).bool() # (labels!=-100) 103 | crf_logits, crf_mask = to_crf_pad(logits, mask, pad_token_label_id) 104 | crf_mask = crf_mask.sum(axis=2) == crf_mask.shape[2] 105 | best_path = self.crf(crf_logits, crf_mask) 106 | temp_labels = torch.ones(mask.shape) * pad_token_label_id 107 | best_path = unpad_crf(best_path, crf_mask, temp_labels, mask) 108 | 
outputs = outputs + (best_path,) 109 | 110 | return outputs 111 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-MAVEN/crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-12-04 23:19:38 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-05-27 22:48:17 6 | # Modified from original https://github.com/jiesutd/LatticeLSTM/blob/master/model/crf.py 7 | 8 | import torch 9 | import torch.autograd as autograd 10 | import torch.nn as nn 11 | 12 | 13 | # Compute log sum exp in a numerically stable way for the forward algorithm 14 | def log_sum_exp(vec, m_size): 15 | """ 16 | calculate log of exp sum 17 | args: 18 | vec (batch_size, vanishing_dim, hidden_dim) : input tensor 19 | m_size : hidden_dim 20 | return: 21 | batch_size, hidden_dim 22 | """ 23 | _, idx = torch.max(vec, 1) # B * 1 * M 24 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 25 | return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, 26 | m_size) # B * M 27 | 28 | 29 | class CRF(nn.Module): 30 | 31 | def __init__(self, tagset_size, use_gpu=False): # average_batch=False, 32 | super(CRF, self).__init__() 33 | print("build CRF...") 34 | # self.average_batch = average_batch 35 | self.gpu = use_gpu 36 | 37 | self.START_TAG = -2 38 | self.STOP_TAG = -1 39 | 40 | self.tagset_size = tagset_size 41 | 42 | # # We add 2 here, because of START_TAG and STOP_TAG 43 | # self.hidden2tag = nn.Linear(params['hidden_dim'], self.tagset_size + 2) 44 | # # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag 45 | init_transitions = torch.zeros(self.tagset_size + 2, self.tagset_size + 2) 46 | init_transitions[:, self.START_TAG] = -10000.0 47 | init_transitions[self.STOP_TAG, :] = -10000.0 48 | if torch.cuda.is_available(): 49 | init_transitions = init_transitions.cuda() 50 | self.transitions = nn.Parameter(init_transitions, requires_grad=True) 51 | 52 | def init_hidden_cell(self, batch_size, layer_hidden_dim): 53 | return (torch.randn(2, batch_size, layer_hidden_dim // 2), 54 | torch.randn(2, batch_size, layer_hidden_dim // 2)) 55 | 56 | def _calculate_PZ(self, feats, mask): 57 | """ 58 | input: 59 | feats: (batch, seq_len, self.tag_size+2) 60 | masks: (batch, seq_len) 61 | """ 62 | batch_size = feats.size(0) 63 | seq_len = feats.size(1) 64 | tag_size = feats.size(2) 65 | # print feats.view(seq_len, tag_size) 66 | assert (tag_size == self.tagset_size + 2) 67 | mask = mask.transpose(1, 0).contiguous() 68 | ins_num = seq_len * batch_size 69 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 70 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 71 | ## need to consider start 72 | scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 73 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 74 | # build iter 75 | seq_iter = enumerate(scores) 76 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 77 | # only need start from start_tag 78 | partition = inivalues[:, self.START_TAG, :].clone().view(batch_size, tag_size, 1) # bat_size * to_target_size 79 | 80 | ## add start score (from start to all tag, duplicate to batch_size) 81 | # partition = partition + 
self.transitions[START_TAG,:].view(1, tag_size, 1).expand(batch_size, tag_size, 1) 82 | # iter over last scores 83 | for idx, cur_values in seq_iter: 84 | # previous to_target is current from_target 85 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 86 | # cur_values: bat_size * from_target * to_target 87 | 88 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, 89 | tag_size) 90 | cur_partition = log_sum_exp(cur_values, tag_size) 91 | # print cur_partition.data 92 | 93 | # (bat_size * from_target * to_target) -> (bat_size * to_target) 94 | # partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1) 95 | mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size) 96 | 97 | ## effective updated partition part, only keep the partition value of mask value = 1 98 | masked_cur_partition = cur_partition.masked_select(mask_idx) 99 | ## let mask_idx broadcastable, to disable warning 100 | mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1) 101 | 102 | ## replace the partition where the maskvalue=1, other partition value keeps the same 103 | partition.masked_scatter_(mask_idx, masked_cur_partition) 104 | # until the last state, add transition score for all partition (and do log_sum_exp) then select the value in STOP_TAG 105 | cur_values = self.transitions.view(1, tag_size, tag_size).expand(batch_size, tag_size, 106 | tag_size) + partition.contiguous().view( 107 | batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 108 | 109 | cur_partition = log_sum_exp(cur_values, tag_size) 110 | final_partition = cur_partition[:, self.STOP_TAG] 111 | return final_partition.sum(), scores 112 | 113 | def _viterbi_decode(self, feats, mask): 114 | """ 115 | input: 116 | feats: (batch, seq_len, self.tag_size+2) 117 | mask: (batch, seq_len) 118 | output: 119 | decode_idx: (batch, seq_len) decoded sequence 120 | path_score: (batch, 1) corresponding score for each sequence (to be implementated) 121 | """ 122 | batch_size = feats.size(0) 123 | seq_len = feats.size(1) 124 | tag_size = feats.size(2) 125 | assert (tag_size == self.tagset_size + 2) 126 | ## calculate sentence length for each sentence 127 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 128 | ## mask to (seq_len, batch_size) 129 | mask = mask.transpose(1, 0).contiguous() 130 | ins_num = seq_len * batch_size 131 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 132 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 133 | ## need to consider start 134 | scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 135 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 136 | 137 | # build iter 138 | seq_iter = enumerate(scores) 139 | ## record the position of best score 140 | back_points = list() 141 | partition_history = list() 142 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 143 | # mask = 1 + (-1)*mask 144 | # mask = (1 - mask.long()).byte() 145 | mask = ~(mask) 146 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 147 | # only need start from start_tag 148 | partition = inivalues[:, self.START_TAG, :].clone().view(batch_size, tag_size) # bat_size * to_target_size 149 | # print "init part:",partition.size() 150 | 
partition_history.append(partition) 151 | # iterate over the remaining scores 152 | for idx, cur_values in seq_iter: 153 | # previous to_target is current from_target 154 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 155 | # cur_values: batch_size * from_target * to_target 156 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, 157 | tag_size) 158 | ## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG 159 | # print "cur value:", cur_values.size() 160 | partition, cur_bp = torch.max(cur_values, 1) 161 | # print "partsize:",partition.size() 162 | # exit(0) 163 | # print partition 164 | # print cur_bp 165 | # print "one best, ",idx 166 | partition_history.append(partition) 167 | ## cur_bp: (batch_size, tag_size) position of the max source score for the current tag 168 | ## set padded labels to 0, which will be filtered in post processing 169 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0) 170 | back_points.append(cur_bp) 171 | # exit(0) 172 | ### add score to final STOP_TAG 173 | partition_history = torch.cat(partition_history, 0).view(seq_len, batch_size, -1).transpose(1, 174 | 0).contiguous() ## (batch_size, seq_len, tag_size) 175 | ### get the last position for each sentence, and select the last partitions using gather() 176 | last_position = length_mask.view(batch_size, 1, 1).expand(batch_size, 1, tag_size) - 1 177 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, 1) 178 | ### calculate the score from the last partition to the end state (and then select the STOP_TAG from it) 179 | last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(1, tag_size, 180 | tag_size).expand( 181 | batch_size, tag_size, tag_size) 182 | _, last_bp = torch.max(last_values, 1) 183 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long() 184 | if torch.cuda.is_available(): 185 | pad_zero = pad_zero.cuda() 186 | back_points.append(pad_zero) 187 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size) 188 | 189 | ## select end ids at STOP_TAG 190 | pointer = last_bp[:, self.STOP_TAG] 191 | insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, tag_size) 192 | back_points = back_points.transpose(1, 0).contiguous() 193 | ## move the end ids (expanded to tag_size) to the corresponding positions of back_points to replace the 0 values 194 | # print "lp:",last_position 195 | # print "il:",insert_last 196 | back_points.scatter_(1, last_position, insert_last) 197 | # print "bp:",back_points 198 | # exit(0) 199 | back_points = back_points.transpose(1, 0).contiguous() 200 | ## decode from the end; padded position ids are 0, which will be filtered in the following evaluation 201 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size)) 202 | if torch.cuda.is_available(): 203 | decode_idx = decode_idx.cuda() 204 | decode_idx[-1] = pointer.data # detach() 205 | for idx in range(len(back_points) - 2, -1, -1): 206 | pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size, 1)) 207 | decode_idx[idx] = pointer.data.t() # pointer.detach().view(batch_size) 208 | path_score = None 209 | decode_idx = decode_idx.transpose(1, 0) 210 | return path_score, decode_idx 211 | 212 | def forward(self, feats, mask): 213 | path_score, best_path = self._viterbi_decode(feats, mask) 214 | # return path_score, best_path 215 | return 
best_path 216 | 217 | def _score_sentence(self, scores, mask, tags): 218 | """ 219 | input: 220 | scores: variable (seq_len, batch, tag_size, tag_size) 221 | mask: (batch, seq_len) 222 | tags: tensor (batch, seq_len) 223 | output: 224 | score: sum of scores for the gold sequences within the whole batch 225 | """ 226 | # Gives the score of a provided tag sequence 227 | batch_size = scores.size(1) 228 | seq_len = scores.size(0) 229 | tag_size = scores.size(2) 230 | ## convert tag values into a new format, encoding label bigram information into the index 231 | new_tags = autograd.Variable(torch.LongTensor(batch_size, seq_len)) 232 | if torch.cuda.is_available(): 233 | new_tags = new_tags.cuda() 234 | for idx in range(seq_len): 235 | if idx == 0: 236 | ## start -> first score 237 | new_tags[:, 0] = (tag_size - 2) * tag_size + tags[:, 0] 238 | 239 | else: 240 | new_tags[:, idx] = tags[:, idx - 1] * tag_size + tags[:, idx] 241 | 242 | ## transition for label to STOP_TAG 243 | end_transition = self.transitions[:, self.STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size) 244 | ## length for batch, last word position = length - 1 245 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 246 | ## index the label id of the last word 247 | end_ids = torch.gather(tags, 1, length_mask - 1) 248 | 249 | ## index the transition score for end_id to STOP_TAG 250 | end_energy = torch.gather(end_transition, 1, end_ids) 251 | 252 | ## convert tags to (seq_len, batch_size, 1) 253 | new_tags = new_tags.transpose(1, 0).contiguous().view(seq_len, batch_size, 1) 254 | ### need to convert tag ids to index into the tag_size * tag_size positions of scores 255 | tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(seq_len, 256 | batch_size) # seq_len * bat_size 257 | ## mask transpose to (seq_len, batch_size) 258 | tg_energy = tg_energy.masked_select(mask.transpose(1, 0)) 259 | 260 | # ## calculate the score from START_TAG to the first label 261 | # start_transition = self.transitions[START_TAG,:].view(1, tag_size).expand(batch_size, tag_size) 262 | # start_energy = torch.gather(start_transition, 1, tags[0,:]) 263 | 264 | ## add all scores together 265 | # gold_score = start_energy.sum() + tg_energy.sum() + end_energy.sum() 266 | gold_score = tg_energy.sum() + end_energy.sum() 267 | return gold_score 268 | 269 | def neg_log_likelihood(self, feats, mask, tags): 270 | # negative log likelihood 271 | forward_score, scores = self._calculate_PZ(feats, mask) 272 | # print('Forward', forward_score) 273 | gold_score = self._score_sentence(scores, mask, tags) 274 | # print('Gold', gold_score) 275 | # print "batch, f:", forward_score.data[0], " g:", gold_score.data[0], " dis:", forward_score.data[0] - gold_score.data[0] 276 | # exit(0) 277 | # if self.average_batch: 278 | # return (forward_score - gold_score) / batch_size 279 | # else: 280 | return forward_score - gold_score 281 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-MAVEN/run_MAVEN.sh: -------------------------------------------------------------------------------- 1 | python3 run_maven.py \ 2 | --data_dir ../maven/ \ #path to the raw MAVEN data files 3 | --model_type bertcrf \ 4 | --model_name_or_path bert-base-uncased \ 5 | --output_dir ./MAVEN \ #path to dump checkpoints 6 | --max_seq_length 128 \ 7 | --do_lower_case \ 8 | --per_gpu_train_batch_size 16 \ 9 | --per_gpu_eval_batch_size 16 \ 10 | --gradient_accumulation_steps 8 \ 11 | --learning_rate 5e-5 \ 12 | --num_train_epochs 5 \ 13 | 
--save_steps 100 \ 14 | --logging_steps 100 \ 15 | --seed 0 \ 16 | --do_train \ 17 | --do_eval \ 18 | --evaluate_during_training 19 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-MAVEN/run_MAVEN_infer.sh: -------------------------------------------------------------------------------- 1 | python3 run_maven.py \ 2 | --data_dir ../maven/ \ #path to the test data; remember to delete the cached files first (otherwise the test data may have been randomly shuffled before) 3 | --model_type bertcrf \ 4 | --model_name_or_path bert-base-uncased \ 5 | --output_dir ./MAVEN/checkpoint-1200 \ #path to the trained checkpoint, the results file will also be dumped here 6 | --max_seq_length 128 \ 7 | --do_lower_case \ 8 | --per_gpu_train_batch_size 16 \ 9 | --per_gpu_eval_batch_size 16 \ 10 | --gradient_accumulation_steps 8 \ 11 | --learning_rate 5e-5 \ 12 | --num_train_epochs 5 \ 13 | --save_steps 100 \ 14 | --logging_steps 100 \ 15 | --seed 0 \ 16 | --do_infer #add this flag to do inference only 17 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-MAVEN/utils_maven.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ BERT-CRF fine-tuning: utilities to work with MAVEN. """ 17 | 18 | from __future__ import absolute_import, division, print_function 19 | import json 20 | import logging 21 | import os 22 | from io import open 23 | from transformers import XLMRobertaTokenizer, BertTokenizer, RobertaTokenizer 24 | 25 | from torch.nn.utils.rnn import pad_sequence 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class InputExample(object): 31 | """A single training/test example for token classification.""" 32 | 33 | def __init__(self, guid, words, labels): 34 | """Constructs an InputExample. 35 | 36 | Args: 37 | guid: Unique id for the example. 38 | words: list. The words of the sequence. 39 | labels: (Optional) list. The labels for each word of the sequence. This should be 40 | specified for train and dev examples, but not for test examples. 
41 | """ 42 | self.guid = guid 43 | self.words = words 44 | self.labels = labels 45 | 46 | 47 | class InputFeatures(object): 48 | """A single set of features of data.""" 49 | 50 | def __init__(self, input_ids, input_mask, segment_ids, label_ids): 51 | self.input_ids = input_ids 52 | self.input_mask = input_mask 53 | self.segment_ids = segment_ids 54 | self.label_ids = label_ids 55 | 56 | 57 | def read_examples_from_file(data_dir, mode): 58 | file_path = os.path.join(data_dir, "{}.jsonl".format(mode)) 59 | examples = [] 60 | with open(file_path, "r") as fin: 61 | lines=fin.readlines() 62 | for line in lines: 63 | doc=json.loads(line) 64 | words=[] 65 | labels=[] 66 | for sent in doc['content']: 67 | words.append(sent['tokens']) 68 | labels.append(['O' for i in range(0,len(sent['tokens']))]) # default all tokens to 'O'; gold labels are filled in below 69 | if mode!='test': 70 | for event in doc['events']: 71 | for mention in event['mention']: 72 | labels[mention['sent_id']][mention['offset'][0]]="B-"+event['type'] 73 | for i in range(mention['offset'][0]+1,mention['offset'][1]): 74 | labels[mention['sent_id']][i]="I-"+event['type'] 75 | for mention in doc['negative_triggers']: 76 | labels[mention['sent_id']][mention['offset'][0]]="O" 77 | for i in range(mention['offset'][0]+1,mention['offset'][1]): 78 | labels[mention['sent_id']][i]="O" 79 | for i in range(0,len(words)): 80 | examples.append(InputExample(guid="%s-%s-%d"%(mode,doc['id'],i), 81 | words=words[i], 82 | labels=labels[i])) 83 | return examples 84 | 85 | 86 | def convert_examples_to_features(examples, 87 | label_list, 88 | max_seq_length, 89 | tokenizer, 90 | cls_token_at_end=False, 91 | cls_token="[CLS]", 92 | cls_token_segment_id=1, 93 | sep_token="[SEP]", 94 | sep_token_extra=False, 95 | pad_on_left=False, 96 | pad_token=0, 97 | pad_token_segment_id=0, 98 | pad_token_label_id=-100, 99 | sequence_a_segment_id=0, 100 | mask_padding_with_zero=True, 101 | model_name=None): 102 | """ Loads a data file into a list of `InputFeatures` 103 | `cls_token_at_end` defines the location of the CLS token: 104 | - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] 105 | - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] 106 | `cls_token_segment_id` defines the segment id associated with the CLS token (0 for BERT, 2 for XLNet) 107 | """ 108 | 109 | label_map = {label: i for i, label in enumerate(label_list)} 110 | 111 | # the logic in crf padding requires this check: the CRF mask is built from labels == pad_token_label_id, 112 | # so the pad id must not collide with a real label id (it is excluded from loss and decoding) 113 | assert pad_token_label_id not in label_map.values() 114 | 115 | features = [] 116 | for (ex_index, example) in enumerate(examples): 117 | if ex_index % 10000 == 0: 118 | print("###############") 119 | logger.info("Writing example %d of %d", ex_index, len(examples)) 120 | print("###############") 121 | 122 | tokens = [] 123 | label_ids = [] 124 | for word, label in zip(example.words, example.labels): 125 | word_tokens = tokenizer.tokenize(word) 126 | tokens.extend(word_tokens) 127 | # Use the real label id for the first token of the word, and padding ids for the remaining tokens 128 | label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) 129 | 130 | # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. 
131 | special_tokens_count = 3 if sep_token_extra else 2 132 | if len(tokens) > max_seq_length - special_tokens_count: 133 | tokens = tokens[:(max_seq_length - special_tokens_count)] 134 | label_ids = label_ids[:(max_seq_length - special_tokens_count)] 135 | 136 | # The convention in BERT is: 137 | # (a) For sequence pairs: 138 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 139 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 140 | # (b) For single sequences: 141 | # tokens: [CLS] the dog is hairy . [SEP] 142 | # type_ids: 0 0 0 0 0 0 0 143 | # 144 | # Where "type_ids" are used to indicate whether this is the first 145 | # sequence or the second sequence. The embedding vectors for `type=0` and 146 | # `type=1` were learned during pre-training and are added to the wordpiece 147 | # embedding vector (and position vector). This is not *strictly* necessary 148 | # since the [SEP] token unambiguously separates the sequences, but it makes 149 | # it easier for the model to learn the concept of sequences. 150 | # 151 | # For classification tasks, the first vector (corresponding to [CLS]) is 152 | # used as the "sentence vector". Note that this only makes sense because 153 | # the entire model is fine-tuned. 154 | tokens += [sep_token] 155 | label_ids += [pad_token_label_id] # [label_map["X"]] 156 | if sep_token_extra: 157 | # roberta uses an extra separator b/w pairs of sentences 158 | tokens += [sep_token] 159 | label_ids += [pad_token_label_id] 160 | segment_ids = [sequence_a_segment_id] * len(tokens) 161 | 162 | if cls_token_at_end: 163 | tokens += [cls_token] 164 | label_ids += [pad_token_label_id] 165 | segment_ids += [cls_token_segment_id] 166 | else: 167 | tokens = [cls_token] + tokens 168 | label_ids = [pad_token_label_id] + label_ids 169 | segment_ids = [cls_token_segment_id] + segment_ids 170 | 171 | if model_name: 172 | if model_name == 'xlm-roberta-base': 173 | tokenizer = XLMRobertaTokenizer.from_pretrained(model_name) 174 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 175 | elif model_name.startswith('bert'): 176 | tokenizer = BertTokenizer.from_pretrained(model_name) 177 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 178 | elif model_name == 'roberta': 179 | tokenizer = RobertaTokenizer.from_pretrained(model_name) 180 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 181 | else: 182 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 183 | 184 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 185 | # tokens are attended to. 186 | input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 187 | 188 | # Zero-pad up to the sequence length. 
189 | padding_length = max_seq_length - len(input_ids) 190 | if pad_on_left: 191 | input_ids = ([pad_token] * padding_length) + input_ids 192 | input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask 193 | segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids 194 | label_ids = ([pad_token_label_id] * padding_length) + label_ids 195 | else: 196 | input_ids += ([pad_token] * padding_length) 197 | input_mask += ([0 if mask_padding_with_zero else 1] * padding_length) 198 | segment_ids += ([pad_token_segment_id] * padding_length) 199 | label_ids += ([pad_token_label_id] * padding_length) 200 | 201 | assert len(input_ids) == max_seq_length 202 | assert len(input_mask) == max_seq_length 203 | assert len(segment_ids) == max_seq_length 204 | assert len(label_ids) == max_seq_length 205 | 206 | if ex_index < 0: 207 | logger.info("*** Example ***") 208 | logger.info("guid: %s", example.guid) 209 | logger.info("tokens: %s", " ".join([str(x) for x in tokens])) 210 | logger.info("input_ids: %s", " ".join([str(x) for x in input_ids])) 211 | logger.info("input_mask: %s", " ".join([str(x) for x in input_mask])) 212 | logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) 213 | logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) 214 | 215 | features.append( 216 | InputFeatures(input_ids=input_ids, 217 | input_mask=input_mask, 218 | segment_ids=segment_ids, 219 | label_ids=label_ids)) 220 | return features 221 | 222 | 223 | def get_labels(path): 224 | return ["O", "B-Know", "I-Know", "B-Warning", "I-Warning", "B-Catastrophe", "I-Catastrophe", "B-Placing", "I-Placing", "B-Causation", "I-Causation", "B-Arriving", "I-Arriving", "B-Sending", "I-Sending", "B-Protest", "I-Protest", "B-Preventing_or_letting", "I-Preventing_or_letting", "B-Motion", "I-Motion", "B-Damaging", "I-Damaging", "B-Destroying", "I-Destroying", "B-Death", "I-Death", "B-Perception_active", "I-Perception_active", "B-Presence", "I-Presence", "B-Influence", "I-Influence", "B-Receiving", "I-Receiving", "B-Check", "I-Check", "B-Hostile_encounter", "I-Hostile_encounter", "B-Killing", "I-Killing", "B-Conquering", "I-Conquering", "B-Releasing", "I-Releasing", "B-Attack", "I-Attack", "B-Earnings_and_losses", "I-Earnings_and_losses", "B-Choosing", "I-Choosing", "B-Traveling", "I-Traveling", "B-Recovering", "I-Recovering", "B-Using", "I-Using", "B-Coming_to_be", "I-Coming_to_be", "B-Cause_to_be_included", "I-Cause_to_be_included", "B-Process_start", "I-Process_start", "B-Change_event_time", "I-Change_event_time", "B-Reporting", "I-Reporting", "B-Bodily_harm", "I-Bodily_harm", "B-Suspicion", "I-Suspicion", "B-Statement", "I-Statement", "B-Cause_change_of_position_on_a_scale", "I-Cause_change_of_position_on_a_scale", "B-Coming_to_believe", "I-Coming_to_believe", "B-Expressing_publicly", "I-Expressing_publicly", "B-Request", "I-Request", "B-Control", "I-Control", "B-Supporting", "I-Supporting", "B-Defending", "I-Defending", "B-Building", "I-Building", "B-Military_operation", "I-Military_operation", "B-Self_motion", "I-Self_motion", "B-GetReady", "I-GetReady", "B-Forming_relationships", "I-Forming_relationships", "B-Becoming_a_member", "I-Becoming_a_member", "B-Action", "I-Action", "B-Removing", "I-Removing", "B-Surrendering", "I-Surrendering", "B-Agree_or_refuse_to_act", "I-Agree_or_refuse_to_act", "B-Participation", "I-Participation", "B-Deciding", "I-Deciding", "B-Education_teaching", "I-Education_teaching", "B-Emptying", "I-Emptying", "B-Getting", "I-Getting", "B-Besieging", 
"I-Besieging", "B-Creating", "I-Creating", "B-Process_end", "I-Process_end", "B-Body_movement", "I-Body_movement", "B-Expansion", "I-Expansion", "B-Telling", "I-Telling", "B-Change", "I-Change", "B-Legal_rulings", "I-Legal_rulings", "B-Bearing_arms", "I-Bearing_arms", "B-Giving", "I-Giving", "B-Name_conferral", "I-Name_conferral", "B-Arranging", "I-Arranging", "B-Use_firearm", "I-Use_firearm", "B-Committing_crime", "I-Committing_crime", "B-Assistance", "I-Assistance", "B-Surrounding", "I-Surrounding", "B-Quarreling", "I-Quarreling", "B-Expend_resource", "I-Expend_resource", "B-Motion_directional", "I-Motion_directional", "B-Bringing", "I-Bringing", "B-Communication", "I-Communication", "B-Containing", "I-Containing", "B-Manufacturing", "I-Manufacturing", "B-Social_event", "I-Social_event", "B-Robbery", "I-Robbery", "B-Competition", "I-Competition", "B-Writing", "I-Writing", "B-Rescuing", "I-Rescuing", "B-Judgment_communication", "I-Judgment_communication", "B-Change_tool", "I-Change_tool", "B-Hold", "I-Hold", "B-Being_in_operation", "I-Being_in_operation", "B-Recording", "I-Recording", "B-Carry_goods", "I-Carry_goods", "B-Cost", "I-Cost", "B-Departing", "I-Departing", "B-GiveUp", "I-GiveUp", "B-Change_of_leadership", "I-Change_of_leadership", "B-Escaping", "I-Escaping", "B-Aiming", "I-Aiming", "B-Hindering", "I-Hindering", "B-Preserving", "I-Preserving", "B-Create_artwork", "I-Create_artwork", "B-Openness", "I-Openness", "B-Connect", "I-Connect", "B-Reveal_secret", "I-Reveal_secret", "B-Response", "I-Response", "B-Scrutiny", "I-Scrutiny", "B-Lighting", "I-Lighting", "B-Criminal_investigation", "I-Criminal_investigation", "B-Hiding_objects", "I-Hiding_objects", "B-Confronting_problem", "I-Confronting_problem", "B-Renting", "I-Renting", "B-Breathing", "I-Breathing", "B-Patrolling", "I-Patrolling", "B-Arrest", "I-Arrest", "B-Convincing", "I-Convincing", "B-Commerce_sell", "I-Commerce_sell", "B-Cure", "I-Cure", "B-Temporary_stay", "I-Temporary_stay", "B-Dispersal", "I-Dispersal", "B-Collaboration", "I-Collaboration", "B-Extradition", "I-Extradition", "B-Change_sentiment", "I-Change_sentiment", "B-Commitment", "I-Commitment", "B-Commerce_pay", "I-Commerce_pay", "B-Filling", "I-Filling", "B-Becoming", "I-Becoming", "B-Achieve", "I-Achieve", "B-Practice", "I-Practice", "B-Cause_change_of_strength", "I-Cause_change_of_strength", "B-Supply", "I-Supply", "B-Cause_to_amalgamate", "I-Cause_to_amalgamate", "B-Scouring", "I-Scouring", "B-Violence", "I-Violence", "B-Reforming_a_system", "I-Reforming_a_system", "B-Come_together", "I-Come_together", "B-Wearing", "I-Wearing", "B-Cause_to_make_progress", "I-Cause_to_make_progress", "B-Legality", "I-Legality", "B-Employment", "I-Employment", "B-Rite", "I-Rite", "B-Publishing", "I-Publishing", "B-Adducing", "I-Adducing", "B-Exchange", "I-Exchange", "B-Ratification", "I-Ratification", "B-Sign_agreement", "I-Sign_agreement", "B-Commerce_buy", "I-Commerce_buy", "B-Imposing_obligation", "I-Imposing_obligation", "B-Rewards_and_punishments", "I-Rewards_and_punishments", "B-Institutionalization", "I-Institutionalization", "B-Testing", "I-Testing", "B-Ingestion", "I-Ingestion", "B-Labeling", "I-Labeling", "B-Kidnapping", "I-Kidnapping", "B-Submitting_documents", "I-Submitting_documents", "B-Prison", "I-Prison", "B-Justifying", "I-Justifying", "B-Emergency", "I-Emergency", "B-Terrorism", "I-Terrorism", "B-Vocalizations", "I-Vocalizations", "B-Risk", "I-Risk", "B-Resolve_problem", "I-Resolve_problem", "B-Revenge", "I-Revenge", "B-Limiting", "I-Limiting", "B-Research", 
"I-Research", "B-Having_or_lacking_access", "I-Having_or_lacking_access", "B-Theft", "I-Theft", "B-Incident", "I-Incident", "B-Award", "I-Award"] 225 | 226 | 227 | def to_crf_pad(org_array, org_mask, pad_label_id): 228 | crf_array = [aa[bb] for aa, bb in zip(org_array, org_mask)] 229 | crf_array = pad_sequence(crf_array, batch_first=True, padding_value=pad_label_id) 230 | crf_pad = (crf_array != pad_label_id) 231 | # the viterbi decoder function in CRF makes use of multiplicative property of 0, then pads wrong numbers out. 232 | # Need a*0 = 0 for CRF to work. 233 | crf_array[~crf_pad] = 0 234 | return crf_array, crf_pad 235 | 236 | 237 | def unpad_crf(returned_array, returned_mask, org_array, org_mask): 238 | out_array = org_array.clone().detach() 239 | out_array[org_mask] = returned_array[returned_mask] 240 | return out_array -------------------------------------------------------------------------------- /baselines/BERT+CRF/README.md: -------------------------------------------------------------------------------- 1 | # DMBERT 2 | This code is the implementation for BERT+CRF model. The implementations are based on [Huggingface's Transformers](https://github.com/huggingface/transformers) and the BERT+CRF implementations in [this repo](https://github.com/mezig351/transformers/tree/ner_crf/examples/ner). 3 | 4 | 5 | 6 | ## Requirements 7 | 8 | - python==3.6.9 9 | 10 | - torch==1.2.0 11 | 12 | - transformers==2.6.0 13 | 14 | - sklearn==0.20.2 15 | 16 | - seqeval 17 | 18 | 19 | 20 | ## Usage 21 | 22 | Hint: please read and delete all the comments after ```\``` in each line of the ```.sh``` scripts before running them. 23 | 24 | ### On MAVEN: 25 | The codes are in the ```BERT-CRF-MAVEN``` folder. 26 | 27 | 1. Download MAVEN data files. 28 | 2. Run ```run_MAVEN.sh``` for training and evaluation on the devlopment set. 29 | 3. Run ```run_MAVEN_infer.sh``` to get predictions on the test set (dumped to ```OUTPUT_PATH/results.jsonl```). 30 | 31 | See the two scripts for more details. 32 | 33 | ### On ACE 34 | The codes are in the ```BERT-CRF-ACE``` folder. 35 | 36 | 1. Preprocess ACE 2005 dataset as in [this repo](https://github.com/thunlp/HMEAE). 37 | 2. Run ``run_ACE.sh`` for training and evaluation. 38 | -------------------------------------------------------------------------------- /baselines/DMBERT/README.md: -------------------------------------------------------------------------------- 1 | # DMBERT 2 | This code is the implementation for [DMBERT](https://www.aclweb.org/anthology/N19-1105/) model. The implementations are based on [Huggingface's Transformers](https://github.com/huggingface/transformers), especially its example for the multiple-choice task. 3 | 4 | 5 | 6 | ## Requirements 7 | 8 | - python==3.6.9 9 | 10 | - torch==1.2.0 11 | 12 | - transformers==2.8.0 13 | 14 | - sklearn==0.20.2 15 | 16 | 17 | 18 | ## Usage 19 | 20 | Hint: please read and delete all the comments after ```\``` in each line of the ```.sh``` scripts before running them. 21 | 22 | ### On MAVEN: 23 | 24 | 1. Download MAVEN data files. 25 | 2. Run ```run_MAVEN.sh``` for training and evaluation on the devlopment set. 26 | 3. Run ```run_MAVEN_infer.sh``` to get predictions on the test set (dumped to ```results.jsonl```). 27 | 28 | See the two scripts for more details. 29 | 30 | ### On ACE 31 | 32 | 1. Preprocess ACE 2005 dataset as in [this repo](https://github.com/thunlp/HMEAE). 33 | 2. Run ``run_ACE.sh`` for training and evaluation. 
34 | -------------------------------------------------------------------------------- /baselines/DMBERT/get_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import json 5 | import numpy as np 6 | if __name__=='__main__': 7 | parser=argparse.ArgumentParser() 8 | parser.add_argument("--test_data",default="../maven/test.jsonl",help="path to the test data file",required=False) 9 | parser.add_argument("--preds",default="MAVEN/_preds.npy",help="path to the prediction file generated by the run_MAVEN_infer.sh script") 10 | parser.add_argument("--output",default="../maven/results.jsonl",help="path to the output file") 11 | args=parser.parse_args() 12 | preds=np.load(args.preds) 13 | fout=open(args.output,"w") 14 | with open(args.test_data,"r") as fin: 15 | lines=fin.readlines() 16 | Cnt=0 17 | for line in lines: 18 | data=json.loads(line) 19 | res={"id":data['id']} 20 | tmp=[] 21 | for mention in data['candidates']: 22 | tmp.append({"id":mention["id"],"type_id":int(preds[Cnt])}) 23 | Cnt+=1 24 | res["predictions"]=tmp 25 | fout.write(json.dumps(res)+"\n") 26 | assert Cnt==len(preds) 27 | fout.close() 28 | -------------------------------------------------------------------------------- /baselines/DMBERT/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn import CrossEntropyLoss 6 | from transformers import BertPreTrainedModel,BertModel 7 | 8 | class DMBERT(BertPreTrainedModel): 9 | def __init__(self,config): 10 | super().__init__(config) 11 | self.bert=BertModel(config) 12 | self.dropout=nn.Dropout(config.hidden_dropout_prob) 13 | self.maxpooling=nn.MaxPool1d(128) 14 | self.classifier=nn.Linear(config.hidden_size*2,config.num_labels) 15 | def forward(self,input_ids=None,attention_mask=None,token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, maskL=None, maskR=None, labels=None): 16 | batchSize=input_ids.size(0) 17 | outputs =self.bert( 18 | input_ids, 19 | attention_mask=attention_mask, 20 | token_type_ids=token_type_ids, 21 | position_ids=position_ids, 22 | head_mask=head_mask, 23 | inputs_embeds=inputs_embeds, 24 | ) 25 | conved=outputs[0] 26 | conved=conved.transpose(1,2) 27 | conved=conved.transpose(0,1) 28 | L=(conved*maskL).transpose(0,1) 29 | R=(conved*maskR).transpose(0,1) 30 | L=L+torch.ones_like(L) 31 | R=R+torch.ones_like(R) 32 | pooledL=self.maxpooling(L).contiguous().view(batchSize,self.config.hidden_size) 33 | pooledR=self.maxpooling(R).contiguous().view(batchSize,self.config.hidden_size) 34 | pooled=torch.cat((pooledL,pooledR),1) 35 | pooled=pooled-torch.ones_like(pooled) 36 | pooled=self.dropout(pooled) 37 | logits=self.classifier(pooled) 38 | reshaped_logits=logits.view(-1, self.config.num_labels) 39 | outputs = (reshaped_logits,) + outputs[2:] 40 | if labels is not None: 41 | loss_fct=CrossEntropyLoss() 42 | loss=loss_fct(reshaped_logits, labels) 43 | outputs=(loss,)+outputs 44 | return outputs 45 | -------------------------------------------------------------------------------- /baselines/DMBERT/run_ACE.sh: -------------------------------------------------------------------------------- 1 | python3 run_ee.py \ 2 | --data_dir ../ACE05/ \ # path to the preprocessed ACE 2005 dataset 3 | --model_type bert \ 4 | --model_name_or_path bert-base-uncased \ 5 | --task_name ace \ 6 | --output_dir 
./ACE \ # path to dump checkpoints 7 | --max_seq_length 128 \ 8 | --do_lower_case \ 9 | --per_gpu_train_batch_size 42 \ 10 | --per_gpu_eval_batch_size 42 \ 11 | --gradient_accumulation_steps 2 \ 12 | --learning_rate 5e-5 \ 13 | --num_train_epochs 10 \ 14 | --save_steps 300 \ 15 | --logging_steps 300 \ 16 | --seed 2 \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_test \ 20 | --evaluate_during_training 21 | -------------------------------------------------------------------------------- /baselines/DMBERT/run_MAVEN.sh: -------------------------------------------------------------------------------- 1 | python3 run_ee.py \ 2 | --data_dir ../maven/ \ #path to the raw MAVEN data files 3 | --model_type bert \ 4 | --model_name_or_path bert-base-uncased \ 5 | --task_name maven \ 6 | --output_dir ./MAVEN \ #path to dump checkpoints 7 | --max_seq_length 128 \ 8 | --do_lower_case \ 9 | --per_gpu_train_batch_size 42 \ 10 | --per_gpu_eval_batch_size 42 \ 11 | --gradient_accumulation_steps 3 \ 12 | --learning_rate 5e-5 \ 13 | --num_train_epochs 5 \ 14 | --save_steps 500 \ 15 | --logging_steps 500 \ 16 | --seed 42 \ 17 | --do_eval \ 18 | --do_train \ 19 | --evaluate_during_training -------------------------------------------------------------------------------- /baselines/DMBERT/run_MAVEN_infer.sh: -------------------------------------------------------------------------------- 1 | python3 run_ee.py \ 2 | --data_dir ../maven/ \ #path to the test data, remember to delete the cached files at first (otherwise the test data may be random shuffled before) 3 | --model_type bert \ 4 | --model_name_or_path ./MAVEN/checkpoint-2500 \ #path to the trained checkpoint 5 | --task_name maven_infer \ 6 | --output_dir ./MAVEN \ #output path 7 | --max_seq_length 128 \ 8 | --do_lower_case \ 9 | --per_gpu_train_batch_size 42 \ 10 | --per_gpu_eval_batch_size 42 \ 11 | --gradient_accumulation_steps 2 \ 12 | --learning_rate 5e-5 \ 13 | --num_train_epochs 5 \ 14 | --save_steps 500 \ 15 | --logging_steps 500 \ 16 | --seed 42 \ 17 | --do_infer #add this flag to do inference only 18 | python3 get_submission.py \ #convert the predictions to the submission format 19 | --test_data ../maven/test.jsonl \ #path to the test data file 20 | --preds MAVEN/checkpoint-2500/checkpoint-2500_preds.npy \ #path to the prediction file 21 | --output ./results.jsonl #output file 22 | -------------------------------------------------------------------------------- /baselines/DMBERT/utils_ee.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # Copyright 2020 Xiaozhi Wang 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | """ Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """ 18 | 19 | 20 | import csv 21 | import glob 22 | import json 23 | import logging 24 | import os 25 | from typing import List 26 | 27 | import tqdm 28 | 29 | from transformers import PreTrainedTokenizer 30 | 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | 35 | class InputExample(object): 36 | """A single training/test example for multiple choice""" 37 | 38 | def __init__(self, example_id, tokens, triggerL, triggerR, label=None): 39 | """Constructs a InputExample. 40 | 41 | Args: 42 | example_id: Unique id for the example. 43 | contexts: list of str. The untokenized text of the first sequence (context of corresponding question). 44 | question: string. The untokenized text of the second sequence (question). 45 | endings: list of str. multiple choice's options. Its length must be equal to contexts' length. 46 | label: (Optional) string. The label of the example. This should be 47 | specified for train and dev examples, but not for test examples. 48 | """ 49 | self.example_id = example_id 50 | self.tokens = tokens 51 | self.triggerL = triggerL 52 | self.triggerR = triggerR 53 | self.label = label 54 | 55 | 56 | class InputFeatures(object): 57 | def __init__(self, example_id, input_ids, input_mask, segment_ids, maskL, maskR, label): 58 | self.example_id = example_id 59 | self.input_ids = input_ids 60 | self.input_mask = input_mask 61 | self.segment_ids = segment_ids 62 | self.maskL = maskL 63 | self.maskR = maskR 64 | self.label = label 65 | 66 | 67 | class DataProcessor(object): 68 | """Base class for data converters for multiple choice data sets.""" 69 | 70 | def get_train_examples(self, data_dir): 71 | """Gets a collection of `InputExample`s for the train set.""" 72 | raise NotImplementedError() 73 | 74 | def get_dev_examples(self, data_dir): 75 | """Gets a collection of `InputExample`s for the dev set.""" 76 | raise NotImplementedError() 77 | 78 | def get_test_examples(self, data_dir): 79 | """Gets a collection of `InputExample`s for the test set.""" 80 | raise NotImplementedError() 81 | 82 | def get_labels(self): 83 | """Gets the list of labels for this data set.""" 84 | raise NotImplementedError() 85 | 86 | 87 | class ACEProcessor(DataProcessor): 88 | """Processor for the RACE data set.""" 89 | 90 | def get_train_examples(self, data_dir): 91 | """See base class.""" 92 | logger.info("LOOKING AT {} train".format(data_dir)) 93 | return self._create_examples(json.load(open(os.path.join(data_dir,'train.json'),"r")), "train") 94 | 95 | def get_dev_examples(self, data_dir): 96 | """See base class.""" 97 | logger.info("LOOKING AT {} dev".format(data_dir)) 98 | return self._create_examples(json.load(open(os.path.join(data_dir,'dev.json'),"r")), "dev") 99 | 100 | def get_test_examples(self, data_dir): 101 | """See base class.""" 102 | logger.info("LOOKING AT {} test".format(data_dir)) 103 | return self._create_examples(json.load(open(os.path.join(data_dir,'test.json'),"r")), "test") 104 | 105 | def get_labels(self): 106 | """See base class.""" 107 | return ['None', 'End-Position', 'Charge-Indict', 'Convict', 'Transfer-Ownership', 'Demonstrate', 'Transport', 'Sentence', 'Appeal', 'Start-Org', 'Start-Position', 'End-Org', 'Phone-Write', 'Nominate', 'Marry', 'Pardon', 'Release-Parole', 'Meet', 'Trial-Hearing', 'Extradite', 'Execute', 'Transfer-Money', 'Elect', 'Injure', 'Acquit', 'Divorce', 'Die', 'Arrest-Jail', 'Declare-Bankruptcy', 'Be-Born', 'Merge-Org', 'Fine', 'Sue', 'Attack'] 
108 | 109 | def _create_examples(self, lines, set_type): 110 | """Creates examples for the training, dev, and test sets.""" 111 | examples = [] 112 | for (idx, data_raw) in enumerate(lines): 113 | e_id = "%s-%s" % (set_type, idx) 114 | examples.append( 115 | InputExample( 116 | example_id=e_id, 117 | tokens=data_raw['tokens'], 118 | triggerL=data_raw['trigger_start'], 119 | triggerR=data_raw['trigger_end']+1, 120 | label=data_raw['event_type'], 121 | ) 122 | ) 123 | return examples 124 | 125 | class MAVENProcessor(DataProcessor): 126 | """Processor for the MAVEN data set.""" 127 | 128 | def get_train_examples(self, data_dir): 129 | """See base class.""" 130 | logger.info("LOOKING AT {} train".format(data_dir)) 131 | return self._create_examples(open(os.path.join(data_dir,'train.jsonl'),"r"), "train") 132 | 133 | def get_dev_examples(self, data_dir): 134 | """See base class.""" 135 | logger.info("LOOKING AT {} dev".format(data_dir)) 136 | return self._create_examples(open(os.path.join(data_dir,'valid.jsonl'),"r"), "dev") 137 | 138 | def get_test_examples(self, data_dir): 139 | """See base class.""" 140 | logger.info("LOOKING AT {} test".format(data_dir)) 141 | return self._create_examples(open(os.path.join(data_dir,'test.jsonl'),"r"), "test") 142 | 143 | def get_labels(self): 144 | """See base class.""" 145 | return ["None", "Know", "Warning", "Catastrophe", "Placing", "Causation", "Arriving", "Sending", "Protest", "Preventing_or_letting", "Motion", "Damaging", "Destroying", "Death", "Perception_active", "Presence", "Influence", "Receiving", "Check", "Hostile_encounter", "Killing", "Conquering", "Releasing", "Attack", "Earnings_and_losses", "Choosing", "Traveling", "Recovering", "Using", "Coming_to_be", "Cause_to_be_included", "Process_start", "Change_event_time", "Reporting", "Bodily_harm", "Suspicion", "Statement", "Cause_change_of_position_on_a_scale", "Coming_to_believe", "Expressing_publicly", "Request", "Control", "Supporting", "Defending", "Building", "Military_operation", "Self_motion", "GetReady", "Forming_relationships", "Becoming_a_member", "Action", "Removing", "Surrendering", "Agree_or_refuse_to_act", "Participation", "Deciding", "Education_teaching", "Emptying", "Getting", "Besieging", "Creating", "Process_end", "Body_movement", "Expansion", "Telling", "Change", "Legal_rulings", "Bearing_arms", "Giving", "Name_conferral", "Arranging", "Use_firearm", "Committing_crime", "Assistance", "Surrounding", "Quarreling", "Expend_resource", "Motion_directional", "Bringing", "Communication", "Containing", "Manufacturing", "Social_event", "Robbery", "Competition", "Writing", "Rescuing", "Judgment_communication", "Change_tool", "Hold", "Being_in_operation", "Recording", "Carry_goods", "Cost", "Departing", "GiveUp", "Change_of_leadership", "Escaping", "Aiming", "Hindering", "Preserving", "Create_artwork", "Openness", "Connect", "Reveal_secret", "Response", "Scrutiny", "Lighting", "Criminal_investigation", "Hiding_objects", "Confronting_problem", "Renting", "Breathing", "Patrolling", "Arrest", "Convincing", "Commerce_sell", "Cure", "Temporary_stay", "Dispersal", "Collaboration", "Extradition", "Change_sentiment", "Commitment", "Commerce_pay", "Filling", "Becoming", "Achieve", "Practice", "Cause_change_of_strength", "Supply", "Cause_to_amalgamate", "Scouring", "Violence", "Reforming_a_system", "Come_together", "Wearing", "Cause_to_make_progress", "Legality", "Employment", "Rite", "Publishing", "Adducing", "Exchange", "Ratification", "Sign_agreement", "Commerce_buy", "Imposing_obligation", 
"Rewards_and_punishments", "Institutionalization", "Testing", "Ingestion", "Labeling", "Kidnapping", "Submitting_documents", "Prison", "Justifying", "Emergency", "Terrorism", "Vocalizations", "Risk", "Resolve_problem", "Revenge", "Limiting", "Research", "Having_or_lacking_access", "Theft", "Incident", "Award"] 146 | def _create_examples(self, fin, set_type): 147 | """Creates examples for the training and dev sets.""" 148 | examples = [] 149 | lines=fin.readlines() 150 | for (_, data_raw) in enumerate(lines): 151 | data=json.loads(data_raw) 152 | for event in data['events']: 153 | if event['type']=='None of the above': 154 | print("?????????") 155 | for mention in event['mention']: 156 | e_id = "%s-%s" % (set_type, mention['id']) 157 | examples.append( 158 | InputExample( 159 | example_id=e_id, 160 | tokens=data['content'][mention['sent_id']]['tokens'], 161 | triggerL=mention['offset'][0], 162 | triggerR=mention['offset'][1], 163 | label=event['type'], 164 | ) 165 | ) 166 | for nIns in data['negative_triggers']: 167 | e_id = "%s-%s" % (set_type, nIns['id']) 168 | examples.append( 169 | InputExample( 170 | example_id=e_id, 171 | tokens=data['content'][nIns['sent_id']]['tokens'], 172 | triggerL=nIns['offset'][0], 173 | triggerR=nIns['offset'][1], 174 | label='None', 175 | ) 176 | ) 177 | 178 | return examples 179 | 180 | 181 | class MAVENInferProcessor(DataProcessor): 182 | """Processor for the RACE data set.""" 183 | 184 | def get_test_examples(self, data_dir): 185 | """See base class.""" 186 | logger.info("LOOKING AT {} test".format(data_dir)) 187 | return self._create_examples(open(os.path.join(data_dir,'test.jsonl'),"r"), "test") 188 | 189 | def get_labels(self): 190 | """See base class.""" 191 | return ["None", "Know", "Warning", "Catastrophe", "Placing", "Causation", "Arriving", "Sending", "Protest", "Preventing_or_letting", "Motion", "Damaging", "Destroying", "Death", "Perception_active", "Presence", "Influence", "Receiving", "Check", "Hostile_encounter", "Killing", "Conquering", "Releasing", "Attack", "Earnings_and_losses", "Choosing", "Traveling", "Recovering", "Using", "Coming_to_be", "Cause_to_be_included", "Process_start", "Change_event_time", "Reporting", "Bodily_harm", "Suspicion", "Statement", "Cause_change_of_position_on_a_scale", "Coming_to_believe", "Expressing_publicly", "Request", "Control", "Supporting", "Defending", "Building", "Military_operation", "Self_motion", "GetReady", "Forming_relationships", "Becoming_a_member", "Action", "Removing", "Surrendering", "Agree_or_refuse_to_act", "Participation", "Deciding", "Education_teaching", "Emptying", "Getting", "Besieging", "Creating", "Process_end", "Body_movement", "Expansion", "Telling", "Change", "Legal_rulings", "Bearing_arms", "Giving", "Name_conferral", "Arranging", "Use_firearm", "Committing_crime", "Assistance", "Surrounding", "Quarreling", "Expend_resource", "Motion_directional", "Bringing", "Communication", "Containing", "Manufacturing", "Social_event", "Robbery", "Competition", "Writing", "Rescuing", "Judgment_communication", "Change_tool", "Hold", "Being_in_operation", "Recording", "Carry_goods", "Cost", "Departing", "GiveUp", "Change_of_leadership", "Escaping", "Aiming", "Hindering", "Preserving", "Create_artwork", "Openness", "Connect", "Reveal_secret", "Response", "Scrutiny", "Lighting", "Criminal_investigation", "Hiding_objects", "Confronting_problem", "Renting", "Breathing", "Patrolling", "Arrest", "Convincing", "Commerce_sell", "Cure", "Temporary_stay", "Dispersal", "Collaboration", "Extradition", 
"Change_sentiment", "Commitment", "Commerce_pay", "Filling", "Becoming", "Achieve", "Practice", "Cause_change_of_strength", "Supply", "Cause_to_amalgamate", "Scouring", "Violence", "Reforming_a_system", "Come_together", "Wearing", "Cause_to_make_progress", "Legality", "Employment", "Rite", "Publishing", "Adducing", "Exchange", "Ratification", "Sign_agreement", "Commerce_buy", "Imposing_obligation", "Rewards_and_punishments", "Institutionalization", "Testing", "Ingestion", "Labeling", "Kidnapping", "Submitting_documents", "Prison", "Justifying", "Emergency", "Terrorism", "Vocalizations", "Risk", "Resolve_problem", "Revenge", "Limiting", "Research", "Having_or_lacking_access", "Theft", "Incident", "Award"] 192 | def _create_examples(self, fin, set_type): 193 | """Creates examples for the training and dev sets.""" 194 | examples = [] 195 | lines=fin.readlines() 196 | for (_, data_raw) in enumerate(lines): 197 | data=json.loads(data_raw) 198 | for mention in data['candidates']: 199 | e_id = "%s-%s" % (set_type, mention['id']) 200 | examples.append( 201 | InputExample( 202 | example_id=e_id, 203 | tokens=data['content'][mention['sent_id']]['tokens'], 204 | triggerL=mention['offset'][0], 205 | triggerR=mention['offset'][1], 206 | label='None', 207 | ) 208 | ) 209 | return examples 210 | 211 | def convert_examples_to_features( 212 | examples: List[InputExample], 213 | label_list: List[str], 214 | max_length: int, 215 | tokenizer: PreTrainedTokenizer, 216 | pad_token_segment_id=0, 217 | pad_on_left=False, 218 | pad_token=0, 219 | mask_padding_with_zero=True, 220 | ) -> List[InputFeatures]: 221 | """ 222 | Loads a data file into a list of `InputFeatures` 223 | """ 224 | 225 | label_map = {label: i for i, label in enumerate(label_list)} 226 | 227 | features = [] 228 | for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"): 229 | if ex_index % 10000 == 0: 230 | logger.info("Writing example %d of %d" % (ex_index, len(examples))) 231 | textL = tokenizer.tokenize(" ".join(example.tokens[:example.triggerL])) 232 | textR = tokenizer.tokenize(" ".join(example.tokens[example.triggerL:example.triggerR]))+['[unused1]']+tokenizer.tokenize(" ".join(example.tokens[example.triggerR:])) 233 | maskL = [1.0 for i in range(0,len(textL)+1)] + [0.0 for i in range(0,len(textR)+2)] 234 | maskR = [0.0 for i in range(0,len(textL)+1)] + [1.0 for i in range(0,len(textR)+2)] 235 | if len(maskL)>max_length: 236 | maskL = maskL[:max_length] 237 | if len(maskR)>max_length: 238 | maskR = maskR[:max_length] 239 | inputs = tokenizer.encode_plus( 240 | textL + ['[unused0]'] + textR, add_special_tokens=True, max_length=max_length, return_token_type_ids=True 241 | ) 242 | if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0: 243 | logger.info( 244 | "Attention! you are cropping tokens." 245 | ) 246 | 247 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] 248 | assert len(input_ids)==len(maskL) 249 | assert len(input_ids)==len(maskR) 250 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 251 | # tokens are attended to. 252 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 253 | # Zero-pad up to the sequence length. 
254 | padding_length = max_length - len(input_ids) 255 | if pad_on_left: 256 | input_ids = ([pad_token] * padding_length) + input_ids 257 | attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask 258 | token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids 259 | maskL = ([0.0] * padding_length) + maskL 260 | maskR = ([0.0] * padding_length) + maskR 261 | else: 262 | input_ids = input_ids + ([pad_token] * padding_length) 263 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 264 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) 265 | maskL = maskL + ([0.0] * padding_length) 266 | maskR = maskR + ([0.0] * padding_length) 267 | 268 | assert len(input_ids) == max_length 269 | assert len(attention_mask) == max_length 270 | assert len(token_type_ids) == max_length 271 | 272 | label = label_map[example.label] 273 | 274 | if ex_index < 2: 275 | logger.info("*** Example ***") 276 | logger.info("example_id: {}".format(example.example_id)) 277 | logger.info("input_ids: {}".format(" ".join(map(str, input_ids)))) 278 | logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask)))) 279 | logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids)))) 280 | logger.info("maskL: {}".format(" ".join(map(str, maskL)))) 281 | logger.info("maskR: {}".format(" ".join(map(str, maskR)))) 282 | logger.info("label: {}".format(label)) 283 | 284 | features.append(InputFeatures(example_id=example.example_id, input_ids=input_ids, input_mask=attention_mask, segment_ids=token_type_ids, maskL=maskL, maskR=maskR, label=label)) 285 | 286 | return features 287 | 288 | 289 | processors = {"ace": ACEProcessor, "maven": MAVENProcessor, "maven_infer": MAVENInferProcessor} 290 | 291 | 292 | MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"ace": 34, "maven": 169} 293 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/MAVEN-dataset/71151f6da53bc9df9d6c5718dc81a2e1489651bf/baselines/DMCNN_BiLSTM_(CRF)/.DS_Store -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/README.md: -------------------------------------------------------------------------------- 1 | # DMCNN & BiLSTM & BiLSTM+CRF 2 | The codes are implementations of [DMCNN](https://www.aclweb.org/anthology/P15-1017/), BiLSTM and BiLSTM+CRF for event detection on MAVEN. 3 | 4 | ## Requirements 5 | 6 | + torch==1.6 7 | + CUDA==10.2 8 | + numpy 9 | + sklearn 10 | + seqeval==1.2.2 11 | + tqdm==4.44.0 12 | 13 | ## Usage 14 | 15 | To run this code, you need to: 16 | 1. Put the raw MAVEN dataset files in `./raw` 17 | 2. Run ```python main.py --config [path of config files] --gpu [gpu, optional]``` (see the example below). 18 | We will train, evaluate, and test models in every epoch. We output the training and evaluation performance, and generate test result files to submit to [CodaLab](https://competitions.codalab.org/competitions/27320#learn_the_details-submission-format). 19 | 20 | All the hyper-parameters for the three models are in config files at `./config/`, you can modify them as you wish. 
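For example, to train and evaluate DMCNN with the config shipped in this folder (assuming GPU 0 is available; omit ```--gpu``` to run on the CPU):

```bash
python main.py --config config/dmcnn.config --gpu 0
```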
21 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/clear.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | del_list = os.listdir("data") 5 | for f in del_list: 6 | file_path = os.path.join("data", f) 7 | if os.path.isfile(file_path): 8 | os.remove(file_path) 9 | elif os.path.isdir(file_path): 10 | shutil.rmtree(file_path) 11 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/config/bilstm.config: -------------------------------------------------------------------------------- 1 | [train] 2 | epoch = 15 3 | batch_size = 200 4 | shuffle = True 5 | valid_interval = 1 6 | save_strategy = save_best 7 | 8 | [test] 9 | batch_size = 200 10 | shuffle = False 11 | 12 | [data] 13 | reader_name = MavenReader 14 | formatter_name = BilstmFormatter 15 | word2vec_file = 100.utf8 16 | split_labels = True 17 | 18 | [model] 19 | model_name = Bilstm 20 | num_layers = 1 21 | hidden_size = 256 22 | dropout = 0.5 23 | 24 | [optimizer] 25 | optimizer_name = Adam 26 | lr = 1e-3 27 | weight_decay = 1e-8 -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/config/crf.config: -------------------------------------------------------------------------------- 1 | [train] 2 | epoch = 15 3 | batch_size = 40 4 | shuffle = True 5 | valid_interval = 1 6 | save_strategy = save_best 7 | 8 | [test] 9 | batch_size = 40 10 | shuffle = False 11 | 12 | [data] 13 | reader_name = MavenReader 14 | formatter_name = CrfFormatter 15 | word2vec_file = 100.utf8 16 | sequence_length = 128 17 | BIO = True 18 | pad_label_id = -100 19 | 20 | [model] 21 | model_name = Crf 22 | num_layers = 1 23 | hidden_size = 400 24 | dropout = 0.3 25 | 26 | [optimizer] 27 | optimizer_name = Adam 28 | lr = 1e-3 29 | weight_decay = 1e-8 -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/config/dmcnn.config: -------------------------------------------------------------------------------- 1 | [train] 2 | epoch = 15 3 | batch_size = 170 4 | shuffle = True 5 | valid_interval = 1 6 | save_strategy = save_best 7 | 8 | [test] 9 | batch_size = 170 10 | shuffle = False 11 | 12 | [data] 13 | reader_name = MavenReader 14 | formatter_name = DmcnnFormatter 15 | word2vec_file = 100.utf8 16 | split_labels = True 17 | 18 | [model] 19 | model_name = Dmcnn 20 | pf_dim = 5 21 | llf_num = 3 22 | kernel_size = 3 23 | hidden_size = 200 24 | dropout = 0.5 25 | 26 | [optimizer] 27 | optimizer_name = Adam 28 | lr = 1e-3 29 | weight_decay = 1e-8 -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/formatter/BilstmFormatter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils.global_variables import Global 3 | 4 | class BilstmFormatter(object): 5 | def __init__(self, config): 6 | self.config = config 7 | 8 | def process(self, data, mode): 9 | """ 10 | :param data: [{"tokens": list(int), "labels": int}, ...] 
:param mode: train/valid/test 12 | :return: {"tokens": LongTensor, 13 | "labels": LongTensor, 14 | "lengths": LongTensor, 15 | "indices": LongTensor} 16 | """ 17 | tokens, canids, labels, lengths, indices, docids = [], [], [], [], [], [] 18 | 19 | sequence_length = self.config.getint("runtime", "sequence_length") 20 | 21 | for item in data: 22 | length = len(item["tokens"]) 23 | docids.append(item["docids"]) 24 | tokens.append(item["tokens"] + [Global.word2id[""]] * (sequence_length - length)) 25 | canids.append(item["canids"]) 26 | if mode != "test": 27 | labels.append(item["labels"]) 28 | lengths.append(length) 29 | indices.append(item["index"]) 30 | 31 | tlt = lambda t: torch.LongTensor(t) 32 | tt = lambda t: torch.Tensor(t) 33 | tokens, lengths, indices = tlt(tokens), tlt(lengths), tlt(indices) 34 | if mode != "test": 35 | labels = tlt(labels) 36 | 37 | return {"tokens": tokens, 38 | "labels": labels, 39 | "lengths": lengths, 40 | "indices": indices, 41 | "canids": canids, 42 | "docids": docids} if mode != "test" else { 43 | "tokens": tokens, 44 | "lengths": lengths, 45 | "indices": indices, 46 | "canids": canids, 47 | "docids": docids 48 | } 49 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/formatter/CrfFormatter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils.global_variables import Global 3 | 4 | class CrfFormatter(object): 5 | def __init__(self, config): 6 | self.config = config 7 | self.pad_label_id = config.getint("data", "pad_label_id") 8 | 9 | def process(self, data, mode): 10 | """ 11 | :param data: [{"tokens": list(int), "labels": list(int)}, ...] 12 | :param mode: train/valid/test 13 | :return: {"tokens": LongTensor, 14 | "labels": LongTensor, 15 | "masks": LongTensor, 16 | "lengths": LongTensor} 17 | """ 18 | tokens, canids, labels, flags, masks, lengths, docids = [], [], [], [], [], [], [] 19 | 20 | sequence_length = self.config.getint("runtime", "sequence_length") 21 | 22 | for item in data: 23 | docid = item["docids"] 24 | token = item["tokens"] 25 | canid_ = item["canids"] 26 | if mode != "test": 27 | label = item["labels"] 28 | else: 29 | label = [0] * len(token) 30 | flag = item["flags"] if "flags" in item else [1] * len(token) 31 | if len(token) > sequence_length: 32 | token = token[:sequence_length] 33 | canid_ = canid_[:sequence_length] 34 | label = label[:sequence_length] 35 | flag = flag[:sequence_length] 36 | length = len(token) 37 | token += [Global.word2id[""]] * (sequence_length - length) 38 | label += [self.pad_label_id] * (sequence_length - length) 39 | canid = [] 40 | for i in range(len(flag)): 41 | if flag[i] == 1: 42 | canid.append(canid_[i]) 43 | flag += [0] * (sequence_length - length) 44 | for i in range(sequence_length): 45 | if i < length and flag[i] == 1: 46 | assert label[i] != self.pad_label_id 47 | docids.append(docid) 48 | tokens.append(token) 49 | canids.append(canid) 50 | labels.append(label) 51 | flags.append(flag) 52 | masks.append([1] * length + [0] * (sequence_length - length)) 53 | lengths.append(length) 54 | for i in range(length): 55 | assert labels[-1][i] != self.pad_label_id 56 | 57 | tlt = lambda t: torch.LongTensor(t) 58 | tt = lambda t: torch.Tensor(t) 59 | 60 | tokens, labels, masks, lengths = tlt(tokens), tlt(labels), tlt(masks), tlt(lengths) 61 | 62 | return {"tokens": tokens, 63 | "labels": labels, 64 | "flags": flags, 65 | "masks": masks, 66 | "lengths": lengths, 67 | "canids": canids, 68 
| "docids": docids} 69 | 70 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/formatter/DmcnnFormatter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils.global_variables import Global 3 | 4 | 5 | class DmcnnFormatter(object): 6 | def __init__(self, config): 7 | self.config = config 8 | 9 | def process(self, data, mode): 10 | """ 11 | :param data: [{"tokens": list(int), "labels": int}, ...] 12 | :param mode: train/valid/test 13 | :return: {"tokens": LongTensor, 14 | "lables": LongTensor, 15 | "pfs": LongTensor, 16 | "llfs": LongTensor, 17 | "masks": Tensor} 18 | """ 19 | tokens, canids, labels, masks, pfs, llfs, docids = [], [], [], [], [], [], [] 20 | 21 | sequence_length = self.config.getint("runtime", "sequence_length") 22 | 23 | for item in data: 24 | length = len(item["tokens"]) 25 | docids.append(item["docids"]) 26 | tokens.append(item["tokens"] + [Global.word2id[""]] * (sequence_length - length)) 27 | canids.append(item["canids"]) 28 | if mode != "test": 29 | labels.append(item["labels"]) 30 | mask = [] 31 | for i in range(sequence_length): 32 | if 0 <= i <= item["index"]: 33 | mask.append([100, 0]) 34 | elif i < length: 35 | mask.append([0, 100]) 36 | else: 37 | mask.append([0, 0]) 38 | masks.append(mask) 39 | pfs.append([abs(item["index"] - x) for x in range(sequence_length)]) 40 | if item["index"] == 0: 41 | llfs.append([0] + tokens[-1][item["index"]: item["index"] + 2]) 42 | elif item["index"] == sequence_length - 1: 43 | llfs.append(tokens[-1][item["index"] - 1: item["index"] + 1] + [0]) 44 | else: 45 | llfs.append(tokens[-1][item["index"] - 1: item["index"] + 2]) 46 | assert len(llfs[-1]) == 3 47 | 48 | tlt = lambda t: torch.LongTensor(t) 49 | tt = lambda t: torch.Tensor(t) 50 | tokens, pfs, llfs = tlt(tokens), tlt(pfs), tlt(llfs) 51 | masks = tt(masks) 52 | if mode != "test": 53 | labels = tlt(labels) 54 | 55 | return {"tokens": tokens, 56 | "labels": labels, 57 | "pfs": pfs, 58 | "llfs": llfs, 59 | "masks": masks, 60 | "canids": canids, 61 | "docids": docids} if mode != "test" else { 62 | "tokens": tokens, 63 | "pfs": pfs, 64 | "llfs": llfs, 65 | "masks": masks, 66 | "canids": canids, 67 | "docids": docids 68 | } 69 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/formatter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/MAVEN-dataset/71151f6da53bc9df9d6c5718dc81a2e1489651bf/baselines/DMCNN_BiLSTM_(CRF)/formatter/__init__.py -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | from utils.configparser_hook import get_config 4 | from utils.global_variables import Global 5 | from utils.initializer import initialize 6 | from utils.runner import run 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | required_args = ["config"] 11 | normal_args = ["gpu"] 12 | for arg in required_args + normal_args: 13 | parser.add_argument("--{}".format(arg), required=arg in required_args) 14 | args = parser.parse_args() 15 | 16 | device = torch.device("cuda:{}".format(args.gpu) if args.gpu and torch.cuda.is_available() else "cpu") 17 | Global.device = device 18 | print("Device:", device) 19 | 20 | config = 
get_config(args.config) 21 | config.add_section("runtime") 22 | 23 | parameters = initialize(config, device) 24 | 25 | run(parameters, config, device) 26 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/Bilstm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from utils.global_variables import Global 4 | from model.layers import embedding, outputLayer 5 | 6 | class Bilstm(nn.Module): 7 | def __init__(self, config): 8 | super(Bilstm, self).__init__() 9 | self.config = config 10 | self.embedding = embedding.Embedding(config) 11 | self.rnn = DynamicRNN(config) 12 | 13 | self.dropout = nn.Dropout(config.getfloat("model", "dropout")) 14 | self.fc = nn.Linear(in_features=config.getint("model", "hidden_size"), 15 | out_features=config.getint("runtime", "num_class"), 16 | bias=True) 17 | self.out = outputLayer.OutputLayer(config) 18 | print(self) 19 | 20 | 21 | def forward(self, data, **params): 22 | """ 23 | :param data: 这一轮输入的数据 24 | :param params: 存放任何其它需要的信息 25 | """ 26 | mode = params["mode"] 27 | tokens = data["tokens"] # [B, L] 28 | if mode != "test": 29 | labels = data["labels"] # [B, ] 30 | lengths = data["lengths"] # [B, ] 31 | indices = data["indices"] # [B, ] 32 | 33 | prediction = self.embedding(tokens) # [B, L, E] 34 | prediction = self.dropout(prediction) 35 | prediction = self.rnn(prediction, lengths, indices) # [B, H] 36 | prediction = self.fc(prediction) # [B, N] 37 | 38 | if mode != "test": 39 | loss = self.out(prediction, labels) 40 | prediction = torch.argmax(prediction, dim=1) 41 | 42 | return {"loss": loss, 43 | "prediction": prediction, 44 | "labels": labels} if mode != "test" else { 45 | "prediction": prediction 46 | } 47 | 48 | 49 | class DynamicRNN(nn.Module): 50 | def __init__(self, config): 51 | super(DynamicRNN, self).__init__() 52 | self.embedding_size = config.getint("runtime", "embedding_size") 53 | self.sequence_length = config.getint("runtime", "sequence_length") 54 | self.num_layers = config.getint("model", "num_layers") 55 | self.hidden_size = config.getint("model", "hidden_size") 56 | self.rnn = nn.LSTM(input_size=self.embedding_size, 57 | hidden_size=self.hidden_size // 2, 58 | num_layers=self.num_layers, 59 | bias=True, 60 | batch_first=True, 61 | dropout=0, 62 | bidirectional=True) 63 | 64 | def forward(self, inputs, lengths, indices): 65 | embedding_packed = nn.utils.rnn.pack_padded_sequence(input=inputs, 66 | lengths=lengths, 67 | batch_first=True, 68 | enforce_sorted=False) 69 | outputs, _ = self.rnn(embedding_packed, None) 70 | outputs, _ = nn.utils.rnn.pad_packed_sequence(sequence=outputs, 71 | batch_first=True, 72 | padding_value=0.0, 73 | total_length=self.sequence_length) 74 | outputs = outputs[torch.arange(inputs.shape[0]), indices] 75 | return outputs 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/Crf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.utils.rnn import pad_sequence 4 | from utils.global_variables import Global 5 | from model.layers import embedding, crf, outputLayer 6 | 7 | class Crf(nn.Module): 8 | def __init__(self, config): 9 | super(Crf, self).__init__() 10 | self.config = config 11 | self.embedding = embedding.Embedding(config) 12 | self.rnn = DynamicRNN(config) 13 | self.dropout = 
nn.Dropout(config.getfloat("model", "dropout")) 14 | self.hidden2tag = nn.Linear(in_features=config.getint("model", "hidden_size"), 15 | out_features=config.getint("runtime", "num_class") + 2, 16 | bias=True) 17 | self.pad_label_id = config.getint("data", "pad_label_id") 18 | self.crf = crf.CRF(tagset_size=config.getint("runtime", "num_class"), use_gpu=Global.device) 19 | print(self) 20 | 21 | def forward(self, data, **params): 22 | """ 23 | :param data: 这一轮输入的数据 24 | :param params: 存放任何其它需要的信息 25 | """ 26 | mode = params["mode"] 27 | tokens = data["tokens"] # [B, L] 28 | labels = data["labels"] # [B, L] 29 | lengths = data["lengths"] # [B, ] 30 | flags = data["flags"] 31 | attention_masks = data["masks"] # [B, L] 32 | 33 | prediction = self.embedding(tokens) # [B, L, E] 34 | prediction = self.dropout(prediction) 35 | prediction = self.rnn(prediction, lengths) # [B, L, H] 36 | prediction = self.hidden2tag(prediction) # [B, L, N+2] 37 | 38 | pad_masks = (labels != self.pad_label_id) 39 | loss_masks = ((attention_masks == 1) & pad_masks) 40 | 41 | if params["crf_mode"] == "train": 42 | crf_labels, crf_masks = self.to_crf_pad(labels, loss_masks) 43 | crf_logits, _ = self.to_crf_pad(prediction, loss_masks) 44 | loss = self.crf.neg_log_likelihood(crf_logits, crf_masks, crf_labels) 45 | return {"loss": loss, 46 | "prediction": None, 47 | "labels": None} 48 | 49 | elif params["crf_mode"] == "test": 50 | masks = (attention_masks == 1) 51 | crf_logits, crf_masks = self.to_crf_pad(prediction, masks) 52 | crf_masks = crf_masks.sum(axis=2) == crf_masks.shape[2] 53 | best_path = self.crf(crf_logits, crf_masks) 54 | temp_labels = (torch.ones(loss_masks.shape) * self.pad_label_id).to(torch.long) 55 | prediction = self.unpad_crf(best_path, crf_masks, temp_labels, masks) 56 | return {"loss": None, 57 | "prediction": self.normalize(prediction, flags, lengths), 58 | "labels": self.normalize(labels, flags, lengths)} if mode != "test" else { 59 | "prediction": self.normalize(prediction, flags, lengths) 60 | } 61 | 62 | else: 63 | raise NotImplementedError 64 | 65 | def normalize(self, logits, flags, lengths): 66 | results = [] 67 | logits = logits.tolist() 68 | lengths = lengths.tolist() 69 | for logit, flag, length in zip(logits, flags, lengths): 70 | result = [] 71 | for i in range(length): 72 | if flag[i] == 1: 73 | assert logit[i] != self.pad_label_id 74 | result.append(Global.id2label[str(logit[i])]) 75 | results.append(result) 76 | return results 77 | 78 | def to_crf_pad(self, org_array, org_mask): 79 | crf_array = [aa[bb] for aa, bb in zip(org_array, org_mask)] 80 | crf_array = pad_sequence(crf_array, batch_first=True, padding_value=self.pad_label_id) 81 | crf_pad = (crf_array != self.pad_label_id) 82 | crf_array[~crf_pad] = 0 83 | return crf_array, crf_pad 84 | 85 | def unpad_crf(self, returned_array, returned_mask, org_array, org_mask): 86 | out_array = org_array.clone().detach().to(Global.device) 87 | out_array[org_mask] = returned_array[returned_mask] 88 | return out_array 89 | 90 | 91 | class DynamicRNN(nn.Module): 92 | def __init__(self, config): 93 | super(DynamicRNN, self).__init__() 94 | self.embedding_size = config.getint("runtime", "embedding_size") 95 | self.sequence_length = config.getint("runtime", "sequence_length") 96 | self.num_layers = config.getint("model", "num_layers") 97 | self.hidden_size = config.getint("model", "hidden_size") 98 | self.rnn = nn.LSTM(input_size=self.embedding_size, 99 | hidden_size=self.hidden_size // 2, 100 | num_layers=self.num_layers, 101 | bias=True, 102 | 
batch_first=True, 103 | dropout=0, 104 | bidirectional=True) 105 | 106 | def forward(self, inputs, lengths): 107 | embedding_packed = nn.utils.rnn.pack_padded_sequence(input=inputs, 108 | lengths=lengths, 109 | batch_first=True, 110 | enforce_sorted=False) 111 | outputs, _ = self.rnn(embedding_packed, None) 112 | outputs, _ = nn.utils.rnn.pad_packed_sequence(sequence=outputs, 113 | batch_first=True, 114 | padding_value=0.0, 115 | total_length=self.sequence_length) 116 | return outputs 117 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/Dmcnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from model.layers import embedding, outputLayer 4 | 5 | 6 | class Dmcnn(nn.Module): 7 | def __init__(self, config): 8 | super(Dmcnn, self).__init__() 9 | self.config = config 10 | self.embedding = embedding.Embedding(config) 11 | self.pf_embedding = nn.Embedding(num_embeddings=config.getint("runtime", "sequence_length"), 12 | embedding_dim=config.getint("model", "pf_dim")) 13 | self.cnn = _CNN(config) 14 | self.pooling = _DynamicPooling(config) 15 | self.dropout = nn.Dropout(config.getfloat("model", "dropout")) 16 | self.fc = nn.Linear(in_features=config.getint("model", "llf_num") * config.getint("runtime", "embedding_size") + 2 * config.getint("model", "hidden_size"), 17 | out_features=config.getint("runtime", "num_class"), 18 | bias=True) 19 | self.out = outputLayer.OutputLayer(config) 20 | print(self) 21 | 22 | def forward(self, data, **params): 23 | """ 24 | :param data: 这一轮输入的数据 25 | :param params: 存放任何其它需要的信息 26 | """ 27 | mode = params["mode"] 28 | tokens = data["tokens"] 29 | if mode != "test": 30 | labels = data["labels"] 31 | masks = data["masks"] 32 | pfs = data["pfs"] 33 | llfs = data["llfs"] 34 | 35 | llf = self.embedding(llfs).view(-1, self.config.getint("model", "llf_num") * self.config.getint("runtime", "embedding_size")) 36 | prediction = torch.cat((self.embedding(tokens), self.pf_embedding(pfs)), dim=-1) # [B, L, E+P] 37 | prediction = self.cnn(prediction) # [B, H, L] 38 | prediction = self.pooling(prediction, masks) # [B, 2*H] 39 | prediction = self.dropout(prediction) 40 | prediction = torch.cat((prediction, llf), dim=-1) # [B, l*E+2*H] 41 | prediction = self.fc(prediction) # [B, N] 42 | 43 | if mode != "test": 44 | loss = self.out(prediction, labels) 45 | prediction = torch.argmax(prediction, dim=1) 46 | 47 | return {"loss": loss, 48 | "prediction": prediction, 49 | "labels": labels} if mode != "test" else { 50 | "prediction": prediction 51 | } 52 | 53 | 54 | class _CNN(nn.Module): 55 | def __init__(self, config): 56 | super(_CNN, self).__init__() 57 | self.in_channels = config.getint("runtime", "embedding_size") + config.getint("model", "pf_dim") 58 | self.out_channels = config.getint("model", "hidden_size") 59 | self.kernel_size = config.getint("model", "kernel_size") 60 | self.padding_size = (self.kernel_size - 1) >> 1 61 | self.cnn = nn.Conv1d(in_channels=self.in_channels, 62 | out_channels=self.out_channels, 63 | kernel_size=self.kernel_size, 64 | stride=1, 65 | padding=self.padding_size) 66 | self.activation = nn.ReLU() 67 | 68 | def forward(self, inputs): 69 | inputs = inputs.permute(0, 2, 1) # [B, L, E+P] -> [B, E+P, L] 70 | prediction = self.cnn(inputs) # [B, E+P, L] -> [B, H, L] 71 | prediction = self.activation(prediction) # [B, H, L] 72 | return prediction 73 | 74 | class _DynamicPooling(nn.Module): 75 | def 
__init__(self, config): 76 | super(_DynamicPooling, self).__init__() 77 | self.hidden_size = config.getint("model", "hidden_size") 78 | 79 | def forward(self, inputs, masks): 80 | inputs = torch.unsqueeze(inputs, dim=-1) # [B, H, L] -> [B, H, L, 1] 81 | masks = torch.unsqueeze(masks, dim=1) # [B, L, 3] -> [B, 1, L, 3] 82 | prediction = torch.max(masks + inputs, dim=2)[0] 83 | prediction -= 100 84 | prediction = prediction.view(-1, 2 * self.hidden_size) 85 | return prediction -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/MAVEN-dataset/71151f6da53bc9df9d6c5718dc81a2e1489651bf/baselines/DMCNN_BiLSTM_(CRF)/model/__init__.py -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/layers/crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-12-04 23:19:38 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-05-27 22:48:17 6 | # Modified from original https://github.com/jiesutd/LatticeLSTM/blob/master/model/crf.py 7 | 8 | import torch 9 | import torch.autograd as autograd 10 | import torch.nn as nn 11 | 12 | 13 | # Compute log sum exp in a numerically stable way for the forward algorithm 14 | def log_sum_exp(vec, m_size): 15 | """ 16 | calculate log of exp sum 17 | args: 18 | vec (batch_size, vanishing_dim, hidden_dim) : input tensor 19 | m_size : hidden_dim 20 | return: 21 | batch_size, hidden_dim 22 | """ 23 | _, idx = torch.max(vec, 1) # B * 1 * M 24 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 25 | return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, m_size) # B * M 26 | 27 | 28 | class CRF(nn.Module): 29 | 30 | def __init__(self, tagset_size, use_gpu=False): # average_batch=False, 31 | super(CRF, self).__init__() 32 | print("build CRF...") 33 | # self.average_batch = average_batch 34 | self.gpu = use_gpu 35 | 36 | self.START_TAG = -2 37 | self.STOP_TAG = -1 38 | 39 | self.tagset_size = tagset_size 40 | 41 | # # We add 2 here, because of START_TAG and STOP_TAG 42 | # self.hidden2tag = nn.Linear(params['hidden_dim'], self.tagset_size + 2) 43 | # # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag 44 | init_transitions = torch.zeros(self.tagset_size + 2, self.tagset_size + 2) 45 | init_transitions[:, self.START_TAG] = -10000.0 46 | init_transitions[self.STOP_TAG, :] = -10000.0 47 | if torch.cuda.is_available(): 48 | init_transitions = init_transitions.cuda(self.gpu) 49 | self.transitions = nn.Parameter(init_transitions, requires_grad=True) 50 | 51 | def init_hidden_cell(self, batch_size, layer_hidden_dim): 52 | return (torch.randn(2, batch_size, layer_hidden_dim // 2), 53 | torch.randn(2, batch_size, layer_hidden_dim // 2)) 54 | 55 | def _calculate_PZ(self, feats, mask): 56 | """ 57 | input: 58 | feats: (batch, seq_len, self.tag_size+2) 59 | masks: (batch, seq_len) 60 | """ 61 | batch_size = feats.size(0) 62 | seq_len = feats.size(1) 63 | tag_size = feats.size(2) 64 | # print feats.view(seq_len, tag_size) 65 | assert (tag_size == self.tagset_size + 2) 66 | mask = mask.transpose(1, 0).contiguous() 67 | ins_num = seq_len * batch_size 68 | ## be careful the view 
shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 69 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 70 | ## need to consider start 71 | scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 72 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 73 | # build iter 74 | seq_iter = enumerate(scores) 75 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 76 | # only need start from start_tag 77 | partition = inivalues[:, self.START_TAG, :].clone().view(batch_size, tag_size, 1) # bat_size * to_target_size 78 | 79 | ## add start score (from start to all tag, duplicate to batch_size) 80 | # partition = partition + self.transitions[START_TAG,:].view(1, tag_size, 1).expand(batch_size, tag_size, 1) 81 | # iter over last scores 82 | for idx, cur_values in seq_iter: 83 | # previous to_target is current from_target 84 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 85 | # cur_values: bat_size * from_target * to_target 86 | 87 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 88 | cur_partition = log_sum_exp(cur_values, tag_size) 89 | # print cur_partition.data 90 | 91 | # (bat_size * from_target * to_target) -> (bat_size * to_target) 92 | # partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1) 93 | mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size) 94 | 95 | ## effective updated partition part, only keep the partition value of mask value = 1 96 | masked_cur_partition = cur_partition.masked_select(mask_idx) 97 | ## let mask_idx broadcastable, to disable warning 98 | mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1) 99 | 100 | ## replace the partition where the maskvalue=1, other partition value keeps the same 101 | partition.masked_scatter_(mask_idx, masked_cur_partition) 102 | # until the last state, add transition score for all partition (and do log_sum_exp) then select the value in STOP_TAG 103 | cur_values = self.transitions.view(1, tag_size, tag_size).expand(batch_size, tag_size, tag_size) + partition.contiguous().view( batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 104 | 105 | cur_partition = log_sum_exp(cur_values, tag_size) 106 | final_partition = cur_partition[:, self.STOP_TAG] 107 | return final_partition.sum(), scores 108 | 109 | def _viterbi_decode(self, feats, mask): 110 | """ 111 | input: 112 | feats: (batch, seq_len, self.tag_size+2) 113 | mask: (batch, seq_len) 114 | output: 115 | decode_idx: (batch, seq_len) decoded sequence 116 | path_score: (batch, 1) corresponding score for each sequence (to be implementated) 117 | """ 118 | batch_size = feats.size(0) 119 | seq_len = feats.size(1) 120 | tag_size = feats.size(2) 121 | assert (tag_size == self.tagset_size + 2) 122 | ## calculate sentence length for each sentence 123 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 124 | ## mask to (seq_len, batch_size) 125 | mask = mask.transpose(1, 0).contiguous() 126 | ins_num = seq_len * batch_size 127 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 128 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 129 | ## need to consider start 130 | scores = feats + 
self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 131 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 132 | 133 | # build iter 134 | seq_iter = enumerate(scores) 135 | ## record the position of best score 136 | back_points = list() 137 | partition_history = list() 138 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 139 | # mask = 1 + (-1)*mask 140 | # mask = (1 - mask.long()).byte() 141 | mask = ~(mask) 142 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 143 | # only need start from start_tag 144 | partition = inivalues[:, self.START_TAG, :].clone().view(batch_size, tag_size) # bat_size * to_target_size 145 | # print "init part:",partition.size() 146 | partition_history.append(partition) 147 | # iter over last scores 148 | for idx, cur_values in seq_iter: 149 | # previous to_target is current from_target 150 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 151 | # cur_values: batch_size * from_target * to_target 152 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 153 | ## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG 154 | # print "cur value:", cur_values.size() 155 | partition, cur_bp = torch.max(cur_values, 1) 156 | # print "partsize:",partition.size() 157 | # exit(0) 158 | # print partition 159 | # print cur_bp 160 | # print "one best, ",idx 161 | partition_history.append(partition) 162 | ## cur_bp: (batch_size, tag_size) max source score position in current tag 163 | ## set padded label as 0, which will be filtered in post processing 164 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0) 165 | back_points.append(cur_bp) 166 | # exit(0) 167 | ### add score to final STOP_TAG 168 | partition_history = torch.cat(partition_history, 0).view(seq_len, batch_size, -1).transpose(1, 0).contiguous() ## (batch_size, seq_len. 
tag_size) 169 | ### get the last position for each setences, and select the last partitions using gather() 170 | last_position = length_mask.view(batch_size, 1, 1).expand(batch_size, 1, tag_size) - 1 171 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, 1) 172 | ### calculate the score from last partition to end state (and then select the STOP_TAG from it) 173 | last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(1, tag_size, tag_size).expand(batch_size, tag_size, tag_size) 174 | _, last_bp = torch.max(last_values, 1) 175 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long() 176 | if torch.cuda.is_available(): 177 | pad_zero = pad_zero.cuda(self.gpu) 178 | back_points.append(pad_zero) 179 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size) 180 | 181 | ## select end ids in STOP_TAG 182 | pointer = last_bp[:, self.STOP_TAG] 183 | insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, tag_size) 184 | back_points = back_points.transpose(1, 0).contiguous() 185 | ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values 186 | # print "lp:",last_position 187 | # print "il:",insert_last 188 | back_points.scatter_(1, last_position, insert_last) 189 | # print "bp:",back_points 190 | # exit(0) 191 | back_points = back_points.transpose(1, 0).contiguous() 192 | ## decode from the end, padded position ids are 0, which will be filtered if following evaluation 193 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size)) 194 | if torch.cuda.is_available(): 195 | decode_idx = decode_idx.cuda(self.gpu) 196 | decode_idx[-1] = pointer.data # detach() 197 | for idx in range(len(back_points) - 2, -1, -1): 198 | pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size, 1)) 199 | decode_idx[idx] = pointer.data.t() # feili pointer.detach().view(batch_size) 200 | path_score = None 201 | decode_idx = decode_idx.transpose(1, 0) 202 | return path_score, decode_idx 203 | 204 | def forward(self, feats, mask): 205 | path_score, best_path = self._viterbi_decode(feats, mask) 206 | # return path_score, best_path 207 | return best_path 208 | 209 | def _score_sentence(self, scores, mask, tags): 210 | """ 211 | input: 212 | scores: variable (seq_len, batch, tag_size, tag_size) 213 | mask: (batch, seq_len) 214 | tags: tensor (batch, seq_len) 215 | output: 216 | score: sum of score for gold sequences within whole batch 217 | """ 218 | # Gives the score of a provided tag sequence 219 | batch_size = scores.size(1) 220 | seq_len = scores.size(0) 221 | tag_size = scores.size(2) 222 | ## convert tag value into a new format, recorded label bigram information to index 223 | new_tags = autograd.Variable(torch.LongTensor(batch_size, seq_len)) 224 | if torch.cuda.is_available(): 225 | new_tags = new_tags.cuda(self.gpu) 226 | for idx in range(seq_len): 227 | if idx == 0: 228 | ## start -> first score 229 | new_tags[:, 0] = (tag_size - 2) * tag_size + tags[:, 0] 230 | 231 | else: 232 | new_tags[:, idx] = tags[:, idx - 1] * tag_size + tags[:, idx] 233 | 234 | ## transition for label to STOP_TAG 235 | end_transition = self.transitions[:, self.STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size) 236 | ## length for batch, last word position = length - 1 237 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 238 | ## index the label id of last word 239 | end_ids = 
torch.gather(tags, 1, length_mask - 1) 240 | 241 | ## index the transition score for end_id to STOP_TAG 242 | end_energy = torch.gather(end_transition, 1, end_ids) 243 | 244 | ## convert tag as (seq_len, batch_size, 1) 245 | new_tags = new_tags.transpose(1, 0).contiguous().view(seq_len, batch_size, 1) 246 | ### need convert tags id to search from 400 positions of scores 247 | tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(seq_len, batch_size) # seq_len * bat_size 248 | ## mask transpose to (seq_len, batch_size) 249 | assert mask.transpose(1, 0).size() == tg_energy.size() 250 | tg_energy = tg_energy.masked_select(mask.transpose(1, 0)) 251 | 252 | # ## calculate the score from START_TAG to first label 253 | # start_transition = self.transitions[START_TAG,:].view(1, tag_size).expand(batch_size, tag_size) 254 | # start_energy = torch.gather(start_transition, 1, tags[0,:]) 255 | 256 | ## add all score together 257 | # gold_score = start_energy.sum() + tg_energy.sum() + end_energy.sum() 258 | gold_score = tg_energy.sum() + end_energy.sum() 259 | return gold_score 260 | 261 | def neg_log_likelihood(self, feats, mask, tags): 262 | # nonegative log likelihood 263 | forward_score, scores = self._calculate_PZ(feats, mask) 264 | # print('Forward', forward_score) 265 | gold_score = self._score_sentence(scores, mask, tags) 266 | # print('Gold', gold_score) 267 | # print "batch, f:", forward_score.data[0], " g:", gold_score.data[0], " dis:", forward_score.data[0] - gold_score.data[0] 268 | # exit(0) 269 | # if self.average_batch: 270 | # return (forward_score - gold_score) / batch_size 271 | # else: 272 | return forward_score - gold_score 273 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/layers/embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from utils.global_variables import Global 5 | 6 | class Embedding(nn.Module): 7 | def __init__(self, config): 8 | super(Embedding, self).__init__() 9 | if Global.word2vec_mat is None: 10 | weight = None 11 | self.vocab_size = config.getint("runtime", "vocab_size") 12 | self.embedding_size = config.getint("runtime", "embedding_size") 13 | else: 14 | weight = torch.from_numpy(Global.word2vec_mat).float() 15 | self.vocab_size, self.embedding_size = weight.size() 16 | self.embedding = nn.Embedding.from_pretrained(weight) 17 | 18 | def forward(self, input): 19 | return self.embedding(input) 20 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/layers/outputLayer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from utils.global_variables import Global 5 | from utils.evaluation import Evaluation 6 | 7 | class OutputLayer(nn.Module): 8 | def __init__(self, config): 9 | super(OutputLayer, self).__init__() 10 | self.num_class = config.getint("runtime", "num_class") 11 | self.criterion = nn.CrossEntropyLoss() 12 | 13 | def forward(self, prediction, labels): 14 | loss = self.criterion(prediction, labels) # ([B, N], [B,]) 15 | return loss -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/reader/MavenReader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import 
json 3 | import copy 4 | import codecs 5 | import numpy as np 6 | from tqdm import tqdm 7 | from utils.global_variables import Global 8 | 9 | class MavenReader(object): 10 | def __init__(self, config): 11 | self.config = config 12 | self.data = [] 13 | self.raw_dir = "./raw" 14 | self.data_dir = "./data" 15 | self.flag_dir = "{}{}".format(config.get("data", "reader_name")[:-6], "crf" if config.has_option("data", "BIO") else "") 16 | self.word2vec_source_file = config.get("data", "word2vec_file") 17 | self.word2vec_file = "word2vec.npy" 18 | self.modes = ["train", "valid", "test"] 19 | 20 | def read(self, mode): 21 | """ 22 | :param mode: train/valid/test 23 | :return: [{"tokens": list(int), "labels": list(int)}, ...] 24 | """ 25 | self.data.clear() 26 | if not os.path.exists(os.path.join(self.data_dir, self.flag_dir, 'flag')): 27 | os.makedirs(os.path.join(self.data_dir, self.flag_dir)) 28 | self.preprocess() 29 | with open(os.path.join(self.data_dir, self.flag_dir, "{}_processed.json".format(mode)), "r+", encoding="utf-8") as f: 30 | data = json.load(f) 31 | if Global.word2vec_mat is None: 32 | Global.word2vec_mat = np.load(os.path.join(self.data_dir, self.word2vec_file)) 33 | Global.word2id = data["word2id"] 34 | Global.id2word = data["id2word"] 35 | Global.label2id = data["label2id"] 36 | Global.id2label = data["id2label"] 37 | if self.config.has_option("data", "BIO"): 38 | Global.type2id = data["type2id"] 39 | for item in data["info"]: 40 | tokens = [data["word2id"][x] if x in data["word2id"] else data["word2id"][""] for x in item["tokens"]] 41 | if mode != "test": 42 | labels = [data["label2id"][x] for x in item["labels"]] 43 | canids = item["canids"] 44 | docids = item["docids"] 45 | if self.config.has_option("data", "split_labels"): 46 | for i in range(len(canids)): 47 | if item["flags"][i]: 48 | if mode != "test": 49 | temp = {"tokens": tokens, 50 | "labels": labels[i], 51 | "canids": canids[i], 52 | "docids": docids, 53 | "index": i} 54 | else: 55 | temp = {"tokens": tokens, 56 | "canids": canids[i], 57 | "docids": docids, 58 | "index": i} 59 | self.data.append(temp) 60 | else: 61 | if mode != "test": 62 | temp = {"tokens": tokens, 63 | "labels": labels, 64 | "canids": canids, 65 | "docids": docids, 66 | "flags": item["flags"]} 67 | else: 68 | temp = {"tokens": tokens, 69 | "canids": canids, 70 | "docids": docids, 71 | "flags": item["flags"]} 72 | self.data.append(temp) 73 | 74 | self.config.set("runtime", "vocab_size", Global.word2vec_mat.shape[0]) 75 | self.config.set("runtime", "embedding_size", Global.word2vec_mat.shape[1]) 76 | self.config.set("runtime", "num_class", len(data["label2id"])) 77 | self.config.set("runtime", "sequence_length", data["sequence_length"]) 78 | 79 | print("Mode: {} | Dataset Size = {}".format(mode, len(self.data))) 80 | return copy.deepcopy(self.data) 81 | 82 | def preprocess(self): 83 | """ 84 | :return: 输出文件、整合数据以及词向量矩阵 85 | 整合数据格式:{ 86 | "info":[{"tokens": list(str), "labels": list(str), "flags": list(bool)}, ...], 87 | "word2id": {"": 0, "": 1}, 88 | "id2word": {0: "", 1: ""}, 89 | "label2id": {"None": 0}, 90 | "id2label": {0: "None"}, 91 | "sequence_length": int 92 | } 93 | """ 94 | 95 | embedding_dict = self.load_embedding_dict(os.path.join(self.raw_dir, self.word2vec_source_file)) 96 | 97 | processed_data = {"info_train": [], 98 | "info_valid": [], 99 | "info_test": [], 100 | "word2id": {}, 101 | "id2word": {}, 102 | "label2id": {}, 103 | "id2label": {}, 104 | "sequence_length": 0} 105 | 106 | if self.config.has_option("data", "BIO"): 107 | 
processed_data["label2id"]["O"] = 0 108 | processed_data["id2label"][0] = "O" 109 | processed_data["type2id"] = {"O": 0} 110 | else: 111 | processed_data["label2id"]["None"] = 0 112 | processed_data["id2label"][0] = "None" 113 | 114 | for mode in self.modes: 115 | with codecs.open(os.path.join(self.raw_dir, "{}.jsonl".format(mode)), 'r', encoding="utf-8", errors="ignore") as f: 116 | lines = f.readlines() 117 | for line in lines: 118 | line = line.rstrip() 119 | doc = json.loads(line) 120 | docids = doc["id"] 121 | doc_tokens, doc_labels, doc_canids, doc_flags = [], [], [], [] 122 | for item in doc["content"]: 123 | doc_tokens.append(item["tokens"]) 124 | 125 | if self.config.has_option("data", "BIO"): 126 | 127 | for tokens in doc_tokens: 128 | if mode != "test": 129 | doc_labels.append(["O"] * len(tokens)) 130 | doc_canids.append([""] * len(tokens)) 131 | doc_flags.append([0] * len(tokens)) 132 | 133 | if mode == "test": 134 | for candi in doc["candidates"]: 135 | for i in range(candi["offset"][0], candi["offset"][1]): 136 | doc_canids[candi["sent_id"]][i] = candi["id"] 137 | doc_flags[candi["sent_id"]][i] = 1 138 | else: 139 | for event in doc["events"]: 140 | tp = event["type"].replace("-", "_") 141 | if tp not in processed_data["type2id"]: 142 | processed_data["type2id"][tp] = event["type_id"] 143 | for mention in event["mention"]: 144 | for i in range(mention["offset"][0], mention["offset"][1]): 145 | doc_labels[mention["sent_id"]][i] = ("B-" + tp) if (i == mention["offset"][0]) else ("I-" + tp) 146 | doc_canids[mention["sent_id"]][i] = mention["id"] 147 | doc_flags[mention["sent_id"]][i] = 1 148 | 149 | else: 150 | 151 | for tokens in doc_tokens: 152 | if mode != "test": 153 | doc_labels.append(["None"] * len(tokens)) 154 | doc_canids.append([""] * len(tokens)) 155 | doc_flags.append([0] * len(tokens)) 156 | processed_data["sequence_length"] = max(processed_data["sequence_length"], len(tokens)) 157 | 158 | if mode == "test": 159 | for candi in doc["candidates"]: 160 | for i in range(candi["offset"][0], candi["offset"][1]): 161 | doc_canids[candi["sent_id"]][i] = candi["id"] 162 | doc_flags[candi["sent_id"]][i] = 1 163 | else: 164 | for event in doc["events"]: 165 | if event["type"] not in processed_data["label2id"]: 166 | processed_data["label2id"][event["type"]] = event["type_id"] 167 | processed_data["id2label"][event["type_id"]] = event["type"] 168 | for mention in event["mention"]: 169 | for i in range(mention["offset"][0], mention["offset"][1]): 170 | doc_labels[mention["sent_id"]][i] = event["type"] 171 | doc_canids[mention["sent_id"]][i] = mention["id"] 172 | doc_flags[mention["sent_id"]][i] = 1 173 | 174 | if mode != "test": 175 | for mention in doc["negative_triggers"]: 176 | for i in range(mention["offset"][0], mention["offset"][1]): 177 | doc_canids[mention["sent_id"]][i] = mention["id"] 178 | doc_flags[mention["sent_id"]][i] = 1 179 | 180 | for tokens, labels, canids, flags in zip(doc_tokens, doc_labels, doc_canids, doc_flags): 181 | processed_data["info_{}".format(mode)].append({"tokens": tokens, 182 | "labels": labels, 183 | "canids": canids, 184 | "flags": flags, 185 | "docids": docids}) 186 | if self.config.has_option("data", "BIO"): 187 | for label in labels: 188 | if label not in processed_data["label2id"]: 189 | id = len(processed_data["label2id"]) 190 | processed_data["label2id"][label] = id 191 | processed_data["id2label"][id] = label 192 | else: 193 | for tokens, canids, flags in zip(doc_tokens, doc_canids, doc_flags): 194 | 
processed_data["info_{}".format(mode)].append({"tokens": tokens, 195 | "canids": canids, 196 | "flags": flags, 197 | "docids": docids}) 198 | 199 | 200 | if self.config.has_option("data", "BIO"): 201 | processed_data["sequence_length"] = self.config.getint("data", "sequence_length") 202 | 203 | word2vec_mat = [] 204 | for (k, v) in embedding_dict.items(): 205 | id = len(processed_data["word2id"]) 206 | processed_data["word2id"][k] = id 207 | processed_data["id2word"][id] = k 208 | word2vec_mat.append(v) 209 | word2vec_mat = np.array(word2vec_mat, dtype=np.float32) 210 | if not os.path.exists(os.path.join(self.data_dir, self.word2vec_file)): 211 | np.save(os.path.join(self.data_dir, self.word2vec_file), word2vec_mat) 212 | 213 | for mode in self.modes: 214 | with open(os.path.join(self.data_dir, self.flag_dir, "{}_processed.json".format(mode)), "w", encoding="utf-8") as f: 215 | temp_data = {"info": processed_data["info_{}".format(mode)], 216 | "word2id": processed_data["word2id"], 217 | "id2word": processed_data["id2word"], 218 | "label2id": processed_data["label2id"], 219 | "id2label": processed_data["id2label"], 220 | "sequence_length": processed_data["sequence_length"]} 221 | if self.config.has_option("data", "BIO"): 222 | temp_data["type2id"] = processed_data["type2id"] 223 | json.dump(temp_data, f, indent=2, ensure_ascii=False) 224 | 225 | with open(os.path.join(self.data_dir, self.flag_dir, 'flag'), "w+") as f: 226 | f.write("") 227 | 228 | 229 | 230 | def load_embedding_dict(self, path): 231 | with open(path, "r", encoding="utf-8") as f: 232 | lines = f.readlines() 233 | embedding_dict = {} 234 | for line in lines: 235 | split = line.split(" ") 236 | embedding_dict[split[0]] = np.array(list(map(float, split[1:]))) 237 | return embedding_dict -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/MAVEN-dataset/71151f6da53bc9df9d6c5718dc81a2e1489651bf/baselines/DMCNN_BiLSTM_(CRF)/reader/__init__.py -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/MAVEN-dataset/71151f6da53bc9df9d6c5718dc81a2e1489651bf/baselines/DMCNN_BiLSTM_(CRF)/utils/__init__.py -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/configparser_hook.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import configparser 3 | 4 | class ConfigParserHook(object): 5 | def __init__(self): 6 | self.config = configparser.RawConfigParser() 7 | 8 | def read(self, config_file): 9 | self.config.read(config_file, encoding="utf-8") 10 | 11 | def set_hook(func_name): 12 | @functools.wraps(getattr(configparser.RawConfigParser, func_name)) 13 | def wrapper(self, *args, **kwargs): 14 | return getattr(self.config, func_name)(*args, **kwargs) 15 | 16 | return wrapper 17 | 18 | def get_config(config_file): 19 | for func_name in dir(configparser.RawConfigParser): 20 | if not func_name.startswith("_") and func_name != "read": 21 | setattr(ConfigParserHook, func_name, set_hook(func_name)) 22 | setattr(ConfigParserHook, "__getitem__", set_hook("__getitem__")) 23 | 24 | config = ConfigParserHook() 25 | config.read(config_file) 
26 | 27 | return config -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.metrics import precision_recall_fscore_support 3 | from seqeval.metrics import precision_score, recall_score, f1_score 4 | from utils.global_variables import Global 5 | 6 | import warnings 7 | warnings.filterwarnings('always') 8 | 9 | class Evaluation(object): 10 | def __init__(self, config): 11 | super(Evaluation).__init__() 12 | self.config = config 13 | self.y_pred = [] 14 | self.y_true = [] 15 | self.labels = [v for (k, v) in Global.label2id.items() if k != "None"] 16 | 17 | def get_metric(self, mode, batch_pred=None, batch_true=None): 18 | average = ["micro", "macro"] 19 | metrics = ["precision", "recall", "f1"] 20 | ret = {"{}_{}".format(t1, t2) : 0.0 for t1 in average for t2 in metrics} 21 | if mode == "batch": 22 | assert batch_pred is not None 23 | assert batch_true is not None 24 | batch_pred = torch.argmax(batch_pred, dim=1) 25 | y_pred = self.normalize(batch_pred) 26 | y_true = self.normalize(batch_true) 27 | elif mode == "all": 28 | y_pred = self.y_pred 29 | y_true = self.y_true 30 | else: 31 | raise NotImplementedError 32 | assert len(y_pred) == len(y_true) 33 | for av in average: 34 | if self.config.has_option("data", "BIO"): 35 | ret["{}_precision".format(av)] = precision_score(y_true=y_true, y_pred=y_pred) 36 | ret["{}_recall".format(av)] = recall_score(y_true=y_true, y_pred=y_pred) 37 | ret["{}_f1".format(av)] = f1_score(y_true=y_true, y_pred=y_pred) 38 | else: 39 | ret["{}_precision".format(av)], ret["{}_recall".format(av)], ret["{}_f1".format(av)], _ = precision_recall_fscore_support(y_true=y_true, 40 | y_pred=y_pred, 41 | labels=self.labels, 42 | average=av, 43 | zero_division=0) 44 | return {key : ('%.4f' % value) for key, value in ret.items() if key.startswith("micro") or key.endswith("f1")} 45 | 46 | def expand(self, batch_pred, batch_true): 47 | y_pred = batch_pred if isinstance(batch_pred, list) else self.normalize(batch_pred) 48 | y_true = batch_true if isinstance(batch_true, list) else self.normalize(batch_true) 49 | self.y_pred += y_pred 50 | self.y_true += y_true 51 | 52 | def normalize(self, x): 53 | return x.cpu().numpy().tolist() 54 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/global_variables.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class Global(object): 4 | device = None 5 | word2vec_mat = None 6 | word2id = None 7 | id2word = None 8 | label2id = None 9 | id2label = None 10 | type2id = None 11 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/initializer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import torch.optim as optim 5 | from torch.utils.data import DataLoader 6 | 7 | get_class = lambda attr, name: getattr(__import__("{}.{}".format(attr, name), fromlist=["dummy"]), name) 8 | 9 | def initialize(config, device): 10 | parameters = {} 11 | 12 | reader = get_class("reader", config.get("data", "reader_name"))(config) 13 | formatter = get_class("formatter", config.get("data", "formatter_name"))(config) 14 | batch_size = config.getint("train" ,"batch_size") 15 | shuffle = 
config.getboolean("train", "shuffle") 16 | 17 | collate_fn_decr = lambda mode: (lambda data, mode=mode: formatter.process(data, mode)) 18 | 19 | dataset_train = reader.read("train") 20 | dataset_valid = reader.read("valid") 21 | dataset_test = reader.read("test") 22 | parameters["dataset_train"] = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn_decr("train")) 23 | parameters["dataset_valid"] = DataLoader(dataset=dataset_valid, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn_decr("valid")) 24 | parameters["dataset_test"] = DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn_decr("test")) 25 | 26 | parameters["model"] = get_class("model", config.get("model", "model_name"))(config) 27 | parameters["model"] = parameters["model"].to(device) 28 | 29 | parameters["optimizer"] = get_optim(parameters["model"], config) 30 | 31 | return parameters 32 | 33 | def get_optim(model, config): 34 | hyper_params = {key: value for key, value in config["optimizer"].items() if key != "optimizer_name"} 35 | optimizer_name = config.get("optimizer", "optimizer_name") 36 | optimizer = getattr(optim, optimizer_name) 37 | command = "optim(params, {})".format(", ".join(["{}={}".format(key, value) for key, value in hyper_params.items()])) 38 | return eval(command, {"optim": optimizer, "params": model.parameters()}) 39 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import copy 5 | import torch 6 | from utils.global_variables import Global 7 | from utils.evaluation import Evaluation 8 | 9 | def run(parameters, config, device): 10 | trained_epoch = -1 11 | max_epoch = config.getint("train", "epoch") 12 | valid_interval = config.getint("train", "valid_interval") 13 | saver = {} 14 | for epoch in range(trained_epoch + 1, max_epoch): 15 | run_one_epoch(parameters, config, device, epoch, "train") 16 | if epoch % valid_interval == 0: 17 | with torch.no_grad(): 18 | valid_metric = run_one_epoch(parameters, config, device, epoch, "valid") 19 | test_metric = run_one_epoch(parameters, config, device, epoch, "test") 20 | print() 21 | if saver == {} or valid_metric["micro_f1"] > saver["valid"]["micro_f1"]: 22 | saver["epoch"] = epoch 23 | saver["valid"] = valid_metric 24 | saver["test"] = test_metric 25 | with open("./data/results_{}.jsonl".format(config.get("data", "formatter_name")[:-9]), "w", encoding="utf-8") as f: 26 | for (k, v) in test_metric.items(): 27 | f.write(json.dumps({"id": k, 28 | "predictions": v})) 29 | f.write('\n') 30 | 31 | print("Best Epoch {}\nValid Metric: {}".format(saver["epoch"], saver["valid"])) 32 | 33 | 34 | def run_one_epoch(parameters, config, device, epoch, mode): 35 | model = parameters["model"] 36 | 37 | if mode == "train": 38 | model.train() 39 | optimizer = parameters["optimizer"] 40 | elif mode == "valid" or mode == "test": 41 | model.eval() 42 | else: 43 | raise NotImplementedError 44 | 45 | dataset = copy.deepcopy(parameters["dataset_{}".format(mode)]) 46 | pred = {} 47 | total_loss = 0 48 | evaluation = Evaluation(config) 49 | for step, data in enumerate(dataset): 50 | for key in data: 51 | if isinstance(data[key], torch.Tensor): 52 | data[key] = data[key].to(device) 53 | 54 | if mode == "train": 55 | optimizer.zero_grad() 56 | 57 | if config.get("model", "model_name") == "Crf": 
58 | if mode != "test": 59 | results = model(data=data, mode=mode, crf_mode="train") 60 | loss = results["loss"] 61 | total_loss += loss.item() 62 | results = model(data=data, mode=mode, crf_mode="test") 63 | evaluation.expand(results["prediction"], results["labels"]) 64 | else: 65 | results = model(data=data, mode=mode, crf_mode="test") 66 | prediction = results["prediction"] 67 | if not isinstance(prediction, list): 68 | prediction = prediction.cpu().numpy().tolist() 69 | docids = data["docids"] 70 | canids = data["canids"] 71 | for doc, can, pre in zip(docids, canids, prediction): 72 | if doc not in pred.keys(): 73 | pred[doc] = [] 74 | assert (len(can) == len(pre)) 75 | for c, p in zip(can, pre): 76 | if p != "O": 77 | p = p[2:] 78 | assert p in Global.type2id.keys() 79 | pred[doc].append({"id": c, 80 | "type_id": Global.type2id[p]}) 81 | else: 82 | results = model(data=data, mode=mode) 83 | if mode != "test": 84 | loss = results["loss"] 85 | total_loss += loss.item() 86 | evaluation.expand(results["prediction"], results["labels"]) 87 | else: 88 | prediction = results["prediction"].cpu().numpy().tolist() 89 | docids = data["docids"] 90 | canids = data["canids"] 91 | for did, cid, pre in zip(docids, canids, prediction): 92 | if did not in pred.keys(): 93 | pred[did] = [] 94 | pred[did].append({"id": cid, 95 | "type_id": pre}) 96 | if mode != "test": 97 | print("\r{}: Epoch {} Step {:0>4d}/{} | Loss = {:.4f}".format(mode, epoch, step + 1, len(dataset), round(total_loss / (step + 1), 4)), end="") 98 | else: 99 | print("\r{}: Epoch {} Step {:0>4d}/{}".format(mode, epoch, step + 1, len(dataset)), end="") 100 | 101 | if mode == "train": 102 | loss.backward() 103 | optimizer.step() 104 | 105 | if mode != "test": 106 | metric = evaluation.get_metric("all") 107 | sys.stdout.write("\r") 108 | print("\r{}: Epoch {} | Metric: {}".format(mode, epoch, metric)) 109 | return metric 110 | else: 111 | return pred -------------------------------------------------------------------------------- /baselines/MOGANED/README.md: -------------------------------------------------------------------------------- 1 | # MOGANED 2 | The code is an **unofficial** implementation of [Event Detection with Multi-order Graph Convolution and Aggregated Attention](https://www.aclweb.org/anthology/D19-1582/) (EMNLP 2019 paper). 3 | 4 | ## Requirements 5 | 6 | - tensorflow-gpu==1.10 w/ CUDA 9 (or tensorflow-gpu==1.14 w/ CUDA 10.0) 7 | 8 | - stanfordcorenlp (see https://github.com/Lynten/stanford-corenlp for details) 9 | 10 | - numpy 11 | 12 | - tqdm 13 | 14 | ## Usage 15 | 16 | To run this code, you need to: 17 | 1. Modify the MAVEN dataset path, the GloVe file path and the stanfordcorenlp path in ```constant.py```. 18 | 2. Run ```python train.py --gpu [YOUR_GPU] --mode MOGANED``` to train. 19 | 3. Run ```python train.py --gpu [YOUR_GPU] --mode MOGANED --eval``` to get predictions on the test set (dumped to ```results.jsonl```). 20 | 21 | All hyper-parameters are in ```constant.py```; you can modify them as you wish. 22 | 23 | ## About Preprocessing 24 | 25 | When you first run this code, it will preprocess the dataset. The preprocessing is quite slow and may take a whole night (so start it and go to sleep!), because extracting dependency trees with Stanford CoreNLP is slow; a minimal sketch of this step is shown below. 26 | 27 | However, preprocessing only runs once, and the preprocessed files are dumped to the MAVEN dataset path. The next time you run the code, it will read them and won't do any preprocessing again.
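For a concrete picture of why this step is slow, the sketch below shows the kind of per-sentence CoreNLP round trip the preprocessing performs. It is a minimal illustration, not this repo's actual code: only ```corenlp_path``` is taken from ```constant.py```, and the helper name ```parse_sentence``` is hypothetical.

```python
# Minimal sketch (assumptions: the stanfordcorenlp package is installed and
# constant.corenlp_path points at an unpacked CoreNLP distribution; the
# function parse_sentence is illustrative and does not exist in this repo).
from stanfordcorenlp import StanfordCoreNLP

import constant

# Booting the CoreNLP Java server is expensive, so create it once and reuse
# it for every sentence in the corpus.
nlp = StanfordCoreNLP(constant.corenlp_path)

def parse_sentence(tokens):
    """Return (POS tags, dependency edges) for one pre-tokenized sentence."""
    sentence = " ".join(tokens)
    pos = nlp.pos_tag(sentence)            # [(word, POS), ...]
    deps = nlp.dependency_parse(sentence)  # [(relation, head_idx, dep_idx), ...]
    # CoreNLP re-tokenizes the joined string, so in real preprocessing the
    # output still has to be aligned back to MAVEN's own token offsets.
    return pos, deps

print(parse_sentence(["They", "organized", "a", "march", "."]))
nlp.close()
```

Every sentence costs one request to the CoreNLP server, and the corpus has to be parsed sentence by sentence, which is why a full pass can take a night.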
28 | 29 | ## Results on MAVEN 30 | 31 | We ran this code and submitted the results to the CodaLab leaderboard (username: wzq016): 32 | |Method|Precision|Recall|F1| 33 | |--|--|--|--| 34 | |MOGANED (Paper)|63.4 ± 0.88|64.1 ± 0.90|63.8 ± 0.18| 35 | |MOGANED (Leaderboard)|64.7 ± 0.05|66.0 ± 0.02|65.3 ± 0.01| 36 | 37 | P.S. We updated the running environment after the MAVEN paper was published and found that the results magically improved. 38 | 39 | ## Note 40 | 41 | There are some differences in training strategy between this code and the original MOGANED paper: 42 | 1. The code doesn't use the BIO schema. Since trigger words in ACE05 are usually a single word rather than a phrase, this won't affect results on ACE05. 43 | 2. The code doesn't use an L2-norm penalty, only dropout. 44 | 3. The code uses AdamOptimizer rather than AdadeltaOptimizer. During experiments, I found that Adadelta couldn't train a good classifier, while Adam could. 45 | 4. This code sets the bias loss lambda to 1 rather than 5, since I found this makes the F1 score higher. 46 | 47 | ## Running on ACE 2005 48 | 49 | Please refer to [this repo](https://github.com/wzq016/MOGANED-Implementation). 50 | 51 | -------------------------------------------------------------------------------- /baselines/MOGANED/constant.py: -------------------------------------------------------------------------------- 1 | maven_path = '../GAT/data/newdataset' 2 | GloVe_file = '../GAT/glove/glove.6B.100d.txt' 3 | corenlp_path = '../GAT/stanford-corenlp-full-2018-10-05' 4 | 5 | EVENT_TYPE_TO_ID = {i:i for i in range(169)} 6 | ROLE_TO_ID = {'None': 0, 'Person': 1, 'Place': 2, 'Buyer': 3, 'Seller': 4, 'Beneficiary': 5, 'Price': 6, 'Artifact': 7, 'Origin': 8, 'Destination': 9, 'Giver': 10, 'Recipient': 11, 'Money': 12, 'Org': 13, 'Agent': 14, 'Victim': 15, 'Instrument': 16, 'Entity': 17, 'Attacker': 18, 'Target': 19, 'Defendant': 20, 'Adjudicator': 21, 'Prosecutor': 22, 'Plaintiff': 23, 'Crime': 24, 'Position': 25, 'Sentence': 26, 'Vehicle': 27, 'Time-Within': 28, 'Time-Starting': 29, 'Time-Ending': 30, 'Time-Before': 31, 'Time-After': 32, 'Time-Holds': 33, 'Time-At-Beginning': 34, 'Time-At-End': 35} 7 | 8 | NER_TO_ID = {'<PAD>': 0, '<UNK>': 1, 'O': 2, 'PERSON': 3, 'ORGANIZATION': 4, 'LOCATION': 5, 'DATE': 6, 9 | 'NUMBER': 7, 'MISC': 8, 'DURATION': 9, 'MONEY': 10, 'PERCENT': 11, 'ORDINAL': 12, 'TIME': 13, 'SET': 14} 10 | 11 | POS_TO_ID = {'<PAD>': 0, '<UNK>': 1, 'NNP': 2, 'NN': 3, 'IN': 4, 'DT': 5, ',': 6, 'JJ': 7, 'NNS': 8, 'VBD': 9, 'CD': 10, 'CC': 11, '.': 12, 'RB': 13, 'VBN': 14, 'PRP': 15, 'TO': 16, 'VB': 17, 'VBG': 18, 'VBZ': 19, 'PRP$': 20, ':': 21, 'POS': 22, 12 | '\'\'': 23, '``': 24, '-RRB-': 25, '-LRB-': 26, 'VBP': 27, 'MD': 28, 'NNPS': 29, 'WP': 30, 'WDT': 31, 'WRB': 32, 'RP': 33, 'JJR': 34, 'JJS': 35, '$': 36, 'FW': 37, 'RBR': 38, 'SYM': 39, 'EX': 40, 'RBS': 41, 'WP$': 42, 'PDT': 43, 'LS': 44, 'UH': 45, '#': 46} 13 | 14 | 15 | 16 | INF = 1e8 17 | 18 | #general hyperparams 19 | embedding_dim = 100 20 | posi_embedding_dim = 50 21 | event_type_embedding_dim = 5 22 | cut_len = 50 #set None to not cut length 23 | 24 | 25 | #trigger hyperparameters 26 | t_filters = 200 27 | t_batch_size = 30 28 | t_lr = 0.001 29 | t_epoch = 10 30 | t_keepprob = 0.7 31 | t_bias_lambda = 1 32 | 33 | #GAT hypers 34 | pos_dim = 50 35 | ner_dim = 50 36 | hidden_dim = 100 37 | 38 | Watt_dim = 100 39 | s_dim = 100 40 | 41 | leaky_alpha = 0.2 42 | graph_dim = 150 43 | 44 | K=3 45 | 46 | -------------------------------------------------------------------------------- /baselines/MOGANED/func.py: 
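# Module overview (descriptive summary of the definitions below): Cudnn_RNN
# wraps stacked bidirectional GRU/LSTM layers, using cuDNN kernels on GPU and
# tf.contrib.rnn cells otherwise; dropout shares the dropout mask across the
# feature dimension in "embedding" mode and across timesteps in "recurrent"
# mode; f_score computes precision/recall/F1 treating label 0 as the negative
# class; get_batch yields shuffled mini-batches together with sparse sub-graph
# indices; get_trigger_feeddict builds the feed dicts for the DMCNN and
# MOGANED trigger models; u_compute and GAC_func implement the graph-attention
# aggregation used by MOGANED; matmuls computes repeated matrix products
# (a matrix power) for the higher-order graphs.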
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | import constant 5 | from tqdm import tqdm 6 | 7 | class Cudnn_RNN: 8 | 9 | def __init__(self, num_layers, num_units, mode="lstm",keep_prob=1.0, is_train=None, scope="cudnn_rnn", gpu=True): 10 | self.num_layers = num_layers 11 | self.rnns = [] 12 | self.mode = mode 13 | if mode == "gru": 14 | if gpu: 15 | rnn = tf.contrib.cudnn_rnn.CudnnGRU 16 | else: 17 | rnn = tf.contrib.rnn.GRUCell 18 | elif mode == "lstm": 19 | if gpu: 20 | rnn = tf.contrib.cudnn_rnn.CudnnLSTM 21 | else: 22 | rnn = tf.contrib.rnn.BasicLSTM 23 | else: 24 | raise Exception("Unknown mode for rnn") 25 | for layer in range(num_layers): 26 | if gpu: 27 | rnn_fw = rnn(1, num_units) 28 | rnn_bw = rnn(1, num_units) 29 | else: 30 | rnn_fw = rnn(num_units) 31 | rnn_bw = rnn(num_units) 32 | self.rnns.append((rnn_fw, rnn_bw, )) 33 | 34 | def __call__(self, inputs, seq_len, keep_prob=1.0, is_train=None, concat_layers=True): 35 | outputs = [tf.transpose(inputs, [1, 0, 2])] 36 | for layer in range(self.num_layers): 37 | rnn_fw, rnn_bw = self.rnns[layer] 38 | output = dropout(outputs[-1], keep_prob=keep_prob, is_train=is_train) 39 | with tf.variable_scope("fw_{}".format(layer)): 40 | out_fw, state_fw = rnn_fw(output) 41 | with tf.variable_scope("bw_{}".format(layer)): 42 | inputs_bw = tf.reverse_sequence(output, seq_lengths=seq_len, seq_axis=0, batch_axis=1) 43 | out_bw, state_bw = rnn_bw(inputs_bw) 44 | out_bw = tf.reverse_sequence(out_bw, seq_lengths=seq_len, seq_axis=0, batch_axis=1) 45 | outputs.append(tf.concat([out_fw, out_bw], axis=2)) 46 | if concat_layers is True: 47 | res = tf.concat(outputs[1:], axis=2) 48 | else: 49 | res = outputs[-1] 50 | res = tf.transpose(res, [1, 0, 2]) 51 | state_fw = tf.squeeze(state_fw[0], [0]) 52 | state_bw = tf.squeeze(state_bw[0], [0]) 53 | state = tf.concat([state_fw, state_bw], axis=1) 54 | return res, state 55 | 56 | def dropout(args, keep_prob, is_train, mode=None): 57 | if keep_prob < 1.0: 58 | noise_shape = None 59 | scale = 1.0 60 | shape = tf.shape(args) 61 | if mode == "embedding" and len(args.get_shape().as_list()) == 3: 62 | noise_shape = [shape[0], shape[1], 1] 63 | scale = keep_prob 64 | if mode == "recurrent" and len(args.get_shape().as_list()) == 3: 65 | noise_shape = [shape[0], 1, shape[-1]] 66 | args = tf.cond(is_train, lambda: tf.nn.dropout( 67 | args, keep_prob, noise_shape=noise_shape), lambda: args) 68 | return args 69 | 70 | def f_score(predict,golden,mode='f'): 71 | assert len(predict)==len(golden) 72 | TP = 0 73 | FP = 0 74 | FN = 0 75 | TN = 0 76 | for i in range(len(predict)): 77 | if predict[i]==golden[i] and predict[i] != 0: 78 | TP+=1 79 | elif predict[i]!=golden[i]: 80 | if predict[i]==0: 81 | FN+=1 82 | elif golden[i]==0: 83 | FP+=1 84 | else: 85 | FN+=1 86 | FP+=1 87 | else: 88 | TN+=1 89 | try: 90 | P = TP/(TP+FP) 91 | R = TP/(TP+FN) 92 | F = 2*P*R/(P+R) 93 | except: 94 | P=R=F=0 95 | 96 | if mode=='f': 97 | return P,R,F 98 | else: 99 | return TP,FN,FP,TN 100 | 101 | def get_batch(data_all,batch_size,shuffle=True): 102 | data,data_subg = data_all 103 | assert len(list(set([np.shape(d)[0] for d in data]))) == 1 104 | num_data = np.shape(data[0])[0] 105 | indices = list(np.arange(0,num_data)) 106 | if shuffle: 107 | random.shuffle(indices) 108 | for i in tqdm(range((num_data // batch_size)+1)): 109 | select_indices = indices[i*batch_size:(i+1)*batch_size] 110 | select_subg_indices = [[idx]+indice for 
70 | def f_score(predict,golden,mode='f'):
71 |     assert len(predict)==len(golden)
72 |     TP = 0
73 |     FP = 0
74 |     FN = 0
75 |     TN = 0
76 |     for i in range(len(predict)):
77 |         if predict[i]==golden[i] and predict[i] != 0:
78 |             TP+=1
79 |         elif predict[i]!=golden[i]:
80 |             if predict[i]==0:
81 |                 FN+=1
82 |             elif golden[i]==0:
83 |                 FP+=1
84 |             else:
85 |                 FN+=1
86 |                 FP+=1
87 |         else:
88 |             TN+=1
89 |     try:
90 |         P = TP/(TP+FP)
91 |         R = TP/(TP+FN)
92 |         F = 2*P*R/(P+R)
93 |     except ZeroDivisionError:
94 |         P=R=F=0
95 | 
96 |     if mode=='f':
97 |         return P,R,F
98 |     else:
99 |         return TP,FN,FP,TN
100 | 
101 | def get_batch(data_all,batch_size,shuffle=True):
102 |     data,data_subg = data_all
103 |     assert len(list(set([np.shape(d)[0] for d in data]))) == 1
104 |     num_data = np.shape(data[0])[0]
105 |     indices = list(np.arange(0,num_data))
106 |     if shuffle:
107 |         random.shuffle(indices)
108 |     for i in tqdm(range((num_data // batch_size)+1)):
109 |         select_indices = indices[i*batch_size:(i+1)*batch_size]
110 |         select_subg_indices = [[idx]+indice for idx,select_indice in enumerate(select_indices) for indice in data_subg[select_indice]] # prepend the in-batch index to each sparse adjacency entry
111 |         yield [np.take(d,select_indices,axis=0) for d in data]+[select_subg_indices]
112 | 
113 | def get_trigger_feeddict(model,batch,stage,maxlen,is_train=True):
114 |     if stage=='DMCNN':
115 |         posis,sents,maskls,maskrs,event_types,lexical,_,_,_,_ = batch
116 |         return {model.posis:posis,model.sents:sents,model.maskls:maskls,model.maskrs:maskrs,
117 |                 model._labels:event_types,model.lexical:lexical,model.is_train:is_train}
118 |     else:
119 |         posis,sents,maskls,maskrs,event_types,lexical,pos,ner,trigger_idxs,subg_indices = batch
120 |         subg_vals = [1.0]*len(subg_indices)
121 |         subg_shape = [sents.shape[0],maxlen,maxlen]
122 |         subg = (subg_indices,subg_vals,subg_shape)
123 | 
124 |         gather_idxs = np.stack([np.array(np.arange(posis.shape[0])),trigger_idxs],axis=1)
125 |         return {model.posis:posis,model.sents:sents,model.maskls:maskls,model.maskrs:maskrs,
126 |                 model._labels:event_types,model.lexical:lexical,model.is_train:is_train,
127 |                 model.pos_idx:pos,model.ner_idx:ner,model.subg_a:subg,model.gather_idxs:gather_idxs}
128 | 
129 | 
130 | #GAT util function
131 | 
132 | def u_compute(ps,subg,maxlen):
133 |     with tf.variable_scope("e_compute",reuse=tf.AUTO_REUSE):
134 |         att = tf.layers.dense(ps,constant.Watt_dim,name='Watt')
135 |         left_comb = tf.layers.dense(att,1,name='comb_left')
136 |         right_comb = tf.layers.dense(att,1,name='comb_right')
137 | 
138 |         tile_left = tf.tile(left_comb,[1,1,maxlen],name='tile_1')
139 |         tile_right = tf.tile(tf.transpose(right_comb,[0,2,1],name='transpose_1'),[1,maxlen,1],name='tile_2')
140 |         tiles_concat = tile_left+tile_right # e_ij = a_left . Wh_i + a_right . Wh_j
141 | 
142 |         e_mat = tf.nn.leaky_relu(tiles_concat,alpha=constant.leaky_alpha,name='lrelu_1')
143 |     with tf.variable_scope('u_compute',reuse=tf.AUTO_REUSE):
144 |         u_raw = tf.multiply(e_mat,subg,name='mul_1')-(1-subg)*1e8 # mask non-edges before the softmax
145 |         u_mat = tf.nn.softmax(u_raw,axis=2,name='soft_1')
146 |     return u_mat
147 | 
148 | def GAC_func(ps,subg,maxlen,a,k):
149 |     with tf.variable_scope("GAC_compute",reuse=tf.AUTO_REUSE):
150 |         u_mat = u_compute(ps,subg,maxlen)
151 |         weight_name = a+'_'+str(k)
152 |         dense = tf.layers.dense(ps,constant.graph_dim,name=weight_name)
153 |         # broadcast instead of tiling to avoid materializing [batch,maxlen,maxlen,graph_dim] tensors:
154 |         # element (b,i,j,d) = u_mat[b,i,j] * dense[b,j,d], summed over the neighbour axis j below
155 |         dense_expand = tf.expand_dims(dense,1,name='expand_1')
156 |         u_mat_expand = tf.expand_dims(u_mat,3,name='expand_2')
157 |         sums = tf.reduce_sum(tf.multiply(u_mat_expand,dense_expand,name='mul1'),axis=2,name='sum_1')
158 |         graph_emb = tf.nn.elu(sums,name='elu_1')
159 |     return graph_emb
160 | 
161 | def matmuls(a,times): # k-th power of the (batched) adjacency matrix
162 |     with tf.variable_scope('matmuls_'):
163 |         res = a
164 |         for i in range(times-1):
165 |             res = tf.matmul(res,a)
166 |     return res
--------------------------------------------------------------------------------
/baselines/MOGANED/models.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import constant
3 | from func import get_batch,get_trigger_feeddict,f_score,GAC_func,Cudnn_RNN,matmuls
4 | import numpy as np
5 | 
6 | class Trigger_Model():
7 |     def __init__(self,t_data,maxlen,wordemb,stage="MOGANED"):
8 |         self.t_train,self.t_dev,self.t_test = t_data
9 |         self.maxlen = maxlen
10 |         self.wordemb = wordemb
11 |         self.stage = stage
12 |         self.build_graph()
13 | 
14 |     def build_graph(self):
15 |         if self.stage=='DMCNN':
16 |             print('--Building Trigger DMCNN Graph--')
17 |             self.build_trigger()
18 |         else:
19 |             print('--Building Trigger MOGANED Graph--')
20 |             self.build_GAT()
21 | 
22 |     def build_trigger(self,scope='DMCNN_Trigger'):
23 |         maxlen = self.maxlen
24 |         num_class = len(constant.EVENT_TYPE_TO_ID)
25 |         keepprob = constant.t_keepprob
26 |         with tf.variable_scope(scope,reuse=tf.AUTO_REUSE):
27 |             with tf.variable_scope('Initialize'):
28 |                 posi_mat = tf.concat(
29 |                     [tf.zeros([1,constant.posi_embedding_dim],tf.float32),
30 |                     tf.get_variable('posi_emb',[2*maxlen,constant.posi_embedding_dim],tf.float32,initializer=tf.contrib.layers.xavier_initializer())],axis=0)
31 |                 word_mat = tf.concat([
32 |                     tf.zeros((1, constant.embedding_dim),dtype=tf.float32),
33 |                     tf.get_variable("unk_word_embedding", [1, constant.embedding_dim], dtype=tf.float32,initializer=tf.contrib.layers.xavier_initializer()),
34 |                     tf.get_variable("wordemb", initializer=self.wordemb,trainable=True)], axis=0)
35 | 
36 |             with tf.variable_scope('placeholder'):
37 |                 self.sents = sents = tf.placeholder(tf.int32,[None,maxlen],'sents')
38 |                 self.posis = posis = tf.placeholder(tf.int32,[None,maxlen],'posis')
39 |                 self.maskls = maskls = tf.placeholder(tf.float32,[None,maxlen],'maskls')
40 |                 self.maskrs = maskrs = tf.placeholder(tf.float32,[None,maxlen],'maskrs')
41 |                 self._labels = _labels = tf.placeholder(tf.int32,[None],'labels')
42 |                 labels = tf.one_hot(_labels,num_class)
43 |                 self.is_train = is_train = tf.placeholder(tf.bool,[],'is_train')
44 |                 self.lexical = lexical = tf.placeholder(tf.int32,[None,3],'lexicals')
45 | 
46 |             sents_len = tf.reduce_sum(tf.cast(tf.cast(sents,tf.bool),tf.int32),axis=1)
47 |             sents_mask = tf.expand_dims(tf.sequence_mask(sents_len,maxlen,tf.float32),axis=2)
48 |             with tf.variable_scope('embedding'):
49 |                 sents_emb = tf.nn.embedding_lookup(word_mat,sents)
50 |                 posis_emb = tf.nn.embedding_lookup(posi_mat,posis)
51 |                 lexical_emb = tf.nn.embedding_lookup(word_mat,lexical)
52 |             with tf.variable_scope('lexical_feature'):
53 |                 lexical_feature = tf.reshape(lexical_emb,[-1,3*constant.embedding_dim])
54 |             with tf.variable_scope('encoder'):
55 |                 emb = tf.concat([sents_emb,posis_emb],axis=2)
56 |                 emb_shape = tf.shape(emb)
57 |                 pad = tf.zeros([emb_shape[0],1,emb_shape[2]],tf.float32)
58 |                 conv_input = tf.concat([pad,emb,pad],axis=1)
59 |                 conv_res = tf.layers.conv1d(
60 |                     inputs=conv_input,
61 |                     filters=constant.t_filters, kernel_size=3,
62 |                     strides=1,
63 |                     padding='valid',
64 |                     activation=tf.nn.relu,
65 |                     kernel_initializer=tf.contrib.layers.xavier_initializer(),
66 |                     name='convolution_layer')
67 |                 conv_res = tf.reshape(conv_res,[-1,maxlen,constant.t_filters])
68 |             with tf.variable_scope('maxpooling'): # DMCNN dynamic multi-pooling: max-pool left and right of the trigger separately
69 |                 maskl = tf.tile(tf.expand_dims(maskls,axis=2),[1,1,constant.t_filters])
70 |                 left = maskl*conv_res
71 |                 maskr = tf.tile(tf.expand_dims(maskrs,axis=2),[1,1,constant.t_filters])
72 |                 right = maskr*conv_res
73 |                 sentence_feature = tf.concat([tf.reduce_max(left,axis=1),tf.reduce_max(right,axis=1)],axis=1)
74 |             with tf.variable_scope('classifier'):
75 |                 feature = tf.concat([sentence_feature,lexical_feature],axis=1)
76 |                 feature = tf.layers.dropout(feature,1-constant.t_keepprob,training=is_train)
77 |                 self.logits = logits = tf.layers.dense(feature,num_class,kernel_initializer=tf.contrib.layers.xavier_initializer(),bias_initializer=tf.contrib.layers.xavier_initializer())
78 |                 self.pred = pred = tf.nn.softmax(logits,axis=1)
79 |                 self.pred_label = pred_label = tf.argmax(pred,axis=1)
80 |                 self.loss = loss = 
tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,logits=logits),axis=0) 81 | self.train_op = train_op = tf.train.AdamOptimizer(constant.t_lr).minimize(loss) 82 | 83 | def build_GAT(self,scope='MOGANED_Trigger'): 84 | maxlen = self.maxlen 85 | num_class = len(constant.EVENT_TYPE_TO_ID) 86 | keepprob = constant.t_keepprob 87 | with tf.variable_scope(scope,reuse=tf.AUTO_REUSE): 88 | with tf.variable_scope('Initialize'): 89 | posi_mat = tf.concat( 90 | [tf.zeros([1,constant.posi_embedding_dim],tf.float32), 91 | tf.get_variable('posi_emb',[2*maxlen,constant.posi_embedding_dim],tf.float32,initializer=tf.contrib.layers.xavier_initializer())],axis=0) 92 | word_mat = tf.concat([ 93 | tf.zeros((1, constant.embedding_dim),dtype=tf.float32), 94 | tf.get_variable("unk_word_embedding", [1, constant.embedding_dim], dtype=tf.float32,initializer=tf.contrib.layers.xavier_initializer()), 95 | tf.get_variable("wordemb", initializer=self.wordemb,trainable=True)], axis=0) 96 | pos_mat = tf.concat([ 97 | tf.zeros((1, constant.pos_dim),dtype=tf.float32), 98 | tf.get_variable("pos_embedding", [len(constant.POS_TO_ID)-1, constant.pos_dim], dtype=tf.float32,initializer=tf.contrib.layers.xavier_initializer())],axis=0) 99 | ner_mat = tf.concat([ 100 | tf.zeros((1, constant.ner_dim),dtype=tf.float32), 101 | tf.get_variable("ner_embedding", [len(constant.NER_TO_ID)-1, constant.ner_dim], dtype=tf.float32,initializer=tf.contrib.layers.xavier_initializer())],axis=0) 102 | 103 | with tf.variable_scope("Placeholder"): 104 | self.sents = sents = tf.placeholder(tf.int32,[None,maxlen],'sents') 105 | self.posis = posis = tf.placeholder(tf.int32,[None,maxlen],'posis') 106 | self.maskls = maskls = tf.placeholder(tf.float32,[None,maxlen],'maskls') 107 | self.maskrs = maskrs = tf.placeholder(tf.float32,[None,maxlen],'maskrs') 108 | self._labels = _labels = tf.placeholder(tf.int32,[None],'labels') 109 | labels = tf.one_hot(_labels,num_class) 110 | self.is_train = is_train = tf.placeholder(tf.bool,[],'is_train') 111 | self.lexical = lexical = tf.placeholder(tf.int32,[None,3],'lexicals') 112 | 113 | self.ner_idx = ner_idx = tf.placeholder(tf.int32,[None,maxlen],'ner_tags') 114 | self.pos_idx = pos_idx = tf.placeholder(tf.int32,[None,maxlen],'pos_tags') 115 | 116 | self.subg_a = tf.sparse_placeholder(tf.float32,[None,maxlen,maxlen],'subg') 117 | 118 | self.subg_b = tf.sparse_transpose(self.subg_a,[0,2,1]) 119 | 120 | subg_a = tf.sparse_tensor_to_dense(self.subg_a,validate_indices=False) 121 | subg_b = tf.sparse_tensor_to_dense(self.subg_b,validate_indices=False) 122 | 123 | self.gather_idxs = tf.placeholder(tf.int32,[None,2],'gather_idxs') 124 | 125 | sents_len = tf.reduce_sum(tf.cast(tf.cast(sents,tf.bool),tf.int32),axis=1) 126 | sents_mask = tf.expand_dims(tf.sequence_mask(sents_len,maxlen,tf.float32),axis=2) 127 | 128 | eyes = tf.tile(tf.expand_dims(tf.eye(maxlen),0),[tf.shape(pos_idx)[0],1,1]) 129 | 130 | with tf.variable_scope("Embedding"): 131 | sents_emb = tf.nn.embedding_lookup(word_mat,sents) 132 | posis_emb = tf.nn.embedding_lookup(posi_mat,posis) 133 | pos_emb = tf.nn.embedding_lookup(pos_mat,pos_idx) 134 | ner_emb = tf.nn.embedding_lookup(ner_mat,ner_idx) 135 | concat_emb = tf.concat([sents_emb,posis_emb,pos_emb,ner_emb],axis=2) 136 | 137 | with tf.variable_scope("Lstm_layer"): 138 | rnn = Cudnn_RNN(num_layers=1, num_units=constant.hidden_dim, keep_prob=keepprob, is_train=self.is_train) 139 | ps,_ = rnn(concat_emb, seq_len=sents_len, concat_layers=False,keep_prob=keepprob,is_train=self.is_train) 140 | 
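# MOGANED aggregation (implemented in the GAC/Aggregation scopes below):
# for each order k = 1..K, GAC_func attention-aggregates the BiLSTM states ps
# over the k-th power of the dependency adjacency matrix (subg_a), its
# transpose (subg_b, reversed edges), and the identity (self-loops), and the
# three directions are summed. The K per-order representations are then
# combined with learned attention weights vs, and gather_idxs picks out the
# representation at each candidate trigger position for classification.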
141 | with tf.variable_scope("GAC"): 142 | hs = [] 143 | for layer in range(1,constant.K+1): 144 | h_layer= GAC_func(ps,matmuls(subg_a,layer),maxlen,'a',layer)+GAC_func(ps,matmuls(subg_b,layer),maxlen,'b',layer)+GAC_func(ps,eyes,maxlen,'c',layer) 145 | hs.append(h_layer) 146 | 147 | with tf.variable_scope("Aggregation"): 148 | s_ctxs = [] 149 | for layer in range(1,constant.K+1): 150 | s_raw = tf.layers.dense(hs[layer-1],constant.s_dim,name='Wawa') 151 | s_layer = tf.nn.tanh(s_raw) 152 | ctx_apply = tf.layers.dense(s_layer,1,name='ctx',use_bias=False) 153 | s_ctxs.append(ctx_apply) 154 | vs = tf.nn.softmax(tf.concat(s_ctxs,axis=2),axis=2) #[None,maxlen,3] 155 | h_concats = tf.concat([tf.expand_dims(hs[layer],2) for layer in range(constant.K)],axis=2) 156 | final_h = tf.reduce_sum(tf.multiply(tf.expand_dims(vs,3),h_concats),axis=2) 157 | gather_final_h = tf.gather_nd(final_h,self.gather_idxs) 158 | 159 | with tf.variable_scope('classifier'): 160 | bias_weight = (constant.t_bias_lambda-1)*(1-tf.cast(tf.equal(_labels,0),tf.float32))+1 161 | self.logits = logits = tf.layers.dense(gather_final_h,num_class,kernel_initializer=tf.contrib.layers.xavier_initializer(),bias_initializer=tf.contrib.layers.xavier_initializer(),name='Wo') 162 | self.pred = pred = tf.nn.softmax(logits,axis=1) 163 | self.pred_label = pred_label = tf.argmax(pred,axis=1) 164 | self.loss = loss = tf.reduce_sum(bias_weight*tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,logits=logits),axis=0)/tf.reduce_sum(bias_weight,axis=0) 165 | self.train_op = train_op = tf.train.AdamOptimizer(constant.t_lr).minimize(loss) 166 | 167 | 168 | def train_trigger(self): 169 | train,dev,test = self.t_train,self.t_dev,self.t_test 170 | saver = tf.train.Saver() 171 | maxlen = self.maxlen 172 | print('--Training Trigger--') 173 | with tf.Session() as sess: 174 | sess.run(tf.global_variables_initializer()) 175 | devbest = 0 176 | testbest = (0,0,0) 177 | from tqdm import tqdm 178 | for epoch in tqdm(range(constant.t_epoch)): 179 | loss_list =[] 180 | for batch in get_batch(train,constant.t_batch_size,True): 181 | loss,_ = sess.run([self.loss,self.train_op],feed_dict=get_trigger_feeddict(self,batch,self.stage,maxlen)) 182 | loss_list.append(loss) 183 | print('epoch:{}'.format(str(epoch))) 184 | print('loss:',np.mean(loss_list)) 185 | 186 | pred_labels = [] 187 | for batch in get_batch(dev,constant.t_batch_size,False): 188 | pred_label = sess.run(self.pred_label,feed_dict=get_trigger_feeddict(self,batch,self.stage,maxlen,is_train=False)) 189 | pred_labels.extend(list(pred_label)) 190 | golds = list(dev[0][4]) 191 | dev_p,dev_r,dev_f = f_score(pred_labels,golds) 192 | print("dev_Precision: {} dev_Recall:{} dev_F1:{}".format(str(dev_p),str(dev_r),str(dev_f))) 193 | 194 | if dev_f>devbest: 195 | devbest = dev_f 196 | testbest = (dev_p, dev_r, dev_f) 197 | saver.save(sess,"saved_models/trigger.ckpt") 198 | test_p, test_r, test_f = testbest 199 | print("dev best Precision: {} dev best Recall:{} dev best F1:{}".format(str(test_p), str(test_r), str(test_f))) 200 | 201 | def eval_trigger(self): 202 | test = self.t_test 203 | saver = tf.train.Saver() 204 | maxlen = self.maxlen 205 | from collections import defaultdict 206 | import json 207 | results = defaultdict(list) 208 | print('--Eval Trigger--') 209 | with tf.Session() as sess: 210 | saver.restore(sess,"saved_models/trigger.ckpt") 211 | pred_labels = [] 212 | for batch in get_batch(test,constant.t_batch_size,False): 213 | pred_label = 
sess.run(self.pred_label,feed_dict=get_trigger_feeddict(self,batch,self.stage,maxlen,is_train=False)) 214 | pred_labels.extend(list(pred_label)) 215 | with open('{}/id_align.json'.format(constant.maven_path),'r') as f: 216 | ids = json.load(f) 217 | with open('test_idxs.json','r') as f: 218 | test_idxs = json.load(f) 219 | test_idxs = {test_idx:idx for idx,test_idx in enumerate(test_idxs)} 220 | assert len(test_idxs)==len(pred_labels) 221 | for idx in range(len(ids)): 222 | id_ = ids[idx] 223 | if idx in test_idxs: 224 | label = pred_labels[test_idxs[idx]] 225 | else: 226 | label = 0 227 | results[id_[0]].append({'id':id_[1],'type_id':int(label)}) 228 | with open('results.jsonl','w') as f: 229 | for key,val in results.items(): 230 | f.write(json.dumps({'id':key,'predictions':val})+'\n') 231 | print("--Eval Finish--") 232 | 233 | -------------------------------------------------------------------------------- /baselines/MOGANED/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import utils 3 | from models import Trigger_Model 4 | import os 5 | from constant import * 6 | 7 | flags = tf.flags 8 | flags.DEFINE_string("gpu", "1", "The GPU to run on") 9 | flags.DEFINE_string("mode", "MOGANED", "DMCNN or MOGANED") 10 | flags.DEFINE_bool('eval', False, "Eval or Train") 11 | 12 | def main(_): 13 | config = flags.FLAGS 14 | os.environ['CUDA_VISIBLE_DEVICES'] = config.gpu 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 16 | extractor = utils.Extractor() 17 | extractor.extract() 18 | loader = utils.Loader(cut_len) 19 | t_data = loader.load_trigger() 20 | trigger = Trigger_Model(t_data,loader.maxlen,loader.wordemb,config.mode) 21 | if not config.eval: 22 | trigger.train_trigger() 23 | else: 24 | trigger.eval_trigger() 25 | 26 | if __name__=="__main__": 27 | tf.app.run() -------------------------------------------------------------------------------- /baselines/MOGANED/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import constant 3 | from xml.dom.minidom import parse 4 | from tqdm import tqdm 5 | import re 6 | import random 7 | import json 8 | import numpy as np 9 | import copy 10 | from stanfordcorenlp import StanfordCoreNLP 11 | 12 | class Extractor(): 13 | def __init__(self): 14 | pass 15 | 16 | def preprocess(self): 17 | splits = {'train':'train','valid':'dev','test':'test'} 18 | path = constant.maven_path 19 | nlp = StanfordCoreNLP(constant.corenlp_path) 20 | mention_ids = [] 21 | for split in tqdm(splits): 22 | split_data = [] 23 | with open(path+'/'+split+'.jsonl','r') as f: 24 | line = f.readline().rstrip() 25 | while line: 26 | doc = json.loads(line) 27 | content = doc['content'] 28 | for sent_tuple in content: 29 | origin_sent,origin_tokens = sent_tuple['sentence'],sent_tuple['tokens'] 30 | 31 | parse_sent = ' '.join(sent_tuple['tokens']) 32 | nlp_words,nlp_span = nlp.word_tokenize(parse_sent,True) 33 | nlp_span_dict = {e[0]:i for i,e in enumerate(nlp_span)} 34 | origin_span = {i:len(' '.join(origin_tokens[:i]))+1 for i in range(1,len(origin_tokens))} 35 | origin_span[0] = 0 36 | sent_tuple['origin_span'] = origin_span 37 | sent_tuple['nlp_span_dict'] = nlp_span_dict 38 | sent_tuple['nlp_words'] = nlp_words 39 | 40 | dependency_parsing =nlp.dependency_parse(parse_sent) 41 | pos_tags = [e[1] for e in nlp.pos_tag(parse_sent)] 42 | ner_tags = [e[1] for e in nlp.ner(parse_sent)] 43 | sent_tuple['ner'] = ner_tags 44 | sent_tuple['pos'] = pos_tags 45 | 
sent_tuple['dependency'] = dependency_parsing 46 | if split!='test': 47 | for event in doc['events']: 48 | event_type = event['type_id'] 49 | assert isinstance(event_type,int) 50 | assert event_type<169 51 | # if event_type==207: 52 | # continue 53 | for mention in event['mention']: 54 | trigger = mention['trigger_word'].lower() 55 | offset = mention['offset'] 56 | tokens = content[mention['sent_id']]['tokens'] 57 | 58 | origin_span = content[mention['sent_id']]['origin_span'] 59 | nlp_span_dict = content[mention['sent_id']]['nlp_span_dict'] 60 | nlp_words = content[mention['sent_id']]['nlp_words'] 61 | if origin_span[offset[0]] not in nlp_span_dict: 62 | real_offset = offset[0] 63 | else: 64 | real_offset = nlp_span_dict[origin_span[offset[0]]] 65 | 66 | mention_ids.append((mention['id'],event_type)) 67 | 68 | info = {'tokens':nlp_words, 69 | 'trigger_tokens':[nlp_words[real_offset]], 70 | 'ner_tags':content[mention['sent_id']]['ner'], 71 | 'pos_tags':content[mention['sent_id']]['pos'], 72 | 'dependency_parsing':content[mention['sent_id']]['dependency'], 73 | 'trigger_start':real_offset, 74 | 'trigger_end':real_offset, 75 | 'event_type':event_type} 76 | split_data.append(info) 77 | negative_triggers = 'negative_triggers' 78 | if split=='test': 79 | negative_triggers = 'candidates' 80 | for mention in doc[negative_triggers]: 81 | trigger = mention['trigger_word'].lower() 82 | offset = mention['offset'] 83 | tokens = content[mention['sent_id']]['tokens'] 84 | mention_ids.append((mention['id'],0)) 85 | origin_span = content[mention['sent_id']]['origin_span'] 86 | nlp_span_dict = content[mention['sent_id']]['nlp_span_dict'] 87 | nlp_words = content[mention['sent_id']]['nlp_words'] 88 | if origin_span[offset[0]] not in nlp_span_dict: 89 | real_offset = offset[0] 90 | else: 91 | real_offset = nlp_span_dict[origin_span[offset[0]]] 92 | 93 | info = {'tokens':nlp_words, 94 | 'trigger_tokens':[nlp_words[real_offset]], 95 | 'ner_tags':content[mention['sent_id']]['ner'], 96 | 'pos_tags':content[mention['sent_id']]['pos'], 97 | 'dependency_parsing':content[mention['sent_id']]['dependency'], 98 | 'trigger_start':real_offset, 99 | 'trigger_end':real_offset, 100 | 'event_type':0} 101 | split_data.append(info) 102 | 103 | line = f.readline().rstrip() 104 | with open(path+'/'+splits[split]+'.json','w') as f: 105 | json.dump(split_data,f) 106 | 107 | nlp.close() 108 | 109 | def id_align(self): 110 | ids = [] 111 | 112 | with open('{}/test.jsonl'.format(constant.maven_path),'r') as f: 113 | line = f.readline().rstrip() 114 | while line: 115 | doc = json.loads(line) 116 | doc_id = doc['id'] 117 | for mention in doc['candidates']: 118 | trigger_id = mention['id'] 119 | ids.append((doc_id,trigger_id)) 120 | line = f.readline().rstrip() 121 | 122 | with open('{}/id_align.json'.format(constant.maven_path),'w') as f: 123 | json.dump(ids,f) 124 | 125 | def extract(self): 126 | if not os.path.exists(constant.maven_path+'/train.json'): 127 | print('----Preprocessing----') 128 | self.preprocess() 129 | else: 130 | print("--Preprocessed files exist--") 131 | if not os.path.exists(constant.maven_path+'/id_align.json'): 132 | print('----Id Aligning----') 133 | self.id_align() 134 | 135 | class Loader(): 136 | def __init__(self,cut_len): 137 | self.train_path = constant.maven_path+'/train.json' 138 | self.dev_path = constant.maven_path+'/dev.json' 139 | self.test_path = constant.maven_path+'/test.json' 140 | self.glove_path = constant.GloVe_file 141 | self.cut_len = cut_len 142 | 143 | def load_embedding(self): 144 | 
word2idx = {}
145 |         wordemb = []
146 |         with open(self.glove_path,'r',encoding='utf-8') as f:
147 |             for line in f:
148 |                 splt = line.split()
149 |                 assert len(splt)==constant.embedding_dim+1
150 |                 vector = list(map(float, splt[-constant.embedding_dim:]))
151 |                 word = splt[0]
152 |                 word2idx[word] = len(word2idx)+2 # 0 is reserved for padding, 1 for unknown words
153 |                 wordemb.append(vector)
154 |         return word2idx,np.asarray(wordemb,np.float32)
155 | 
156 |     def get_maxlen(self):
157 |         if self.cut_len is not None:
158 |             self.maxlen = self.cut_len
159 |             return self.maxlen
160 |         paths = [self.train_path,self.dev_path,self.test_path]
161 |         maxlens = []
162 |         for path in paths:
163 |             with open(path,'r') as f:
164 |                 data = json.load(f)
165 |                 _maxlen = max([len(d['tokens']) for d in data])
166 |                 maxlens.append(_maxlen)
167 |         self.maxlen = max(maxlens)
168 |         return self.maxlen
169 | 
170 |     def get_max_argument_len(self): # only meaningful for data with entity annotations; unused in this MAVEN pipeline
171 |         paths = [self.train_path,self.dev_path,self.test_path]
172 |         maxlens = []
173 |         for path in paths:
174 |             with open(path,'r') as f:
175 |                 data = json.load(f)
176 |                 for instance in data:
177 |                     if len(instance['entities'])==0:
178 |                         continue
179 |                     _maxlen = max([entity['idx_end']+1-entity['idx_start'] for entity in instance['entities']])
180 |                     maxlens.append(_maxlen)
181 |         self.max_argument_len = max(maxlens)
182 |         return self.max_argument_len
183 | 
184 |     def get_positions(self,start_idx,sent_len,maxlen):
185 |         return list(range(maxlen-start_idx, maxlen)) + [maxlen] + \
186 |             list(range(maxlen+1, maxlen+sent_len - start_idx))+[0]*(maxlen-sent_len)
187 | 
188 |     def get_word(self,tokens,word2idx,pad_length):
189 |         idx = []
190 |         for word in tokens:
191 |             if word.lower() in word2idx:
192 |                 idx.append(word2idx[word.lower()])
193 |             else:
194 |                 idx.append(1)
195 |         idx += [0]*(pad_length-len(idx))
196 |         return idx
197 | 
198 |     def get_trigger_mask(self,posi,sent_len,maxlen,direction):
199 |         assert direction in ['left','right']
200 |         mask = [0.]*maxlen
201 |         if direction=='left':
202 |             mask[:posi] = [1.]*posi
203 |         else:
204 |             mask[posi:sent_len] = [1.]*(sent_len-posi)
205 |         return mask
206 | 
207 |     def load_one_trigger(self,path,maxlen,word2idx):
208 |         trigger_posis,sents,trigger_maskls,trigger_maskrs,event_types,trigger_lexical= [], [], [], [], [], []
209 |         with open(path,'r') as f:
210 |             data = json.load(f)
211 | 
212 |         indices_s,pos,ner = [],[],[]
213 |         trigger_idxs = []
214 | 
215 |         test_idxs = []
216 | 
217 | 
218 |         for test_idx,instance in enumerate(data):
219 |             tokens = instance['tokens'][:maxlen]
220 |             event_type = instance['event_type']
221 |             trigger_posi = instance['trigger_start']
222 |             if trigger_posi>maxlen-1:
223 |                 continue
224 |             ner_tags = [constant.NER_TO_ID[e] if e in constant.NER_TO_ID else 1 for e in instance['ner_tags']][:maxlen]+[0]*(maxlen-len(instance['ner_tags']))
225 |             pos_tags = [constant.POS_TO_ID[e] if e in constant.POS_TO_ID else 1 for e in instance['pos_tags']][:maxlen]+[0]*(maxlen-len(instance['pos_tags']))
226 |             ner.append(ner_tags)
227 |             pos.append(pos_tags)
228 | 
229 |             words = self.get_word(tokens,word2idx,maxlen)
230 |             dependency_parsing = instance['dependency_parsing']
231 | 
232 |             start_word = 0 # CoreNLP may split the tokens into several sentences; each ROOT edge starts a new one, so its 1-based indices are offset by start_word
233 |             current_max = 0
234 |             indices = []
235 |             for edge in dependency_parsing:
236 |                 if edge[0]=="ROOT":
237 |                     start_word = max(start_word,current_max)
238 |                 else:
239 |                     if edge[1]-1+start_word>maxlen-1 or edge[2]-1+start_word>maxlen-1:
240 |                         continue
241 |                     indices.append([edge[1]-1+start_word,edge[2]-1+start_word])
242 |                     current_max = max([current_max,edge[1]+start_word,edge[2]+start_word])
243 |             indices_s.append(indices)
244 | 
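# Feature construction for each candidate (continues below): get_positions
# encodes every token's offset relative to the trigger as an index in
# [1, 2*maxlen), with 0 reserved for padding; maskls/maskrs split the
# sentence at the trigger for DMCNN-style dynamic multi-pooling; and the
# lexical feature is the word ids of the trigger and its two neighbours.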
245 |             trigger_posis.append(self.get_positions(trigger_posi,len(tokens),maxlen))
246 |             trigger_idxs.append(trigger_posi)
247 |             sents.append(words)
248 |             trigger_maskls.append(self.get_trigger_mask(trigger_posi,len(tokens),maxlen,'left'))
249 |             trigger_maskrs.append(self.get_trigger_mask(trigger_posi, len(tokens),maxlen, 'right'))
250 |             event_types.append(constant.EVENT_TYPE_TO_ID[event_type])
251 | 
252 |             _trigger_lexical = []
253 |             if trigger_posi==0:
254 |                 _trigger_lexical.append(0)
255 |             else:
256 |                 _trigger_lexical.append(words[trigger_posi-1])
257 | 
258 |             _trigger_lexical.append(words[trigger_posi])
259 | 
260 |             if trigger_posi==len(tokens)-1:
261 |                 _trigger_lexical.append(0)
262 |             else:
263 |                 _trigger_lexical.append(words[trigger_posi+1])
264 | 
265 |             trigger_lexical.append(_trigger_lexical)
266 |             test_idxs.append(test_idx)
267 |         if path.endswith('test.json'):
268 |             with open('test_idxs.json','w') as f:
269 |                 json.dump(test_idxs,f)
270 |         return (np.array(trigger_posis,np.int32),np.array(sents,np.int32),np.array(trigger_maskls,np.int32),\
271 |             np.array(trigger_maskrs,np.int32),np.array(event_types,np.int32),np.array(trigger_lexical,np.int32),\
272 |             np.array(pos,np.int32),np.array(ner,np.int32),np.array(trigger_idxs,np.int32)),indices_s
273 | 
274 |     def load_trigger(self):
275 |         print('--Loading Trigger--')
276 |         word2idx,self.wordemb = self.load_embedding()
277 |         maxlen = self.get_maxlen()
278 |         paths = [self.train_path, self.dev_path, self.test_path]
279 |         results = []
280 |         for path in paths:
281 |             result = self.load_one_trigger(path,maxlen,word2idx)
282 |             results.append(result)
283 |         return results
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import os
4 | import os.path
5 | import json
6 | import numpy as np
7 | from sklearn.metrics import f1_score,precision_score,recall_score
8 | 
9 | input_dir = sys.argv[1] # CodaLab layout: <input_dir>/res/results.jsonl (submission), <input_dir>/ref/test_gold.jsonl (gold)
10 | output_dir = sys.argv[2]
11 | 
12 | submit_dir = os.path.join(input_dir, 'res')
13 | truth_dir = os.path.join(input_dir, 'ref')
14 | 
15 | if not os.path.isdir(submit_dir):
16 |     print("%s doesn't exist" % submit_dir)
17 | 
18 | if os.path.isdir(submit_dir) and os.path.isdir(truth_dir):
19 |     if not os.path.exists(output_dir):
20 |         os.makedirs(output_dir)
21 | 
22 |     output_filename = os.path.join(output_dir, 'scores.txt')
23 |     output_file = open(output_filename, 'w') # text mode: the scores below are written as strings
24 | 
25 |     truth_file = os.path.join(truth_dir, "test_gold.jsonl")
26 |     truth = open(truth_file, "r")
27 | 
28 |     submission_answer_file = os.path.join(submit_dir, "results.jsonl")
29 |     submission_answer = open(submission_answer_file, "r")
30 |     preds_map=dict()
31 |     ans_lines = submission_answer.readlines()
32 |     for line in ans_lines:
33 |         data=json.loads(line)
34 |         tmp=dict()
35 |         for mention in data['predictions']:
36 |             tmp[mention['id']]=mention['type_id']
37 |         preds_map[data['id']]=tmp
38 | 
39 |     ref_lines=truth.readlines()
40 |     labels=[]
41 |     preds=[]
42 |     for line in ref_lines:
43 |         data=json.loads(line)
44 |         pred_tmp=preds_map[data['id']] if data['id'] in preds_map else dict()
45 |         if not pred_tmp: # debug: the submission is missing this document
46 |             print("missing document",data['id'])
47 |         for event in data['events']:
48 |             for mention in event['mention']:
49 |                 if mention['id'] in pred_tmp:
50 |                     preds.append(pred_tmp[mention['id']])
51 |                 else:
52 |                     preds.append(0)
53 |                     print("missing mention",mention['id'])
54 |                 labels.append(event['type_id'])
55 |         for mention in data['negative_triggers']:
56 |             if mention['id'] in pred_tmp:
57 |                 preds.append(pred_tmp[mention['id']])
58 |             else:
59 |                 preds.append(0)
60 |                 print("missing mention",mention['id'])
61 |             labels.append(0)
62 |     assert len(labels)==len(preds)
63 | 
64 |     #calculate scores
65 |     pos_labels=list(range(1,169)) # event types 1-168; 0 (None) is excluded from scoring
66 |     labels=np.array(labels)
67 |     preds=np.array(preds)
68 |     micro_p=precision_score(labels,preds,labels=pos_labels,average='micro')*100.0
69 |     micro_r=recall_score(labels,preds,labels=pos_labels,average='micro')*100.0
70 |     micro_f1=f1_score(labels,preds,labels=pos_labels,average='micro')*100.0
71 | 
72 |     macro_p=precision_score(labels,preds,labels=pos_labels,average='macro')*100.0
73 |     macro_r=recall_score(labels,preds,labels=pos_labels,average='macro')*100.0
74 |     macro_f1=f1_score(labels,preds,labels=pos_labels,average='macro')*100.0
75 | 
76 |     print("Micro_F1:",micro_f1)
77 |     print("Micro_Precision:",micro_p)
78 |     print("Micro_Recall:",micro_r)
79 |     print("Macro_F1:",macro_f1)
80 |     print("Macro_Precision:",macro_p)
81 |     print("Macro_Recall:",macro_r)
82 | 
83 |     output_file.write("Micro_F1: %f\n" % micro_f1)
84 |     output_file.write("Micro_Precision: %f\n" % micro_p)
85 |     output_file.write("Micro_Recall: %f\n" % micro_r)
86 |     output_file.write("Macro_F1: %f\n" % macro_f1)
87 |     output_file.write("Macro_Precision: %f\n" % macro_p)
88 |     output_file.write("Macro_Recall: %f\n" % macro_r)
89 | 
90 |     output_file.close()
91 | 
--------------------------------------------------------------------------------