├── .gitignore
├── DataFormat.md
├── LICENSE
├── README.md
├── baselines
│   ├── BERT+CRF
│   │   ├── BERT-CRF-ACE
│   │   │   ├── bert_crf.py
│   │   │   ├── crf.py
│   │   │   ├── run_ACE.py
│   │   │   ├── run_ACE.sh
│   │   │   └── utils_ACE.py
│   │   ├── BERT-CRF-MAVEN
│   │   │   ├── bert_crf.py
│   │   │   ├── crf.py
│   │   │   ├── run_MAVEN.sh
│   │   │   ├── run_MAVEN_infer.sh
│   │   │   ├── run_maven.py
│   │   │   └── utils_maven.py
│   │   └── README.md
│   ├── DMBERT
│   │   ├── README.md
│   │   ├── get_submission.py
│   │   ├── model.py
│   │   ├── run_ACE.sh
│   │   ├── run_MAVEN.sh
│   │   ├── run_MAVEN_infer.sh
│   │   ├── run_ee.py
│   │   └── utils_ee.py
│   ├── DMCNN_BiLSTM_(CRF)
│   │   ├── .DS_Store
│   │   ├── README.md
│   │   ├── clear.py
│   │   ├── config
│   │   │   ├── bilstm.config
│   │   │   ├── crf.config
│   │   │   └── dmcnn.config
│   │   ├── formatter
│   │   │   ├── BilstmFormatter.py
│   │   │   ├── CrfFormatter.py
│   │   │   ├── DmcnnFormatter.py
│   │   │   └── __init__.py
│   │   ├── main.py
│   │   ├── model
│   │   │   ├── Bilstm.py
│   │   │   ├── Crf.py
│   │   │   ├── Dmcnn.py
│   │   │   ├── __init__.py
│   │   │   └── layers
│   │   │       ├── crf.py
│   │   │       ├── embedding.py
│   │   │       └── outputLayer.py
│   │   ├── raw
│   │   │   └── 100.utf8
│   │   ├── reader
│   │   │   ├── MavenReader.py
│   │   │   └── __init__.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── configparser_hook.py
│   │       ├── evaluation.py
│   │       ├── global_variables.py
│   │       ├── initializer.py
│   │       └── runner.py
│   └── MOGANED
│       ├── README.md
│       ├── constant.py
│       ├── func.py
│       ├── models.py
│       ├── train.py
│       └── utils.py
├── docid2topic.json
└── evaluate.py
/.gitignore: -------------------------------------------------------------------------------- 1 | /baselines/DMCNN&LSTM&CRF/data 2 | /baselines/DMCNN&LSTM&CRF/raw 3 | !/baselines/DMCNN&LSTM&CRF/raw/100.utf8 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ -------------------------------------------------------------------------------- /DataFormat.md: -------------------------------------------------------------------------------- 1 | # MAVEN dataset 2 | 3 | Each `.jsonl` file is a subset of MAVEN, and each line in the files is a JSON string for one document. For `train.jsonl` and `valid.jsonl`, the JSON format is as below: 4 | 5 | ```JSON5 6 | { 7 | "id": "6b2e8c050e30872e49c2f46edb4ac044", // a unique string for each document 8 | "title": "Selma to Montgomery marches", // the title of the document 9 | "content": [ // the content of the document. A list, each item is a dict for a sentence 10 | { 11 | "sentence": "...", // a string, the plain text of the sentence 12 | "tokens": ["...", "..."] // a list, tokens of the sentence 13 | } 14 | ], 15 | "events":[ // a list of annotated events, each item is a dict for an event 16 | { 17 | "id": "75343904ec49aefe12c5749edadb7802", // a unique string for the event 18 | "type": "Arranging", // the event type 19 | "type_id": 70, // the numerical id for the event type 20 | "mention":[ // a list of the event mentions of the event, each item is a dict 21 | { 22 | "id": "2db165c25298aefb682cba50c9327e4f", // a unique string for the event mention 23 | "trigger_word": "organized", // a string of the trigger word or phrase 24 | "sent_id": 1, // the index of the corresponding sentence, starts with 0 25 | "offset": [3, 4], // the offset of the trigger words in the tokens list 26 | } 27 | ] 28 | }, 29 | ], 30 | "negative_triggers":[ // a list of negative instances, each item is a dict for a negative mention 31 | { 32 | "id": "46348f4078ae8460df4916d03573b7de", 33 | "trigger_word": "desire", 34 | "sent_id": 1, 35 | "offset": [10, 11], 36 | }, 37 | ] 38 | } 39 | ``` 40 | 41 | For `test.jsonl`, the format is almost the same, but we hide the gold labels: 42 | 43 | ```JSON5 44 | { 45 | "id": "6b2e8c050e30872e49c2f46edb4ac044", // a unique string for each document 46 | "title": "Selma to Montgomery marches", // the title of the document 47 | "content": [ // the content of the document. A list, each item is a dict for a sentence 48 | { 49 | "sentence": "...", // a string, the plain text of the sentence 50 | "tokens": ["...", "..."] // a list, tokens of the sentence 51 | } 52 | ], 53 | "candidates":[ // a list of trigger candidates, each item is a dict for a trigger or a negative instance; you need to classify the type of each candidate 54 | { 55 | "id": "46348f4078ae8460df4916d03573b7de", 56 | "trigger_word": "desire", 57 | "sent_id": 1, 58 | "offset": [10, 11], 59 | } 60 | ] 61 | } 62 | ``` 63 | 64 | You can submit the prediction results for the test set to [CodaLab](https://competitions.codalab.org/competitions/27320) to get the test results. You need to name your result file `results.jsonl` and compress it into a `.zip` file for submission.
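As a concrete illustration, a minimal sketch for packaging a submission might look like the following (here `predict_type_id` is a hypothetical stand-in for your own model; the exact per-line format is spelled out below):

```python
import json
import zipfile

def predict_type_id(candidate, doc):
    # Hypothetical placeholder for a real model; 0 means "negative instance".
    return 0

with open("test.jsonl") as f_in, open("results.jsonl", "w") as f_out:
    for line in f_in:
        doc = json.loads(line)
        result = {
            "id": doc["id"],
            "predictions": [
                {"id": cand["id"], "type_id": predict_type_id(cand, doc)}
                for cand in doc["candidates"]
            ],
        }
        f_out.write(json.dumps(result) + "\n")

# CodaLab expects results.jsonl compressed into a .zip archive.
with zipfile.ZipFile("submission.zip", "w") as zf:
    zf.write("results.jsonl")
```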
65 | 66 | Each line in the `results.jsonl` should be a JSON string encoding the prediction results for one document. The JSON format is as below: 67 | 68 | ```JSON5 69 | { 70 | "id": "6b2e8c050e30872e49c2f46edb4ac044", // id for the document 71 | "predictions":[ // a list, prediction results for the provided candidates 72 | { 73 | "id": "46348f4078ae8460df4916d03573b7de", // id for the candidate 74 | "type_id": 10, // integer id for the predicted type, 0 for the negative instances 75 | }, 76 | ] 77 | } 78 | ``` 79 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 THU-KEG 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MAVEN-dataset 2 | Source code and dataset for the EMNLP 2020 paper "MAVEN: A Massive General Domain Event Detection Dataset". 3 | 4 | ## Data 5 | 6 | The dataset (ver. 1.0) can be obtained from [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/d/874e0ad810f34272a03b/) or [Google Drive](https://drive.google.com/drive/folders/19Q0lqJE6A98OLnRqQVhbX3e6rG4BVGn8?usp=sharing). The data format is introduced in [this document](DataFormat.md). 7 | 8 | We also release the document topics for data analysis and model development. The [``docid2topic.json``](docid2topic.json) file maps the document ids to their EventWiki topic labels. 9 | 10 | ## CodaLab 11 | 12 | To get the test results, you can submit your predictions to our permanent [CodaLab competition](https://codalab.lisn.upsaclay.fr/competitions/395) (the [older version](https://competitions.codalab.org/competitions/27320) will be phased out soon). For the evaluation method, please refer to the [evaluation script](evaluate.py). 13 | 14 | ## Code 15 | 16 | We release the source code for the baselines, including [DMCNN](baselines/DMCNN_BiLSTM_(CRF)), [BiLSTM](baselines/DMCNN_BiLSTM_(CRF)), [BiLSTM+CRF](baselines/DMCNN_BiLSTM_(CRF)), [MOGANED](baselines/MOGANED) and [DMBERT](baselines/DMBERT). 17 | 18 | ## Citation 19 | 20 | If the data and code help you, please cite this paper:
21 | 22 | ```bib 23 | @inproceedings{wang2020MAVEN, 24 | title={{MAVEN}: A Massive General Domain Event Detection Dataset}, 25 | author={Wang, Xiaozhi and Wang, Ziqi and Han, Xu and Jiang, Wangyi and Han, Rong and Liu, Zhiyuan and Li, Juanzi and Li, Peng and Lin, Yankai and Zhou, Jie}, 26 | booktitle={Proceedings of EMNLP 2020}, 27 | year={2020} 28 | } 29 | ``` 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-ACE/bert_crf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from transformers.modeling_bert import BertModel, BertPreTrainedModel 3 | from transformers.configuration_bert import BertConfig 4 | 5 | from crf import * 6 | from utils_ACE import to_crf_pad, unpad_crf 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 11 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", 12 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", 13 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", 14 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", 15 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", 16 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", 17 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", 18 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", 19 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", 20 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", 21 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", 22 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", 23 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", 24 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", 25 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", 26 | 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", 27 | 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", 28 | 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", 29 | 'bert-base-japanese-char-whole-word-masking': 
"https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", 30 | 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", 31 | 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", 32 | } 33 | 34 | 35 | class BertCRFForTokenClassification(BertPreTrainedModel): 36 | config_class = BertConfig 37 | pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP 38 | base_model_prefix = "bert" 39 | 40 | def __init__(self, config): 41 | super(BertCRFForTokenClassification, self).__init__(config) 42 | self.num_labels = config.num_labels 43 | 44 | self.bert = BertModel(config) 45 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 46 | self.classifier = nn.Linear(config.hidden_size, config.num_labels + 2) 47 | self.crf = CRF(self.num_labels) 48 | 49 | self.init_weights() 50 | 51 | def _get_features(self, input_ids=None, attention_mask=None, token_type_ids=None, 52 | position_ids=None, head_mask=None, inputs_embeds=None): 53 | outputs = self.bert(input_ids, 54 | attention_mask=attention_mask, 55 | token_type_ids=token_type_ids, 56 | position_ids=position_ids, 57 | head_mask=head_mask, 58 | inputs_embeds=inputs_embeds) 59 | 60 | sequence_output = outputs[0] 61 | 62 | sequence_output = self.dropout(sequence_output) 63 | feats = self.classifier(sequence_output) 64 | return feats, outputs 65 | 66 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, 67 | position_ids=None, head_mask=None, inputs_embeds=None, labels=None, pad_token_label_id=None): 68 | 69 | logits, outputs = self._get_features(input_ids, attention_mask, token_type_ids, 70 | position_ids, head_mask, inputs_embeds) 71 | 72 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 73 | if labels is not None: 74 | # loss_fct = nn.CrossEntropyLoss() 75 | pad_mask = (labels != pad_token_label_id) 76 | 77 | # Only keep active parts of the loss 78 | if attention_mask is not None: 79 | # active_loss = attention_mask.view(-1) == 1 80 | # active_logits = logits.view(-1, self.num_labels)[active_loss] 81 | # active_labels = labels.view(-1)[active_loss] 82 | loss_mask = ((attention_mask == 1) & pad_mask) 83 | else: 84 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 85 | loss_mask = ((torch.ones(logits.shape) == 1) & pad_mask) 86 | 87 | crf_labels, crf_mask = to_crf_pad(labels, loss_mask, pad_token_label_id) 88 | crf_logits, _ = to_crf_pad(logits, loss_mask, pad_token_label_id) 89 | 90 | loss = self.crf.neg_log_likelihood(crf_logits, crf_mask, crf_labels) 91 | # removing mask stuff from the output path is done later in my_crf_ner but it should be kept away 92 | # when calculating loss 93 | best_path = self.crf(crf_logits, crf_mask) # (torch.ones(logits.shape) == 1) 94 | best_path = unpad_crf(best_path, crf_mask, labels, pad_mask) 95 | outputs = (loss,) + outputs + (best_path,) 96 | else: 97 | # removing mask stuff from the output path is done later in my_crf_ner but it should be kept away 98 | # when calculating loss 99 | if attention_mask is not None: 100 | mask = (attention_mask == 1) # & (labels!=-100)) 101 | else: 102 | mask = torch.ones(logits.shape).bool() # (labels!=-100) 103 | crf_logits, crf_mask = to_crf_pad(logits, mask, pad_token_label_id) 104 | crf_mask = crf_mask.sum(axis=2) == crf_mask.shape[2] 105 | best_path = 
self.crf(crf_logits, crf_mask) 106 | temp_labels = torch.ones(mask.shape) * pad_token_label_id 107 | best_path = unpad_crf(best_path, crf_mask, temp_labels, mask) 108 | outputs = outputs + (best_path,) 109 | 110 | return outputs 111 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-ACE/crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-12-04 23:19:38 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-05-27 22:48:17 6 | # Modified from original https://github.com/jiesutd/LatticeLSTM/blob/master/model/crf.py 7 | 8 | import torch 9 | import torch.autograd as autograd 10 | import torch.nn as nn 11 | 12 | 13 | # Compute log sum exp in a numerically stable way for the forward algorithm 14 | def log_sum_exp(vec, m_size): 15 | """ 16 | calculate log of exp sum 17 | args: 18 | vec (batch_size, vanishing_dim, hidden_dim) : input tensor 19 | m_size : hidden_dim 20 | return: 21 | batch_size, hidden_dim 22 | """ 23 | _, idx = torch.max(vec, 1) # B * 1 * M 24 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 25 | return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, 26 | m_size) # B * M 27 | 28 | 29 | class CRF(nn.Module): 30 | 31 | def __init__(self, tagset_size, use_gpu=False): # average_batch=False, 32 | super(CRF, self).__init__() 33 | print("build CRF...") 34 | # self.average_batch = average_batch 35 | self.gpu = use_gpu 36 | 37 | self.START_TAG = -2 38 | self.STOP_TAG = -1 39 | 40 | self.tagset_size = tagset_size 41 | 42 | # # We add 2 here, because of START_TAG and STOP_TAG 43 | # self.hidden2tag = nn.Linear(params['hidden_dim'], self.tagset_size + 2) 44 | # # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag 45 | init_transitions = torch.zeros(self.tagset_size + 2, self.tagset_size + 2) 46 | init_transitions[:, self.START_TAG] = -10000.0 47 | init_transitions[self.STOP_TAG, :] = -10000.0 48 | if torch.cuda.is_available(): 49 | init_transitions = init_transitions.cuda() 50 | self.transitions = nn.Parameter(init_transitions, requires_grad=True) 51 | 52 | def init_hidden_cell(self, batch_size, layer_hidden_dim): 53 | return (torch.randn(2, batch_size, layer_hidden_dim // 2), 54 | torch.randn(2, batch_size, layer_hidden_dim // 2)) 55 | 56 | def _calculate_PZ(self, feats, mask): 57 | """ 58 | input: 59 | feats: (batch, seq_len, self.tag_size+2) 60 | masks: (batch, seq_len) 61 | """ 62 | batch_size = feats.size(0) 63 | seq_len = feats.size(1) 64 | tag_size = feats.size(2) 65 | # print feats.view(seq_len, tag_size) 66 | assert (tag_size == self.tagset_size + 2) 67 | mask = mask.transpose(1, 0).contiguous() 68 | ins_num = seq_len * batch_size 69 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 70 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 71 | ## need to consider start 72 | scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 73 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 74 | # build iter 75 | seq_iter = enumerate(scores) 76 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 77 | # only need start from start_tag 78 | partition = inivalues[:, self.START_TAG, 
:].clone().view(batch_size, tag_size, 1) # bat_size * to_target_size 79 | 80 | ## add start score (from start to all tag, duplicate to batch_size) 81 | # partition = partition + self.transitions[START_TAG,:].view(1, tag_size, 1).expand(batch_size, tag_size, 1) 82 | # iter over last scores 83 | for idx, cur_values in seq_iter: 84 | # previous to_target is current from_target 85 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 86 | # cur_values: bat_size * from_target * to_target 87 | 88 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, 89 | tag_size) 90 | cur_partition = log_sum_exp(cur_values, tag_size) 91 | # print cur_partition.data 92 | 93 | # (bat_size * from_target * to_target) -> (bat_size * to_target) 94 | # partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1) 95 | mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size) 96 | 97 | ## effective updated partition part, only keep the partition value of mask value = 1 98 | masked_cur_partition = cur_partition.masked_select(mask_idx) 99 | ## let mask_idx broadcastable, to disable warning 100 | mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1) 101 | 102 | ## replace the partition where the maskvalue=1, other partition value keeps the same 103 | partition.masked_scatter_(mask_idx, masked_cur_partition) 104 | # until the last state, add transition score for all partition (and do log_sum_exp) then select the value in STOP_TAG 105 | cur_values = self.transitions.view(1, tag_size, tag_size).expand(batch_size, tag_size, 106 | tag_size) + partition.contiguous().view( 107 | batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 108 | 109 | cur_partition = log_sum_exp(cur_values, tag_size) 110 | final_partition = cur_partition[:, self.STOP_TAG] 111 | return final_partition.sum(), scores 112 | 113 | def _viterbi_decode(self, feats, mask): 114 | """ 115 | input: 116 | feats: (batch, seq_len, self.tag_size+2) 117 | mask: (batch, seq_len) 118 | output: 119 | decode_idx: (batch, seq_len) decoded sequence 120 | path_score: (batch, 1) corresponding score for each sequence (to be implementated) 121 | """ 122 | batch_size = feats.size(0) 123 | seq_len = feats.size(1) 124 | tag_size = feats.size(2) 125 | assert (tag_size == self.tagset_size + 2) 126 | ## calculate sentence length for each sentence 127 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 128 | ## mask to (seq_len, batch_size) 129 | mask = mask.transpose(1, 0).contiguous() 130 | ins_num = seq_len * batch_size 131 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 132 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 133 | ## need to consider start 134 | scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 135 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 136 | 137 | # build iter 138 | seq_iter = enumerate(scores) 139 | ## record the position of best score 140 | back_points = list() 141 | partition_history = list() 142 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 143 | # mask = 1 + (-1)*mask 144 | # mask = (1 - mask.long()).byte() 145 | mask = ~(mask) 146 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 147 | # only need start from 
start_tag 148 | partition = inivalues[:, self.START_TAG, :].clone().view(batch_size, tag_size) # bat_size * to_target_size 149 | # print "init part:",partition.size() 150 | partition_history.append(partition) 151 | # iter over last scores 152 | for idx, cur_values in seq_iter: 153 | # previous to_target is current from_target 154 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 155 | # cur_values: batch_size * from_target * to_target 156 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, 157 | tag_size) 158 | ## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG 159 | # print "cur value:", cur_values.size() 160 | partition, cur_bp = torch.max(cur_values, 1) 161 | # print "partsize:",partition.size() 162 | # exit(0) 163 | # print partition 164 | # print cur_bp 165 | # print "one best, ",idx 166 | partition_history.append(partition) 167 | ## cur_bp: (batch_size, tag_size) max source score position in current tag 168 | ## set padded label as 0, which will be filtered in post processing 169 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0) 170 | back_points.append(cur_bp) 171 | # exit(0) 172 | ### add score to final STOP_TAG 173 | partition_history = torch.cat(partition_history, 0).view(seq_len, batch_size, -1).transpose(1, 174 | 0).contiguous() ## (batch_size, seq_len. tag_size) 175 | ### get the last position for each setences, and select the last partitions using gather() 176 | last_position = length_mask.view(batch_size, 1, 1).expand(batch_size, 1, tag_size) - 1 177 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, 1) 178 | ### calculate the score from last partition to end state (and then select the STOP_TAG from it) 179 | last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(1, tag_size, 180 | tag_size).expand( 181 | batch_size, tag_size, tag_size) 182 | _, last_bp = torch.max(last_values, 1) 183 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long() 184 | if torch.cuda.is_available(): 185 | pad_zero = pad_zero.cuda() 186 | back_points.append(pad_zero) 187 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size) 188 | 189 | ## select end ids in STOP_TAG 190 | pointer = last_bp[:, self.STOP_TAG] 191 | insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, tag_size) 192 | back_points = back_points.transpose(1, 0).contiguous() 193 | ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values 194 | # print "lp:",last_position 195 | # print "il:",insert_last 196 | back_points.scatter_(1, last_position, insert_last) 197 | # print "bp:",back_points 198 | # exit(0) 199 | back_points = back_points.transpose(1, 0).contiguous() 200 | ## decode from the end, padded position ids are 0, which will be filtered if following evaluation 201 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size)) 202 | if torch.cuda.is_available(): 203 | decode_idx = decode_idx.cuda() 204 | decode_idx[-1] = pointer.data # detach() 205 | for idx in range(len(back_points) - 2, -1, -1): 206 | pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size, 1)) 207 | decode_idx[idx] = pointer.data.t() # feili pointer.detach().view(batch_size) 208 | path_score = None 209 | decode_idx = decode_idx.transpose(1, 0) 210 | return path_score, 
decode_idx 211 | 212 | def forward(self, feats, mask): 213 | path_score, best_path = self._viterbi_decode(feats, mask) 214 | # return path_score, best_path 215 | return best_path 216 | 217 | def _score_sentence(self, scores, mask, tags): 218 | """ 219 | input: 220 | scores: variable (seq_len, batch, tag_size, tag_size) 221 | mask: (batch, seq_len) 222 | tags: tensor (batch, seq_len) 223 | output: 224 | score: sum of score for gold sequences within whole batch 225 | """ 226 | # Gives the score of a provided tag sequence 227 | batch_size = scores.size(1) 228 | seq_len = scores.size(0) 229 | tag_size = scores.size(2) 230 | ## convert tag value into a new format, recorded label bigram information to index 231 | new_tags = autograd.Variable(torch.LongTensor(batch_size, seq_len)) 232 | if torch.cuda.is_available(): 233 | new_tags = new_tags.cuda() 234 | for idx in range(seq_len): 235 | if idx == 0: 236 | ## start -> first score 237 | new_tags[:, 0] = (tag_size - 2) * tag_size + tags[:, 0] 238 | 239 | else: 240 | new_tags[:, idx] = tags[:, idx - 1] * tag_size + tags[:, idx] 241 | 242 | ## transition for label to STOP_TAG 243 | end_transition = self.transitions[:, self.STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size) 244 | ## length for batch, last word position = length - 1 245 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 246 | ## index the label id of last word 247 | end_ids = torch.gather(tags, 1, length_mask - 1) 248 | 249 | ## index the transition score for end_id to STOP_TAG 250 | end_energy = torch.gather(end_transition, 1, end_ids) 251 | 252 | ## convert tag as (seq_len, batch_size, 1) 253 | new_tags = new_tags.transpose(1, 0).contiguous().view(seq_len, batch_size, 1) 254 | ### need convert tags id to search from 400 positions of scores 255 | tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(seq_len, 256 | batch_size) # seq_len * bat_size 257 | ## mask transpose to (seq_len, batch_size) 258 | tg_energy = tg_energy.masked_select(mask.transpose(1, 0)) 259 | 260 | # ## calculate the score from START_TAG to first label 261 | # start_transition = self.transitions[START_TAG,:].view(1, tag_size).expand(batch_size, tag_size) 262 | # start_energy = torch.gather(start_transition, 1, tags[0,:]) 263 | 264 | ## add all score together 265 | # gold_score = start_energy.sum() + tg_energy.sum() + end_energy.sum() 266 | gold_score = tg_energy.sum() + end_energy.sum() 267 | return gold_score 268 | 269 | def neg_log_likelihood(self, feats, mask, tags): 270 | # negative log likelihood 271 | forward_score, scores = self._calculate_PZ(feats, mask) 272 | # print('Forward', forward_score) 273 | gold_score = self._score_sentence(scores, mask, tags) 274 | # print('Gold', gold_score) 275 | # print "batch, f:", forward_score.data[0], " g:", gold_score.data[0], " dis:", forward_score.data[0] - gold_score.data[0] 276 | # exit(0) 277 | # if self.average_batch: 278 | # return (forward_score - gold_score) / batch_size 279 | # else: 280 | return forward_score - gold_score 281 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-ACE/run_ACE.sh: -------------------------------------------------------------------------------- 1 | # --data_dir: path to the preprocessed ACE 2005 dataset 2 | # --output_dir: path to dump checkpoints 3 | python3 run_ACE.py \ 4 | --data_dir ../ACE05/ \ 5 | --model_type bertcrf \ 6 | --model_name_or_path bert-base-uncased \ 7 | --output_dir ./ACE \ 8 | --max_seq_length 128 \ 9 | --do_lower_case \ 10 | --per_gpu_train_batch_size 32 \ 11 | --per_gpu_eval_batch_size 32 \ 12 | --gradient_accumulation_steps 8 \ 13 | --learning_rate 5e-5 \ 14 | --num_train_epochs 10 \ 15 | --save_steps 36 \ 16 | --logging_steps 36 \ 17 | --seed 24 \ 18 | --do_train \ 19 | --do_eval \ 20 | --evaluate_during_training 21 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-ACE/utils_ACE.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Event detection CRF finetuning: utilities to work with ACE 2005 """ 17 | 18 | from __future__ import absolute_import, division, print_function 19 | import json 20 | import logging 21 | import os 22 | from io import open 23 | from transformers import XLMRobertaTokenizer, BertTokenizer, RobertaTokenizer 24 | 25 | from torch.nn.utils.rnn import pad_sequence 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class InputExample(object): 31 | """A single training/test example for token classification.""" 32 | 33 | def __init__(self, guid, words, labels): 34 | """Constructs an InputExample. 35 | 36 | Args: 37 | guid: Unique id for the example. 38 | words: list. The words of the sequence. 39 | labels: (Optional) list. The labels for each word of the sequence. This should be 40 | specified for train and dev examples, but not for test examples.
41 | """ 42 | self.guid = guid 43 | self.words = words 44 | self.labels = labels 45 | 46 | 47 | class InputFeatures(object): 48 | """A single set of features of data.""" 49 | 50 | def __init__(self, input_ids, input_mask, segment_ids, label_ids): 51 | self.input_ids = input_ids 52 | self.input_mask = input_mask 53 | self.segment_ids = segment_ids 54 | self.label_ids = label_ids 55 | 56 | 57 | def read_examples_from_file(data_dir, mode): 58 | file_path = os.path.join(data_dir, "{}.json".format(mode)) 59 | examples = [] 60 | data=json.load(open(file_path, "r")) 61 | words=[] 62 | labels=[] 63 | def getLabel(eT,is_start=False): 64 | if eT=='None': 65 | return 'O' 66 | if is_start: 67 | return "B-"+eT 68 | return "I-"+eT 69 | words=data[0]['tokens'] 70 | labels=['X' for i in range(0,len(words))] 71 | labels[data[0]['trigger_start']]=getLabel(data[0]['event_type'],True) 72 | for j in range(data[0]['trigger_start']+1,data[0]['trigger_end']+1): 73 | labels[j]=getLabel(data[0]['event_type']) 74 | for i in range(1,len(data)): 75 | if data[i]['tokens']!=data[i-1]['tokens']: 76 | examples.append(InputExample(guid="%s-%d"%(mode,i), 77 | words=words, 78 | labels=labels)) 79 | words=data[i]['tokens'] 80 | labels=['X' for i in range(0,len(words))] 81 | labels[data[i]['trigger_start']]=getLabel(data[i]['event_type'],True) 82 | for j in range(data[i]['trigger_start']+1,data[i]['trigger_end']+1): 83 | labels[j]=getLabel(data[i]['event_type']) 84 | examples.append(InputExample(guid="%s-%d"%(mode,len(data)),words=words,labels=labels)) 85 | return examples 86 | 87 | 88 | def convert_examples_to_features(examples, 89 | label_list, 90 | max_seq_length, 91 | tokenizer, 92 | cls_token_at_end=False, 93 | cls_token="[CLS]", 94 | cls_token_segment_id=1, 95 | sep_token="[SEP]", 96 | sep_token_extra=False, 97 | pad_on_left=False, 98 | pad_token=0, 99 | pad_token_segment_id=0, 100 | pad_token_label_id=-100, 101 | sequence_a_segment_id=0, 102 | mask_padding_with_zero=True, 103 | model_name=None): 104 | """ Loads a data file into a list of `InputBatch`s 105 | `cls_token_at_end` define the location of the CLS token: 106 | - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] 107 | - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] 108 | `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) 109 | """ 110 | 111 | label_map = {label: i for i, label in enumerate(label_list)} 112 | 113 | # my logic in crf_padding requires this check. I create mask for crf by labels==pad_token_label_id to not include it 114 | # in loss and decoding 115 | assert pad_token_label_id not in label_map.values() 116 | 117 | features = [] 118 | for (ex_index, example) in enumerate(examples): 119 | if ex_index % 10000 == 0: 120 | print("###############") 121 | logger.info("Writing example %d of %d", ex_index, len(examples)) 122 | print("###############") 123 | 124 | tokens = [] 125 | label_ids = [] 126 | for word, label in zip(example.words, example.labels): 127 | word_tokens = tokenizer.tokenize(word) 128 | tokens.extend(word_tokens) 129 | # Use the real label id for the first token of the word, and padding ids for the remaining tokens 130 | if label!='X': 131 | label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) 132 | else: 133 | label_ids.extend([pad_token_label_id] + [pad_token_label_id] * (len(word_tokens) - 1)) 134 | 135 | # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. 
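# (Tokens and label_ids are truncated together below so the word-level labels stay aligned with their subword pieces.)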
136 | special_tokens_count = 3 if sep_token_extra else 2 137 | if len(tokens) > max_seq_length - special_tokens_count: 138 | tokens = tokens[:(max_seq_length - special_tokens_count)] 139 | label_ids = label_ids[:(max_seq_length - special_tokens_count)] 140 | 141 | # The convention in BERT is: 142 | # (a) For sequence pairs: 143 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 144 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 145 | # (b) For single sequences: 146 | # tokens: [CLS] the dog is hairy . [SEP] 147 | # type_ids: 0 0 0 0 0 0 0 148 | # 149 | # Where "type_ids" are used to indicate whether this is the first 150 | # sequence or the second sequence. The embedding vectors for `type=0` and 151 | # `type=1` were learned during pre-training and are added to the wordpiece 152 | # embedding vector (and position vector). This is not *strictly* necessary 153 | # since the [SEP] token unambiguously separates the sequences, but it makes 154 | # it easier for the model to learn the concept of sequences. 155 | # 156 | # For classification tasks, the first vector (corresponding to [CLS]) is 157 | # used as as the "sentence vector". Note that this only makes sense because 158 | # the entire model is fine-tuned. 159 | tokens += [sep_token] 160 | label_ids += [pad_token_label_id] # [label_map["X"]] 161 | if sep_token_extra: 162 | # roberta uses an extra separator b/w pairs of sentences 163 | tokens += [sep_token] 164 | label_ids += [pad_token_label_id] 165 | segment_ids = [sequence_a_segment_id] * len(tokens) 166 | 167 | if cls_token_at_end: 168 | tokens += [cls_token] 169 | label_ids += [pad_token_label_id] 170 | segment_ids += [cls_token_segment_id] 171 | else: 172 | tokens = [cls_token] + tokens 173 | label_ids = [pad_token_label_id] + label_ids 174 | segment_ids = [cls_token_segment_id] + segment_ids 175 | 176 | if model_name: 177 | if model_name == 'xlm-roberta-base': 178 | tokenizer = XLMRobertaTokenizer.from_pretrained(model_name) 179 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 180 | elif model_name.startswith('bert'): 181 | tokenizer = BertTokenizer.from_pretrained(model_name) 182 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 183 | elif model_name == 'roberta': 184 | tokenizer = RobertaTokenizer.from_pretrained(model_name) 185 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 186 | else: 187 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 188 | 189 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 190 | # tokens are attended to. 191 | input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 192 | 193 | # Zero-pad up to the sequence length. 
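# Padding positions receive pad_token_label_id, so they are later excluded from the CRF loss and decoding via the (labels != pad_token_label_id) mask in bert_crf.py.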
194 | padding_length = max_seq_length - len(input_ids) 195 | if pad_on_left: 196 | input_ids = ([pad_token] * padding_length) + input_ids 197 | input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask 198 | segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids 199 | label_ids = ([pad_token_label_id] * padding_length) + label_ids 200 | else: 201 | input_ids += ([pad_token] * padding_length) 202 | input_mask += ([0 if mask_padding_with_zero else 1] * padding_length) 203 | segment_ids += ([pad_token_segment_id] * padding_length) 204 | label_ids += ([pad_token_label_id] * padding_length) 205 | 206 | assert len(input_ids) == max_seq_length 207 | assert len(input_mask) == max_seq_length 208 | assert len(segment_ids) == max_seq_length 209 | assert len(label_ids) == max_seq_length 210 | 211 | if ex_index < 0: 212 | logger.info("*** Example ***") 213 | logger.info("guid: %s", example.guid) 214 | logger.info("tokens: %s", " ".join([str(x) for x in tokens])) 215 | logger.info("input_ids: %s", " ".join([str(x) for x in input_ids])) 216 | logger.info("input_mask: %s", " ".join([str(x) for x in input_mask])) 217 | logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) 218 | logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) 219 | 220 | features.append( 221 | InputFeatures(input_ids=input_ids, 222 | input_mask=input_mask, 223 | segment_ids=segment_ids, 224 | label_ids=label_ids)) 225 | return features 226 | 227 | 228 | def get_labels(path): 229 | return ["O", "B-Attack", "I-Attack", "B-Transport", "I-Transport", "B-Die", "I-Die", "B-End-Position", "I-End-Position", "B-Meet", "I-Meet", "B-Phone-Write", "I-Phone-Write", "B-Elect", "I-Elect", "B-Injure", "I-Injure", "B-Transfer-Ownership", "I-Transfer-Ownership", "B-Start-Org", "I-Start-Org", "B-Transfer-Money", "I-Transfer-Money", "B-Sue", "I-Sue", "B-Demonstrate", "I-Demonstrate", "B-Arrest-Jail", "I-Arrest-Jail", "B-Start-Position", "I-Start-Position", "B-Be-Born", "I-Be-Born", "B-End-Org", "I-End-Org", "B-Execute", "I-Execute", "B-Nominate", "I-Nominate", "B-Fine", "I-Fine", "B-Trial-Hearing", "I-Trial-Hearing", "B-Marry", "I-Marry", "B-Charge-Indict", "I-Charge-Indict", "B-Sentence", "I-Sentence", "B-Convict", "I-Convict", "B-Appeal", "I-Appeal", "B-Declare-Bankruptcy", "I-Declare-Bankruptcy", "B-Merge-Org", "I-Merge-Org", "B-Release-Parole", "I-Release-Parole", "B-Pardon", "I-Pardon", "B-Extradite", "I-Extradite", "B-Divorce", "I-Divorce", "B-Acquit", "I-Acquit"] 230 | 231 | def to_crf_pad(org_array, org_mask, pad_label_id): 232 | crf_array = [aa[bb] for aa, bb in zip(org_array, org_mask)] 233 | crf_array = pad_sequence(crf_array, batch_first=True, padding_value=pad_label_id) 234 | crf_pad = (crf_array != pad_label_id) 235 | # the viterbi decoder function in CRF makes use of multiplicative property of 0, then pads wrong numbers out. 236 | # Need a*0 = 0 for CRF to work. 
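# to_crf_pad first compacts each row down to only its masked-in positions, then re-pads with pad_label_id; the zeroing below keeps those padded slots from contributing to the Viterbi scores.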
237 | crf_array[~crf_pad] = 0 238 | return crf_array, crf_pad 239 | 240 | 241 | def unpad_crf(returned_array, returned_mask, org_array, org_mask): 242 | out_array = org_array.clone().detach() 243 | out_array[org_mask] = returned_array[returned_mask] 244 | return out_array -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-MAVEN/bert_crf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from transformers.modeling_bert import BertModel, BertPreTrainedModel 3 | from transformers.configuration_bert import BertConfig 4 | 5 | from crf import * 6 | from utils_maven import to_crf_pad, unpad_crf 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 11 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", 12 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", 13 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", 14 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", 15 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", 16 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", 17 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", 18 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", 19 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", 20 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", 21 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", 22 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", 23 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", 24 | 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", 25 | 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", 26 | 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", 27 | 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", 28 | 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", 29 | 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", 30 | 'bert-base-finnish-cased-v1': 
"https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", 31 | 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", 32 | } 33 | 34 | 35 | class BertCRFForTokenClassification(BertPreTrainedModel): 36 | config_class = BertConfig 37 | pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP 38 | base_model_prefix = "bert" 39 | 40 | def __init__(self, config): 41 | super(BertCRFForTokenClassification, self).__init__(config) 42 | self.num_labels = config.num_labels 43 | 44 | self.bert = BertModel(config) 45 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 46 | self.classifier = nn.Linear(config.hidden_size, config.num_labels + 2) 47 | self.crf = CRF(self.num_labels) 48 | 49 | self.init_weights() 50 | 51 | def _get_features(self, input_ids=None, attention_mask=None, token_type_ids=None, 52 | position_ids=None, head_mask=None, inputs_embeds=None): 53 | outputs = self.bert(input_ids, 54 | attention_mask=attention_mask, 55 | token_type_ids=token_type_ids, 56 | position_ids=position_ids, 57 | head_mask=head_mask, 58 | inputs_embeds=inputs_embeds) 59 | 60 | sequence_output = outputs[0] 61 | 62 | sequence_output = self.dropout(sequence_output) 63 | feats = self.classifier(sequence_output) 64 | return feats, outputs 65 | 66 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, 67 | position_ids=None, head_mask=None, inputs_embeds=None, labels=None, pad_token_label_id=None): 68 | 69 | logits, outputs = self._get_features(input_ids, attention_mask, token_type_ids, 70 | position_ids, head_mask, inputs_embeds) 71 | 72 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 73 | if labels is not None: 74 | # loss_fct = nn.CrossEntropyLoss() 75 | pad_mask = (labels != pad_token_label_id) 76 | 77 | # Only keep active parts of the loss 78 | if attention_mask is not None: 79 | # active_loss = attention_mask.view(-1) == 1 80 | # active_logits = logits.view(-1, self.num_labels)[active_loss] 81 | # active_labels = labels.view(-1)[active_loss] 82 | loss_mask = ((attention_mask == 1) & pad_mask) 83 | else: 84 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 85 | loss_mask = ((torch.ones(logits.shape) == 1) & pad_mask) 86 | 87 | crf_labels, crf_mask = to_crf_pad(labels, loss_mask, pad_token_label_id) 88 | crf_logits, _ = to_crf_pad(logits, loss_mask, pad_token_label_id) 89 | 90 | loss = self.crf.neg_log_likelihood(crf_logits, crf_mask, crf_labels) 91 | # removing mask stuff from the output path is done later in my_crf_ner but it should be kept away 92 | # when calculating loss 93 | best_path = self.crf(crf_logits, crf_mask) # (torch.ones(logits.shape) == 1) 94 | best_path = unpad_crf(best_path, crf_mask, labels, pad_mask) 95 | outputs = (loss,) + outputs + (best_path,) 96 | else: 97 | # removing mask stuff from the output path is done later in my_crf_ner but it should be kept away 98 | # when calculating loss 99 | if attention_mask is not None: 100 | mask = (attention_mask == 1) # & (labels!=-100)) 101 | else: 102 | mask = torch.ones(logits.shape).bool() # (labels!=-100) 103 | crf_logits, crf_mask = to_crf_pad(logits, mask, pad_token_label_id) 104 | crf_mask = crf_mask.sum(axis=2) == crf_mask.shape[2] 105 | best_path = self.crf(crf_logits, crf_mask) 106 | temp_labels = torch.ones(mask.shape) * pad_token_label_id 107 | best_path = unpad_crf(best_path, crf_mask, temp_labels, mask) 108 | 
outputs = outputs + (best_path,) 109 | 110 | return outputs 111 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-MAVEN/crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-12-04 23:19:38 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-05-27 22:48:17 6 | # Modified from original https://github.com/jiesutd/LatticeLSTM/blob/master/model/crf.py 7 | 8 | import torch 9 | import torch.autograd as autograd 10 | import torch.nn as nn 11 | 12 | 13 | # Compute log sum exp in a numerically stable way for the forward algorithm 14 | def log_sum_exp(vec, m_size): 15 | """ 16 | calculate log of exp sum 17 | args: 18 | vec (batch_size, vanishing_dim, hidden_dim) : input tensor 19 | m_size : hidden_dim 20 | return: 21 | batch_size, hidden_dim 22 | """ 23 | _, idx = torch.max(vec, 1) # B * 1 * M 24 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 25 | return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, 26 | m_size) # B * M 27 | 28 | 29 | class CRF(nn.Module): 30 | 31 | def __init__(self, tagset_size, use_gpu=False): # average_batch=False, 32 | super(CRF, self).__init__() 33 | print("build CRF...") 34 | # self.average_batch = average_batch 35 | self.gpu = use_gpu 36 | 37 | self.START_TAG = -2 38 | self.STOP_TAG = -1 39 | 40 | self.tagset_size = tagset_size 41 | 42 | # # We add 2 here, because of START_TAG and STOP_TAG 43 | # self.hidden2tag = nn.Linear(params['hidden_dim'], self.tagset_size + 2) 44 | # # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag 45 | init_transitions = torch.zeros(self.tagset_size + 2, self.tagset_size + 2) 46 | init_transitions[:, self.START_TAG] = -10000.0 47 | init_transitions[self.STOP_TAG, :] = -10000.0 48 | if torch.cuda.is_available(): 49 | init_transitions = init_transitions.cuda() 50 | self.transitions = nn.Parameter(init_transitions, requires_grad=True) 51 | 52 | def init_hidden_cell(self, batch_size, layer_hidden_dim): 53 | return (torch.randn(2, batch_size, layer_hidden_dim // 2), 54 | torch.randn(2, batch_size, layer_hidden_dim // 2)) 55 | 56 | def _calculate_PZ(self, feats, mask): 57 | """ 58 | input: 59 | feats: (batch, seq_len, self.tag_size+2) 60 | masks: (batch, seq_len) 61 | """ 62 | batch_size = feats.size(0) 63 | seq_len = feats.size(1) 64 | tag_size = feats.size(2) 65 | # print feats.view(seq_len, tag_size) 66 | assert (tag_size == self.tagset_size + 2) 67 | mask = mask.transpose(1, 0).contiguous() 68 | ins_num = seq_len * batch_size 69 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 70 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 71 | ## need to consider start 72 | scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 73 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 74 | # build iter 75 | seq_iter = enumerate(scores) 76 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 77 | # only need start from start_tag 78 | partition = inivalues[:, self.START_TAG, :].clone().view(batch_size, tag_size, 1) # bat_size * to_target_size 79 | 80 | ## add start score (from start to all tag, duplicate to batch_size) 81 | # partition = partition + 
self.transitions[START_TAG,:].view(1, tag_size, 1).expand(batch_size, tag_size, 1) 82 | # iter over last scores 83 | for idx, cur_values in seq_iter: 84 | # previous to_target is current from_target 85 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 86 | # cur_values: bat_size * from_target * to_target 87 | 88 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, 89 | tag_size) 90 | cur_partition = log_sum_exp(cur_values, tag_size) 91 | # print cur_partition.data 92 | 93 | # (bat_size * from_target * to_target) -> (bat_size * to_target) 94 | # partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1) 95 | mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size) 96 | 97 | ## effective updated partition part, only keep the partition value of mask value = 1 98 | masked_cur_partition = cur_partition.masked_select(mask_idx) 99 | ## let mask_idx broadcastable, to disable warning 100 | mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1) 101 | 102 | ## replace the partition where the maskvalue=1, other partition value keeps the same 103 | partition.masked_scatter_(mask_idx, masked_cur_partition) 104 | # until the last state, add transition score for all partition (and do log_sum_exp) then select the value in STOP_TAG 105 | cur_values = self.transitions.view(1, tag_size, tag_size).expand(batch_size, tag_size, 106 | tag_size) + partition.contiguous().view( 107 | batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 108 | 109 | cur_partition = log_sum_exp(cur_values, tag_size) 110 | final_partition = cur_partition[:, self.STOP_TAG] 111 | return final_partition.sum(), scores 112 | 113 | def _viterbi_decode(self, feats, mask): 114 | """ 115 | input: 116 | feats: (batch, seq_len, self.tag_size+2) 117 | mask: (batch, seq_len) 118 | output: 119 | decode_idx: (batch, seq_len) decoded sequence 120 | path_score: (batch, 1) corresponding score for each sequence (to be implementated) 121 | """ 122 | batch_size = feats.size(0) 123 | seq_len = feats.size(1) 124 | tag_size = feats.size(2) 125 | assert (tag_size == self.tagset_size + 2) 126 | ## calculate sentence length for each sentence 127 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 128 | ## mask to (seq_len, batch_size) 129 | mask = mask.transpose(1, 0).contiguous() 130 | ins_num = seq_len * batch_size 131 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 132 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 133 | ## need to consider start 134 | scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 135 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 136 | 137 | # build iter 138 | seq_iter = enumerate(scores) 139 | ## record the position of best score 140 | back_points = list() 141 | partition_history = list() 142 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 143 | # mask = 1 + (-1)*mask 144 | # mask = (1 - mask.long()).byte() 145 | mask = ~(mask) 146 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 147 | # only need start from start_tag 148 | partition = inivalues[:, self.START_TAG, :].clone().view(batch_size, tag_size) # bat_size * to_target_size 149 | # print "init part:",partition.size() 150 | 
partition_history.append(partition) 151 | # iterate over the remaining scores 152 | for idx, cur_values in seq_iter: 153 | # previous to_target is current from_target 154 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 155 | # cur_values: batch_size * from_target * to_target 156 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, 157 | tag_size) 158 | ## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG 159 | # print "cur value:", cur_values.size() 160 | partition, cur_bp = torch.max(cur_values, 1) 161 | # print "partsize:",partition.size() 162 | # exit(0) 163 | # print partition 164 | # print cur_bp 165 | # print "one best, ",idx 166 | partition_history.append(partition) 167 | ## cur_bp: (batch_size, tag_size) position of the max source score for the current tag 168 | ## set padded labels to 0, which will be filtered in post processing 169 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0) 170 | back_points.append(cur_bp) 171 | # exit(0) 172 | ### add score to final STOP_TAG 173 | partition_history = torch.cat(partition_history, 0).view(seq_len, batch_size, -1).transpose(1, 174 | 0).contiguous() ## (batch_size, seq_len, tag_size) 175 | ### get the last position for each sentence, and select the last partitions using gather() 176 | last_position = length_mask.view(batch_size, 1, 1).expand(batch_size, 1, tag_size) - 1 177 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, 1) 178 | ### calculate the score from the last partition to the end state (and then select the STOP_TAG from it) 179 | last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(1, tag_size, 180 | tag_size).expand( 181 | batch_size, tag_size, tag_size) 182 | _, last_bp = torch.max(last_values, 1) 183 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long() 184 | if torch.cuda.is_available(): 185 | pad_zero = pad_zero.cuda() 186 | back_points.append(pad_zero) 187 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size) 188 | 189 | ## select end ids at STOP_TAG 190 | pointer = last_bp[:, self.STOP_TAG] 191 | insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, tag_size) 192 | back_points = back_points.transpose(1, 0).contiguous() 193 | ## move the end ids (expanded to tag_size) to the corresponding positions of back_points to replace the 0 values 194 | # print "lp:",last_position 195 | # print "il:",insert_last 196 | back_points.scatter_(1, last_position, insert_last) 197 | # print "bp:",back_points 198 | # exit(0) 199 | back_points = back_points.transpose(1, 0).contiguous() 200 | ## decode from the end; padded position ids are 0, which will be filtered in the following evaluation 201 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size)) 202 | if torch.cuda.is_available(): 203 | decode_idx = decode_idx.cuda() 204 | decode_idx[-1] = pointer.data # detach() 205 | for idx in range(len(back_points) - 2, -1, -1): 206 | pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size, 1)) 207 | decode_idx[idx] = pointer.data.t() # pointer.detach().view(batch_size) 208 | path_score = None 209 | decode_idx = decode_idx.transpose(1, 0) 210 | return path_score, decode_idx 211 | 212 | def forward(self, feats, mask): 213 | path_score, best_path = self._viterbi_decode(feats, mask) 214 | # return path_score, best_path 215 | return 
best_path 216 | 217 | def _score_sentence(self, scores, mask, tags): 218 | """ 219 | input: 220 | scores: variable (seq_len, batch, tag_size, tag_size) 221 | mask: (batch, seq_len) 222 | tags: tensor (batch, seq_len) 223 | output: 224 | score: sum of scores for the gold sequences within the whole batch 225 | """ 226 | # Gives the score of a provided tag sequence 227 | batch_size = scores.size(1) 228 | seq_len = scores.size(0) 229 | tag_size = scores.size(2) 230 | ## convert tag values into a new format, encoding label bigram information into the index 231 | new_tags = autograd.Variable(torch.LongTensor(batch_size, seq_len)) 232 | if torch.cuda.is_available(): 233 | new_tags = new_tags.cuda() 234 | for idx in range(seq_len): 235 | if idx == 0: 236 | ## start -> first score 237 | new_tags[:, 0] = (tag_size - 2) * tag_size + tags[:, 0] 238 | 239 | else: 240 | new_tags[:, idx] = tags[:, idx - 1] * tag_size + tags[:, idx] 241 | 242 | ## transition for label to STOP_TAG 243 | end_transition = self.transitions[:, self.STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size) 244 | ## length for batch, last word position = length - 1 245 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 246 | ## index the label id of the last word 247 | end_ids = torch.gather(tags, 1, length_mask - 1) 248 | 249 | ## index the transition score for end_id to STOP_TAG 250 | end_energy = torch.gather(end_transition, 1, end_ids) 251 | 252 | ## convert tags to (seq_len, batch_size, 1) 253 | new_tags = new_tags.transpose(1, 0).contiguous().view(seq_len, batch_size, 1) 254 | ### need to convert tag ids to index into the tag_size * tag_size positions of scores 255 | tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(seq_len, 256 | batch_size) # seq_len * bat_size 257 | ## mask transpose to (seq_len, batch_size) 258 | tg_energy = tg_energy.masked_select(mask.transpose(1, 0)) 259 | 260 | # ## calculate the score from START_TAG to the first label 261 | # start_transition = self.transitions[START_TAG,:].view(1, tag_size).expand(batch_size, tag_size) 262 | # start_energy = torch.gather(start_transition, 1, tags[0,:]) 263 | 264 | ## add all scores together 265 | # gold_score = start_energy.sum() + tg_energy.sum() + end_energy.sum() 266 | gold_score = tg_energy.sum() + end_energy.sum() 267 | return gold_score 268 | 269 | def neg_log_likelihood(self, feats, mask, tags): 270 | # negative log likelihood 271 | forward_score, scores = self._calculate_PZ(feats, mask) 272 | # print('Forward', forward_score) 273 | gold_score = self._score_sentence(scores, mask, tags) 274 | # print('Gold', gold_score) 275 | # print "batch, f:", forward_score.data[0], " g:", gold_score.data[0], " dis:", forward_score.data[0] - gold_score.data[0] 276 | # exit(0) 277 | # if self.average_batch: 278 | # return (forward_score - gold_score) / batch_size 279 | # else: 280 | return forward_score - gold_score 281 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-MAVEN/run_MAVEN.sh: -------------------------------------------------------------------------------- 1 | python3 run_maven.py \ 2 | --data_dir ../maven/ \ #path to the raw MAVEN data files 3 | --model_type bertcrf \ 4 | --model_name_or_path bert-base-uncased \ 5 | --output_dir ./MAVEN \ #path to dump checkpoints 6 | --max_seq_length 128 \ 7 | --do_lower_case \ 8 | --per_gpu_train_batch_size 16 \ 9 | --per_gpu_eval_batch_size 16 \ 10 | --gradient_accumulation_steps 8 \ 11 | --learning_rate 5e-5 \ 12 | --num_train_epochs 5 \ 13 | 
--save_steps 100 \ 14 | --logging_steps 100 \ 15 | --seed 0 \ 16 | --do_train \ 17 | --do_eval \ 18 | --evaluate_during_training 19 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-MAVEN/run_MAVEN_infer.sh: -------------------------------------------------------------------------------- 1 | python3 run_maven.py \ 2 | --data_dir ../maven/ \ #path to the test data; remember to delete the cached files first (otherwise the test data may have been randomly shuffled before) 3 | --model_type bertcrf \ 4 | --model_name_or_path bert-base-uncased \ 5 | --output_dir ./MAVEN/checkpoint-1200 \ #path to the trained checkpoint, the results file will also be dumped here 6 | --max_seq_length 128 \ 7 | --do_lower_case \ 8 | --per_gpu_train_batch_size 16 \ 9 | --per_gpu_eval_batch_size 16 \ 10 | --gradient_accumulation_steps 8 \ 11 | --learning_rate 5e-5 \ 12 | --num_train_epochs 5 \ 13 | --save_steps 100 \ 14 | --logging_steps 100 \ 15 | --seed 0 \ 16 | --do_infer #add this flag to do inference only 17 | -------------------------------------------------------------------------------- /baselines/BERT+CRF/BERT-CRF-MAVEN/utils_maven.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ BERT-CRF fine-tuning: utilities to work with MAVEN. """ 17 | 18 | from __future__ import absolute_import, division, print_function 19 | import json 20 | import logging 21 | import os 22 | from io import open 23 | from transformers import XLMRobertaTokenizer, BertTokenizer, RobertaTokenizer 24 | 25 | from torch.nn.utils.rnn import pad_sequence 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class InputExample(object): 31 | """A single training/test example for token classification.""" 32 | 33 | def __init__(self, guid, words, labels): 34 | """Constructs an InputExample. 35 | 36 | Args: 37 | guid: Unique id for the example. 38 | words: list. The words of the sequence. 39 | labels: (Optional) list. The labels for each word of the sequence. This should be 40 | specified for train and dev examples, but not for test examples. 
41 | """ 42 | self.guid = guid 43 | self.words = words 44 | self.labels = labels 45 | 46 | 47 | class InputFeatures(object): 48 | """A single set of features of data.""" 49 | 50 | def __init__(self, input_ids, input_mask, segment_ids, label_ids): 51 | self.input_ids = input_ids 52 | self.input_mask = input_mask 53 | self.segment_ids = segment_ids 54 | self.label_ids = label_ids 55 | 56 | 57 | def read_examples_from_file(data_dir, mode): 58 | file_path = os.path.join(data_dir, "{}.jsonl".format(mode)) 59 | examples = [] 60 | with open(file_path, "r") as fin: 61 | lines=fin.readlines() 62 | for line in lines: 63 | doc=json.loads(line) 64 | words=[] 65 | labels=[] 66 | for sent in doc['content']: 67 | words.append(sent['tokens']) 68 | labels.append(['O' for i in range(0,len(sent['tokens']))]) # default all tokens to 'O'; gold labels are filled in below 69 | if mode!='test': 70 | for event in doc['events']: 71 | for mention in event['mention']: 72 | labels[mention['sent_id']][mention['offset'][0]]="B-"+event['type'] 73 | for i in range(mention['offset'][0]+1,mention['offset'][1]): 74 | labels[mention['sent_id']][i]="I-"+event['type'] 75 | for mention in doc['negative_triggers']: 76 | labels[mention['sent_id']][mention['offset'][0]]="O" 77 | for i in range(mention['offset'][0]+1,mention['offset'][1]): 78 | labels[mention['sent_id']][i]="O" 79 | for i in range(0,len(words)): 80 | examples.append(InputExample(guid="%s-%s-%d"%(mode,doc['id'],i), 81 | words=words[i], 82 | labels=labels[i])) 83 | return examples 84 | 85 | 86 | def convert_examples_to_features(examples, 87 | label_list, 88 | max_seq_length, 89 | tokenizer, 90 | cls_token_at_end=False, 91 | cls_token="[CLS]", 92 | cls_token_segment_id=1, 93 | sep_token="[SEP]", 94 | sep_token_extra=False, 95 | pad_on_left=False, 96 | pad_token=0, 97 | pad_token_segment_id=0, 98 | pad_token_label_id=-100, 99 | sequence_a_segment_id=0, 100 | mask_padding_with_zero=True, 101 | model_name=None): 102 | """ Loads a data file into a list of `InputFeatures` 103 | `cls_token_at_end` defines the location of the CLS token: 104 | - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] 105 | - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] 106 | `cls_token_segment_id` defines the segment id associated with the CLS token (0 for BERT, 2 for XLNet) 107 | """ 108 | 109 | label_map = {label: i for i, label in enumerate(label_list)} 110 | 111 | # the logic in crf padding requires this check: the CRF mask is built from labels == pad_token_label_id, 112 | # so the pad id must not collide with a real label id (it is excluded from loss and decoding) 113 | assert pad_token_label_id not in label_map.values() 114 | 115 | features = [] 116 | for (ex_index, example) in enumerate(examples): 117 | if ex_index % 10000 == 0: 118 | print("###############") 119 | logger.info("Writing example %d of %d", ex_index, len(examples)) 120 | print("###############") 121 | 122 | tokens = [] 123 | label_ids = [] 124 | for word, label in zip(example.words, example.labels): 125 | word_tokens = tokenizer.tokenize(word) 126 | tokens.extend(word_tokens) 127 | # Use the real label id for the first token of the word, and padding ids for the remaining tokens 128 | label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) 129 | 130 | # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. 
131 | special_tokens_count = 3 if sep_token_extra else 2 132 | if len(tokens) > max_seq_length - special_tokens_count: 133 | tokens = tokens[:(max_seq_length - special_tokens_count)] 134 | label_ids = label_ids[:(max_seq_length - special_tokens_count)] 135 | 136 | # The convention in BERT is: 137 | # (a) For sequence pairs: 138 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 139 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 140 | # (b) For single sequences: 141 | # tokens: [CLS] the dog is hairy . [SEP] 142 | # type_ids: 0 0 0 0 0 0 0 143 | # 144 | # Where "type_ids" are used to indicate whether this is the first 145 | # sequence or the second sequence. The embedding vectors for `type=0` and 146 | # `type=1` were learned during pre-training and are added to the wordpiece 147 | # embedding vector (and position vector). This is not *strictly* necessary 148 | # since the [SEP] token unambiguously separates the sequences, but it makes 149 | # it easier for the model to learn the concept of sequences. 150 | # 151 | # For classification tasks, the first vector (corresponding to [CLS]) is 152 | # used as the "sentence vector". Note that this only makes sense because 153 | # the entire model is fine-tuned. 154 | tokens += [sep_token] 155 | label_ids += [pad_token_label_id] # [label_map["X"]] 156 | if sep_token_extra: 157 | # roberta uses an extra separator b/w pairs of sentences 158 | tokens += [sep_token] 159 | label_ids += [pad_token_label_id] 160 | segment_ids = [sequence_a_segment_id] * len(tokens) 161 | 162 | if cls_token_at_end: 163 | tokens += [cls_token] 164 | label_ids += [pad_token_label_id] 165 | segment_ids += [cls_token_segment_id] 166 | else: 167 | tokens = [cls_token] + tokens 168 | label_ids = [pad_token_label_id] + label_ids 169 | segment_ids = [cls_token_segment_id] + segment_ids 170 | 171 | if model_name: 172 | if model_name == 'xlm-roberta-base': 173 | tokenizer = XLMRobertaTokenizer.from_pretrained(model_name) 174 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 175 | elif model_name.startswith('bert'): 176 | tokenizer = BertTokenizer.from_pretrained(model_name) 177 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 178 | elif model_name == 'roberta': 179 | tokenizer = RobertaTokenizer.from_pretrained(model_name) 180 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 181 | else: 182 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 183 | 184 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 185 | # tokens are attended to. 186 | input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 187 | 188 | # Zero-pad up to the sequence length. 
189 | padding_length = max_seq_length - len(input_ids) 190 | if pad_on_left: 191 | input_ids = ([pad_token] * padding_length) + input_ids 192 | input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask 193 | segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids 194 | label_ids = ([pad_token_label_id] * padding_length) + label_ids 195 | else: 196 | input_ids += ([pad_token] * padding_length) 197 | input_mask += ([0 if mask_padding_with_zero else 1] * padding_length) 198 | segment_ids += ([pad_token_segment_id] * padding_length) 199 | label_ids += ([pad_token_label_id] * padding_length) 200 | 201 | assert len(input_ids) == max_seq_length 202 | assert len(input_mask) == max_seq_length 203 | assert len(segment_ids) == max_seq_length 204 | assert len(label_ids) == max_seq_length 205 | 206 | if ex_index < 0: 207 | logger.info("*** Example ***") 208 | logger.info("guid: %s", example.guid) 209 | logger.info("tokens: %s", " ".join([str(x) for x in tokens])) 210 | logger.info("input_ids: %s", " ".join([str(x) for x in input_ids])) 211 | logger.info("input_mask: %s", " ".join([str(x) for x in input_mask])) 212 | logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) 213 | logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) 214 | 215 | features.append( 216 | InputFeatures(input_ids=input_ids, 217 | input_mask=input_mask, 218 | segment_ids=segment_ids, 219 | label_ids=label_ids)) 220 | return features 221 | 222 | 223 | def get_labels(path): 224 | return ["O", "B-Know", "I-Know", "B-Warning", "I-Warning", "B-Catastrophe", "I-Catastrophe", "B-Placing", "I-Placing", "B-Causation", "I-Causation", "B-Arriving", "I-Arriving", "B-Sending", "I-Sending", "B-Protest", "I-Protest", "B-Preventing_or_letting", "I-Preventing_or_letting", "B-Motion", "I-Motion", "B-Damaging", "I-Damaging", "B-Destroying", "I-Destroying", "B-Death", "I-Death", "B-Perception_active", "I-Perception_active", "B-Presence", "I-Presence", "B-Influence", "I-Influence", "B-Receiving", "I-Receiving", "B-Check", "I-Check", "B-Hostile_encounter", "I-Hostile_encounter", "B-Killing", "I-Killing", "B-Conquering", "I-Conquering", "B-Releasing", "I-Releasing", "B-Attack", "I-Attack", "B-Earnings_and_losses", "I-Earnings_and_losses", "B-Choosing", "I-Choosing", "B-Traveling", "I-Traveling", "B-Recovering", "I-Recovering", "B-Using", "I-Using", "B-Coming_to_be", "I-Coming_to_be", "B-Cause_to_be_included", "I-Cause_to_be_included", "B-Process_start", "I-Process_start", "B-Change_event_time", "I-Change_event_time", "B-Reporting", "I-Reporting", "B-Bodily_harm", "I-Bodily_harm", "B-Suspicion", "I-Suspicion", "B-Statement", "I-Statement", "B-Cause_change_of_position_on_a_scale", "I-Cause_change_of_position_on_a_scale", "B-Coming_to_believe", "I-Coming_to_believe", "B-Expressing_publicly", "I-Expressing_publicly", "B-Request", "I-Request", "B-Control", "I-Control", "B-Supporting", "I-Supporting", "B-Defending", "I-Defending", "B-Building", "I-Building", "B-Military_operation", "I-Military_operation", "B-Self_motion", "I-Self_motion", "B-GetReady", "I-GetReady", "B-Forming_relationships", "I-Forming_relationships", "B-Becoming_a_member", "I-Becoming_a_member", "B-Action", "I-Action", "B-Removing", "I-Removing", "B-Surrendering", "I-Surrendering", "B-Agree_or_refuse_to_act", "I-Agree_or_refuse_to_act", "B-Participation", "I-Participation", "B-Deciding", "I-Deciding", "B-Education_teaching", "I-Education_teaching", "B-Emptying", "I-Emptying", "B-Getting", "I-Getting", "B-Besieging", 
"I-Besieging", "B-Creating", "I-Creating", "B-Process_end", "I-Process_end", "B-Body_movement", "I-Body_movement", "B-Expansion", "I-Expansion", "B-Telling", "I-Telling", "B-Change", "I-Change", "B-Legal_rulings", "I-Legal_rulings", "B-Bearing_arms", "I-Bearing_arms", "B-Giving", "I-Giving", "B-Name_conferral", "I-Name_conferral", "B-Arranging", "I-Arranging", "B-Use_firearm", "I-Use_firearm", "B-Committing_crime", "I-Committing_crime", "B-Assistance", "I-Assistance", "B-Surrounding", "I-Surrounding", "B-Quarreling", "I-Quarreling", "B-Expend_resource", "I-Expend_resource", "B-Motion_directional", "I-Motion_directional", "B-Bringing", "I-Bringing", "B-Communication", "I-Communication", "B-Containing", "I-Containing", "B-Manufacturing", "I-Manufacturing", "B-Social_event", "I-Social_event", "B-Robbery", "I-Robbery", "B-Competition", "I-Competition", "B-Writing", "I-Writing", "B-Rescuing", "I-Rescuing", "B-Judgment_communication", "I-Judgment_communication", "B-Change_tool", "I-Change_tool", "B-Hold", "I-Hold", "B-Being_in_operation", "I-Being_in_operation", "B-Recording", "I-Recording", "B-Carry_goods", "I-Carry_goods", "B-Cost", "I-Cost", "B-Departing", "I-Departing", "B-GiveUp", "I-GiveUp", "B-Change_of_leadership", "I-Change_of_leadership", "B-Escaping", "I-Escaping", "B-Aiming", "I-Aiming", "B-Hindering", "I-Hindering", "B-Preserving", "I-Preserving", "B-Create_artwork", "I-Create_artwork", "B-Openness", "I-Openness", "B-Connect", "I-Connect", "B-Reveal_secret", "I-Reveal_secret", "B-Response", "I-Response", "B-Scrutiny", "I-Scrutiny", "B-Lighting", "I-Lighting", "B-Criminal_investigation", "I-Criminal_investigation", "B-Hiding_objects", "I-Hiding_objects", "B-Confronting_problem", "I-Confronting_problem", "B-Renting", "I-Renting", "B-Breathing", "I-Breathing", "B-Patrolling", "I-Patrolling", "B-Arrest", "I-Arrest", "B-Convincing", "I-Convincing", "B-Commerce_sell", "I-Commerce_sell", "B-Cure", "I-Cure", "B-Temporary_stay", "I-Temporary_stay", "B-Dispersal", "I-Dispersal", "B-Collaboration", "I-Collaboration", "B-Extradition", "I-Extradition", "B-Change_sentiment", "I-Change_sentiment", "B-Commitment", "I-Commitment", "B-Commerce_pay", "I-Commerce_pay", "B-Filling", "I-Filling", "B-Becoming", "I-Becoming", "B-Achieve", "I-Achieve", "B-Practice", "I-Practice", "B-Cause_change_of_strength", "I-Cause_change_of_strength", "B-Supply", "I-Supply", "B-Cause_to_amalgamate", "I-Cause_to_amalgamate", "B-Scouring", "I-Scouring", "B-Violence", "I-Violence", "B-Reforming_a_system", "I-Reforming_a_system", "B-Come_together", "I-Come_together", "B-Wearing", "I-Wearing", "B-Cause_to_make_progress", "I-Cause_to_make_progress", "B-Legality", "I-Legality", "B-Employment", "I-Employment", "B-Rite", "I-Rite", "B-Publishing", "I-Publishing", "B-Adducing", "I-Adducing", "B-Exchange", "I-Exchange", "B-Ratification", "I-Ratification", "B-Sign_agreement", "I-Sign_agreement", "B-Commerce_buy", "I-Commerce_buy", "B-Imposing_obligation", "I-Imposing_obligation", "B-Rewards_and_punishments", "I-Rewards_and_punishments", "B-Institutionalization", "I-Institutionalization", "B-Testing", "I-Testing", "B-Ingestion", "I-Ingestion", "B-Labeling", "I-Labeling", "B-Kidnapping", "I-Kidnapping", "B-Submitting_documents", "I-Submitting_documents", "B-Prison", "I-Prison", "B-Justifying", "I-Justifying", "B-Emergency", "I-Emergency", "B-Terrorism", "I-Terrorism", "B-Vocalizations", "I-Vocalizations", "B-Risk", "I-Risk", "B-Resolve_problem", "I-Resolve_problem", "B-Revenge", "I-Revenge", "B-Limiting", "I-Limiting", "B-Research", 
"I-Research", "B-Having_or_lacking_access", "I-Having_or_lacking_access", "B-Theft", "I-Theft", "B-Incident", "I-Incident", "B-Award", "I-Award"] 225 | 226 | 227 | def to_crf_pad(org_array, org_mask, pad_label_id): 228 | crf_array = [aa[bb] for aa, bb in zip(org_array, org_mask)] 229 | crf_array = pad_sequence(crf_array, batch_first=True, padding_value=pad_label_id) 230 | crf_pad = (crf_array != pad_label_id) 231 | # the viterbi decoder function in CRF makes use of multiplicative property of 0, then pads wrong numbers out. 232 | # Need a*0 = 0 for CRF to work. 233 | crf_array[~crf_pad] = 0 234 | return crf_array, crf_pad 235 | 236 | 237 | def unpad_crf(returned_array, returned_mask, org_array, org_mask): 238 | out_array = org_array.clone().detach() 239 | out_array[org_mask] = returned_array[returned_mask] 240 | return out_array -------------------------------------------------------------------------------- /baselines/BERT+CRF/README.md: -------------------------------------------------------------------------------- 1 | # DMBERT 2 | This code is the implementation for BERT+CRF model. The implementations are based on [Huggingface's Transformers](https://github.com/huggingface/transformers) and the BERT+CRF implementations in [this repo](https://github.com/mezig351/transformers/tree/ner_crf/examples/ner). 3 | 4 | 5 | 6 | ## Requirements 7 | 8 | - python==3.6.9 9 | 10 | - torch==1.2.0 11 | 12 | - transformers==2.6.0 13 | 14 | - sklearn==0.20.2 15 | 16 | - seqeval 17 | 18 | 19 | 20 | ## Usage 21 | 22 | Hint: please read and delete all the comments after ```\``` in each line of the ```.sh``` scripts before running them. 23 | 24 | ### On MAVEN: 25 | The codes are in the ```BERT-CRF-MAVEN``` folder. 26 | 27 | 1. Download MAVEN data files. 28 | 2. Run ```run_MAVEN.sh``` for training and evaluation on the devlopment set. 29 | 3. Run ```run_MAVEN_infer.sh``` to get predictions on the test set (dumped to ```OUTPUT_PATH/results.jsonl```). 30 | 31 | See the two scripts for more details. 32 | 33 | ### On ACE 34 | The codes are in the ```BERT-CRF-ACE``` folder. 35 | 36 | 1. Preprocess ACE 2005 dataset as in [this repo](https://github.com/thunlp/HMEAE). 37 | 2. Run ``run_ACE.sh`` for training and evaluation. 38 | -------------------------------------------------------------------------------- /baselines/DMBERT/README.md: -------------------------------------------------------------------------------- 1 | # DMBERT 2 | This code is the implementation for [DMBERT](https://www.aclweb.org/anthology/N19-1105/) model. The implementations are based on [Huggingface's Transformers](https://github.com/huggingface/transformers), especially its example for the multiple-choice task. 3 | 4 | 5 | 6 | ## Requirements 7 | 8 | - python==3.6.9 9 | 10 | - torch==1.2.0 11 | 12 | - transformers==2.8.0 13 | 14 | - sklearn==0.20.2 15 | 16 | 17 | 18 | ## Usage 19 | 20 | Hint: please read and delete all the comments after ```\``` in each line of the ```.sh``` scripts before running them. 21 | 22 | ### On MAVEN: 23 | 24 | 1. Download MAVEN data files. 25 | 2. Run ```run_MAVEN.sh``` for training and evaluation on the devlopment set. 26 | 3. Run ```run_MAVEN_infer.sh``` to get predictions on the test set (dumped to ```results.jsonl```). 27 | 28 | See the two scripts for more details. 29 | 30 | ### On ACE 31 | 32 | 1. Preprocess ACE 2005 dataset as in [this repo](https://github.com/thunlp/HMEAE). 33 | 2. Run ``run_ACE.sh`` for training and evaluation. 
34 | -------------------------------------------------------------------------------- /baselines/DMBERT/get_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import json 5 | import numpy as np 6 | if __name__=='__main__': 7 | parser=argparse.ArgumentParser() 8 | parser.add_argument("--test_data",default="../maven/test.jsonl",help="path to the test data file",required=False) 9 | parser.add_argument("--preds",default="MAVEN/_preds.npy",help="path to the prediction file generated by the run_MAVEN_infer.sh script") 10 | parser.add_argument("--output",default="../maven/results.jsonl",help="path to the output file") 11 | args=parser.parse_args() 12 | preds=np.load(args.preds) 13 | fout=open(args.output,"w") 14 | with open(args.test_data,"r") as fin: 15 | lines=fin.readlines() 16 | Cnt=0 17 | for line in lines: 18 | data=json.loads(line) 19 | res={"id":data['id']} 20 | tmp=[] 21 | for mention in data['candidates']: 22 | tmp.append({"id":mention["id"],"type_id":int(preds[Cnt])}) 23 | Cnt+=1 24 | res["predictions"]=tmp 25 | fout.write(json.dumps(res)+"\n") 26 | assert Cnt==len(preds) 27 | fout.close() 28 | -------------------------------------------------------------------------------- /baselines/DMBERT/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn import CrossEntropyLoss 6 | from transformers import BertPreTrainedModel,BertModel 7 | 8 | class DMBERT(BertPreTrainedModel): 9 | def __init__(self,config): 10 | super().__init__(config) 11 | self.bert=BertModel(config) 12 | self.dropout=nn.Dropout(config.hidden_dropout_prob) 13 | self.maxpooling=nn.MaxPool1d(128) 14 | self.classifier=nn.Linear(config.hidden_size*2,config.num_labels) 15 | def forward(self,input_ids=None,attention_mask=None,token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, maskL=None, maskR=None, labels=None): 16 | batchSize=input_ids.size(0) 17 | outputs =self.bert( 18 | input_ids, 19 | attention_mask=attention_mask, 20 | token_type_ids=token_type_ids, 21 | position_ids=position_ids, 22 | head_mask=head_mask, 23 | inputs_embeds=inputs_embeds, 24 | ) 25 | conved=outputs[0] 26 | conved=conved.transpose(1,2) 27 | conved=conved.transpose(0,1) 28 | L=(conved*maskL).transpose(0,1) 29 | R=(conved*maskR).transpose(0,1) 30 | L=L+torch.ones_like(L) 31 | R=R+torch.ones_like(R) 32 | pooledL=self.maxpooling(L).contiguous().view(batchSize,self.config.hidden_size) 33 | pooledR=self.maxpooling(R).contiguous().view(batchSize,self.config.hidden_size) 34 | pooled=torch.cat((pooledL,pooledR),1) 35 | pooled=pooled-torch.ones_like(pooled) 36 | pooled=self.dropout(pooled) 37 | logits=self.classifier(pooled) 38 | reshaped_logits=logits.view(-1, self.config.num_labels) 39 | outputs = (reshaped_logits,) + outputs[2:] 40 | if labels is not None: 41 | loss_fct=CrossEntropyLoss() 42 | loss=loss_fct(reshaped_logits, labels) 43 | outputs=(loss,)+outputs 44 | return outputs 45 | -------------------------------------------------------------------------------- /baselines/DMBERT/run_ACE.sh: -------------------------------------------------------------------------------- 1 | python3 run_ee.py \ 2 | --data_dir ../ACE05/ \ # path to the preprocessed ACE 2005 dataset 3 | --model_type bert \ 4 | --model_name_or_path bert-base-uncased \ 5 | --task_name ace \ 6 | --output_dir 
./ACE \ # path to dump checkpoints 7 | --max_seq_length 128 \ 8 | --do_lower_case \ 9 | --per_gpu_train_batch_size 42 \ 10 | --per_gpu_eval_batch_size 42 \ 11 | --gradient_accumulation_steps 2 \ 12 | --learning_rate 5e-5 \ 13 | --num_train_epochs 10 \ 14 | --save_steps 300 \ 15 | --logging_steps 300 \ 16 | --seed 2 \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_test \ 20 | --evaluate_during_training 21 | -------------------------------------------------------------------------------- /baselines/DMBERT/run_MAVEN.sh: -------------------------------------------------------------------------------- 1 | python3 run_ee.py \ 2 | --data_dir ../maven/ \ #path to the raw MAVEN data files 3 | --model_type bert \ 4 | --model_name_or_path bert-base-uncased \ 5 | --task_name maven \ 6 | --output_dir ./MAVEN \ #path to dump checkpoints 7 | --max_seq_length 128 \ 8 | --do_lower_case \ 9 | --per_gpu_train_batch_size 42 \ 10 | --per_gpu_eval_batch_size 42 \ 11 | --gradient_accumulation_steps 3 \ 12 | --learning_rate 5e-5 \ 13 | --num_train_epochs 5 \ 14 | --save_steps 500 \ 15 | --logging_steps 500 \ 16 | --seed 42 \ 17 | --do_eval \ 18 | --do_train \ 19 | --evaluate_during_training -------------------------------------------------------------------------------- /baselines/DMBERT/run_MAVEN_infer.sh: -------------------------------------------------------------------------------- 1 | python3 run_ee.py \ 2 | --data_dir ../maven/ \ #path to the test data, remember to delete the cached files at first (otherwise the test data may be random shuffled before) 3 | --model_type bert \ 4 | --model_name_or_path ./MAVEN/checkpoint-2500 \ #path to the trained checkpoint 5 | --task_name maven_infer \ 6 | --output_dir ./MAVEN \ #output path 7 | --max_seq_length 128 \ 8 | --do_lower_case \ 9 | --per_gpu_train_batch_size 42 \ 10 | --per_gpu_eval_batch_size 42 \ 11 | --gradient_accumulation_steps 2 \ 12 | --learning_rate 5e-5 \ 13 | --num_train_epochs 5 \ 14 | --save_steps 500 \ 15 | --logging_steps 500 \ 16 | --seed 42 \ 17 | --do_infer #add this flag to do inference only 18 | python3 get_submission.py \ #convert the predictions to the submission format 19 | --test_data ../maven/test.jsonl \ #path to the test data file 20 | --preds MAVEN/checkpoint-2500/checkpoint-2500_preds.npy \ #path to the prediction file 21 | --output ./results.jsonl #output file 22 | -------------------------------------------------------------------------------- /baselines/DMBERT/utils_ee.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # Copyright 2020 Xiaozhi Wang 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | """ Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """ 18 | 19 | 20 | import csv 21 | import glob 22 | import json 23 | import logging 24 | import os 25 | from typing import List 26 | 27 | import tqdm 28 | 29 | from transformers import PreTrainedTokenizer 30 | 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | 35 | class InputExample(object): 36 | """A single training/test example for multiple choice""" 37 | 38 | def __init__(self, example_id, tokens, triggerL, triggerR, label=None): 39 | """Constructs a InputExample. 40 | 41 | Args: 42 | example_id: Unique id for the example. 43 | contexts: list of str. The untokenized text of the first sequence (context of corresponding question). 44 | question: string. The untokenized text of the second sequence (question). 45 | endings: list of str. multiple choice's options. Its length must be equal to contexts' length. 46 | label: (Optional) string. The label of the example. This should be 47 | specified for train and dev examples, but not for test examples. 48 | """ 49 | self.example_id = example_id 50 | self.tokens = tokens 51 | self.triggerL = triggerL 52 | self.triggerR = triggerR 53 | self.label = label 54 | 55 | 56 | class InputFeatures(object): 57 | def __init__(self, example_id, input_ids, input_mask, segment_ids, maskL, maskR, label): 58 | self.example_id = example_id 59 | self.input_ids = input_ids 60 | self.input_mask = input_mask 61 | self.segment_ids = segment_ids 62 | self.maskL = maskL 63 | self.maskR = maskR 64 | self.label = label 65 | 66 | 67 | class DataProcessor(object): 68 | """Base class for data converters for multiple choice data sets.""" 69 | 70 | def get_train_examples(self, data_dir): 71 | """Gets a collection of `InputExample`s for the train set.""" 72 | raise NotImplementedError() 73 | 74 | def get_dev_examples(self, data_dir): 75 | """Gets a collection of `InputExample`s for the dev set.""" 76 | raise NotImplementedError() 77 | 78 | def get_test_examples(self, data_dir): 79 | """Gets a collection of `InputExample`s for the test set.""" 80 | raise NotImplementedError() 81 | 82 | def get_labels(self): 83 | """Gets the list of labels for this data set.""" 84 | raise NotImplementedError() 85 | 86 | 87 | class ACEProcessor(DataProcessor): 88 | """Processor for the RACE data set.""" 89 | 90 | def get_train_examples(self, data_dir): 91 | """See base class.""" 92 | logger.info("LOOKING AT {} train".format(data_dir)) 93 | return self._create_examples(json.load(open(os.path.join(data_dir,'train.json'),"r")), "train") 94 | 95 | def get_dev_examples(self, data_dir): 96 | """See base class.""" 97 | logger.info("LOOKING AT {} dev".format(data_dir)) 98 | return self._create_examples(json.load(open(os.path.join(data_dir,'dev.json'),"r")), "dev") 99 | 100 | def get_test_examples(self, data_dir): 101 | """See base class.""" 102 | logger.info("LOOKING AT {} test".format(data_dir)) 103 | return self._create_examples(json.load(open(os.path.join(data_dir,'test.json'),"r")), "test") 104 | 105 | def get_labels(self): 106 | """See base class.""" 107 | return ['None', 'End-Position', 'Charge-Indict', 'Convict', 'Transfer-Ownership', 'Demonstrate', 'Transport', 'Sentence', 'Appeal', 'Start-Org', 'Start-Position', 'End-Org', 'Phone-Write', 'Nominate', 'Marry', 'Pardon', 'Release-Parole', 'Meet', 'Trial-Hearing', 'Extradite', 'Execute', 'Transfer-Money', 'Elect', 'Injure', 'Acquit', 'Divorce', 'Die', 'Arrest-Jail', 'Declare-Bankruptcy', 'Be-Born', 'Merge-Org', 'Fine', 'Sue', 'Attack'] 
108 | 109 | def _create_examples(self, lines, set_type): 110 | """Creates examples for the training, dev, and test sets.""" 111 | examples = [] 112 | for (idx, data_raw) in enumerate(lines): 113 | e_id = "%s-%s" % (set_type, idx) 114 | examples.append( 115 | InputExample( 116 | example_id=e_id, 117 | tokens=data_raw['tokens'], 118 | triggerL=data_raw['trigger_start'], 119 | triggerR=data_raw['trigger_end']+1, 120 | label=data_raw['event_type'], 121 | ) 122 | ) 123 | return examples 124 | 125 | class MAVENProcessor(DataProcessor): 126 | """Processor for the MAVEN data set.""" 127 | 128 | def get_train_examples(self, data_dir): 129 | """See base class.""" 130 | logger.info("LOOKING AT {} train".format(data_dir)) 131 | return self._create_examples(open(os.path.join(data_dir,'train.jsonl'),"r"), "train") 132 | 133 | def get_dev_examples(self, data_dir): 134 | """See base class.""" 135 | logger.info("LOOKING AT {} dev".format(data_dir)) 136 | return self._create_examples(open(os.path.join(data_dir,'valid.jsonl'),"r"), "dev") 137 | 138 | def get_test_examples(self, data_dir): 139 | """See base class.""" 140 | logger.info("LOOKING AT {} test".format(data_dir)) 141 | return self._create_examples(open(os.path.join(data_dir,'test.jsonl'),"r"), "test") 142 | 143 | def get_labels(self): 144 | """See base class.""" 145 | return ["None", "Know", "Warning", "Catastrophe", "Placing", "Causation", "Arriving", "Sending", "Protest", "Preventing_or_letting", "Motion", "Damaging", "Destroying", "Death", "Perception_active", "Presence", "Influence", "Receiving", "Check", "Hostile_encounter", "Killing", "Conquering", "Releasing", "Attack", "Earnings_and_losses", "Choosing", "Traveling", "Recovering", "Using", "Coming_to_be", "Cause_to_be_included", "Process_start", "Change_event_time", "Reporting", "Bodily_harm", "Suspicion", "Statement", "Cause_change_of_position_on_a_scale", "Coming_to_believe", "Expressing_publicly", "Request", "Control", "Supporting", "Defending", "Building", "Military_operation", "Self_motion", "GetReady", "Forming_relationships", "Becoming_a_member", "Action", "Removing", "Surrendering", "Agree_or_refuse_to_act", "Participation", "Deciding", "Education_teaching", "Emptying", "Getting", "Besieging", "Creating", "Process_end", "Body_movement", "Expansion", "Telling", "Change", "Legal_rulings", "Bearing_arms", "Giving", "Name_conferral", "Arranging", "Use_firearm", "Committing_crime", "Assistance", "Surrounding", "Quarreling", "Expend_resource", "Motion_directional", "Bringing", "Communication", "Containing", "Manufacturing", "Social_event", "Robbery", "Competition", "Writing", "Rescuing", "Judgment_communication", "Change_tool", "Hold", "Being_in_operation", "Recording", "Carry_goods", "Cost", "Departing", "GiveUp", "Change_of_leadership", "Escaping", "Aiming", "Hindering", "Preserving", "Create_artwork", "Openness", "Connect", "Reveal_secret", "Response", "Scrutiny", "Lighting", "Criminal_investigation", "Hiding_objects", "Confronting_problem", "Renting", "Breathing", "Patrolling", "Arrest", "Convincing", "Commerce_sell", "Cure", "Temporary_stay", "Dispersal", "Collaboration", "Extradition", "Change_sentiment", "Commitment", "Commerce_pay", "Filling", "Becoming", "Achieve", "Practice", "Cause_change_of_strength", "Supply", "Cause_to_amalgamate", "Scouring", "Violence", "Reforming_a_system", "Come_together", "Wearing", "Cause_to_make_progress", "Legality", "Employment", "Rite", "Publishing", "Adducing", "Exchange", "Ratification", "Sign_agreement", "Commerce_buy", "Imposing_obligation", 
"Rewards_and_punishments", "Institutionalization", "Testing", "Ingestion", "Labeling", "Kidnapping", "Submitting_documents", "Prison", "Justifying", "Emergency", "Terrorism", "Vocalizations", "Risk", "Resolve_problem", "Revenge", "Limiting", "Research", "Having_or_lacking_access", "Theft", "Incident", "Award"] 146 | def _create_examples(self, fin, set_type): 147 | """Creates examples for the training and dev sets.""" 148 | examples = [] 149 | lines=fin.readlines() 150 | for (_, data_raw) in enumerate(lines): 151 | data=json.loads(data_raw) 152 | for event in data['events']: 153 | if event['type']=='None of the above': 154 | print("?????????") 155 | for mention in event['mention']: 156 | e_id = "%s-%s" % (set_type, mention['id']) 157 | examples.append( 158 | InputExample( 159 | example_id=e_id, 160 | tokens=data['content'][mention['sent_id']]['tokens'], 161 | triggerL=mention['offset'][0], 162 | triggerR=mention['offset'][1], 163 | label=event['type'], 164 | ) 165 | ) 166 | for nIns in data['negative_triggers']: 167 | e_id = "%s-%s" % (set_type, nIns['id']) 168 | examples.append( 169 | InputExample( 170 | example_id=e_id, 171 | tokens=data['content'][nIns['sent_id']]['tokens'], 172 | triggerL=nIns['offset'][0], 173 | triggerR=nIns['offset'][1], 174 | label='None', 175 | ) 176 | ) 177 | 178 | return examples 179 | 180 | 181 | class MAVENInferProcessor(DataProcessor): 182 | """Processor for the RACE data set.""" 183 | 184 | def get_test_examples(self, data_dir): 185 | """See base class.""" 186 | logger.info("LOOKING AT {} test".format(data_dir)) 187 | return self._create_examples(open(os.path.join(data_dir,'test.jsonl'),"r"), "test") 188 | 189 | def get_labels(self): 190 | """See base class.""" 191 | return ["None", "Know", "Warning", "Catastrophe", "Placing", "Causation", "Arriving", "Sending", "Protest", "Preventing_or_letting", "Motion", "Damaging", "Destroying", "Death", "Perception_active", "Presence", "Influence", "Receiving", "Check", "Hostile_encounter", "Killing", "Conquering", "Releasing", "Attack", "Earnings_and_losses", "Choosing", "Traveling", "Recovering", "Using", "Coming_to_be", "Cause_to_be_included", "Process_start", "Change_event_time", "Reporting", "Bodily_harm", "Suspicion", "Statement", "Cause_change_of_position_on_a_scale", "Coming_to_believe", "Expressing_publicly", "Request", "Control", "Supporting", "Defending", "Building", "Military_operation", "Self_motion", "GetReady", "Forming_relationships", "Becoming_a_member", "Action", "Removing", "Surrendering", "Agree_or_refuse_to_act", "Participation", "Deciding", "Education_teaching", "Emptying", "Getting", "Besieging", "Creating", "Process_end", "Body_movement", "Expansion", "Telling", "Change", "Legal_rulings", "Bearing_arms", "Giving", "Name_conferral", "Arranging", "Use_firearm", "Committing_crime", "Assistance", "Surrounding", "Quarreling", "Expend_resource", "Motion_directional", "Bringing", "Communication", "Containing", "Manufacturing", "Social_event", "Robbery", "Competition", "Writing", "Rescuing", "Judgment_communication", "Change_tool", "Hold", "Being_in_operation", "Recording", "Carry_goods", "Cost", "Departing", "GiveUp", "Change_of_leadership", "Escaping", "Aiming", "Hindering", "Preserving", "Create_artwork", "Openness", "Connect", "Reveal_secret", "Response", "Scrutiny", "Lighting", "Criminal_investigation", "Hiding_objects", "Confronting_problem", "Renting", "Breathing", "Patrolling", "Arrest", "Convincing", "Commerce_sell", "Cure", "Temporary_stay", "Dispersal", "Collaboration", "Extradition", 
"Change_sentiment", "Commitment", "Commerce_pay", "Filling", "Becoming", "Achieve", "Practice", "Cause_change_of_strength", "Supply", "Cause_to_amalgamate", "Scouring", "Violence", "Reforming_a_system", "Come_together", "Wearing", "Cause_to_make_progress", "Legality", "Employment", "Rite", "Publishing", "Adducing", "Exchange", "Ratification", "Sign_agreement", "Commerce_buy", "Imposing_obligation", "Rewards_and_punishments", "Institutionalization", "Testing", "Ingestion", "Labeling", "Kidnapping", "Submitting_documents", "Prison", "Justifying", "Emergency", "Terrorism", "Vocalizations", "Risk", "Resolve_problem", "Revenge", "Limiting", "Research", "Having_or_lacking_access", "Theft", "Incident", "Award"] 192 | def _create_examples(self, fin, set_type): 193 | """Creates examples for the training and dev sets.""" 194 | examples = [] 195 | lines=fin.readlines() 196 | for (_, data_raw) in enumerate(lines): 197 | data=json.loads(data_raw) 198 | for mention in data['candidates']: 199 | e_id = "%s-%s" % (set_type, mention['id']) 200 | examples.append( 201 | InputExample( 202 | example_id=e_id, 203 | tokens=data['content'][mention['sent_id']]['tokens'], 204 | triggerL=mention['offset'][0], 205 | triggerR=mention['offset'][1], 206 | label='None', 207 | ) 208 | ) 209 | return examples 210 | 211 | def convert_examples_to_features( 212 | examples: List[InputExample], 213 | label_list: List[str], 214 | max_length: int, 215 | tokenizer: PreTrainedTokenizer, 216 | pad_token_segment_id=0, 217 | pad_on_left=False, 218 | pad_token=0, 219 | mask_padding_with_zero=True, 220 | ) -> List[InputFeatures]: 221 | """ 222 | Loads a data file into a list of `InputFeatures` 223 | """ 224 | 225 | label_map = {label: i for i, label in enumerate(label_list)} 226 | 227 | features = [] 228 | for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"): 229 | if ex_index % 10000 == 0: 230 | logger.info("Writing example %d of %d" % (ex_index, len(examples))) 231 | textL = tokenizer.tokenize(" ".join(example.tokens[:example.triggerL])) 232 | textR = tokenizer.tokenize(" ".join(example.tokens[example.triggerL:example.triggerR]))+['[unused1]']+tokenizer.tokenize(" ".join(example.tokens[example.triggerR:])) 233 | maskL = [1.0 for i in range(0,len(textL)+1)] + [0.0 for i in range(0,len(textR)+2)] 234 | maskR = [0.0 for i in range(0,len(textL)+1)] + [1.0 for i in range(0,len(textR)+2)] 235 | if len(maskL)>max_length: 236 | maskL = maskL[:max_length] 237 | if len(maskR)>max_length: 238 | maskR = maskR[:max_length] 239 | inputs = tokenizer.encode_plus( 240 | textL + ['[unused0]'] + textR, add_special_tokens=True, max_length=max_length, return_token_type_ids=True 241 | ) 242 | if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0: 243 | logger.info( 244 | "Attention! you are cropping tokens." 245 | ) 246 | 247 | input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] 248 | assert len(input_ids)==len(maskL) 249 | assert len(input_ids)==len(maskR) 250 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 251 | # tokens are attended to. 252 | attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 253 | # Zero-pad up to the sequence length. 
254 | padding_length = max_length - len(input_ids) 255 | if pad_on_left: 256 | input_ids = ([pad_token] * padding_length) + input_ids 257 | attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask 258 | token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids 259 | maskL = ([0.0] * padding_length) + maskL 260 | maskR = ([0.0] * padding_length) + maskR 261 | else: 262 | input_ids = input_ids + ([pad_token] * padding_length) 263 | attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 264 | token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) 265 | maskL = maskL + ([0.0] * padding_length) 266 | maskR = maskR + ([0.0] * padding_length) 267 | 268 | assert len(input_ids) == max_length 269 | assert len(attention_mask) == max_length 270 | assert len(token_type_ids) == max_length 271 | 272 | label = label_map[example.label] 273 | 274 | if ex_index < 2: 275 | logger.info("*** Example ***") 276 | logger.info("example_id: {}".format(example.example_id)) 277 | logger.info("input_ids: {}".format(" ".join(map(str, input_ids)))) 278 | logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask)))) 279 | logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids)))) 280 | logger.info("maskL: {}".format(" ".join(map(str, maskL)))) 281 | logger.info("maskR: {}".format(" ".join(map(str, maskR)))) 282 | logger.info("label: {}".format(label)) 283 | 284 | features.append(InputFeatures(example_id=example.example_id, input_ids=input_ids, input_mask=attention_mask, segment_ids=token_type_ids, maskL=maskL, maskR=maskR, label=label)) 285 | 286 | return features 287 | 288 | 289 | processors = {"ace": ACEProcessor, "maven": MAVENProcessor, "maven_infer": MAVENInferProcessor} 290 | 291 | 292 | MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"ace": 34, "maven": 169} 293 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/MAVEN-dataset/71151f6da53bc9df9d6c5718dc81a2e1489651bf/baselines/DMCNN_BiLSTM_(CRF)/.DS_Store -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/README.md: -------------------------------------------------------------------------------- 1 | # DMCNN & BiLSTM & BiLSTM+CRF 2 | The codes are implementations of [DMCNN](https://www.aclweb.org/anthology/P15-1017/), BiLSTM and BiLSTM+CRF for event detection on MAVEN. 3 | 4 | ## Requirements 5 | 6 | + torch==1.6 7 | + CUDA==10.2 8 | + numpy 9 | + sklearn 10 | + seqeval==1.2.2 11 | + tqdm==4.44.0 12 | 13 | ## Usage 14 | 15 | To run this code, you need to: 16 | 1. Put the raw MAVEN dataset files in `./raw` 17 | 2. Run ```python main.py --config [path of config files] --gpu [gpu, optional]``` (see the example below). 18 | We will train, evaluate, and test models in every epoch. We output the training and evaluation performance, and generate test result files to submit to [CodaLab](https://competitions.codalab.org/competitions/27320#learn_the_details-submission-format). 19 | 20 | All the hyper-parameters for the three models are in config files at `./config/`, you can modify them as you wish. 
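For example, to train and evaluate DMCNN with the config shipped in this folder (assuming GPU 0 is available; omit ```--gpu``` to run on the CPU):

```bash
python main.py --config config/dmcnn.config --gpu 0
```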
21 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/clear.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | del_list = os.listdir("data") 5 | for f in del_list: 6 | file_path = os.path.join("data", f) 7 | if os.path.isfile(file_path): 8 | os.remove(file_path) 9 | elif os.path.isdir(file_path): 10 | shutil.rmtree(file_path) 11 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/config/bilstm.config: -------------------------------------------------------------------------------- 1 | [train] 2 | epoch = 15 3 | batch_size = 200 4 | shuffle = True 5 | valid_interval = 1 6 | save_strategy = save_best 7 | 8 | [test] 9 | batch_size = 200 10 | shuffle = False 11 | 12 | [data] 13 | reader_name = MavenReader 14 | formatter_name = BilstmFormatter 15 | word2vec_file = 100.utf8 16 | split_labels = True 17 | 18 | [model] 19 | model_name = Bilstm 20 | num_layers = 1 21 | hidden_size = 256 22 | dropout = 0.5 23 | 24 | [optimizer] 25 | optimizer_name = Adam 26 | lr = 1e-3 27 | weight_decay = 1e-8 -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/config/crf.config: -------------------------------------------------------------------------------- 1 | [train] 2 | epoch = 15 3 | batch_size = 40 4 | shuffle = True 5 | valid_interval = 1 6 | save_strategy = save_best 7 | 8 | [test] 9 | batch_size = 40 10 | shuffle = False 11 | 12 | [data] 13 | reader_name = MavenReader 14 | formatter_name = CrfFormatter 15 | word2vec_file = 100.utf8 16 | sequence_length = 128 17 | BIO = True 18 | pad_label_id = -100 19 | 20 | [model] 21 | model_name = Crf 22 | num_layers = 1 23 | hidden_size = 400 24 | dropout = 0.3 25 | 26 | [optimizer] 27 | optimizer_name = Adam 28 | lr = 1e-3 29 | weight_decay = 1e-8 -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/config/dmcnn.config: -------------------------------------------------------------------------------- 1 | [train] 2 | epoch = 15 3 | batch_size = 170 4 | shuffle = True 5 | valid_interval = 1 6 | save_strategy = save_best 7 | 8 | [test] 9 | batch_size = 170 10 | shuffle = False 11 | 12 | [data] 13 | reader_name = MavenReader 14 | formatter_name = DmcnnFormatter 15 | word2vec_file = 100.utf8 16 | split_labels = True 17 | 18 | [model] 19 | model_name = Dmcnn 20 | pf_dim = 5 21 | llf_num = 3 22 | kernel_size = 3 23 | hidden_size = 200 24 | dropout = 0.5 25 | 26 | [optimizer] 27 | optimizer_name = Adam 28 | lr = 1e-3 29 | weight_decay = 1e-8 -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/formatter/BilstmFormatter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils.global_variables import Global 3 | 4 | class BilstmFormatter(object): 5 | def __init__(self, config): 6 | self.config = config 7 | 8 | def process(self, data, mode): 9 | """ 10 | :param data: [{"tokens": list(int), "labels": int}, ...] 
:param mode: train/valid/test 12 | :return: {"tokens": LongTensor, 13 | "labels": LongTensor, 14 | "lengths": LongTensor, 15 | "indices": LongTensor} 16 | """ 17 | tokens, canids, labels, lengths, indices, docids = [], [], [], [], [], [] 18 | 19 | sequence_length = self.config.getint("runtime", "sequence_length") 20 | 21 | for item in data: 22 | length = len(item["tokens"]) 23 | docids.append(item["docids"]) 24 | tokens.append(item["tokens"] + [Global.word2id[""]] * (sequence_length - length)) 25 | canids.append(item["canids"]) 26 | if mode != "test": 27 | labels.append(item["labels"]) 28 | lengths.append(length) 29 | indices.append(item["index"]) 30 | 31 | tlt = lambda t: torch.LongTensor(t) 32 | tt = lambda t: torch.Tensor(t) 33 | tokens, lengths, indices = tlt(tokens), tlt(lengths), tlt(indices) 34 | if mode != "test": 35 | labels = tlt(labels) 36 | 37 | return {"tokens": tokens, 38 | "labels": labels, 39 | "lengths": lengths, 40 | "indices": indices, 41 | "canids": canids, 42 | "docids": docids} if mode != "test" else { 43 | "tokens": tokens, 44 | "lengths": lengths, 45 | "indices": indices, 46 | "canids": canids, 47 | "docids": docids 48 | } 49 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/formatter/CrfFormatter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils.global_variables import Global 3 | 4 | class CrfFormatter(object): 5 | def __init__(self, config): 6 | self.config = config 7 | self.pad_label_id = config.getint("data", "pad_label_id") 8 | 9 | def process(self, data, mode): 10 | """ 11 | :param data: [{"tokens": list(int), "labels": list(int)}, ...] 12 | :param mode: train/valid/test 13 | :return: {"tokens": LongTensor, 14 | "labels": LongTensor, 15 | "masks": LongTensor, 16 | "lengths": LongTensor} 17 | """ 18 | tokens, canids, labels, flags, masks, lengths, docids = [], [], [], [], [], [], [] 19 | 20 | sequence_length = self.config.getint("runtime", "sequence_length") 21 | 22 | for item in data: 23 | docid = item["docids"] 24 | token = item["tokens"] 25 | canid_ = item["canids"] 26 | if mode != "test": 27 | label = item["labels"] 28 | else: 29 | label = [0] * len(token) 30 | flag = item["flags"] if "flags" in item else [1] * len(token) 31 | if len(token) > sequence_length: 32 | token = token[:sequence_length] 33 | canid_ = canid_[:sequence_length] 34 | label = label[:sequence_length] 35 | flag = flag[:sequence_length] 36 | length = len(token) 37 | token += [Global.word2id[""]] * (sequence_length - length) 38 | label += [self.pad_label_id] * (sequence_length - length) 39 | canid = [] 40 | for i in range(len(flag)): 41 | if flag[i] == 1: 42 | canid.append(canid_[i]) 43 | flag += [0] * (sequence_length - length) 44 | for i in range(sequence_length): 45 | if i < length and flag[i] == 1: 46 | assert label[i] != self.pad_label_id 47 | docids.append(docid) 48 | tokens.append(token) 49 | canids.append(canid) 50 | labels.append(label) 51 | flags.append(flag) 52 | masks.append([1] * length + [0] * (sequence_length - length)) 53 | lengths.append(length) 54 | for i in range(length): 55 | assert labels[-1][i] != self.pad_label_id 56 | 57 | tlt = lambda t: torch.LongTensor(t) 58 | tt = lambda t: torch.Tensor(t) 59 | 60 | tokens, labels, masks, lengths = tlt(tokens), tlt(labels), tlt(masks), tlt(lengths) 61 | 62 | return {"tokens": tokens, 63 | "labels": labels, 64 | "flags": flags, 65 | "masks": masks, 66 | "lengths": lengths, 67 | "canids": canids, 68 
| "docids": docids} 69 | 70 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/formatter/DmcnnFormatter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils.global_variables import Global 3 | 4 | 5 | class DmcnnFormatter(object): 6 | def __init__(self, config): 7 | self.config = config 8 | 9 | def process(self, data, mode): 10 | """ 11 | :param data: [{"tokens": list(int), "labels": int}, ...] 12 | :param mode: train/valid/test 13 | :return: {"tokens": LongTensor, 14 | "lables": LongTensor, 15 | "pfs": LongTensor, 16 | "llfs": LongTensor, 17 | "masks": Tensor} 18 | """ 19 | tokens, canids, labels, masks, pfs, llfs, docids = [], [], [], [], [], [], [] 20 | 21 | sequence_length = self.config.getint("runtime", "sequence_length") 22 | 23 | for item in data: 24 | length = len(item["tokens"]) 25 | docids.append(item["docids"]) 26 | tokens.append(item["tokens"] + [Global.word2id[""]] * (sequence_length - length)) 27 | canids.append(item["canids"]) 28 | if mode != "test": 29 | labels.append(item["labels"]) 30 | mask = [] 31 | for i in range(sequence_length): 32 | if 0 <= i <= item["index"]: 33 | mask.append([100, 0]) 34 | elif i < length: 35 | mask.append([0, 100]) 36 | else: 37 | mask.append([0, 0]) 38 | masks.append(mask) 39 | pfs.append([abs(item["index"] - x) for x in range(sequence_length)]) 40 | if item["index"] == 0: 41 | llfs.append([0] + tokens[-1][item["index"]: item["index"] + 2]) 42 | elif item["index"] == sequence_length - 1: 43 | llfs.append(tokens[-1][item["index"] - 1: item["index"] + 1] + [0]) 44 | else: 45 | llfs.append(tokens[-1][item["index"] - 1: item["index"] + 2]) 46 | assert len(llfs[-1]) == 3 47 | 48 | tlt = lambda t: torch.LongTensor(t) 49 | tt = lambda t: torch.Tensor(t) 50 | tokens, pfs, llfs = tlt(tokens), tlt(pfs), tlt(llfs) 51 | masks = tt(masks) 52 | if mode != "test": 53 | labels = tlt(labels) 54 | 55 | return {"tokens": tokens, 56 | "labels": labels, 57 | "pfs": pfs, 58 | "llfs": llfs, 59 | "masks": masks, 60 | "canids": canids, 61 | "docids": docids} if mode != "test" else { 62 | "tokens": tokens, 63 | "pfs": pfs, 64 | "llfs": llfs, 65 | "masks": masks, 66 | "canids": canids, 67 | "docids": docids 68 | } 69 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/formatter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/MAVEN-dataset/71151f6da53bc9df9d6c5718dc81a2e1489651bf/baselines/DMCNN_BiLSTM_(CRF)/formatter/__init__.py -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | from utils.configparser_hook import get_config 4 | from utils.global_variables import Global 5 | from utils.initializer import initialize 6 | from utils.runner import run 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | required_args = ["config"] 11 | normal_args = ["gpu"] 12 | for arg in required_args + normal_args: 13 | parser.add_argument("--{}".format(arg), required=arg in required_args) 14 | args = parser.parse_args() 15 | 16 | device = torch.device("cuda:{}".format(args.gpu) if args.gpu and torch.cuda.is_available() else "cpu") 17 | Global.device = device 18 | print("Device:", device) 19 | 20 | config = 
get_config(args.config) 21 | config.add_section("runtime") 22 | 23 | parameters = initialize(config, device) 24 | 25 | run(parameters, config, device) 26 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/Bilstm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from utils.global_variables import Global 4 | from model.layers import embedding, outputLayer 5 | 6 | class Bilstm(nn.Module): 7 | def __init__(self, config): 8 | super(Bilstm, self).__init__() 9 | self.config = config 10 | self.embedding = embedding.Embedding(config) 11 | self.rnn = DynamicRNN(config) 12 | 13 | self.dropout = nn.Dropout(config.getfloat("model", "dropout")) 14 | self.fc = nn.Linear(in_features=config.getint("model", "hidden_size"), 15 | out_features=config.getint("runtime", "num_class"), 16 | bias=True) 17 | self.out = outputLayer.OutputLayer(config) 18 | print(self) 19 | 20 | 21 | def forward(self, data, **params): 22 | """ 23 | :param data: 这一轮输入的数据 24 | :param params: 存放任何其它需要的信息 25 | """ 26 | mode = params["mode"] 27 | tokens = data["tokens"] # [B, L] 28 | if mode != "test": 29 | labels = data["labels"] # [B, ] 30 | lengths = data["lengths"] # [B, ] 31 | indices = data["indices"] # [B, ] 32 | 33 | prediction = self.embedding(tokens) # [B, L, E] 34 | prediction = self.dropout(prediction) 35 | prediction = self.rnn(prediction, lengths, indices) # [B, H] 36 | prediction = self.fc(prediction) # [B, N] 37 | 38 | if mode != "test": 39 | loss = self.out(prediction, labels) 40 | prediction = torch.argmax(prediction, dim=1) 41 | 42 | return {"loss": loss, 43 | "prediction": prediction, 44 | "labels": labels} if mode != "test" else { 45 | "prediction": prediction 46 | } 47 | 48 | 49 | class DynamicRNN(nn.Module): 50 | def __init__(self, config): 51 | super(DynamicRNN, self).__init__() 52 | self.embedding_size = config.getint("runtime", "embedding_size") 53 | self.sequence_length = config.getint("runtime", "sequence_length") 54 | self.num_layers = config.getint("model", "num_layers") 55 | self.hidden_size = config.getint("model", "hidden_size") 56 | self.rnn = nn.LSTM(input_size=self.embedding_size, 57 | hidden_size=self.hidden_size // 2, 58 | num_layers=self.num_layers, 59 | bias=True, 60 | batch_first=True, 61 | dropout=0, 62 | bidirectional=True) 63 | 64 | def forward(self, inputs, lengths, indices): 65 | embedding_packed = nn.utils.rnn.pack_padded_sequence(input=inputs, 66 | lengths=lengths, 67 | batch_first=True, 68 | enforce_sorted=False) 69 | outputs, _ = self.rnn(embedding_packed, None) 70 | outputs, _ = nn.utils.rnn.pad_packed_sequence(sequence=outputs, 71 | batch_first=True, 72 | padding_value=0.0, 73 | total_length=self.sequence_length) 74 | outputs = outputs[torch.arange(inputs.shape[0]), indices] 75 | return outputs 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/Crf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.utils.rnn import pad_sequence 4 | from utils.global_variables import Global 5 | from model.layers import embedding, crf, outputLayer 6 | 7 | class Crf(nn.Module): 8 | def __init__(self, config): 9 | super(Crf, self).__init__() 10 | self.config = config 11 | self.embedding = embedding.Embedding(config) 12 | self.rnn = DynamicRNN(config) 13 | self.dropout = 
nn.Dropout(config.getfloat("model", "dropout")) 14 | self.hidden2tag = nn.Linear(in_features=config.getint("model", "hidden_size"), 15 | out_features=config.getint("runtime", "num_class") + 2, 16 | bias=True) 17 | self.pad_label_id = config.getint("data", "pad_label_id") 18 | self.crf = crf.CRF(tagset_size=config.getint("runtime", "num_class"), use_gpu=Global.device) 19 | print(self) 20 | 21 | def forward(self, data, **params): 22 | """ 23 | :param data: 这一轮输入的数据 24 | :param params: 存放任何其它需要的信息 25 | """ 26 | mode = params["mode"] 27 | tokens = data["tokens"] # [B, L] 28 | labels = data["labels"] # [B, L] 29 | lengths = data["lengths"] # [B, ] 30 | flags = data["flags"] 31 | attention_masks = data["masks"] # [B, L] 32 | 33 | prediction = self.embedding(tokens) # [B, L, E] 34 | prediction = self.dropout(prediction) 35 | prediction = self.rnn(prediction, lengths) # [B, L, H] 36 | prediction = self.hidden2tag(prediction) # [B, L, N+2] 37 | 38 | pad_masks = (labels != self.pad_label_id) 39 | loss_masks = ((attention_masks == 1) & pad_masks) 40 | 41 | if params["crf_mode"] == "train": 42 | crf_labels, crf_masks = self.to_crf_pad(labels, loss_masks) 43 | crf_logits, _ = self.to_crf_pad(prediction, loss_masks) 44 | loss = self.crf.neg_log_likelihood(crf_logits, crf_masks, crf_labels) 45 | return {"loss": loss, 46 | "prediction": None, 47 | "labels": None} 48 | 49 | elif params["crf_mode"] == "test": 50 | masks = (attention_masks == 1) 51 | crf_logits, crf_masks = self.to_crf_pad(prediction, masks) 52 | crf_masks = crf_masks.sum(axis=2) == crf_masks.shape[2] 53 | best_path = self.crf(crf_logits, crf_masks) 54 | temp_labels = (torch.ones(loss_masks.shape) * self.pad_label_id).to(torch.long) 55 | prediction = self.unpad_crf(best_path, crf_masks, temp_labels, masks) 56 | return {"loss": None, 57 | "prediction": self.normalize(prediction, flags, lengths), 58 | "labels": self.normalize(labels, flags, lengths)} if mode != "test" else { 59 | "prediction": self.normalize(prediction, flags, lengths) 60 | } 61 | 62 | else: 63 | raise NotImplementedError 64 | 65 | def normalize(self, logits, flags, lengths): 66 | results = [] 67 | logits = logits.tolist() 68 | lengths = lengths.tolist() 69 | for logit, flag, length in zip(logits, flags, lengths): 70 | result = [] 71 | for i in range(length): 72 | if flag[i] == 1: 73 | assert logit[i] != self.pad_label_id 74 | result.append(Global.id2label[str(logit[i])]) 75 | results.append(result) 76 | return results 77 | 78 | def to_crf_pad(self, org_array, org_mask): 79 | crf_array = [aa[bb] for aa, bb in zip(org_array, org_mask)] 80 | crf_array = pad_sequence(crf_array, batch_first=True, padding_value=self.pad_label_id) 81 | crf_pad = (crf_array != self.pad_label_id) 82 | crf_array[~crf_pad] = 0 83 | return crf_array, crf_pad 84 | 85 | def unpad_crf(self, returned_array, returned_mask, org_array, org_mask): 86 | out_array = org_array.clone().detach().to(Global.device) 87 | out_array[org_mask] = returned_array[returned_mask] 88 | return out_array 89 | 90 | 91 | class DynamicRNN(nn.Module): 92 | def __init__(self, config): 93 | super(DynamicRNN, self).__init__() 94 | self.embedding_size = config.getint("runtime", "embedding_size") 95 | self.sequence_length = config.getint("runtime", "sequence_length") 96 | self.num_layers = config.getint("model", "num_layers") 97 | self.hidden_size = config.getint("model", "hidden_size") 98 | self.rnn = nn.LSTM(input_size=self.embedding_size, 99 | hidden_size=self.hidden_size // 2, 100 | num_layers=self.num_layers, 101 | bias=True, 102 | 
batch_first=True, 103 | dropout=0, 104 | bidirectional=True) 105 | 106 | def forward(self, inputs, lengths): 107 | embedding_packed = nn.utils.rnn.pack_padded_sequence(input=inputs, 108 | lengths=lengths, 109 | batch_first=True, 110 | enforce_sorted=False) 111 | outputs, _ = self.rnn(embedding_packed, None) 112 | outputs, _ = nn.utils.rnn.pad_packed_sequence(sequence=outputs, 113 | batch_first=True, 114 | padding_value=0.0, 115 | total_length=self.sequence_length) 116 | return outputs 117 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/Dmcnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from model.layers import embedding, outputLayer 4 | 5 | 6 | class Dmcnn(nn.Module): 7 | def __init__(self, config): 8 | super(Dmcnn, self).__init__() 9 | self.config = config 10 | self.embedding = embedding.Embedding(config) 11 | self.pf_embedding = nn.Embedding(num_embeddings=config.getint("runtime", "sequence_length"), 12 | embedding_dim=config.getint("model", "pf_dim")) 13 | self.cnn = _CNN(config) 14 | self.pooling = _DynamicPooling(config) 15 | self.dropout = nn.Dropout(config.getfloat("model", "dropout")) 16 | self.fc = nn.Linear(in_features=config.getint("model", "llf_num") * config.getint("runtime", "embedding_size") + 2 * config.getint("model", "hidden_size"), 17 | out_features=config.getint("runtime", "num_class"), 18 | bias=True) 19 | self.out = outputLayer.OutputLayer(config) 20 | print(self) 21 | 22 | def forward(self, data, **params): 23 | """ 24 | :param data: 这一轮输入的数据 25 | :param params: 存放任何其它需要的信息 26 | """ 27 | mode = params["mode"] 28 | tokens = data["tokens"] 29 | if mode != "test": 30 | labels = data["labels"] 31 | masks = data["masks"] 32 | pfs = data["pfs"] 33 | llfs = data["llfs"] 34 | 35 | llf = self.embedding(llfs).view(-1, self.config.getint("model", "llf_num") * self.config.getint("runtime", "embedding_size")) 36 | prediction = torch.cat((self.embedding(tokens), self.pf_embedding(pfs)), dim=-1) # [B, L, E+P] 37 | prediction = self.cnn(prediction) # [B, H, L] 38 | prediction = self.pooling(prediction, masks) # [B, 2*H] 39 | prediction = self.dropout(prediction) 40 | prediction = torch.cat((prediction, llf), dim=-1) # [B, l*E+2*H] 41 | prediction = self.fc(prediction) # [B, N] 42 | 43 | if mode != "test": 44 | loss = self.out(prediction, labels) 45 | prediction = torch.argmax(prediction, dim=1) 46 | 47 | return {"loss": loss, 48 | "prediction": prediction, 49 | "labels": labels} if mode != "test" else { 50 | "prediction": prediction 51 | } 52 | 53 | 54 | class _CNN(nn.Module): 55 | def __init__(self, config): 56 | super(_CNN, self).__init__() 57 | self.in_channels = config.getint("runtime", "embedding_size") + config.getint("model", "pf_dim") 58 | self.out_channels = config.getint("model", "hidden_size") 59 | self.kernel_size = config.getint("model", "kernel_size") 60 | self.padding_size = (self.kernel_size - 1) >> 1 61 | self.cnn = nn.Conv1d(in_channels=self.in_channels, 62 | out_channels=self.out_channels, 63 | kernel_size=self.kernel_size, 64 | stride=1, 65 | padding=self.padding_size) 66 | self.activation = nn.ReLU() 67 | 68 | def forward(self, inputs): 69 | inputs = inputs.permute(0, 2, 1) # [B, L, E+P] -> [B, E+P, L] 70 | prediction = self.cnn(inputs) # [B, E+P, L] -> [B, H, L] 71 | prediction = self.activation(prediction) # [B, H, L] 72 | return prediction 73 | 74 | class _DynamicPooling(nn.Module): 75 | def 
__init__(self, config): 76 | super(_DynamicPooling, self).__init__() 77 | self.hidden_size = config.getint("model", "hidden_size") 78 | 79 | def forward(self, inputs, masks): 80 | inputs = torch.unsqueeze(inputs, dim=-1) # [B, H, L] -> [B, H, L, 1] 81 | masks = torch.unsqueeze(masks, dim=1) # [B, L, 3] -> [B, 1, L, 3] 82 | prediction = torch.max(masks + inputs, dim=2)[0] 83 | prediction -= 100 84 | prediction = prediction.view(-1, 2 * self.hidden_size) 85 | return prediction -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/MAVEN-dataset/71151f6da53bc9df9d6c5718dc81a2e1489651bf/baselines/DMCNN_BiLSTM_(CRF)/model/__init__.py -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/layers/crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-12-04 23:19:38 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-05-27 22:48:17 6 | # Modified from original https://github.com/jiesutd/LatticeLSTM/blob/master/model/crf.py 7 | 8 | import torch 9 | import torch.autograd as autograd 10 | import torch.nn as nn 11 | 12 | 13 | # Compute log sum exp in a numerically stable way for the forward algorithm 14 | def log_sum_exp(vec, m_size): 15 | """ 16 | calculate log of exp sum 17 | args: 18 | vec (batch_size, vanishing_dim, hidden_dim) : input tensor 19 | m_size : hidden_dim 20 | return: 21 | batch_size, hidden_dim 22 | """ 23 | _, idx = torch.max(vec, 1) # B * 1 * M 24 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 25 | return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, m_size) # B * M 26 | 27 | 28 | class CRF(nn.Module): 29 | 30 | def __init__(self, tagset_size, use_gpu=False): # average_batch=False, 31 | super(CRF, self).__init__() 32 | print("build CRF...") 33 | # self.average_batch = average_batch 34 | self.gpu = use_gpu 35 | 36 | self.START_TAG = -2 37 | self.STOP_TAG = -1 38 | 39 | self.tagset_size = tagset_size 40 | 41 | # # We add 2 here, because of START_TAG and STOP_TAG 42 | # self.hidden2tag = nn.Linear(params['hidden_dim'], self.tagset_size + 2) 43 | # # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag 44 | init_transitions = torch.zeros(self.tagset_size + 2, self.tagset_size + 2) 45 | init_transitions[:, self.START_TAG] = -10000.0 46 | init_transitions[self.STOP_TAG, :] = -10000.0 47 | if torch.cuda.is_available(): 48 | init_transitions = init_transitions.cuda(self.gpu) 49 | self.transitions = nn.Parameter(init_transitions, requires_grad=True) 50 | 51 | def init_hidden_cell(self, batch_size, layer_hidden_dim): 52 | return (torch.randn(2, batch_size, layer_hidden_dim // 2), 53 | torch.randn(2, batch_size, layer_hidden_dim // 2)) 54 | 55 | def _calculate_PZ(self, feats, mask): 56 | """ 57 | input: 58 | feats: (batch, seq_len, self.tag_size+2) 59 | masks: (batch, seq_len) 60 | """ 61 | batch_size = feats.size(0) 62 | seq_len = feats.size(1) 63 | tag_size = feats.size(2) 64 | # print feats.view(seq_len, tag_size) 65 | assert (tag_size == self.tagset_size + 2) 66 | mask = mask.transpose(1, 0).contiguous() 67 | ins_num = seq_len * batch_size 68 | ## be careful the view 
shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 69 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 70 | ## need to consider start 71 | scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 72 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 73 | # build iter 74 | seq_iter = enumerate(scores) 75 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 76 | # only need start from start_tag 77 | partition = inivalues[:, self.START_TAG, :].clone().view(batch_size, tag_size, 1) # bat_size * to_target_size 78 | 79 | ## add start score (from start to all tag, duplicate to batch_size) 80 | # partition = partition + self.transitions[START_TAG,:].view(1, tag_size, 1).expand(batch_size, tag_size, 1) 81 | # iter over last scores 82 | for idx, cur_values in seq_iter: 83 | # previous to_target is current from_target 84 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 85 | # cur_values: bat_size * from_target * to_target 86 | 87 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 88 | cur_partition = log_sum_exp(cur_values, tag_size) 89 | # print cur_partition.data 90 | 91 | # (bat_size * from_target * to_target) -> (bat_size * to_target) 92 | # partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1) 93 | mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size) 94 | 95 | ## effective updated partition part, only keep the partition value of mask value = 1 96 | masked_cur_partition = cur_partition.masked_select(mask_idx) 97 | ## let mask_idx broadcastable, to disable warning 98 | mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1) 99 | 100 | ## replace the partition where the maskvalue=1, other partition value keeps the same 101 | partition.masked_scatter_(mask_idx, masked_cur_partition) 102 | # until the last state, add transition score for all partition (and do log_sum_exp) then select the value in STOP_TAG 103 | cur_values = self.transitions.view(1, tag_size, tag_size).expand(batch_size, tag_size, tag_size) + partition.contiguous().view( batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 104 | 105 | cur_partition = log_sum_exp(cur_values, tag_size) 106 | final_partition = cur_partition[:, self.STOP_TAG] 107 | return final_partition.sum(), scores 108 | 109 | def _viterbi_decode(self, feats, mask): 110 | """ 111 | input: 112 | feats: (batch, seq_len, self.tag_size+2) 113 | mask: (batch, seq_len) 114 | output: 115 | decode_idx: (batch, seq_len) decoded sequence 116 | path_score: (batch, 1) corresponding score for each sequence (to be implementated) 117 | """ 118 | batch_size = feats.size(0) 119 | seq_len = feats.size(1) 120 | tag_size = feats.size(2) 121 | assert (tag_size == self.tagset_size + 2) 122 | ## calculate sentence length for each sentence 123 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 124 | ## mask to (seq_len, batch_size) 125 | mask = mask.transpose(1, 0).contiguous() 126 | ins_num = seq_len * batch_size 127 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 128 | feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 129 | ## need to consider start 130 | scores = feats + 
self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 131 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 132 | 133 | # build iter 134 | seq_iter = enumerate(scores) 135 | ## record the position of best score 136 | back_points = list() 137 | partition_history = list() 138 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 139 | # mask = 1 + (-1)*mask 140 | # mask = (1 - mask.long()).byte() 141 | mask = ~(mask) 142 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 143 | # only need start from start_tag 144 | partition = inivalues[:, self.START_TAG, :].clone().view(batch_size, tag_size) # bat_size * to_target_size 145 | # print "init part:",partition.size() 146 | partition_history.append(partition) 147 | # iter over last scores 148 | for idx, cur_values in seq_iter: 149 | # previous to_target is current from_target 150 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 151 | # cur_values: batch_size * from_target * to_target 152 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 153 | ## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG 154 | # print "cur value:", cur_values.size() 155 | partition, cur_bp = torch.max(cur_values, 1) 156 | # print "partsize:",partition.size() 157 | # exit(0) 158 | # print partition 159 | # print cur_bp 160 | # print "one best, ",idx 161 | partition_history.append(partition) 162 | ## cur_bp: (batch_size, tag_size) max source score position in current tag 163 | ## set padded label as 0, which will be filtered in post processing 164 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0) 165 | back_points.append(cur_bp) 166 | # exit(0) 167 | ### add score to final STOP_TAG 168 | partition_history = torch.cat(partition_history, 0).view(seq_len, batch_size, -1).transpose(1, 0).contiguous() ## (batch_size, seq_len. 
tag_size) 169 | ### get the last position for each setences, and select the last partitions using gather() 170 | last_position = length_mask.view(batch_size, 1, 1).expand(batch_size, 1, tag_size) - 1 171 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, 1) 172 | ### calculate the score from last partition to end state (and then select the STOP_TAG from it) 173 | last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(1, tag_size, tag_size).expand(batch_size, tag_size, tag_size) 174 | _, last_bp = torch.max(last_values, 1) 175 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long() 176 | if torch.cuda.is_available(): 177 | pad_zero = pad_zero.cuda(self.gpu) 178 | back_points.append(pad_zero) 179 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size) 180 | 181 | ## select end ids in STOP_TAG 182 | pointer = last_bp[:, self.STOP_TAG] 183 | insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, tag_size) 184 | back_points = back_points.transpose(1, 0).contiguous() 185 | ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values 186 | # print "lp:",last_position 187 | # print "il:",insert_last 188 | back_points.scatter_(1, last_position, insert_last) 189 | # print "bp:",back_points 190 | # exit(0) 191 | back_points = back_points.transpose(1, 0).contiguous() 192 | ## decode from the end, padded position ids are 0, which will be filtered if following evaluation 193 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size)) 194 | if torch.cuda.is_available(): 195 | decode_idx = decode_idx.cuda(self.gpu) 196 | decode_idx[-1] = pointer.data # detach() 197 | for idx in range(len(back_points) - 2, -1, -1): 198 | pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size, 1)) 199 | decode_idx[idx] = pointer.data.t() # feili pointer.detach().view(batch_size) 200 | path_score = None 201 | decode_idx = decode_idx.transpose(1, 0) 202 | return path_score, decode_idx 203 | 204 | def forward(self, feats, mask): 205 | path_score, best_path = self._viterbi_decode(feats, mask) 206 | # return path_score, best_path 207 | return best_path 208 | 209 | def _score_sentence(self, scores, mask, tags): 210 | """ 211 | input: 212 | scores: variable (seq_len, batch, tag_size, tag_size) 213 | mask: (batch, seq_len) 214 | tags: tensor (batch, seq_len) 215 | output: 216 | score: sum of score for gold sequences within whole batch 217 | """ 218 | # Gives the score of a provided tag sequence 219 | batch_size = scores.size(1) 220 | seq_len = scores.size(0) 221 | tag_size = scores.size(2) 222 | ## convert tag value into a new format, recorded label bigram information to index 223 | new_tags = autograd.Variable(torch.LongTensor(batch_size, seq_len)) 224 | if torch.cuda.is_available(): 225 | new_tags = new_tags.cuda(self.gpu) 226 | for idx in range(seq_len): 227 | if idx == 0: 228 | ## start -> first score 229 | new_tags[:, 0] = (tag_size - 2) * tag_size + tags[:, 0] 230 | 231 | else: 232 | new_tags[:, idx] = tags[:, idx - 1] * tag_size + tags[:, idx] 233 | 234 | ## transition for label to STOP_TAG 235 | end_transition = self.transitions[:, self.STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size) 236 | ## length for batch, last word position = length - 1 237 | length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long() 238 | ## index the label id of last word 239 | end_ids = 
torch.gather(tags, 1, length_mask - 1) 240 | 241 | ## index the transition score for end_id to STOP_TAG 242 | end_energy = torch.gather(end_transition, 1, end_ids) 243 | 244 | ## convert tag as (seq_len, batch_size, 1) 245 | new_tags = new_tags.transpose(1, 0).contiguous().view(seq_len, batch_size, 1) 246 | ### need convert tags id to search from 400 positions of scores 247 | tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(seq_len, batch_size) # seq_len * bat_size 248 | ## mask transpose to (seq_len, batch_size) 249 | assert mask.transpose(1, 0).size() == tg_energy.size() 250 | tg_energy = tg_energy.masked_select(mask.transpose(1, 0)) 251 | 252 | # ## calculate the score from START_TAG to first label 253 | # start_transition = self.transitions[START_TAG,:].view(1, tag_size).expand(batch_size, tag_size) 254 | # start_energy = torch.gather(start_transition, 1, tags[0,:]) 255 | 256 | ## add all score together 257 | # gold_score = start_energy.sum() + tg_energy.sum() + end_energy.sum() 258 | gold_score = tg_energy.sum() + end_energy.sum() 259 | return gold_score 260 | 261 | def neg_log_likelihood(self, feats, mask, tags): 262 | # nonegative log likelihood 263 | forward_score, scores = self._calculate_PZ(feats, mask) 264 | # print('Forward', forward_score) 265 | gold_score = self._score_sentence(scores, mask, tags) 266 | # print('Gold', gold_score) 267 | # print "batch, f:", forward_score.data[0], " g:", gold_score.data[0], " dis:", forward_score.data[0] - gold_score.data[0] 268 | # exit(0) 269 | # if self.average_batch: 270 | # return (forward_score - gold_score) / batch_size 271 | # else: 272 | return forward_score - gold_score 273 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/layers/embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from utils.global_variables import Global 5 | 6 | class Embedding(nn.Module): 7 | def __init__(self, config): 8 | super(Embedding, self).__init__() 9 | if Global.word2vec_mat is None: 10 | weight = None 11 | self.vocab_size = config.getint("runtime", "vocab_size") 12 | self.embedding_size = config.getint("runtime", "embedding_size") 13 | else: 14 | weight = torch.from_numpy(Global.word2vec_mat).float() 15 | self.vocab_size, self.embedding_size = weight.size() 16 | self.embedding = nn.Embedding.from_pretrained(weight) 17 | 18 | def forward(self, input): 19 | return self.embedding(input) 20 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/model/layers/outputLayer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from utils.global_variables import Global 5 | from utils.evaluation import Evaluation 6 | 7 | class OutputLayer(nn.Module): 8 | def __init__(self, config): 9 | super(OutputLayer, self).__init__() 10 | self.num_class = config.getint("runtime", "num_class") 11 | self.criterion = nn.CrossEntropyLoss() 12 | 13 | def forward(self, prediction, labels): 14 | loss = self.criterion(prediction, labels) # ([B, N], [B,]) 15 | return loss -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/reader/MavenReader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import 
json 3 | import copy 4 | import codecs 5 | import numpy as np 6 | from tqdm import tqdm 7 | from utils.global_variables import Global 8 | 9 | class MavenReader(object): 10 | def __init__(self, config): 11 | self.config = config 12 | self.data = [] 13 | self.raw_dir = "./raw" 14 | self.data_dir = "./data" 15 | self.flag_dir = "{}{}".format(config.get("data", "reader_name")[:-6], "crf" if config.has_option("data", "BIO") else "") 16 | self.word2vec_source_file = config.get("data", "word2vec_file") 17 | self.word2vec_file = "word2vec.npy" 18 | self.modes = ["train", "valid", "test"] 19 | 20 | def read(self, mode): 21 | """ 22 | :param mode: train/valid/test 23 | :return: [{"tokens": list(int), "labels": list(int)}, ...] 24 | """ 25 | self.data.clear() 26 | if not os.path.exists(os.path.join(self.data_dir, self.flag_dir, 'flag')): 27 | os.makedirs(os.path.join(self.data_dir, self.flag_dir)) 28 | self.preprocess() 29 | with open(os.path.join(self.data_dir, self.flag_dir, "{}_processed.json".format(mode)), "r+", encoding="utf-8") as f: 30 | data = json.load(f) 31 | if Global.word2vec_mat is None: 32 | Global.word2vec_mat = np.load(os.path.join(self.data_dir, self.word2vec_file)) 33 | Global.word2id = data["word2id"] 34 | Global.id2word = data["id2word"] 35 | Global.label2id = data["label2id"] 36 | Global.id2label = data["id2label"] 37 | if self.config.has_option("data", "BIO"): 38 | Global.type2id = data["type2id"] 39 | for item in data["info"]: 40 | tokens = [data["word2id"][x] if x in data["word2id"] else data["word2id"][""] for x in item["tokens"]] 41 | if mode != "test": 42 | labels = [data["label2id"][x] for x in item["labels"]] 43 | canids = item["canids"] 44 | docids = item["docids"] 45 | if self.config.has_option("data", "split_labels"): 46 | for i in range(len(canids)): 47 | if item["flags"][i]: 48 | if mode != "test": 49 | temp = {"tokens": tokens, 50 | "labels": labels[i], 51 | "canids": canids[i], 52 | "docids": docids, 53 | "index": i} 54 | else: 55 | temp = {"tokens": tokens, 56 | "canids": canids[i], 57 | "docids": docids, 58 | "index": i} 59 | self.data.append(temp) 60 | else: 61 | if mode != "test": 62 | temp = {"tokens": tokens, 63 | "labels": labels, 64 | "canids": canids, 65 | "docids": docids, 66 | "flags": item["flags"]} 67 | else: 68 | temp = {"tokens": tokens, 69 | "canids": canids, 70 | "docids": docids, 71 | "flags": item["flags"]} 72 | self.data.append(temp) 73 | 74 | self.config.set("runtime", "vocab_size", Global.word2vec_mat.shape[0]) 75 | self.config.set("runtime", "embedding_size", Global.word2vec_mat.shape[1]) 76 | self.config.set("runtime", "num_class", len(data["label2id"])) 77 | self.config.set("runtime", "sequence_length", data["sequence_length"]) 78 | 79 | print("Mode: {} | Dataset Size = {}".format(mode, len(self.data))) 80 | return copy.deepcopy(self.data) 81 | 82 | def preprocess(self): 83 | """ 84 | :return: 输出文件、整合数据以及词向量矩阵 85 | 整合数据格式:{ 86 | "info":[{"tokens": list(str), "labels": list(str), "flags": list(bool)}, ...], 87 | "word2id": {"": 0, "": 1}, 88 | "id2word": {0: "", 1: ""}, 89 | "label2id": {"None": 0}, 90 | "id2label": {0: "None"}, 91 | "sequence_length": int 92 | } 93 | """ 94 | 95 | embedding_dict = self.load_embedding_dict(os.path.join(self.raw_dir, self.word2vec_source_file)) 96 | 97 | processed_data = {"info_train": [], 98 | "info_valid": [], 99 | "info_test": [], 100 | "word2id": {}, 101 | "id2word": {}, 102 | "label2id": {}, 103 | "id2label": {}, 104 | "sequence_length": 0} 105 | 106 | if self.config.has_option("data", "BIO"): 107 | 
processed_data["label2id"]["O"] = 0 108 | processed_data["id2label"][0] = "O" 109 | processed_data["type2id"] = {"O": 0} 110 | else: 111 | processed_data["label2id"]["None"] = 0 112 | processed_data["id2label"][0] = "None" 113 | 114 | for mode in self.modes: 115 | with codecs.open(os.path.join(self.raw_dir, "{}.jsonl".format(mode)), 'r', encoding="utf-8", errors="ignore") as f: 116 | lines = f.readlines() 117 | for line in lines: 118 | line = line.rstrip() 119 | doc = json.loads(line) 120 | docids = doc["id"] 121 | doc_tokens, doc_labels, doc_canids, doc_flags = [], [], [], [] 122 | for item in doc["content"]: 123 | doc_tokens.append(item["tokens"]) 124 | 125 | if self.config.has_option("data", "BIO"): 126 | 127 | for tokens in doc_tokens: 128 | if mode != "test": 129 | doc_labels.append(["O"] * len(tokens)) 130 | doc_canids.append([""] * len(tokens)) 131 | doc_flags.append([0] * len(tokens)) 132 | 133 | if mode == "test": 134 | for candi in doc["candidates"]: 135 | for i in range(candi["offset"][0], candi["offset"][1]): 136 | doc_canids[candi["sent_id"]][i] = candi["id"] 137 | doc_flags[candi["sent_id"]][i] = 1 138 | else: 139 | for event in doc["events"]: 140 | tp = event["type"].replace("-", "_") 141 | if tp not in processed_data["type2id"]: 142 | processed_data["type2id"][tp] = event["type_id"] 143 | for mention in event["mention"]: 144 | for i in range(mention["offset"][0], mention["offset"][1]): 145 | doc_labels[mention["sent_id"]][i] = ("B-" + tp) if (i == mention["offset"][0]) else ("I-" + tp) 146 | doc_canids[mention["sent_id"]][i] = mention["id"] 147 | doc_flags[mention["sent_id"]][i] = 1 148 | 149 | else: 150 | 151 | for tokens in doc_tokens: 152 | if mode != "test": 153 | doc_labels.append(["None"] * len(tokens)) 154 | doc_canids.append([""] * len(tokens)) 155 | doc_flags.append([0] * len(tokens)) 156 | processed_data["sequence_length"] = max(processed_data["sequence_length"], len(tokens)) 157 | 158 | if mode == "test": 159 | for candi in doc["candidates"]: 160 | for i in range(candi["offset"][0], candi["offset"][1]): 161 | doc_canids[candi["sent_id"]][i] = candi["id"] 162 | doc_flags[candi["sent_id"]][i] = 1 163 | else: 164 | for event in doc["events"]: 165 | if event["type"] not in processed_data["label2id"]: 166 | processed_data["label2id"][event["type"]] = event["type_id"] 167 | processed_data["id2label"][event["type_id"]] = event["type"] 168 | for mention in event["mention"]: 169 | for i in range(mention["offset"][0], mention["offset"][1]): 170 | doc_labels[mention["sent_id"]][i] = event["type"] 171 | doc_canids[mention["sent_id"]][i] = mention["id"] 172 | doc_flags[mention["sent_id"]][i] = 1 173 | 174 | if mode != "test": 175 | for mention in doc["negative_triggers"]: 176 | for i in range(mention["offset"][0], mention["offset"][1]): 177 | doc_canids[mention["sent_id"]][i] = mention["id"] 178 | doc_flags[mention["sent_id"]][i] = 1 179 | 180 | for tokens, labels, canids, flags in zip(doc_tokens, doc_labels, doc_canids, doc_flags): 181 | processed_data["info_{}".format(mode)].append({"tokens": tokens, 182 | "labels": labels, 183 | "canids": canids, 184 | "flags": flags, 185 | "docids": docids}) 186 | if self.config.has_option("data", "BIO"): 187 | for label in labels: 188 | if label not in processed_data["label2id"]: 189 | id = len(processed_data["label2id"]) 190 | processed_data["label2id"][label] = id 191 | processed_data["id2label"][id] = label 192 | else: 193 | for tokens, canids, flags in zip(doc_tokens, doc_canids, doc_flags): 194 | 
processed_data["info_{}".format(mode)].append({"tokens": tokens, 195 | "canids": canids, 196 | "flags": flags, 197 | "docids": docids}) 198 | 199 | 200 | if self.config.has_option("data", "BIO"): 201 | processed_data["sequence_length"] = self.config.getint("data", "sequence_length") 202 | 203 | word2vec_mat = [] 204 | for (k, v) in embedding_dict.items(): 205 | id = len(processed_data["word2id"]) 206 | processed_data["word2id"][k] = id 207 | processed_data["id2word"][id] = k 208 | word2vec_mat.append(v) 209 | word2vec_mat = np.array(word2vec_mat, dtype=np.float32) 210 | if not os.path.exists(os.path.join(self.data_dir, self.word2vec_file)): 211 | np.save(os.path.join(self.data_dir, self.word2vec_file), word2vec_mat) 212 | 213 | for mode in self.modes: 214 | with open(os.path.join(self.data_dir, self.flag_dir, "{}_processed.json".format(mode)), "w", encoding="utf-8") as f: 215 | temp_data = {"info": processed_data["info_{}".format(mode)], 216 | "word2id": processed_data["word2id"], 217 | "id2word": processed_data["id2word"], 218 | "label2id": processed_data["label2id"], 219 | "id2label": processed_data["id2label"], 220 | "sequence_length": processed_data["sequence_length"]} 221 | if self.config.has_option("data", "BIO"): 222 | temp_data["type2id"] = processed_data["type2id"] 223 | json.dump(temp_data, f, indent=2, ensure_ascii=False) 224 | 225 | with open(os.path.join(self.data_dir, self.flag_dir, 'flag'), "w+") as f: 226 | f.write("") 227 | 228 | 229 | 230 | def load_embedding_dict(self, path): 231 | with open(path, "r", encoding="utf-8") as f: 232 | lines = f.readlines() 233 | embedding_dict = {} 234 | for line in lines: 235 | split = line.split(" ") 236 | embedding_dict[split[0]] = np.array(list(map(float, split[1:]))) 237 | return embedding_dict -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/reader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/MAVEN-dataset/71151f6da53bc9df9d6c5718dc81a2e1489651bf/baselines/DMCNN_BiLSTM_(CRF)/reader/__init__.py -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THU-KEG/MAVEN-dataset/71151f6da53bc9df9d6c5718dc81a2e1489651bf/baselines/DMCNN_BiLSTM_(CRF)/utils/__init__.py -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/configparser_hook.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import configparser 3 | 4 | class ConfigParserHook(object): 5 | def __init__(self): 6 | self.config = configparser.RawConfigParser() 7 | 8 | def read(self, config_file): 9 | self.config.read(config_file, encoding="utf-8") 10 | 11 | def set_hook(func_name): 12 | @functools.wraps(getattr(configparser.RawConfigParser, func_name)) 13 | def wrapper(self, *args, **kwargs): 14 | return getattr(self.config, func_name)(*args, **kwargs) 15 | 16 | return wrapper 17 | 18 | def get_config(config_file): 19 | for func_name in dir(configparser.RawConfigParser): 20 | if not func_name.startswith("_") and func_name != "read": 21 | setattr(ConfigParserHook, func_name, set_hook(func_name)) 22 | setattr(ConfigParserHook, "__getitem__", set_hook("__getitem__")) 23 | 24 | config = ConfigParserHook() 25 | config.read(config_file) 
26 | 27 | return config -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.metrics import precision_recall_fscore_support 3 | from seqeval.metrics import precision_score, recall_score, f1_score 4 | from utils.global_variables import Global 5 | 6 | import warnings 7 | warnings.filterwarnings('always') 8 | 9 | class Evaluation(object): 10 | def __init__(self, config): 11 | super(Evaluation).__init__() 12 | self.config = config 13 | self.y_pred = [] 14 | self.y_true = [] 15 | self.labels = [v for (k, v) in Global.label2id.items() if k != "None"] 16 | 17 | def get_metric(self, mode, batch_pred=None, batch_true=None): 18 | average = ["micro", "macro"] 19 | metrics = ["precision", "recall", "f1"] 20 | ret = {"{}_{}".format(t1, t2) : 0.0 for t1 in average for t2 in metrics} 21 | if mode == "batch": 22 | assert batch_pred is not None 23 | assert batch_true is not None 24 | batch_pred = torch.argmax(batch_pred, dim=1) 25 | y_pred = self.normalize(batch_pred) 26 | y_true = self.normalize(batch_true) 27 | elif mode == "all": 28 | y_pred = self.y_pred 29 | y_true = self.y_true 30 | else: 31 | raise NotImplementedError 32 | assert len(y_pred) == len(y_true) 33 | for av in average: 34 | if self.config.has_option("data", "BIO"): 35 | ret["{}_precision".format(av)] = precision_score(y_true=y_true, y_pred=y_pred) 36 | ret["{}_recall".format(av)] = recall_score(y_true=y_true, y_pred=y_pred) 37 | ret["{}_f1".format(av)] = f1_score(y_true=y_true, y_pred=y_pred) 38 | else: 39 | ret["{}_precision".format(av)], ret["{}_recall".format(av)], ret["{}_f1".format(av)], _ = precision_recall_fscore_support(y_true=y_true, 40 | y_pred=y_pred, 41 | labels=self.labels, 42 | average=av, 43 | zero_division=0) 44 | return {key : ('%.4f' % value) for key, value in ret.items() if key.startswith("micro") or key.endswith("f1")} 45 | 46 | def expand(self, batch_pred, batch_true): 47 | y_pred = batch_pred if isinstance(batch_pred, list) else self.normalize(batch_pred) 48 | y_true = batch_true if isinstance(batch_true, list) else self.normalize(batch_true) 49 | self.y_pred += y_pred 50 | self.y_true += y_true 51 | 52 | def normalize(self, x): 53 | return x.cpu().numpy().tolist() 54 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/global_variables.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class Global(object): 4 | device = None 5 | word2vec_mat = None 6 | word2id = None 7 | id2word = None 8 | label2id = None 9 | id2label = None 10 | type2id = None 11 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/initializer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import torch.optim as optim 5 | from torch.utils.data import DataLoader 6 | 7 | get_class = lambda attr, name: getattr(__import__("{}.{}".format(attr, name), fromlist=["dummy"]), name) 8 | 9 | def initialize(config, device): 10 | parameters = {} 11 | 12 | reader = get_class("reader", config.get("data", "reader_name"))(config) 13 | formatter = get_class("formatter", config.get("data", "formatter_name"))(config) 14 | batch_size = config.getint("train" ,"batch_size") 15 | shuffle = 
config.getboolean("train", "shuffle") 16 | 17 | collate_fn_decr = lambda mode: (lambda data, mode=mode: formatter.process(data, mode)) 18 | 19 | dataset_train = reader.read("train") 20 | dataset_valid = reader.read("valid") 21 | dataset_test = reader.read("test") 22 | parameters["dataset_train"] = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn_decr("train")) 23 | parameters["dataset_valid"] = DataLoader(dataset=dataset_valid, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn_decr("valid")) 24 | parameters["dataset_test"] = DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn_decr("test")) 25 | 26 | parameters["model"] = get_class("model", config.get("model", "model_name"))(config) 27 | parameters["model"] = parameters["model"].to(device) 28 | 29 | parameters["optimizer"] = get_optim(parameters["model"], config) 30 | 31 | return parameters 32 | 33 | def get_optim(model, config): 34 | hyper_params = {key: value for key, value in config["optimizer"].items() if key != "optimizer_name"} 35 | optimizer_name = config.get("optimizer", "optimizer_name") 36 | optimizer = getattr(optim, optimizer_name) 37 | command = "optim(params, {})".format(", ".join(["{}={}".format(key, value) for key, value in hyper_params.items()])) 38 | return eval(command, {"optim": optimizer, "params": model.parameters()}) 39 | -------------------------------------------------------------------------------- /baselines/DMCNN_BiLSTM_(CRF)/utils/runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import copy 5 | import torch 6 | from utils.global_variables import Global 7 | from utils.evaluation import Evaluation 8 | 9 | def run(parameters, config, device): 10 | trained_epoch = -1 11 | max_epoch = config.getint("train", "epoch") 12 | valid_interval = config.getint("train", "valid_interval") 13 | saver = {} 14 | for epoch in range(trained_epoch + 1, max_epoch): 15 | run_one_epoch(parameters, config, device, epoch, "train") 16 | if epoch % valid_interval == 0: 17 | with torch.no_grad(): 18 | valid_metric = run_one_epoch(parameters, config, device, epoch, "valid") 19 | test_metric = run_one_epoch(parameters, config, device, epoch, "test") 20 | print() 21 | if saver == {} or valid_metric["micro_f1"] > saver["valid"]["micro_f1"]: 22 | saver["epoch"] = epoch 23 | saver["valid"] = valid_metric 24 | saver["test"] = test_metric 25 | with open("./data/results_{}.jsonl".format(config.get("data", "formatter_name")[:-9]), "w", encoding="utf-8") as f: 26 | for (k, v) in test_metric.items(): 27 | f.write(json.dumps({"id": k, 28 | "predictions": v})) 29 | f.write('\n') 30 | 31 | print("Best Epoch {}\nValid Metric: {}".format(saver["epoch"], saver["valid"])) 32 | 33 | 34 | def run_one_epoch(parameters, config, device, epoch, mode): 35 | model = parameters["model"] 36 | 37 | if mode == "train": 38 | model.train() 39 | optimizer = parameters["optimizer"] 40 | elif mode == "valid" or mode == "test": 41 | model.eval() 42 | else: 43 | raise NotImplementedError 44 | 45 | dataset = copy.deepcopy(parameters["dataset_{}".format(mode)]) 46 | pred = {} 47 | total_loss = 0 48 | evaluation = Evaluation(config) 49 | for step, data in enumerate(dataset): 50 | for key in data: 51 | if isinstance(data[key], torch.Tensor): 52 | data[key] = data[key].to(device) 53 | 54 | if mode == "train": 55 | optimizer.zero_grad() 56 | 57 | if config.get("model", "model_name") == "Crf": 
58 | if mode != "test": 59 | results = model(data=data, mode=mode, crf_mode="train") 60 | loss = results["loss"] 61 | total_loss += loss.item() 62 | results = model(data=data, mode=mode, crf_mode="test") 63 | evaluation.expand(results["prediction"], results["labels"]) 64 | else: 65 | results = model(data=data, mode=mode, crf_mode="test") 66 | prediction = results["prediction"] 67 | if not isinstance(prediction, list): 68 | prediction = prediction.cpu().numpy().tolist() 69 | docids = data["docids"] 70 | canids = data["canids"] 71 | for doc, can, pre in zip(docids, canids, prediction): 72 | if doc not in pred.keys(): 73 | pred[doc] = [] 74 | assert (len(can) == len(pre)) 75 | for c, p in zip(can, pre): 76 | if p != "O": 77 | p = p[2:] 78 | assert p in Global.type2id.keys() 79 | pred[doc].append({"id": c, 80 | "type_id": Global.type2id[p]}) 81 | else: 82 | results = model(data=data, mode=mode) 83 | if mode != "test": 84 | loss = results["loss"] 85 | total_loss += loss.item() 86 | evaluation.expand(results["prediction"], results["labels"]) 87 | else: 88 | prediction = results["prediction"].cpu().numpy().tolist() 89 | docids = data["docids"] 90 | canids = data["canids"] 91 | for did, cid, pre in zip(docids, canids, prediction): 92 | if did not in pred.keys(): 93 | pred[did] = [] 94 | pred[did].append({"id": cid, 95 | "type_id": pre}) 96 | if mode != "test": 97 | print("\r{}: Epoch {} Step {:0>4d}/{} | Loss = {:.4f}".format(mode, epoch, step + 1, len(dataset), round(total_loss / (step + 1), 4)), end="") 98 | else: 99 | print("\r{}: Epoch {} Step {:0>4d}/{}".format(mode, epoch, step + 1, len(dataset)), end="") 100 | 101 | if mode == "train": 102 | loss.backward() 103 | optimizer.step() 104 | 105 | if mode != "test": 106 | metric = evaluation.get_metric("all") 107 | sys.stdout.write("\r") 108 | print("\r{}: Epoch {} | Metric: {}".format(mode, epoch, metric)) 109 | return metric 110 | else: 111 | return pred -------------------------------------------------------------------------------- /baselines/MOGANED/README.md: -------------------------------------------------------------------------------- 1 | # MOGANED 2 | The code is an **unofficial** implementation of [Event Detection with Multi-order Graph Convolution and Aggregated Attention](https://www.aclweb.org/anthology/D19-1582/) (EMNLP 2019 paper). 3 | 4 | ## Requirements 5 | 6 | - tensorflow-gpu==1.10 w/ CUDA 9 (or tensorflow-gpu==1.14 w/ CUDA 10.0) 7 | 8 | - stanfordcorenlp (see https://github.com/Lynten/stanford-corenlp for details) 9 | 10 | - numpy 11 | 12 | - tqdm 13 | 14 | ## Usage 15 | 16 | To run this code, you need to: 17 | 1. Modify the MAVEN dataset path, the GloVe file path and the stanfordcorenlp path in ```constant.py```. 18 | 2. Run ```python train.py --gpu [YOUR_GPU] --mode MOGANED``` to train. 19 | 3. Run ```python train.py --gpu [YOUR_GPU] --mode MOGANED --eval``` to get predictions on the test set (dumped to ```results.jsonl```). 20 | 21 | All hyper-parameters are in ```constant.py```; you can modify them as you wish. 22 | 23 | ## About Preprocessing 24 | 25 | When you first run this code, it will preprocess the dataset. The preprocessing is quite slow and may take a whole night (so start it and go to sleep!), because extracting dependency trees with Stanford CoreNLP is slow; a minimal sketch of this step is shown below. 26 | 27 | However, preprocessing only runs once, and the preprocessed files are dumped to the MAVEN dataset path. The next time you run the code, it will read them and won't do any preprocessing again.
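For a concrete picture of why this step is slow, the sketch below shows the kind of per-sentence CoreNLP round trip the preprocessing performs. It is a minimal illustration, not this repo's actual code: only ```corenlp_path``` is taken from ```constant.py```, and the helper name ```parse_sentence``` is hypothetical.

```python
# Minimal sketch (assumptions: the stanfordcorenlp package is installed and
# constant.corenlp_path points at an unpacked CoreNLP distribution; the
# function parse_sentence is illustrative and does not exist in this repo).
from stanfordcorenlp import StanfordCoreNLP

import constant

# Booting the CoreNLP Java server is expensive, so create it once and reuse
# it for every sentence in the corpus.
nlp = StanfordCoreNLP(constant.corenlp_path)

def parse_sentence(tokens):
    """Return (POS tags, dependency edges) for one pre-tokenized sentence."""
    sentence = " ".join(tokens)
    pos = nlp.pos_tag(sentence)            # [(word, POS), ...]
    deps = nlp.dependency_parse(sentence)  # [(relation, head_idx, dep_idx), ...]
    # CoreNLP re-tokenizes the joined string, so in real preprocessing the
    # output still has to be aligned back to MAVEN's own token offsets.
    return pos, deps

print(parse_sentence(["They", "organized", "a", "march", "."]))
nlp.close()
```

Every sentence costs one request to the CoreNLP server, and the corpus has to be parsed sentence by sentence, which is why a full pass can take a night.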
28 | 29 | ## Results on MAVEN 30 | 31 | We ran this code and submitted the results to the CodaLab leaderboard (username: wzq016): 32 | |Method|Precision|Recall|F1| 33 | |--|--|--|--| 34 | |MOGANED (Paper)|63.4 ± 0.88|64.1 ± 0.90|63.8 ± 0.18| 35 | |MOGANED (Leaderboard)|64.7 ± 0.05|66.0 ± 0.02|65.3 ± 0.01| 36 | 37 | P.S. We updated the running environment after the MAVEN paper was published and found that the results magically improved. 38 | 39 | ## Note 40 | 41 | There are some differences in training strategy between this code and the original MOGANED paper: 42 | 1. The code doesn't use the BIO schema. Since trigger words in ACE05 are usually a single word rather than a phrase, this won't affect results on ACE05. 43 | 2. The code doesn't use an L2-norm penalty, only dropout. 44 | 3. The code uses AdamOptimizer rather than AdadeltaOptimizer. During experiments, I found that Adadelta couldn't train a good classifier, while Adam could. 45 | 4. This code sets the bias loss lambda to 1 rather than 5, since I found this makes the F1 score higher. 46 | 47 | ## Running on ACE 2005 48 | 49 | Please refer to [this repo](https://github.com/wzq016/MOGANED-Implementation). 50 | 51 | -------------------------------------------------------------------------------- /baselines/MOGANED/constant.py: -------------------------------------------------------------------------------- 1 | maven_path = '../GAT/data/newdataset' 2 | GloVe_file = '../GAT/glove/glove.6B.100d.txt' 3 | corenlp_path = '../GAT/stanford-corenlp-full-2018-10-05' 4 | 5 | EVENT_TYPE_TO_ID = {i:i for i in range(169)} 6 | ROLE_TO_ID = {'None': 0, 'Person': 1, 'Place': 2, 'Buyer': 3, 'Seller': 4, 'Beneficiary': 5, 'Price': 6, 'Artifact': 7, 'Origin': 8, 'Destination': 9, 'Giver': 10, 'Recipient': 11, 'Money': 12, 'Org': 13, 'Agent': 14, 'Victim': 15, 'Instrument': 16, 'Entity': 17, 'Attacker': 18, 'Target': 19, 'Defendant': 20, 'Adjudicator': 21, 'Prosecutor': 22, 'Plaintiff': 23, 'Crime': 24, 'Position': 25, 'Sentence': 26, 'Vehicle': 27, 'Time-Within': 28, 'Time-Starting': 29, 'Time-Ending': 30, 'Time-Before': 31, 'Time-After': 32, 'Time-Holds': 33, 'Time-At-Beginning': 34, 'Time-At-End': 35} 7 | 8 | NER_TO_ID = {'<PAD>': 0, '<UNK>': 1, 'O': 2, 'PERSON': 3, 'ORGANIZATION': 4, 'LOCATION': 5, 'DATE': 6, 9 | 'NUMBER': 7, 'MISC': 8, 'DURATION': 9, 'MONEY': 10, 'PERCENT': 11, 'ORDINAL': 12, 'TIME': 13, 'SET': 14} 10 | 11 | POS_TO_ID = {'<PAD>': 0, '<UNK>': 1, 'NNP': 2, 'NN': 3, 'IN': 4, 'DT': 5, ',': 6, 'JJ': 7, 'NNS': 8, 'VBD': 9, 'CD': 10, 'CC': 11, '.': 12, 'RB': 13, 'VBN': 14, 'PRP': 15, 'TO': 16, 'VB': 17, 'VBG': 18, 'VBZ': 19, 'PRP$': 20, ':': 21, 'POS': 22, 12 | '\'\'': 23, '``': 24, '-RRB-': 25, '-LRB-': 26, 'VBP': 27, 'MD': 28, 'NNPS': 29, 'WP': 30, 'WDT': 31, 'WRB': 32, 'RP': 33, 'JJR': 34, 'JJS': 35, '$': 36, 'FW': 37, 'RBR': 38, 'SYM': 39, 'EX': 40, 'RBS': 41, 'WP$': 42, 'PDT': 43, 'LS': 44, 'UH': 45, '#': 46} 13 | 14 | 15 | 16 | INF = 1e8 17 | 18 | #general hyperparams 19 | embedding_dim = 100 20 | posi_embedding_dim = 50 21 | event_type_embedding_dim = 5 22 | cut_len = 50 #set None to not cut length 23 | 24 | 25 | #trigger hyperparameters 26 | t_filters = 200 27 | t_batch_size = 30 28 | t_lr = 0.001 29 | t_epoch = 10 30 | t_keepprob = 0.7 31 | t_bias_lambda = 1 32 | 33 | #GAT hypers 34 | pos_dim = 50 35 | ner_dim = 50 36 | hidden_dim = 100 37 | 38 | Watt_dim = 100 39 | s_dim = 100 40 | 41 | leaky_alpha = 0.2 42 | graph_dim = 150 43 | 44 | K=3 45 | 46 | -------------------------------------------------------------------------------- /baselines/MOGANED/func.py: 
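# Module overview (descriptive summary of the definitions below): Cudnn_RNN
# wraps stacked bidirectional GRU/LSTM layers, using cuDNN kernels on GPU and
# tf.contrib.rnn cells otherwise; dropout shares the dropout mask across the
# feature dimension in "embedding" mode and across timesteps in "recurrent"
# mode; f_score computes precision/recall/F1 treating label 0 as the negative
# class; get_batch yields shuffled mini-batches together with sparse sub-graph
# indices; get_trigger_feeddict builds the feed dicts for the DMCNN and
# MOGANED trigger models; u_compute and GAC_func implement the graph-attention
# aggregation used by MOGANED; matmuls computes repeated matrix products
# (a matrix power) for the higher-order graphs.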
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | import constant 5 | from tqdm import tqdm 6 | 7 | class Cudnn_RNN: 8 | 9 | def __init__(self, num_layers, num_units, mode="lstm",keep_prob=1.0, is_train=None, scope="cudnn_rnn", gpu=True): 10 | self.num_layers = num_layers 11 | self.rnns = [] 12 | self.mode = mode 13 | if mode == "gru": 14 | if gpu: 15 | rnn = tf.contrib.cudnn_rnn.CudnnGRU 16 | else: 17 | rnn = tf.contrib.rnn.GRUCell 18 | elif mode == "lstm": 19 | if gpu: 20 | rnn = tf.contrib.cudnn_rnn.CudnnLSTM 21 | else: 22 | rnn = tf.contrib.rnn.BasicLSTM 23 | else: 24 | raise Exception("Unknown mode for rnn") 25 | for layer in range(num_layers): 26 | if gpu: 27 | rnn_fw = rnn(1, num_units) 28 | rnn_bw = rnn(1, num_units) 29 | else: 30 | rnn_fw = rnn(num_units) 31 | rnn_bw = rnn(num_units) 32 | self.rnns.append((rnn_fw, rnn_bw, )) 33 | 34 | def __call__(self, inputs, seq_len, keep_prob=1.0, is_train=None, concat_layers=True): 35 | outputs = [tf.transpose(inputs, [1, 0, 2])] 36 | for layer in range(self.num_layers): 37 | rnn_fw, rnn_bw = self.rnns[layer] 38 | output = dropout(outputs[-1], keep_prob=keep_prob, is_train=is_train) 39 | with tf.variable_scope("fw_{}".format(layer)): 40 | out_fw, state_fw = rnn_fw(output) 41 | with tf.variable_scope("bw_{}".format(layer)): 42 | inputs_bw = tf.reverse_sequence(output, seq_lengths=seq_len, seq_axis=0, batch_axis=1) 43 | out_bw, state_bw = rnn_bw(inputs_bw) 44 | out_bw = tf.reverse_sequence(out_bw, seq_lengths=seq_len, seq_axis=0, batch_axis=1) 45 | outputs.append(tf.concat([out_fw, out_bw], axis=2)) 46 | if concat_layers is True: 47 | res = tf.concat(outputs[1:], axis=2) 48 | else: 49 | res = outputs[-1] 50 | res = tf.transpose(res, [1, 0, 2]) 51 | state_fw = tf.squeeze(state_fw[0], [0]) 52 | state_bw = tf.squeeze(state_bw[0], [0]) 53 | state = tf.concat([state_fw, state_bw], axis=1) 54 | return res, state 55 | 56 | def dropout(args, keep_prob, is_train, mode=None): 57 | if keep_prob < 1.0: 58 | noise_shape = None 59 | scale = 1.0 60 | shape = tf.shape(args) 61 | if mode == "embedding" and len(args.get_shape().as_list()) == 3: 62 | noise_shape = [shape[0], shape[1], 1] 63 | scale = keep_prob 64 | if mode == "recurrent" and len(args.get_shape().as_list()) == 3: 65 | noise_shape = [shape[0], 1, shape[-1]] 66 | args = tf.cond(is_train, lambda: tf.nn.dropout( 67 | args, keep_prob, noise_shape=noise_shape), lambda: args) 68 | return args 69 | 70 | def f_score(predict,golden,mode='f'): 71 | assert len(predict)==len(golden) 72 | TP = 0 73 | FP = 0 74 | FN = 0 75 | TN = 0 76 | for i in range(len(predict)): 77 | if predict[i]==golden[i] and predict[i] != 0: 78 | TP+=1 79 | elif predict[i]!=golden[i]: 80 | if predict[i]==0: 81 | FN+=1 82 | elif golden[i]==0: 83 | FP+=1 84 | else: 85 | FN+=1 86 | FP+=1 87 | else: 88 | TN+=1 89 | try: 90 | P = TP/(TP+FP) 91 | R = TP/(TP+FN) 92 | F = 2*P*R/(P+R) 93 | except: 94 | P=R=F=0 95 | 96 | if mode=='f': 97 | return P,R,F 98 | else: 99 | return TP,FN,FP,TN 100 | 101 | def get_batch(data_all,batch_size,shuffle=True): 102 | data,data_subg = data_all 103 | assert len(list(set([np.shape(d)[0] for d in data]))) == 1 104 | num_data = np.shape(data[0])[0] 105 | indices = list(np.arange(0,num_data)) 106 | if shuffle: 107 | random.shuffle(indices) 108 | for i in tqdm(range((num_data // batch_size)+1)): 109 | select_indices = indices[i*batch_size:(i+1)*batch_size] 110 | select_subg_indices = [[idx]+indice for 
70 | def f_score(predict,golden,mode='f'):
71 |     assert len(predict)==len(golden)
72 |     TP = 0
73 |     FP = 0
74 |     FN = 0
75 |     TN = 0
76 |     for i in range(len(predict)):
77 |         if predict[i]==golden[i] and predict[i] != 0:
78 |             TP+=1
79 |         elif predict[i]!=golden[i]:
80 |             if predict[i]==0:
81 |                 FN+=1
82 |             elif golden[i]==0:
83 |                 FP+=1
84 |             else:
85 |                 FN+=1
86 |                 FP+=1
87 |         else:
88 |             TN+=1
89 |     try:
90 |         P = TP/(TP+FP)
91 |         R = TP/(TP+FN)
92 |         F = 2*P*R/(P+R)
93 |     except ZeroDivisionError:
94 |         P=R=F=0
95 | 
96 |     if mode=='f':
97 |         return P,R,F
98 |     else:
99 |         return TP,FN,FP,TN
100 | 
101 | def get_batch(data_all,batch_size,shuffle=True):
102 |     data,data_subg = data_all
103 |     assert len(list(set([np.shape(d)[0] for d in data]))) == 1
104 |     num_data = np.shape(data[0])[0]
105 |     indices = list(np.arange(0,num_data))
106 |     if shuffle:
107 |         random.shuffle(indices)
108 |     for i in tqdm(range((num_data // batch_size)+1)):
109 |         select_indices = indices[i*batch_size:(i+1)*batch_size]
110 |         select_subg_indices = [[idx]+indice for idx,select_indice in enumerate(select_indices) for indice in data_subg[select_indice]] # prepend the in-batch index to each sparse adjacency entry
111 |         yield [np.take(d,select_indices,axis=0) for d in data]+[select_subg_indices]
112 | 
113 | def get_trigger_feeddict(model,batch,stage,maxlen,is_train=True):
114 |     if stage=='DMCNN':
115 |         posis,sents,maskls,maskrs,event_types,lexical,_,_,_,_ = batch
116 |         return {model.posis:posis,model.sents:sents,model.maskls:maskls,model.maskrs:maskrs,
117 |                 model._labels:event_types,model.lexical:lexical,model.is_train:is_train}
118 |     else:
119 |         posis,sents,maskls,maskrs,event_types,lexical,pos,ner,trigger_idxs,subg_indices = batch
120 |         subg_vals = [1.0]*len(subg_indices)
121 |         subg_shape = [sents.shape[0],maxlen,maxlen]
122 |         subg = (subg_indices,subg_vals,subg_shape)
123 | 
124 |         gather_idxs = np.stack([np.array(np.arange(posis.shape[0])),trigger_idxs],axis=1)
125 |         return {model.posis:posis,model.sents:sents,model.maskls:maskls,model.maskrs:maskrs,
126 |                 model._labels:event_types,model.lexical:lexical,model.is_train:is_train,
127 |                 model.pos_idx:pos,model.ner_idx:ner,model.subg_a:subg,model.gather_idxs:gather_idxs}
128 | 
129 | 
130 | #GAT util function
131 | 
132 | def u_compute(ps,subg,maxlen):
133 |     with tf.variable_scope("e_compute",reuse=tf.AUTO_REUSE):
134 |         att = tf.layers.dense(ps,constant.Watt_dim,name='Watt')
135 |         left_comb = tf.layers.dense(att,1,name='comb_left')
136 |         right_comb = tf.layers.dense(att,1,name='comb_right')
137 | 
138 |         tile_left = tf.tile(left_comb,[1,1,maxlen],name='tile_1')
139 |         tile_right = tf.tile(tf.transpose(right_comb,[0,2,1],name='transpose_1'),[1,maxlen,1],name='tile_2')
140 |         tiles_concat = tile_left+tile_right # e_ij = a_left . Wh_i + a_right . Wh_j
141 | 
142 |         e_mat = tf.nn.leaky_relu(tiles_concat,alpha=constant.leaky_alpha,name='lrelu_1')
143 |     with tf.variable_scope('u_compute',reuse=tf.AUTO_REUSE):
144 |         u_raw = tf.multiply(e_mat,subg,name='mul_1')-(1-subg)*1e8 # mask non-edges before the softmax
145 |         u_mat = tf.nn.softmax(u_raw,axis=2,name='soft_1')
146 |     return u_mat
147 | 
148 | def GAC_func(ps,subg,maxlen,a,k):
149 |     with tf.variable_scope("GAC_compute",reuse=tf.AUTO_REUSE):
150 |         u_mat = u_compute(ps,subg,maxlen)
151 |         weight_name = a+'_'+str(k)
152 |         dense = tf.layers.dense(ps,constant.graph_dim,name=weight_name)
153 |         # broadcast instead of tiling to avoid materializing [batch,maxlen,maxlen,graph_dim] tensors:
154 |         # element (b,i,j,d) = u_mat[b,i,j] * dense[b,j,d], summed over the neighbour axis j below
155 |         dense_expand = tf.expand_dims(dense,1,name='expand_1')
156 |         u_mat_expand = tf.expand_dims(u_mat,3,name='expand_2')
157 |         sums = tf.reduce_sum(tf.multiply(u_mat_expand,dense_expand,name='mul1'),axis=2,name='sum_1')
158 |         graph_emb = tf.nn.elu(sums,name='elu_1')
159 |     return graph_emb
160 | 
161 | def matmuls(a,times): # k-th power of the (batched) adjacency matrix
162 |     with tf.variable_scope('matmuls_'):
163 |         res = a
164 |         for i in range(times-1):
165 |             res = tf.matmul(res,a)
166 |     return res
--------------------------------------------------------------------------------
/baselines/MOGANED/models.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import constant
3 | from func import get_batch,get_trigger_feeddict,f_score,GAC_func,Cudnn_RNN,matmuls
4 | import numpy as np
5 | 
6 | class Trigger_Model():
7 |     def __init__(self,t_data,maxlen,wordemb,stage="MOGANED"):
8 |         self.t_train,self.t_dev,self.t_test = t_data
9 |         self.maxlen = maxlen
10 |         self.wordemb = wordemb
11 |         self.stage = stage
12 |         self.build_graph()
13 | 
14 |     def build_graph(self):
15 |         if self.stage=='DMCNN':
16 |             print('--Building Trigger DMCNN Graph--')
17 |             self.build_trigger()
18 |         else:
19 |             print('--Building Trigger MOGANED Graph--')
20 |             self.build_GAT()
21 | 
22 |     def build_trigger(self,scope='DMCNN_Trigger'):
23 |         maxlen = self.maxlen
24 |         num_class = len(constant.EVENT_TYPE_TO_ID)
25 |         keepprob = constant.t_keepprob
26 |         with tf.variable_scope(scope,reuse=tf.AUTO_REUSE):
27 |             with tf.variable_scope('Initialize'):
28 |                 posi_mat = tf.concat(
29 |                     [tf.zeros([1,constant.posi_embedding_dim],tf.float32),
30 |                     tf.get_variable('posi_emb',[2*maxlen,constant.posi_embedding_dim],tf.float32,initializer=tf.contrib.layers.xavier_initializer())],axis=0)
31 |                 word_mat = tf.concat([
32 |                     tf.zeros((1, constant.embedding_dim),dtype=tf.float32),
33 |                     tf.get_variable("unk_word_embedding", [1, constant.embedding_dim], dtype=tf.float32,initializer=tf.contrib.layers.xavier_initializer()),
34 |                     tf.get_variable("wordemb", initializer=self.wordemb,trainable=True)], axis=0)
35 | 
36 |             with tf.variable_scope('placeholder'):
37 |                 self.sents = sents = tf.placeholder(tf.int32,[None,maxlen],'sents')
38 |                 self.posis = posis = tf.placeholder(tf.int32,[None,maxlen],'posis')
39 |                 self.maskls = maskls = tf.placeholder(tf.float32,[None,maxlen],'maskls')
40 |                 self.maskrs = maskrs = tf.placeholder(tf.float32,[None,maxlen],'maskrs')
41 |                 self._labels = _labels = tf.placeholder(tf.int32,[None],'labels')
42 |                 labels = tf.one_hot(_labels,num_class)
43 |                 self.is_train = is_train = tf.placeholder(tf.bool,[],'is_train')
44 |                 self.lexical = lexical = tf.placeholder(tf.int32,[None,3],'lexicals')
45 | 
46 |             sents_len = tf.reduce_sum(tf.cast(tf.cast(sents,tf.bool),tf.int32),axis=1)
47 |             sents_mask = tf.expand_dims(tf.sequence_mask(sents_len,maxlen,tf.float32),axis=2)
48 |             with tf.variable_scope('embedding'):
49 |                 sents_emb = tf.nn.embedding_lookup(word_mat,sents)
50 |                 posis_emb = tf.nn.embedding_lookup(posi_mat,posis)
51 |                 lexical_emb = tf.nn.embedding_lookup(word_mat,lexical)
52 |             with tf.variable_scope('lexical_feature'):
53 |                 lexical_feature = tf.reshape(lexical_emb,[-1,3*constant.embedding_dim])
54 |             with tf.variable_scope('encoder'):
55 |                 emb = tf.concat([sents_emb,posis_emb],axis=2)
56 |                 emb_shape = tf.shape(emb)
57 |                 pad = tf.zeros([emb_shape[0],1,emb_shape[2]],tf.float32)
58 |                 conv_input = tf.concat([pad,emb,pad],axis=1)
59 |                 conv_res = tf.layers.conv1d(
60 |                     inputs=conv_input,
61 |                     filters=constant.t_filters, kernel_size=3,
62 |                     strides=1,
63 |                     padding='valid',
64 |                     activation=tf.nn.relu,
65 |                     kernel_initializer=tf.contrib.layers.xavier_initializer(),
66 |                     name='convolution_layer')
67 |                 conv_res = tf.reshape(conv_res,[-1,maxlen,constant.t_filters])
68 |             with tf.variable_scope('maxpooling'): # DMCNN dynamic multi-pooling: max-pool left and right of the trigger separately
69 |                 maskl = tf.tile(tf.expand_dims(maskls,axis=2),[1,1,constant.t_filters])
70 |                 left = maskl*conv_res
71 |                 maskr = tf.tile(tf.expand_dims(maskrs,axis=2),[1,1,constant.t_filters])
72 |                 right = maskr*conv_res
73 |                 sentence_feature = tf.concat([tf.reduce_max(left,axis=1),tf.reduce_max(right,axis=1)],axis=1)
74 |             with tf.variable_scope('classifier'):
75 |                 feature = tf.concat([sentence_feature,lexical_feature],axis=1)
76 |                 feature = tf.layers.dropout(feature,1-constant.t_keepprob,training=is_train)
77 |                 self.logits = logits = tf.layers.dense(feature,num_class,kernel_initializer=tf.contrib.layers.xavier_initializer(),bias_initializer=tf.contrib.layers.xavier_initializer())
78 |                 self.pred = pred = tf.nn.softmax(logits,axis=1)
79 |                 self.pred_label = pred_label = tf.argmax(pred,axis=1)
80 |                 self.loss = loss = 
tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,logits=logits),axis=0) 81 | self.train_op = train_op = tf.train.AdamOptimizer(constant.t_lr).minimize(loss) 82 | 83 | def build_GAT(self,scope='MOGANED_Trigger'): 84 | maxlen = self.maxlen 85 | num_class = len(constant.EVENT_TYPE_TO_ID) 86 | keepprob = constant.t_keepprob 87 | with tf.variable_scope(scope,reuse=tf.AUTO_REUSE): 88 | with tf.variable_scope('Initialize'): 89 | posi_mat = tf.concat( 90 | [tf.zeros([1,constant.posi_embedding_dim],tf.float32), 91 | tf.get_variable('posi_emb',[2*maxlen,constant.posi_embedding_dim],tf.float32,initializer=tf.contrib.layers.xavier_initializer())],axis=0) 92 | word_mat = tf.concat([ 93 | tf.zeros((1, constant.embedding_dim),dtype=tf.float32), 94 | tf.get_variable("unk_word_embedding", [1, constant.embedding_dim], dtype=tf.float32,initializer=tf.contrib.layers.xavier_initializer()), 95 | tf.get_variable("wordemb", initializer=self.wordemb,trainable=True)], axis=0) 96 | pos_mat = tf.concat([ 97 | tf.zeros((1, constant.pos_dim),dtype=tf.float32), 98 | tf.get_variable("pos_embedding", [len(constant.POS_TO_ID)-1, constant.pos_dim], dtype=tf.float32,initializer=tf.contrib.layers.xavier_initializer())],axis=0) 99 | ner_mat = tf.concat([ 100 | tf.zeros((1, constant.ner_dim),dtype=tf.float32), 101 | tf.get_variable("ner_embedding", [len(constant.NER_TO_ID)-1, constant.ner_dim], dtype=tf.float32,initializer=tf.contrib.layers.xavier_initializer())],axis=0) 102 | 103 | with tf.variable_scope("Placeholder"): 104 | self.sents = sents = tf.placeholder(tf.int32,[None,maxlen],'sents') 105 | self.posis = posis = tf.placeholder(tf.int32,[None,maxlen],'posis') 106 | self.maskls = maskls = tf.placeholder(tf.float32,[None,maxlen],'maskls') 107 | self.maskrs = maskrs = tf.placeholder(tf.float32,[None,maxlen],'maskrs') 108 | self._labels = _labels = tf.placeholder(tf.int32,[None],'labels') 109 | labels = tf.one_hot(_labels,num_class) 110 | self.is_train = is_train = tf.placeholder(tf.bool,[],'is_train') 111 | self.lexical = lexical = tf.placeholder(tf.int32,[None,3],'lexicals') 112 | 113 | self.ner_idx = ner_idx = tf.placeholder(tf.int32,[None,maxlen],'ner_tags') 114 | self.pos_idx = pos_idx = tf.placeholder(tf.int32,[None,maxlen],'pos_tags') 115 | 116 | self.subg_a = tf.sparse_placeholder(tf.float32,[None,maxlen,maxlen],'subg') 117 | 118 | self.subg_b = tf.sparse_transpose(self.subg_a,[0,2,1]) 119 | 120 | subg_a = tf.sparse_tensor_to_dense(self.subg_a,validate_indices=False) 121 | subg_b = tf.sparse_tensor_to_dense(self.subg_b,validate_indices=False) 122 | 123 | self.gather_idxs = tf.placeholder(tf.int32,[None,2],'gather_idxs') 124 | 125 | sents_len = tf.reduce_sum(tf.cast(tf.cast(sents,tf.bool),tf.int32),axis=1) 126 | sents_mask = tf.expand_dims(tf.sequence_mask(sents_len,maxlen,tf.float32),axis=2) 127 | 128 | eyes = tf.tile(tf.expand_dims(tf.eye(maxlen),0),[tf.shape(pos_idx)[0],1,1]) 129 | 130 | with tf.variable_scope("Embedding"): 131 | sents_emb = tf.nn.embedding_lookup(word_mat,sents) 132 | posis_emb = tf.nn.embedding_lookup(posi_mat,posis) 133 | pos_emb = tf.nn.embedding_lookup(pos_mat,pos_idx) 134 | ner_emb = tf.nn.embedding_lookup(ner_mat,ner_idx) 135 | concat_emb = tf.concat([sents_emb,posis_emb,pos_emb,ner_emb],axis=2) 136 | 137 | with tf.variable_scope("Lstm_layer"): 138 | rnn = Cudnn_RNN(num_layers=1, num_units=constant.hidden_dim, keep_prob=keepprob, is_train=self.is_train) 139 | ps,_ = rnn(concat_emb, seq_len=sents_len, concat_layers=False,keep_prob=keepprob,is_train=self.is_train) 140 | 
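# MOGANED aggregation (implemented in the GAC/Aggregation scopes below):
# for each order k = 1..K, GAC_func attention-aggregates the BiLSTM states ps
# over the k-th power of the dependency adjacency matrix (subg_a), its
# transpose (subg_b, reversed edges), and the identity (self-loops), and the
# three directions are summed. The K per-order representations are then
# combined with learned attention weights vs, and gather_idxs picks out the
# representation at each candidate trigger position for classification.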
141 | with tf.variable_scope("GAC"): 142 | hs = [] 143 | for layer in range(1,constant.K+1): 144 | h_layer= GAC_func(ps,matmuls(subg_a,layer),maxlen,'a',layer)+GAC_func(ps,matmuls(subg_b,layer),maxlen,'b',layer)+GAC_func(ps,eyes,maxlen,'c',layer) 145 | hs.append(h_layer) 146 | 147 | with tf.variable_scope("Aggregation"): 148 | s_ctxs = [] 149 | for layer in range(1,constant.K+1): 150 | s_raw = tf.layers.dense(hs[layer-1],constant.s_dim,name='Wawa') 151 | s_layer = tf.nn.tanh(s_raw) 152 | ctx_apply = tf.layers.dense(s_layer,1,name='ctx',use_bias=False) 153 | s_ctxs.append(ctx_apply) 154 | vs = tf.nn.softmax(tf.concat(s_ctxs,axis=2),axis=2) #[None,maxlen,3] 155 | h_concats = tf.concat([tf.expand_dims(hs[layer],2) for layer in range(constant.K)],axis=2) 156 | final_h = tf.reduce_sum(tf.multiply(tf.expand_dims(vs,3),h_concats),axis=2) 157 | gather_final_h = tf.gather_nd(final_h,self.gather_idxs) 158 | 159 | with tf.variable_scope('classifier'): 160 | bias_weight = (constant.t_bias_lambda-1)*(1-tf.cast(tf.equal(_labels,0),tf.float32))+1 161 | self.logits = logits = tf.layers.dense(gather_final_h,num_class,kernel_initializer=tf.contrib.layers.xavier_initializer(),bias_initializer=tf.contrib.layers.xavier_initializer(),name='Wo') 162 | self.pred = pred = tf.nn.softmax(logits,axis=1) 163 | self.pred_label = pred_label = tf.argmax(pred,axis=1) 164 | self.loss = loss = tf.reduce_sum(bias_weight*tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,logits=logits),axis=0)/tf.reduce_sum(bias_weight,axis=0) 165 | self.train_op = train_op = tf.train.AdamOptimizer(constant.t_lr).minimize(loss) 166 | 167 | 168 | def train_trigger(self): 169 | train,dev,test = self.t_train,self.t_dev,self.t_test 170 | saver = tf.train.Saver() 171 | maxlen = self.maxlen 172 | print('--Training Trigger--') 173 | with tf.Session() as sess: 174 | sess.run(tf.global_variables_initializer()) 175 | devbest = 0 176 | testbest = (0,0,0) 177 | from tqdm import tqdm 178 | for epoch in tqdm(range(constant.t_epoch)): 179 | loss_list =[] 180 | for batch in get_batch(train,constant.t_batch_size,True): 181 | loss,_ = sess.run([self.loss,self.train_op],feed_dict=get_trigger_feeddict(self,batch,self.stage,maxlen)) 182 | loss_list.append(loss) 183 | print('epoch:{}'.format(str(epoch))) 184 | print('loss:',np.mean(loss_list)) 185 | 186 | pred_labels = [] 187 | for batch in get_batch(dev,constant.t_batch_size,False): 188 | pred_label = sess.run(self.pred_label,feed_dict=get_trigger_feeddict(self,batch,self.stage,maxlen,is_train=False)) 189 | pred_labels.extend(list(pred_label)) 190 | golds = list(dev[0][4]) 191 | dev_p,dev_r,dev_f = f_score(pred_labels,golds) 192 | print("dev_Precision: {} dev_Recall:{} dev_F1:{}".format(str(dev_p),str(dev_r),str(dev_f))) 193 | 194 | if dev_f>devbest: 195 | devbest = dev_f 196 | testbest = (dev_p, dev_r, dev_f) 197 | saver.save(sess,"saved_models/trigger.ckpt") 198 | test_p, test_r, test_f = testbest 199 | print("dev best Precision: {} dev best Recall:{} dev best F1:{}".format(str(test_p), str(test_r), str(test_f))) 200 | 201 | def eval_trigger(self): 202 | test = self.t_test 203 | saver = tf.train.Saver() 204 | maxlen = self.maxlen 205 | from collections import defaultdict 206 | import json 207 | results = defaultdict(list) 208 | print('--Eval Trigger--') 209 | with tf.Session() as sess: 210 | saver.restore(sess,"saved_models/trigger.ckpt") 211 | pred_labels = [] 212 | for batch in get_batch(test,constant.t_batch_size,False): 213 | pred_label = 
sess.run(self.pred_label,feed_dict=get_trigger_feeddict(self,batch,self.stage,maxlen,is_train=False)) 214 | pred_labels.extend(list(pred_label)) 215 | with open('{}/id_align.json'.format(constant.maven_path),'r') as f: 216 | ids = json.load(f) 217 | with open('test_idxs.json','r') as f: 218 | test_idxs = json.load(f) 219 | test_idxs = {test_idx:idx for idx,test_idx in enumerate(test_idxs)} 220 | assert len(test_idxs)==len(pred_labels) 221 | for idx in range(len(ids)): 222 | id_ = ids[idx] 223 | if idx in test_idxs: 224 | label = pred_labels[test_idxs[idx]] 225 | else: 226 | label = 0 227 | results[id_[0]].append({'id':id_[1],'type_id':int(label)}) 228 | with open('results.jsonl','w') as f: 229 | for key,val in results.items(): 230 | f.write(json.dumps({'id':key,'predictions':val})+'\n') 231 | print("--Eval Finish--") 232 | 233 | -------------------------------------------------------------------------------- /baselines/MOGANED/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import utils 3 | from models import Trigger_Model 4 | import os 5 | from constant import * 6 | 7 | flags = tf.flags 8 | flags.DEFINE_string("gpu", "1", "The GPU to run on") 9 | flags.DEFINE_string("mode", "MOGANED", "DMCNN or MOGANED") 10 | flags.DEFINE_bool('eval', False, "Eval or Train") 11 | 12 | def main(_): 13 | config = flags.FLAGS 14 | os.environ['CUDA_VISIBLE_DEVICES'] = config.gpu 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 16 | extractor = utils.Extractor() 17 | extractor.extract() 18 | loader = utils.Loader(cut_len) 19 | t_data = loader.load_trigger() 20 | trigger = Trigger_Model(t_data,loader.maxlen,loader.wordemb,config.mode) 21 | if not config.eval: 22 | trigger.train_trigger() 23 | else: 24 | trigger.eval_trigger() 25 | 26 | if __name__=="__main__": 27 | tf.app.run() -------------------------------------------------------------------------------- /baselines/MOGANED/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import constant 3 | from xml.dom.minidom import parse 4 | from tqdm import tqdm 5 | import re 6 | import random 7 | import json 8 | import numpy as np 9 | import copy 10 | from stanfordcorenlp import StanfordCoreNLP 11 | 12 | class Extractor(): 13 | def __init__(self): 14 | pass 15 | 16 | def preprocess(self): 17 | splits = {'train':'train','valid':'dev','test':'test'} 18 | path = constant.maven_path 19 | nlp = StanfordCoreNLP(constant.corenlp_path) 20 | mention_ids = [] 21 | for split in tqdm(splits): 22 | split_data = [] 23 | with open(path+'/'+split+'.jsonl','r') as f: 24 | line = f.readline().rstrip() 25 | while line: 26 | doc = json.loads(line) 27 | content = doc['content'] 28 | for sent_tuple in content: 29 | origin_sent,origin_tokens = sent_tuple['sentence'],sent_tuple['tokens'] 30 | 31 | parse_sent = ' '.join(sent_tuple['tokens']) 32 | nlp_words,nlp_span = nlp.word_tokenize(parse_sent,True) 33 | nlp_span_dict = {e[0]:i for i,e in enumerate(nlp_span)} 34 | origin_span = {i:len(' '.join(origin_tokens[:i]))+1 for i in range(1,len(origin_tokens))} 35 | origin_span[0] = 0 36 | sent_tuple['origin_span'] = origin_span 37 | sent_tuple['nlp_span_dict'] = nlp_span_dict 38 | sent_tuple['nlp_words'] = nlp_words 39 | 40 | dependency_parsing =nlp.dependency_parse(parse_sent) 41 | pos_tags = [e[1] for e in nlp.pos_tag(parse_sent)] 42 | ner_tags = [e[1] for e in nlp.ner(parse_sent)] 43 | sent_tuple['ner'] = ner_tags 44 | sent_tuple['pos'] = pos_tags 45 | 
sent_tuple['dependency'] = dependency_parsing 46 | if split!='test': 47 | for event in doc['events']: 48 | event_type = event['type_id'] 49 | assert isinstance(event_type,int) 50 | assert event_type<169 51 | # if event_type==207: 52 | # continue 53 | for mention in event['mention']: 54 | trigger = mention['trigger_word'].lower() 55 | offset = mention['offset'] 56 | tokens = content[mention['sent_id']]['tokens'] 57 | 58 | origin_span = content[mention['sent_id']]['origin_span'] 59 | nlp_span_dict = content[mention['sent_id']]['nlp_span_dict'] 60 | nlp_words = content[mention['sent_id']]['nlp_words'] 61 | if origin_span[offset[0]] not in nlp_span_dict: 62 | real_offset = offset[0] 63 | else: 64 | real_offset = nlp_span_dict[origin_span[offset[0]]] 65 | 66 | mention_ids.append((mention['id'],event_type)) 67 | 68 | info = {'tokens':nlp_words, 69 | 'trigger_tokens':[nlp_words[real_offset]], 70 | 'ner_tags':content[mention['sent_id']]['ner'], 71 | 'pos_tags':content[mention['sent_id']]['pos'], 72 | 'dependency_parsing':content[mention['sent_id']]['dependency'], 73 | 'trigger_start':real_offset, 74 | 'trigger_end':real_offset, 75 | 'event_type':event_type} 76 | split_data.append(info) 77 | negative_triggers = 'negative_triggers' 78 | if split=='test': 79 | negative_triggers = 'candidates' 80 | for mention in doc[negative_triggers]: 81 | trigger = mention['trigger_word'].lower() 82 | offset = mention['offset'] 83 | tokens = content[mention['sent_id']]['tokens'] 84 | mention_ids.append((mention['id'],0)) 85 | origin_span = content[mention['sent_id']]['origin_span'] 86 | nlp_span_dict = content[mention['sent_id']]['nlp_span_dict'] 87 | nlp_words = content[mention['sent_id']]['nlp_words'] 88 | if origin_span[offset[0]] not in nlp_span_dict: 89 | real_offset = offset[0] 90 | else: 91 | real_offset = nlp_span_dict[origin_span[offset[0]]] 92 | 93 | info = {'tokens':nlp_words, 94 | 'trigger_tokens':[nlp_words[real_offset]], 95 | 'ner_tags':content[mention['sent_id']]['ner'], 96 | 'pos_tags':content[mention['sent_id']]['pos'], 97 | 'dependency_parsing':content[mention['sent_id']]['dependency'], 98 | 'trigger_start':real_offset, 99 | 'trigger_end':real_offset, 100 | 'event_type':0} 101 | split_data.append(info) 102 | 103 | line = f.readline().rstrip() 104 | with open(path+'/'+splits[split]+'.json','w') as f: 105 | json.dump(split_data,f) 106 | 107 | nlp.close() 108 | 109 | def id_align(self): 110 | ids = [] 111 | 112 | with open('{}/test.jsonl'.format(constant.maven_path),'r') as f: 113 | line = f.readline().rstrip() 114 | while line: 115 | doc = json.loads(line) 116 | doc_id = doc['id'] 117 | for mention in doc['candidates']: 118 | trigger_id = mention['id'] 119 | ids.append((doc_id,trigger_id)) 120 | line = f.readline().rstrip() 121 | 122 | with open('{}/id_align.json'.format(constant.maven_path),'w') as f: 123 | json.dump(ids,f) 124 | 125 | def extract(self): 126 | if not os.path.exists(constant.maven_path+'/train.json'): 127 | print('----Preprocessing----') 128 | self.preprocess() 129 | else: 130 | print("--Preprocessed files exist--") 131 | if not os.path.exists(constant.maven_path+'/id_align.json'): 132 | print('----Id Aligning----') 133 | self.id_align() 134 | 135 | class Loader(): 136 | def __init__(self,cut_len): 137 | self.train_path = constant.maven_path+'/train.json' 138 | self.dev_path = constant.maven_path+'/dev.json' 139 | self.test_path = constant.maven_path+'/test.json' 140 | self.glove_path = constant.GloVe_file 141 | self.cut_len = cut_len 142 | 143 | def load_embedding(self): 144 | 
word2idx = {}
145 |         wordemb = []
146 |         with open(self.glove_path,'r',encoding='utf-8') as f:
147 |             for line in f:
148 |                 splt = line.split()
149 |                 assert len(splt)==constant.embedding_dim+1
150 |                 vector = list(map(float, splt[-constant.embedding_dim:]))
151 |                 word = splt[0]
152 |                 word2idx[word] = len(word2idx)+2 # 0 is reserved for padding, 1 for unknown words
153 |                 wordemb.append(vector)
154 |         return word2idx,np.asarray(wordemb,np.float32)
155 | 
156 |     def get_maxlen(self):
157 |         if self.cut_len is not None:
158 |             self.maxlen = self.cut_len
159 |             return self.maxlen
160 |         paths = [self.train_path,self.dev_path,self.test_path]
161 |         maxlens = []
162 |         for path in paths:
163 |             with open(path,'r') as f:
164 |                 data = json.load(f)
165 |                 _maxlen = max([len(d['tokens']) for d in data])
166 |                 maxlens.append(_maxlen)
167 |         self.maxlen = max(maxlens)
168 |         return self.maxlen
169 | 
170 |     def get_max_argument_len(self): # only meaningful for data with entity annotations; unused in this MAVEN pipeline
171 |         paths = [self.train_path,self.dev_path,self.test_path]
172 |         maxlens = []
173 |         for path in paths:
174 |             with open(path,'r') as f:
175 |                 data = json.load(f)
176 |                 for instance in data:
177 |                     if len(instance['entities'])==0:
178 |                         continue
179 |                     _maxlen = max([entity['idx_end']+1-entity['idx_start'] for entity in instance['entities']])
180 |                     maxlens.append(_maxlen)
181 |         self.max_argument_len = max(maxlens)
182 |         return self.max_argument_len
183 | 
184 |     def get_positions(self,start_idx,sent_len,maxlen):
185 |         return list(range(maxlen-start_idx, maxlen)) + [maxlen] + \
186 |             list(range(maxlen+1, maxlen+sent_len - start_idx))+[0]*(maxlen-sent_len)
187 | 
188 |     def get_word(self,tokens,word2idx,pad_length):
189 |         idx = []
190 |         for word in tokens:
191 |             if word.lower() in word2idx:
192 |                 idx.append(word2idx[word.lower()])
193 |             else:
194 |                 idx.append(1)
195 |         idx += [0]*(pad_length-len(idx))
196 |         return idx
197 | 
198 |     def get_trigger_mask(self,posi,sent_len,maxlen,direction):
199 |         assert direction in ['left','right']
200 |         mask = [0.]*maxlen
201 |         if direction=='left':
202 |             mask[:posi] = [1.]*posi
203 |         else:
204 |             mask[posi:sent_len] = [1.]*(sent_len-posi)
205 |         return mask
206 | 
207 |     def load_one_trigger(self,path,maxlen,word2idx):
208 |         trigger_posis,sents,trigger_maskls,trigger_maskrs,event_types,trigger_lexical= [], [], [], [], [], []
209 |         with open(path,'r') as f:
210 |             data = json.load(f)
211 | 
212 |         indices_s,pos,ner = [],[],[]
213 |         trigger_idxs = []
214 | 
215 |         test_idxs = []
216 | 
217 | 
218 |         for test_idx,instance in enumerate(data):
219 |             tokens = instance['tokens'][:maxlen]
220 |             event_type = instance['event_type']
221 |             trigger_posi = instance['trigger_start']
222 |             if trigger_posi>maxlen-1:
223 |                 continue
224 |             ner_tags = [constant.NER_TO_ID[e] if e in constant.NER_TO_ID else 1 for e in instance['ner_tags']][:maxlen]+[0]*(maxlen-len(instance['ner_tags']))
225 |             pos_tags = [constant.POS_TO_ID[e] if e in constant.POS_TO_ID else 1 for e in instance['pos_tags']][:maxlen]+[0]*(maxlen-len(instance['pos_tags']))
226 |             ner.append(ner_tags)
227 |             pos.append(pos_tags)
228 | 
229 |             words = self.get_word(tokens,word2idx,maxlen)
230 |             dependency_parsing = instance['dependency_parsing']
231 | 
232 |             start_word = 0 # CoreNLP may split the tokens into several sentences; each ROOT edge starts a new one, so its 1-based indices are offset by start_word
233 |             current_max = 0
234 |             indices = []
235 |             for edge in dependency_parsing:
236 |                 if edge[0]=="ROOT":
237 |                     start_word = max(start_word,current_max)
238 |                 else:
239 |                     if edge[1]-1+start_word>maxlen-1 or edge[2]-1+start_word>maxlen-1:
240 |                         continue
241 |                     indices.append([edge[1]-1+start_word,edge[2]-1+start_word])
242 |                     current_max = max([current_max,edge[1]+start_word,edge[2]+start_word])
243 |             indices_s.append(indices)
244 | 
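# Feature construction for each candidate (continues below): get_positions
# encodes every token's offset relative to the trigger as an index in
# [1, 2*maxlen), with 0 reserved for padding; maskls/maskrs split the
# sentence at the trigger for DMCNN-style dynamic multi-pooling; and the
# lexical feature is the word ids of the trigger and its two neighbours.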
245 |             trigger_posis.append(self.get_positions(trigger_posi,len(tokens),maxlen))
246 |             trigger_idxs.append(trigger_posi)
247 |             sents.append(words)
248 |             trigger_maskls.append(self.get_trigger_mask(trigger_posi,len(tokens),maxlen,'left'))
249 |             trigger_maskrs.append(self.get_trigger_mask(trigger_posi, len(tokens),maxlen, 'right'))
250 |             event_types.append(constant.EVENT_TYPE_TO_ID[event_type])
251 | 
252 |             _trigger_lexical = []
253 |             if trigger_posi==0:
254 |                 _trigger_lexical.append(0)
255 |             else:
256 |                 _trigger_lexical.append(words[trigger_posi-1])
257 | 
258 |             _trigger_lexical.append(words[trigger_posi])
259 | 
260 |             if trigger_posi==len(tokens)-1:
261 |                 _trigger_lexical.append(0)
262 |             else:
263 |                 _trigger_lexical.append(words[trigger_posi+1])
264 | 
265 |             trigger_lexical.append(_trigger_lexical)
266 |             test_idxs.append(test_idx)
267 |         if path.endswith('test.json'):
268 |             with open('test_idxs.json','w') as f:
269 |                 json.dump(test_idxs,f)
270 |         return (np.array(trigger_posis,np.int32),np.array(sents,np.int32),np.array(trigger_maskls,np.int32),\
271 |             np.array(trigger_maskrs,np.int32),np.array(event_types,np.int32),np.array(trigger_lexical,np.int32),\
272 |             np.array(pos,np.int32),np.array(ner,np.int32),np.array(trigger_idxs,np.int32)),indices_s
273 | 
274 |     def load_trigger(self):
275 |         print('--Loading Trigger--')
276 |         word2idx,self.wordemb = self.load_embedding()
277 |         maxlen = self.get_maxlen()
278 |         paths = [self.train_path, self.dev_path, self.test_path]
279 |         results = []
280 |         for path in paths:
281 |             result = self.load_one_trigger(path,maxlen,word2idx)
282 |             results.append(result)
283 |         return results
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import os
4 | import os.path
5 | import json
6 | import numpy as np
7 | from sklearn.metrics import f1_score,precision_score,recall_score
8 | 
9 | input_dir = sys.argv[1] # CodaLab layout: <input_dir>/res/results.jsonl (submission), <input_dir>/ref/test_gold.jsonl (gold)
10 | output_dir = sys.argv[2]
11 | 
12 | submit_dir = os.path.join(input_dir, 'res')
13 | truth_dir = os.path.join(input_dir, 'ref')
14 | 
15 | if not os.path.isdir(submit_dir):
16 |     print("%s doesn't exist" % submit_dir)
17 | 
18 | if os.path.isdir(submit_dir) and os.path.isdir(truth_dir):
19 |     if not os.path.exists(output_dir):
20 |         os.makedirs(output_dir)
21 | 
22 |     output_filename = os.path.join(output_dir, 'scores.txt')
23 |     output_file = open(output_filename, 'w') # text mode: the scores below are written as strings
24 | 
25 |     truth_file = os.path.join(truth_dir, "test_gold.jsonl")
26 |     truth = open(truth_file, "r")
27 | 
28 |     submission_answer_file = os.path.join(submit_dir, "results.jsonl")
29 |     submission_answer = open(submission_answer_file, "r")
30 |     preds_map=dict()
31 |     ans_lines = submission_answer.readlines()
32 |     for line in ans_lines:
33 |         data=json.loads(line)
34 |         tmp=dict()
35 |         for mention in data['predictions']:
36 |             tmp[mention['id']]=mention['type_id']
37 |         preds_map[data['id']]=tmp
38 | 
39 |     ref_lines=truth.readlines()
40 |     labels=[]
41 |     preds=[]
42 |     for line in ref_lines:
43 |         data=json.loads(line)
44 |         pred_tmp=preds_map[data['id']] if data['id'] in preds_map else dict()
45 |         if not pred_tmp: # debug: the submission is missing this document
46 |             print("missing document",data['id'])
47 |         for event in data['events']:
48 |             for mention in event['mention']:
49 |                 if mention['id'] in pred_tmp:
50 |                     preds.append(pred_tmp[mention['id']])
51 |                 else:
52 |                     preds.append(0)
53 |                     print("missing mention",mention['id'])
54 |                 labels.append(event['type_id'])
55 |         for mention in data['negative_triggers']:
56 |             if mention['id'] in pred_tmp:
57 |                 preds.append(pred_tmp[mention['id']])
58 |             else:
59 |                 preds.append(0)
60 |                 print("missing mention",mention['id'])
61 |             labels.append(0)
62 |     assert len(labels)==len(preds)
63 | 
64 |     #calculate scores
65 |     pos_labels=list(range(1,169)) # event types 1-168; 0 (None) is excluded from scoring
66 |     labels=np.array(labels)
67 |     preds=np.array(preds)
68 |     micro_p=precision_score(labels,preds,labels=pos_labels,average='micro')*100.0
69 |     micro_r=recall_score(labels,preds,labels=pos_labels,average='micro')*100.0
70 |     micro_f1=f1_score(labels,preds,labels=pos_labels,average='micro')*100.0
71 | 
72 |     macro_p=precision_score(labels,preds,labels=pos_labels,average='macro')*100.0
73 |     macro_r=recall_score(labels,preds,labels=pos_labels,average='macro')*100.0
74 |     macro_f1=f1_score(labels,preds,labels=pos_labels,average='macro')*100.0
75 | 
76 |     print("Micro_F1:",micro_f1)
77 |     print("Micro_Precision:",micro_p)
78 |     print("Micro_Recall:",micro_r)
79 |     print("Macro_F1:",macro_f1)
80 |     print("Macro_Precision:",macro_p)
81 |     print("Macro_Recall:",macro_r)
82 | 
83 |     output_file.write("Micro_F1: %f\n" % micro_f1)
84 |     output_file.write("Micro_Precision: %f\n" % micro_p)
85 |     output_file.write("Micro_Recall: %f\n" % micro_r)
86 |     output_file.write("Macro_F1: %f\n" % macro_f1)
87 |     output_file.write("Macro_Precision: %f\n" % macro_p)
88 |     output_file.write("Macro_Recall: %f\n" % macro_r)
89 | 
90 |     output_file.close()
91 | 
--------------------------------------------------------------------------------