├── README.md
├── scripts
│   └── eval.py
└── src
    ├── __init__.py
    ├── main.py
    ├── nn
    │   ├── __init__.py
    │   ├── activations.py
    │   ├── initializers.py
    │   ├── layers
    │   │   ├── __init__.py
    │   │   ├── core.py
    │   │   ├── embeddings.py
    │   │   ├── recurrent.py
    │   │   ├── seqlabel.py
    │   │   └── stack.py
    │   ├── losses.py
    │   ├── metrics.py
    │   ├── optimizers.py
    │   ├── regularizers.py
    │   └── utils.py
    ├── srl
    │   ├── __init__.py
    │   ├── decoders.py
    │   ├── model_api.py
    │   ├── models.py
    │   ├── preprocessors.py
    │   ├── testers.py
    │   └── trainers.py
    └── utils
        ├── __init__.py
        ├── evaluators.py
        ├── loaders.py
        ├── misc.py
        ├── savers.py
        ├── sent.py
        └── vocab.py
/README.md: -------------------------------------------------------------------------------- 1 | # A Span Selection Model for Semantic Role Labeling 2 | 3 | ## Citation 4 | * A Span Selection Model for Semantic Role Labeling 5 | * Hiroki Ouchi (RIKEN AIP/Tohoku Univ.), Hiroyuki Shindo (NAIST) and Yuji Matsumoto (NAIST) 6 | * In EMNLP 2018 7 | * Conference paper: http://aclweb.org/anthology/D18-1191 8 | * arXiv version: https://arxiv.org/abs/1810.02245 9 | ``` 10 | @InProceedings{D18-1191, 11 | author = "Ouchi, Hiroki 12 | and Shindo, Hiroyuki 13 | and Matsumoto, Yuji", 14 | title = "A Span Selection Model for Semantic Role Labeling", 15 | booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", 16 | year = "2018", 17 | publisher = "Association for Computational Linguistics", 18 | pages = "1630--1642", 19 | location = "Brussels, Belgium", 20 | url = "http://aclweb.org/anthology/D18-1191" 21 | } 22 | ``` 23 | 24 | 25 | ## Prerequisites 26 | * [python3](https://www.python.org/downloads/) 27 | * [Theano](http://deeplearning.net/software/theano/) 28 | * [h5py](https://www.h5py.org/) 29 | 30 | ## Installation 31 | ``` 32 | conda create -n theano-py3 python=3.6 33 | source activate theano-py3 34 | conda install -c conda-forge theano 35 | conda install -c anaconda h5py 36 | ``` 37 | 38 | ## Data 39 | ### CoNLL-2005 40 | * [Treebank-2](https://catalog.ldc.upenn.edu/LDC95T7) 41 | ### CoNLL-2012 42 | * [OntoNotes Release 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) 43 | * We create the dataset by following the process described at http://cemantix.org/data/ontonotes.html 44 | ### Word Representations 45 | - [SENNA](https://ronan.collobert.com/senna/download.html) 46 | - Download the software and create the word-embedding pair file as follows. 47 | - `paste hash/words.lst embeddings/embeddings.txt > senna.emb.txt` 48 | 49 | * [ELMo](https://github.com/allenai/allennlp/tree/v0.6.1) 50 | 51 | ### Data Format 52 | #### CoNLL-2005 Training & Development Sets 53 | ``` 54 | 0:WORD 1:POS 2:PARSE 3:NE 4:FRAME 5:LEMMA 6-:ARGS 55 | Ms. NNP (S1(S(NP* * - - (A0* 56 | Haag NNP *) (LOC*) - - *) 57 | plays VBZ (VP* * 02 play (V*) 58 | Elianti NNP (NP*)) * - - (A1*) 59 | . . *)) * - - * 60 | ``` 61 | 62 | #### CoNLL-2005 Test Set (Not including FRAME ID) 63 | ``` 64 | 0:WORD 1:POS 2:PARSE 3:NE 4:LEMMA 5-:ARGS 65 | The DT (S1(S(NP* * (A1* 66 | finger-pointing JJ *) * - *) 67 | has AUX (VP* * - * 68 | already RB (ADVP*) * - (AM-TMP*) 69 | begun VBN (VP*)) * begin (V*) 70 | . . *)) * - *
71 | ``` 72 | 73 | #### CoNLL-2012 Training/Development/Test Sets 74 | ``` 75 | 0:DOCUMENT 1:PART 2:INDEX 3:WORD 4:POS 5:PARSE 6:LEMMA 7:FRAME 8:SENSE 9:SPEAKER 10:NE 11-N:ARGS N:COREF 76 | bc/cctv/00/cctv_0001 0 0 This DT (TOP(S(NP* - - - Speaker#1 * (ARG2* (61 77 | bc/cctv/00/cctv_0001 0 1 map NN *) - - - Speaker#1 * *) 61) 78 | bc/cctv/00/cctv_0001 0 2 reflected VBD (VP* reflect 01 1 Speaker#1 * (V*) - 79 | bc/cctv/00/cctv_0001 0 3 the DT (NP* - - - Speaker#1 * (ARG1* - 80 | bc/cctv/00/cctv_0001 0 4 European JJ * - - - Speaker#1 (NORP) * - 81 | bc/cctv/00/cctv_0001 0 5 battlefield NN * - - - Speaker#1 * * - 82 | bc/cctv/00/cctv_0001 0 6 situation NN *)) - - - Speaker#1 * *) - 83 | bc/cctv/00/cctv_0001 0 7 . . *)) - - - Speaker#1 * * - 84 | ``` 85 | 86 | 87 | ## Usage 88 | ### Training: span selection model 89 | SENNA: `python src/main.py --method span --mode train --train_data path/to/conll2005.train.txt --dev_data path/to/conll2005.dev.txt --data_type conll05 --drop_rate 0.1 --reg 0.0001 --hidden_dim 300 --n_layers 4 --halve_lr --word_emb path/to/senna --save --output_dir output` 90 | 91 | ELMo: `python src/main.py --method span --mode train --train_data path/to/conll2005.train.txt --dev_data path/to/conll2005.dev.txt --data_type conll05 --drop_rate 0.1 --reg 0.0001 --hidden_dim 300 --n_layers 4 --halve_lr --train_elmo_emb path/to/elmo.conll2005.train.hdf5 --dev_elmo_emb path/to/elmo.conll2005.dev.hdf5 --save --output_dir output` 92 | 93 | ### Training: CRF model 94 | SENNA: `python src/main.py --method crf --mode train --train_data path/to/conll2005.train.txt --dev_data path/to/conll2005.dev.txt --data_type conll05 --drop_rate 0.1 --reg 0.0001 --hidden_dim 300 --n_layers 4 --halve_lr --word_emb path/to/senna --save --output_dir output` 95 | 96 | ELMo: `python src/main.py --method crf --mode train --train_data path/to/conll2005.train.txt --dev_data path/to/conll2005.dev.txt --data_type conll05 --drop_rate 0.1 --reg 0.0001 --hidden_dim 300 --n_layers 4 --halve_lr --train_elmo_emb path/to/elmo.conll2005.train.hdf5 --dev_elmo_emb path/to/elmo.conll2005.dev.hdf5 --save --output_dir output` 97 | 98 | ### Predicting: span selection model 99 | SENNA: `python src/main.py --method span --mode test --test_data path/to/conll2005.test.txt --data_type conll05 --drop_rate 0.1 --hidden_dim 300 --n_layers 4 --output_dir output --output_fn conll2005.test --word_emb path/to/senna --load_label output/label_ids.txt --load_param output/param.epoch-0.pkl.gz` 100 | 101 | ELMo: `python src/main.py --method span --mode test --test_data path/to/conll2005.test.txt --data_type conll05 --drop_rate 0.1 --hidden_dim 300 --n_layers 4 --output_dir output --output_fn conll2005.test --test_elmo_emb path/to/elmo.conll2005.test.hdf5 --load_label output/label_ids.txt --load_param output/param.epoch-0.pkl.gz` 102 | 103 | ### Predicting: CRF model 104 | SENNA: `python src/main.py --method crf --mode test --test_data path/to/conll2005.test.txt --data_type conll05 --drop_rate 0.1 --hidden_dim 300 --n_layers 4 --output_dir output --output_fn conll2005.test --word_emb path/to/senna --load_label output/label_ids.txt --load_param output/param.epoch-0.pkl.gz` 105 | 106 | ELMo: `python src/main.py --method crf --mode test --test_data path/to/conll2005.test.txt --data_type conll05 --drop_rate 0.1 --hidden_dim 300 --n_layers 4 --output_dir output --output_fn conll2005.test --test_elmo_emb path/to/elmo.conll2005.test.hdf5 --load_label output/label_ids.txt --load_param output/param.epoch-0.pkl.gz` 107 | 108 | 109 | ## LICENSE 110 | MIT 
License 111 | -------------------------------------------------------------------------------- /scripts/eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | 5 | 6 | def load(path, data_size=100000000): 7 | corpus = [] 8 | sent = [] 9 | with open(path) as f: 10 | for line in f: 11 | elem = [l for l in line.rstrip().split()] 12 | if len(elem) > 0: 13 | sent.append(elem) 14 | else: 15 | corpus.append(sent) 16 | sent = [] 17 | if len(corpus) >= data_size: 18 | break 19 | return corpus 20 | 21 | 22 | def f_score(crr_total, p_total, r_total): 23 | precision = crr_total / p_total if p_total > 0 else 0. 24 | recall = crr_total / r_total if r_total > 0 else 0. 25 | f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0. 26 | return precision, recall, f1 27 | 28 | 29 | def accuracy(crr_total, total): 30 | return crr_total / total if total > 0 else 0. 31 | 32 | 33 | def srl_metrics(y_true, y_pred): 34 | """ 35 | :param y_true: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 36 | :param y_pred: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 37 | """ 38 | p_total = 0. 39 | r_total = 0. 40 | crr_total = 0. 41 | 42 | assert len(y_true) == len(y_pred) 43 | for y_true_i, y_pred_i in zip(y_true, y_pred): 44 | assert len(y_true_i) == len(y_pred_i) 45 | for y_true_j, y_pred_j in zip(y_true_i[1:], y_pred_i[1:]): 46 | assert len(y_true_j) == len(y_pred_j) 47 | y_true_spans = get_labeled_spans(y_true_j) 48 | y_pred_spans = get_labeled_spans(y_pred_j) 49 | p_total += len(y_pred_spans) 50 | r_total += len(y_true_spans) 51 | for y_pred_span in y_pred_spans: 52 | if y_pred_span in y_true_spans: 53 | crr_total += 1. 54 | return crr_total, p_total, r_total 55 | 56 | 57 | def span_metrics(y_true, y_pred): 58 | """ 59 | :param y_true: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 60 | :param y_pred: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 61 | """ 62 | p_total = 0. 63 | r_total = 0. 64 | crr_total = 0. 65 | 66 | assert len(y_true) == len(y_pred) 67 | for y_true_i, y_pred_i in zip(y_true, y_pred): 68 | assert len(y_true_i) == len(y_pred_i) 69 | for y_true_j, y_pred_j in zip(y_true_i[1:], y_pred_i[1:]): 70 | assert len(y_true_j) == len(y_pred_j) 71 | y_true_spans = get_labeled_spans(y_true_j) 72 | y_pred_spans = get_labeled_spans(y_pred_j) 73 | p_total += len(y_pred_spans) 74 | r_total += len(y_true_spans) 75 | 76 | y_true_boundary = [span[1:] for span in y_true_spans] 77 | for y_pred_span in y_pred_spans: 78 | if y_pred_span[1:] in y_true_boundary: 79 | crr_total += 1. 80 | return crr_total, p_total, r_total 81 | 82 | 83 | def label_metrics(y_true, y_pred): 84 | """ 85 | :param y_true: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 86 | :param y_pred: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 87 | """ 88 | total = 0. 89 | crr_total = 0. 90 | 91 | assert len(y_true) == len(y_pred) 92 | for y_true_i, y_pred_i in zip(y_true, y_pred): 93 | assert len(y_true_i) == len(y_pred_i) 94 | for y_true_j, y_pred_j in zip(y_true_i[1:], y_pred_i[1:]): 95 | assert len(y_true_j) == len(y_pred_j) 96 | y_true_spans = get_labeled_spans(y_true_j) 97 | y_pred_spans = get_labeled_spans(y_pred_j) 98 | 99 | y_true_boundary = [span[1:] for span in y_true_spans] 100 | for y_pred_span in y_pred_spans: 101 | if y_pred_span[1:] in y_true_boundary: 102 | total += 1. 
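# Only spans whose boundary matches some gold span enter the accuracy
# denominator (total); the lookup below then retrieves the gold span with
# the same boundary and credits crr_total when the label also agrees.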
103 | index = y_true_boundary.index(y_pred_span[1:]) 104 | y_true_span = y_true_spans[index] 105 | if y_pred_span[0] == y_true_span[0]: 106 | crr_total += 1. 107 | return crr_total, total 108 | 109 | 110 | def srl_metrics_per_distance(y_true, y_pred): 111 | """ 112 | :param y_true: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 113 | :param y_pred: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 114 | """ 115 | def _dist(i_, j_, prd_index_): 116 | if j_ < prd_index_: 117 | return prd_index_ - j_ - 1 118 | return i_ - prd_index_ - 1 119 | 120 | def _dist_bin(dist): 121 | if dist == 0: 122 | return 0 123 | elif 0 < dist < 3: 124 | return 1 125 | elif 3 <= dist < 7: 126 | return 2 127 | return 3 128 | 129 | dist_dict = np.zeros(shape=(4, 3), dtype="float32")  # rows: distance bins; columns: [correct, predicted, gold] 130 | 131 | assert len(y_true) == len(y_pred) 132 | for y_true_i, y_pred_i in zip(y_true, y_pred): 133 | assert len(y_true_i) == len(y_pred_i) 134 | 135 | prds = y_true_i[0] 136 | prd_indices = [i for i, y in enumerate(prds) if y != "-"] 137 | 138 | for y_true_j, y_pred_j, prd_index in zip(y_true_i[1:], y_pred_i[1:], prd_indices): 139 | assert len(y_true_j) == len(y_pred_j) 140 | y_true_spans = get_labeled_spans(y_true_j) 141 | y_pred_spans = get_labeled_spans(y_pred_j) 142 | 143 | for span in y_true_spans: 144 | # Skip discontinuous spans (concatenated C- spans have more than 3 elements) 145 | if len(span) > 3: 146 | continue 147 | (label, i, j) = span 148 | dist = _dist(i, j, prd_index) 149 | binned_dist = _dist_bin(dist) 150 | dist_dict[binned_dist][2] += 1 151 | 152 | for span in y_pred_spans: 153 | if len(span) > 3: 154 | continue 155 | (label, i, j) = span 156 | dist = _dist(i, j, prd_index) 157 | binned_dist = _dist_bin(dist) 158 | dist_dict[binned_dist][1] += 1 159 | 160 | for y_pred_span in y_pred_spans: 161 | if y_pred_span in y_true_spans: 162 | if len(y_pred_span) > 3: 163 | continue 164 | label, i, j = y_pred_span 165 | dist = _dist(i, j, prd_index) 166 | binned_dist = _dist_bin(dist) 167 | dist_dict[binned_dist][0] += 1 168 | 169 | return dist_dict 170 | 171 | 172 | def get_labeled_spans(prop): 173 | """ 174 | :param prop: 1D: n_words; elem=bracket label 175 | :return: 1D: n_spans; elem=[label, i, j] (concatenated C- spans carry extra (i, j) pairs) 176 | """ 177 | def _concat_c_spans(_spans): 178 | labels = [_span[0] for _span in _spans] 179 | c_indices = [i for i, _span in enumerate(_spans) if _span[0].startswith('C')] 180 | non_ant_c_spans = [] 181 | 182 | for c_index in c_indices: 183 | c_span = _spans[c_index] 184 | _label = c_span[0][2:] 185 | if _label in labels: 186 | _spans[labels.index(_label)].extend(c_span[1:]) 187 | else: 188 | non_ant_c_spans.append([_label] + c_span[1:]) 189 | concatenated_spans = [span for i, span in enumerate(_spans) if i not in c_indices] 190 | _spans = concatenated_spans + non_ant_c_spans 191 | return _spans 192 | 193 | labeled_spans = [] 194 | labeled_span = [] 195 | for i, arg in enumerate(prop): 196 | if arg.startswith('('): 197 | if arg.endswith(')'): 198 | label = arg.split("*")[0][1:] 199 | labeled_span = [label, i, i] 200 | else: 201 | label = arg[1:-1] 202 | labeled_span = [label, i] 203 | elif arg.endswith(')'): 204 | labeled_span.append(i) 205 | 206 | if len(labeled_span) == 3 and labeled_span[0] != "V" and labeled_span[0] != "C-V": 207 | labeled_spans.append(labeled_span) 208 | labeled_span = [] 209 | 210 | labeled_spans = _concat_c_spans(labeled_spans) 211 | return labeled_spans 212 | 213 | 214 | def print_metrics(y_true, y_pred): 215 | """ 216 | :param y_true: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 217 | :param y_pred: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 218 |
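Prints overall SRL F1, span-boundary match F1, and label-match accuracy to stdout.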
""" 219 | crr_total, p_total, r_total = srl_metrics(y_true, y_pred) 220 | p, r, f = f_score(crr_total, p_total, r_total) 221 | sys.stdout.write('SRL RESULTS\n\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})\n'.format( 222 | f, p, int(crr_total), int(p_total), r, int(crr_total), int(r_total))) 223 | sys.stdout.flush() 224 | 225 | crr_total, p_total, r_total = span_metrics(y_true, y_pred) 226 | p, r, f = f_score(crr_total, p_total, r_total) 227 | sys.stdout.write('SPAN BOUNDARY MATCH\n\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})\n'.format( 228 | f, p, int(crr_total), int(p_total), r, int(crr_total), int(r_total))) 229 | sys.stdout.flush() 230 | 231 | crr_total, total = label_metrics(y_true, y_pred) 232 | acc = accuracy(crr_total, total) 233 | 234 | sys.stdout.write('LABEL MATCH\n\tACCURACY:{:>7.2%} ({:>5}/{:>5})\n'.format( 235 | acc, int(crr_total), int(total))) 236 | sys.stdout.flush() 237 | 238 | 239 | def print_metrics_per_dist(y_true, y_pred): 240 | metric_matrix = srl_metrics_per_distance(y_true, y_pred) 241 | sys.stdout.write('SRL RESULTS PER DISTANCE (C-LABEL removed)\n') 242 | for i, metric in enumerate(metric_matrix): 243 | crr_total, p_total, r_total = metric 244 | if i == 0: 245 | dist = '0' 246 | elif i == 1: 247 | dist = '1-2' 248 | elif i == 2: 249 | dist = '3-6' 250 | else: 251 | dist = '7-max' 252 | 253 | p, r, f = f_score(crr_total, p_total, r_total) 254 | sys.stdout.write('\t{}\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})\n'.format( 255 | dist, f, p, int(crr_total), int(p_total), r, int(crr_total), int(r_total))) 256 | sys.stdout.flush() 257 | 258 | 259 | def main(argv): 260 | sys.stdout.write("\nEVALUATION START\n") 261 | sys.stdout.flush() 262 | 263 | sents1 = load(argv[1]) 264 | sents2 = load(argv[2]) 265 | 266 | sents1 = [list(zip(*sent)) for sent in sents1] 267 | sents2 = [list(zip(*sent)) for sent in sents2] 268 | 269 | print_metrics(sents1, sents2) 270 | print_metrics_per_dist(sents1, sents2) 271 | 272 | 273 | if __name__ == '__main__': 274 | main(sys.argv) 275 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiroki13/span-based-srl/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/__init__.py -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | import numpy as np 5 | import theano 6 | 7 | sys.setrecursionlimit(100000000) 8 | theano.config.floatX = 'float32' 9 | 10 | if theano.config.device.startswith('cuda'): 11 | import locale 12 | 13 | locale.setlocale(locale.LC_CTYPE, 'C.UTF-8') 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser(description='SPAN SELECTION MODEL') 18 | 19 | parser.add_argument('--mode', default='train', help='train/test') 20 | parser.add_argument('--method', default='span', help='crf/span') 21 | parser.add_argument('--seed', type=int, default=0, help='seed') 22 | 23 | ################## 24 | # Input Datasets # 25 | ################## 26 | parser.add_argument('--train_data', help='path to train data') 27 | parser.add_argument('--dev_data', help='path to dev data') 28 | parser.add_argument('--test_data', help='path to test data') 29 | parser.add_argument('--data_type', default='conll05', help='conll05/conll12') 30 | parser.add_argument('--data_size', 
type=int, default=100000000, help='data size to be used') 31 | 32 | ################## 33 | # Output Options # 34 | ################## 35 | parser.add_argument('--save', action='store_true', default=False, help='parameters to be saved or not') 36 | parser.add_argument('--output_dir', type=str, default='output', help='output directory name') 37 | parser.add_argument('--output_fn', type=str, default=None, help='output file name') 38 | 39 | ########## 40 | # Search # 41 | ########## 42 | parser.add_argument('--search', type=str, default='greedy', help='argmax/greedy') 43 | 44 | ################### 45 | # NN Architecture # 46 | ################### 47 | parser.add_argument('--emb_dim', type=int, default=50, help='dimension of embeddings') 48 | parser.add_argument('--hidden_dim', type=int, default=32, help='dimension of hidden layer') 49 | parser.add_argument('--n_layers', type=int, default=1, help='number of layers') 50 | parser.add_argument('--n_experts', type=int, default=0, help='number of ensemble models') 51 | 52 | #################### 53 | # Training Options # 54 | #################### 55 | parser.add_argument('--epoch', type=int, default=100, help='number of epochs to train') 56 | parser.add_argument('--batch_size', type=int, default=32, help='mini-batch size') 57 | parser.add_argument('--word_emb', default=None, help='Initial embeddings to be loaded') 58 | parser.add_argument('--train_elmo_emb', default=None, help='ELMo embeddings to be loaded') 59 | parser.add_argument('--dev_elmo_emb', default=None, help='ELMo embeddings to be loaded') 60 | parser.add_argument('--test_elmo_emb', default=None, help='ELMo embeddings to be loaded') 61 | 62 | ######################## 63 | # Optimization Options # 64 | ######################## 65 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 66 | parser.add_argument('--halve_lr', action='store_true', default=False, help='halve learning rate') 67 | parser.add_argument('--opt_type', default='adam', help='sgd/adam') 68 | parser.add_argument('--grad_clip', action='store_true', default=False, help='gradient clipping') 69 | parser.add_argument('--reg', type=float, default=0.0001, help='L2 Reg rate') 70 | parser.add_argument('--drop_rate', type=float, default=0.0, help='Dropout Rate') 71 | 72 | ################### 73 | # Loading Options # 74 | ################### 75 | parser.add_argument('--load_param', default=None, help='path to params') 76 | parser.add_argument('--load_param_dir', default=None, help='path to param dir') 77 | parser.add_argument('--load_param_latest', action='store_true', default=False, help='load the latest params') 78 | parser.add_argument('--load_opt_param', default=None, help='path to params') 79 | parser.add_argument('--load_label', default=None, help='path to labels') 80 | 81 | return parser.parse_args() 82 | 83 | 84 | def main(): 85 | argv = parse_args() 86 | np.random.seed(argv.seed) 87 | 88 | if argv.data_type == "conll05": 89 | from utils.loaders import Conll05Loader 90 | loader = Conll05Loader(argv) 91 | else: 92 | from utils.loaders import Conll12Loader 93 | loader = Conll12Loader(argv) 94 | 95 | if argv.method == "span": 96 | from srl.preprocessors import SpanPreprocessor 97 | from utils.evaluators import SpanEvaluator 98 | from srl.model_api import SpanModelAPI 99 | 100 | if argv.mode == "train": 101 | from srl.trainers import Trainer 102 | 103 | Trainer(argv=argv, 104 | loader=loader, 105 | preprocessor=SpanPreprocessor(argv), 106 | evaluator=SpanEvaluator(argv), 107 | model_api=SpanModelAPI(argv) 
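# The same Trainer also drives the CRF model in the branch below; behavior
# differs only through the injected preprocessor, evaluator, and model API.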
108 | ).train() 109 | else: 110 | from srl.testers import Tester 111 | from utils.savers import SpanSaver 112 | 113 | Tester(argv=argv, 114 | loader=loader, 115 | saver=SpanSaver(argv), 116 | preprocessor=SpanPreprocessor(argv), 117 | evaluator=SpanEvaluator(argv), 118 | model_api=SpanModelAPI(argv) 119 | ).predict() 120 | else: 121 | from srl.preprocessors import BIOPreprocessor 122 | from utils.evaluators import BIOEvaluator 123 | from srl.model_api import BIOModelAPI 124 | 125 | if argv.mode == "train": 126 | from srl.trainers import Trainer 127 | 128 | trainer = Trainer(argv=argv, 129 | loader=loader, 130 | preprocessor=BIOPreprocessor(argv), 131 | evaluator=BIOEvaluator(argv), 132 | model_api=BIOModelAPI(argv) 133 | ) 134 | trainer.train() 135 | else: 136 | from srl.testers import Tester 137 | from utils.savers import BIOSaver 138 | 139 | Tester(argv=argv, 140 | loader=loader, 141 | saver=BIOSaver(argv), 142 | preprocessor=BIOPreprocessor(argv), 143 | evaluator=BIOEvaluator(argv), 144 | model_api=BIOModelAPI(argv) 145 | ).predict() 146 | 147 | 148 | if __name__ == '__main__': 149 | main() 150 | -------------------------------------------------------------------------------- /src/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiroki13/span-based-srl/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/nn/__init__.py -------------------------------------------------------------------------------- /src/nn/activations.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def softmax(x): 5 | if x.ndim == 3: 6 | x_shape = x.shape 7 | x = x.reshape((x_shape[0] * x_shape[1], x_shape[2])) 8 | return T.nnet.softmax(x).reshape(x_shape) 9 | elif x.ndim == 4: 10 | x_shape = x.shape 11 | x = x.reshape((x_shape[0] * x_shape[1] * x_shape[2], x_shape[3])) 12 | return T.nnet.softmax(x).reshape(x_shape) 13 | return T.nnet.softmax(x) 14 | 15 | 16 | def sigmoid(x): 17 | return T.nnet.sigmoid(x) 18 | 19 | 20 | def tanh(x): 21 | return T.tanh(x) 22 | 23 | 24 | def relu(x): 25 | return T.nnet.relu(x) 26 | -------------------------------------------------------------------------------- /src/nn/initializers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | 4 | 5 | class Initializer(object): 6 | def __call__(self, shape, shared=True, name=None): 7 | raise NotImplementedError 8 | 9 | 10 | class Zero(Initializer): 11 | def __call__(self, shape, shared=True, name=None): 12 | param = np.zeros(shape, theano.config.floatX) 13 | if shared: 14 | return theano.shared(value=param, name=name, borrow=True) 15 | return param 16 | 17 | 18 | class One(Initializer): 19 | def __call__(self, shape, shared=True, name=None): 20 | param = np.ones(shape, theano.config.floatX) 21 | if shared: 22 | return theano.shared(value=param, name=name, borrow=True) 23 | return param 24 | 25 | 26 | class Identity(Initializer): 27 | def __call__(self, shape, shared=True, name=None): 28 | assert len(shape) == 2 29 | param = np.ones(shape[0], theano.config.floatX) 30 | param = np.diag(param) 31 | if shared: 32 | return theano.shared(value=param, name=name, borrow=True) 33 | return param 34 | 35 | 36 | class Uniform(Initializer): 37 | def __call__(self, shape, shared=True, name=None): 38 | param = np.asarray(np.random.uniform(low=-0.01, 39 | high=0.01, 40 | size=shape), 41 | dtype=theano.config.floatX) 42 | if shared: 43 | 
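# Wrapping the array in a Theano shared variable lets the optimizer update it in place.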
return theano.shared(value=param, name=name, borrow=True) 44 | return param 45 | 46 | 47 | class Normal(Initializer): 48 | def __call__(self, shape, shared=True, name=None): 49 | param = np.asarray(np.random.normal(0.0, 0.01, shape), 50 | dtype=theano.config.floatX) 51 | if shared: 52 | return theano.shared(value=param, name=name, borrow=True) 53 | return param 54 | 55 | 56 | class Xavier(Initializer): 57 | def __call__(self, shape, shared=True, name=None): 58 | param = np.asarray(np.random.uniform(low=-np.sqrt(6.0 / np.sum(shape)), 59 | high=np.sqrt(6.0 / np.sum(shape)), 60 | size=shape), 61 | dtype=theano.config.floatX) 62 | if shared: 63 | return theano.shared(value=param, name=name, borrow=True) 64 | return param 65 | 66 | 67 | class Orthonormal(Initializer): 68 | """ 69 | This is based on the implementation of Luheng He; 70 | https://github.com/luheng/deep_srl 71 | """ 72 | def __call__(self, shape, shared=True, name=None): 73 | assert len(shape) == 2 74 | if shape[0] == shape[1]: 75 | M = np.random.randn(*shape).astype(theano.config.floatX) 76 | Q, R = np.linalg.qr(M) 77 | Q = Q * np.sign(np.diag(R)) 78 | param = Q * 1.0 79 | else: 80 | M1 = np.random.randn(shape[0], shape[0]).astype(theano.config.floatX) 81 | M2 = np.random.randn(shape[1], shape[1]).astype(theano.config.floatX) 82 | Q1, R1 = np.linalg.qr(M1) 83 | Q2, R2 = np.linalg.qr(M2) 84 | Q1 = Q1 * np.sign(np.diag(R1)) 85 | Q2 = Q2 * np.sign(np.diag(R2)) 86 | n_min = min(shape[0], shape[1]) 87 | param = np.dot(Q1[:, :n_min], Q2[:n_min, :]) * 1.0 88 | if shared: 89 | return theano.shared(value=param, name=name, borrow=True) 90 | return param 91 | -------------------------------------------------------------------------------- /src/nn/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiroki13/span-based-srl/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/nn/layers/__init__.py -------------------------------------------------------------------------------- /src/nn/layers/core.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | 4 | from nn.initializers import Zero, One, Identity, Uniform, Normal, Xavier, Orthonormal 5 | from nn.activations import sigmoid, tanh, relu, softmax 6 | 7 | 8 | class Unit(object): 9 | def __init__(self, name='unit'): 10 | self.name = name 11 | 12 | @staticmethod 13 | def _set_param(shape, init_type=None, name=None): 14 | if init_type == 'zero': 15 | init = Zero() 16 | elif init_type == 'one': 17 | init = One() 18 | elif init_type == 'xavier': 19 | init = Xavier() 20 | elif init_type == 'orth': 21 | init = Orthonormal() 22 | elif init_type == 'identity': 23 | init = Identity() 24 | elif init_type == 'uniform': 25 | init = Uniform() 26 | else: 27 | init = Normal() 28 | return init(shape=shape, name=name) 29 | 30 | @staticmethod 31 | def _set_activation(activation_type): 32 | if activation_type == 'sigmoid': 33 | return sigmoid 34 | elif activation_type == 'tanh': 35 | return tanh 36 | elif activation_type == 'relu': 37 | return relu 38 | elif activation_type == 'softmax': 39 | return softmax 40 | return None 41 | 42 | 43 | class Dense(Unit): 44 | def __init__(self, 45 | input_dim, 46 | output_dim, 47 | activation=None, 48 | use_bias=True, 49 | weight_init='xavier', 50 | bias_init='zero'): 51 | super(Dense, self).__init__(name='Dense(%dx%d,%s)' % (input_dim, output_dim, activation)) 52 | 53 | self.W = self._set_param(shape=(input_dim, 
output_dim), 54 | init_type=weight_init, 55 | name='W_dense') 56 | if use_bias: 57 | self.b = self._set_param(shape=output_dim, 58 | init_type=bias_init, 59 | name='b_dense') 60 | self.params = [self.W, self.b] 61 | else: 62 | self.b = None 63 | self.params = [self.W] 64 | 65 | self.activation = self._set_activation(activation) 66 | 67 | def forward(self, x): 68 | h = T.dot(x, self.W) 69 | if self.b: 70 | h = h + self.b 71 | if self.activation: 72 | h = self.activation(h) 73 | return h 74 | 75 | 76 | class Dropout(Unit): 77 | """ 78 | Reference: [Dropout: A Simple Way to Prevent Neural Networks from Overfitting] 79 | """ 80 | def __init__(self, rate, seed=0): 81 | super(Dropout, self).__init__(name='Dropout(p={:>1.1})'.format(rate)) 82 | self.rate = min(1., max(0., rate)) 83 | self.srng = T.shared_randomstreams.RandomStreams(seed=seed) 84 | 85 | def forward(self, x, is_train): 86 | drop_mask = self.srng.binomial(size=x.shape, n=1, p=1 - self.rate, dtype=theano.config.floatX) 87 | return T.switch(T.eq(is_train, 1), x * drop_mask, x * (1 - self.rate)) 88 | -------------------------------------------------------------------------------- /src/nn/layers/embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | from nn.layers.core import Unit, Dropout 6 | 7 | 8 | class Embedding(Unit): 9 | def __init__(self, 10 | input_dim, 11 | output_dim, 12 | init_emb=None, 13 | param_init='xavier', 14 | param_fix=False, 15 | drop_rate=0.0, 16 | name=None): 17 | super(Embedding, self).__init__(name=name if name else 'Emb(%dx%d)' % (input_dim, output_dim)) 18 | self.dropout = Dropout(drop_rate) 19 | 20 | self.W = self._set_weight(input_dim, output_dim, init_emb, param_init) 21 | if param_fix: 22 | self.params = [] 23 | else: 24 | self.params = [self.W] 25 | 26 | def _set_weight(self, input_dim, output_dim, init_emb, param_init): 27 | if init_emb is None: 28 | return self._set_param(shape=(input_dim, output_dim), 29 | init_type=param_init, 30 | name='embedding') 31 | return theano.shared(init_emb) 32 | 33 | def forward(self, x, is_train=0): 34 | return self.dropout.forward(x=self.W[x], is_train=is_train) 35 | 36 | 37 | class ElmoLayer(Unit): 38 | def __init__(self, drop_rate=0.0, name=None): 39 | super(ElmoLayer, self).__init__(name=name if name else 'ElmoEmb') 40 | self.dropout = Dropout(drop_rate) 41 | 42 | self.gamma = theano.shared(value=np.asarray([[1.0]], theano.config.floatX), 43 | name='gamma', 44 | borrow=True) 45 | self.scalar_mix = theano.shared(value=np.zeros(shape=(1, 3), dtype=theano.config.floatX), 46 | name='scalar_mix', 47 | borrow=True) 48 | self.params = [self.gamma, self.scalar_mix] 49 | 50 | def forward(self, x, is_train=0): 51 | """ 52 | :param x: 1D: batch_size, 2D: n_words, 3D: n_layers, 4D: dim 53 | :param is_train: 0/1 54 | :return: 55 | """ 56 | s = T.nnet.softmax(self.scalar_mix).dimshuffle('x', 'x', 1, 0) 57 | s = T.repeat(s, repeats=x.shape[3], axis=3) 58 | x = self.gamma[0, 0] * T.sum(s * x, axis=2) 59 | return self.dropout.forward(x=x, is_train=is_train) 60 | -------------------------------------------------------------------------------- /src/nn/layers/recurrent.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | 4 | from nn.layers.core import Unit, sigmoid, tanh 5 | 6 | 7 | class LSTM(Unit): 8 | def __init__(self, 9 | input_dim, 10 | output_dim, 11 | use_bias=True, 12 | 
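# 'orth' selects the Orthonormal initializer from nn/initializers.py;
# note that the forget-gate bias b_xf is initialized to one below.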
recurrent_init='orth', 13 | bias_init='zero'): 14 | super(LSTM, self).__init__(name='LSTM(%dx%d)' % (input_dim, output_dim)) 15 | 16 | self.input_dim = input_dim 17 | self.output_dim = output_dim 18 | 19 | # input gate parameters 20 | self.W_xi = self._set_param(shape=(input_dim, output_dim), 21 | init_type=recurrent_init, 22 | name='W_xi') 23 | self.W_hi = self._set_param(shape=(output_dim, output_dim), 24 | init_type=recurrent_init, 25 | name='W_hi') 26 | self.W_ci = self._set_param(shape=output_dim, 27 | init_type='xavier', 28 | name='W_ci') 29 | 30 | # forget gate parameters 31 | self.W_xf = self._set_param(shape=(input_dim, output_dim), 32 | init_type=recurrent_init, 33 | name='W_xf') 34 | self.W_hf = self._set_param(shape=(output_dim, output_dim), 35 | init_type=recurrent_init, 36 | name='W_hf') 37 | self.W_cf = self._set_param(shape=output_dim, 38 | init_type='xavier', 39 | name='W_cf') 40 | 41 | # cell parameters 42 | self.W_xc = self._set_param(shape=(input_dim, output_dim), 43 | init_type=recurrent_init, 44 | name='W_xc') 45 | self.W_hc = self._set_param(shape=(output_dim, output_dim), 46 | init_type=recurrent_init, 47 | name='W_hc') 48 | 49 | # output gate parameters 50 | self.W_xo = self._set_param(shape=(input_dim, output_dim), 51 | init_type=recurrent_init, 52 | name='W_xo') 53 | self.W_ho = self._set_param(shape=(output_dim, output_dim), 54 | init_type=recurrent_init, 55 | name='W_ho') 56 | self.W_co = self._set_param(shape=output_dim, 57 | init_type='xavier', 58 | name='W_co') 59 | 60 | if use_bias: 61 | self.b_xi = self._set_param(shape=output_dim, 62 | init_type=bias_init, 63 | name='b_xi') 64 | self.b_xf = self._set_param(shape=output_dim, 65 | init_type='one', 66 | name='b_xf') 67 | self.b_xc = self._set_param(shape=output_dim, 68 | init_type=bias_init, 69 | name='b_xc') 70 | self.b_xo = self._set_param(shape=output_dim, 71 | init_type=bias_init, 72 | name='b_xo') 73 | self.params = [self.W_xi, self.W_hi, self.W_ci, self.W_xf, self.W_hf, self.W_cf, 74 | self.W_xc, self.W_hc, self.W_xo, self.W_ho, self.W_co, 75 | self.b_xi, self.b_xf, self.b_xc, self.b_xo] 76 | else: 77 | self.b_xi = None 78 | self.b_xf = None 79 | self.b_xc = None 80 | self.b_xo = None 81 | self.params = [self.W_xi, self.W_hi, self.W_ci, self.W_xf, self.W_hf, self.W_cf, 82 | self.W_xc, self.W_hc, self.W_xo, self.W_ho, self.W_co] 83 | 84 | def _step(self, xi_t, xf_t, xc_t, xo_t, h_tm1, c_tm1): 85 | i_t = sigmoid(xi_t + T.dot(h_tm1, self.W_hi) + c_tm1 * self.W_ci) 86 | f_t = sigmoid(xf_t + T.dot(h_tm1, self.W_hf) + c_tm1 * self.W_cf) 87 | c_t = f_t * c_tm1 + i_t * tanh(xc_t + T.dot(h_tm1, self.W_hc)) 88 | o_t = sigmoid(xo_t + T.dot(h_tm1, self.W_ho) + c_t * self.W_co) 89 | h_t = o_t * tanh(c_t) 90 | return h_t, c_t 91 | 92 | def forward(self, x, h0=None, mask=None): 93 | xi = T.dot(x, self.W_xi) + self.b_xi 94 | xf = T.dot(x, self.W_xf) + self.b_xf 95 | xc = T.dot(x, self.W_xc) + self.b_xc 96 | xo = T.dot(x, self.W_xo) + self.b_xo 97 | 98 | inputs = [xi, xf, xc, xo] 99 | 100 | if h0 is None: 101 | h0 = T.zeros(shape=(x[0].shape[0], self.output_dim), dtype=theano.config.floatX) 102 | c0 = T.zeros(shape=(x[0].shape[0], self.output_dim), dtype=theano.config.floatX)  # note: c0 only exists in this branch; callers in this repo never pass h0 103 | 104 | [h, _], _ = theano.scan(fn=self._step, 105 | sequences=inputs, 106 | outputs_info=[h0, c0]) 107 | return h 108 | -------------------------------------------------------------------------------- /src/nn/layers/seqlabel.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import
theano.tensor as T 3 | 4 | from nn.layers.core import Unit 5 | from nn.utils import logsumexp 6 | 7 | 8 | class SeqLabelAlg(Unit): 9 | def __init__(self, name='SeqLabelModel'): 10 | super(SeqLabelAlg, self).__init__(name=name) 11 | 12 | def viterbi(self, emit_scores, trans_scores): 13 | """ 14 | :param emit_scores: 1D: n_words, 2D: batch_size, 3D: n_labels 15 | :param trans_scores: 1D: n_words, 2D: n_labels 16 | :return: 1D: n_words; 2D: batch_size, elem=label id 17 | """ 18 | [scores, labels], _ = theano.scan(fn=self._viterbi_forward, 19 | sequences=[emit_scores[1:]], 20 | outputs_info=[emit_scores[0], None], 21 | non_sequences=trans_scores) 22 | 23 | label_max_last = T.argmax(scores[-1], axis=1) 24 | labels_max, _ = theano.scan(fn=self._viterbi_backward, 25 | sequences=labels[::-1], 26 | outputs_info=label_max_last) 27 | 28 | y = T.zeros(shape=(emit_scores.shape[0], emit_scores.shape[1]), dtype='int32') 29 | y = T.set_subtensor(y[-1], label_max_last) 30 | y = T.set_subtensor(y[:-1], labels_max[::-1]) 31 | return y 32 | 33 | @staticmethod 34 | def _viterbi_forward(e_t, score_prev, trans): 35 | """ 36 | :param e_t: 1D: batch_size, 2D: n_labels 37 | :param score_prev: 1D: batch_size, 2D: n_labels 38 | :param trans: 1D: n_labels, 2D, n_labels 39 | :return: max_scores_t: 1D: batch_size, 2D: n_labels 40 | :return: max_labels_t: 1D: batch_size, 2D: n_labels 41 | """ 42 | score = score_prev.dimshuffle(0, 'x', 1) + trans + e_t.dimshuffle(0, 1, 'x') 43 | max_scores_t, max_labels_t = T.max_and_argmax(score, axis=2) 44 | return max_scores_t, max_labels_t 45 | 46 | @staticmethod 47 | def _viterbi_backward(labels_t, label_max): 48 | """ 49 | :param labels_t: 1D: batch_size, 2D: n_labels; elem=label id 50 | :param label_max: 1D: batch_size; elem=label id 51 | :return: 1D: batch_size; elem=label id 52 | """ 53 | return labels_t[T.arange(labels_t.shape[0]), label_max] 54 | 55 | 56 | class CRF(SeqLabelAlg): 57 | def __init__(self, 58 | input_dim, 59 | output_dim, 60 | use_bias=True, 61 | weight_init='xavier', 62 | bias_init='zero'): 63 | super(CRF, self).__init__(name='CRF(%dx%d)' % (input_dim, output_dim)) 64 | self.W = self._set_param(shape=(input_dim, output_dim), 65 | init_type=weight_init, 66 | name='W_crf') 67 | self.W_t = self._set_param(shape=(output_dim, output_dim), 68 | init_type=weight_init, 69 | name='W_tran_crf') 70 | 71 | if use_bias: 72 | self.b = self._set_param(shape=output_dim, 73 | init_type=bias_init, 74 | name='b_crf') 75 | self.params = [self.W, self.W_t, self.b] 76 | else: 77 | self.b = None 78 | self.params = [self.W, self.W_t] 79 | 80 | def forward(self, x): 81 | emit_scores = T.dot(x, self.W) 82 | if self.b: 83 | emit_scores = emit_scores + self.b 84 | return emit_scores 85 | 86 | def get_y_proba(self, emit_scores, y_true): 87 | """ 88 | :param emit_scores: 1D: n_words, 2D: batch_size, 3D: n_labels 89 | :param y_true: 1D: n_words, 2D: batch_size 90 | :return: 1D: batch_size; elem=log probability 91 | """ 92 | # 1D: batch_size, 2D: n_labels 93 | z_score0 = emit_scores[0] 94 | # 1D: batch_size; elem=path score 95 | y_score0 = z_score0[T.arange(z_score0.shape[0]), y_true[0]] 96 | 97 | inputs = [emit_scores[1:], y_true[1:]] 98 | [_, y_scores, z_scores], _ = theano.scan(fn=self._forward_step, 99 | sequences=inputs, 100 | outputs_info=[y_true[0], y_score0, z_score0], 101 | non_sequences=self.W_t) 102 | 103 | y_score = y_scores[-1] 104 | z_score = logsumexp(z_scores[-1], axis=1).flatten() 105 | 106 | return y_score - z_score 107 | 108 | @staticmethod 109 | def _forward_step(h_t, y_t, 
y_prev, y_score_prev, z_score_prev, trans): 110 | """ 111 | :param h_t: 1D: batch_size, 2D: n_labels 112 | :param y_t: 1D: batch_size 113 | :param y_prev: 1D: batch_size 114 | :param y_score_prev: 1D: batch_size 115 | :param z_score_prev: 1D: batch_size, 2D: n_labels 116 | :param trans: 1D: n_labels, 2D, n_labels 117 | """ 118 | # 1D: batch_size 119 | y_score_t = y_score_prev + trans[y_t, y_prev] + h_t[T.arange(h_t.shape[0]), y_t] 120 | # 1D: batch_size, 2D: n_labels, 3D: n_labels 121 | z_sum = z_score_prev.dimshuffle(0, 'x', 1) + trans 122 | # 1D: batch_size, 2D: n_labels 123 | z_score_t = logsumexp(z_sum, axis=2).reshape(h_t.shape) + h_t 124 | return y_t, y_score_t, z_score_t 125 | 126 | def get_y_pred(self, emit_scores): 127 | """ 128 | :param emit_scores: 1D: n_words, 2D: batch_size, 3D: n_labels 129 | :return: 1D: batch_size, 2D: n_words; elem=label id 130 | """ 131 | return self.viterbi(emit_scores=emit_scores, trans_scores=self.W_t).dimshuffle(1, 0) 132 | -------------------------------------------------------------------------------- /src/nn/layers/stack.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | from nn.layers.core import Dense, Dropout 4 | from nn.layers.recurrent import LSTM 5 | 6 | 7 | class StackLayer(object): 8 | def __init__(self, name='StackLayer'): 9 | self.name = name 10 | self.layers = [] 11 | self.params = [] 12 | 13 | def _set_layers(self): 14 | raise NotImplementedError 15 | 16 | @staticmethod 17 | def _set_rnn_unit(unit_type): 18 | return LSTM 19 | 20 | @staticmethod 21 | def _set_connect_unit(connect_type): 22 | return Dense 23 | 24 | def _set_params(self): 25 | params = [] 26 | for layer in self.layers: 27 | params.extend(layer.params) 28 | return params 29 | 30 | def forward(self, x, **kwargs): 31 | raise NotImplementedError 32 | 33 | 34 | class BiRNNLayer(StackLayer): 35 | def __init__(self, 36 | input_dim, 37 | output_dim, 38 | n_layers, 39 | unit_type, 40 | connect_type, 41 | drop_rate=0.0): 42 | name = 'BiRNNs-%d:(%dx%d)' % (n_layers, input_dim, output_dim) 43 | super(BiRNNLayer, self).__init__(name=name) 44 | 45 | self.input_dim = input_dim 46 | self.output_dim = output_dim 47 | self.n_layers = n_layers 48 | self.rnn_unit = self._set_rnn_unit(unit_type) 49 | self.connect_unit = self._set_connect_unit(connect_type) 50 | self.dropout = Dropout(drop_rate) 51 | 52 | self.layers = self._set_layers() 53 | self.params = self._set_params() 54 | 55 | def _set_layers(self): 56 | layers = [] 57 | for i in range(self.n_layers): 58 | if i == 0: 59 | rnn_input_dim = self.input_dim 60 | connect_input_dim = self.input_dim + self.output_dim 61 | else: 62 | rnn_input_dim = self.output_dim 63 | connect_input_dim = self.output_dim * 2 64 | 65 | r_unit = self.rnn_unit(input_dim=rnn_input_dim, 66 | output_dim=self.output_dim) 67 | c_unit = self.connect_unit(input_dim=connect_input_dim, 68 | output_dim=self.output_dim, 69 | activation='relu') 70 | layers += [r_unit, c_unit] 71 | return layers 72 | 73 | def forward(self, x, mask=None, is_train=False): 74 | n_layers = int(len(self.layers) / 2) 75 | for i in range(n_layers): 76 | if mask is None: 77 | h = self.layers[i * 2].forward(x=x) 78 | h = self.dropout.forward(x=h, is_train=is_train) 79 | x = self.layers[i * 2 + 1].forward(T.concatenate([x, h], axis=2)) 80 | else: 81 | h = self.layers[i * 2].forward(x=x, mask=mask) 82 | h = self.dropout.forward(x=h, is_train=is_train) 83 | x = self.layers[i * 2 + 1].forward(T.concatenate([x, h], axis=2)) * mask 84 | 
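# Reversing the sequence (and mask) between layers makes each LSTM layer
# read its input in the opposite direction of the layer below, giving an
# interleaved bidirectional stack; the final [::-1] in forward() restores
# the original word order when the number of layers is odd.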
mask = mask[::-1] 85 | x = x[::-1] 86 | if (n_layers % 2) == 1: 87 | return x[::-1] 88 | return x 89 | -------------------------------------------------------------------------------- /src/nn/losses.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def binary_cross_entropy(output, target): 5 | return T.nnet.binary_crossentropy(output=output, target=target) 6 | 7 | 8 | def negative_log_likelihood(y_proba, y_true=None): 9 | """ 10 | :param y_proba: 1D: batch_size, 2D: n_words, 3D: n_words; elem=word id 11 | :param y_true: 1D: batch_size, 2D: n_words; elem=word id 12 | """ 13 | if y_true: 14 | y_true_flatten = y_true.flatten() 15 | y_proba = y_proba.reshape((y_proba.shape[0] * y_proba.shape[1], y_proba.shape[2])) 16 | nll = - T.sum(T.log(y_proba[T.arange(y_true_flatten.shape[0]), y_true_flatten]).reshape(y_true.shape), axis=1) 17 | else: 18 | nll = - y_proba 19 | return nll 20 | -------------------------------------------------------------------------------- /src/nn/metrics.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def categorical_accuracy(y_true, y_pred): 5 | return T.sum(T.eq(y_true, y_pred)) 6 | 7 | 8 | def log_likelihood(y_true, y_proba): 9 | y_true = y_true.flatten() 10 | y_proba = y_proba.reshape((y_proba.shape[0] * y_proba.shape[1], -1)) 11 | return T.sum(T.log(y_proba[T.arange(y_true.shape[0]), y_true])) 12 | -------------------------------------------------------------------------------- /src/nn/optimizers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | from utils.savers import save_pickle 6 | from utils.loaders import load_pickle 7 | 8 | 9 | def get_optimizer(argv): 10 | if argv.opt_type == 'adam': 11 | return Adam(argv=argv, lr=argv.lr, grad_clip=argv.grad_clip) 12 | return SGD(argv=argv, lr=argv.lr, grad_clip=argv.grad_clip) 13 | 14 | 15 | class Optimizer(object): 16 | def __init__(self, **kwargs): 17 | self.argv = kwargs['argv'] 18 | self.grad_clip = kwargs['grad_clip'] 19 | self.params = [] 20 | 21 | def __call__(self, grads, params): 22 | raise NotImplementedError 23 | 24 | def set_params(self, **kwargs): 25 | raise NotImplementedError 26 | 27 | def init_params(self): 28 | for p in self.params: 29 | p.set_value(p.get_value(borrow=True) * 0) 30 | 31 | @staticmethod 32 | def _grad_clipping(gradients, max_norm=5.0): 33 | global_grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gradients))) 34 | multiplier = T.switch(global_grad_norm < max_norm, 1.0, max_norm / global_grad_norm) 35 | return [g * multiplier for g in gradients] 36 | 37 | def save_params(self, epoch=0): 38 | argv = self.argv 39 | if argv.output_dir: 40 | dir_name = argv.output_dir 41 | else: 42 | dir_name = 'output' 43 | if argv.output_fn: 44 | file_name = '/opt.param.%s.epoch-%d' % (argv.output_fn, epoch) 45 | else: 46 | file_name = '/opt.param.%s.epoch-%d' % (argv.method, epoch) 47 | 48 | fn = dir_name + file_name 49 | params = [p.get_value(borrow=True) for p in self.params] 50 | save_pickle(fn=fn, data=params) 51 | 52 | def load_params(self, path): 53 | params = load_pickle(path) 54 | assert len(self.params) == len(params) 55 | for p1, p2 in zip(self.params, params): 56 | p1.set_value(p2) 57 | 58 | 59 | class SGD(Optimizer): 60 | def __init__(self, lr=0.001, **kwargs): 61 | super(SGD, self).__init__(**kwargs) 62 | self.lr = 
theano.shared(np.asarray(lr, dtype=theano.config.floatX), borrow=True) 63 | 64 | def __call__(self, params, grads): 65 | updates = [] 66 | if self.grad_clip: 67 | grads = self._grad_clipping(grads, max_norm=1.0) 68 | for p, g in zip(params, grads): 69 | updates.append((p, p - self.lr * g)) 70 | return updates 71 | 72 | def set_params(self): 73 | pass 74 | 75 | 76 | class Adam(Optimizer): 77 | def __init__(self, lr=0.001, b1=0.9, b2=0.999, eps=1e-8, **kwargs): 78 | super(Adam, self).__init__(**kwargs) 79 | self.lr = theano.shared(np.asarray(lr, dtype=theano.config.floatX), borrow=True) 80 | self.b1 = b1 81 | self.b2 = b2 82 | self.eps = eps 83 | 84 | def __call__(self, params, grads): 85 | updates = [] 86 | 87 | i = self.params[0]  # update-step counter t 88 | i_t = i + 1. 89 | a_t = self.lr * T.sqrt(1 - self.b2 ** i_t) / (1 - self.b1 ** i_t) 90 | 91 | if self.grad_clip: 92 | grads = self._grad_clipping(grads, max_norm=1.0) 93 | 94 | for index, (p, g) in enumerate(zip(params, grads)): 95 | v = self.params[2 * index + 1] 96 | r = self.params[2 * index + 2] 97 | # self.params is laid out as [i, v_0, r_0, v_1, r_1, ...] (see set_params below) 98 | 99 | v_t = self.b1 * v + (1. - self.b1) * g 100 | r_t = self.b2 * r + (1. - self.b2) * g ** 2 101 | 102 | step = a_t * v_t / (T.sqrt(r_t) + self.eps) 103 | 104 | updates.append((v, v_t)) 105 | updates.append((r, r_t)) 106 | updates.append((p, p - step)) 107 | 108 | updates.append((i, i_t)) 109 | return updates 110 | 111 | def set_params(self, params): 112 | i = theano.shared(np.asarray(.0, dtype=theano.config.floatX)) 113 | self.params.append(i) 114 | for p in params: 115 | p_tm = p.get_value(borrow=True) 116 | v = theano.shared(np.zeros(p_tm.shape, dtype=p_tm.dtype)) 117 | r = theano.shared(np.zeros(p_tm.shape, dtype=p_tm.dtype)) 118 | self.params += [v, r] 119 | -------------------------------------------------------------------------------- /src/nn/regularizers.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | class Regularizer(object): 5 | def __call__(self, **kwargs): 6 | raise NotImplementedError 7 | 8 | 9 | class L2Regularizer(Regularizer): 10 | def __call__(self, alpha, params): 11 | return alpha * l2_sqr(params) / 2.
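# Usage sketch, mirroring how srl/model_api.py adds the L2 term to the
# negative log-likelihood (`nll`, `argv.reg`, and `model.params` are the
# names used there):
#   l2_reg = L2Regularizer()
#   objective = nll + l2_reg(alpha=argv.reg, params=model.params)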
12 | 13 | 14 | def l2_sqr(params): 15 | sqr = 0.0 16 | for p in params: 17 | sqr += T.sum((p ** 2)) 18 | return sqr 19 | -------------------------------------------------------------------------------- /src/nn/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | 4 | 5 | def normalize_3d(x, eps=1e-8): 6 | l2 = x.norm(2, axis=2).dimshuffle((0, 1, 'x')) 7 | return x / (l2 + eps) 8 | 9 | 10 | def logsumexp(x, axis, keepdim=True): 11 | """ 12 | :param x: 1D: batch, 2D: n_y, 3D: n_y 13 | :return: 1D: batch, 2D: n_y, 3D: n_y 14 | """ 15 | x_max = T.max(x, axis=axis, keepdims=True) 16 | if keepdim: 17 | return T.log(T.sum(T.exp(x - x_max), axis=axis, keepdims=keepdim)) + x_max 18 | return T.log(T.sum(T.exp(x - x_max), axis=axis)) + x_max.dimshuffle(0) 19 | 20 | 21 | def logsumexp3d(x, axis=2): 22 | # 1D: batch_size, 2D: n_labels, 3D: 1 23 | x_max = T.max(x, axis=axis, keepdims=True) 24 | # 1D: batch_size, 2D: n_labels 25 | return T.log(T.sum(T.exp(x - x_max), axis=axis)) + x_max.dimshuffle(0, 1) 26 | 27 | 28 | def log0(x): 29 | return T.switch(T.eq(x, 0.0), 0.0, T.log(x)) 30 | 31 | 32 | def frobenius_norm(matrix): 33 | if type(matrix) is list: 34 | return T.sqrt(T.sum(map(lambda m: T.sum(m ** 2), matrix))) 35 | return T.sqrt(T.maximum(T.sum(T.sqr(matrix)), 1e-8)) 36 | 37 | 38 | def np_frobenius_norm(matrix): 39 | return np.sqrt(np.sum(matrix**2)) 40 | 41 | 42 | def layer_normalization(x, axis=1, eps=1e-8): 43 | return (x - x.mean(axis=axis, keepdims=True)) / T.sqrt((x.var(axis=axis, keepdims=True) + eps)) 44 | -------------------------------------------------------------------------------- /src/srl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiroki13/span-based-srl/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/srl/__init__.py -------------------------------------------------------------------------------- /src/srl/decoders.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from itertools import combinations_with_replacement as comb 3 | 4 | from utils.misc import span_to_span_index 5 | 6 | 7 | class Decoder(object): 8 | def __init__(self, argv, vocab_label): 9 | self.argv = argv 10 | self.core_label_ids = self.set_core_labels(vocab_label) 11 | self.span_list = None 12 | 13 | def set_core_labels(self, vocab_label): 14 | if self.argv.data_type == 'conll05': 15 | core_labels = ["A0", "A1", "A2", "A3", "A4", "A5"] 16 | else: 17 | core_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"] 18 | return [vocab_label.get_id(label) 19 | for label in core_labels 20 | if vocab_label.has_key(label)] 21 | 22 | def argmax_span_triples(self, span_indices, marks): 23 | """ 24 | :param span_indices: 1D: batch_size, 2D; n_labels; span index 25 | :param marks: 1D: batch_size, 2D; n_words 26 | :return: 1D: batch_size, 2D: n_spans; [r, i, j] 27 | """ 28 | n_words = len(marks[0]) 29 | self.span_list = list(comb(range(n_words), 2)) 30 | return [self._argmax_search(span_indices_i, mark) 31 | for span_indices_i, mark in zip(span_indices, marks)] 32 | 33 | def _argmax_search(self, span_indices, mark): 34 | spans = [] 35 | prd_index = mark.nonzero()[0][0] 36 | for r, span_index in enumerate(span_indices): 37 | (i, j) = self.span_list[span_index] 38 | if i <= prd_index <= j: 39 | continue 40 | spans.append([r, i, j]) 41 | return spans 42 | 43 | def greedy_span_triples(self, scores, 
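# Greedy decoding: candidate spans are sorted by score and accepted only
# if they do not overlap an already-selected span and, for core labels,
# the label has not been used yet (see _greedy_search below).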
marks): 44 | """ 45 | :param scores: 1D: batch_size, 2D; n_labels, 3D: n_spans; score 46 | :param marks: 1D: batch_size, 2D; n_words 47 | :return: 1D: batch_size, 2D: n_spans; [r, i, j] 48 | """ 49 | n_words = len(marks[0]) 50 | self.span_list = list(comb(range(n_words), 2)) 51 | return [self._greedy_search(score, mark) 52 | for score, mark in zip(scores, marks)] 53 | 54 | def _greedy_search(self, scores, mark): 55 | """ 56 | :param scores: 1D: n_labels, 2D: n_spans; score 57 | :param mark: 1D: n_words; elem=0/1 58 | :return: 1D: n_spans, 2D: [r, i, j] 59 | """ 60 | triples = [] 61 | used_words = deepcopy(mark) 62 | used_labels = [] 63 | 64 | n_words = len(mark) 65 | prd_index = mark.nonzero()[0][0] 66 | prd_span_index = span_to_span_index(i=prd_index, 67 | j=prd_index, 68 | n_words=n_words) 69 | spans = self._sort_spans(scores=scores, 70 | prd_index=prd_index, 71 | prd_span_index=prd_span_index) 72 | 73 | for (r, i, j, _) in spans: 74 | if r in used_labels: 75 | continue 76 | if used_words[i: j + 1].sum() > 0: 77 | continue 78 | 79 | triples.append([r, i, j]) 80 | 81 | used_words[i: j + 1] = 1 82 | if r in self.core_label_ids: 83 | used_labels.append(r) 84 | 85 | return triples 86 | 87 | def _sort_spans(self, scores, prd_index, prd_span_index): 88 | """ 89 | :param scores: 1D: n_labels, 2D: n_spans; score 90 | :return: 1D: n_labels, 2D: n_words * n_words; elem=(r, i, j, score) 91 | """ 92 | spans = [] 93 | for r, scores_row in enumerate(scores): 94 | score_prd = scores_row[prd_span_index] 95 | for index, score in enumerate(scores_row): 96 | (i, j) = self.span_list[index] 97 | if i <= prd_index <= j: 98 | continue 99 | if score_prd < score: 100 | spans.append((r, i, j, score)) 101 | spans.sort(key=lambda span: span[-1], reverse=True) 102 | return spans 103 | -------------------------------------------------------------------------------- /src/srl/model_api.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import math 4 | import glob 5 | 6 | import numpy as np 7 | import theano 8 | import theano.tensor as T 9 | 10 | from srl.models import SpanModel, MoEModel, CRFModel 11 | from srl.decoders import Decoder 12 | from nn.regularizers import L2Regularizer 13 | from nn.optimizers import get_optimizer 14 | from utils.evaluators import f_score, correct_and_pred_spans, metrics_for_bio 15 | from utils.savers import save_pickle 16 | from utils.loaders import load_pickle 17 | from utils.misc import write 18 | 19 | 20 | class SpanModelAPI(object): 21 | def __init__(self, argv): 22 | self.argv = argv 23 | 24 | self.model = None 25 | self.experts = None 26 | self.train_func = None 27 | self.pred_func = None 28 | 29 | self.vocab_word = None 30 | self.vocab_label = None 31 | self.vocab_label_valid = None 32 | 33 | self.input_dim = None 34 | self.hidden_dim = None 35 | self.output_dim = None 36 | self.use_elmo = None 37 | 38 | self.decoder = None 39 | self.optimizer = None 40 | 41 | self.n_true_spans = 0. 
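# Typical lifecycle, as wired up in src/main.py: set_model() builds the
# network, set_train_func()/set_pred_func() compile the Theano functions,
# and Trainer/Tester then feed preprocessed batches through them.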
42 | 43 | def set_model(self, **kwargs): 44 | write('Setting a model...') 45 | argv = self.argv 46 | 47 | self.vocab_word = kwargs['vocab_word'] 48 | self.use_elmo = kwargs['use_elmo'] 49 | self.vocab_label = kwargs['vocab_label'] 50 | self.vocab_label_valid = kwargs['vocab_label_valid'] 51 | word_emb = kwargs['word_emb'] 52 | vocab_word_size = self.vocab_word.size() if self.vocab_word else 0 53 | 54 | self.input_dim = argv.emb_dim if word_emb is None else word_emb.shape[1] 55 | self.hidden_dim = argv.hidden_dim 56 | self.output_dim = -1 57 | 58 | self.decoder = Decoder(argv=argv, vocab_label=self.vocab_label) 59 | 60 | self.model = SpanModel() 61 | self.model.compile(inputs=self._set_inputs(), 62 | vocab_word_size=vocab_word_size, 63 | use_elmo=self.use_elmo, 64 | word_emb=word_emb, 65 | input_dim=[self.input_dim, self.input_dim], 66 | hidden_dim=self.hidden_dim, 67 | feat_dim=2 * self.hidden_dim, 68 | output_dim=self.vocab_label.size(), 69 | n_layers=argv.n_layers, 70 | drop_rate=argv.drop_rate) 71 | 72 | write('\t- {}'.format("\n\t- ".join([l.name for l in self.model.layers]))) 73 | self._show_model_config() 74 | 75 | def set_ensemble_model(self, **kwargs): 76 | write('Setting a model...') 77 | argv = self.argv 78 | 79 | self.vocab_word = kwargs['vocab_word'] 80 | self.use_elmo = kwargs['use_elmo'] 81 | self.vocab_label = kwargs['vocab_label'] 82 | self.vocab_label_valid = kwargs['vocab_label_valid'] 83 | word_emb = kwargs['word_emb'] 84 | vocab_word_size = self.vocab_word.size() if self.vocab_word else 0 85 | 86 | self.input_dim = argv.emb_dim if word_emb is None else word_emb.shape[1] 87 | self.hidden_dim = argv.hidden_dim 88 | self.output_dim = -1 89 | 90 | self.decoder = Decoder(argv=argv, vocab_label=self.vocab_label) 91 | 92 | ################# 93 | # Set MoE model # 94 | ################# 95 | inputs = self._set_inputs() 96 | self.model = MoEModel() 97 | self.model.compile(inputs=inputs, 98 | feat_dim=2 * self.hidden_dim, 99 | output_dim=self.vocab_label.size(), 100 | drop_rate=argv.drop_rate, 101 | n_experts=argv.n_experts) 102 | write('\t- {}\n'.format("\n\t- ".join([l.name for l in self.model.layers]))) 103 | 104 | ############### 105 | # Set experts # 106 | ############### 107 | experts = [] 108 | for _ in range(argv.n_experts): 109 | model = SpanModel() 110 | model.compile(inputs=self.model.inputs, 111 | vocab_word_size=vocab_word_size, 112 | use_elmo=self.use_elmo, 113 | input_dim=[self.input_dim, self.input_dim], 114 | hidden_dim=self.hidden_dim, 115 | feat_dim=2 * self.hidden_dim, 116 | output_dim=self.vocab_label.size(), 117 | n_layers=argv.n_layers, 118 | word_emb=word_emb, 119 | drop_rate=argv.drop_rate) 120 | write('\t- {}\n'.format("\n\t- ".join([l.name for l in model.layers]))) 121 | experts.append(model) 122 | 123 | self.experts = experts 124 | 125 | def _set_inputs(self): 126 | x = [] 127 | if self.vocab_word: 128 | x.append(T.imatrix('x_word')) 129 | if self.use_elmo: 130 | x.append(T.ftensor4('x_elmo')) 131 | x.append(T.imatrix('x_mark')) 132 | assert len(x) > 1 133 | return x 134 | 135 | def _show_model_config(self): 136 | model = self.model 137 | write('Model configuration') 138 | write('\t- Input Dim: {}'.format(self.input_dim)) 139 | write('\t- Hidden Dim: {}'.format(self.hidden_dim)) 140 | write('\t- Output Dim: {}'.format(self.output_dim)) 141 | write('\t- Parameters: {}'.format(sum(len(x.get_value(borrow=True).ravel()) 142 | for x in model.params))) 143 | 144 | def save_params(self, epoch=-1): 145 | argv = self.argv 146 | if argv.output_dir: 147 | 
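# fall back to the default 'output' directory when --output_dir is not set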
dir_name = argv.output_dir 148 | else: 149 | dir_name = 'output' 150 | if argv.output_fn: 151 | file_name = '/param.%s.epoch-%d' % (argv.output_fn, epoch) 152 | else: 153 | file_name = '/param.epoch-%d' % epoch 154 | 155 | fn = dir_name + file_name 156 | params = [p.get_value(borrow=True) for p in self.model.params] 157 | save_pickle(fn=fn, data=params) 158 | 159 | def load_params(self, path): 160 | params = load_pickle(path) 161 | assert len(self.model.params) == len(params) 162 | for p1, p2 in zip(self.model.params, params): 163 | p1.set_value(p2) 164 | 165 | def load_experts_params(self, path): 166 | write('Loading experts params...') 167 | param_files = glob.glob(path + '/*') 168 | param_files = [fn for fn in param_files 169 | if fn.split('/')[-1].startswith('param')] 170 | write("\t - Param Files: %s" % str(param_files)) 171 | for i, path in enumerate(param_files[:self.argv.n_experts]): 172 | params = load_pickle(path) 173 | assert len(self.experts[i].params) == len(params) 174 | for p1, p2 in zip(self.experts[i].params, params): 175 | p1.set_value(p2) 176 | 177 | def set_init_ensemble_param(self): 178 | write('Initializing params...') 179 | W = np.zeros(shape=(2 * self.hidden_dim, self.vocab_label.size()), 180 | dtype=theano.config.floatX) 181 | b = np.zeros(shape=self.vocab_label.size(), 182 | dtype=theano.config.floatX) 183 | for model in self.experts: 184 | W += model.params[-2].get_value(borrow=True) 185 | for model in self.experts: 186 | b += model.params[-1].get_value(borrow=True) 187 | W = W / len(self.experts) 188 | b = b / len(self.experts) 189 | self.model.params[-2].set_value(W) 190 | self.model.params[-1].set_value(b) 191 | 192 | def set_train_func(self): 193 | write('Building a training function...') 194 | 195 | self.optimizer = get_optimizer(self.argv) 196 | self.optimizer.set_params(self.model.params) 197 | if self.argv.load_opt_param: 198 | self.optimizer.load_params(self.argv.load_opt_param) 199 | 200 | # 1D: batch_size * n_spans, 2D: [batch index, label id, span index] 201 | span_true = T.imatrix('span_true') 202 | 203 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 204 | h_span = self.model.span_feats(inputs=self.model.inputs) 205 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 206 | span_score = self.model.label_layer.logit_scores(h=h_span) 207 | # 1D: batch_size, 2D: n_labels; label id 208 | span_pred = self.model.argmax_span(span_score=span_score) 209 | 210 | nll = self.model.loss(span_score, span_true) 211 | l2_reg = L2Regularizer() 212 | objective = nll + l2_reg(alpha=self.argv.reg, 213 | params=self.model.params) 214 | 215 | grads = T.grad(cost=objective, wrt=self.model.params) 216 | updates = self.optimizer(grads=grads, params=self.model.params) 217 | 218 | self.train_func = theano.function( 219 | inputs=self.model.inputs + [span_true], 220 | outputs=[objective, span_pred], 221 | updates=updates, 222 | mode='FAST_RUN' 223 | ) 224 | 225 | def set_pred_func(self): 226 | write('Building a predicting function...') 227 | if self.argv.search == 'argmax': 228 | self.set_pred_argmax_func() 229 | else: 230 | self.set_pred_score_func() 231 | 232 | def set_pred_argmax_func(self): 233 | # 1D: batch_size, 2D: n_spans, 3D: hidden_dim 234 | h_span = self.model.span_feats(inputs=self.model.inputs) 235 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 236 | logits = self.model.label_layer.logit_scores(h_span) 237 | # 1D: batch_size, 2D: n_labels; span index 238 | span_pred = self.model.argmax_span(logits) 239 | 240 | self.pred_func = theano.function( 241 | 
inputs=self.model.inputs, 242 | outputs=span_pred, 243 | mode='FAST_RUN' 244 | ) 245 | 246 | def set_pred_score_func(self): 247 | # 1D: batch_size, 2D: n_spans, 3D: hidden_dim 248 | h_span = self.model.span_feats(inputs=self.model.inputs) 249 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 250 | logits = self.model.label_layer.logit_scores(h_span) 251 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 252 | span_score = self.model.exp_score(logits) 253 | 254 | self.pred_func = theano.function( 255 | inputs=self.model.inputs, 256 | outputs=span_score, 257 | mode='FAST_RUN' 258 | ) 259 | 260 | def set_ensemble_train_func(self): 261 | write('Building an ensemble training function...') 262 | 263 | self.optimizer = get_optimizer(self.argv) 264 | self.optimizer.set_params(self.model.params) 265 | if self.argv.load_opt_param: 266 | self.optimizer.load_params(self.argv.load_opt_param) 267 | 268 | # 1D: batch_size * n_spans, 2D: [batch index, label id, span index] 269 | span_true = T.imatrix('span_true') 270 | 271 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 272 | h_span = self.model.feat_layer.forward(self.model.inputs, 273 | self.experts) 274 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 275 | logits = self.model.feat_layer.logit_scores(h=h_span) 276 | # 1D: batch_size, 2D: n_labels; span index 277 | span_pred = self.model.argmax_span(logits) 278 | 279 | nll = self.model.loss(logits, span_true) 280 | l2_reg = L2Regularizer() 281 | objective = nll + l2_reg(alpha=self.argv.reg, 282 | params=self.model.params) 283 | 284 | grads = T.grad(cost=objective, wrt=self.model.params) 285 | updates = self.optimizer(grads=grads, 286 | params=self.model.params) 287 | 288 | self.train_func = theano.function( 289 | inputs=self.model.inputs + [span_true], 290 | outputs=[objective, span_pred], 291 | updates=updates, 292 | mode='FAST_RUN' 293 | ) 294 | 295 | def set_ensemble_pred_func(self): 296 | write('Building an ensemble predicting function...') 297 | if self.argv.search == 'argmax': 298 | self.set_ensemble_pred_argmax_func() 299 | else: 300 | self.set_ensemble_pred_score_func() 301 | 302 | def set_ensemble_pred_argmax_func(self): 303 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 304 | h_span = self.model.feat_layer.forward(self.model.inputs, 305 | self.experts) 306 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 307 | span_score = self.model.feat_layer.logit_scores(h=h_span) 308 | # 1D: batch_size, 2D: n_labels; span index 309 | span_pred = self.model.argmax_span(span_score=span_score) 310 | 311 | self.pred_func = theano.function( 312 | inputs=self.model.inputs, 313 | outputs=span_pred, 314 | mode='FAST_RUN' 315 | ) 316 | 317 | def set_ensemble_pred_score_func(self): 318 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 319 | h_span = self.model.feat_layer.forward(self.model.inputs, 320 | self.experts) 321 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 322 | logits = self.model.feat_layer.logit_scores(h=h_span) 323 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 324 | span_score = self.model.exp_score(logits) 325 | 326 | self.pred_func = theano.function( 327 | inputs=self.model.inputs, 328 | outputs=span_score, 329 | mode='FAST_RUN' 330 | ) 331 | 332 | def train(self, batches): 333 | start = time.time() 334 | n_batches = 0. 335 | loss_total = 0. 336 | p_total = 0. 337 | correct = 0. 
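    |         # Training-time F1 bookkeeping: `correct` and `p_total` count predicted
    |         # spans batch by batch, while the recall denominator `self.n_true_spans`
    |         # is precomputed once from the gold spans of the training data
    |         # (utils.evaluators.count_true_spans, set by the trainer).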
338 | 
 339 |         self.model.feat_layer.is_train.set_value(1)
 340 |         if self.experts:
 341 |             for model in self.experts:
 342 |                 model.feat_layer.is_train.set_value(1)
 343 | 
 344 |         for inputs in batches:
 345 |             n_batches += 1
 346 | 
 347 |             if n_batches % 100 == 0:
 348 |                 sys.stdout.write("%d " % n_batches)
 349 |                 sys.stdout.flush()
 350 | 
 351 |             n_words = len(inputs[0][0])
 352 |             if n_words < 2 or 100 < n_words:  # skip one-word and overly long sentences
 353 |                 continue
 354 | 
 355 |             loss, span_pred = self.train_func(*inputs)
 356 | 
 357 |             if math.isnan(loss):
 358 |                 write('\n\nNAN: Index: %d\n' % n_batches)
 359 |                 exit()
 360 | 
 361 |             loss_total += loss
 362 |             correct_i, p_total_i = correct_and_pred_spans(span_true=inputs[-1],
 363 |                                                           span_pred=span_pred,
 364 |                                                           marks=inputs[-2])  # mark ids directly precede span_true, whether or not ELMo is an input
 365 |             correct += correct_i
 366 |             p_total += p_total_i
 367 | 
 368 |         self.model.feat_layer.is_train.set_value(0)
 369 |         if self.experts:
 370 |             for model in self.experts:
 371 |                 model.feat_layer.is_train.set_value(0)
 372 | 
 373 |         avg_loss = loss_total / n_batches
 374 |         p, r, f = f_score(correct, p_total, self.n_true_spans)
 375 | 
 376 |         write('\n\tTime: %f seconds' % (time.time() - start))
 377 |         write('\tAverage Negative Log Likelihood: %f(%f/%d)' % (avg_loss, loss_total, n_batches))
 378 |         write('\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})'.format(
 379 |             f, p, int(correct), int(p_total), r, int(correct), int(self.n_true_spans)))
 380 | 
 381 |     def predict(self, batches):
 382 |         if self.argv.search == 'argmax':
 383 |             return self.predict_argmax(batches)
 384 |         else:
 385 |             return self.predict_greedy(batches)
 386 | 
 387 |     def predict_argmax(self, batches):
 388 |         """
 389 |         :param batches: 1D: n_sents, 2D: n_prds, 3D: n_feats, 4D: n_words; elem=(x_w, x_m)
 390 |         :return: y: 1D: n_sents, 2D: n_prds, 3D: n_spans, 4D: [label_id, pre_index, post_index]
 391 |         """
 392 |         start = time.time()
 393 |         y = []
 394 | 
 395 |         for index, inputs in enumerate(batches):
 396 |             if (index + 1) % 100 == 0:
 397 |                 sys.stdout.write("%d " % (index + 1))
 398 |                 sys.stdout.flush()
 399 | 
 400 |             if len(inputs) == 0:
 401 |                 span_triples = []
 402 |             else:
 403 |                 span_pred = self.pred_func(*inputs)
 404 |                 span_triples = self.decoder.argmax_span_triples(span_indices=span_pred,
 405 |                                                                 marks=inputs[-1])
 406 |             y.append(span_triples)
 407 | 
 408 |         write('\n\tTime: %f seconds' % (time.time() - start))
 409 |         return y
 410 | 
 411 |     def predict_greedy(self, batches):
 412 |         """
 413 |         :param batches: 1D: n_sents, 2D: n_prds, 3D: n_feats, 4D: n_words; elem=(x_w, x_m)
 414 |         :return: y: 1D: n_sents, 2D: n_prds, 3D: n_spans, 4D: [label_id, pre_index, post_index]
 415 |         """
 416 |         start = time.time()
 417 |         y = []
 418 | 
 419 |         for index, inputs in enumerate(batches):
 420 |             if (index + 1) % 100 == 0:
 421 |                 sys.stdout.write("%d " % (index + 1))
 422 |                 sys.stdout.flush()
 423 | 
 424 |             if len(inputs) == 0:
 425 |                 span_triples = []
 426 |             else:
 427 |                 scores = self.pred_func(*inputs)
 428 |                 span_triples = self.decoder.greedy_span_triples(scores=scores,
 429 |                                                                 marks=inputs[-1])
 430 |             y.append(span_triples)
 431 | 
 432 |         write('\n\tTime: %f seconds' % (time.time() - start))
 433 |         return y
 434 | 
 435 | 
 436 | class BIOModelAPI(SpanModelAPI):
 437 |     def set_model(self, **kwargs):
 438 |         write('Setting a model...')
 439 |         argv = self.argv
 440 | 
 441 |         self.vocab_word = kwargs['vocab_word']
 442 |         self.use_elmo = kwargs['use_elmo']
 443 |         self.vocab_label = kwargs['vocab_label']
 444 |         self.vocab_label_valid = kwargs['vocab_label_valid']
 445 |         word_emb = kwargs['word_emb']
 446 |         vocab_word_size = self.vocab_word.size() if self.vocab_word else 0
 447 | 
 448 |         self.input_dim = argv.emb_dim
if word_emb is None else word_emb.shape[1] 449 | self.hidden_dim = argv.hidden_dim 450 | self.output_dim = self.vocab_label.size() 451 | 452 | self.model = CRFModel() 453 | self.model.compile(inputs=self._set_inputs(), 454 | vocab_word_size=vocab_word_size, 455 | use_elmo=self.use_elmo, 456 | word_emb=word_emb, 457 | input_dim=[self.input_dim, self.input_dim], 458 | hidden_dim=self.hidden_dim, 459 | output_dim=self.output_dim, 460 | n_layers=argv.n_layers, 461 | init_emb=word_emb, 462 | drop_rate=argv.drop_rate) 463 | 464 | write('\t- {}'.format("\n\t- ".join([l.name for l in self.model.layers]))) 465 | self._show_model_config() 466 | 467 | def set_train_func(self): 468 | write('Building a training function...') 469 | 470 | self.optimizer = get_optimizer(self.argv) 471 | self.optimizer.set_params(self.model.params) 472 | if self.argv.load_opt_param: 473 | write('\tLoading optimization params...') 474 | self.optimizer.load_params(self.argv.load_opt_param) 475 | 476 | y_true = T.imatrix('y') 477 | 478 | # 1D: batch_size, 2D: n_words, 3D: output_dim 479 | emit_scores = self.model.get_emit_scores() 480 | # 1D: batch_size, 2D: n_words; elem=label id 481 | y_pred = self.model.label_layer.get_y_pred(emit_scores) 482 | # 1D: batch_size; elem=log proba 483 | y_path_proba = self.model.label_layer.get_y_path_proba(emit_scores, y_true) 484 | 485 | l2_reg = L2Regularizer() 486 | cost = - T.mean(y_path_proba) + l2_reg(alpha=self.argv.reg, 487 | params=self.model.params) 488 | 489 | grads = T.grad(cost=cost, wrt=self.model.params) 490 | updates = self.optimizer(grads=grads, params=self.model.params) 491 | 492 | self.train_func = theano.function( 493 | inputs=self.model.inputs + [y_true], 494 | outputs=[cost, y_pred], 495 | updates=updates, 496 | on_unused_input='warn', 497 | mode='FAST_RUN' 498 | ) 499 | 500 | def set_pred_func(self): 501 | write('Building a predicting function...') 502 | 503 | # 1D: batch_size, 2D: n_words, 3D: output_dim 504 | o = self.model.get_emit_scores() 505 | # 1D: batch_size, 2D: n_words; elem=label id 506 | y_pred = self.model.label_layer.get_y_pred(o) 507 | 508 | self.pred_func = theano.function( 509 | inputs=self.model.inputs, 510 | outputs=y_pred, 511 | on_unused_input='warn', 512 | mode='FAST_RUN' 513 | ) 514 | 515 | def train(self, batches): 516 | start = time.time() 517 | n_batches = 0. 518 | n_samples = 0. 519 | loss_total = 0. 520 | p_total = 0. 521 | r_total = 0. 522 | correct = 0. 
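    |         # Unlike the span model, both precision and recall denominators are
    |         # accumulated per batch here: metrics_for_bio() decodes the gold and
    |         # predicted BIO tag sequences into [label, i, j] spans and counts
    |         # exact matches as correct.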
523 | 524 | self.model.feat_layer.is_train.set_value(1) 525 | 526 | for index, inputs in enumerate(batches): 527 | if (index + 1) % 100 == 0: 528 | sys.stdout.write('%d ' % (index + 1)) 529 | sys.stdout.flush() 530 | 531 | batch_size = len(inputs[0]) 532 | n_words = len(inputs[0][0]) 533 | if n_words < 2 or 100 < n_words: 534 | continue 535 | 536 | loss, y_pred = self.train_func(*inputs) 537 | 538 | if math.isnan(loss): 539 | write('\n\nNAN: Index: %d\n' % (index + 1)) 540 | exit() 541 | 542 | loss_total += loss 543 | n_batches += 1 544 | n_samples += batch_size * n_words 545 | 546 | correct_i, p_total_i, r_total_i = metrics_for_bio(y_true=inputs[-1], 547 | y_pred=y_pred, 548 | vocab_label=self.vocab_label) 549 | correct += correct_i 550 | p_total += p_total_i 551 | r_total += r_total_i 552 | 553 | self.model.feat_layer.is_train.set_value(0) 554 | 555 | avg_loss = loss_total / n_batches 556 | p, r, f = f_score(correct, p_total, r_total) 557 | 558 | write('\n\tTime: %f seconds' % (time.time() - start)) 559 | write('\tAverage Negative Log Likelihood: %f(%f/%d)' % (avg_loss, loss_total, n_batches)) 560 | write('\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})'.format( 561 | f, p, int(correct), int(p_total), r, int(correct), int(r_total))) 562 | 563 | def predict(self, batches): 564 | """ 565 | :param batches: 1D: n_batches, 2D: n_words; elem=(x_w, x_m) 566 | :return: y: 1D: n_batches, 2D: batch_size; elem=(y_pred(1D:n_words), y_proba(float)) 567 | """ 568 | start = time.time() 569 | y = [] 570 | 571 | for index, inputs in enumerate(batches): 572 | if (index + 1) % 100 == 0: 573 | sys.stdout.write("%d " % (index + 1)) 574 | sys.stdout.flush() 575 | 576 | if len(inputs) == 0: 577 | y_pred = [] 578 | elif len(inputs[0][0]) < 2: 579 | y_pred = [[0] for _ in range(len(inputs[0]))] 580 | else: 581 | y_pred = self.pred_func(*inputs) 582 | y.append(y_pred) 583 | 584 | write('\n\tTime: %f seconds' % (time.time() - start)) 585 | return y 586 | -------------------------------------------------------------------------------- /src/srl/models.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | 4 | from nn.layers.embeddings import Embedding, ElmoLayer 5 | from nn.layers.core import Dense, Dropout 6 | from nn.layers.seqlabel import CRF 7 | from nn.layers.stack import BiRNNLayer 8 | from nn.utils import logsumexp3d 9 | 10 | 11 | class Model(object): 12 | def __init__(self): 13 | self.is_train = theano.shared(0, borrow=True) 14 | self.inputs = None 15 | self.outputs = None 16 | self.dropout = None 17 | self.input_layers = [] 18 | self.hidden_layers = [] 19 | self.output_layers = [] 20 | self.layers = [] 21 | self.params = [] 22 | 23 | def compile(self, **kwargs): 24 | raise NotImplementedError 25 | 26 | def _set_params(self): 27 | for l in self.layers: 28 | self.params += l.params 29 | 30 | 31 | class FeatureLayer(Model): 32 | def compile(self, **kwargs): 33 | self._set_layers(kwargs) 34 | self._set_params() 35 | 36 | def forward(self, inputs): 37 | embs = [] 38 | for i in range(len(inputs)): 39 | # 1D: batch_size, 2D: n_words, 3D: input_dim 40 | emb_i = self.input_layers[i].forward(x=inputs[i], 41 | is_train=self.is_train) 42 | embs.append(emb_i) 43 | 44 | # 1D: batch_size, 2D: n_words, 3D: input_dim 45 | x = T.concatenate(tensor_list=embs, axis=2) 46 | # 1D: n_words, 2D: batch_size, 3D: hidden_dim 47 | h = self.hidden_layers[0].forward(x=x.dimshuffle(1, 0, 2), 48 | is_train=self.is_train) 49 | return h 50 | 51 
|     def _set_layers(self, args):
 52 |         x_w_dim, x_m_dim = args['input_dim']
 53 |         hidden_dim = args['hidden_dim']
 54 |         drop_rate = args['drop_rate']
 55 | 
 56 |         ################
 57 |         # Input layers #
 58 |         ################
 59 |         if args['vocab_word_size'] > 0:
 60 |             emb_word = Embedding(input_dim=args['vocab_word_size'],
 61 |                                  output_dim=x_w_dim,
 62 |                                  init_emb=args['word_emb'],
 63 |                                  param_fix=True,  # pre-trained word embeddings are kept fixed
 64 |                                  drop_rate=drop_rate,
 65 |                                  name='EmbWord')
 66 |             self.input_layers.append(emb_word)
 67 | 
 68 |         if args['use_elmo']:
 69 |             emb_elmo = ElmoLayer(drop_rate=0.5,
 70 |                                  name='EmbElmo')
 71 |             self.input_layers.append(emb_elmo)
 72 | 
 73 |         emb_mark = Embedding(input_dim=2,
 74 |                              output_dim=x_m_dim,
 75 |                              init_emb=None,
 76 |                              param_init='xavier',
 77 |                              param_fix=False,
 78 |                              drop_rate=drop_rate,
 79 |                              name='EmbMark')
 80 |         self.input_layers.append(emb_mark)
 81 | 
 82 |         #################
 83 |         # Hidden layers #
 84 |         #################
 85 |         if args['use_elmo']:  # the ELMo layer contributes 1024 dims instead of x_w_dim
 86 |             hidden_input_dim = (len(self.input_layers) - 2) * x_w_dim + x_m_dim + 1024
 87 |         else:
 88 |             hidden_input_dim = (len(self.input_layers) - 1) * x_w_dim + x_m_dim
 89 |         hidden_layer = BiRNNLayer(input_dim=hidden_input_dim,
 90 |                                   output_dim=hidden_dim,
 91 |                                   n_layers=args['n_layers'],
 92 |                                   unit_type='lstm',
 93 |                                   connect_type='dense',
 94 |                                   drop_rate=drop_rate)
 95 |         self.hidden_layers = [hidden_layer]
 96 |         self.layers = self.input_layers + self.hidden_layers
 97 | 
 98 | 
 99 | class LabelLayer(Model):
100 |     def compile(self, **kwargs):
101 |         self._set_layers(hidden_dim=kwargs['feat_dim'],
102 |                          output_dim=kwargs['output_dim'])
103 |         self._set_params()
104 | 
105 |     def _set_layers(self, hidden_dim, output_dim):
106 |         self.layers = [Dense(input_dim=hidden_dim,
107 |                              output_dim=output_dim)]
108 | 
109 |     def span_feats2(self, h):
110 |         """
111 |         :param h: 1D: n_words, 2D: batch_size, 3D: hidden_dim
112 |         :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
113 |         """
114 |         h = h.dimshuffle(1, 0, 2)
115 |         n_words = h.shape[1]
116 | 
117 |         m = T.triu(T.ones(shape=(n_words, n_words)))
118 |         indices = m.nonzero()
119 | 
120 |         # 1D: batch_size, 2D: n_spans, 3D: hidden_dim
121 |         h_i = h[:, indices[0]]
122 |         h_j = h[:, indices[1]]
123 | 
124 |         h_diff = h_i - h_j
125 |         h_add = h_i + h_j
126 | 
127 |         return T.concatenate([h_add, h_diff], axis=2)
128 | 
129 |     def span_feats(self, h):
130 |         """
131 |         :param h: 1D: n_words, 2D: batch_size, 3D: hidden_dim
132 |         :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
133 |         """
134 |         h = h.dimshuffle(1, 0, 2)
135 |         n_words = h.shape[1]
136 |         pad = T.zeros(shape=(h.shape[0], 1, h.shape[2]))
137 |         h_pad = T.concatenate([h, pad], axis=1)
138 | 
139 |         m = T.triu(T.ones(shape=(n_words, n_words)))
140 |         indices = m.nonzero()
141 | 
142 |         # 1D: batch_size, 2D: n_spans, 3D: hidden_dim
143 |         h_i = h[:, indices[0]]
144 |         h_j = h_pad[:, indices[1] + 1]  # hidden state one position past the span end (zero-padded)
145 | 
146 |         h_diff = h_i - h_j
147 |         h_add = h_i + h_j
148 | 
149 |         return T.concatenate([h_add, h_diff], axis=2)
150 | 
151 |     def logit_scores(self, h):
152 |         """
153 |         :param h: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
154 |         :return: 1D: batch_size, 2D: n_labels, 3D: n_spans; score
155 |         """
156 |         return self.layers[-1].forward(h).dimshuffle(0, 2, 1)
157 | 
158 | 
159 | class MoELabelLayer(LabelLayer):
160 |     def __init__(self):
161 |         super(MoELabelLayer, self).__init__()
162 |         self.hidden_dim = -1
163 | 
164 |     def compile(self, **kwargs):
165 |         self.dropout = Dropout(rate=kwargs['drop_rate'])
166 |         self._set_layers(n_experts=kwargs['n_experts'],
167 |                          hidden_dim=kwargs['feat_dim'],
168 | 
output_dim=kwargs['output_dim']) 169 | 170 | def _set_layers(self, n_experts, hidden_dim, output_dim): 171 | mixture = Dense(input_dim=1, 172 | output_dim=n_experts, 173 | activation=None, 174 | use_bias=False, 175 | weight_init='zero', 176 | bias_init='zero') 177 | hidden_layer = Dense(input_dim=hidden_dim, 178 | output_dim=hidden_dim, 179 | weight_init="identity") 180 | output_layer = Dense(input_dim=hidden_dim, 181 | output_dim=output_dim) 182 | self.hidden_dim = hidden_dim 183 | self.layers = [mixture, hidden_layer, output_layer] 184 | 185 | def forward(self, x, experts): 186 | """ 187 | :param x: 1D: n_inputs, 2D: batch_size, 3D: n_words; feat id 188 | :param experts: 1D: n_experts; model 189 | :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 190 | """ 191 | # 1D: 1, 2D: n_experts, 3D: 1 192 | mixture = T.nnet.softmax(self.layers[0].W).dimshuffle('x', 1, 0) 193 | # 1D: 1, 2D: n_experts, 3D: 2 * hidden_dim 194 | mixture = T.repeat(mixture, repeats=self.hidden_dim, axis=2) 195 | 196 | batch_size = x[0].shape[0] 197 | n_words = x[0].shape[1] 198 | n_spans = T.cast(n_words * (n_words + 1) / 2, dtype='int32') 199 | 200 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim; score 201 | h_span = T.zeros(shape=(batch_size, n_spans, self.hidden_dim), 202 | dtype=theano.config.floatX) 203 | 204 | for i, expert in enumerate(experts): 205 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 206 | h_span_tm = expert.span_feats(inputs=x) 207 | h_span = h_span + mixture[:, i] * h_span_tm 208 | 209 | return self.layers[1].forward(h_span) 210 | 211 | 212 | class CRFLayer(Model): 213 | def compile(self, **kwargs): 214 | self._set_layers(kwargs) 215 | self._set_params() 216 | 217 | def _set_layers(self, args): 218 | layer = CRF(input_dim=args['hidden_dim'], 219 | output_dim=args['output_dim']) 220 | self.layers = [layer] 221 | 222 | def forward(self, h): 223 | """ 224 | :param h: 1D: n_words, 2D: batch_size, 3D: hidden_dim 225 | :return: 1D: batch_size, 2D: n_words, 3D: output_dim; elem=emit score 226 | """ 227 | return self.layers[0].forward(x=h).dimshuffle(1, 0, 2) 228 | 229 | def get_y_pred(self, o): 230 | """ 231 | :param o: 1D: batch_size, 2D: n_words, 3D: output_dim; elem=emit score 232 | :return: 1D: batch_size, 2D: n_words; elem=label id 233 | """ 234 | return self.layers[0].get_y_pred(emit_scores=o.dimshuffle(1, 0, 2)) 235 | 236 | def get_y_path_proba(self, o, y_true): 237 | """ 238 | :param o: 1D: batch_size, 2D: n_words, 3D: output_dim; elem=emit score 239 | :param y_true: 1D: batch_size, 2D: n_words; elem=label id 240 | :return: 1D: batch_size; elem=log proba 241 | """ 242 | return self.layers[0].get_y_proba(emit_scores=o.dimshuffle(1, 0, 2), 243 | y_true=y_true.dimshuffle(1, 0)) 244 | 245 | 246 | class SpanModel(Model): 247 | def __init__(self): 248 | super(SpanModel, self).__init__() 249 | self.feat_layer = None 250 | self.label_layer = None 251 | 252 | def compile(self, inputs, **kwargs): 253 | self.inputs = inputs 254 | self.feat_layer = FeatureLayer() 255 | self.feat_layer.compile(**kwargs) 256 | self.label_layer = LabelLayer() 257 | self.label_layer.compile(**kwargs) 258 | self.layers = self.feat_layer.layers + self.label_layer.layers 259 | self._set_params() 260 | 261 | def span_feats(self, inputs): 262 | """ 263 | :param inputs: 1D: n_inputs, 2D: batch_size, 3D: n_words; feat id 264 | :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 265 | """ 266 | # 1D: n_words, 2D: batch_size, 3D: 2 * hidden_dim 267 | h_rnn = self.feat_layer.forward(inputs) 268 | return 
self.label_layer.span_feats(h_rnn) 269 | 270 | @staticmethod 271 | def argmax_span(span_score): 272 | """ 273 | :param span_score: 1D: batch_size, 2D: n_labels, 3D: n_spans 274 | :return: 1D: batch_size, 2D: n_labels; span index 275 | """ 276 | return T.argmax(span_score, axis=2) 277 | 278 | @staticmethod 279 | def loss(span_score, span_true): 280 | """ 281 | :param span_score: 1D: batch_size, 2D: n_labels, 3D: n_spans 282 | :param span_true: 1D: batch_size * n_spans; (batch index, label id, span index) 283 | """ 284 | batch_size = span_score.shape[0] 285 | 286 | # 1D: batch_size * n_spans; index 287 | batch_index = span_true[:, 0] 288 | label_index = span_true[:, 1] 289 | span_index = span_true[:, 2] 290 | 291 | # 1D: batch_size * n_spans; score 292 | true_span_score = span_score[batch_index, label_index, span_index] 293 | 294 | # 1D: batch_size, 2D: n_labels; elem=score 295 | z = logsumexp3d(span_score, axis=2) 296 | # 1D: batch_size * n_spans; score 297 | z = z[batch_index, label_index] 298 | 299 | # 1D: batch_size * n_spans; score 300 | nll = true_span_score - z 301 | 302 | return - T.sum(nll) / batch_size 303 | 304 | @staticmethod 305 | def exp_score(span_score): 306 | """ 307 | :param span_score: 1D: batch_size, 2D: n_labels, 3D: n_spans; logit score 308 | :return: 1D: batch_size, 2D: n_labels, 3D: n_spans 309 | """ 310 | return T.exp(span_score) 311 | 312 | 313 | class MoEModel(SpanModel): 314 | def compile(self, inputs, **kwargs): 315 | self.inputs = inputs 316 | self.feat_layer = MoELabelLayer() 317 | self.feat_layer.compile(**kwargs) 318 | self.layers = self.feat_layer.layers 319 | self._set_params() 320 | 321 | 322 | class CRFModel(Model): 323 | def __init__(self): 324 | super(CRFModel, self).__init__() 325 | self.feat_layer = None 326 | self.label_layer = None 327 | 328 | def compile(self, inputs, **kwargs): 329 | self.inputs = inputs 330 | self.feat_layer = FeatureLayer() 331 | self.feat_layer.compile(**kwargs) 332 | self.label_layer = CRFLayer() 333 | self.label_layer.compile(**kwargs) 334 | self.layers = self.feat_layer.layers + self.label_layer.layers 335 | self._set_params() 336 | 337 | def get_emit_scores(self): 338 | """ 339 | :return: 1D: batch_size, 2D: n_words, 3D: output_dim 340 | """ 341 | h = self.feat_layer.forward(self.inputs) 342 | return self.label_layer.forward(h) 343 | 344 | -------------------------------------------------------------------------------- /src/srl/preprocessors.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | 6 | from utils.vocab import Vocab, UNK 7 | from utils.sent import Conll05Sent, Conll12Sent 8 | from utils.misc import span_to_span_index, make_vocab_from_ids 9 | from utils.savers import save_key_value_format 10 | from utils.loaders import load_key_value_format 11 | 12 | 13 | class Preprocessor(object): 14 | def __init__(self, argv): 15 | self.argv = argv 16 | self.data_type = argv.data_type 17 | 18 | @staticmethod 19 | def make_vocab_word(word_list): 20 | vocab_word = Vocab() 21 | vocab_word.add_word(UNK) 22 | for w in word_list: 23 | vocab_word.add_word(w) 24 | return vocab_word 25 | 26 | def make_and_save_vocab_label(self, 27 | sents, 28 | vocab_label_init=None, 29 | save=False, 30 | load=False): 31 | argv = self.argv 32 | 33 | if load and argv.load_label: 34 | label_key_value = load_key_value_format(argv.load_label) 35 | vocab_label = make_vocab_from_ids(label_key_value) 36 | else: 37 | vocab_label = 
self.make_vocab_label(sents=sents, 38 | vocab_label_init=vocab_label_init) 39 | if save: 40 | if argv.output_dir: 41 | dir_name = argv.output_dir 42 | else: 43 | dir_name = 'output' 44 | if argv.output_fn: 45 | file_name = '/label_ids.' + argv.output_fn 46 | else: 47 | file_name = '/label_ids' 48 | 49 | fn = dir_name + file_name 50 | values, keys = map(lambda x: x, zip(*enumerate(vocab_label.i2w))) 51 | save_key_value_format(fn=fn, keys=keys, values=values) 52 | 53 | return vocab_label 54 | 55 | def make_sents(self, corpus): 56 | """ 57 | :param corpus: 1D: n_sents, 2D: n_words 58 | :return: 1D: n_sents 59 | """ 60 | if len(corpus) == 0: 61 | return [] 62 | 63 | if self.data_type == 'conll05': 64 | column = 6 65 | gen_sent = Conll05Sent 66 | else: 67 | column = 12 68 | gen_sent = Conll12Sent 69 | 70 | is_test = True if len(corpus[0][0]) < column else False 71 | return [gen_sent(sent, is_test) for sent in corpus] 72 | 73 | @staticmethod 74 | def split_x_and_y(batches, index=-1): 75 | """ 76 | :param batches: 1D: n_batches, 2D: batch_size; elem=(x, m, y) 77 | :param index: split column index 78 | :return 1D: n_batches, 2D: batch_size; elem=(x, m) 79 | :return 1D: n_batches, 2D: batch_size; elem=y 80 | """ 81 | x = [] 82 | y = [] 83 | for batch in batches: 84 | x.append(batch[:index]) 85 | y.append(batch[index]) 86 | return x, y 87 | 88 | def make_batches(self, 89 | samples, 90 | is_valid_data=False, 91 | shuffle=True): 92 | """ 93 | :param samples: 1D: n_samples, 2D: [x, m, y] 94 | :param is_valid_data: boolean 95 | :param shuffle: boolean 96 | :return 1D: n_batches, 2D: batch_size; elem=[x, m, y] 97 | """ 98 | if shuffle: 99 | np.random.shuffle(samples) 100 | samples.sort(key=lambda sample: len(sample[0])) 101 | 102 | batches = [] 103 | batch = [] 104 | prev_n_words = len(samples[0][0]) 105 | 106 | for sample in samples: 107 | n_words = len(sample[0]) 108 | if len(batch) == self.argv.batch_size or prev_n_words != n_words: 109 | batches.append(self._make_one_batch(batch, is_valid_data)) 110 | batch = [] 111 | prev_n_words = n_words 112 | batch.append(sample) 113 | 114 | if batch: 115 | batches.append(self._make_one_batch(batch, is_valid_data)) 116 | 117 | if shuffle: 118 | np.random.shuffle(batches) 119 | 120 | for batch in batches: 121 | yield batch 122 | 123 | @staticmethod 124 | def _make_one_batch(batch, is_valid_data): 125 | raise NotImplementedError 126 | 127 | @staticmethod 128 | def make_batch_per_sent(sents): 129 | """ 130 | :param sents: 1D: n_sents; Sent() 131 | :return 1D: n_sents, 2D: n_prds; elem=[x, m] 132 | """ 133 | batches = [] 134 | for sent in sents: 135 | x = [] 136 | 137 | x_word_ids = sent.word_ids 138 | if x_word_ids is not None: 139 | x.append(x_word_ids) 140 | 141 | x_elmo_emb = sent.elmo_emb 142 | if x_elmo_emb is not None: 143 | x.append(x_elmo_emb) 144 | 145 | batch = list(map(lambda m: x + [m], sent.mark_ids)) 146 | batches.append(list(map(lambda b: b, zip(*batch)))) 147 | 148 | return batches 149 | 150 | @staticmethod 151 | def set_sent_config(sents, elmo_emb, vocab_word, vocab_label): 152 | raise NotImplementedError 153 | 154 | @staticmethod 155 | def make_samples(sents, is_valid_data=False): 156 | raise NotImplementedError 157 | 158 | def make_vocab_label(self, 159 | sents, 160 | vocab_label_init=None): 161 | raise NotImplementedError 162 | 163 | 164 | class SpanPreprocessor(Preprocessor): 165 | def make_vocab_label(self, 166 | sents, 167 | vocab_label_init=None): 168 | if len(sents) == 0: 169 | return None 170 | 171 | if vocab_label_init: 172 | vocab_label = 
deepcopy(vocab_label_init) 173 | else: 174 | vocab_label = Vocab() 175 | if self.argv.data_type == 'conll05': 176 | core_labels = ["A0", "A1", "A2", "A3", "A4", "A5"] 177 | else: 178 | core_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"] 179 | for label in core_labels: 180 | vocab_label.add_word(label) 181 | 182 | bio_labels = [] 183 | for sent in sents: 184 | for props in sent.prd_bio_labels: 185 | bio_labels += props 186 | cnt = Counter(bio_labels) 187 | bio_labels = [(w, c) for w, c in cnt.most_common()] 188 | 189 | for label, count in bio_labels: 190 | if not label.endswith('-V') and len(label) > 1: 191 | vocab_label.add_word(label[2:]) 192 | 193 | return vocab_label 194 | 195 | @staticmethod 196 | def set_sent_config(sents, elmo_emb, vocab_word, vocab_label): 197 | for index, sent in enumerate(sents): 198 | sent.set_mark_ids() 199 | if vocab_word: 200 | sent.set_word_ids(vocab_word) 201 | if elmo_emb: 202 | sent.set_elmo_emb(elmo_emb[str(index)]) 203 | if vocab_label: 204 | sent.set_span_triples(vocab_label) 205 | sent.set_span_triples_with_null(vocab_label.size()) 206 | return sents 207 | 208 | @staticmethod 209 | def make_samples(sents, is_valid_data=False): 210 | samples = [] 211 | 212 | for sent in sents: 213 | x = [] 214 | 215 | x_word_ids = sent.word_ids 216 | if x_word_ids is not None: 217 | x.append(x_word_ids) 218 | 219 | x_elmo_emb = sent.elmo_emb 220 | if x_elmo_emb is not None: 221 | x.append(x_elmo_emb) 222 | 223 | if is_valid_data: 224 | triples = sent.span_triples 225 | else: 226 | triples = sent.span_triples_with_null 227 | 228 | assert len(sent.mark_ids) == len(triples) 229 | for m, spans in zip(sent.mark_ids, triples): 230 | # spans: 1D: n_spans, 2D: (r, i, j) 231 | samples.append(x + [m, spans]) 232 | 233 | return samples 234 | 235 | @staticmethod 236 | def _make_one_batch(batch, is_valid_data): 237 | if is_valid_data: 238 | return list(map(lambda b: b, zip(*batch))) 239 | 240 | b = [] 241 | y = [] 242 | n_words = len(batch[0][0]) 243 | for b_index, sample in enumerate(batch): 244 | b.append(sample[:-1]) 245 | y_tmp = [] 246 | for (r, i, j) in sample[-1]: 247 | span_index = span_to_span_index(i, j, n_words) 248 | y_tmp.append([b_index, r, span_index]) 249 | y += y_tmp 250 | 251 | x = list(map(lambda b_i: b_i, zip(*b))) 252 | 253 | return x + [y] 254 | 255 | 256 | class BIOPreprocessor(Preprocessor): 257 | def make_vocab_label(self, 258 | sents, 259 | vocab_label_init=None): 260 | if len(sents) == 0: 261 | return None 262 | 263 | if vocab_label_init: 264 | vocab_label = deepcopy(vocab_label_init) 265 | else: 266 | vocab_label = Vocab() 267 | none_label = 'O' 268 | vocab_label.add_word(none_label) 269 | 270 | labels = [] 271 | for sent in sents: 272 | if sent.has_prds: 273 | for prop in sent.prd_bio_labels: 274 | labels += prop 275 | cnt = Counter(labels) 276 | labels = [(w, c) for w, c in cnt.most_common()] 277 | 278 | for label, count in labels: 279 | vocab_label.add_word(label) 280 | 281 | return vocab_label 282 | 283 | @staticmethod 284 | def set_sent_config(sents, elmo_emb, vocab_word, vocab_label): 285 | for index, sent in enumerate(sents): 286 | sent.set_mark_ids() 287 | if vocab_word: 288 | sent.set_word_ids(vocab_word) 289 | if elmo_emb: 290 | sent.set_elmo_emb(elmo_emb[str(index)]) 291 | if vocab_label: 292 | sent.set_label_ids(vocab_label) 293 | return sents 294 | 295 | @staticmethod 296 | def make_samples(sents, is_valid_data=False): 297 | samples = [] 298 | 299 | for sent in sents: 300 | x = [] 301 | 302 | x_word_ids = sent.word_ids 303 | if 
x_word_ids is not None: 304 | x.append(x_word_ids) 305 | 306 | x_elmo_emb = sent.elmo_emb 307 | if x_elmo_emb is not None: 308 | x.append(x_elmo_emb) 309 | 310 | assert len(sent.mark_ids) == len(sent.bio_label_ids) 311 | for m, spans in zip(sent.mark_ids, sent.bio_label_ids): 312 | samples.append(x + [m, spans]) 313 | 314 | return samples 315 | 316 | @staticmethod 317 | def _make_one_batch(batch, is_valid_data): 318 | return list(map(lambda b: b, zip(*batch))) 319 | -------------------------------------------------------------------------------- /src/srl/testers.py: -------------------------------------------------------------------------------- 1 | from utils.loaders import load_emb 2 | from utils.misc import write, make_vocab_from_ids 3 | 4 | 5 | class Tester(object): 6 | def __init__(self, 7 | argv, 8 | loader, 9 | saver, 10 | preprocessor, 11 | evaluator, 12 | model_api): 13 | self.argv = argv 14 | self.loader = loader 15 | self.saver = saver 16 | self.preprocessor = preprocessor 17 | self.evaluator = evaluator 18 | self.model_api = model_api 19 | 20 | def predict(self): 21 | argv = self.argv 22 | pproc = self.preprocessor 23 | loader = self.loader 24 | 25 | ################ 26 | # Load dataset # 27 | ################ 28 | write('Loading Dataset...') 29 | test_corpus = loader.load(path=argv.test_data, 30 | data_size=argv.data_size, 31 | is_test=True) 32 | test_sents = pproc.make_sents(test_corpus) 33 | 34 | ################# 35 | # Load init emb # 36 | ################# 37 | if argv.word_emb: 38 | write('Loading Embeddings...') 39 | word_list, word_emb = load_emb(argv.word_emb) 40 | vocab_word = pproc.make_vocab_word(word_list) 41 | write('\t- # Embedding Words: %d' % vocab_word.size()) 42 | else: 43 | vocab_word = word_emb = None 44 | 45 | if argv.test_elmo_emb: 46 | write('Loading ELMo Embeddings...') 47 | test_elmo_emb = loader.load_hdf5(argv.test_elmo_emb) 48 | else: 49 | test_elmo_emb = None 50 | 51 | ############### 52 | # Make labels # 53 | ############### 54 | label_key_value = loader.load_key_value_format(argv.load_label) 55 | vocab_label = make_vocab_from_ids(label_key_value) 56 | write('\t- # Labels: %d' % vocab_label.size()) 57 | 58 | ################### 59 | # Set sent params # 60 | ################### 61 | test_sents = pproc.set_sent_config(sents=test_sents, 62 | elmo_emb=test_elmo_emb, 63 | vocab_word=vocab_word, 64 | vocab_label=None) 65 | ################ 66 | # Make samples # 67 | ################ 68 | write('Making Test Samples...') 69 | test_batches = pproc.make_batch_per_sent(sents=test_sents) 70 | write('\t- # Test Samples: %d' % len(test_batches)) 71 | 72 | ############# 73 | # Model API # 74 | ############# 75 | use_elmo = True if test_elmo_emb is not None else False 76 | 77 | if argv.n_experts > 0: 78 | self.model_api.set_ensemble_model(word_emb=word_emb, 79 | use_elmo=use_elmo, 80 | vocab_word=vocab_word, 81 | vocab_label=vocab_label, 82 | vocab_label_valid=None) 83 | self.model_api.load_params(argv.load_param) 84 | self.model_api.load_experts_params(argv.load_param_dir) 85 | self.model_api.set_ensemble_pred_func() 86 | else: 87 | self.model_api.set_model(word_emb=word_emb, 88 | use_elmo=use_elmo, 89 | vocab_word=vocab_word, 90 | vocab_label=vocab_label, 91 | vocab_label_valid=None) 92 | self.model_api.load_params(argv.load_param) 93 | self.model_api.set_pred_func() 94 | 95 | ########### 96 | # Testing # 97 | ########### 98 | write('\nPREDICTION START') 99 | test_y_pred = self.model_api.predict(test_batches) 100 | self.saver.save_props(corpus=test_sents, 101 | 
labels=test_y_pred, 102 | vocab_label=vocab_label) 103 | self.saver.save_json_format(corpus=test_sents, 104 | labels=test_y_pred, 105 | vocab_label=vocab_label) 106 | -------------------------------------------------------------------------------- /src/srl/trainers.py: -------------------------------------------------------------------------------- 1 | from utils.evaluators import count_true_spans 2 | from utils.loaders import load_emb 3 | from utils.misc import write, show_score_history 4 | from utils.misc import make_output_dir, get_file_names_in_dir, get_latest_param_fn 5 | 6 | 7 | class Trainer(object): 8 | def __init__(self, 9 | argv, 10 | loader, 11 | preprocessor, 12 | evaluator, 13 | model_api): 14 | self.argv = argv 15 | self.loader = loader 16 | self.preprocessor = preprocessor 17 | self.evaluator = evaluator 18 | self.model_api = model_api 19 | 20 | self.f1_history = {} 21 | self.best_valid_f1 = 0.0 22 | self.best_epoch = -1 23 | 24 | def train(self): 25 | write('\nTRAINING START\n') 26 | 27 | argv = self.argv 28 | loader = self.loader 29 | pproc = self.preprocessor 30 | 31 | make_output_dir(self.argv) 32 | 33 | ################# 34 | # Load word emb # 35 | ################# 36 | if argv.word_emb: 37 | write('Loading Word Embeddings...') 38 | word_list, word_emb = load_emb(argv.word_emb) 39 | vocab_word = pproc.make_vocab_word(word_list) 40 | write('\t- # Vocabs: %d' % vocab_word.size()) 41 | else: 42 | vocab_word = word_emb = None 43 | 44 | ################# 45 | # Load elmo emb # 46 | ################# 47 | if self.argv.train_elmo_emb: 48 | write('Loading ELMo Embeddings...') 49 | train_elmo_emb = loader.load_hdf5(self.argv.train_elmo_emb) 50 | else: 51 | train_elmo_emb = None 52 | if self.argv.dev_elmo_emb: 53 | valid_elmo_emb = loader.load_hdf5(self.argv.dev_elmo_emb) 54 | else: 55 | valid_elmo_emb = None 56 | 57 | ############### 58 | # Load corpus # 59 | ############### 60 | write('Loading Corpus...') 61 | train_corpus = loader.load(path=argv.train_data, 62 | data_size=argv.data_size, 63 | is_test=False) 64 | valid_corpus = loader.load(path=argv.dev_data, 65 | data_size=argv.data_size, 66 | is_test=False) 67 | write('\t- # Sents: Train:%d Valid:%d' % (len(train_corpus), len(valid_corpus))) 68 | 69 | ############## 70 | # Make sents # 71 | ############## 72 | train_sents = pproc.make_sents(train_corpus) 73 | valid_sents = pproc.make_sents(valid_corpus) 74 | 75 | ############### 76 | # Make labels # 77 | ############### 78 | write('Making Labels...') 79 | vocab_label_train = pproc.make_and_save_vocab_label(sents=train_sents, 80 | vocab_label_init=None, 81 | save=argv.save, 82 | load=True) 83 | vocab_label_valid = pproc.make_and_save_vocab_label(sents=valid_sents, 84 | vocab_label_init=vocab_label_train, 85 | save=False, 86 | load=False) 87 | write('\t- # Labels: %d' % vocab_label_train.size()) 88 | 89 | ################### 90 | # Set sent params # 91 | ################### 92 | train_sents = pproc.set_sent_config(sents=train_sents, 93 | elmo_emb=train_elmo_emb, 94 | vocab_word=vocab_word, 95 | vocab_label=vocab_label_train) 96 | valid_sents = pproc.set_sent_config(sents=valid_sents, 97 | elmo_emb=valid_elmo_emb, 98 | vocab_word=vocab_word, 99 | vocab_label=vocab_label_valid) 100 | 101 | ################ 102 | # Make samples # 103 | ################ 104 | write('Making Samples...') 105 | train_samples = pproc.make_samples(sents=train_sents, 106 | is_valid_data=False) 107 | valid_samples = pproc.make_samples(sents=valid_sents, 108 | is_valid_data=True) 109 | write('\t- # 
Samples: Train:%d Valid:%d' % (len(train_samples), 110 | len(valid_samples))) 111 | 112 | ################# 113 | # Set Model API # 114 | ################# 115 | if train_elmo_emb is not None: 116 | use_elmo = True 117 | else: 118 | use_elmo = False 119 | 120 | if argv.n_experts > 0: 121 | is_ensemble = True 122 | else: 123 | is_ensemble = False 124 | 125 | if argv.method == 'span': 126 | self.model_api.n_true_spans = count_true_spans(train_sents) 127 | 128 | if is_ensemble: 129 | self.model_api.set_ensemble_model(word_emb=word_emb, 130 | use_elmo=use_elmo, 131 | vocab_word=vocab_word, 132 | vocab_label=vocab_label_train, 133 | vocab_label_valid=vocab_label_valid) 134 | self.model_api.load_experts_params(argv.load_param_dir) 135 | self.model_api.set_init_ensemble_param() 136 | self.model_api.set_ensemble_train_func() 137 | if self.model_api.vocab_label_valid: 138 | self.model_api.set_ensemble_pred_func() 139 | init_epoch = 0 140 | else: 141 | self.model_api.set_model(word_emb=word_emb, 142 | use_elmo=use_elmo, 143 | vocab_word=vocab_word, 144 | vocab_label=vocab_label_train, 145 | vocab_label_valid=vocab_label_valid) 146 | if argv.load_param_latest: 147 | if argv.output_dir: 148 | dir_name = argv.output_dir 149 | else: 150 | dir_name = 'output' 151 | param_fns = get_file_names_in_dir(dir_path=dir_name, 152 | prefix='param') 153 | opt_param_fns = get_file_names_in_dir(dir_path=dir_name, 154 | prefix='opt') 155 | param_fn, latest_epoch = get_latest_param_fn(file_names=param_fns) 156 | opt_param_fn, _ = get_latest_param_fn(file_names=opt_param_fns) 157 | self.model_api.argv.load_param = param_fn 158 | self.model_api.argv.load_opt_param = opt_param_fn 159 | self.model_api.load_params(param_fn) 160 | init_epoch = latest_epoch + 1 161 | elif argv.load_param: 162 | self.model_api.load_params(argv.load_param) 163 | init_epoch = 0 164 | else: 165 | init_epoch = 0 166 | 167 | self.model_api.set_train_func() 168 | if self.model_api.vocab_label_valid: 169 | self.model_api.set_pred_func() 170 | 171 | ####################### 172 | # Run training epochs # 173 | ####################### 174 | self._run_epochs(train_samples, valid_samples, init_epoch) 175 | 176 | def _run_epochs(self, train_samples, valid_samples=None, init_epoch=0): 177 | write('\nTRAIN START') 178 | 179 | argv = self.argv 180 | pproc = self.preprocessor 181 | vocab_label_valid = self.model_api.vocab_label_valid 182 | 183 | if valid_samples: 184 | valid_batches = pproc.make_batches(samples=valid_samples, 185 | is_valid_data=True) 186 | valid_batch_x, valid_batch_y = pproc.split_x_and_y(valid_batches) 187 | else: 188 | valid_batch_x = valid_batch_y = [] 189 | 190 | ########################################## 191 | # Initial result with pre-trained params # 192 | ########################################## 193 | if (argv.load_param or argv.load_param_dir) and valid_samples: 194 | write('\nEpoch: 0 (Using the Pre-trained Params)') 195 | write('VALID') 196 | valid_batch_y_pred = self.model_api.predict(valid_batch_x) 197 | self.best_valid_f1 = self.evaluator.f_score(y_true=valid_batch_y, 198 | y_pred=valid_batch_y_pred, 199 | vocab_label=vocab_label_valid) 200 | 201 | ############# 202 | # Main loop # 203 | ############# 204 | for epoch in range(init_epoch, argv.epoch): 205 | write('\nEpoch: %d' % (epoch + 1)) 206 | write('TRAIN') 207 | 208 | if argv.halve_lr and epoch > 49 and (epoch % 25) == 0: 209 | lr = self.model_api.optimizer.lr.get_value(borrow=True) 210 | self.model_api.optimizer.lr.set_value(lr * 0.5) 211 | write('### HALVE LEARNING 
RATE: %f -> %f' % (lr, lr * 0.5)) 212 | 213 | ############ 214 | # Training # 215 | ############ 216 | train_batches = pproc.make_batches(train_samples) 217 | self.model_api.train(train_batches) 218 | 219 | ############## 220 | # Validating # 221 | ############## 222 | if valid_samples: 223 | write('VALID') 224 | valid_batch_y_pred = self.model_api.predict(valid_batch_x) 225 | valid_f1 = self.evaluator.f_score(y_true=valid_batch_y, 226 | y_pred=valid_batch_y_pred, 227 | vocab_label=vocab_label_valid) 228 | if self.best_valid_f1 < valid_f1: 229 | self.best_valid_f1 = valid_f1 230 | self.best_epoch = epoch 231 | self.f1_history[self.best_epoch + 1] = [self.best_valid_f1] 232 | 233 | if argv.save: 234 | self.model_api.save_params(epoch=0) 235 | self.model_api.optimizer.save_params(epoch=0) 236 | 237 | show_score_history(self.f1_history) 238 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiroki13/span-based-srl/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/evaluators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from utils.misc import write, span_to_span_index 4 | 5 | 6 | class Evaluator(object): 7 | def __init__(self, argv): 8 | self.argv = argv 9 | 10 | def f_score(self, y_true, y_pred, vocab_label): 11 | """ 12 | :param y_true: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index] 13 | :param y_pred: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index] 14 | """ 15 | correct, p_total, r_total = self.metrics(y_true=y_true, 16 | y_pred=y_pred, 17 | vocab_label=vocab_label) 18 | p, r, f = f_score(correct, p_total, r_total) 19 | write('\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})'.format( 20 | f, p, int(correct), int(p_total), r, int(correct), int(r_total)) 21 | ) 22 | return f 23 | 24 | def metrics(self, **kwargs): 25 | raise NotImplementedError 26 | 27 | 28 | class SpanEvaluator(Evaluator): 29 | def metrics(self, y_true, y_pred, vocab_label): 30 | """ 31 | :param y_true: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index] 32 | :param y_pred: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index] 33 | """ 34 | p_total = 0. 35 | r_total = 0. 36 | correct = 0. 37 | for span_true_batch, span_pred_batch in zip(y_true, y_pred): 38 | for spans_true, spans_pred in zip(span_true_batch, span_pred_batch): 39 | spans_true = concat_c_spans_from_spans(spans_true, vocab_label) 40 | spans_pred = concat_c_spans_from_spans(spans_pred, vocab_label) 41 | p_total += len(spans_pred) 42 | r_total += len(spans_true) 43 | for span in spans_pred: 44 | if span in spans_true: 45 | correct += 1 46 | return correct, p_total, r_total 47 | 48 | 49 | class BIOEvaluator(Evaluator): 50 | def metrics(self, y_true, y_pred, vocab_label): 51 | p_total = 0. 52 | r_total = 0. 53 | correct = 0. 
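    |         # Example of the BIO decoding performed below ('-V' spans are skipped):
    |         #     [B-A0, I-A0, O, B-V, B-A1] -> [['A0', 0, 1], ['A1', 4, 4]]
    |         # A predicted span counts as correct only if its label and both
    |         # boundaries match a gold span.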
54 |         for y_true_batch, y_pred_batch in zip(y_true, y_pred):
 55 |             for y_true_i, y_pred_i in zip(y_true_batch, y_pred_batch):
 56 |                 y_true_spans = get_spans_from_bio_labels(y_true_i, vocab_label)
 57 |                 y_pred_spans = get_spans_from_bio_labels(y_pred_i, vocab_label)
 58 |                 p_total += len(y_pred_spans)
 59 |                 r_total += len(y_true_spans)
 60 |                 for y_pred_span in y_pred_spans:
 61 |                     if y_pred_span in y_true_spans:
 62 |                         correct += 1.
 63 |         return correct, p_total, r_total
 64 | 
 65 | 
 66 | def f_score(correct, p_total, r_total):
 67 |     precision = correct / p_total if p_total > 0 else 0.
 68 |     recall = correct / r_total if r_total > 0 else 0.
 69 |     f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0.
 70 |     return precision, recall, f1
 71 | 
 72 | 
 73 | def get_spans_from_bio_labels(sent, vocab_label):
 74 |     spans = []
 75 |     span = []
 76 |     for w_i, label_id in enumerate(sent):
 77 |         label = vocab_label.get_word(label_id)
 78 |         if label[-2:] == '-V':
 79 |             continue
 80 |         if label.startswith('B-'):
 81 |             if span:
 82 |                 spans.append(span)
 83 |             span = [label[2:], w_i, w_i]
 84 |         elif label.startswith('I-'):
 85 |             if span:
 86 |                 if label[2:] == span[0]:
 87 |                     span[2] = w_i
 88 |                 else:
 89 |                     spans.append(span)
 90 |                     span = [label[2:], w_i, w_i]
 91 |             else:
 92 |                 span = [label[2:], w_i, w_i]
 93 |         else:
 94 |             if span:
 95 |                 spans.append(span)
 96 |             span = []
 97 |     if span:
 98 |         spans.append(span)
 99 | 
100 |     return concat_c_spans_from_bio_labels(spans)
101 | 
102 | 
103 | def concat_c_spans_from_bio_labels(spans):
104 |     labels = [span[0] for span in spans]
105 |     c_indices = [i for i, span in enumerate(spans) if span[0].startswith('C')]
106 |     non_ant_c_spans = []
107 | 
108 |     for c_index in c_indices:
109 |         c_span = spans[c_index]
110 |         c_label = c_span[0][2:]
111 |         if c_label in labels:
112 |             spans[labels.index(c_label)].extend(c_span[1:])
113 |         else:
114 |             non_ant_c_spans.append([c_label] + c_span[1:])
115 |     concated_spans = [span for i, span in enumerate(spans) if i not in c_indices]
116 |     spans = concated_spans + non_ant_c_spans
117 |     return spans
118 | 
119 | 
120 | def concat_c_spans_from_spans(spans, vocab_label):
121 |     spans = [[vocab_label.get_word(l), i, j] for (l, i, j) in spans]
122 |     labels = [l for (l, i, j) in spans]
123 |     c_indices = [index for index, (l, i, j) in enumerate(spans) if l.startswith('C')]
124 |     non_ant_c_spans = []
125 | 
126 |     for c_index in c_indices:
127 |         c_span = spans[c_index]
128 |         label = c_span[0][2:]
129 |         if label in labels:
130 |             spans[labels.index(label)].extend(c_span[1:])
    |         else:
    |             # keep C- spans without an antecedent, as concat_c_spans_from_bio_labels() does
    |             non_ant_c_spans.append([label] + c_span[1:])
131 | 
132 |     concated_spans = [span for i, span in enumerate(spans) if i not in c_indices]
133 |     spans = concated_spans + non_ant_c_spans
134 |     return spans
135 | 
136 | 
137 | def metrics_for_bio(y_true, y_pred, vocab_label):
138 |     p_total = 0.
139 |     r_total = 0.
140 |     correct = 0.
141 |     for y_true_i, y_pred_i in zip(y_true, y_pred):
142 |         y_true_spans = get_spans_from_bio_labels(y_true_i, vocab_label)
143 |         y_pred_spans = get_spans_from_bio_labels(y_pred_i, vocab_label)
144 |         p_total += len(y_pred_spans)
145 |         r_total += len(y_true_spans)
146 |         for y_pred_span in y_pred_spans:
147 |             if y_pred_span in y_true_spans:
148 |                 correct += 1.
149 |     return correct, p_total, r_total
150 | 
151 | 
152 | def correct_and_pred_spans(span_true, span_pred, marks):
153 |     """
154 |     :param span_true: 1D: batch_size * n_spans, 2D: [batch index, label id, span index]
155 |     :param span_pred: 1D: batch_size, 2D: n_labels; span index
156 |     :param marks: 1D: batch_size, 2D: n_words; elem=0/1
157 |     """
158 |     correct = 0.
159 |     n_pred_spans = 0.
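    |     # The argmax decoder proposes one best span per label; predictions that
    |     # coincide with the predicate's own single-word span are treated as
    |     # "no span for this label" and excluded from the precision count below.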
160 | n_words = len(marks[0]) 161 | _, prd_indices = np.array(marks).nonzero() 162 | prd_indices = [span_to_span_index(p, p, n_words) for p in prd_indices] 163 | 164 | for b_index, span_pred_tmp in enumerate(span_pred): 165 | prd_index = prd_indices[b_index] 166 | for label_id, span_index in enumerate(span_pred_tmp): 167 | if span_index == prd_index: 168 | continue 169 | if [b_index, label_id, span_index] in span_true: 170 | correct += 1 171 | n_pred_spans += 1 172 | 173 | return correct, n_pred_spans 174 | 175 | 176 | def count_true_spans(sents): 177 | """ 178 | :param sents: 1D: n_sents 179 | :return: total number of spans 180 | """ 181 | return sum([len(triple) for sent in sents for triple in sent.span_triples]) 182 | -------------------------------------------------------------------------------- /src/utils/loaders.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gzip 3 | import pickle 4 | import h5py 5 | 6 | import numpy as np 7 | import theano 8 | 9 | from utils.misc import get_file_names_in_dir 10 | from utils.vocab import UNK 11 | 12 | 13 | class Loader(object): 14 | def __init__(self, argv): 15 | self.argv = argv 16 | 17 | def load(self, **kwargs): 18 | raise NotImplementedError 19 | 20 | @staticmethod 21 | def load_data(fn): 22 | with gzip.open(fn, 'rb') as gf: 23 | return pickle.load(gf) 24 | 25 | @staticmethod 26 | def load_key_value_format(fn): 27 | data = [] 28 | with open(fn, 'r') as f: 29 | for line in f: 30 | key, value = line.rstrip().split() 31 | data.append((key, int(value))) 32 | return data 33 | 34 | @staticmethod 35 | def load_hdf5(path): 36 | return h5py.File(path, 'r') 37 | 38 | def load_txt_from_dir(self, dir_path, file_prefix): 39 | file_names = get_file_names_in_dir(dir_path + '/*') 40 | file_names = [fn for fn in file_names 41 | if os.path.basename(fn).startswith(file_prefix) 42 | and fn.endswith('txt')] 43 | return [self.load(path=fn) for fn in file_names] 44 | 45 | def load_hdf5_from_dir(self, dir_path, file_prefix): 46 | file_names = get_file_names_in_dir(dir_path + '/*') 47 | file_names = [fn for fn in file_names 48 | if os.path.basename(fn).startswith(file_prefix) 49 | and fn.endswith('hdf5')] 50 | return [self.load_hdf5(fn) for fn in file_names] 51 | 52 | 53 | class Conll05Loader(Loader): 54 | 55 | def load(self, path, data_size=1000000, is_test=False): 56 | if path is None: 57 | return [] 58 | 59 | corpus = [] 60 | sent = [] 61 | 62 | with open(path) as f: 63 | for line in f: 64 | elem = [l for l in line.rstrip().split()] 65 | if len(elem) > 0: 66 | if is_test: 67 | sent.append(elem[:6]) 68 | else: 69 | sent.append(elem) 70 | else: 71 | corpus.append(sent) 72 | sent = [] 73 | if len(corpus) >= data_size: 74 | break 75 | return corpus 76 | 77 | 78 | class Conll12Loader(Loader): 79 | 80 | def load(self, path, data_size=1000000, is_test=False): 81 | if path is None: 82 | return [] 83 | 84 | corpus = [] 85 | sent = [] 86 | 87 | with open(path) as f: 88 | for line in f: 89 | elem = [l for l in line.rstrip().split()] 90 | if len(elem) > 10: 91 | if is_test: 92 | sent.append(elem[:11]) 93 | else: 94 | sent.append(elem) 95 | elif len(elem) == 0: 96 | corpus.append(sent) 97 | sent = [] 98 | if len(corpus) >= data_size: 99 | break 100 | return corpus 101 | 102 | 103 | def load_emb(path): 104 | word_list = [] 105 | emb = [] 106 | with open(path) as f: 107 | for line in f: 108 | line = line.rstrip().split() 109 | word_list.append(line[0]) 110 | emb.append(line[1:]) 111 | emb = np.asarray(emb, 
dtype=theano.config.floatX) 112 | 113 | if UNK not in word_list: 114 | word_list = [UNK] + word_list 115 | unk_vector = np.mean(emb, axis=0) 116 | emb = np.vstack((unk_vector, emb)) 117 | 118 | return word_list, emb 119 | 120 | 121 | def load_pickle(fn): 122 | with gzip.open(fn, 'rb') as gf: 123 | return pickle.load(gf) 124 | 125 | 126 | def load_key_value_format(fn): 127 | data = [] 128 | with open(fn, 'r') as f: 129 | for line in f: 130 | key, value = line.rstrip().split() 131 | data.append((key, int(value))) 132 | return data 133 | -------------------------------------------------------------------------------- /src/utils/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | 5 | import numpy as np 6 | import theano 7 | 8 | from utils.vocab import Vocab 9 | 10 | 11 | def write(s, stream=sys.stdout): 12 | stream.write(s + '\n') 13 | stream.flush() 14 | 15 | 16 | def show_score_history(history, memo=''): 17 | write('F1 HISTORY' + memo) 18 | for k, v in sorted(history.items()): 19 | epoch_tm = '\t- EPOCH-{:d} '.format(k) 20 | if len(v) == 1: 21 | f1_valid = '\tBEST VALID {:>7.2%}'.format(v[0]) 22 | write(epoch_tm + f1_valid) 23 | else: 24 | v1, v2 = v 25 | f1_valid = '\tBEST VALID {:>7.2%}'.format(v1) 26 | f1_evalu = '\tEVALU {:>7.2%}'.format(v2) 27 | write(epoch_tm + f1_valid + f1_evalu) 28 | 29 | 30 | def str_to_id(sent, vocab, unk): 31 | """ 32 | :param sent: 1D: n_words 33 | :param vocab: Vocab() 34 | :return: 1D: n_words; elem=id 35 | """ 36 | return list(map(lambda w: vocab.get_id(w) if vocab.has_key(w) else vocab.get_id(unk), sent)) 37 | 38 | 39 | def make_vocab_from_ids(key_value_format): 40 | vocab = Vocab() 41 | for key, value in key_value_format: 42 | vocab.add_word(key) 43 | return vocab 44 | 45 | 46 | def array(sample, is_float=False): 47 | if is_float: 48 | return np.asarray(sample, dtype=theano.config.floatX) 49 | return np.asarray(sample, dtype='int32') 50 | 51 | 52 | def average_vector(emb): 53 | return np.mean(np.asarray(emb[2:], dtype=theano.config.floatX), axis=0) 54 | 55 | 56 | def unit_vector(vecs, axis): 57 | return vecs / np.sqrt(np.sum(vecs ** 2, axis=axis, keepdims=True)) 58 | 59 | 60 | def make_output_dir(argv): 61 | if argv.output_dir: 62 | output_dir = argv.output_dir 63 | else: 64 | output_dir = 'output' 65 | os.makedirs(output_dir, exist_ok=True) 66 | 67 | 68 | def join_dir_and_file_names(dir_name, file_name): 69 | return os.path.join(dir_name, file_name) 70 | 71 | 72 | def get_file_names_in_dir(dir_path, prefix=None, suffix=None): 73 | file_names = glob.glob(dir_path + '/*') 74 | if prefix: 75 | file_names = [fn for fn in file_names 76 | if os.path.basename(fn).startswith(prefix)] 77 | if suffix: 78 | file_names = [fn for fn in file_names 79 | if fn.endswith(suffix)] 80 | return file_names 81 | 82 | 83 | def get_latest_param_fn(file_names): 84 | latest_epoch = -1 85 | latest_fn = None 86 | for fn in file_names: 87 | for elem in fn.split('.'): 88 | if elem.startswith('epoch'): 89 | epoch = int(elem[6:]) 90 | if latest_epoch < epoch: 91 | latest_epoch = epoch 92 | latest_fn = fn 93 | break 94 | assert latest_fn is not None 95 | return latest_fn, latest_epoch 96 | 97 | 98 | def span_to_span_index(i, j, n_words): 99 | return i * (n_words - 1) + j - np.arange(i).sum() 100 | -------------------------------------------------------------------------------- /src/utils/savers.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import 

--------------------------------------------------------------------------------
/src/utils/savers.py:
--------------------------------------------------------------------------------
import gzip
import pickle
import json


class Saver(object):

    def __init__(self, argv):
        self.argv = argv

    def save_props(self, **kwargs):
        raise NotImplementedError

    def save_json_format(self, **kwargs):
        raise NotImplementedError


class SpanSaver(Saver):
    def save_props(self, corpus, labels, vocab_label):
        """
        :param corpus: 1D: n_sents, 2D: n_words; elem=line
        :param labels: 1D: n_sents, 2D: n_prds, 3D: n_spans, 4D: [r, i, j]
        :param vocab_label: Vocab()
        """
        assert len(corpus) == len(labels), '%d %d' % (len(corpus), len(labels))

        fn = self.argv.output_dir
        if self.argv.output_fn:
            fn += '/results.%s.prop' % self.argv.output_fn
        else:
            fn += '/results.prop'

        with open(fn, 'w') as f:
            for sent, spans_sent in zip(corpus, labels):
                columns = [[mark] for mark in sent.marks]
                n_words = sent.n_words
                assert len(sent.prd_indices) == len(spans_sent)
                for prd_index, spans in zip(sent.prd_indices, spans_sent):
                    prop = self._span_to_prop(spans=spans,
                                              prd_index=prd_index,
                                              n_words=n_words,
                                              vocab_label=vocab_label)
                    for i, p in enumerate(prop):
                        columns[i].append(p)
                for c in columns:
                    f.write("%s\n" % "\t".join(c))
                f.write("\n")

    def save_json_format(self, corpus, labels, vocab_label):
        """
        :param corpus: 1D: n_sents, 2D: n_words; elem=line
        :param labels: 1D: n_sents, 2D: n_prds, 3D: n_spans, 4D: [r, i, j]
        :param vocab_label: Vocab()
        """
        assert len(corpus) == len(labels), '%d %d' % (len(corpus), len(labels))

        fn = self.argv.output_dir
        if self.argv.output_fn:
            fn += '/results.%s.json' % self.argv.output_fn
        else:
            fn += '/results.json'

        with open(fn, 'w') as f:
            corpus_dic = {}
            for sent_index, (sent, spans_sent) in enumerate(zip(corpus, labels)):
                assert len(sent.prd_indices) == len(spans_sent)

                prop_dic = {}
                for prd_index, spans in zip(sent.prd_indices, spans_sent):
                    arg_dic = {}
                    for (r, i, j) in spans:
                        key = '(%s,%d,%d)' % (vocab_label.get_word(r), i, j)
                        value = " ".join(sent.strings[i: j + 1])
                        arg_dic[key] = value

                    prd_dic = {'prd': sent.forms[prd_index],
                               'arg': arg_dic}
                    prop_dic['prd-%d' % prd_index] = prd_dic

                sent_dic = {'text': " ".join(sent.strings),
                            'mark': " ".join(sent.marks),
                            'prop': prop_dic}
                corpus_dic['sent-%d' % sent_index] = sent_dic

            json.dump(corpus_dic, f, indent=4)

    @staticmethod
    def _span_to_prop(spans, prd_index, n_words, vocab_label):
        """
        :param spans: 1D: n_spans, 2D: [r, i, j]
        :return: 1D: n_words; elem=str; e.g. '(A0*' or '*)'
        """
        prop = ['*' for _ in range(n_words)]
        prop[prd_index] = '(V*)'
        for (label_id, pre_index, post_index) in spans:
            label = vocab_label.get_word(label_id)
            if pre_index == post_index:  # single-word span
                prop[pre_index] = '(%s*)' % label
            else:
                prop[pre_index] = '(%s*' % label
                prop[post_index] = '*)'
        return prop
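For intuition, here is how `SpanSaver._span_to_prop` renders span triples as a CoNLL-style bracket column (an illustrative sketch with made-up labels and indices):
```
from utils.vocab import Vocab
from utils.savers import SpanSaver

vocab_label = Vocab()
for label in ('A0', 'A1'):
    vocab_label.add_word(label)

spans = [(vocab_label.get_id('A0'), 0, 1),   # words 0-1
         (vocab_label.get_id('A1'), 3, 3)]   # word 3 only
prop = SpanSaver._span_to_prop(spans=spans, prd_index=2, n_words=5,
                               vocab_label=vocab_label)
print(prop)  # ['(A0*', '*)', '(V*)', '(A1*)', '*']
```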

class BIOSaver(Saver):
    def save_props(self, corpus, labels, vocab_label):
        """
        :param corpus: 1D: n_sents, 2D: n_words; elem=line
        :param labels: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label id
        :param vocab_label: Vocab()
        """
        assert len(corpus) == len(labels), '%d %d' % (len(corpus), len(labels))

        fn = self.argv.output_dir
        if self.argv.output_fn:
            fn += '/results.%s.prop' % self.argv.output_fn
        else:
            fn += '/results.prop'

        with open(fn, 'w') as f:
            for sent, labels_sent in zip(corpus, labels):
                columns = [[mark] for mark in sent.marks]
                for labels_prd in labels_sent:
                    assert len(columns) == len(labels_prd)
                    spans = self._get_spans(labels_prd, vocab_label)
                    props = self._span_to_prop(len(labels_prd), spans)
                    for i, p in enumerate(props):
                        columns[i].append(p)
                for c in columns:
                    f.write("%s\n" % "\t".join(c))
                f.write("\n")

    def save_json_format(self, corpus, labels, vocab_label):
        """
        :param corpus: 1D: n_sents, 2D: n_words; elem=line
        :param labels: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label id
        :param vocab_label: Vocab()
        """
        assert len(corpus) == len(labels), '%d %d' % (len(corpus), len(labels))

        fn = self.argv.output_dir
        if self.argv.output_fn:
            fn += '/results.%s.json' % self.argv.output_fn
        else:
            fn += '/results.json'

        with open(fn, 'w') as f:
            corpus_dic = {}
            for sent_index, (sent, labels_sent) in enumerate(zip(corpus, labels)):
                assert len(sent.prd_indices) == len(labels_sent)

                prop_dic = {}
                for prd_index, labels_prd in zip(sent.prd_indices, labels_sent):
                    arg_dic = {}
                    spans = self._get_spans(labels_prd, vocab_label)
                    for (label, i, j) in spans:
                        if label == 'V':  # the predicate is recorded separately below
                            continue
                        key = '(%s,%d,%d)' % (label, i, j)
                        value = " ".join(sent.strings[i: j + 1])
                        arg_dic[key] = value

                    prd_dic = {'prd': sent.forms[prd_index],
                               'arg': arg_dic}
                    prop_dic['prd-%d' % prd_index] = prd_dic

                sent_dic = {'text': " ".join(sent.strings),
                            'mark': " ".join(sent.marks),
                            'prop': prop_dic}
                corpus_dic['sent-%d' % sent_index] = sent_dic

            json.dump(corpus_dic, f, indent=4)

    @staticmethod
    def _get_spans(labels, vocab_label):
        """
        :param labels: 1D: n_words; elem=label id
        :param vocab_label: Vocab() of BIO labels
        :return: 1D: n_spans; elem=[label, i, j]
        """
        spans = []
        span = []
        for w_i, label_id in enumerate(labels):
            label = vocab_label.get_word(label_id)
            if label.startswith('B-'):
                if span:
                    spans.append(span)
                span = [label[2:], w_i, w_i]
            elif label.startswith('I-'):
                if span:
                    if label[2:] == span[0]:
                        span[2] = w_i  # extend the current span
                    else:  # label mismatch: close the span and start a new one
                        spans.append(span)
                        span = [label[2:], w_i, w_i]
                else:  # stray I- tag: treat it as the beginning of a span
                    span = [label[2:], w_i, w_i]
            else:  # 'O'
                if span:
                    spans.append(span)
                span = []
        if span:
            spans.append(span)
        return spans
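`BIOSaver._get_spans` runs the opposite direction: it decodes a BIO tag sequence back into `[label, i, j]` spans. A small illustration with a made-up tag sequence (again assuming `src/` is importable):
```
from utils.vocab import Vocab
from utils.savers import BIOSaver

vocab_label = Vocab()
for tag in ('B-A0', 'I-A0', 'B-V', 'B-A1', 'O'):
    vocab_label.add_word(tag)

tags = ['B-A0', 'I-A0', 'B-V', 'B-A1', 'O']
label_ids = [vocab_label.get_id(t) for t in tags]
print(BIOSaver._get_spans(label_ids, vocab_label))
# -> [['A0', 0, 1], ['V', 2, 2], ['A1', 3, 3]]
```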
    @staticmethod
    def _span_to_prop(n_words, spans):
        """
        :param n_words: int
        :param spans: 1D: n_spans; elem=[label, i, j]; sorted by start index
        :return: 1D: n_words; elem=bracket label
        """
        k = 0  # index of the next span to close
        args = []
        for w_i in range(n_words):
            if k >= len(spans):
                args.append('*')
                continue
            span = spans[k]
            if span[1] < w_i < span[2]:  # inside a span
                args.append('*')
            elif w_i == span[1] and w_i == span[2]:  # single-word span
                args.append('(' + span[0] + '*)')
                k += 1
            elif w_i == span[1]:  # beginning of a span
                args.append('(' + span[0] + '*')
            elif w_i == span[2]:  # end of a span
                args.append('*)')
                k += 1
            else:  # outside any span
                args.append('*')
        return args


def save_pickle(fn, data):
    with gzip.open(fn + '.pkl.gz', 'wb') as gf:
        pickle.dump(data, gf, pickle.HIGHEST_PROTOCOL)


def save_key_value_format(fn, keys, values):
    assert len(keys) == len(values)
    if not isinstance(values[0], str):
        values = [str(v) for v in values]
    with open(fn + '.txt', 'w') as f:
        for key, value in zip(keys, values):
            f.write("%s\t%s\n" % (key, value))
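`save_key_value_format` pairs with `load_key_value_format` from `utils/loaders.py`; a round trip looks like this (illustrative sketch; the file name is hypothetical and the `output/` directory is assumed to exist):
```
from utils.savers import save_key_value_format
from utils.loaders import load_key_value_format

save_key_value_format('output/label_ids', ['O', 'B-A0', 'I-A0'], [0, 1, 2])
print(load_key_value_format('output/label_ids.txt'))
# -> [('O', 0), ('B-A0', 1), ('I-A0', 2)]
```
`make_vocab_from_ids` in `utils/misc.py` then rebuilds a `Vocab` from the loaded pairs.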

--------------------------------------------------------------------------------
/src/utils/sent.py:
--------------------------------------------------------------------------------
import numpy as np

from utils.misc import array, str_to_id
from utils.vocab import HYPH, UNK


class Sent(object):
    def __init__(self, sent, is_test=True):
        self.words = self._make_words(sent=sent, is_test=is_test)

        self.forms = [word.form for word in self.words]
        self.strings = [word.string for word in self.words]
        self.marks = self._set_marks(self.words)
        self.props = [word.prop for word in self.words]

        self.prd_indices = self._set_prd_indices(self.marks)
        self.prd_forms = [self.forms[i] for i in self.prd_indices]
        self.prd_bio_labels = self._set_prd_bio_labels(self.props)
        self.has_prds = len(self.prd_indices) > 0

        self.n_words = len(sent)
        self.n_prds = len(self.prd_indices)

        self.word_ids = None
        self.mark_ids = None
        self.elmo_emb = None
        self.bio_label_ids = None
        self.span_triples = None
        self.span_triples_with_null = None

    def _make_words(self, sent, is_test=True):
        return [self._make_word(line, is_test) for line in sent]

    @staticmethod
    def _make_word(line, is_test=True):
        raise NotImplementedError

    def _set_marks(self, words):
        raise NotImplementedError

    @staticmethod
    def _make_bio_labels(prop):
        """
        :param prop: 1D: n_words; elem=bracket label
        :return: 1D: n_words; elem=BIO label
        """
        labels = []
        prev = None
        for arg in prop:
            if arg.startswith('('):
                if arg.endswith(')'):  # single-word span, e.g. '(A0*)'
                    prev = arg.split("*")[0][1:]
                    label = 'B-' + prev
                    prev = None
                else:  # beginning of a span, e.g. '(A0*'
                    prev = arg[1:-1]
                    label = 'B-' + prev
            else:
                if prev:  # inside a span
                    label = 'I-' + prev
                    if arg.endswith(')'):  # end of the span, i.e. '*)'
                        prev = None
                else:
                    label = 'O'
            labels.append(label)
        return labels

    @staticmethod
    def _set_prd_indices(marks):
        return [i for i, mark in enumerate(marks) if mark != HYPH]

    def _set_prd_bio_labels(self, props):
        """
        :param props: 1D: n_words, 2D: n_prds
        :return: 1D: n_prds, 2D: n_words
        """
        props = zip(*props)  # transpose: one bracket column per predicate
        return [self._make_bio_labels(prop) for prop in props]

    def set_word_ids(self, vocab_word):
        self.word_ids = array(str_to_id(sent=self.forms,
                                        vocab=vocab_word,
                                        unk=UNK))

    def set_mark_ids(self):
        mark_ids = [[0 for _ in range(self.n_words)] for _ in range(self.n_prds)]
        for i, prd_index in enumerate(self.prd_indices):
            mark_ids[i][prd_index] = 1
        self.mark_ids = array(mark_ids)

    def set_label_ids(self, vocab_label):
        """
        :param vocab_label: Vocab (BIO labels); e.g. B-A0, I-A0
        """
        assert len(self.prd_indices) == len(self.prd_bio_labels)
        label_ids = []
        for prd_index, props in zip(self.prd_indices, self.prd_bio_labels):
            y = str_to_id(sent=props, vocab=vocab_label, unk='O')
            label_ids.append(y)
        self.bio_label_ids = array(label_ids)

    def set_elmo_emb(self, elmo_emb):
        """
        :param elmo_emb: 1D: n_layers, 2D: n_words, 3D: dim
        """
        elmo_emb = np.asarray(elmo_emb)
        elmo_emb = elmo_emb.transpose((1, 0, 2))  # -> (n_words, n_layers, dim)
        assert len(elmo_emb) == self.n_words
        self.elmo_emb = elmo_emb

    def set_span_triples(self, vocab_label):
        """
        :param vocab_label: Vocab (labels); e.g. A0, A1
        """
        triples = []
        for bio_labels in self.prd_bio_labels:
            prd_triples = []
            for (label, i, j) in self._get_spans(bio_labels):
                r = vocab_label.get_id(label)
                prd_triples.append((r, i, j))
            triples.append(prd_triples)
        self.span_triples = triples

    @staticmethod
    def _get_spans(bio_labels):
        """
        :param bio_labels: 1D: n_words; elem=bio label
        :return: 1D: n_spans; elem=[label, i, j]
        """
        spans = []
        span = []
        for i, label in enumerate(bio_labels):
            if label[-2:] == '-V':  # skip the predicate itself
                continue
            if label.startswith('B-'):
                if span:
                    spans.append(span)
                span = [label[2:], i, i]
            elif label.startswith('I-'):
                if span:
                    if label[2:] == span[0]:
                        span[2] = i
                    else:
                        spans.append(span)
                        span = [label[2:], i, i]
                else:
                    span = [label[2:], i, i]
            else:
                if span:
                    spans.append(span)
                span = []
        if span:
            spans.append(span)
        return spans

    def set_span_triples_with_null(self, n_labels):
        assert len(self.span_triples) == len(self.prd_indices)
        triples_with_null = []
        for prd_index, spans in zip(self.prd_indices, self.span_triples):
            used_labels = [r for (r, i, j) in spans]
            # Labels with no gold span receive a null span at the predicate index
            null_spans = [(r, prd_index, prd_index)
                          for r in range(n_labels)
                          if r not in used_labels]
            triples = spans + null_spans
            triples.sort(key=lambda s: s[0])
            triples_with_null.append(triples)
        self.span_triples_with_null = triples_with_null


class Conll05Sent(Sent):
    @staticmethod
    def _make_word(line, is_test=False):
        return Word(form=line[0],
                    mark=line[5] if is_test is False else line[4],
                    sense=line[4] if is_test is False else None,
                    prop=line[6:] if is_test is False else [])

    def _set_marks(self, words):
        return [word.mark for word in words]


class Conll12Sent(Sent):
    @staticmethod
    def _make_word(line, is_test=False):
        return Word(form=line[3],
                    mark=line[6],
                    sense=line[7],
                    prop=line[11:-1] if is_test is False else [])

    def _set_marks(self, words):
        return [w.mark if w.sense != HYPH else HYPH for w in words]


class Word(object):
    def __init__(self, form, mark, sense, prop):
        self.form = form.lower()
        self.string = form
        self.mark = mark
        self.sense = sense
        self.prop = prop
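`Sent._make_bio_labels` converts one predicate's bracket column into BIO tags, i.e. the inverse of `SpanSaver._span_to_prop` above (illustrative input, not from the data):
```
from utils.sent import Sent

prop = ['(A0*', '*)', '(V*)', '(A1*)', '*']
print(Sent._make_bio_labels(prop))
# -> ['B-A0', 'I-A0', 'B-V', 'B-A1', 'O']
```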

--------------------------------------------------------------------------------
/src/utils/vocab.py:
--------------------------------------------------------------------------------
HYPH = u'-'
UNK = u'UNKNOWN'


class Vocab(object):
    def __init__(self):
        self.i2w = []
        self.w2i = {}

    def add_word(self, word):
        if word not in self.w2i:
            new_id = self.size()
            self.i2w.append(word)
            self.w2i[word] = new_id

    def get_id(self, word):
        return self.w2i.get(word)

    def get_id_or_unk_id(self, word):
        if word in self.w2i:
            return self.w2i.get(word)
        return self.w2i.get(UNK)

    def get_and_add_id(self, word):
        self.add_word(word)
        return self.w2i.get(word)

    def get_word(self, w_id):
        return self.i2w[w_id]

    def has_key(self, word):
        return word in self.w2i

    def size(self):
        return len(self.i2w)

--------------------------------------------------------------------------------
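Finally, a minimal sketch of how `Vocab` and `utils.misc.str_to_id` interact (made-up words; ids follow insertion order):
```
from utils.vocab import Vocab, UNK
from utils.misc import str_to_id

vocab = Vocab()
for w in (UNK, 'alpha', 'beta'):
    vocab.add_word(w)

print(str_to_id(['alpha', 'beta', 'gamma'], vocab, UNK))
# -> [1, 2, 0]  ('gamma' is out of vocabulary, so it falls back to UNK's id)
```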