├── README.md
├── scripts
│   └── eval.py
└── src
    ├── __init__.py
    ├── main.py
    ├── nn
    │   ├── __init__.py
    │   ├── activations.py
    │   ├── initializers.py
    │   ├── layers
    │   │   ├── __init__.py
    │   │   ├── core.py
    │   │   ├── embeddings.py
    │   │   ├── recurrent.py
    │   │   ├── seqlabel.py
    │   │   └── stack.py
    │   ├── losses.py
    │   ├── metrics.py
    │   ├── optimizers.py
    │   ├── regularizers.py
    │   └── utils.py
    ├── srl
    │   ├── __init__.py
    │   ├── decoders.py
    │   ├── model_api.py
    │   ├── models.py
    │   ├── preprocessors.py
    │   ├── testers.py
    │   └── trainers.py
    └── utils
        ├── __init__.py
        ├── evaluators.py
        ├── loaders.py
        ├── misc.py
        ├── savers.py
        ├── sent.py
        └── vocab.py
/README.md: -------------------------------------------------------------------------------- 1 | # A Span Selection Model for Semantic Role Labeling 2 | 3 | ## Citation 4 | * A Span Selection Model for Semantic Role Labeling 5 | * Hiroki Ouchi (RIKEN AIP/Tohoku Univ.), Hiroyuki Shindo (NAIST) and Yuji Matsumoto (NAIST) 6 | * In EMNLP 2018 7 | * Conference paper: http://aclweb.org/anthology/D18-1191 8 | * arXiv version: https://arxiv.org/abs/1810.02245 9 | ``` 10 | @InProceedings{D18-1191, 11 | author = "Ouchi, Hiroki 12 | and Shindo, Hiroyuki 13 | and Matsumoto, Yuji", 14 | title = "A Span Selection Model for Semantic Role Labeling", 15 | booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", 16 | year = "2018", 17 | publisher = "Association for Computational Linguistics", 18 | pages = "1630--1642", 19 | location = "Brussels, Belgium", 20 | url = "http://aclweb.org/anthology/D18-1191" 21 | } 22 | ``` 23 | 24 | 25 | ## Prerequisites 26 | * [python3](https://www.python.org/downloads/) 27 | * [Theano](http://deeplearning.net/software/theano/) 28 | * [h5py](https://www.h5py.org/) 29 | 30 | ## Installation 31 | ``` 32 | conda create -n theano-py3 python=3.6 33 | source activate theano-py3 34 | conda install -c conda-forge theano 35 | conda install -c anaconda h5py 36 | ``` 37 | 38 | ## Data 39 | ### CoNLL-2005 40 | * [Treebank-2](https://catalog.ldc.upenn.edu/LDC95T7) 41 | ### CoNLL-2012 42 | * [OntoNotes Release 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) 43 | * We create the dataset by following the process described at http://cemantix.org/data/ontonotes.html 44 | ### Word Representations 45 | - [SENNA](https://ronan.collobert.com/senna/download.html) 46 | - Download the software and create the word-embedding pair file as follows. 47 | - `paste hash/words.lst embeddings/embeddings.txt > senna.emb.txt` 48 | 49 | * [ELMo](https://github.com/allenai/allennlp/tree/v0.6.1) 50 | 51 | ### Data Format 52 | #### CoNLL-2005 Training & Development Sets 53 | ``` 54 | 0:WORD 1:POS 2:PARSE 3:NE 4:FRAME 5:LEMMA 6-:ARGS 55 | Ms. NNP (S1(S(NP* * - - (A0* 56 | Haag NNP *) (LOC*) - - *) 57 | plays VBZ (VP* * 02 play (V*) 58 | Elianti NNP (NP*)) * - - (A1*) 59 | . . *)) * - - * 60 | ``` 61 | 62 | #### CoNLL-2005 Test Set (Not including FRAME ID) 63 | ``` 64 | 0:WORD 1:POS 2:PARSE 3:NE 4:LEMMA 5-:ARGS 65 | The DT (S1(S(NP* * (A1* 66 | finger-pointing JJ *) * - *) 67 | has AUX (VP* * - * 68 | already RB (ADVP*) * - (AM-TMP*) 69 | begun VBN (VP*)) * begin (V*) 70 | . . *)) * - *
71 | ``` 72 | 73 | #### CoNLL-2012 Training/Development/Test Sets 74 | ``` 75 | 0:DOCUMENT 1:PART 2:INDEX 3:WORD 4:POS 5:PARSE 6:LEMMA 7:FRAME 8:SENSE 9:SPEAKER 10:NE 11-N:ARGS N:COREF 76 | bc/cctv/00/cctv_0001 0 0 This DT (TOP(S(NP* - - - Speaker#1 * (ARG2* (61 77 | bc/cctv/00/cctv_0001 0 1 map NN *) - - - Speaker#1 * *) 61) 78 | bc/cctv/00/cctv_0001 0 2 reflected VBD (VP* reflect 01 1 Speaker#1 * (V*) - 79 | bc/cctv/00/cctv_0001 0 3 the DT (NP* - - - Speaker#1 * (ARG1* - 80 | bc/cctv/00/cctv_0001 0 4 European JJ * - - - Speaker#1 (NORP) * - 81 | bc/cctv/00/cctv_0001 0 5 battlefield NN * - - - Speaker#1 * * - 82 | bc/cctv/00/cctv_0001 0 6 situation NN *)) - - - Speaker#1 * *) - 83 | bc/cctv/00/cctv_0001 0 7 . . *)) - - - Speaker#1 * * - 84 | ``` 85 | 86 | 87 | ## Usage 88 | ### Training: span selection model 89 | SENNA: `python src/main.py --method span --mode train --train_data path/to/conll2005.train.txt --dev_data path/to/conll2005.dev.txt --data_type conll05 --drop_rate 0.1 --reg 0.0001 --hidden_dim 300 --n_layers 4 --halve_lr --word_emb path/to/senna --save --output_dir output` 90 | 91 | ELMo: `python src/main.py --method span --mode train --train_data path/to/conll2005.train.txt --dev_data path/to/conll2005.dev.txt --data_type conll05 --drop_rate 0.1 --reg 0.0001 --hidden_dim 300 --n_layers 4 --halve_lr --train_elmo_emb path/to/elmo.conll2005.train.hdf5 --dev_elmo_emb path/to/elmo.conll2005.dev.hdf5 --save --output_dir output` 92 | 93 | ### Training: CRF model 94 | SENNA: `python src/main.py --method crf --mode train --train_data path/to/conll2005.train.txt --dev_data path/to/conll2005.dev.txt --data_type conll05 --drop_rate 0.1 --reg 0.0001 --hidden_dim 300 --n_layers 4 --halve_lr --word_emb path/to/senna --save --output_dir output` 95 | 96 | ELMo: `python src/main.py --method crf --mode train --train_data path/to/conll2005.train.txt --dev_data path/to/conll2005.dev.txt --data_type conll05 --drop_rate 0.1 --reg 0.0001 --hidden_dim 300 --n_layers 4 --halve_lr --train_elmo_emb path/to/elmo.conll2005.train.hdf5 --dev_elmo_emb path/to/elmo.conll2005.dev.hdf5 --save --output_dir output` 97 | 98 | ### Predicting: span selection model 99 | SENNA: `python src/main.py --method span --mode test --test_data path/to/conll2005.test.txt --data_type conll05 --drop_rate 0.1 --hidden_dim 300 --n_layers 4 --output_dir output --output_fn conll2005.test --word_emb path/to/senna --load_label output/label_ids.txt --load_param output/param.epoch-0.pkl.gz` 100 | 101 | ELMo: `python src/main.py --method span --mode test --test_data path/to/conll2005.test.txt --data_type conll05 --drop_rate 0.1 --hidden_dim 300 --n_layers 4 --output_dir output --output_fn conll2005.test --test_elmo_emb path/to/elmo.conll2005.test.hdf5 --load_label output/label_ids.txt --load_param output/param.epoch-0.pkl.gz` 102 | 103 | ### Predicting: CRF model 104 | SENNA: `python src/main.py --method crf --mode test --test_data path/to/conll2005.test.txt --data_type conll05 --drop_rate 0.1 --hidden_dim 300 --n_layers 4 --output_dir output --output_fn conll2005.test --word_emb path/to/senna --load_label output/label_ids.txt --load_param output/param.epoch-0.pkl.gz` 105 | 106 | ELMo: `python src/main.py --method crf --mode test --test_data path/to/conll2005.test.txt --data_type conll05 --drop_rate 0.1 --hidden_dim 300 --n_layers 4 --output_dir output --output_fn conll2005.test --test_elmo_emb path/to/elmo.conll2005.test.hdf5 --load_label output/label_ids.txt --load_param output/param.epoch-0.pkl.gz` 107 | 108 | 109 | ## LICENSE 110 | MIT 
License 111 | -------------------------------------------------------------------------------- /scripts/eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | 5 | 6 | def load(path, data_size=100000000): 7 | corpus = [] 8 | sent = [] 9 | with open(path) as f: 10 | for line in f: 11 | elem = [l for l in line.rstrip().split()] 12 | if len(elem) > 0: 13 | sent.append(elem) 14 | else: 15 | corpus.append(sent) 16 | sent = [] 17 | if len(corpus) >= data_size: 18 | break 19 | return corpus 20 | 21 | 22 | def f_score(crr_total, p_total, r_total): 23 | precision = crr_total / p_total if p_total > 0 else 0. 24 | recall = crr_total / r_total if r_total > 0 else 0. 25 | f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0. 26 | return precision, recall, f1 27 | 28 | 29 | def accuracy(crr_total, total): 30 | return crr_total / total if total > 0 else 0. 31 | 32 | 33 | def srl_metrics(y_true, y_pred): 34 | """ 35 | :param y_true: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 36 | :param y_pred: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 37 | """ 38 | p_total = 0. 39 | r_total = 0. 40 | crr_total = 0. 41 | 42 | assert len(y_true) == len(y_pred) 43 | for y_true_i, y_pred_i in zip(y_true, y_pred): 44 | assert len(y_true_i) == len(y_pred_i) 45 | for y_true_j, y_pred_j in zip(y_true_i[1:], y_pred_i[1:]): 46 | assert len(y_true_j) == len(y_pred_j) 47 | y_true_spans = get_labeled_spans(y_true_j) 48 | y_pred_spans = get_labeled_spans(y_pred_j) 49 | p_total += len(y_pred_spans) 50 | r_total += len(y_true_spans) 51 | for y_pred_span in y_pred_spans: 52 | if y_pred_span in y_true_spans: 53 | crr_total += 1. 54 | return crr_total, p_total, r_total 55 | 56 | 57 | def span_metrics(y_true, y_pred): 58 | """ 59 | :param y_true: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 60 | :param y_pred: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 61 | """ 62 | p_total = 0. 63 | r_total = 0. 64 | crr_total = 0. 65 | 66 | assert len(y_true) == len(y_pred) 67 | for y_true_i, y_pred_i in zip(y_true, y_pred): 68 | assert len(y_true_i) == len(y_pred_i) 69 | for y_true_j, y_pred_j in zip(y_true_i[1:], y_pred_i[1:]): 70 | assert len(y_true_j) == len(y_pred_j) 71 | y_true_spans = get_labeled_spans(y_true_j) 72 | y_pred_spans = get_labeled_spans(y_pred_j) 73 | p_total += len(y_pred_spans) 74 | r_total += len(y_true_spans) 75 | 76 | y_true_boundary = [span[1:] for span in y_true_spans] 77 | for y_pred_span in y_pred_spans: 78 | if y_pred_span[1:] in y_true_boundary: 79 | crr_total += 1. 80 | return crr_total, p_total, r_total 81 | 82 | 83 | def label_metrics(y_true, y_pred): 84 | """ 85 | :param y_true: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 86 | :param y_pred: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 87 | """ 88 | total = 0. 89 | crr_total = 0. 90 | 91 | assert len(y_true) == len(y_pred) 92 | for y_true_i, y_pred_i in zip(y_true, y_pred): 93 | assert len(y_true_i) == len(y_pred_i) 94 | for y_true_j, y_pred_j in zip(y_true_i[1:], y_pred_i[1:]): 95 | assert len(y_true_j) == len(y_pred_j) 96 | y_true_spans = get_labeled_spans(y_true_j) 97 | y_pred_spans = get_labeled_spans(y_pred_j) 98 | 99 | y_true_boundary = [span[1:] for span in y_true_spans] 100 | for y_pred_span in y_pred_spans: 101 | if y_pred_span[1:] in y_true_boundary: 102 | total += 1. 
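# Only spans whose boundary matches some gold span enter the accuracy
# denominator (total); the lookup below then retrieves the gold span with
# the same boundary and credits crr_total when the label also agrees.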
103 | index = y_true_boundary.index(y_pred_span[1:]) 104 | y_true_span = y_true_spans[index] 105 | if y_pred_span[0] == y_true_span[0]: 106 | crr_total += 1. 107 | return crr_total, total 108 | 109 | 110 | def srl_metrics_per_distance(y_true, y_pred): 111 | """ 112 | :param y_true: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 113 | :param y_pred: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 114 | """ 115 | def _dist(i_, j_, prd_index_): 116 | if j_ < prd_index_: 117 | return prd_index_ - j_ - 1 118 | return i_ - prd_index_ - 1 119 | 120 | def _dist_bin(dist): 121 | if dist == 0: 122 | return 0 123 | elif 0 < dist < 3: 124 | return 1 125 | elif 3 <= dist < 7: 126 | return 2 127 | return 3 128 | 129 | dist_dict = np.zeros(shape=(4, 3), dtype="float32")  # rows: distance bins; columns: [correct, predicted, gold] 130 | 131 | assert len(y_true) == len(y_pred) 132 | for y_true_i, y_pred_i in zip(y_true, y_pred): 133 | assert len(y_true_i) == len(y_pred_i) 134 | 135 | prds = y_true_i[0] 136 | prd_indices = [i for i, y in enumerate(prds) if y != "-"] 137 | 138 | for y_true_j, y_pred_j, prd_index in zip(y_true_i[1:], y_pred_i[1:], prd_indices): 139 | assert len(y_true_j) == len(y_pred_j) 140 | y_true_spans = get_labeled_spans(y_true_j) 141 | y_pred_spans = get_labeled_spans(y_pred_j) 142 | 143 | for span in y_true_spans: 144 | # Skip discontinuous spans (concatenated C- spans have more than 3 elements) 145 | if len(span) > 3: 146 | continue 147 | (label, i, j) = span 148 | dist = _dist(i, j, prd_index) 149 | binned_dist = _dist_bin(dist) 150 | dist_dict[binned_dist][2] += 1 151 | 152 | for span in y_pred_spans: 153 | if len(span) > 3: 154 | continue 155 | (label, i, j) = span 156 | dist = _dist(i, j, prd_index) 157 | binned_dist = _dist_bin(dist) 158 | dist_dict[binned_dist][1] += 1 159 | 160 | for y_pred_span in y_pred_spans: 161 | if y_pred_span in y_true_spans: 162 | if len(y_pred_span) > 3: 163 | continue 164 | label, i, j = y_pred_span 165 | dist = _dist(i, j, prd_index) 166 | binned_dist = _dist_bin(dist) 167 | dist_dict[binned_dist][0] += 1 168 | 169 | return dist_dict 170 | 171 | 172 | def get_labeled_spans(prop): 173 | """ 174 | :param prop: 1D: n_words; elem=bracket label 175 | :return: 1D: n_spans; elem=[label, i, j] (concatenated C- spans carry extra (i, j) pairs) 176 | """ 177 | def _concat_c_spans(_spans): 178 | labels = [_span[0] for _span in _spans] 179 | c_indices = [i for i, _span in enumerate(_spans) if _span[0].startswith('C')] 180 | non_ant_c_spans = [] 181 | 182 | for c_index in c_indices: 183 | c_span = _spans[c_index] 184 | _label = c_span[0][2:] 185 | if _label in labels: 186 | _spans[labels.index(_label)].extend(c_span[1:]) 187 | else: 188 | non_ant_c_spans.append([_label] + c_span[1:]) 189 | concatenated_spans = [span for i, span in enumerate(_spans) if i not in c_indices] 190 | _spans = concatenated_spans + non_ant_c_spans 191 | return _spans 192 | 193 | labeled_spans = [] 194 | labeled_span = [] 195 | for i, arg in enumerate(prop): 196 | if arg.startswith('('): 197 | if arg.endswith(')'): 198 | label = arg.split("*")[0][1:] 199 | labeled_span = [label, i, i] 200 | else: 201 | label = arg[1:-1] 202 | labeled_span = [label, i] 203 | elif arg.endswith(')'): 204 | labeled_span.append(i) 205 | 206 | if len(labeled_span) == 3 and labeled_span[0] != "V" and labeled_span[0] != "C-V": 207 | labeled_spans.append(labeled_span) 208 | labeled_span = [] 209 | 210 | labeled_spans = _concat_c_spans(labeled_spans) 211 | return labeled_spans 212 | 213 | 214 | def print_metrics(y_true, y_pred): 215 | """ 216 | :param y_true: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 217 | :param y_pred: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label 218 |
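Prints overall SRL F1, span-boundary match F1, and label-match accuracy to stdout.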
""" 219 | crr_total, p_total, r_total = srl_metrics(y_true, y_pred) 220 | p, r, f = f_score(crr_total, p_total, r_total) 221 | sys.stdout.write('SRL RESULTS\n\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})\n'.format( 222 | f, p, int(crr_total), int(p_total), r, int(crr_total), int(r_total))) 223 | sys.stdout.flush() 224 | 225 | crr_total, p_total, r_total = span_metrics(y_true, y_pred) 226 | p, r, f = f_score(crr_total, p_total, r_total) 227 | sys.stdout.write('SPAN BOUNDARY MATCH\n\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})\n'.format( 228 | f, p, int(crr_total), int(p_total), r, int(crr_total), int(r_total))) 229 | sys.stdout.flush() 230 | 231 | crr_total, total = label_metrics(y_true, y_pred) 232 | acc = accuracy(crr_total, total) 233 | 234 | sys.stdout.write('LABEL MATCH\n\tACCURACY:{:>7.2%} ({:>5}/{:>5})\n'.format( 235 | acc, int(crr_total), int(total))) 236 | sys.stdout.flush() 237 | 238 | 239 | def print_metrics_per_dist(y_true, y_pred): 240 | metric_matrix = srl_metrics_per_distance(y_true, y_pred) 241 | sys.stdout.write('SRL RESULTS PER DISTANCE (C-LABEL removed)\n') 242 | for i, metric in enumerate(metric_matrix): 243 | crr_total, p_total, r_total = metric 244 | if i == 0: 245 | dist = '0' 246 | elif i == 1: 247 | dist = '1-2' 248 | elif i == 2: 249 | dist = '3-6' 250 | else: 251 | dist = '7-max' 252 | 253 | p, r, f = f_score(crr_total, p_total, r_total) 254 | sys.stdout.write('\t{}\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})\n'.format( 255 | dist, f, p, int(crr_total), int(p_total), r, int(crr_total), int(r_total))) 256 | sys.stdout.flush() 257 | 258 | 259 | def main(argv): 260 | sys.stdout.write("\nEVALUATION START\n") 261 | sys.stdout.flush() 262 | 263 | sents1 = load(argv[1]) 264 | sents2 = load(argv[2]) 265 | 266 | sents1 = [list(zip(*sent)) for sent in sents1] 267 | sents2 = [list(zip(*sent)) for sent in sents2] 268 | 269 | print_metrics(sents1, sents2) 270 | print_metrics_per_dist(sents1, sents2) 271 | 272 | 273 | if __name__ == '__main__': 274 | main(sys.argv) 275 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiroki13/span-based-srl/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/__init__.py -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | import numpy as np 5 | import theano 6 | 7 | sys.setrecursionlimit(100000000) 8 | theano.config.floatX = 'float32' 9 | 10 | if theano.config.device.startswith('cuda'): 11 | import locale 12 | 13 | locale.setlocale(locale.LC_CTYPE, 'C.UTF-8') 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser(description='SPAN SELECTION MODEL') 18 | 19 | parser.add_argument('--mode', default='train', help='train/test') 20 | parser.add_argument('--method', default='span', help='crf/span') 21 | parser.add_argument('--seed', type=int, default=0, help='seed') 22 | 23 | ################## 24 | # Input Datasets # 25 | ################## 26 | parser.add_argument('--train_data', help='path to train data') 27 | parser.add_argument('--dev_data', help='path to dev data') 28 | parser.add_argument('--test_data', help='path to test data') 29 | parser.add_argument('--data_type', default='conll05', help='conll05/conll12') 30 | parser.add_argument('--data_size', 
type=int, default=100000000, help='data size to be used') 31 | 32 | ################## 33 | # Output Options # 34 | ################## 35 | parser.add_argument('--save', action='store_true', default=False, help='parameters to be saved or not') 36 | parser.add_argument('--output_dir', type=str, default='output', help='output directory name') 37 | parser.add_argument('--output_fn', type=str, default=None, help='output file name') 38 | 39 | ########## 40 | # Search # 41 | ########## 42 | parser.add_argument('--search', type=str, default='greedy', help='argmax/greedy') 43 | 44 | ################### 45 | # NN Architecture # 46 | ################### 47 | parser.add_argument('--emb_dim', type=int, default=50, help='dimension of embeddings') 48 | parser.add_argument('--hidden_dim', type=int, default=32, help='dimension of hidden layer') 49 | parser.add_argument('--n_layers', type=int, default=1, help='number of layers') 50 | parser.add_argument('--n_experts', type=int, default=0, help='number of ensemble models') 51 | 52 | #################### 53 | # Training Options # 54 | #################### 55 | parser.add_argument('--epoch', type=int, default=100, help='number of epochs to train') 56 | parser.add_argument('--batch_size', type=int, default=32, help='mini-batch size') 57 | parser.add_argument('--word_emb', default=None, help='Initial embeddings to be loaded') 58 | parser.add_argument('--train_elmo_emb', default=None, help='ELMo embeddings to be loaded') 59 | parser.add_argument('--dev_elmo_emb', default=None, help='ELMo embeddings to be loaded') 60 | parser.add_argument('--test_elmo_emb', default=None, help='ELMo embeddings to be loaded') 61 | 62 | ######################## 63 | # Optimization Options # 64 | ######################## 65 | parser.add_argument('--lr', type=float, default=0.001, help='learning rate') 66 | parser.add_argument('--halve_lr', action='store_true', default=False, help='halve learning rate') 67 | parser.add_argument('--opt_type', default='adam', help='sgd/adam') 68 | parser.add_argument('--grad_clip', action='store_true', default=False, help='gradient clipping') 69 | parser.add_argument('--reg', type=float, default=0.0001, help='L2 Reg rate') 70 | parser.add_argument('--drop_rate', type=float, default=0.0, help='Dropout Rate') 71 | 72 | ################### 73 | # Loading Options # 74 | ################### 75 | parser.add_argument('--load_param', default=None, help='path to params') 76 | parser.add_argument('--load_param_dir', default=None, help='path to param dir') 77 | parser.add_argument('--load_param_latest', action='store_true', default=False, help='load the latest params') 78 | parser.add_argument('--load_opt_param', default=None, help='path to params') 79 | parser.add_argument('--load_label', default=None, help='path to labels') 80 | 81 | return parser.parse_args() 82 | 83 | 84 | def main(): 85 | argv = parse_args() 86 | np.random.seed(argv.seed) 87 | 88 | if argv.data_type == "conll05": 89 | from utils.loaders import Conll05Loader 90 | loader = Conll05Loader(argv) 91 | else: 92 | from utils.loaders import Conll12Loader 93 | loader = Conll12Loader(argv) 94 | 95 | if argv.method == "span": 96 | from srl.preprocessors import SpanPreprocessor 97 | from utils.evaluators import SpanEvaluator 98 | from srl.model_api import SpanModelAPI 99 | 100 | if argv.mode == "train": 101 | from srl.trainers import Trainer 102 | 103 | Trainer(argv=argv, 104 | loader=loader, 105 | preprocessor=SpanPreprocessor(argv), 106 | evaluator=SpanEvaluator(argv), 107 | model_api=SpanModelAPI(argv) 
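# The same Trainer also drives the CRF model in the branch below; behavior
# differs only through the injected preprocessor, evaluator, and model API.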
108 | ).train() 109 | else: 110 | from srl.testers import Tester 111 | from utils.savers import SpanSaver 112 | 113 | Tester(argv=argv, 114 | loader=loader, 115 | saver=SpanSaver(argv), 116 | preprocessor=SpanPreprocessor(argv), 117 | evaluator=SpanEvaluator(argv), 118 | model_api=SpanModelAPI(argv) 119 | ).predict() 120 | else: 121 | from srl.preprocessors import BIOPreprocessor 122 | from utils.evaluators import BIOEvaluator 123 | from srl.model_api import BIOModelAPI 124 | 125 | if argv.mode == "train": 126 | from srl.trainers import Trainer 127 | 128 | trainer = Trainer(argv=argv, 129 | loader=loader, 130 | preprocessor=BIOPreprocessor(argv), 131 | evaluator=BIOEvaluator(argv), 132 | model_api=BIOModelAPI(argv) 133 | ) 134 | trainer.train() 135 | else: 136 | from srl.testers import Tester 137 | from utils.savers import BIOSaver 138 | 139 | Tester(argv=argv, 140 | loader=loader, 141 | saver=BIOSaver(argv), 142 | preprocessor=BIOPreprocessor(argv), 143 | evaluator=BIOEvaluator(argv), 144 | model_api=BIOModelAPI(argv) 145 | ).predict() 146 | 147 | 148 | if __name__ == '__main__': 149 | main() 150 | -------------------------------------------------------------------------------- /src/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiroki13/span-based-srl/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/nn/__init__.py -------------------------------------------------------------------------------- /src/nn/activations.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def softmax(x): 5 | if x.ndim == 3: 6 | x_shape = x.shape 7 | x = x.reshape((x_shape[0] * x_shape[1], x_shape[2])) 8 | return T.nnet.softmax(x).reshape(x_shape) 9 | elif x.ndim == 4: 10 | x_shape = x.shape 11 | x = x.reshape((x_shape[0] * x_shape[1] * x_shape[2], x_shape[3])) 12 | return T.nnet.softmax(x).reshape(x_shape) 13 | return T.nnet.softmax(x) 14 | 15 | 16 | def sigmoid(x): 17 | return T.nnet.sigmoid(x) 18 | 19 | 20 | def tanh(x): 21 | return T.tanh(x) 22 | 23 | 24 | def relu(x): 25 | return T.nnet.relu(x) 26 | -------------------------------------------------------------------------------- /src/nn/initializers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | 4 | 5 | class Initializer(object): 6 | def __call__(self, shape, shared=True, name=None): 7 | raise NotImplementedError 8 | 9 | 10 | class Zero(Initializer): 11 | def __call__(self, shape, shared=True, name=None): 12 | param = np.zeros(shape, theano.config.floatX) 13 | if shared: 14 | return theano.shared(value=param, name=name, borrow=True) 15 | return param 16 | 17 | 18 | class One(Initializer): 19 | def __call__(self, shape, shared=True, name=None): 20 | param = np.ones(shape, theano.config.floatX) 21 | if shared: 22 | return theano.shared(value=param, name=name, borrow=True) 23 | return param 24 | 25 | 26 | class Identity(Initializer): 27 | def __call__(self, shape, shared=True, name=None): 28 | assert len(shape) == 2 29 | param = np.ones(shape[0], theano.config.floatX) 30 | param = np.diag(param) 31 | if shared: 32 | return theano.shared(value=param, name=name, borrow=True) 33 | return param 34 | 35 | 36 | class Uniform(Initializer): 37 | def __call__(self, shape, shared=True, name=None): 38 | param = np.asarray(np.random.uniform(low=-0.01, 39 | high=0.01, 40 | size=shape), 41 | dtype=theano.config.floatX) 42 | if shared: 43 | 
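# Wrapping the array in a Theano shared variable lets the optimizer update it in place.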
return theano.shared(value=param, name=name, borrow=True) 44 | return param 45 | 46 | 47 | class Normal(Initializer): 48 | def __call__(self, shape, shared=True, name=None): 49 | param = np.asarray(np.random.normal(0.0, 0.01, shape), 50 | dtype=theano.config.floatX) 51 | if shared: 52 | return theano.shared(value=param, name=name, borrow=True) 53 | return param 54 | 55 | 56 | class Xavier(Initializer): 57 | def __call__(self, shape, shared=True, name=None): 58 | param = np.asarray(np.random.uniform(low=-np.sqrt(6.0 / np.sum(shape)), 59 | high=np.sqrt(6.0 / np.sum(shape)), 60 | size=shape), 61 | dtype=theano.config.floatX) 62 | if shared: 63 | return theano.shared(value=param, name=name, borrow=True) 64 | return param 65 | 66 | 67 | class Orthonormal(Initializer): 68 | """ 69 | This is based on the implementation of Luheng He; 70 | https://github.com/luheng/deep_srl 71 | """ 72 | def __call__(self, shape, shared=True, name=None): 73 | assert len(shape) == 2 74 | if shape[0] == shape[1]: 75 | M = np.random.randn(*shape).astype(theano.config.floatX) 76 | Q, R = np.linalg.qr(M) 77 | Q = Q * np.sign(np.diag(R)) 78 | param = Q * 1.0 79 | else: 80 | M1 = np.random.randn(shape[0], shape[0]).astype(theano.config.floatX) 81 | M2 = np.random.randn(shape[1], shape[1]).astype(theano.config.floatX) 82 | Q1, R1 = np.linalg.qr(M1) 83 | Q2, R2 = np.linalg.qr(M2) 84 | Q1 = Q1 * np.sign(np.diag(R1)) 85 | Q2 = Q2 * np.sign(np.diag(R2)) 86 | n_min = min(shape[0], shape[1]) 87 | param = np.dot(Q1[:, :n_min], Q2[:n_min, :]) * 1.0 88 | if shared: 89 | return theano.shared(value=param, name=name, borrow=True) 90 | return param 91 | -------------------------------------------------------------------------------- /src/nn/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiroki13/span-based-srl/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/nn/layers/__init__.py -------------------------------------------------------------------------------- /src/nn/layers/core.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | 4 | from nn.initializers import Zero, One, Identity, Uniform, Normal, Xavier, Orthonormal 5 | from nn.activations import sigmoid, tanh, relu, softmax 6 | 7 | 8 | class Unit(object): 9 | def __init__(self, name='unit'): 10 | self.name = name 11 | 12 | @staticmethod 13 | def _set_param(shape, init_type=None, name=None): 14 | if init_type == 'zero': 15 | init = Zero() 16 | elif init_type == 'one': 17 | init = One() 18 | elif init_type == 'xavier': 19 | init = Xavier() 20 | elif init_type == 'orth': 21 | init = Orthonormal() 22 | elif init_type == 'identity': 23 | init = Identity() 24 | elif init_type == 'uniform': 25 | init = Uniform() 26 | else: 27 | init = Normal() 28 | return init(shape=shape, name=name) 29 | 30 | @staticmethod 31 | def _set_activation(activation_type): 32 | if activation_type == 'sigmoid': 33 | return sigmoid 34 | elif activation_type == 'tanh': 35 | return tanh 36 | elif activation_type == 'relu': 37 | return relu 38 | elif activation_type == 'softmax': 39 | return softmax 40 | return None 41 | 42 | 43 | class Dense(Unit): 44 | def __init__(self, 45 | input_dim, 46 | output_dim, 47 | activation=None, 48 | use_bias=True, 49 | weight_init='xavier', 50 | bias_init='zero'): 51 | super(Dense, self).__init__(name='Dense(%dx%d,%s)' % (input_dim, output_dim, activation)) 52 | 53 | self.W = self._set_param(shape=(input_dim, 
output_dim), 54 | init_type=weight_init, 55 | name='W_dense') 56 | if use_bias: 57 | self.b = self._set_param(shape=output_dim, 58 | init_type=bias_init, 59 | name='b_dense') 60 | self.params = [self.W, self.b] 61 | else: 62 | self.b = None 63 | self.params = [self.W] 64 | 65 | self.activation = self._set_activation(activation) 66 | 67 | def forward(self, x): 68 | h = T.dot(x, self.W) 69 | if self.b: 70 | h = h + self.b 71 | if self.activation: 72 | h = self.activation(h) 73 | return h 74 | 75 | 76 | class Dropout(Unit): 77 | """ 78 | Reference: [Dropout: A Simple Way to Prevent Neural Networks from Overfitting] 79 | """ 80 | def __init__(self, rate, seed=0): 81 | super(Dropout, self).__init__(name='Dropout(p={:>1.1})'.format(rate)) 82 | self.rate = min(1., max(0., rate)) 83 | self.srng = T.shared_randomstreams.RandomStreams(seed=seed) 84 | 85 | def forward(self, x, is_train): 86 | drop_mask = self.srng.binomial(size=x.shape, n=1, p=1 - self.rate, dtype=theano.config.floatX) 87 | return T.switch(T.eq(is_train, 1), x * drop_mask, x * (1 - self.rate)) 88 | -------------------------------------------------------------------------------- /src/nn/layers/embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | from nn.layers.core import Unit, Dropout 6 | 7 | 8 | class Embedding(Unit): 9 | def __init__(self, 10 | input_dim, 11 | output_dim, 12 | init_emb=None, 13 | param_init='xavier', 14 | param_fix=False, 15 | drop_rate=0.0, 16 | name=None): 17 | super(Embedding, self).__init__(name=name if name else 'Emb(%dx%d)' % (input_dim, output_dim)) 18 | self.dropout = Dropout(drop_rate) 19 | 20 | self.W = self._set_weight(input_dim, output_dim, init_emb, param_init) 21 | if param_fix: 22 | self.params = [] 23 | else: 24 | self.params = [self.W] 25 | 26 | def _set_weight(self, input_dim, output_dim, init_emb, param_init): 27 | if init_emb is None: 28 | return self._set_param(shape=(input_dim, output_dim), 29 | init_type=param_init, 30 | name='embedding') 31 | return theano.shared(init_emb) 32 | 33 | def forward(self, x, is_train=0): 34 | return self.dropout.forward(x=self.W[x], is_train=is_train) 35 | 36 | 37 | class ElmoLayer(Unit): 38 | def __init__(self, drop_rate=0.0, name=None): 39 | super(ElmoLayer, self).__init__(name=name if name else 'ElmoEmb') 40 | self.dropout = Dropout(drop_rate) 41 | 42 | self.gamma = theano.shared(value=np.asarray([[1.0]], theano.config.floatX), 43 | name='gamma', 44 | borrow=True) 45 | self.scalar_mix = theano.shared(value=np.zeros(shape=(1, 3), dtype=theano.config.floatX), 46 | name='scalar_mix', 47 | borrow=True) 48 | self.params = [self.gamma, self.scalar_mix] 49 | 50 | def forward(self, x, is_train=0): 51 | """ 52 | :param x: 1D: batch_size, 2D: n_words, 3D: n_layers, 4D: dim 53 | :param is_train: 0/1 54 | :return: 55 | """ 56 | s = T.nnet.softmax(self.scalar_mix).dimshuffle('x', 'x', 1, 0) 57 | s = T.repeat(s, repeats=x.shape[3], axis=3) 58 | x = self.gamma[0, 0] * T.sum(s * x, axis=2) 59 | return self.dropout.forward(x=x, is_train=is_train) 60 | -------------------------------------------------------------------------------- /src/nn/layers/recurrent.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | 4 | from nn.layers.core import Unit, sigmoid, tanh 5 | 6 | 7 | class LSTM(Unit): 8 | def __init__(self, 9 | input_dim, 10 | output_dim, 11 | use_bias=True, 12 | 
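# 'orth' selects the Orthonormal initializer from nn/initializers.py;
# note that the forget-gate bias b_xf is initialized to one below.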
recurrent_init='orth', 13 | bias_init='zero'): 14 | super(LSTM, self).__init__(name='LSTM(%dx%d)' % (input_dim, output_dim)) 15 | 16 | self.input_dim = input_dim 17 | self.output_dim = output_dim 18 | 19 | # input gate parameters 20 | self.W_xi = self._set_param(shape=(input_dim, output_dim), 21 | init_type=recurrent_init, 22 | name='W_xi') 23 | self.W_hi = self._set_param(shape=(output_dim, output_dim), 24 | init_type=recurrent_init, 25 | name='W_hi') 26 | self.W_ci = self._set_param(shape=output_dim, 27 | init_type='xavier', 28 | name='W_ci') 29 | 30 | # forget gate parameters 31 | self.W_xf = self._set_param(shape=(input_dim, output_dim), 32 | init_type=recurrent_init, 33 | name='W_xf') 34 | self.W_hf = self._set_param(shape=(output_dim, output_dim), 35 | init_type=recurrent_init, 36 | name='W_hf') 37 | self.W_cf = self._set_param(shape=output_dim, 38 | init_type='xavier', 39 | name='W_cf') 40 | 41 | # cell parameters 42 | self.W_xc = self._set_param(shape=(input_dim, output_dim), 43 | init_type=recurrent_init, 44 | name='W_xc') 45 | self.W_hc = self._set_param(shape=(output_dim, output_dim), 46 | init_type=recurrent_init, 47 | name='W_hc') 48 | 49 | # output gate parameters 50 | self.W_xo = self._set_param(shape=(input_dim, output_dim), 51 | init_type=recurrent_init, 52 | name='W_xo') 53 | self.W_ho = self._set_param(shape=(output_dim, output_dim), 54 | init_type=recurrent_init, 55 | name='W_ho') 56 | self.W_co = self._set_param(shape=output_dim, 57 | init_type='xavier', 58 | name='W_co') 59 | 60 | if use_bias: 61 | self.b_xi = self._set_param(shape=output_dim, 62 | init_type=bias_init, 63 | name='b_xi') 64 | self.b_xf = self._set_param(shape=output_dim, 65 | init_type='one', 66 | name='b_xf') 67 | self.b_xc = self._set_param(shape=output_dim, 68 | init_type=bias_init, 69 | name='b_xc') 70 | self.b_xo = self._set_param(shape=output_dim, 71 | init_type=bias_init, 72 | name='b_xo') 73 | self.params = [self.W_xi, self.W_hi, self.W_ci, self.W_xf, self.W_hf, self.W_cf, 74 | self.W_xc, self.W_hc, self.W_xo, self.W_ho, self.W_co, 75 | self.b_xi, self.b_xf, self.b_xc, self.b_xo] 76 | else: 77 | self.b_xi = None 78 | self.b_xf = None 79 | self.b_xc = None 80 | self.b_xo = None 81 | self.params = [self.W_xi, self.W_hi, self.W_ci, self.W_xf, self.W_hf, self.W_cf, 82 | self.W_xc, self.W_hc, self.W_xo, self.W_ho, self.W_co] 83 | 84 | def _step(self, xi_t, xf_t, xc_t, xo_t, h_tm1, c_tm1): 85 | i_t = sigmoid(xi_t + T.dot(h_tm1, self.W_hi) + c_tm1 * self.W_ci) 86 | f_t = sigmoid(xf_t + T.dot(h_tm1, self.W_hf) + c_tm1 * self.W_cf) 87 | c_t = f_t * c_tm1 + i_t * tanh(xc_t + T.dot(h_tm1, self.W_hc)) 88 | o_t = sigmoid(xo_t + T.dot(h_tm1, self.W_ho) + c_t * self.W_co) 89 | h_t = o_t * tanh(c_t) 90 | return h_t, c_t 91 | 92 | def forward(self, x, h0=None, mask=None): 93 | xi = T.dot(x, self.W_xi) + self.b_xi 94 | xf = T.dot(x, self.W_xf) + self.b_xf 95 | xc = T.dot(x, self.W_xc) + self.b_xc 96 | xo = T.dot(x, self.W_xo) + self.b_xo 97 | 98 | inputs = [xi, xf, xc, xo] 99 | 100 | if h0 is None: 101 | h0 = T.zeros(shape=(x[0].shape[0], self.output_dim), dtype=theano.config.floatX) 102 | c0 = T.zeros(shape=(x[0].shape[0], self.output_dim), dtype=theano.config.floatX)  # note: c0 only exists in this branch; callers in this repo never pass h0 103 | 104 | [h, _], _ = theano.scan(fn=self._step, 105 | sequences=inputs, 106 | outputs_info=[h0, c0]) 107 | return h 108 | -------------------------------------------------------------------------------- /src/nn/layers/seqlabel.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import
theano.tensor as T 3 | 4 | from nn.layers.core import Unit 5 | from nn.utils import logsumexp 6 | 7 | 8 | class SeqLabelAlg(Unit): 9 | def __init__(self, name='SeqLabelModel'): 10 | super(SeqLabelAlg, self).__init__(name=name) 11 | 12 | def viterbi(self, emit_scores, trans_scores): 13 | """ 14 | :param emit_scores: 1D: n_words, 2D: batch_size, 3D: n_labels 15 | :param trans_scores: 1D: n_words, 2D: n_labels 16 | :return: 1D: n_words; 2D: batch_size, elem=label id 17 | """ 18 | [scores, labels], _ = theano.scan(fn=self._viterbi_forward, 19 | sequences=[emit_scores[1:]], 20 | outputs_info=[emit_scores[0], None], 21 | non_sequences=trans_scores) 22 | 23 | label_max_last = T.argmax(scores[-1], axis=1) 24 | labels_max, _ = theano.scan(fn=self._viterbi_backward, 25 | sequences=labels[::-1], 26 | outputs_info=label_max_last) 27 | 28 | y = T.zeros(shape=(emit_scores.shape[0], emit_scores.shape[1]), dtype='int32') 29 | y = T.set_subtensor(y[-1], label_max_last) 30 | y = T.set_subtensor(y[:-1], labels_max[::-1]) 31 | return y 32 | 33 | @staticmethod 34 | def _viterbi_forward(e_t, score_prev, trans): 35 | """ 36 | :param e_t: 1D: batch_size, 2D: n_labels 37 | :param score_prev: 1D: batch_size, 2D: n_labels 38 | :param trans: 1D: n_labels, 2D, n_labels 39 | :return: max_scores_t: 1D: batch_size, 2D: n_labels 40 | :return: max_labels_t: 1D: batch_size, 2D: n_labels 41 | """ 42 | score = score_prev.dimshuffle(0, 'x', 1) + trans + e_t.dimshuffle(0, 1, 'x') 43 | max_scores_t, max_labels_t = T.max_and_argmax(score, axis=2) 44 | return max_scores_t, max_labels_t 45 | 46 | @staticmethod 47 | def _viterbi_backward(labels_t, label_max): 48 | """ 49 | :param labels_t: 1D: batch_size, 2D: n_labels; elem=label id 50 | :param label_max: 1D: batch_size; elem=label id 51 | :return: 1D: batch_size; elem=label id 52 | """ 53 | return labels_t[T.arange(labels_t.shape[0]), label_max] 54 | 55 | 56 | class CRF(SeqLabelAlg): 57 | def __init__(self, 58 | input_dim, 59 | output_dim, 60 | use_bias=True, 61 | weight_init='xavier', 62 | bias_init='zero'): 63 | super(CRF, self).__init__(name='CRF(%dx%d)' % (input_dim, output_dim)) 64 | self.W = self._set_param(shape=(input_dim, output_dim), 65 | init_type=weight_init, 66 | name='W_crf') 67 | self.W_t = self._set_param(shape=(output_dim, output_dim), 68 | init_type=weight_init, 69 | name='W_tran_crf') 70 | 71 | if use_bias: 72 | self.b = self._set_param(shape=output_dim, 73 | init_type=bias_init, 74 | name='b_crf') 75 | self.params = [self.W, self.W_t, self.b] 76 | else: 77 | self.b = None 78 | self.params = [self.W, self.W_t] 79 | 80 | def forward(self, x): 81 | emit_scores = T.dot(x, self.W) 82 | if self.b: 83 | emit_scores = emit_scores + self.b 84 | return emit_scores 85 | 86 | def get_y_proba(self, emit_scores, y_true): 87 | """ 88 | :param emit_scores: 1D: n_words, 2D: batch_size, 3D: n_labels 89 | :param y_true: 1D: n_words, 2D: batch_size 90 | :return: 1D: batch_size; elem=log probability 91 | """ 92 | # 1D: batch_size, 2D: n_labels 93 | z_score0 = emit_scores[0] 94 | # 1D: batch_size; elem=path score 95 | y_score0 = z_score0[T.arange(z_score0.shape[0]), y_true[0]] 96 | 97 | inputs = [emit_scores[1:], y_true[1:]] 98 | [_, y_scores, z_scores], _ = theano.scan(fn=self._forward_step, 99 | sequences=inputs, 100 | outputs_info=[y_true[0], y_score0, z_score0], 101 | non_sequences=self.W_t) 102 | 103 | y_score = y_scores[-1] 104 | z_score = logsumexp(z_scores[-1], axis=1).flatten() 105 | 106 | return y_score - z_score 107 | 108 | @staticmethod 109 | def _forward_step(h_t, y_t, 
y_prev, y_score_prev, z_score_prev, trans): 110 | """ 111 | :param h_t: 1D: batch_size, 2D: n_labels 112 | :param y_t: 1D: batch_size 113 | :param y_prev: 1D: batch_size 114 | :param y_score_prev: 1D: batch_size 115 | :param z_score_prev: 1D: batch_size, 2D: n_labels 116 | :param trans: 1D: n_labels, 2D, n_labels 117 | """ 118 | # 1D: batch_size 119 | y_score_t = y_score_prev + trans[y_t, y_prev] + h_t[T.arange(h_t.shape[0]), y_t] 120 | # 1D: batch_size, 2D: n_labels, 3D: n_labels 121 | z_sum = z_score_prev.dimshuffle(0, 'x', 1) + trans 122 | # 1D: batch_size, 2D: n_labels 123 | z_score_t = logsumexp(z_sum, axis=2).reshape(h_t.shape) + h_t 124 | return y_t, y_score_t, z_score_t 125 | 126 | def get_y_pred(self, emit_scores): 127 | """ 128 | :param emit_scores: 1D: n_words, 2D: batch_size, 3D: n_labels 129 | :return: 1D: batch_size, 2D: n_words; elem=label id 130 | """ 131 | return self.viterbi(emit_scores=emit_scores, trans_scores=self.W_t).dimshuffle(1, 0) 132 | -------------------------------------------------------------------------------- /src/nn/layers/stack.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | from nn.layers.core import Dense, Dropout 4 | from nn.layers.recurrent import LSTM 5 | 6 | 7 | class StackLayer(object): 8 | def __init__(self, name='StackLayer'): 9 | self.name = name 10 | self.layers = [] 11 | self.params = [] 12 | 13 | def _set_layers(self): 14 | raise NotImplementedError 15 | 16 | @staticmethod 17 | def _set_rnn_unit(unit_type): 18 | return LSTM 19 | 20 | @staticmethod 21 | def _set_connect_unit(connect_type): 22 | return Dense 23 | 24 | def _set_params(self): 25 | params = [] 26 | for layer in self.layers: 27 | params.extend(layer.params) 28 | return params 29 | 30 | def forward(self, x, **kwargs): 31 | raise NotImplementedError 32 | 33 | 34 | class BiRNNLayer(StackLayer): 35 | def __init__(self, 36 | input_dim, 37 | output_dim, 38 | n_layers, 39 | unit_type, 40 | connect_type, 41 | drop_rate=0.0): 42 | name = 'BiRNNs-%d:(%dx%d)' % (n_layers, input_dim, output_dim) 43 | super(BiRNNLayer, self).__init__(name=name) 44 | 45 | self.input_dim = input_dim 46 | self.output_dim = output_dim 47 | self.n_layers = n_layers 48 | self.rnn_unit = self._set_rnn_unit(unit_type) 49 | self.connect_unit = self._set_connect_unit(connect_type) 50 | self.dropout = Dropout(drop_rate) 51 | 52 | self.layers = self._set_layers() 53 | self.params = self._set_params() 54 | 55 | def _set_layers(self): 56 | layers = [] 57 | for i in range(self.n_layers): 58 | if i == 0: 59 | rnn_input_dim = self.input_dim 60 | connect_input_dim = self.input_dim + self.output_dim 61 | else: 62 | rnn_input_dim = self.output_dim 63 | connect_input_dim = self.output_dim * 2 64 | 65 | r_unit = self.rnn_unit(input_dim=rnn_input_dim, 66 | output_dim=self.output_dim) 67 | c_unit = self.connect_unit(input_dim=connect_input_dim, 68 | output_dim=self.output_dim, 69 | activation='relu') 70 | layers += [r_unit, c_unit] 71 | return layers 72 | 73 | def forward(self, x, mask=None, is_train=False): 74 | n_layers = int(len(self.layers) / 2) 75 | for i in range(n_layers): 76 | if mask is None: 77 | h = self.layers[i * 2].forward(x=x) 78 | h = self.dropout.forward(x=h, is_train=is_train) 79 | x = self.layers[i * 2 + 1].forward(T.concatenate([x, h], axis=2)) 80 | else: 81 | h = self.layers[i * 2].forward(x=x, mask=mask) 82 | h = self.dropout.forward(x=h, is_train=is_train) 83 | x = self.layers[i * 2 + 1].forward(T.concatenate([x, h], axis=2)) * mask 84 | 
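# Reversing the sequence (and mask) between layers makes each LSTM layer
# read its input in the opposite direction of the layer below, giving an
# interleaved bidirectional stack; the final [::-1] in forward() restores
# the original word order when the number of layers is odd.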
mask = mask[::-1] 85 | x = x[::-1] 86 | if (n_layers % 2) == 1: 87 | return x[::-1] 88 | return x 89 | -------------------------------------------------------------------------------- /src/nn/losses.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def binary_cross_entropy(output, target): 5 | return T.nnet.binary_crossentropy(output=output, target=target) 6 | 7 | 8 | def negative_log_likelihood(y_proba, y_true=None): 9 | """ 10 | :param y_proba: 1D: batch_size, 2D: n_words, 3D: n_words; elem=word id 11 | :param y_true: 1D: batch_size, 2D: n_words; elem=word id 12 | """ 13 | if y_true: 14 | y_true_flatten = y_true.flatten() 15 | y_proba = y_proba.reshape((y_proba.shape[0] * y_proba.shape[1], y_proba.shape[2])) 16 | nll = - T.sum(T.log(y_proba[T.arange(y_true_flatten.shape[0]), y_true_flatten]).reshape(y_true.shape), axis=1) 17 | else: 18 | nll = - y_proba 19 | return nll 20 | -------------------------------------------------------------------------------- /src/nn/metrics.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def categorical_accuracy(y_true, y_pred): 5 | return T.sum(T.eq(y_true, y_pred)) 6 | 7 | 8 | def log_likelihood(y_true, y_proba): 9 | y_true = y_true.flatten() 10 | y_proba = y_proba.reshape((y_proba.shape[0] * y_proba.shape[1], -1)) 11 | return T.sum(T.log(y_proba[T.arange(y_true.shape[0]), y_true])) 12 | -------------------------------------------------------------------------------- /src/nn/optimizers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | from utils.savers import save_pickle 6 | from utils.loaders import load_pickle 7 | 8 | 9 | def get_optimizer(argv): 10 | if argv.opt_type == 'adam': 11 | return Adam(argv=argv, lr=argv.lr, grad_clip=argv.grad_clip) 12 | return SGD(argv=argv, lr=argv.lr, grad_clip=argv.grad_clip) 13 | 14 | 15 | class Optimizer(object): 16 | def __init__(self, **kwargs): 17 | self.argv = kwargs['argv'] 18 | self.grad_clip = kwargs['grad_clip'] 19 | self.params = [] 20 | 21 | def __call__(self, grads, params): 22 | raise NotImplementedError 23 | 24 | def set_params(self, **kwargs): 25 | raise NotImplementedError 26 | 27 | def init_params(self): 28 | for p in self.params: 29 | p.set_value(p.get_value(borrow=True) * 0) 30 | 31 | @staticmethod 32 | def _grad_clipping(gradients, max_norm=5.0): 33 | global_grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gradients))) 34 | multiplier = T.switch(global_grad_norm < max_norm, 1.0, max_norm / global_grad_norm) 35 | return [g * multiplier for g in gradients] 36 | 37 | def save_params(self, epoch=0): 38 | argv = self.argv 39 | if argv.output_dir: 40 | dir_name = argv.output_dir 41 | else: 42 | dir_name = 'output' 43 | if argv.output_fn: 44 | file_name = '/opt.param.%s.epoch-%d' % (argv.output_fn, epoch) 45 | else: 46 | file_name = '/opt.param.%s.epoch-%d' % (argv.method, epoch) 47 | 48 | fn = dir_name + file_name 49 | params = [p.get_value(borrow=True) for p in self.params] 50 | save_pickle(fn=fn, data=params) 51 | 52 | def load_params(self, path): 53 | params = load_pickle(path) 54 | assert len(self.params) == len(params) 55 | for p1, p2 in zip(self.params, params): 56 | p1.set_value(p2) 57 | 58 | 59 | class SGD(Optimizer): 60 | def __init__(self, lr=0.001, **kwargs): 61 | super(SGD, self).__init__(**kwargs) 62 | self.lr = 
theano.shared(np.asarray(lr, dtype=theano.config.floatX), borrow=True) 63 | 64 | def __call__(self, params, grads): 65 | updates = [] 66 | if self.grad_clip: 67 | grads = self._grad_clipping(grads, max_norm=1.0) 68 | for p, g in zip(params, grads): 69 | updates.append((p, p - self.lr * g)) 70 | return updates 71 | 72 | def set_params(self): 73 | pass 74 | 75 | 76 | class Adam(Optimizer): 77 | def __init__(self, lr=0.001, b1=0.9, b2=0.999, eps=1e-8, **kwargs): 78 | super(Adam, self).__init__(**kwargs) 79 | self.lr = theano.shared(np.asarray(lr, dtype=theano.config.floatX), borrow=True) 80 | self.b1 = b1 81 | self.b2 = b2 82 | self.eps = eps 83 | 84 | def __call__(self, params, grads): 85 | updates = [] 86 | 87 | i = self.params[0]  # update-step counter t 88 | i_t = i + 1. 89 | a_t = self.lr * T.sqrt(1 - self.b2 ** i_t) / (1 - self.b1 ** i_t) 90 | 91 | if self.grad_clip: 92 | grads = self._grad_clipping(grads, max_norm=1.0) 93 | 94 | for index, (p, g) in enumerate(zip(params, grads)): 95 | v = self.params[2 * index + 1] 96 | r = self.params[2 * index + 2] 97 | # self.params is laid out as [i, v_0, r_0, v_1, r_1, ...] (see set_params below) 98 | 99 | v_t = self.b1 * v + (1. - self.b1) * g 100 | r_t = self.b2 * r + (1. - self.b2) * g ** 2 101 | 102 | step = a_t * v_t / (T.sqrt(r_t) + self.eps) 103 | 104 | updates.append((v, v_t)) 105 | updates.append((r, r_t)) 106 | updates.append((p, p - step)) 107 | 108 | updates.append((i, i_t)) 109 | return updates 110 | 111 | def set_params(self, params): 112 | i = theano.shared(np.asarray(.0, dtype=theano.config.floatX)) 113 | self.params.append(i) 114 | for p in params: 115 | p_tm = p.get_value(borrow=True) 116 | v = theano.shared(np.zeros(p_tm.shape, dtype=p_tm.dtype)) 117 | r = theano.shared(np.zeros(p_tm.shape, dtype=p_tm.dtype)) 118 | self.params += [v, r] 119 | -------------------------------------------------------------------------------- /src/nn/regularizers.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | class Regularizer(object): 5 | def __call__(self, **kwargs): 6 | raise NotImplementedError 7 | 8 | 9 | class L2Regularizer(Regularizer): 10 | def __call__(self, alpha, params): 11 | return alpha * l2_sqr(params) / 2.
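# Usage sketch, mirroring how srl/model_api.py adds the L2 term to the
# negative log-likelihood (`nll`, `argv.reg`, and `model.params` are the
# names used there):
#   l2_reg = L2Regularizer()
#   objective = nll + l2_reg(alpha=argv.reg, params=model.params)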
12 | 13 | 14 | def l2_sqr(params): 15 | sqr = 0.0 16 | for p in params: 17 | sqr += T.sum((p ** 2)) 18 | return sqr 19 | -------------------------------------------------------------------------------- /src/nn/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | 4 | 5 | def normalize_3d(x, eps=1e-8): 6 | l2 = x.norm(2, axis=2).dimshuffle((0, 1, 'x')) 7 | return x / (l2 + eps) 8 | 9 | 10 | def logsumexp(x, axis, keepdim=True): 11 | """ 12 | :param x: 1D: batch, 2D: n_y, 3D: n_y 13 | :return: 1D: batch, 2D: n_y, 3D: n_y 14 | """ 15 | x_max = T.max(x, axis=axis, keepdims=True) 16 | if keepdim: 17 | return T.log(T.sum(T.exp(x - x_max), axis=axis, keepdims=keepdim)) + x_max 18 | return T.log(T.sum(T.exp(x - x_max), axis=axis)) + x_max.dimshuffle(0) 19 | 20 | 21 | def logsumexp3d(x, axis=2): 22 | # 1D: batch_size, 2D: n_labels, 3D: 1 23 | x_max = T.max(x, axis=axis, keepdims=True) 24 | # 1D: batch_size, 2D: n_labels 25 | return T.log(T.sum(T.exp(x - x_max), axis=axis)) + x_max.dimshuffle(0, 1) 26 | 27 | 28 | def log0(x): 29 | return T.switch(T.eq(x, 0.0), 0.0, T.log(x)) 30 | 31 | 32 | def frobenius_norm(matrix): 33 | if type(matrix) is list: 34 | return T.sqrt(T.sum(map(lambda m: T.sum(m ** 2), matrix))) 35 | return T.sqrt(T.maximum(T.sum(T.sqr(matrix)), 1e-8)) 36 | 37 | 38 | def np_frobenius_norm(matrix): 39 | return np.sqrt(np.sum(matrix**2)) 40 | 41 | 42 | def layer_normalization(x, axis=1, eps=1e-8): 43 | return (x - x.mean(axis=axis, keepdims=True)) / T.sqrt((x.var(axis=axis, keepdims=True) + eps)) 44 | -------------------------------------------------------------------------------- /src/srl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiroki13/span-based-srl/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/srl/__init__.py -------------------------------------------------------------------------------- /src/srl/decoders.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from itertools import combinations_with_replacement as comb 3 | 4 | from utils.misc import span_to_span_index 5 | 6 | 7 | class Decoder(object): 8 | def __init__(self, argv, vocab_label): 9 | self.argv = argv 10 | self.core_label_ids = self.set_core_labels(vocab_label) 11 | self.span_list = None 12 | 13 | def set_core_labels(self, vocab_label): 14 | if self.argv.data_type == 'conll05': 15 | core_labels = ["A0", "A1", "A2", "A3", "A4", "A5"] 16 | else: 17 | core_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"] 18 | return [vocab_label.get_id(label) 19 | for label in core_labels 20 | if vocab_label.has_key(label)] 21 | 22 | def argmax_span_triples(self, span_indices, marks): 23 | """ 24 | :param span_indices: 1D: batch_size, 2D; n_labels; span index 25 | :param marks: 1D: batch_size, 2D; n_words 26 | :return: 1D: batch_size, 2D: n_spans; [r, i, j] 27 | """ 28 | n_words = len(marks[0]) 29 | self.span_list = list(comb(range(n_words), 2)) 30 | return [self._argmax_search(span_indices_i, mark) 31 | for span_indices_i, mark in zip(span_indices, marks)] 32 | 33 | def _argmax_search(self, span_indices, mark): 34 | spans = [] 35 | prd_index = mark.nonzero()[0][0] 36 | for r, span_index in enumerate(span_indices): 37 | (i, j) = self.span_list[span_index] 38 | if i <= prd_index <= j: 39 | continue 40 | spans.append([r, i, j]) 41 | return spans 42 | 43 | def greedy_span_triples(self, scores, 
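# Greedy decoding: candidate spans are sorted by score and accepted only
# if they do not overlap an already-selected span and, for core labels,
# the label has not been used yet (see _greedy_search below).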
marks): 44 | """ 45 | :param scores: 1D: batch_size, 2D; n_labels, 3D: n_spans; score 46 | :param marks: 1D: batch_size, 2D; n_words 47 | :return: 1D: batch_size, 2D: n_spans; [r, i, j] 48 | """ 49 | n_words = len(marks[0]) 50 | self.span_list = list(comb(range(n_words), 2)) 51 | return [self._greedy_search(score, mark) 52 | for score, mark in zip(scores, marks)] 53 | 54 | def _greedy_search(self, scores, mark): 55 | """ 56 | :param scores: 1D: n_labels, 2D: n_spans; score 57 | :param mark: 1D: n_words; elem=0/1 58 | :return: 1D: n_spans, 2D: [r, i, j] 59 | """ 60 | triples = [] 61 | used_words = deepcopy(mark) 62 | used_labels = [] 63 | 64 | n_words = len(mark) 65 | prd_index = mark.nonzero()[0][0] 66 | prd_span_index = span_to_span_index(i=prd_index, 67 | j=prd_index, 68 | n_words=n_words) 69 | spans = self._sort_spans(scores=scores, 70 | prd_index=prd_index, 71 | prd_span_index=prd_span_index) 72 | 73 | for (r, i, j, _) in spans: 74 | if r in used_labels: 75 | continue 76 | if used_words[i: j + 1].sum() > 0: 77 | continue 78 | 79 | triples.append([r, i, j]) 80 | 81 | used_words[i: j + 1] = 1 82 | if r in self.core_label_ids: 83 | used_labels.append(r) 84 | 85 | return triples 86 | 87 | def _sort_spans(self, scores, prd_index, prd_span_index): 88 | """ 89 | :param scores: 1D: n_labels, 2D: n_spans; score 90 | :return: 1D: n_labels, 2D: n_words * n_words; elem=(r, i, j, score) 91 | """ 92 | spans = [] 93 | for r, scores_row in enumerate(scores): 94 | score_prd = scores_row[prd_span_index] 95 | for index, score in enumerate(scores_row): 96 | (i, j) = self.span_list[index] 97 | if i <= prd_index <= j: 98 | continue 99 | if score_prd < score: 100 | spans.append((r, i, j, score)) 101 | spans.sort(key=lambda span: span[-1], reverse=True) 102 | return spans 103 | -------------------------------------------------------------------------------- /src/srl/model_api.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import math 4 | import glob 5 | 6 | import numpy as np 7 | import theano 8 | import theano.tensor as T 9 | 10 | from srl.models import SpanModel, MoEModel, CRFModel 11 | from srl.decoders import Decoder 12 | from nn.regularizers import L2Regularizer 13 | from nn.optimizers import get_optimizer 14 | from utils.evaluators import f_score, correct_and_pred_spans, metrics_for_bio 15 | from utils.savers import save_pickle 16 | from utils.loaders import load_pickle 17 | from utils.misc import write 18 | 19 | 20 | class SpanModelAPI(object): 21 | def __init__(self, argv): 22 | self.argv = argv 23 | 24 | self.model = None 25 | self.experts = None 26 | self.train_func = None 27 | self.pred_func = None 28 | 29 | self.vocab_word = None 30 | self.vocab_label = None 31 | self.vocab_label_valid = None 32 | 33 | self.input_dim = None 34 | self.hidden_dim = None 35 | self.output_dim = None 36 | self.use_elmo = None 37 | 38 | self.decoder = None 39 | self.optimizer = None 40 | 41 | self.n_true_spans = 0. 
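# Typical lifecycle, as wired up in src/main.py: set_model() builds the
# network, set_train_func()/set_pred_func() compile the Theano functions,
# and Trainer/Tester then feed preprocessed batches through them.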
42 | 43 | def set_model(self, **kwargs): 44 | write('Setting a model...') 45 | argv = self.argv 46 | 47 | self.vocab_word = kwargs['vocab_word'] 48 | self.use_elmo = kwargs['use_elmo'] 49 | self.vocab_label = kwargs['vocab_label'] 50 | self.vocab_label_valid = kwargs['vocab_label_valid'] 51 | word_emb = kwargs['word_emb'] 52 | vocab_word_size = self.vocab_word.size() if self.vocab_word else 0 53 | 54 | self.input_dim = argv.emb_dim if word_emb is None else word_emb.shape[1] 55 | self.hidden_dim = argv.hidden_dim 56 | self.output_dim = -1 57 | 58 | self.decoder = Decoder(argv=argv, vocab_label=self.vocab_label) 59 | 60 | self.model = SpanModel() 61 | self.model.compile(inputs=self._set_inputs(), 62 | vocab_word_size=vocab_word_size, 63 | use_elmo=self.use_elmo, 64 | word_emb=word_emb, 65 | input_dim=[self.input_dim, self.input_dim], 66 | hidden_dim=self.hidden_dim, 67 | feat_dim=2 * self.hidden_dim, 68 | output_dim=self.vocab_label.size(), 69 | n_layers=argv.n_layers, 70 | drop_rate=argv.drop_rate) 71 | 72 | write('\t- {}'.format("\n\t- ".join([l.name for l in self.model.layers]))) 73 | self._show_model_config() 74 | 75 | def set_ensemble_model(self, **kwargs): 76 | write('Setting a model...') 77 | argv = self.argv 78 | 79 | self.vocab_word = kwargs['vocab_word'] 80 | self.use_elmo = kwargs['use_elmo'] 81 | self.vocab_label = kwargs['vocab_label'] 82 | self.vocab_label_valid = kwargs['vocab_label_valid'] 83 | word_emb = kwargs['word_emb'] 84 | vocab_word_size = self.vocab_word.size() if self.vocab_word else 0 85 | 86 | self.input_dim = argv.emb_dim if word_emb is None else word_emb.shape[1] 87 | self.hidden_dim = argv.hidden_dim 88 | self.output_dim = -1 89 | 90 | self.decoder = Decoder(argv=argv, vocab_label=self.vocab_label) 91 | 92 | ################# 93 | # Set MoE model # 94 | ################# 95 | inputs = self._set_inputs() 96 | self.model = MoEModel() 97 | self.model.compile(inputs=inputs, 98 | feat_dim=2 * self.hidden_dim, 99 | output_dim=self.vocab_label.size(), 100 | drop_rate=argv.drop_rate, 101 | n_experts=argv.n_experts) 102 | write('\t- {}\n'.format("\n\t- ".join([l.name for l in self.model.layers]))) 103 | 104 | ############### 105 | # Set experts # 106 | ############### 107 | experts = [] 108 | for _ in range(argv.n_experts): 109 | model = SpanModel() 110 | model.compile(inputs=self.model.inputs, 111 | vocab_word_size=vocab_word_size, 112 | use_elmo=self.use_elmo, 113 | input_dim=[self.input_dim, self.input_dim], 114 | hidden_dim=self.hidden_dim, 115 | feat_dim=2 * self.hidden_dim, 116 | output_dim=self.vocab_label.size(), 117 | n_layers=argv.n_layers, 118 | word_emb=word_emb, 119 | drop_rate=argv.drop_rate) 120 | write('\t- {}\n'.format("\n\t- ".join([l.name for l in model.layers]))) 121 | experts.append(model) 122 | 123 | self.experts = experts 124 | 125 | def _set_inputs(self): 126 | x = [] 127 | if self.vocab_word: 128 | x.append(T.imatrix('x_word')) 129 | if self.use_elmo: 130 | x.append(T.ftensor4('x_elmo')) 131 | x.append(T.imatrix('x_mark')) 132 | assert len(x) > 1 133 | return x 134 | 135 | def _show_model_config(self): 136 | model = self.model 137 | write('Model configuration') 138 | write('\t- Input Dim: {}'.format(self.input_dim)) 139 | write('\t- Hidden Dim: {}'.format(self.hidden_dim)) 140 | write('\t- Output Dim: {}'.format(self.output_dim)) 141 | write('\t- Parameters: {}'.format(sum(len(x.get_value(borrow=True).ravel()) 142 | for x in model.params))) 143 | 144 | def save_params(self, epoch=-1): 145 | argv = self.argv 146 | if argv.output_dir: 147 | 
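# fall back to the default 'output' directory when --output_dir is not set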
dir_name = argv.output_dir 148 | else: 149 | dir_name = 'output' 150 | if argv.output_fn: 151 | file_name = '/param.%s.epoch-%d' % (argv.output_fn, epoch) 152 | else: 153 | file_name = '/param.epoch-%d' % epoch 154 | 155 | fn = dir_name + file_name 156 | params = [p.get_value(borrow=True) for p in self.model.params] 157 | save_pickle(fn=fn, data=params) 158 | 159 | def load_params(self, path): 160 | params = load_pickle(path) 161 | assert len(self.model.params) == len(params) 162 | for p1, p2 in zip(self.model.params, params): 163 | p1.set_value(p2) 164 | 165 | def load_experts_params(self, path): 166 | write('Loading experts params...') 167 | param_files = glob.glob(path + '/*') 168 | param_files = [fn for fn in param_files 169 | if fn.split('/')[-1].startswith('param')] 170 | write("\t - Param Files: %s" % str(param_files)) 171 | for i, path in enumerate(param_files[:self.argv.n_experts]): 172 | params = load_pickle(path) 173 | assert len(self.experts[i].params) == len(params) 174 | for p1, p2 in zip(self.experts[i].params, params): 175 | p1.set_value(p2) 176 | 177 | def set_init_ensemble_param(self): 178 | write('Initializing params...') 179 | W = np.zeros(shape=(2 * self.hidden_dim, self.vocab_label.size()), 180 | dtype=theano.config.floatX) 181 | b = np.zeros(shape=self.vocab_label.size(), 182 | dtype=theano.config.floatX) 183 | for model in self.experts: 184 | W += model.params[-2].get_value(borrow=True) 185 | for model in self.experts: 186 | b += model.params[-1].get_value(borrow=True) 187 | W = W / len(self.experts) 188 | b = b / len(self.experts) 189 | self.model.params[-2].set_value(W) 190 | self.model.params[-1].set_value(b) 191 | 192 | def set_train_func(self): 193 | write('Building a training function...') 194 | 195 | self.optimizer = get_optimizer(self.argv) 196 | self.optimizer.set_params(self.model.params) 197 | if self.argv.load_opt_param: 198 | self.optimizer.load_params(self.argv.load_opt_param) 199 | 200 | # 1D: batch_size * n_spans, 2D: [batch index, label id, span index] 201 | span_true = T.imatrix('span_true') 202 | 203 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 204 | h_span = self.model.span_feats(inputs=self.model.inputs) 205 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 206 | span_score = self.model.label_layer.logit_scores(h=h_span) 207 | # 1D: batch_size, 2D: n_labels; label id 208 | span_pred = self.model.argmax_span(span_score=span_score) 209 | 210 | nll = self.model.loss(span_score, span_true) 211 | l2_reg = L2Regularizer() 212 | objective = nll + l2_reg(alpha=self.argv.reg, 213 | params=self.model.params) 214 | 215 | grads = T.grad(cost=objective, wrt=self.model.params) 216 | updates = self.optimizer(grads=grads, params=self.model.params) 217 | 218 | self.train_func = theano.function( 219 | inputs=self.model.inputs + [span_true], 220 | outputs=[objective, span_pred], 221 | updates=updates, 222 | mode='FAST_RUN' 223 | ) 224 | 225 | def set_pred_func(self): 226 | write('Building a predicting function...') 227 | if self.argv.search == 'argmax': 228 | self.set_pred_argmax_func() 229 | else: 230 | self.set_pred_score_func() 231 | 232 | def set_pred_argmax_func(self): 233 | # 1D: batch_size, 2D: n_spans, 3D: hidden_dim 234 | h_span = self.model.span_feats(inputs=self.model.inputs) 235 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 236 | logits = self.model.label_layer.logit_scores(h_span) 237 | # 1D: batch_size, 2D: n_labels; span index 238 | span_pred = self.model.argmax_span(logits) 239 | 240 | self.pred_func = theano.function( 241 | 
inputs=self.model.inputs, 242 | outputs=span_pred, 243 | mode='FAST_RUN' 244 | ) 245 | 246 | def set_pred_score_func(self): 247 | # 1D: batch_size, 2D: n_spans, 3D: hidden_dim 248 | h_span = self.model.span_feats(inputs=self.model.inputs) 249 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 250 | logits = self.model.label_layer.logit_scores(h_span) 251 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 252 | span_score = self.model.exp_score(logits) 253 | 254 | self.pred_func = theano.function( 255 | inputs=self.model.inputs, 256 | outputs=span_score, 257 | mode='FAST_RUN' 258 | ) 259 | 260 | def set_ensemble_train_func(self): 261 | write('Building an ensemble training function...') 262 | 263 | self.optimizer = get_optimizer(self.argv) 264 | self.optimizer.set_params(self.model.params) 265 | if self.argv.load_opt_param: 266 | self.optimizer.load_params(self.argv.load_opt_param) 267 | 268 | # 1D: batch_size * n_spans, 2D: [batch index, label id, span index] 269 | span_true = T.imatrix('span_true') 270 | 271 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 272 | h_span = self.model.feat_layer.forward(self.model.inputs, 273 | self.experts) 274 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 275 | logits = self.model.feat_layer.logit_scores(h=h_span) 276 | # 1D: batch_size, 2D: n_labels; span index 277 | span_pred = self.model.argmax_span(logits) 278 | 279 | nll = self.model.loss(logits, span_true) 280 | l2_reg = L2Regularizer() 281 | objective = nll + l2_reg(alpha=self.argv.reg, 282 | params=self.model.params) 283 | 284 | grads = T.grad(cost=objective, wrt=self.model.params) 285 | updates = self.optimizer(grads=grads, 286 | params=self.model.params) 287 | 288 | self.train_func = theano.function( 289 | inputs=self.model.inputs + [span_true], 290 | outputs=[objective, span_pred], 291 | updates=updates, 292 | mode='FAST_RUN' 293 | ) 294 | 295 | def set_ensemble_pred_func(self): 296 | write('Building an ensemble predicting function...') 297 | if self.argv.search == 'argmax': 298 | self.set_ensemble_pred_argmax_func() 299 | else: 300 | self.set_ensemble_pred_score_func() 301 | 302 | def set_ensemble_pred_argmax_func(self): 303 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 304 | h_span = self.model.feat_layer.forward(self.model.inputs, 305 | self.experts) 306 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 307 | span_score = self.model.feat_layer.logit_scores(h=h_span) 308 | # 1D: batch_size, 2D: n_labels; span index 309 | span_pred = self.model.argmax_span(span_score=span_score) 310 | 311 | self.pred_func = theano.function( 312 | inputs=self.model.inputs, 313 | outputs=span_pred, 314 | mode='FAST_RUN' 315 | ) 316 | 317 | def set_ensemble_pred_score_func(self): 318 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 319 | h_span = self.model.feat_layer.forward(self.model.inputs, 320 | self.experts) 321 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 322 | logits = self.model.feat_layer.logit_scores(h=h_span) 323 | # 1D: batch_size, 2D: n_labels, 3D: n_spans; score 324 | span_score = self.model.exp_score(logits) 325 | 326 | self.pred_func = theano.function( 327 | inputs=self.model.inputs, 328 | outputs=span_score, 329 | mode='FAST_RUN' 330 | ) 331 | 332 | def train(self, batches): 333 | start = time.time() 334 | n_batches = 0. 335 | loss_total = 0. 336 | p_total = 0. 337 | correct = 0. 
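    |         # Training-time F1 bookkeeping: `correct` and `p_total` count predicted
    |         # spans batch by batch, while the recall denominator `self.n_true_spans`
    |         # is precomputed once from the gold spans of the training data
    |         # (utils.evaluators.count_true_spans, set by the trainer).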
338 | 
 339 |         self.model.feat_layer.is_train.set_value(1)
 340 |         if self.experts:
 341 |             for model in self.experts:
 342 |                 model.feat_layer.is_train.set_value(1)
 343 | 
 344 |         for inputs in batches:
 345 |             n_batches += 1
 346 | 
 347 |             if n_batches % 100 == 0:
 348 |                 sys.stdout.write("%d " % n_batches)
 349 |                 sys.stdout.flush()
 350 | 
 351 |             n_words = len(inputs[0][0])
 352 |             if n_words < 2 or 100 < n_words:  # skip one-word and overly long sentences
 353 |                 continue
 354 | 
 355 |             loss, span_pred = self.train_func(*inputs)
 356 | 
 357 |             if math.isnan(loss):
 358 |                 write('\n\nNAN: Index: %d\n' % n_batches)
 359 |                 exit()
 360 | 
 361 |             loss_total += loss
 362 |             correct_i, p_total_i = correct_and_pred_spans(span_true=inputs[-1],
 363 |                                                           span_pred=span_pred,
 364 |                                                           marks=inputs[-2])  # mark ids directly precede span_true, whether or not ELMo is an input
 365 |             correct += correct_i
 366 |             p_total += p_total_i
 367 | 
 368 |         self.model.feat_layer.is_train.set_value(0)
 369 |         if self.experts:
 370 |             for model in self.experts:
 371 |                 model.feat_layer.is_train.set_value(0)
 372 | 
 373 |         avg_loss = loss_total / n_batches
 374 |         p, r, f = f_score(correct, p_total, self.n_true_spans)
 375 | 
 376 |         write('\n\tTime: %f seconds' % (time.time() - start))
 377 |         write('\tAverage Negative Log Likelihood: %f(%f/%d)' % (avg_loss, loss_total, n_batches))
 378 |         write('\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})'.format(
 379 |             f, p, int(correct), int(p_total), r, int(correct), int(self.n_true_spans)))
 380 | 
 381 |     def predict(self, batches):
 382 |         if self.argv.search == 'argmax':
 383 |             return self.predict_argmax(batches)
 384 |         else:
 385 |             return self.predict_greedy(batches)
 386 | 
 387 |     def predict_argmax(self, batches):
 388 |         """
 389 |         :param batches: 1D: n_sents, 2D: n_prds, 3D: n_feats, 4D: n_words; elem=(x_w, x_m)
 390 |         :return: y: 1D: n_sents, 2D: n_prds, 3D: n_spans, 4D: [label_id, pre_index, post_index]
 391 |         """
 392 |         start = time.time()
 393 |         y = []
 394 | 
 395 |         for index, inputs in enumerate(batches):
 396 |             if (index + 1) % 100 == 0:
 397 |                 sys.stdout.write("%d " % (index + 1))
 398 |                 sys.stdout.flush()
 399 | 
 400 |             if len(inputs) == 0:
 401 |                 span_triples = []
 402 |             else:
 403 |                 span_pred = self.pred_func(*inputs)
 404 |                 span_triples = self.decoder.argmax_span_triples(span_indices=span_pred,
 405 |                                                                 marks=inputs[-1])
 406 |             y.append(span_triples)
 407 | 
 408 |         write('\n\tTime: %f seconds' % (time.time() - start))
 409 |         return y
 410 | 
 411 |     def predict_greedy(self, batches):
 412 |         """
 413 |         :param batches: 1D: n_sents, 2D: n_prds, 3D: n_feats, 4D: n_words; elem=(x_w, x_m)
 414 |         :return: y: 1D: n_sents, 2D: n_prds, 3D: n_spans, 4D: [label_id, pre_index, post_index]
 415 |         """
 416 |         start = time.time()
 417 |         y = []
 418 | 
 419 |         for index, inputs in enumerate(batches):
 420 |             if (index + 1) % 100 == 0:
 421 |                 sys.stdout.write("%d " % (index + 1))
 422 |                 sys.stdout.flush()
 423 | 
 424 |             if len(inputs) == 0:
 425 |                 span_triples = []
 426 |             else:
 427 |                 scores = self.pred_func(*inputs)
 428 |                 span_triples = self.decoder.greedy_span_triples(scores=scores,
 429 |                                                                 marks=inputs[-1])
 430 |             y.append(span_triples)
 431 | 
 432 |         write('\n\tTime: %f seconds' % (time.time() - start))
 433 |         return y
 434 | 
 435 | 
 436 | class BIOModelAPI(SpanModelAPI):
 437 |     def set_model(self, **kwargs):
 438 |         write('Setting a model...')
 439 |         argv = self.argv
 440 | 
 441 |         self.vocab_word = kwargs['vocab_word']
 442 |         self.use_elmo = kwargs['use_elmo']
 443 |         self.vocab_label = kwargs['vocab_label']
 444 |         self.vocab_label_valid = kwargs['vocab_label_valid']
 445 |         word_emb = kwargs['word_emb']
 446 |         vocab_word_size = self.vocab_word.size() if self.vocab_word else 0
 447 | 
 448 |         self.input_dim = argv.emb_dim
if word_emb is None else word_emb.shape[1] 449 | self.hidden_dim = argv.hidden_dim 450 | self.output_dim = self.vocab_label.size() 451 | 452 | self.model = CRFModel() 453 | self.model.compile(inputs=self._set_inputs(), 454 | vocab_word_size=vocab_word_size, 455 | use_elmo=self.use_elmo, 456 | word_emb=word_emb, 457 | input_dim=[self.input_dim, self.input_dim], 458 | hidden_dim=self.hidden_dim, 459 | output_dim=self.output_dim, 460 | n_layers=argv.n_layers, 461 | init_emb=word_emb, 462 | drop_rate=argv.drop_rate) 463 | 464 | write('\t- {}'.format("\n\t- ".join([l.name for l in self.model.layers]))) 465 | self._show_model_config() 466 | 467 | def set_train_func(self): 468 | write('Building a training function...') 469 | 470 | self.optimizer = get_optimizer(self.argv) 471 | self.optimizer.set_params(self.model.params) 472 | if self.argv.load_opt_param: 473 | write('\tLoading optimization params...') 474 | self.optimizer.load_params(self.argv.load_opt_param) 475 | 476 | y_true = T.imatrix('y') 477 | 478 | # 1D: batch_size, 2D: n_words, 3D: output_dim 479 | emit_scores = self.model.get_emit_scores() 480 | # 1D: batch_size, 2D: n_words; elem=label id 481 | y_pred = self.model.label_layer.get_y_pred(emit_scores) 482 | # 1D: batch_size; elem=log proba 483 | y_path_proba = self.model.label_layer.get_y_path_proba(emit_scores, y_true) 484 | 485 | l2_reg = L2Regularizer() 486 | cost = - T.mean(y_path_proba) + l2_reg(alpha=self.argv.reg, 487 | params=self.model.params) 488 | 489 | grads = T.grad(cost=cost, wrt=self.model.params) 490 | updates = self.optimizer(grads=grads, params=self.model.params) 491 | 492 | self.train_func = theano.function( 493 | inputs=self.model.inputs + [y_true], 494 | outputs=[cost, y_pred], 495 | updates=updates, 496 | on_unused_input='warn', 497 | mode='FAST_RUN' 498 | ) 499 | 500 | def set_pred_func(self): 501 | write('Building a predicting function...') 502 | 503 | # 1D: batch_size, 2D: n_words, 3D: output_dim 504 | o = self.model.get_emit_scores() 505 | # 1D: batch_size, 2D: n_words; elem=label id 506 | y_pred = self.model.label_layer.get_y_pred(o) 507 | 508 | self.pred_func = theano.function( 509 | inputs=self.model.inputs, 510 | outputs=y_pred, 511 | on_unused_input='warn', 512 | mode='FAST_RUN' 513 | ) 514 | 515 | def train(self, batches): 516 | start = time.time() 517 | n_batches = 0. 518 | n_samples = 0. 519 | loss_total = 0. 520 | p_total = 0. 521 | r_total = 0. 522 | correct = 0. 
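    |         # Unlike the span model, both precision and recall denominators are
    |         # accumulated per batch here: metrics_for_bio() decodes the gold and
    |         # predicted BIO tag sequences into [label, i, j] spans and counts
    |         # exact matches as correct.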
523 | 524 | self.model.feat_layer.is_train.set_value(1) 525 | 526 | for index, inputs in enumerate(batches): 527 | if (index + 1) % 100 == 0: 528 | sys.stdout.write('%d ' % (index + 1)) 529 | sys.stdout.flush() 530 | 531 | batch_size = len(inputs[0]) 532 | n_words = len(inputs[0][0]) 533 | if n_words < 2 or 100 < n_words: 534 | continue 535 | 536 | loss, y_pred = self.train_func(*inputs) 537 | 538 | if math.isnan(loss): 539 | write('\n\nNAN: Index: %d\n' % (index + 1)) 540 | exit() 541 | 542 | loss_total += loss 543 | n_batches += 1 544 | n_samples += batch_size * n_words 545 | 546 | correct_i, p_total_i, r_total_i = metrics_for_bio(y_true=inputs[-1], 547 | y_pred=y_pred, 548 | vocab_label=self.vocab_label) 549 | correct += correct_i 550 | p_total += p_total_i 551 | r_total += r_total_i 552 | 553 | self.model.feat_layer.is_train.set_value(0) 554 | 555 | avg_loss = loss_total / n_batches 556 | p, r, f = f_score(correct, p_total, r_total) 557 | 558 | write('\n\tTime: %f seconds' % (time.time() - start)) 559 | write('\tAverage Negative Log Likelihood: %f(%f/%d)' % (avg_loss, loss_total, n_batches)) 560 | write('\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})'.format( 561 | f, p, int(correct), int(p_total), r, int(correct), int(r_total))) 562 | 563 | def predict(self, batches): 564 | """ 565 | :param batches: 1D: n_batches, 2D: n_words; elem=(x_w, x_m) 566 | :return: y: 1D: n_batches, 2D: batch_size; elem=(y_pred(1D:n_words), y_proba(float)) 567 | """ 568 | start = time.time() 569 | y = [] 570 | 571 | for index, inputs in enumerate(batches): 572 | if (index + 1) % 100 == 0: 573 | sys.stdout.write("%d " % (index + 1)) 574 | sys.stdout.flush() 575 | 576 | if len(inputs) == 0: 577 | y_pred = [] 578 | elif len(inputs[0][0]) < 2: 579 | y_pred = [[0] for _ in range(len(inputs[0]))] 580 | else: 581 | y_pred = self.pred_func(*inputs) 582 | y.append(y_pred) 583 | 584 | write('\n\tTime: %f seconds' % (time.time() - start)) 585 | return y 586 | -------------------------------------------------------------------------------- /src/srl/models.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | 4 | from nn.layers.embeddings import Embedding, ElmoLayer 5 | from nn.layers.core import Dense, Dropout 6 | from nn.layers.seqlabel import CRF 7 | from nn.layers.stack import BiRNNLayer 8 | from nn.utils import logsumexp3d 9 | 10 | 11 | class Model(object): 12 | def __init__(self): 13 | self.is_train = theano.shared(0, borrow=True) 14 | self.inputs = None 15 | self.outputs = None 16 | self.dropout = None 17 | self.input_layers = [] 18 | self.hidden_layers = [] 19 | self.output_layers = [] 20 | self.layers = [] 21 | self.params = [] 22 | 23 | def compile(self, **kwargs): 24 | raise NotImplementedError 25 | 26 | def _set_params(self): 27 | for l in self.layers: 28 | self.params += l.params 29 | 30 | 31 | class FeatureLayer(Model): 32 | def compile(self, **kwargs): 33 | self._set_layers(kwargs) 34 | self._set_params() 35 | 36 | def forward(self, inputs): 37 | embs = [] 38 | for i in range(len(inputs)): 39 | # 1D: batch_size, 2D: n_words, 3D: input_dim 40 | emb_i = self.input_layers[i].forward(x=inputs[i], 41 | is_train=self.is_train) 42 | embs.append(emb_i) 43 | 44 | # 1D: batch_size, 2D: n_words, 3D: input_dim 45 | x = T.concatenate(tensor_list=embs, axis=2) 46 | # 1D: n_words, 2D: batch_size, 3D: hidden_dim 47 | h = self.hidden_layers[0].forward(x=x.dimshuffle(1, 0, 2), 48 | is_train=self.is_train) 49 | return h 50 | 51 
|     def _set_layers(self, args):
 52 |         x_w_dim, x_m_dim = args['input_dim']
 53 |         hidden_dim = args['hidden_dim']
 54 |         drop_rate = args['drop_rate']
 55 | 
 56 |         ################
 57 |         # Input layers #
 58 |         ################
 59 |         if args['vocab_word_size'] > 0:
 60 |             emb_word = Embedding(input_dim=args['vocab_word_size'],
 61 |                                  output_dim=x_w_dim,
 62 |                                  init_emb=args['word_emb'],
 63 |                                  param_fix=True,  # pre-trained word embeddings are kept fixed
 64 |                                  drop_rate=drop_rate,
 65 |                                  name='EmbWord')
 66 |             self.input_layers.append(emb_word)
 67 | 
 68 |         if args['use_elmo']:
 69 |             emb_elmo = ElmoLayer(drop_rate=0.5,
 70 |                                  name='EmbElmo')
 71 |             self.input_layers.append(emb_elmo)
 72 | 
 73 |         emb_mark = Embedding(input_dim=2,
 74 |                              output_dim=x_m_dim,
 75 |                              init_emb=None,
 76 |                              param_init='xavier',
 77 |                              param_fix=False,
 78 |                              drop_rate=drop_rate,
 79 |                              name='EmbMark')
 80 |         self.input_layers.append(emb_mark)
 81 | 
 82 |         #################
 83 |         # Hidden layers #
 84 |         #################
 85 |         if args['use_elmo']:  # the ELMo layer contributes 1024 dims instead of x_w_dim
 86 |             hidden_input_dim = (len(self.input_layers) - 2) * x_w_dim + x_m_dim + 1024
 87 |         else:
 88 |             hidden_input_dim = (len(self.input_layers) - 1) * x_w_dim + x_m_dim
 89 |         hidden_layer = BiRNNLayer(input_dim=hidden_input_dim,
 90 |                                   output_dim=hidden_dim,
 91 |                                   n_layers=args['n_layers'],
 92 |                                   unit_type='lstm',
 93 |                                   connect_type='dense',
 94 |                                   drop_rate=drop_rate)
 95 |         self.hidden_layers = [hidden_layer]
 96 |         self.layers = self.input_layers + self.hidden_layers
 97 | 
 98 | 
 99 | class LabelLayer(Model):
100 |     def compile(self, **kwargs):
101 |         self._set_layers(hidden_dim=kwargs['feat_dim'],
102 |                          output_dim=kwargs['output_dim'])
103 |         self._set_params()
104 | 
105 |     def _set_layers(self, hidden_dim, output_dim):
106 |         self.layers = [Dense(input_dim=hidden_dim,
107 |                              output_dim=output_dim)]
108 | 
109 |     def span_feats2(self, h):
110 |         """
111 |         :param h: 1D: n_words, 2D: batch_size, 3D: hidden_dim
112 |         :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
113 |         """
114 |         h = h.dimshuffle(1, 0, 2)
115 |         n_words = h.shape[1]
116 | 
117 |         m = T.triu(T.ones(shape=(n_words, n_words)))
118 |         indices = m.nonzero()
119 | 
120 |         # 1D: batch_size, 2D: n_spans, 3D: hidden_dim
121 |         h_i = h[:, indices[0]]
122 |         h_j = h[:, indices[1]]
123 | 
124 |         h_diff = h_i - h_j
125 |         h_add = h_i + h_j
126 | 
127 |         return T.concatenate([h_add, h_diff], axis=2)
128 | 
129 |     def span_feats(self, h):
130 |         """
131 |         :param h: 1D: n_words, 2D: batch_size, 3D: hidden_dim
132 |         :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
133 |         """
134 |         h = h.dimshuffle(1, 0, 2)
135 |         n_words = h.shape[1]
136 |         pad = T.zeros(shape=(h.shape[0], 1, h.shape[2]))
137 |         h_pad = T.concatenate([h, pad], axis=1)
138 | 
139 |         m = T.triu(T.ones(shape=(n_words, n_words)))
140 |         indices = m.nonzero()
141 | 
142 |         # 1D: batch_size, 2D: n_spans, 3D: hidden_dim
143 |         h_i = h[:, indices[0]]
144 |         h_j = h_pad[:, indices[1] + 1]  # hidden state one position past the span end (zero-padded)
145 | 
146 |         h_diff = h_i - h_j
147 |         h_add = h_i + h_j
148 | 
149 |         return T.concatenate([h_add, h_diff], axis=2)
150 | 
151 |     def logit_scores(self, h):
152 |         """
153 |         :param h: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
154 |         :return: 1D: batch_size, 2D: n_labels, 3D: n_spans; score
155 |         """
156 |         return self.layers[-1].forward(h).dimshuffle(0, 2, 1)
157 | 
158 | 
159 | class MoELabelLayer(LabelLayer):
160 |     def __init__(self):
161 |         super(MoELabelLayer, self).__init__()
162 |         self.hidden_dim = -1
163 | 
164 |     def compile(self, **kwargs):
165 |         self.dropout = Dropout(rate=kwargs['drop_rate'])
166 |         self._set_layers(n_experts=kwargs['n_experts'],
167 |                          hidden_dim=kwargs['feat_dim'],
168 | 
output_dim=kwargs['output_dim']) 169 | 170 | def _set_layers(self, n_experts, hidden_dim, output_dim): 171 | mixture = Dense(input_dim=1, 172 | output_dim=n_experts, 173 | activation=None, 174 | use_bias=False, 175 | weight_init='zero', 176 | bias_init='zero') 177 | hidden_layer = Dense(input_dim=hidden_dim, 178 | output_dim=hidden_dim, 179 | weight_init="identity") 180 | output_layer = Dense(input_dim=hidden_dim, 181 | output_dim=output_dim) 182 | self.hidden_dim = hidden_dim 183 | self.layers = [mixture, hidden_layer, output_layer] 184 | 185 | def forward(self, x, experts): 186 | """ 187 | :param x: 1D: n_inputs, 2D: batch_size, 3D: n_words; feat id 188 | :param experts: 1D: n_experts; model 189 | :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 190 | """ 191 | # 1D: 1, 2D: n_experts, 3D: 1 192 | mixture = T.nnet.softmax(self.layers[0].W).dimshuffle('x', 1, 0) 193 | # 1D: 1, 2D: n_experts, 3D: 2 * hidden_dim 194 | mixture = T.repeat(mixture, repeats=self.hidden_dim, axis=2) 195 | 196 | batch_size = x[0].shape[0] 197 | n_words = x[0].shape[1] 198 | n_spans = T.cast(n_words * (n_words + 1) / 2, dtype='int32') 199 | 200 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim; score 201 | h_span = T.zeros(shape=(batch_size, n_spans, self.hidden_dim), 202 | dtype=theano.config.floatX) 203 | 204 | for i, expert in enumerate(experts): 205 | # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 206 | h_span_tm = expert.span_feats(inputs=x) 207 | h_span = h_span + mixture[:, i] * h_span_tm 208 | 209 | return self.layers[1].forward(h_span) 210 | 211 | 212 | class CRFLayer(Model): 213 | def compile(self, **kwargs): 214 | self._set_layers(kwargs) 215 | self._set_params() 216 | 217 | def _set_layers(self, args): 218 | layer = CRF(input_dim=args['hidden_dim'], 219 | output_dim=args['output_dim']) 220 | self.layers = [layer] 221 | 222 | def forward(self, h): 223 | """ 224 | :param h: 1D: n_words, 2D: batch_size, 3D: hidden_dim 225 | :return: 1D: batch_size, 2D: n_words, 3D: output_dim; elem=emit score 226 | """ 227 | return self.layers[0].forward(x=h).dimshuffle(1, 0, 2) 228 | 229 | def get_y_pred(self, o): 230 | """ 231 | :param o: 1D: batch_size, 2D: n_words, 3D: output_dim; elem=emit score 232 | :return: 1D: batch_size, 2D: n_words; elem=label id 233 | """ 234 | return self.layers[0].get_y_pred(emit_scores=o.dimshuffle(1, 0, 2)) 235 | 236 | def get_y_path_proba(self, o, y_true): 237 | """ 238 | :param o: 1D: batch_size, 2D: n_words, 3D: output_dim; elem=emit score 239 | :param y_true: 1D: batch_size, 2D: n_words; elem=label id 240 | :return: 1D: batch_size; elem=log proba 241 | """ 242 | return self.layers[0].get_y_proba(emit_scores=o.dimshuffle(1, 0, 2), 243 | y_true=y_true.dimshuffle(1, 0)) 244 | 245 | 246 | class SpanModel(Model): 247 | def __init__(self): 248 | super(SpanModel, self).__init__() 249 | self.feat_layer = None 250 | self.label_layer = None 251 | 252 | def compile(self, inputs, **kwargs): 253 | self.inputs = inputs 254 | self.feat_layer = FeatureLayer() 255 | self.feat_layer.compile(**kwargs) 256 | self.label_layer = LabelLayer() 257 | self.label_layer.compile(**kwargs) 258 | self.layers = self.feat_layer.layers + self.label_layer.layers 259 | self._set_params() 260 | 261 | def span_feats(self, inputs): 262 | """ 263 | :param inputs: 1D: n_inputs, 2D: batch_size, 3D: n_words; feat id 264 | :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim 265 | """ 266 | # 1D: n_words, 2D: batch_size, 3D: 2 * hidden_dim 267 | h_rnn = self.feat_layer.forward(inputs) 268 | return 
self.label_layer.span_feats(h_rnn) 269 | 270 | @staticmethod 271 | def argmax_span(span_score): 272 | """ 273 | :param span_score: 1D: batch_size, 2D: n_labels, 3D: n_spans 274 | :return: 1D: batch_size, 2D: n_labels; span index 275 | """ 276 | return T.argmax(span_score, axis=2) 277 | 278 | @staticmethod 279 | def loss(span_score, span_true): 280 | """ 281 | :param span_score: 1D: batch_size, 2D: n_labels, 3D: n_spans 282 | :param span_true: 1D: batch_size * n_spans; (batch index, label id, span index) 283 | """ 284 | batch_size = span_score.shape[0] 285 | 286 | # 1D: batch_size * n_spans; index 287 | batch_index = span_true[:, 0] 288 | label_index = span_true[:, 1] 289 | span_index = span_true[:, 2] 290 | 291 | # 1D: batch_size * n_spans; score 292 | true_span_score = span_score[batch_index, label_index, span_index] 293 | 294 | # 1D: batch_size, 2D: n_labels; elem=score 295 | z = logsumexp3d(span_score, axis=2) 296 | # 1D: batch_size * n_spans; score 297 | z = z[batch_index, label_index] 298 | 299 | # 1D: batch_size * n_spans; score 300 | nll = true_span_score - z 301 | 302 | return - T.sum(nll) / batch_size 303 | 304 | @staticmethod 305 | def exp_score(span_score): 306 | """ 307 | :param span_score: 1D: batch_size, 2D: n_labels, 3D: n_spans; logit score 308 | :return: 1D: batch_size, 2D: n_labels, 3D: n_spans 309 | """ 310 | return T.exp(span_score) 311 | 312 | 313 | class MoEModel(SpanModel): 314 | def compile(self, inputs, **kwargs): 315 | self.inputs = inputs 316 | self.feat_layer = MoELabelLayer() 317 | self.feat_layer.compile(**kwargs) 318 | self.layers = self.feat_layer.layers 319 | self._set_params() 320 | 321 | 322 | class CRFModel(Model): 323 | def __init__(self): 324 | super(CRFModel, self).__init__() 325 | self.feat_layer = None 326 | self.label_layer = None 327 | 328 | def compile(self, inputs, **kwargs): 329 | self.inputs = inputs 330 | self.feat_layer = FeatureLayer() 331 | self.feat_layer.compile(**kwargs) 332 | self.label_layer = CRFLayer() 333 | self.label_layer.compile(**kwargs) 334 | self.layers = self.feat_layer.layers + self.label_layer.layers 335 | self._set_params() 336 | 337 | def get_emit_scores(self): 338 | """ 339 | :return: 1D: batch_size, 2D: n_words, 3D: output_dim 340 | """ 341 | h = self.feat_layer.forward(self.inputs) 342 | return self.label_layer.forward(h) 343 | 344 | -------------------------------------------------------------------------------- /src/srl/preprocessors.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | 6 | from utils.vocab import Vocab, UNK 7 | from utils.sent import Conll05Sent, Conll12Sent 8 | from utils.misc import span_to_span_index, make_vocab_from_ids 9 | from utils.savers import save_key_value_format 10 | from utils.loaders import load_key_value_format 11 | 12 | 13 | class Preprocessor(object): 14 | def __init__(self, argv): 15 | self.argv = argv 16 | self.data_type = argv.data_type 17 | 18 | @staticmethod 19 | def make_vocab_word(word_list): 20 | vocab_word = Vocab() 21 | vocab_word.add_word(UNK) 22 | for w in word_list: 23 | vocab_word.add_word(w) 24 | return vocab_word 25 | 26 | def make_and_save_vocab_label(self, 27 | sents, 28 | vocab_label_init=None, 29 | save=False, 30 | load=False): 31 | argv = self.argv 32 | 33 | if load and argv.load_label: 34 | label_key_value = load_key_value_format(argv.load_label) 35 | vocab_label = make_vocab_from_ids(label_key_value) 36 | else: 37 | vocab_label = 
self.make_vocab_label(sents=sents, 38 | vocab_label_init=vocab_label_init) 39 | if save: 40 | if argv.output_dir: 41 | dir_name = argv.output_dir 42 | else: 43 | dir_name = 'output' 44 | if argv.output_fn: 45 | file_name = '/label_ids.' + argv.output_fn 46 | else: 47 | file_name = '/label_ids' 48 | 49 | fn = dir_name + file_name 50 | values, keys = map(lambda x: x, zip(*enumerate(vocab_label.i2w))) 51 | save_key_value_format(fn=fn, keys=keys, values=values) 52 | 53 | return vocab_label 54 | 55 | def make_sents(self, corpus): 56 | """ 57 | :param corpus: 1D: n_sents, 2D: n_words 58 | :return: 1D: n_sents 59 | """ 60 | if len(corpus) == 0: 61 | return [] 62 | 63 | if self.data_type == 'conll05': 64 | column = 6 65 | gen_sent = Conll05Sent 66 | else: 67 | column = 12 68 | gen_sent = Conll12Sent 69 | 70 | is_test = True if len(corpus[0][0]) < column else False 71 | return [gen_sent(sent, is_test) for sent in corpus] 72 | 73 | @staticmethod 74 | def split_x_and_y(batches, index=-1): 75 | """ 76 | :param batches: 1D: n_batches, 2D: batch_size; elem=(x, m, y) 77 | :param index: split column index 78 | :return 1D: n_batches, 2D: batch_size; elem=(x, m) 79 | :return 1D: n_batches, 2D: batch_size; elem=y 80 | """ 81 | x = [] 82 | y = [] 83 | for batch in batches: 84 | x.append(batch[:index]) 85 | y.append(batch[index]) 86 | return x, y 87 | 88 | def make_batches(self, 89 | samples, 90 | is_valid_data=False, 91 | shuffle=True): 92 | """ 93 | :param samples: 1D: n_samples, 2D: [x, m, y] 94 | :param is_valid_data: boolean 95 | :param shuffle: boolean 96 | :return 1D: n_batches, 2D: batch_size; elem=[x, m, y] 97 | """ 98 | if shuffle: 99 | np.random.shuffle(samples) 100 | samples.sort(key=lambda sample: len(sample[0])) 101 | 102 | batches = [] 103 | batch = [] 104 | prev_n_words = len(samples[0][0]) 105 | 106 | for sample in samples: 107 | n_words = len(sample[0]) 108 | if len(batch) == self.argv.batch_size or prev_n_words != n_words: 109 | batches.append(self._make_one_batch(batch, is_valid_data)) 110 | batch = [] 111 | prev_n_words = n_words 112 | batch.append(sample) 113 | 114 | if batch: 115 | batches.append(self._make_one_batch(batch, is_valid_data)) 116 | 117 | if shuffle: 118 | np.random.shuffle(batches) 119 | 120 | for batch in batches: 121 | yield batch 122 | 123 | @staticmethod 124 | def _make_one_batch(batch, is_valid_data): 125 | raise NotImplementedError 126 | 127 | @staticmethod 128 | def make_batch_per_sent(sents): 129 | """ 130 | :param sents: 1D: n_sents; Sent() 131 | :return 1D: n_sents, 2D: n_prds; elem=[x, m] 132 | """ 133 | batches = [] 134 | for sent in sents: 135 | x = [] 136 | 137 | x_word_ids = sent.word_ids 138 | if x_word_ids is not None: 139 | x.append(x_word_ids) 140 | 141 | x_elmo_emb = sent.elmo_emb 142 | if x_elmo_emb is not None: 143 | x.append(x_elmo_emb) 144 | 145 | batch = list(map(lambda m: x + [m], sent.mark_ids)) 146 | batches.append(list(map(lambda b: b, zip(*batch)))) 147 | 148 | return batches 149 | 150 | @staticmethod 151 | def set_sent_config(sents, elmo_emb, vocab_word, vocab_label): 152 | raise NotImplementedError 153 | 154 | @staticmethod 155 | def make_samples(sents, is_valid_data=False): 156 | raise NotImplementedError 157 | 158 | def make_vocab_label(self, 159 | sents, 160 | vocab_label_init=None): 161 | raise NotImplementedError 162 | 163 | 164 | class SpanPreprocessor(Preprocessor): 165 | def make_vocab_label(self, 166 | sents, 167 | vocab_label_init=None): 168 | if len(sents) == 0: 169 | return None 170 | 171 | if vocab_label_init: 172 | vocab_label = 
deepcopy(vocab_label_init) 173 | else: 174 | vocab_label = Vocab() 175 | if self.argv.data_type == 'conll05': 176 | core_labels = ["A0", "A1", "A2", "A3", "A4", "A5"] 177 | else: 178 | core_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"] 179 | for label in core_labels: 180 | vocab_label.add_word(label) 181 | 182 | bio_labels = [] 183 | for sent in sents: 184 | for props in sent.prd_bio_labels: 185 | bio_labels += props 186 | cnt = Counter(bio_labels) 187 | bio_labels = [(w, c) for w, c in cnt.most_common()] 188 | 189 | for label, count in bio_labels: 190 | if not label.endswith('-V') and len(label) > 1: 191 | vocab_label.add_word(label[2:]) 192 | 193 | return vocab_label 194 | 195 | @staticmethod 196 | def set_sent_config(sents, elmo_emb, vocab_word, vocab_label): 197 | for index, sent in enumerate(sents): 198 | sent.set_mark_ids() 199 | if vocab_word: 200 | sent.set_word_ids(vocab_word) 201 | if elmo_emb: 202 | sent.set_elmo_emb(elmo_emb[str(index)]) 203 | if vocab_label: 204 | sent.set_span_triples(vocab_label) 205 | sent.set_span_triples_with_null(vocab_label.size()) 206 | return sents 207 | 208 | @staticmethod 209 | def make_samples(sents, is_valid_data=False): 210 | samples = [] 211 | 212 | for sent in sents: 213 | x = [] 214 | 215 | x_word_ids = sent.word_ids 216 | if x_word_ids is not None: 217 | x.append(x_word_ids) 218 | 219 | x_elmo_emb = sent.elmo_emb 220 | if x_elmo_emb is not None: 221 | x.append(x_elmo_emb) 222 | 223 | if is_valid_data: 224 | triples = sent.span_triples 225 | else: 226 | triples = sent.span_triples_with_null 227 | 228 | assert len(sent.mark_ids) == len(triples) 229 | for m, spans in zip(sent.mark_ids, triples): 230 | # spans: 1D: n_spans, 2D: (r, i, j) 231 | samples.append(x + [m, spans]) 232 | 233 | return samples 234 | 235 | @staticmethod 236 | def _make_one_batch(batch, is_valid_data): 237 | if is_valid_data: 238 | return list(map(lambda b: b, zip(*batch))) 239 | 240 | b = [] 241 | y = [] 242 | n_words = len(batch[0][0]) 243 | for b_index, sample in enumerate(batch): 244 | b.append(sample[:-1]) 245 | y_tmp = [] 246 | for (r, i, j) in sample[-1]: 247 | span_index = span_to_span_index(i, j, n_words) 248 | y_tmp.append([b_index, r, span_index]) 249 | y += y_tmp 250 | 251 | x = list(map(lambda b_i: b_i, zip(*b))) 252 | 253 | return x + [y] 254 | 255 | 256 | class BIOPreprocessor(Preprocessor): 257 | def make_vocab_label(self, 258 | sents, 259 | vocab_label_init=None): 260 | if len(sents) == 0: 261 | return None 262 | 263 | if vocab_label_init: 264 | vocab_label = deepcopy(vocab_label_init) 265 | else: 266 | vocab_label = Vocab() 267 | none_label = 'O' 268 | vocab_label.add_word(none_label) 269 | 270 | labels = [] 271 | for sent in sents: 272 | if sent.has_prds: 273 | for prop in sent.prd_bio_labels: 274 | labels += prop 275 | cnt = Counter(labels) 276 | labels = [(w, c) for w, c in cnt.most_common()] 277 | 278 | for label, count in labels: 279 | vocab_label.add_word(label) 280 | 281 | return vocab_label 282 | 283 | @staticmethod 284 | def set_sent_config(sents, elmo_emb, vocab_word, vocab_label): 285 | for index, sent in enumerate(sents): 286 | sent.set_mark_ids() 287 | if vocab_word: 288 | sent.set_word_ids(vocab_word) 289 | if elmo_emb: 290 | sent.set_elmo_emb(elmo_emb[str(index)]) 291 | if vocab_label: 292 | sent.set_label_ids(vocab_label) 293 | return sents 294 | 295 | @staticmethod 296 | def make_samples(sents, is_valid_data=False): 297 | samples = [] 298 | 299 | for sent in sents: 300 | x = [] 301 | 302 | x_word_ids = sent.word_ids 303 | if 
x_word_ids is not None: 304 | x.append(x_word_ids) 305 | 306 | x_elmo_emb = sent.elmo_emb 307 | if x_elmo_emb is not None: 308 | x.append(x_elmo_emb) 309 | 310 | assert len(sent.mark_ids) == len(sent.bio_label_ids) 311 | for m, spans in zip(sent.mark_ids, sent.bio_label_ids): 312 | samples.append(x + [m, spans]) 313 | 314 | return samples 315 | 316 | @staticmethod 317 | def _make_one_batch(batch, is_valid_data): 318 | return list(map(lambda b: b, zip(*batch))) 319 | -------------------------------------------------------------------------------- /src/srl/testers.py: -------------------------------------------------------------------------------- 1 | from utils.loaders import load_emb 2 | from utils.misc import write, make_vocab_from_ids 3 | 4 | 5 | class Tester(object): 6 | def __init__(self, 7 | argv, 8 | loader, 9 | saver, 10 | preprocessor, 11 | evaluator, 12 | model_api): 13 | self.argv = argv 14 | self.loader = loader 15 | self.saver = saver 16 | self.preprocessor = preprocessor 17 | self.evaluator = evaluator 18 | self.model_api = model_api 19 | 20 | def predict(self): 21 | argv = self.argv 22 | pproc = self.preprocessor 23 | loader = self.loader 24 | 25 | ################ 26 | # Load dataset # 27 | ################ 28 | write('Loading Dataset...') 29 | test_corpus = loader.load(path=argv.test_data, 30 | data_size=argv.data_size, 31 | is_test=True) 32 | test_sents = pproc.make_sents(test_corpus) 33 | 34 | ################# 35 | # Load init emb # 36 | ################# 37 | if argv.word_emb: 38 | write('Loading Embeddings...') 39 | word_list, word_emb = load_emb(argv.word_emb) 40 | vocab_word = pproc.make_vocab_word(word_list) 41 | write('\t- # Embedding Words: %d' % vocab_word.size()) 42 | else: 43 | vocab_word = word_emb = None 44 | 45 | if argv.test_elmo_emb: 46 | write('Loading ELMo Embeddings...') 47 | test_elmo_emb = loader.load_hdf5(argv.test_elmo_emb) 48 | else: 49 | test_elmo_emb = None 50 | 51 | ############### 52 | # Make labels # 53 | ############### 54 | label_key_value = loader.load_key_value_format(argv.load_label) 55 | vocab_label = make_vocab_from_ids(label_key_value) 56 | write('\t- # Labels: %d' % vocab_label.size()) 57 | 58 | ################### 59 | # Set sent params # 60 | ################### 61 | test_sents = pproc.set_sent_config(sents=test_sents, 62 | elmo_emb=test_elmo_emb, 63 | vocab_word=vocab_word, 64 | vocab_label=None) 65 | ################ 66 | # Make samples # 67 | ################ 68 | write('Making Test Samples...') 69 | test_batches = pproc.make_batch_per_sent(sents=test_sents) 70 | write('\t- # Test Samples: %d' % len(test_batches)) 71 | 72 | ############# 73 | # Model API # 74 | ############# 75 | use_elmo = True if test_elmo_emb is not None else False 76 | 77 | if argv.n_experts > 0: 78 | self.model_api.set_ensemble_model(word_emb=word_emb, 79 | use_elmo=use_elmo, 80 | vocab_word=vocab_word, 81 | vocab_label=vocab_label, 82 | vocab_label_valid=None) 83 | self.model_api.load_params(argv.load_param) 84 | self.model_api.load_experts_params(argv.load_param_dir) 85 | self.model_api.set_ensemble_pred_func() 86 | else: 87 | self.model_api.set_model(word_emb=word_emb, 88 | use_elmo=use_elmo, 89 | vocab_word=vocab_word, 90 | vocab_label=vocab_label, 91 | vocab_label_valid=None) 92 | self.model_api.load_params(argv.load_param) 93 | self.model_api.set_pred_func() 94 | 95 | ########### 96 | # Testing # 97 | ########### 98 | write('\nPREDICTION START') 99 | test_y_pred = self.model_api.predict(test_batches) 100 | self.saver.save_props(corpus=test_sents, 101 | 
labels=test_y_pred, 102 | vocab_label=vocab_label) 103 | self.saver.save_json_format(corpus=test_sents, 104 | labels=test_y_pred, 105 | vocab_label=vocab_label) 106 | -------------------------------------------------------------------------------- /src/srl/trainers.py: -------------------------------------------------------------------------------- 1 | from utils.evaluators import count_true_spans 2 | from utils.loaders import load_emb 3 | from utils.misc import write, show_score_history 4 | from utils.misc import make_output_dir, get_file_names_in_dir, get_latest_param_fn 5 | 6 | 7 | class Trainer(object): 8 | def __init__(self, 9 | argv, 10 | loader, 11 | preprocessor, 12 | evaluator, 13 | model_api): 14 | self.argv = argv 15 | self.loader = loader 16 | self.preprocessor = preprocessor 17 | self.evaluator = evaluator 18 | self.model_api = model_api 19 | 20 | self.f1_history = {} 21 | self.best_valid_f1 = 0.0 22 | self.best_epoch = -1 23 | 24 | def train(self): 25 | write('\nTRAINING START\n') 26 | 27 | argv = self.argv 28 | loader = self.loader 29 | pproc = self.preprocessor 30 | 31 | make_output_dir(self.argv) 32 | 33 | ################# 34 | # Load word emb # 35 | ################# 36 | if argv.word_emb: 37 | write('Loading Word Embeddings...') 38 | word_list, word_emb = load_emb(argv.word_emb) 39 | vocab_word = pproc.make_vocab_word(word_list) 40 | write('\t- # Vocabs: %d' % vocab_word.size()) 41 | else: 42 | vocab_word = word_emb = None 43 | 44 | ################# 45 | # Load elmo emb # 46 | ################# 47 | if self.argv.train_elmo_emb: 48 | write('Loading ELMo Embeddings...') 49 | train_elmo_emb = loader.load_hdf5(self.argv.train_elmo_emb) 50 | else: 51 | train_elmo_emb = None 52 | if self.argv.dev_elmo_emb: 53 | valid_elmo_emb = loader.load_hdf5(self.argv.dev_elmo_emb) 54 | else: 55 | valid_elmo_emb = None 56 | 57 | ############### 58 | # Load corpus # 59 | ############### 60 | write('Loading Corpus...') 61 | train_corpus = loader.load(path=argv.train_data, 62 | data_size=argv.data_size, 63 | is_test=False) 64 | valid_corpus = loader.load(path=argv.dev_data, 65 | data_size=argv.data_size, 66 | is_test=False) 67 | write('\t- # Sents: Train:%d Valid:%d' % (len(train_corpus), len(valid_corpus))) 68 | 69 | ############## 70 | # Make sents # 71 | ############## 72 | train_sents = pproc.make_sents(train_corpus) 73 | valid_sents = pproc.make_sents(valid_corpus) 74 | 75 | ############### 76 | # Make labels # 77 | ############### 78 | write('Making Labels...') 79 | vocab_label_train = pproc.make_and_save_vocab_label(sents=train_sents, 80 | vocab_label_init=None, 81 | save=argv.save, 82 | load=True) 83 | vocab_label_valid = pproc.make_and_save_vocab_label(sents=valid_sents, 84 | vocab_label_init=vocab_label_train, 85 | save=False, 86 | load=False) 87 | write('\t- # Labels: %d' % vocab_label_train.size()) 88 | 89 | ################### 90 | # Set sent params # 91 | ################### 92 | train_sents = pproc.set_sent_config(sents=train_sents, 93 | elmo_emb=train_elmo_emb, 94 | vocab_word=vocab_word, 95 | vocab_label=vocab_label_train) 96 | valid_sents = pproc.set_sent_config(sents=valid_sents, 97 | elmo_emb=valid_elmo_emb, 98 | vocab_word=vocab_word, 99 | vocab_label=vocab_label_valid) 100 | 101 | ################ 102 | # Make samples # 103 | ################ 104 | write('Making Samples...') 105 | train_samples = pproc.make_samples(sents=train_sents, 106 | is_valid_data=False) 107 | valid_samples = pproc.make_samples(sents=valid_sents, 108 | is_valid_data=True) 109 | write('\t- # 
Samples: Train:%d Valid:%d' % (len(train_samples), 110 | len(valid_samples))) 111 | 112 | ################# 113 | # Set Model API # 114 | ################# 115 | if train_elmo_emb is not None: 116 | use_elmo = True 117 | else: 118 | use_elmo = False 119 | 120 | if argv.n_experts > 0: 121 | is_ensemble = True 122 | else: 123 | is_ensemble = False 124 | 125 | if argv.method == 'span': 126 | self.model_api.n_true_spans = count_true_spans(train_sents) 127 | 128 | if is_ensemble: 129 | self.model_api.set_ensemble_model(word_emb=word_emb, 130 | use_elmo=use_elmo, 131 | vocab_word=vocab_word, 132 | vocab_label=vocab_label_train, 133 | vocab_label_valid=vocab_label_valid) 134 | self.model_api.load_experts_params(argv.load_param_dir) 135 | self.model_api.set_init_ensemble_param() 136 | self.model_api.set_ensemble_train_func() 137 | if self.model_api.vocab_label_valid: 138 | self.model_api.set_ensemble_pred_func() 139 | init_epoch = 0 140 | else: 141 | self.model_api.set_model(word_emb=word_emb, 142 | use_elmo=use_elmo, 143 | vocab_word=vocab_word, 144 | vocab_label=vocab_label_train, 145 | vocab_label_valid=vocab_label_valid) 146 | if argv.load_param_latest: 147 | if argv.output_dir: 148 | dir_name = argv.output_dir 149 | else: 150 | dir_name = 'output' 151 | param_fns = get_file_names_in_dir(dir_path=dir_name, 152 | prefix='param') 153 | opt_param_fns = get_file_names_in_dir(dir_path=dir_name, 154 | prefix='opt') 155 | param_fn, latest_epoch = get_latest_param_fn(file_names=param_fns) 156 | opt_param_fn, _ = get_latest_param_fn(file_names=opt_param_fns) 157 | self.model_api.argv.load_param = param_fn 158 | self.model_api.argv.load_opt_param = opt_param_fn 159 | self.model_api.load_params(param_fn) 160 | init_epoch = latest_epoch + 1 161 | elif argv.load_param: 162 | self.model_api.load_params(argv.load_param) 163 | init_epoch = 0 164 | else: 165 | init_epoch = 0 166 | 167 | self.model_api.set_train_func() 168 | if self.model_api.vocab_label_valid: 169 | self.model_api.set_pred_func() 170 | 171 | ####################### 172 | # Run training epochs # 173 | ####################### 174 | self._run_epochs(train_samples, valid_samples, init_epoch) 175 | 176 | def _run_epochs(self, train_samples, valid_samples=None, init_epoch=0): 177 | write('\nTRAIN START') 178 | 179 | argv = self.argv 180 | pproc = self.preprocessor 181 | vocab_label_valid = self.model_api.vocab_label_valid 182 | 183 | if valid_samples: 184 | valid_batches = pproc.make_batches(samples=valid_samples, 185 | is_valid_data=True) 186 | valid_batch_x, valid_batch_y = pproc.split_x_and_y(valid_batches) 187 | else: 188 | valid_batch_x = valid_batch_y = [] 189 | 190 | ########################################## 191 | # Initial result with pre-trained params # 192 | ########################################## 193 | if (argv.load_param or argv.load_param_dir) and valid_samples: 194 | write('\nEpoch: 0 (Using the Pre-trained Params)') 195 | write('VALID') 196 | valid_batch_y_pred = self.model_api.predict(valid_batch_x) 197 | self.best_valid_f1 = self.evaluator.f_score(y_true=valid_batch_y, 198 | y_pred=valid_batch_y_pred, 199 | vocab_label=vocab_label_valid) 200 | 201 | ############# 202 | # Main loop # 203 | ############# 204 | for epoch in range(init_epoch, argv.epoch): 205 | write('\nEpoch: %d' % (epoch + 1)) 206 | write('TRAIN') 207 | 208 | if argv.halve_lr and epoch > 49 and (epoch % 25) == 0: 209 | lr = self.model_api.optimizer.lr.get_value(borrow=True) 210 | self.model_api.optimizer.lr.set_value(lr * 0.5) 211 | write('### HALVE LEARNING 
RATE: %f -> %f' % (lr, lr * 0.5)) 212 | 213 | ############ 214 | # Training # 215 | ############ 216 | train_batches = pproc.make_batches(train_samples) 217 | self.model_api.train(train_batches) 218 | 219 | ############## 220 | # Validating # 221 | ############## 222 | if valid_samples: 223 | write('VALID') 224 | valid_batch_y_pred = self.model_api.predict(valid_batch_x) 225 | valid_f1 = self.evaluator.f_score(y_true=valid_batch_y, 226 | y_pred=valid_batch_y_pred, 227 | vocab_label=vocab_label_valid) 228 | if self.best_valid_f1 < valid_f1: 229 | self.best_valid_f1 = valid_f1 230 | self.best_epoch = epoch 231 | self.f1_history[self.best_epoch + 1] = [self.best_valid_f1] 232 | 233 | if argv.save: 234 | self.model_api.save_params(epoch=0) 235 | self.model_api.optimizer.save_params(epoch=0) 236 | 237 | show_score_history(self.f1_history) 238 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiroki13/span-based-srl/2c8b677c4e00b6c607e09ef4f9fe3d54961e4f2e/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/evaluators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from utils.misc import write, span_to_span_index 4 | 5 | 6 | class Evaluator(object): 7 | def __init__(self, argv): 8 | self.argv = argv 9 | 10 | def f_score(self, y_true, y_pred, vocab_label): 11 | """ 12 | :param y_true: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index] 13 | :param y_pred: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index] 14 | """ 15 | correct, p_total, r_total = self.metrics(y_true=y_true, 16 | y_pred=y_pred, 17 | vocab_label=vocab_label) 18 | p, r, f = f_score(correct, p_total, r_total) 19 | write('\tF:{:>7.2%} P:{:>7.2%} ({:>5}/{:>5}) R:{:>7.2%} ({:>5}/{:>5})'.format( 20 | f, p, int(correct), int(p_total), r, int(correct), int(r_total)) 21 | ) 22 | return f 23 | 24 | def metrics(self, **kwargs): 25 | raise NotImplementedError 26 | 27 | 28 | class SpanEvaluator(Evaluator): 29 | def metrics(self, y_true, y_pred, vocab_label): 30 | """ 31 | :param y_true: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index] 32 | :param y_pred: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index] 33 | """ 34 | p_total = 0. 35 | r_total = 0. 36 | correct = 0. 37 | for span_true_batch, span_pred_batch in zip(y_true, y_pred): 38 | for spans_true, spans_pred in zip(span_true_batch, span_pred_batch): 39 | spans_true = concat_c_spans_from_spans(spans_true, vocab_label) 40 | spans_pred = concat_c_spans_from_spans(spans_pred, vocab_label) 41 | p_total += len(spans_pred) 42 | r_total += len(spans_true) 43 | for span in spans_pred: 44 | if span in spans_true: 45 | correct += 1 46 | return correct, p_total, r_total 47 | 48 | 49 | class BIOEvaluator(Evaluator): 50 | def metrics(self, y_true, y_pred, vocab_label): 51 | p_total = 0. 52 | r_total = 0. 53 | correct = 0. 
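    |         # Example of the BIO decoding performed below ('-V' spans are skipped):
    |         #     [B-A0, I-A0, O, B-V, B-A1] -> [['A0', 0, 1], ['A1', 4, 4]]
    |         # A predicted span counts as correct only if its label and both
    |         # boundaries match a gold span.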
54 |         for y_true_batch, y_pred_batch in zip(y_true, y_pred):
 55 |             for y_true_i, y_pred_i in zip(y_true_batch, y_pred_batch):
 56 |                 y_true_spans = get_spans_from_bio_labels(y_true_i, vocab_label)
 57 |                 y_pred_spans = get_spans_from_bio_labels(y_pred_i, vocab_label)
 58 |                 p_total += len(y_pred_spans)
 59 |                 r_total += len(y_true_spans)
 60 |                 for y_pred_span in y_pred_spans:
 61 |                     if y_pred_span in y_true_spans:
 62 |                         correct += 1.
 63 |         return correct, p_total, r_total
 64 | 
 65 | 
 66 | def f_score(correct, p_total, r_total):
 67 |     precision = correct / p_total if p_total > 0 else 0.
 68 |     recall = correct / r_total if r_total > 0 else 0.
 69 |     f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0.
 70 |     return precision, recall, f1
 71 | 
 72 | 
 73 | def get_spans_from_bio_labels(sent, vocab_label):
 74 |     spans = []
 75 |     span = []
 76 |     for w_i, label_id in enumerate(sent):
 77 |         label = vocab_label.get_word(label_id)
 78 |         if label[-2:] == '-V':
 79 |             continue
 80 |         if label.startswith('B-'):
 81 |             if span:
 82 |                 spans.append(span)
 83 |             span = [label[2:], w_i, w_i]
 84 |         elif label.startswith('I-'):
 85 |             if span:
 86 |                 if label[2:] == span[0]:
 87 |                     span[2] = w_i
 88 |                 else:
 89 |                     spans.append(span)
 90 |                     span = [label[2:], w_i, w_i]
 91 |             else:
 92 |                 span = [label[2:], w_i, w_i]
 93 |         else:
 94 |             if span:
 95 |                 spans.append(span)
 96 |             span = []
 97 |     if span:
 98 |         spans.append(span)
 99 | 
100 |     return concat_c_spans_from_bio_labels(spans)
101 | 
102 | 
103 | def concat_c_spans_from_bio_labels(spans):
104 |     labels = [span[0] for span in spans]
105 |     c_indices = [i for i, span in enumerate(spans) if span[0].startswith('C')]
106 |     non_ant_c_spans = []
107 | 
108 |     for c_index in c_indices:
109 |         c_span = spans[c_index]
110 |         c_label = c_span[0][2:]
111 |         if c_label in labels:
112 |             spans[labels.index(c_label)].extend(c_span[1:])
113 |         else:
114 |             non_ant_c_spans.append([c_label] + c_span[1:])
115 |     concated_spans = [span for i, span in enumerate(spans) if i not in c_indices]
116 |     spans = concated_spans + non_ant_c_spans
117 |     return spans
118 | 
119 | 
120 | def concat_c_spans_from_spans(spans, vocab_label):
121 |     spans = [[vocab_label.get_word(l), i, j] for (l, i, j) in spans]
122 |     labels = [l for (l, i, j) in spans]
123 |     c_indices = [index for index, (l, i, j) in enumerate(spans) if l.startswith('C')]
124 |     non_ant_c_spans = []
125 | 
126 |     for c_index in c_indices:
127 |         c_span = spans[c_index]
128 |         label = c_span[0][2:]
129 |         if label in labels:
130 |             spans[labels.index(label)].extend(c_span[1:])
    |         else:
    |             # keep C- spans without an antecedent, as concat_c_spans_from_bio_labels() does
    |             non_ant_c_spans.append([label] + c_span[1:])
131 | 
132 |     concated_spans = [span for i, span in enumerate(spans) if i not in c_indices]
133 |     spans = concated_spans + non_ant_c_spans
134 |     return spans
135 | 
136 | 
137 | def metrics_for_bio(y_true, y_pred, vocab_label):
138 |     p_total = 0.
139 |     r_total = 0.
140 |     correct = 0.
141 |     for y_true_i, y_pred_i in zip(y_true, y_pred):
142 |         y_true_spans = get_spans_from_bio_labels(y_true_i, vocab_label)
143 |         y_pred_spans = get_spans_from_bio_labels(y_pred_i, vocab_label)
144 |         p_total += len(y_pred_spans)
145 |         r_total += len(y_true_spans)
146 |         for y_pred_span in y_pred_spans:
147 |             if y_pred_span in y_true_spans:
148 |                 correct += 1.
149 |     return correct, p_total, r_total
150 | 
151 | 
152 | def correct_and_pred_spans(span_true, span_pred, marks):
153 |     """
154 |     :param span_true: 1D: batch_size * n_spans, 2D: [batch index, label id, span index]
155 |     :param span_pred: 1D: batch_size, 2D: n_labels; span index
156 |     :param marks: 1D: batch_size, 2D: n_words; elem=0/1
157 |     """
158 |     correct = 0.
159 |     n_pred_spans = 0.
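    |     # The argmax decoder proposes one best span per label; predictions that
    |     # coincide with the predicate's own single-word span are treated as
    |     # "no span for this label" and excluded from the precision count below.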
160 | n_words = len(marks[0]) 161 | _, prd_indices = np.array(marks).nonzero() 162 | prd_indices = [span_to_span_index(p, p, n_words) for p in prd_indices] 163 | 164 | for b_index, span_pred_tmp in enumerate(span_pred): 165 | prd_index = prd_indices[b_index] 166 | for label_id, span_index in enumerate(span_pred_tmp): 167 | if span_index == prd_index: 168 | continue 169 | if [b_index, label_id, span_index] in span_true: 170 | correct += 1 171 | n_pred_spans += 1 172 | 173 | return correct, n_pred_spans 174 | 175 | 176 | def count_true_spans(sents): 177 | """ 178 | :param sents: 1D: n_sents 179 | :return: total number of spans 180 | """ 181 | return sum([len(triple) for sent in sents for triple in sent.span_triples]) 182 | -------------------------------------------------------------------------------- /src/utils/loaders.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gzip 3 | import pickle 4 | import h5py 5 | 6 | import numpy as np 7 | import theano 8 | 9 | from utils.misc import get_file_names_in_dir 10 | from utils.vocab import UNK 11 | 12 | 13 | class Loader(object): 14 | def __init__(self, argv): 15 | self.argv = argv 16 | 17 | def load(self, **kwargs): 18 | raise NotImplementedError 19 | 20 | @staticmethod 21 | def load_data(fn): 22 | with gzip.open(fn, 'rb') as gf: 23 | return pickle.load(gf) 24 | 25 | @staticmethod 26 | def load_key_value_format(fn): 27 | data = [] 28 | with open(fn, 'r') as f: 29 | for line in f: 30 | key, value = line.rstrip().split() 31 | data.append((key, int(value))) 32 | return data 33 | 34 | @staticmethod 35 | def load_hdf5(path): 36 | return h5py.File(path, 'r') 37 | 38 | def load_txt_from_dir(self, dir_path, file_prefix): 39 | file_names = get_file_names_in_dir(dir_path + '/*') 40 | file_names = [fn for fn in file_names 41 | if os.path.basename(fn).startswith(file_prefix) 42 | and fn.endswith('txt')] 43 | return [self.load(path=fn) for fn in file_names] 44 | 45 | def load_hdf5_from_dir(self, dir_path, file_prefix): 46 | file_names = get_file_names_in_dir(dir_path + '/*') 47 | file_names = [fn for fn in file_names 48 | if os.path.basename(fn).startswith(file_prefix) 49 | and fn.endswith('hdf5')] 50 | return [self.load_hdf5(fn) for fn in file_names] 51 | 52 | 53 | class Conll05Loader(Loader): 54 | 55 | def load(self, path, data_size=1000000, is_test=False): 56 | if path is None: 57 | return [] 58 | 59 | corpus = [] 60 | sent = [] 61 | 62 | with open(path) as f: 63 | for line in f: 64 | elem = [l for l in line.rstrip().split()] 65 | if len(elem) > 0: 66 | if is_test: 67 | sent.append(elem[:6]) 68 | else: 69 | sent.append(elem) 70 | else: 71 | corpus.append(sent) 72 | sent = [] 73 | if len(corpus) >= data_size: 74 | break 75 | return corpus 76 | 77 | 78 | class Conll12Loader(Loader): 79 | 80 | def load(self, path, data_size=1000000, is_test=False): 81 | if path is None: 82 | return [] 83 | 84 | corpus = [] 85 | sent = [] 86 | 87 | with open(path) as f: 88 | for line in f: 89 | elem = [l for l in line.rstrip().split()] 90 | if len(elem) > 10: 91 | if is_test: 92 | sent.append(elem[:11]) 93 | else: 94 | sent.append(elem) 95 | elif len(elem) == 0: 96 | corpus.append(sent) 97 | sent = [] 98 | if len(corpus) >= data_size: 99 | break 100 | return corpus 101 | 102 | 103 | def load_emb(path): 104 | word_list = [] 105 | emb = [] 106 | with open(path) as f: 107 | for line in f: 108 | line = line.rstrip().split() 109 | word_list.append(line[0]) 110 | emb.append(line[1:]) 111 | emb = np.asarray(emb, 
dtype=theano.config.floatX) 112 | 113 | if UNK not in word_list: 114 | word_list = [UNK] + word_list 115 | unk_vector = np.mean(emb, axis=0) 116 | emb = np.vstack((unk_vector, emb)) 117 | 118 | return word_list, emb 119 | 120 | 121 | def load_pickle(fn): 122 | with gzip.open(fn, 'rb') as gf: 123 | return pickle.load(gf) 124 | 125 | 126 | def load_key_value_format(fn): 127 | data = [] 128 | with open(fn, 'r') as f: 129 | for line in f: 130 | key, value = line.rstrip().split() 131 | data.append((key, int(value))) 132 | return data 133 | -------------------------------------------------------------------------------- /src/utils/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | 5 | import numpy as np 6 | import theano 7 | 8 | from utils.vocab import Vocab 9 | 10 | 11 | def write(s, stream=sys.stdout): 12 | stream.write(s + '\n') 13 | stream.flush() 14 | 15 | 16 | def show_score_history(history, memo=''): 17 | write('F1 HISTORY' + memo) 18 | for k, v in sorted(history.items()): 19 | epoch_tm = '\t- EPOCH-{:d} '.format(k) 20 | if len(v) == 1: 21 | f1_valid = '\tBEST VALID {:>7.2%}'.format(v[0]) 22 | write(epoch_tm + f1_valid) 23 | else: 24 | v1, v2 = v 25 | f1_valid = '\tBEST VALID {:>7.2%}'.format(v1) 26 | f1_evalu = '\tEVALU {:>7.2%}'.format(v2) 27 | write(epoch_tm + f1_valid + f1_evalu) 28 | 29 | 30 | def str_to_id(sent, vocab, unk): 31 | """ 32 | :param sent: 1D: n_words 33 | :param vocab: Vocab() 34 | :return: 1D: n_words; elem=id 35 | """ 36 | return list(map(lambda w: vocab.get_id(w) if vocab.has_key(w) else vocab.get_id(unk), sent)) 37 | 38 | 39 | def make_vocab_from_ids(key_value_format): 40 | vocab = Vocab() 41 | for key, value in key_value_format: 42 | vocab.add_word(key) 43 | return vocab 44 | 45 | 46 | def array(sample, is_float=False): 47 | if is_float: 48 | return np.asarray(sample, dtype=theano.config.floatX) 49 | return np.asarray(sample, dtype='int32') 50 | 51 | 52 | def average_vector(emb): 53 | return np.mean(np.asarray(emb[2:], dtype=theano.config.floatX), axis=0) 54 | 55 | 56 | def unit_vector(vecs, axis): 57 | return vecs / np.sqrt(np.sum(vecs ** 2, axis=axis, keepdims=True)) 58 | 59 | 60 | def make_output_dir(argv): 61 | if argv.output_dir: 62 | output_dir = argv.output_dir 63 | else: 64 | output_dir = 'output' 65 | os.makedirs(output_dir, exist_ok=True) 66 | 67 | 68 | def join_dir_and_file_names(dir_name, file_name): 69 | return os.path.join(dir_name, file_name) 70 | 71 | 72 | def get_file_names_in_dir(dir_path, prefix=None, suffix=None): 73 | file_names = glob.glob(dir_path + '/*') 74 | if prefix: 75 | file_names = [fn for fn in file_names 76 | if os.path.basename(fn).startswith(prefix)] 77 | if suffix: 78 | file_names = [fn for fn in file_names 79 | if fn.endswith(suffix)] 80 | return file_names 81 | 82 | 83 | def get_latest_param_fn(file_names): 84 | latest_epoch = -1 85 | latest_fn = None 86 | for fn in file_names: 87 | for elem in fn.split('.'): 88 | if elem.startswith('epoch'): 89 | epoch = int(elem[6:]) 90 | if latest_epoch < epoch: 91 | latest_epoch = epoch 92 | latest_fn = fn 93 | break 94 | assert latest_fn is not None 95 | return latest_fn, latest_epoch 96 | 97 | 98 | def span_to_span_index(i, j, n_words): 99 | return i * (n_words - 1) + j - np.arange(i).sum() 100 | -------------------------------------------------------------------------------- /src/utils/savers.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import 

--------------------------------------------------------------------------------
/src/utils/savers.py:
--------------------------------------------------------------------------------
import gzip
import pickle
import json


class Saver(object):

    def __init__(self, argv):
        self.argv = argv

    def save_props(self, **kwargs):
        raise NotImplementedError

    def save_json_format(self, **kwargs):
        raise NotImplementedError


class SpanSaver(Saver):
    def save_props(self, corpus, labels, vocab_label):
        """
        :param corpus: 1D: n_sents, 2D: n_words; elem=line
        :param labels: 1D: n_sents, 2D: n_prds, 3D: n_spans, 4D: [r, i, j]
        :param vocab_label: Vocab()
        """
        assert len(corpus) == len(labels), '%d %d' % (len(corpus), len(labels))

        fn = self.argv.output_dir
        if self.argv.output_fn:
            fn += '/results.%s.prop' % self.argv.output_fn
        else:
            fn += '/results.prop'

        with open(fn, 'w') as f:
            for sent, spans_sent in zip(corpus, labels):
                columns = [[mark] for mark in sent.marks]
                n_words = sent.n_words
                assert len(sent.prd_indices) == len(spans_sent)
                for prd_index, spans in zip(sent.prd_indices, spans_sent):
                    prop = self._span_to_prop(spans=spans,
                                              prd_index=prd_index,
                                              n_words=n_words,
                                              vocab_label=vocab_label)
                    for i, p in enumerate(prop):
                        columns[i].append(p)
                for c in columns:
                    f.write("%s\n" % "\t".join(c))
                f.write("\n")

    def save_json_format(self, corpus, labels, vocab_label):
        """
        :param corpus: 1D: n_sents, 2D: n_words; elem=line
        :param labels: 1D: n_sents, 2D: n_prds, 3D: n_spans, 4D: [r, i, j]
        :param vocab_label: Vocab()
        """
        assert len(corpus) == len(labels), '%d %d' % (len(corpus), len(labels))

        fn = self.argv.output_dir
        if self.argv.output_fn:
            fn += '/results.%s.json' % self.argv.output_fn
        else:
            fn += '/results.json'

        with open(fn, 'w') as f:
            corpus_dic = {}
            for sent_index, (sent, spans_sent) in enumerate(zip(corpus, labels)):
                assert len(sent.prd_indices) == len(spans_sent)

                prop_dic = {}
                for prd_index, spans in zip(sent.prd_indices, spans_sent):
                    arg_dic = {}
                    for (r, i, j) in spans:
                        key = '(%s,%d,%d)' % (vocab_label.get_word(r), i, j)
                        value = " ".join(sent.strings[i: j + 1])
                        arg_dic[key] = value

                    prd_dic = {'prd': sent.forms[prd_index],
                               'arg': arg_dic}
                    prop_dic['prd-%d' % prd_index] = prd_dic

                sent_dic = {'text': " ".join(sent.strings),
                            'mark': " ".join(sent.marks),
                            'prop': prop_dic}
                corpus_dic['sent-%d' % sent_index] = sent_dic

            json.dump(corpus_dic, f, indent=4)

    @staticmethod
    def _span_to_prop(spans, prd_index, n_words, vocab_label):
        """
        :param spans: 1D: n_spans, 2D: [r, i, j]
        :return: 1D: n_words; elem=str; e.g. '(A0*' or '*)'
        """
        prop = ['*' for _ in range(n_words)]
        prop[prd_index] = '(V*)'
        for (label_id, pre_index, post_index) in spans:
            label = vocab_label.get_word(label_id)
            if pre_index == post_index:  # single-word span
                prop[pre_index] = '(%s*)' % label
            else:
                prop[pre_index] = '(%s*' % label
                prop[post_index] = '*)'
        return prop
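For intuition, here is how `SpanSaver._span_to_prop` renders span triples as a CoNLL-style bracket column (an illustrative sketch with made-up labels and indices):
```
from utils.vocab import Vocab
from utils.savers import SpanSaver

vocab_label = Vocab()
for label in ('A0', 'A1'):
    vocab_label.add_word(label)

spans = [(vocab_label.get_id('A0'), 0, 1),   # words 0-1
         (vocab_label.get_id('A1'), 3, 3)]   # word 3 only
prop = SpanSaver._span_to_prop(spans=spans, prd_index=2, n_words=5,
                               vocab_label=vocab_label)
print(prop)  # ['(A0*', '*)', '(V*)', '(A1*)', '*']
```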

class BIOSaver(Saver):
    def save_props(self, corpus, labels, vocab_label):
        """
        :param corpus: 1D: n_sents, 2D: n_words; elem=line
        :param labels: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label id
        :param vocab_label: Vocab()
        """
        assert len(corpus) == len(labels), '%d %d' % (len(corpus), len(labels))

        fn = self.argv.output_dir
        if self.argv.output_fn:
            fn += '/results.%s.prop' % self.argv.output_fn
        else:
            fn += '/results.prop'

        with open(fn, 'w') as f:
            for sent, labels_sent in zip(corpus, labels):
                columns = [[mark] for mark in sent.marks]
                for labels_prd in labels_sent:
                    assert len(columns) == len(labels_prd)
                    spans = self._get_spans(labels_prd, vocab_label)
                    props = self._span_to_prop(len(labels_prd), spans)
                    for i, p in enumerate(props):
                        columns[i].append(p)
                for c in columns:
                    f.write("%s\n" % "\t".join(c))
                f.write("\n")

    def save_json_format(self, corpus, labels, vocab_label):
        """
        :param corpus: 1D: n_sents, 2D: n_words; elem=line
        :param labels: 1D: n_sents, 2D: n_prds, 3D: n_words; elem=label id
        :param vocab_label: Vocab()
        """
        assert len(corpus) == len(labels), '%d %d' % (len(corpus), len(labels))

        fn = self.argv.output_dir
        if self.argv.output_fn:
            fn += '/results.%s.json' % self.argv.output_fn
        else:
            fn += '/results.json'

        with open(fn, 'w') as f:
            corpus_dic = {}
            for sent_index, (sent, labels_sent) in enumerate(zip(corpus, labels)):
                assert len(sent.prd_indices) == len(labels_sent)

                prop_dic = {}
                for prd_index, labels_prd in zip(sent.prd_indices, labels_sent):
                    arg_dic = {}
                    spans = self._get_spans(labels_prd, vocab_label)
                    for (label, i, j) in spans:
                        if label == 'V':  # the predicate is recorded separately below
                            continue
                        key = '(%s,%d,%d)' % (label, i, j)
                        value = " ".join(sent.strings[i: j + 1])
                        arg_dic[key] = value

                    prd_dic = {'prd': sent.forms[prd_index],
                               'arg': arg_dic}
                    prop_dic['prd-%d' % prd_index] = prd_dic

                sent_dic = {'text': " ".join(sent.strings),
                            'mark': " ".join(sent.marks),
                            'prop': prop_dic}
                corpus_dic['sent-%d' % sent_index] = sent_dic

            json.dump(corpus_dic, f, indent=4)

    @staticmethod
    def _get_spans(labels, vocab_label):
        """
        :param labels: 1D: n_words; elem=label id
        :param vocab_label: Vocab() of BIO labels
        :return: 1D: n_spans; elem=[label, i, j]
        """
        spans = []
        span = []
        for w_i, label_id in enumerate(labels):
            label = vocab_label.get_word(label_id)
            if label.startswith('B-'):
                if span:
                    spans.append(span)
                span = [label[2:], w_i, w_i]
            elif label.startswith('I-'):
                if span:
                    if label[2:] == span[0]:
                        span[2] = w_i  # extend the current span
                    else:  # label mismatch: close the span and start a new one
                        spans.append(span)
                        span = [label[2:], w_i, w_i]
                else:  # stray I- tag: treat it as the beginning of a span
                    span = [label[2:], w_i, w_i]
            else:  # 'O'
                if span:
                    spans.append(span)
                span = []
        if span:
            spans.append(span)
        return spans
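`BIOSaver._get_spans` runs the opposite direction: it decodes a BIO tag sequence back into `[label, i, j]` spans. A small illustration with a made-up tag sequence (again assuming `src/` is importable):
```
from utils.vocab import Vocab
from utils.savers import BIOSaver

vocab_label = Vocab()
for tag in ('B-A0', 'I-A0', 'B-V', 'B-A1', 'O'):
    vocab_label.add_word(tag)

tags = ['B-A0', 'I-A0', 'B-V', 'B-A1', 'O']
label_ids = [vocab_label.get_id(t) for t in tags]
print(BIOSaver._get_spans(label_ids, vocab_label))
# -> [['A0', 0, 1], ['V', 2, 2], ['A1', 3, 3]]
```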
    @staticmethod
    def _span_to_prop(n_words, spans):
        """
        :param n_words: int
        :param spans: 1D: n_spans; elem=[label, i, j]; sorted by start index
        :return: 1D: n_words; elem=bracket label
        """
        k = 0  # index of the next span to close
        args = []
        for w_i in range(n_words):
            if k >= len(spans):
                args.append('*')
                continue
            span = spans[k]
            if span[1] < w_i < span[2]:  # inside a span
                args.append('*')
            elif w_i == span[1] and w_i == span[2]:  # single-word span
                args.append('(' + span[0] + '*)')
                k += 1
            elif w_i == span[1]:  # beginning of a span
                args.append('(' + span[0] + '*')
            elif w_i == span[2]:  # end of a span
                args.append('*)')
                k += 1
            else:  # outside any span
                args.append('*')
        return args


def save_pickle(fn, data):
    with gzip.open(fn + '.pkl.gz', 'wb') as gf:
        pickle.dump(data, gf, pickle.HIGHEST_PROTOCOL)


def save_key_value_format(fn, keys, values):
    assert len(keys) == len(values)
    if not isinstance(values[0], str):
        values = [str(v) for v in values]
    with open(fn + '.txt', 'w') as f:
        for key, value in zip(keys, values):
            f.write("%s\t%s\n" % (key, value))
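`save_key_value_format` pairs with `load_key_value_format` from `utils/loaders.py`; a round trip looks like this (illustrative sketch; the file name is hypothetical and the `output/` directory is assumed to exist):
```
from utils.savers import save_key_value_format
from utils.loaders import load_key_value_format

save_key_value_format('output/label_ids', ['O', 'B-A0', 'I-A0'], [0, 1, 2])
print(load_key_value_format('output/label_ids.txt'))
# -> [('O', 0), ('B-A0', 1), ('I-A0', 2)]
```
`make_vocab_from_ids` in `utils/misc.py` then rebuilds a `Vocab` from the loaded pairs.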

--------------------------------------------------------------------------------
/src/utils/sent.py:
--------------------------------------------------------------------------------
import numpy as np

from utils.misc import array, str_to_id
from utils.vocab import HYPH, UNK


class Sent(object):
    def __init__(self, sent, is_test=True):
        self.words = self._make_words(sent=sent, is_test=is_test)

        self.forms = [word.form for word in self.words]
        self.strings = [word.string for word in self.words]
        self.marks = self._set_marks(self.words)
        self.props = [word.prop for word in self.words]

        self.prd_indices = self._set_prd_indices(self.marks)
        self.prd_forms = [self.forms[i] for i in self.prd_indices]
        self.prd_bio_labels = self._set_prd_bio_labels(self.props)
        self.has_prds = len(self.prd_indices) > 0

        self.n_words = len(sent)
        self.n_prds = len(self.prd_indices)

        self.word_ids = None
        self.mark_ids = None
        self.elmo_emb = None
        self.bio_label_ids = None
        self.span_triples = None
        self.span_triples_with_null = None

    def _make_words(self, sent, is_test=True):
        return [self._make_word(line, is_test) for line in sent]

    @staticmethod
    def _make_word(line, is_test=True):
        raise NotImplementedError

    def _set_marks(self, words):
        raise NotImplementedError

    @staticmethod
    def _make_bio_labels(prop):
        """
        :param prop: 1D: n_words; elem=bracket label
        :return: 1D: n_words; elem=BIO label
        """
        labels = []
        prev = None
        for arg in prop:
            if arg.startswith('('):
                if arg.endswith(')'):  # single-word span, e.g. '(A0*)'
                    prev = arg.split("*")[0][1:]
                    label = 'B-' + prev
                    prev = None
                else:  # beginning of a span, e.g. '(A0*'
                    prev = arg[1:-1]
                    label = 'B-' + prev
            else:
                if prev:  # inside a span
                    label = 'I-' + prev
                    if arg.endswith(')'):  # end of the span, i.e. '*)'
                        prev = None
                else:
                    label = 'O'
            labels.append(label)
        return labels

    @staticmethod
    def _set_prd_indices(marks):
        return [i for i, mark in enumerate(marks) if mark != HYPH]

    def _set_prd_bio_labels(self, props):
        """
        :param props: 1D: n_words, 2D: n_prds
        :return: 1D: n_prds, 2D: n_words
        """
        props = zip(*props)  # transpose: one bracket column per predicate
        return [self._make_bio_labels(prop) for prop in props]

    def set_word_ids(self, vocab_word):
        self.word_ids = array(str_to_id(sent=self.forms,
                                        vocab=vocab_word,
                                        unk=UNK))

    def set_mark_ids(self):
        mark_ids = [[0 for _ in range(self.n_words)] for _ in range(self.n_prds)]
        for i, prd_index in enumerate(self.prd_indices):
            mark_ids[i][prd_index] = 1
        self.mark_ids = array(mark_ids)

    def set_label_ids(self, vocab_label):
        """
        :param vocab_label: Vocab (BIO labels); e.g. B-A0, I-A0
        """
        assert len(self.prd_indices) == len(self.prd_bio_labels)
        label_ids = []
        for prd_index, props in zip(self.prd_indices, self.prd_bio_labels):
            y = str_to_id(sent=props, vocab=vocab_label, unk='O')
            label_ids.append(y)
        self.bio_label_ids = array(label_ids)

    def set_elmo_emb(self, elmo_emb):
        """
        :param elmo_emb: 1D: n_layers, 2D: n_words, 3D: dim
        """
        elmo_emb = np.asarray(elmo_emb)
        elmo_emb = elmo_emb.transpose((1, 0, 2))  # -> (n_words, n_layers, dim)
        assert len(elmo_emb) == self.n_words
        self.elmo_emb = elmo_emb

    def set_span_triples(self, vocab_label):
        """
        :param vocab_label: Vocab (labels); e.g. A0, A1
        """
        triples = []
        for bio_labels in self.prd_bio_labels:
            prd_triples = []
            for (label, i, j) in self._get_spans(bio_labels):
                r = vocab_label.get_id(label)
                prd_triples.append((r, i, j))
            triples.append(prd_triples)
        self.span_triples = triples

    @staticmethod
    def _get_spans(bio_labels):
        """
        :param bio_labels: 1D: n_words; elem=bio label
        :return: 1D: n_spans; elem=[label, i, j]
        """
        spans = []
        span = []
        for i, label in enumerate(bio_labels):
            if label[-2:] == '-V':  # skip the predicate itself
                continue
            if label.startswith('B-'):
                if span:
                    spans.append(span)
                span = [label[2:], i, i]
            elif label.startswith('I-'):
                if span:
                    if label[2:] == span[0]:
                        span[2] = i
                    else:
                        spans.append(span)
                        span = [label[2:], i, i]
                else:
                    span = [label[2:], i, i]
            else:
                if span:
                    spans.append(span)
                span = []
        if span:
            spans.append(span)
        return spans

    def set_span_triples_with_null(self, n_labels):
        assert len(self.span_triples) == len(self.prd_indices)
        triples_with_null = []
        for prd_index, spans in zip(self.prd_indices, self.span_triples):
            used_labels = [r for (r, i, j) in spans]
            # Labels with no gold span receive a null span at the predicate index
            null_spans = [(r, prd_index, prd_index)
                          for r in range(n_labels)
                          if r not in used_labels]
            triples = spans + null_spans
            triples.sort(key=lambda s: s[0])
            triples_with_null.append(triples)
        self.span_triples_with_null = triples_with_null


class Conll05Sent(Sent):
    @staticmethod
    def _make_word(line, is_test=False):
        return Word(form=line[0],
                    mark=line[5] if is_test is False else line[4],
                    sense=line[4] if is_test is False else None,
                    prop=line[6:] if is_test is False else [])

    def _set_marks(self, words):
        return [word.mark for word in words]


class Conll12Sent(Sent):
    @staticmethod
    def _make_word(line, is_test=False):
        return Word(form=line[3],
                    mark=line[6],
                    sense=line[7],
                    prop=line[11:-1] if is_test is False else [])

    def _set_marks(self, words):
        return [w.mark if w.sense != HYPH else HYPH for w in words]


class Word(object):
    def __init__(self, form, mark, sense, prop):
        self.form = form.lower()
        self.string = form
        self.mark = mark
        self.sense = sense
        self.prop = prop
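`Sent._make_bio_labels` converts one predicate's bracket column into BIO tags, i.e. the inverse of `SpanSaver._span_to_prop` above (illustrative input, not from the data):
```
from utils.sent import Sent

prop = ['(A0*', '*)', '(V*)', '(A1*)', '*']
print(Sent._make_bio_labels(prop))
# -> ['B-A0', 'I-A0', 'B-V', 'B-A1', 'O']
```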

--------------------------------------------------------------------------------
/src/utils/vocab.py:
--------------------------------------------------------------------------------
HYPH = u'-'
UNK = u'UNKNOWN'


class Vocab(object):
    def __init__(self):
        self.i2w = []
        self.w2i = {}

    def add_word(self, word):
        if word not in self.w2i:
            new_id = self.size()
            self.i2w.append(word)
            self.w2i[word] = new_id

    def get_id(self, word):
        return self.w2i.get(word)

    def get_id_or_unk_id(self, word):
        if word in self.w2i:
            return self.w2i.get(word)
        return self.w2i.get(UNK)

    def get_and_add_id(self, word):
        self.add_word(word)
        return self.w2i.get(word)

    def get_word(self, w_id):
        return self.i2w[w_id]

    def has_key(self, word):
        return word in self.w2i

    def size(self):
        return len(self.i2w)

--------------------------------------------------------------------------------
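Finally, a minimal sketch of how `Vocab` and `utils.misc.str_to_id` interact (made-up words; ids follow insertion order):
```
from utils.vocab import Vocab, UNK
from utils.misc import str_to_id

vocab = Vocab()
for w in (UNK, 'alpha', 'beta'):
    vocab.add_word(w)

print(str_to_id(['alpha', 'beta', 'gamma'], vocab, UNK))
# -> [1, 2, 0]  ('gamma' is out of vocabulary, so it falls back to UNK's id)
```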