├── pics └── result1.jpg ├── bilm ├── __init__.py ├── elmo.py ├── data.py └── model.py ├── README.md ├── util ├── log_wrapper.py ├── tokenizer.py ├── h5py_generator.py ├── spacy_tokenizer.py ├── get_tfrecords.py └── util.py ├── loss.py ├── test.py ├── train_finetune.py ├── layers.py ├── train_h5py.py ├── layersV0.py ├── train_tfrecords.py ├── RMR_modelV3.py ├── RMR_modelV6.py └── RMR_modelV6_squad2.py /pics/result1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ewrfcas/Reinforced-Mnemonic-Reader/HEAD/pics/result1.jpg -------------------------------------------------------------------------------- /bilm/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .data import Batcher, TokenBatcher 3 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \ 4 | dump_bilm_embeddings 5 | from .elmo import weight_layers, all_layers 6 | 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Reinforced Mnemonic Reader in TensorFlow 2 | RMR: https://arxiv.org/abs/1705.02798 3 | 4 | ## Pipeline 5 | 1. Run ``preprocess.ipynb`` to build the input datasets. 6 | 2. Run ``train_h5py.py`` to start training. ELMo and CoVe are currently not usable. 7 | 8 | ### Notes 9 | 1. `conv1d` from `tensor2tensor` is used instead of the matrix multiplication (fully connected) operation in the RMR model (see the sketch after these notes). 10 | 2. Feel free to discuss any problem about this project (especially the RL loss). 11 | 3. The reinforcement loss should only be applied after the cross-entropy loss has converged. 12 | 4. RMR_modelV3 is based on version 3 and RMR_modelV6 on version 6 of [https://arxiv.org/abs/1705.02798v3]. v3 seems to perform better than v6.
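A minimal sketch of note 1 (my own illustration, not code from this repository, using plain `tf.layers.conv1d` rather than the `tensor2tensor` wrapper the repo imports): a kernel-size-1 `conv1d` over a `[batch, length, dim]` tensor applies the same linear projection independently at every position, so it behaves exactly like the fully connected matmul it replaces.

```python
# Illustrative sketch only (plain TF 1.x, not code from this repository).
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, None, 128])  # [batch, len, dim], e.g. aligned context states

# A kernel-size-1 conv1d and a dense layer both project the last axis
# position by position, so their outputs have the same shape [batch, len, 300].
proj_conv = tf.layers.conv1d(x, filters=300, kernel_size=1, name='proj_conv')
proj_dense = tf.layers.dense(x, units=300, name='proj_dense')

print(proj_conv.shape, proj_dense.shape)  # (?, ?, 300) (?, ?, 300)
```

Regarding note 3, the training/evaluation scripts keep the RL loss weight `rlw` at 0.0 at first and only raise it towards `rlw2` once the learning rate (and with it the cross-entropy loss) has dropped far enough; see the `learning_rate` check in `test.py` and the commented-out block in `train_h5py.py`.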
13 | 14 | ## Updates 15 | - [x] Init for the RMR model (without dynamic-critical reinforcement learning, DCRL) 16 | - [x] Add the self-critical sequence training (SCST) (no test) 17 | - [x] Update cuDNN LSTM and SQuAD 2.0 18 | - [x] Update v3 in modelV0 19 | - [ ] Test the RL loss 20 | 21 | ## Results 22 | 23 | ### Results on the dev set of SQuAD 1.1 24 | EM:71.17% F1:79.56% (no ELMo, no CoVe, paper v6) 25 | 26 | EM:74.37% F1:82.67% (hidden size=256, +ELMo, v6) 27 | 28 | EM:72.08% F1:80.51% (no ELMo, no CoVe, paper v3) 29 | 30 | EM:72.87% F1:81.47% (PyTorch version, based on paper v3) 31 | 32 | ### Results on the dev set of SQuAD 2.0 33 | EM:64.89% F1:67.81% (+ELMo +CoVe, v3) -------------------------------------------------------------------------------- /util/log_wrapper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | from time import gmtime, strftime 4 | from colorlog import ColoredFormatter 5 | 6 | def create_logger(name, silent=False, to_disk=False, log_file=None, prefix=None): 7 | """Logger wrapper 8 | by xiaodl 9 | """ 10 | # setup logger 11 | log = logging.getLogger(name) 12 | log.setLevel(logging.DEBUG) 13 | formatter = ColoredFormatter( 14 | "%(asctime)s %(log_color)s%(levelname)-8s%(reset)s [%(blue)s%(message)s%(reset)s]", 15 | datefmt='%Y-%m-%d %I:%M:%S', 16 | reset=True, 17 | log_colors={ 18 | 'DEBUG': 'cyan', 19 | 'INFO': 'green', 20 | 'WARNING': 'yellow', 21 | 'ERROR': 'red', 22 | 'CRITICAL': 'red,bg_white', 23 | }, 24 | secondary_log_colors={}, 25 | style='%' 26 | ) 27 | fformatter = logging.Formatter( 28 | "%(asctime)s [%(funcName)-12s] %(levelname)-8s [%(message)s]", 29 | datefmt='%Y-%m-%d %I:%M:%S', 30 | style='%' 31 | ) 32 | if not silent: 33 | ch = logging.StreamHandler(sys.stdout) 34 | ch.setLevel(logging.INFO) 35 | ch.setFormatter(formatter) 36 | log.addHandler(ch) 37 | if to_disk: 38 | prefix = prefix if prefix is not None else 'my_log' 39 | log_file = log_file if log_file is not None else strftime('{}-%Y-%m-%d-%H-%M-%S.log'.format(prefix), gmtime()) 40 | fh = logging.FileHandler(log_file) 41 | fh.setLevel(logging.DEBUG) 42 | fh.setFormatter(fformatter) 43 | log.addHandler(fh) 44 | # disable elmo info 45 | log.propagate = False 46 | return log 47 | -------------------------------------------------------------------------------- /util/tokenizer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created October, 2017 3 | Author: xiaodl@microsoft.com 4 | ''' 5 | import re 6 | import warnings 7 | import spacy 8 | import tqdm 9 | import logging 10 | import unicodedata 11 | from collections import Counter 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | PAD = '<PAD>' 17 | UNK = '<UNK>' 18 | STA = '<STA>' 19 | END = '<END>' 20 | 21 | PAD_ID = 0 22 | UNK_ID = 1 23 | STA_ID = 2 24 | END_ID = 3 25 | 26 | DigitsMapper = {'0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten', 27 | 'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7','eight': '8', 'nine': '9', 'ten': '10'} 28 | 29 | def normal_query(query, document): 30 | """ normalize digits 31 | """ 32 | nq = [] 33 | for w in query: 34 | if w in DigitsMapper and w not in document: 35 | if DigitsMapper[w] in document: 36 | w = DigitsMapper[w] 37 | nq.append(w) 38 | return nq 39 | 40 | 41 | def normalize_text(text): 42 | return unicodedata.normalize('NFD', text) 43 | 44 | def token_extend(reg_rules):
45 | return ' ' + reg_rules.group(0) + ' ' 46 | 47 | def reform_text(text): 48 | text = re.sub(u'-|¢|¥|€|£|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015|%|\[|\]|:|\(|\)|/', token_extend, text) 49 | text = text.strip(' \n') 50 | text = re.sub('\s+', ' ', text) 51 | return text 52 | 53 | def reform_simple(text): 54 | text = text.strip(' \n') 55 | text = re.sub('\s+', ' ', text) 56 | return text 57 | 58 | class Vocabulary(object): 59 | INIT_LEN = 4 60 | def __init__(self, neat=False): 61 | self.neat = neat 62 | if not neat: 63 | self.tok2ind = {PAD: PAD_ID, UNK: UNK_ID, STA: STA_ID, END: END_ID} 64 | self.ind2tok = {PAD_ID: PAD, UNK_ID: UNK, STA_ID: STA, END_ID:END} 65 | else: 66 | self.tok2ind = {} 67 | self.ind2tok = {} 68 | 69 | def __len__(self): 70 | return len(self.tok2ind) 71 | 72 | def __iter__(self): 73 | return iter(self.tok2ind) 74 | 75 | def __contains__(self, key): 76 | if type(key) == int: 77 | return key in self.ind2tok 78 | elif type(key) == str: 79 | return key in self.tok2ind 80 | 81 | def __getitem__(self, key): 82 | if type(key) == int: 83 | return self.ind2tok.get(key, -1) if self.neat else self.ind2tok.get(key, UNK) 84 | if type(key) == str: 85 | return self.tok2ind.get(key, None) if self.neat else self.tok2ind.get(key,self.tok2ind.get(UNK)) 86 | 87 | def __setitem__(self, key, item): 88 | if type(key) == int and type(item) == str: 89 | self.ind2tok[key] = item 90 | elif type(key) == str and type(item) == int: 91 | self.tok2ind[key] = item 92 | else: 93 | raise RuntimeError('Invalid (key, item) types.') 94 | 95 | def add(self, token): 96 | if token not in self.tok2ind: 97 | index = len(self.tok2ind) 98 | self.tok2ind[token] = index 99 | self.ind2tok[index] = token 100 | 101 | def get_vocab_list(self, with_order=True): 102 | if with_order: 103 | words = [self[k] for k in range(0, len(self))] 104 | else: 105 | words = [k for k in self.tok2ind.keys() 106 | if k not in {PAD, UNK, STA, END}] 107 | return words 108 | 109 | def toidx(self, tokens): 110 | return [self[tok] for tok in tokens] 111 | 112 | def copy(self): 113 | new_vocab = Vocabulary(self.neat) 114 | for w in self: 115 | new_vocab.add(w) 116 | return new_vocab 117 | 118 | def build(words, neat=False): 119 | vocab = Vocabulary(neat) 120 | for w in words: vocab.add(w) 121 | return vocab 122 | -------------------------------------------------------------------------------- /util/h5py_generator.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import time 4 | 5 | class Generator(): 6 | def __init__(self, data_path, qid, batch_size=32, shuffle=True, padding_value=0, data_keys=None, use_elmo=0, 7 | use_cove=0, elmo_path=None, cove_path=None): 8 | self.batch_size = batch_size 9 | if isinstance(qid, str): 10 | self.qid = np.load(qid) 11 | else: 12 | self.qid = qid 13 | self.length = len(self.qid) 14 | self.shuffle = shuffle 15 | self.data_path = data_path 16 | self.max_batch = self.length // self.batch_size 17 | if self.length % self.batch_size != 0: 18 | self.max_batch += 1 19 | self.padding_value = padding_value 20 | if self.shuffle: 21 | self.run_shuffle() 22 | print('Loaded {} samples'.format(self.length)) 23 | self.i_batch = 0 24 | self.data_keys = data_keys 25 | self.use_elmo = use_elmo 26 | self.use_cove = use_cove 27 | if use_elmo == 1: 28 | assert elmo_path is not None 29 | self.elmo_path = elmo_path 30 | if use_cove == 1: 31 | assert cove_path is not None 32 | self.cove_path = cove_path 33 | 34 | self.get_time = 0 35 | self.pad_time = 0 36 
| 37 | def reset(self): 38 | self.i_batch = 0 39 | self.run_shuffle() 40 | 41 | def run_shuffle(self): 42 | if self.shuffle: 43 | np.random.shuffle(self.qid) 44 | else: 45 | pass 46 | 47 | def padding(self, datas): 48 | max_len = max([d.shape[0] for d in datas]) 49 | paded_datas = np.zeros([len(datas), max_len] + list(datas[0].shape[1:]), dtype=datas[0].dtype) 50 | for i in range(len(datas)): 51 | paded_datas[i, 0:datas[i].shape[0]] = datas[i] 52 | return paded_datas 53 | 54 | def __len__(self): 55 | return self.length 56 | 57 | def __next__(self): 58 | batch_data = {} 59 | if self.use_elmo == 1: 60 | elmo_h5f = h5py.File(self.elmo_path, 'r') 61 | if self.use_cove == 1: 62 | cove_h5f = h5py.File(self.cove_path, 'r') 63 | # st = time.time() 64 | with h5py.File(self.data_path, 'r') as h5f: 65 | qid_batch = self.qid[self.i_batch * self.batch_size:(self.i_batch + 1) * self.batch_size] 66 | for id in qid_batch: 67 | group = h5f[str(id)] 68 | # normal features 69 | if self.data_keys is None: 70 | self.data_keys = list(group.keys()) 71 | for k in self.data_keys: 72 | if k not in batch_data: 73 | batch_data[k] = [group[k][:]] 74 | else: 75 | batch_data[k].append(group[k][:]) 76 | # elmo features 77 | if self.use_elmo == 1: 78 | if 'elmo_cont' not in batch_data: 79 | batch_data['elmo_cont'] = [elmo_h5f[str(id) + 'c'][:]] 80 | else: 81 | batch_data['elmo_cont'].append(elmo_h5f[str(id) + 'c'][:]) 82 | if 'elmo_ques' not in batch_data: 83 | batch_data['elmo_ques'] = [elmo_h5f[str(id) + 'q'][:]] 84 | else: 85 | batch_data['elmo_ques'].append(elmo_h5f[str(id) + 'q'][:]) 86 | # cove features 87 | if self.use_cove == 1: 88 | if 'cove_cont' not in batch_data: 89 | batch_data['cove_cont'] = [cove_h5f[str(id) + 'c'][:]] 90 | else: 91 | batch_data['cove_cont'].append(cove_h5f[str(id) + 'c'][:]) 92 | if 'cove_ques' not in batch_data: 93 | batch_data['cove_ques'] = [cove_h5f[str(id) + 'q'][:]] 94 | else: 95 | batch_data['cove_ques'].append(cove_h5f[str(id) + 'q'][:]) 96 | if self.use_elmo == 1: 97 | elmo_h5f.close() 98 | if self.use_cove == 1: 99 | cove_h5f.close() 100 | # ed = time.time() 101 | # self.get_time += float(ed - st) 102 | 103 | # st = time.time() 104 | for k in batch_data: 105 | batch_data[k] = self.padding(batch_data[k]) 106 | # ed = time.time() 107 | # self.pad_time += float(ed - st) 108 | # print('get_time:', self.get_time) 109 | # print('pad_time:', self.pad_time) 110 | self.i_batch += 1 111 | if self.i_batch == self.max_batch: 112 | self.i_batch = 0 113 | self.run_shuffle() 114 | return batch_data 115 | -------------------------------------------------------------------------------- /bilm/elmo.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def weight_layers(name, bilm_ops, l2_coef=None, 5 | use_top_only=False, do_layer_norm=False): 6 | ''' 7 | Weight the layers of a biLM with trainable scalar weights to 8 | compute ELMo representations. 9 | 10 | For each output layer, this returns two ops. The first computes 11 | a layer specific weighted average of the biLM layers, and 12 | the second the l2 regularizer loss term. 13 | The regularization terms are also add to tf.GraphKeys.REGULARIZATION_LOSSES 14 | 15 | Input: 16 | name = a string prefix used for the trainable variable names 17 | bilm_ops = the tensorflow ops returned to compute internal 18 | representations from a biLM. This is the return value 19 | from BidirectionalLanguageModel(...)(ids_placeholder) 20 | l2_coef: the l2 regularization coefficient $\lambda$. 
21 | Pass None or 0.0 for no regularization. 22 | use_top_only: if True, then only use the top layer. 23 | do_layer_norm: if True, then apply layer normalization to each biLM 24 | layer before normalizing 25 | 26 | Output: 27 | { 28 | 'weighted_op': op to compute weighted average for output, 29 | 'regularization_op': op to compute regularization term 30 | } 31 | ''' 32 | 33 | def _l2_regularizer(weights): 34 | if l2_coef is not None: 35 | return l2_coef * tf.reduce_sum(tf.square(weights)) 36 | else: 37 | return 0.0 38 | 39 | # Get ops for computing LM embeddings and mask 40 | lm_embeddings = bilm_ops['lm_embeddings'] 41 | mask = bilm_ops['mask'] 42 | 43 | n_lm_layers = int(lm_embeddings.get_shape()[1]) 44 | lm_dim = int(lm_embeddings.get_shape()[3]) 45 | 46 | with tf.control_dependencies([lm_embeddings, mask]): 47 | # Cast the mask and broadcast for layer use. 48 | mask_float = tf.cast(mask, 'float32') 49 | broadcast_mask = tf.expand_dims(mask_float, axis=-1) 50 | 51 | def _do_ln(x): 52 | # do layer normalization excluding the mask 53 | x_masked = x * broadcast_mask 54 | N = tf.reduce_sum(mask_float) * lm_dim 55 | mean = tf.reduce_sum(x_masked) / N 56 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask) ** 2 57 | ) / N 58 | return tf.nn.batch_normalization( 59 | x, mean, variance, None, None, 1E-12 60 | ) 61 | 62 | if use_top_only: 63 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 64 | # just the top layer 65 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1) 66 | # no regularization 67 | reg = 0.0 68 | else: 69 | W = tf.get_variable( 70 | '{}_ELMo_W'.format(name), 71 | shape=(n_lm_layers,), 72 | initializer=tf.zeros_initializer, 73 | regularizer=_l2_regularizer, 74 | trainable=True, 75 | ) 76 | 77 | # normalize the weights 78 | normed_weights = tf.split( 79 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers 80 | ) 81 | # split LM layers 82 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 83 | 84 | # compute the weighted, normalized LM activations 85 | pieces = [] 86 | for w, t in zip(normed_weights, layers): 87 | if do_layer_norm: 88 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1))) 89 | else: 90 | pieces.append(w * tf.squeeze(t, squeeze_dims=1)) 91 | sum_pieces = tf.add_n(pieces) 92 | 93 | # get the regularizer 94 | reg = [ 95 | r for r in tf.get_collection( 96 | tf.GraphKeys.REGULARIZATION_LOSSES) 97 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0 98 | ] 99 | if len(reg) != 1: 100 | raise ValueError 101 | 102 | # scale the weighted sum by gamma 103 | gamma = tf.get_variable( 104 | '{}_ELMo_gamma'.format(name), 105 | shape=(1,), 106 | initializer=tf.ones_initializer, 107 | regularizer=None, 108 | trainable=True, 109 | ) 110 | weighted_lm_layers = sum_pieces * gamma 111 | 112 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg} 113 | 114 | return ret 115 | 116 | 117 | def all_layers(bilm_ops): 118 | # Get ops for computing LM embeddings and mask 119 | lm_embeddings = bilm_ops['lm_embeddings'] 120 | 121 | return lm_embeddings 122 | 123 | -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def mask_to_start(score, start, score_mask_value=-1e30): 5 | score_mask = tf.cast(tf.ones_like(start) - tf.cumsum(start, axis=-1), tf.float32) 6 | return score + score_mask * score_mask_value 7 | 8 | 9 | def mask_to_topk(score, topk_ind, c_maxlen, score_mask_value=-1e30): 10 | score_mask = 
tf.reduce_sum(tf.one_hot(topk_ind, depth=c_maxlen), axis=-2) # [bs, topk]->[bs, topk, c_len]->[bs, c_len] 11 | score_mask = tf.cast(tf.ones_like(score_mask) - score_mask, tf.float32) 12 | return score + score_mask * score_mask_value 13 | 14 | 15 | def get_tf_f1(y_pred, y_true): 16 | y_true = tf.cast(y_true, tf.float32) 17 | y_union = tf.clip_by_value(y_pred + y_true, 0, 1) # [bs, c_maxlen] 18 | y_diff = tf.abs(y_pred - y_true) # [bs, c_maxlen] 19 | num_same = tf.cast(tf.reduce_sum(y_union, axis=-1) - tf.reduce_sum(y_diff, axis=-1), tf.float32) # [bs,] 20 | y_precision = num_same / (tf.cast(tf.reduce_sum(y_pred, axis=-1), tf.float32) + 1e-7) # [bs,] 21 | y_recall = num_same / (tf.cast(tf.reduce_sum(y_true, axis=-1), tf.float32) + 1e-7) # [bs,] 22 | y_f1 = (2.0 * y_precision * y_recall) / (tf.cast(y_precision + y_recall, tf.float32) + 1e-7) # [bs,] 23 | return tf.clip_by_value(y_f1, 0, 1) 24 | 25 | 26 | def rl_loss(logits_start, logits_end, y_start, y_end, c_maxlen, rl_loss_type = 'topk_DCRL', topk=3): 27 | assert rl_loss_type == 'DCRL' or rl_loss_type == 'SCST' or rl_loss_type == 'topk_DCRL' 28 | # get ground truth prediction 29 | # s:[0,1,0,0,0], e:[0,0,0,1,0]->[0,1,1,1,1]-[0,0,0,1,1]->[0,1,1,0,0]+e:[0,0,0,1,0]->pred:[0,1,1,1,0] 30 | y_start_cumsum = tf.cumsum(y_start, axis=-1) 31 | y_end_cumsum = tf.cumsum(y_end, axis=-1) 32 | ground_truth = y_start_cumsum - y_end_cumsum + y_end # [bs, c_maxlen] 33 | 34 | # get greedy prediction 35 | greedy_start = tf.one_hot(tf.argmax(logits_start, axis=-1), c_maxlen, 36 | axis=-1) # [bs, c_maxlen]->[bs,]->[bs, c_maxlen] 37 | masked_logits_end = mask_to_start(logits_end, greedy_start) 38 | greedy_end = tf.one_hot(tf.argmax(masked_logits_end, axis=-1), c_maxlen, axis=-1) 39 | greedy_start_cumsum = tf.cumsum(greedy_start, axis=-1) 40 | greedy_end_cumsum = tf.cumsum(greedy_end, axis=-1) 41 | greedy_prediction = greedy_start_cumsum - greedy_end_cumsum + greedy_end # [bs, c_maxlen] 42 | # get greedy f1 43 | greedy_f1 = get_tf_f1(greedy_prediction, ground_truth) 44 | 45 | # get sampled prediction (use tf.multinomial) 46 | if rl_loss_type == 'topk_DCRL': 47 | start_topk_ind = tf.nn.top_k(logits_start, topk).indices # [bs, topk_size] 48 | masked_logits_start = mask_to_topk(logits_start, start_topk_ind, c_maxlen) 49 | else: 50 | masked_logits_start = logits_start 51 | sampled_start_ind = tf.squeeze(tf.multinomial(tf.log(tf.nn.softmax(masked_logits_start)), 1), 52 | axis=-1) # [bs, c_maxlen]->[bs, 1]->[bs,] 53 | sampled_start = tf.one_hot(sampled_start_ind, c_maxlen, axis=-1) # [bs, c_maxlen]->[bs,]->[bs, c_maxlen] 54 | masked_logits_end = mask_to_start(logits_end, sampled_start) 55 | if rl_loss_type == 'topk_DCRL': 56 | end_topk_ind = tf.nn.top_k(masked_logits_end, topk).indices # [bs, topk_size] 57 | masked_logits_end = mask_to_topk(masked_logits_end, end_topk_ind, c_maxlen) 58 | sampled_end_ind = tf.squeeze(tf.multinomial(tf.log(tf.nn.softmax(masked_logits_end)), 1), axis=-1) 59 | sampled_end = tf.one_hot(sampled_end_ind, c_maxlen, axis=-1) 60 | sampled_start_cumsum = tf.cumsum(sampled_start, axis=-1) 61 | sampled_end_cumsum = tf.cumsum(sampled_end, axis=-1) 62 | sampled_prediction = sampled_start_cumsum - sampled_end_cumsum + sampled_end # [bs, c_maxlen] 63 | # get sampled f1 64 | sampled_f1 = get_tf_f1(sampled_prediction, ground_truth) 65 | 66 | reward = tf.stop_gradient(sampled_f1 - greedy_f1) # (sampled - baseline) 67 | sampled_start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_start, labels=sampled_start) 68 | sampled_end_loss = 
tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_end, labels=sampled_end) 69 | 70 | if rl_loss_type == 'DCRL' or rl_loss_type == 'topk_DCRL': 71 | reward = tf.clip_by_value(reward, 0., 1e7) 72 | reward_greedy = tf.clip_by_value(tf.stop_gradient(greedy_f1 - sampled_f1), 0., 1e7) 73 | greedy_start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_start, labels=greedy_start) 74 | greedy_end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_end, labels=greedy_end) 75 | return tf.reduce_mean(reward * (sampled_start_loss + sampled_end_loss) + reward_greedy * ( 76 | greedy_start_loss + greedy_end_loss)), sampled_f1, greedy_f1 77 | elif rl_loss_type == 'SCST': 78 | return tf.reduce_mean(reward * (sampled_start_loss + sampled_end_loss)), sampled_f1, greedy_f1 79 | -------------------------------------------------------------------------------- /util/spacy_tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2018-present, HKUST-KnowComp. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | """Tokenizer that is backed by spaCy (spacy.io). 8 | Requires spaCy package and the spaCy english model. 9 | """ 10 | 11 | import spacy 12 | from tqdm import tqdm 13 | import copy 14 | 15 | 16 | class Tokens(object): 17 | """A class to represent a list of tokenized text.""" 18 | TEXT = 0 19 | CHAR = 1 20 | TEXT_WS = 2 21 | SPAN = 3 22 | POS = 4 23 | LEMMA = 5 24 | NER = 6 25 | 26 | def __init__(self, data, annotators, opts=None): 27 | self.data = data 28 | self.annotators = annotators 29 | self.opts = opts or {} 30 | 31 | def __len__(self): 32 | """The number of tokens.""" 33 | return len(self.data) 34 | 35 | def slice(self, i=None, j=None): 36 | """Return a view of the list of tokens from [i, j).""" 37 | new_tokens = copy.copy(self) 38 | new_tokens.data = self.data[i: j] 39 | return new_tokens 40 | 41 | def untokenize(self): 42 | """Returns the original text (with whitespace reinserted).""" 43 | return ''.join([t[self.TEXT_WS] for t in self.data]).strip() 44 | 45 | def chars(self, uncased=False): 46 | """Returns a list of the first character of each token 47 | Args: 48 | uncased: lower cases characters 49 | """ 50 | if uncased: 51 | return [t[self.CHAR].lower() for t in self.data] 52 | else: 53 | return [t[self.CHAR] for t in self.data] 54 | 55 | def words(self, uncased=False): 56 | """Returns a list of the text of each token 57 | Args: 58 | uncased: lower cases text 59 | """ 60 | if uncased: 61 | return [t[self.TEXT].lower() for t in self.data] 62 | else: 63 | return [t[self.TEXT] for t in self.data] 64 | 65 | def offsets(self): 66 | """Returns a list of [start, end) character offsets of each token.""" 67 | return [t[self.SPAN] for t in self.data] 68 | 69 | def pos(self): 70 | """Returns a list of part-of-speech tags of each token. 71 | Returns None if this annotation was not included. 72 | """ 73 | if 'pos' not in self.annotators: 74 | return None 75 | return [t[self.POS] for t in self.data] 76 | 77 | def lemmas(self): 78 | """Returns a list of the lemmatized text of each token. 79 | Returns None if this annotation was not included. 80 | """ 81 | if 'lemma' not in self.annotators: 82 | return None 83 | return [t[self.LEMMA] for t in self.data] 84 | 85 | def entities(self): 86 | """Returns a list of named-entity-recognition tags of each token. 
87 | Returns None if this annotation was not included. 88 | """ 89 | if 'ner' not in self.annotators: 90 | return None 91 | return [t[self.NER] for t in self.data] 92 | 93 | def ngrams(self, n=1, uncased=False, filter_fn=None, as_strings=True): 94 | """Returns a list of all ngrams from length 1 to n. 95 | Args: 96 | n: upper limit of ngram length 97 | uncased: lower cases text 98 | filter_fn: user function that takes in an ngram list and returns 99 | True or False to keep or not keep the ngram 100 | as_string: return the ngram as a string vs list 101 | """ 102 | 103 | def _skip(gram): 104 | if not filter_fn: 105 | return False 106 | return filter_fn(gram) 107 | 108 | words = self.words(uncased) 109 | ngrams = [(s, e + 1) 110 | for s in range(len(words)) 111 | for e in range(s, min(s + n, len(words))) 112 | if not _skip(words[s:e + 1])] 113 | 114 | # Concatenate into strings 115 | if as_strings: 116 | ngrams = ['{}'.format(' '.join(words[s:e])) for (s, e) in ngrams] 117 | 118 | return ngrams 119 | 120 | def entity_groups(self): 121 | """Group consecutive entity tokens with the same NER tag.""" 122 | entities = self.entities() 123 | if not entities: 124 | return None 125 | non_ent = self.opts.get('non_ent', 'O') 126 | groups = [] 127 | idx = 0 128 | while idx < len(entities): 129 | ner_tag = entities[idx] 130 | # Check for entity tag 131 | if ner_tag != non_ent: 132 | # Chomp the sequence 133 | start = idx 134 | while (idx < len(entities) and entities[idx] == ner_tag): 135 | idx += 1 136 | groups.append((self.slice(start, idx).untokenize(), ner_tag)) 137 | else: 138 | idx += 1 139 | return groups 140 | 141 | 142 | class SpacyTokenizer(object): 143 | 144 | def __init__(self, **kwargs): 145 | """ 146 | Args: 147 | annotators: set that can include pos, lemma, and ner. 148 | model: spaCy model to use (either path, or keyword like 'en'). 149 | """ 150 | model = kwargs.get('model', 'en') 151 | self.annotators = copy.deepcopy(kwargs.get('annotators', set())) 152 | self.nlp = spacy.load(model) 153 | self.nlp.remove_pipe('parser') 154 | if not any([p in self.annotators for p in ['lemma', 'pos', 'ner']]): 155 | self.nlp.remove_pipe('tagger') 156 | if 'ner' not in self.annotators: 157 | self.nlp.remove_pipe('ner') 158 | 159 | def tokenize(self, text): 160 | # We don't treat new lines as tokens. 
161 | clean_text = text.replace('\n', ' ') 162 | tokens = self.nlp(clean_text) 163 | 164 | data = [] 165 | for i in range(len(tokens)): 166 | # Get whitespace 167 | start_ws = tokens[i].idx 168 | if i + 1 < len(tokens): 169 | end_ws = tokens[i + 1].idx 170 | else: 171 | end_ws = tokens[i].idx + len(tokens[i].text) 172 | 173 | data.append(( 174 | tokens[i].text, 175 | tokens[i].text[0] if len(tokens[i].text) > 0 else '', 176 | text[start_ws: end_ws], 177 | (tokens[i].idx, tokens[i].idx + len(tokens[i].text)), 178 | tokens[i].tag_, 179 | tokens[i].lemma_, 180 | tokens[i].ent_type_, 181 | )) 182 | 183 | # Set special option for non-entity tag: '' vs 'O' in spaCy 184 | return Tokens(data, self.annotators, opts={'non_ent': ''}) 185 | 186 | def shutdown(self): 187 | pass 188 | 189 | def __del__(self): 190 | self.shutdown() -------------------------------------------------------------------------------- /util/get_tfrecords.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from tqdm import tqdm 4 | import random 5 | import h5py 6 | 7 | data_type = 'train' 8 | data_source = 'dataset_pre3' 9 | 10 | # load trainset 11 | qid = np.load(data_source + '/' + data_type + '_qid.npy').astype(np.int32) 12 | print(data_type + 'data loading over...') 13 | 14 | length = qid.shape[0] 15 | print(length) 16 | index = [i for i in range(0, length)] 17 | random.shuffle(index) 18 | print(index[0:10]) 19 | 20 | qid = qid[index] 21 | tfrecords_filename = 'tfrecords/' + data_type + '_pre_elmo_cove3.tfrecords' 22 | writer = tf.python_io.TFRecordWriter(tfrecords_filename) 23 | 24 | with h5py.File(data_source + '/train_ELMO_feats.h5', 'r') as f1: 25 | with h5py.File(data_source + '/train_COVE_feats.h5', 'r') as f2: 26 | with h5py.File(data_source + '/train_data.h5', 'r') as f: 27 | for i in tqdm(range(len(qid))): 28 | elmo_context_feat = f1[str(qid[i]) + 'c'][:] 29 | elmo_question_feat = f1[str(qid[i]) + 'q'][:] 30 | cove_context_feat = f2[str(qid[i]) + 'c'][:] 31 | cove_question_feat = f2[str(qid[i]) + 'q'][:] 32 | 33 | data_simple = f[str(qid[i])] 34 | context_ids = data_simple['context_ids'][:] 35 | ques_ids = data_simple['ques_ids'][:] 36 | context_char_ids = data_simple['context_char_ids'][:] 37 | ques_char_ids = data_simple['ques_char_ids'][:] 38 | y1 = data_simple['y1'][:] 39 | y2 = data_simple['y2'][:] 40 | y1p = data_simple['y1p'][:] 41 | y2p = data_simple['y2p'][:] 42 | context_feat = data_simple['context_feat'][:] 43 | ques_feat = data_simple['ques_feat'][:] 44 | 45 | record = tf.train.Example(features=tf.train.Features(feature={ 46 | "context_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_ids.tostring()])), 47 | "ques_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_ids.tostring()])), 48 | "context_char_ids": tf.train.Feature( 49 | bytes_list=tf.train.BytesList(value=[context_char_ids.tostring()])), 50 | "ques_char_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_char_ids.tostring()])), 51 | "context_feat": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_feat.tostring()])), 52 | "ques_feat": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_feat.tostring()])), 53 | 'elmo_context_feat': tf.train.Feature( 54 | bytes_list=tf.train.BytesList(value=[elmo_context_feat.tostring()])), 55 | 'elmo_question_feat': tf.train.Feature( 56 | bytes_list=tf.train.BytesList(value=[elmo_question_feat.tostring()])), 57 | 'cove_context_feat': tf.train.Feature( 58 | 
bytes_list=tf.train.BytesList(value=[cove_context_feat.tostring()])), 59 | 'cove_question_feat': tf.train.Feature( 60 | bytes_list=tf.train.BytesList(value=[cove_question_feat.tostring()])), 61 | "y1": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1.tostring()])), 62 | "y2": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2.tostring()])), 63 | "y1p": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1p.tostring()])), 64 | "y2p": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2p.tostring()])), 65 | "qid": tf.train.Feature(int64_list=tf.train.Int64List(value=[qid[i]])) 66 | })) 67 | writer.write(record.SerializeToString()) 68 | writer.close() 69 | 70 | data_type = 'dev' 71 | data_source = 'dataset_pre3' 72 | 73 | # load trainset 74 | qid = np.load(data_source + '/' + data_type + '_qid.npy').astype(np.int32) 75 | print(data_type + 'data loading over...') 76 | 77 | tfrecords_filename = 'tfrecords/' + data_type + '_pre_elmo_cove3.tfrecords' 78 | writer = tf.python_io.TFRecordWriter(tfrecords_filename) 79 | 80 | with h5py.File(data_source + '/dev_ELMO_feats.h5', 'r') as f1: 81 | with h5py.File(data_source + '/dev_COVE_feats.h5', 'r') as f2: 82 | with h5py.File(data_source + '/dev_data.h5', 'r') as f: 83 | for i in tqdm(range(len(qid))): 84 | elmo_context_feat = f1[str(qid[i]) + 'c'][:] 85 | elmo_question_feat = f1[str(qid[i]) + 'q'][:] 86 | cove_context_feat = f2[str(qid[i]) + 'c'][:] 87 | cove_question_feat = f2[str(qid[i]) + 'q'][:] 88 | 89 | data_simple = f[str(qid[i])] 90 | context_ids = data_simple['context_ids'][:] 91 | ques_ids = data_simple['ques_ids'][:] 92 | context_char_ids = data_simple['context_char_ids'][:] 93 | ques_char_ids = data_simple['ques_char_ids'][:] 94 | y1 = data_simple['y1'][:] 95 | y2 = data_simple['y2'][:] 96 | y1p = data_simple['y1p'][:] 97 | y2p = data_simple['y2p'][:] 98 | context_feat = data_simple['context_feat'][:] 99 | ques_feat = data_simple['ques_feat'][:] 100 | 101 | record = tf.train.Example(features=tf.train.Features(feature={ 102 | "context_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_ids.tostring()])), 103 | "ques_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_ids.tostring()])), 104 | "context_char_ids": tf.train.Feature( 105 | bytes_list=tf.train.BytesList(value=[context_char_ids.tostring()])), 106 | "ques_char_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_char_ids.tostring()])), 107 | "context_feat": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_feat.tostring()])), 108 | "ques_feat": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_feat.tostring()])), 109 | 'elmo_context_feat': tf.train.Feature( 110 | bytes_list=tf.train.BytesList(value=[elmo_context_feat.tostring()])), 111 | 'elmo_question_feat': tf.train.Feature( 112 | bytes_list=tf.train.BytesList(value=[elmo_question_feat.tostring()])), 113 | 'cove_context_feat': tf.train.Feature( 114 | bytes_list=tf.train.BytesList(value=[cove_context_feat.tostring()])), 115 | 'cove_question_feat': tf.train.Feature( 116 | bytes_list=tf.train.BytesList(value=[cove_question_feat.tostring()])), 117 | "y1": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1.tostring()])), 118 | "y2": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2.tostring()])), 119 | "y1p": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1p.tostring()])), 120 | "y2p": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2p.tostring()])), 121 | "qid": tf.train.Feature(int64_list=tf.train.Int64List(value=[qid[i]])) 
122 | })) 123 | writer.write(record.SerializeToString()) 124 | writer.close() 125 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | import RMR_modelV6_squad2 as RMR 5 | import tensorflow.contrib.slim as slim 6 | from util.util import * 7 | import tensorflow as tf 8 | import pandas as pd 9 | from util.log_wrapper import create_logger 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = '4' 12 | 13 | if __name__ == '__main__': 14 | 15 | data_source = '../QANet_tf/dataset_pre3' 16 | 17 | config = { 18 | 'char_dim': 300, 19 | 'cont_limit': 400, 20 | 'ques_limit': 50, 21 | 'char_limit': 16, 22 | 'ans_limit': -1, 23 | 'filters': 300, 24 | 'dropout': 0.175, 25 | 'dropout_emb': 0.15, 26 | 'dropout_att': 0.2, 27 | 'dropout_rnn': 0.1, 28 | 'l2_norm': 3e-7, 29 | 'decay': 1, 30 | 'gamma_b': 0.3, 31 | 'gamma_c': 1.0, 32 | 'init_lambda': 3.0, 33 | 'learning_rate': 8e-4, 34 | 'shuffle_size': 25000, 35 | 'grad_clip': 5.0, 36 | 'use_elmo': 0, 37 | 'use_cove': 0, 38 | 'use_feat': True, 39 | 'use_rlloss': True, 40 | 'rlw': 0.0, 41 | 'rlw2': 0.8, 42 | 'optimizer': 'adam', 43 | 'cove_path': '../SAN_tf/Keras_CoVe_2layers.h5', 44 | 'elmo_weights_path': '../SAN_tf/elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 45 | 'elmo_options_path': '../SAN_tf/elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 46 | 'train_tfrecords': '../QANet_tf/tfrecords/train_pre_elmo_cove3.tfrecords', 47 | 'dev_tfrecords': '../QANet_tf/tfrecords/dev_pre_elmo_cove3.tfrecords', 48 | 'batch_size': 32, 49 | 'epoch': 25, 50 | 'origin_path': None, # not finetune 51 | 'path': 'RMR005' 52 | } 53 | 54 | global logger 55 | logger = create_logger(__name__, to_disk=False) 56 | 57 | logger.info('loading data...') 58 | dev_qid = np.load(data_source + '/dev_qid.npy').astype(np.int32) 59 | with open(data_source + '/test_eval.json', "r") as fh: 60 | eval_file = json.load(fh) 61 | 62 | # load embedding matrix 63 | logger.info('loading embedding...') 64 | word_mat = np.load(data_source + '/word_emb_mat.npy') 65 | char_mat_fix = np.load(data_source + '/char_emb_mat_fix.npy').astype(np.float32) 66 | char_mat_trainable = np.load(data_source + '/char_emb_mat_trainable.npy').astype(np.float32) 67 | 68 | 69 | logger.info('generate dev tfrecords...') 70 | dev_dataset = tf.data.TFRecordDataset(config['dev_tfrecords']) \ 71 | .map(get_record_parser(config), num_parallel_calls=8) \ 72 | .padded_batch(config['batch_size'], padded_shapes=([None], 73 | [None], 74 | [None, None], 75 | [None, None], 76 | [None, None], 77 | [None, None], 78 | [None, None, None], 79 | [None, None, None], 80 | [None, None, None], 81 | [None, None, None], 82 | [None], 83 | [None], 84 | [None], 85 | [None])) 86 | dev_iterator = dev_dataset.make_initializable_iterator() 87 | dev_next_element = dev_iterator.get_next() 88 | dev_sum = 11730 89 | 90 | logger.info('init model...') 91 | model = RMR.Model(config, word_mat=word_mat, char_mat_trainable=char_mat_trainable, char_mat_fix=char_mat_fix) 92 | sess_config = tf.ConfigProto(allow_soft_placement=True) 93 | sess_config.gpu_options.allow_growth = True 94 | best_f1 = 0 95 | best_em = 0 96 | f1s = [] 97 | ems = [] 98 | 99 | with tf.Session(config=sess_config) as sess: 100 | sess.run(tf.global_variables_initializer()) 101 | # scope with trainable weights 102 | variables_to_restore = 
slim.get_variables_to_restore(include=['Input_Embedding_Mat', 103 | 'Input_Embedding_Layer', 104 | 'Iterative_Reattention_Aligner', 105 | 'Answer_Pointer', 106 | 'EMA_Weights']) 107 | saver = tf.train.Saver(variables_to_restore, max_to_keep=10) 108 | if config['origin_path'] is not None and os.path.exists( 109 | os.path.join('model', config['origin_path'], 'checkpoint')): 110 | saver.restore(sess, tf.train.latest_checkpoint(os.path.join('model', str(config['origin_path']) + '/'))) 111 | 112 | i_batch = 0 113 | val_n_batch = dev_sum // config['batch_size'] + 1 114 | sum_loss = 0 115 | 116 | # validating step 117 | # save the temp weights and do ema 118 | if config['decay'] < 1.0: 119 | sess.run(model.assign_vars) 120 | print('EMA over...') 121 | sess.run(dev_iterator.initializer) 122 | logger.info('validating...') 123 | sum_loss_val = 0 124 | y1s = [] 125 | y2s = [] 126 | i_batch = 0 127 | while True: 128 | try: 129 | context_idxs, ques_idxs, \ 130 | context_char_idxs, ques_char_idxs, \ 131 | context_feat, ques_feat, \ 132 | elmo_context_feat, elmo_question_feat, \ 133 | cove_context_feat, cove_question_feat, \ 134 | y1, y2, y1p, y2p = sess.run(dev_next_element) 135 | feed_dict_ = {model.contw_input: context_idxs, model.quesw_input: ques_idxs, 136 | model.contc_input: context_char_idxs, model.quesc_input: ques_char_idxs, 137 | model.y_start: y1, model.y_end: y2, 138 | model.yp_start: y1p, model.yp_end: y2p, 139 | model.un_size: context_idxs.shape[0]} 140 | if config['use_feat']: 141 | feed_dict_[model.cont_feat] = context_feat 142 | feed_dict_[model.ques_feat] = ques_feat 143 | if config['use_elmo'] == 1: 144 | feed_dict_[model.elmo_cont] = elmo_context_feat 145 | feed_dict_[model.elmo_ques] = elmo_question_feat 146 | if config['use_cove'] == 1: 147 | feed_dict_[model.cove_cont] = cove_context_feat 148 | feed_dict_[model.cove_ques] = cove_question_feat 149 | loss_value, y1, y2 = sess.run([model.loss, model.mask_output1, model.mask_output2], 150 | feed_dict=feed_dict_) 151 | y1s.append(y1) 152 | y2s.append(y2) 153 | sum_loss_val += loss_value 154 | i_batch += 1 155 | except tf.errors.OutOfRangeError: 156 | y1s = np.concatenate(y1s) 157 | y2s = np.concatenate(y2s) 158 | answer_dict, _, noanswer_num = convert_tokens(eval_file, dev_qid.tolist(), y1s.tolist(), 159 | y2s.tolist(), data_type=2) 160 | metrics = evaluate(eval_file, answer_dict) 161 | ems.append(metrics['exact_match']) 162 | f1s.append(metrics['f1']) 163 | 164 | if metrics['f1'] < f1s[-1]: 165 | config['learning_rate'] *= 0.5 166 | logger.warning('learning rate reduce to:' + str(config['learning_rate'])) 167 | if config['learning_rate'] <= 1e-4: 168 | logger.warning('rl loss start...') 169 | config['rlw'] = config['rlw2'] 170 | 171 | if ems[-1] > best_em: 172 | best_em = ems[-1] 173 | if f1s[-1] > best_f1: 174 | best_f1 = f1s[-1] 175 | logger.warning("-loss: %.4f -EM:%.2f%% (best: %.2f%%), -F1:%.2f%% (best: %.2f%%) -Noanswer:%d" % 176 | (sum_loss_val / (i_batch + 1), metrics['exact_match'], best_em, metrics['f1'], 177 | best_f1, noanswer_num)) 178 | metrics = evaluate_acc(eval_file, answer_dict) 179 | logger.warning("Has answer acc:%.2f%%, No answer acc::%.2f%%" % ( 180 | metrics['has_answer_acc'] * 100, metrics['hasno_answer_acc'] * 100)) 181 | 182 | break 183 | -------------------------------------------------------------------------------- /train_finetune.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import RMR_modelV6 4 | import tensorflow 
as tf 5 | import json 6 | import os 7 | import util 8 | import time 9 | import tensorflow.contrib.slim as slim 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 12 | 13 | 14 | def training_shuffle(data, seed=None): 15 | if seed is not None: 16 | np.random.seed(seed) 17 | index = np.arange(data[0].shape[0]) 18 | np.random.shuffle(index) 19 | for i, d in enumerate(data): 20 | if len(d.shape) > 1: 21 | data[i] = data[i][index, ::] 22 | else: 23 | data[i] = data[i][index] 24 | return data 25 | 26 | 27 | def next_batch(data, batch_size, iteration): 28 | data_temp = [] 29 | start_index = iteration * batch_size 30 | end_index = (iteration + 1) * batch_size 31 | for i, d in enumerate(data): 32 | data_temp.append(data[i][start_index: end_index, ::]) 33 | return data_temp 34 | 35 | 36 | def cal_ETA(t_start, i, n_batch): 37 | t_temp = time.time() 38 | t_avg = float(int(t_temp) - int(t_start)) / float(i + 1) 39 | if n_batch - i - 1 > 0: 40 | return int((n_batch - i - 1) * t_avg) 41 | else: 42 | return int(t_temp) - int(t_start) 43 | 44 | 45 | # load trainset 46 | context_word = np.load('../QANet_tf/dataset1.0/train_contw_input.npy').astype(np.int32) 47 | question_word = np.load('../QANet_tf/dataset1.0/train_quesw_input.npy').astype(np.int32) 48 | context_char = np.load('../QANet_tf/dataset1.0/train_contc_input.npy').astype(np.int32) 49 | question_char = np.load('../QANet_tf/dataset1.0/train_quesc_input.npy').astype(np.int32) 50 | start_label = np.load('../QANet_tf/dataset1.0/train_y_start.npy').astype(np.int32) 51 | end_label = np.load('../QANet_tf/dataset1.0/train_y_end.npy').astype(np.int32) 52 | context_string = np.load('../QANet_tf/dataset1.0/train_contw_strings.npy') 53 | ques_string = np.load('../QANet_tf/dataset1.0/train_quesw_strings.npy') 54 | 55 | # load valset 56 | val_context_word = np.load('../QANet_tf/dataset1.0/dev_contw_input.npy').astype(np.int32) 57 | val_question_word = np.load('../QANet_tf/dataset1.0/dev_quesw_input.npy').astype(np.int32) 58 | val_context_char = np.load('../QANet_tf/dataset1.0/dev_contc_input.npy').astype(np.int32) 59 | val_question_char = np.load('../QANet_tf/dataset1.0/dev_quesc_input.npy').astype(np.int32) 60 | val_start_label = np.load('../QANet_tf/dataset1.0/dev_y_start.npy').astype(np.int32) 61 | val_end_label = np.load('../QANet_tf/dataset1.0/dev_y_end.npy').astype(np.int32) 62 | val_qid = np.load('../QANet_tf/dataset1.0/dev_qid.npy').astype(np.int32) 63 | val_context_string = np.load('../QANet_tf/dataset1.0/dev_contw_strings.npy') 64 | val_ques_string = np.load('../QANet_tf/dataset1.0/dev_quesw_strings.npy') 65 | 66 | with open('../QANet_tf/dataset1.0/test_eval.json', "r") as fh: 67 | eval_file = json.load(fh) 68 | 69 | # load embedding matrix 70 | word_mat = np.load('../QANet_tf/dataset1.0/word_emb_mat.npy') 71 | char_mat = np.load('../QANet_tf/dataset1.0/char_emb_mat.npy') 72 | 73 | train_set = [context_word, question_word, context_char, question_char, context_string, ques_string, start_label, 74 | end_label] 75 | val_set = [val_context_word, val_question_word, val_context_char, val_question_char, val_context_string, 76 | val_ques_string, val_start_label, val_end_label] 77 | 78 | config = { 79 | 'char_dim': 64, 80 | 'cont_limit': 400, 81 | 'ques_limit': 50, 82 | 'char_limit': 16, 83 | 'ans_limit': 50, 84 | 'filters': 100, 85 | 'dropout': 0.3, 86 | 'l2_norm': 3e-7, 87 | 'decay': 0.9999, 88 | 'learning_rate': 1e-4, 89 | 'grad_clip': 5.0, 90 | 'batch_size': 32, 91 | 'epoch': 20, 92 | 'per_steps': 500, 93 | 'init_lambda': 3.0, 94 | 'rl_loss_type': 'DCRL', # 
['SCTC', 'DCRL', 'topk_DCRL', None] 95 | 'origin_path': 'RMRV0', 96 | 'path': 'RMRV0_f' 97 | } 98 | 99 | model = RMR_modelV6.Model(config, word_mat=word_mat, char_mat=char_mat, elmo_path="../QANet_tf/tfhub_elmo") 100 | sess_config = tf.ConfigProto(allow_soft_placement=True) 101 | sess_config.gpu_options.allow_growth = True 102 | 103 | best_f1 = 0 104 | best_em = 0 105 | f1s = [] 106 | ems = [] 107 | 108 | with tf.Session(config=sess_config) as sess: 109 | if not os.path.exists(os.path.join('model', config['path'])): 110 | os.mkdir(os.path.join('model', config['path'])) 111 | sess.run(tf.global_variables_initializer()) 112 | variables_to_restore = slim.get_variables_to_restore(include=['Input_Embedding_Layer', 113 | 'Iterative_Reattention_Aligner', 114 | 'Answer_Pointer']) 115 | saver = tf.train.Saver(variables_to_restore) 116 | if os.path.exists(os.path.join('model',config['origin_path'],'checkpoint')): 117 | saver.restore(sess, tf.train.latest_checkpoint(os.path.join('model',config['origin_path']))) 118 | n_batch = context_word.shape[0] // config['batch_size'] 119 | n_batch_val = val_context_word.shape[0] // config['batch_size'] 120 | 121 | # during the finetune with rl_loss we validate the result per 500 steps 122 | for epoch in range(config['epoch']): 123 | train_set = training_shuffle(train_set) 124 | last_train_str = "\r" 125 | # training step 126 | sum_loss = 0 127 | sum_rl_loss = 0 128 | for i in range(n_batch): 129 | contw_input, quesw_input, contc_input, quesc_input, contw_string, quesw_string, y_start, y_end \ 130 | = next_batch(train_set, config['batch_size'], i) 131 | loss_value, rl_loss_value, theta_a, theta_b, sampled_f1, greedy_f1, _ = sess.run([model.loss, model.rl_loss, model.theta_a, model.theta_b, 132 | model.sampled_f1, model.greedy_f1, model.train_op], 133 | feed_dict={model.contw_input_: contw_input, model.quesw_input_: quesw_input, 134 | model.contc_input_: contc_input, model.quesc_input_: quesc_input, 135 | model.contw_strings: contw_string, model.quesw_strings: quesw_string, 136 | model.y_start_: y_start, model.y_end_: y_end, 137 | model.dropout: config['dropout']}) 138 | sum_loss += loss_value 139 | sum_rl_loss += rl_loss_value 140 | last_train_str = "\r[epoch:%d/%d, steps:%d/%d] loss:%.4f rl_loss:%.4f" % ( 141 | epoch + 1, config['epoch'], i + 1, n_batch, sum_loss/(i+1), rl_loss_value) 142 | print(last_train_str, end=' ', flush=True) 143 | # print('sf1:',sampled_f1) 144 | # print('gf1:',greedy_f1) 145 | if (i+1)%config['per_steps']==0 or i+1==n_batch: 146 | # validating step 147 | sum_loss_val = 0 148 | sum_rl_loss_val = 0 149 | y1s = [] 150 | y2s = [] 151 | last_val_str = "\r" 152 | for i in range(n_batch_val): 153 | contw_input, quesw_input, contc_input, quesc_input, contw_string, quesw_string, y_start, y_end \ 154 | = next_batch(val_set, config['batch_size'], i) 155 | loss_value, rl_loss_value, y1, y2 = sess.run([model.loss, model.rl_loss, model.output1, model.output2], 156 | feed_dict={model.contw_input_: contw_input, 157 | model.quesw_input_: quesw_input, 158 | model.contc_input_: contc_input, 159 | model.quesc_input_: quesc_input, 160 | model.contw_strings: contw_string, 161 | model.quesw_strings: quesw_string, 162 | model.y_start_: y_start, model.y_end_: y_end}) 163 | y1s.append(y1) 164 | y2s.append(y2) 165 | sum_loss_val += loss_value 166 | sum_rl_loss_val += rl_loss_value 167 | last_val_str = last_train_str + " [validate:%d/%d] loss:%.4f rl_loss:%.4f" % ( 168 | i + 1, n_batch_val, sum_loss_val / (i + 1), rl_loss_value) 169 | print(last_val_str, end=' ', 
flush=True) 170 | y1s = np.concatenate(y1s) 171 | y2s = np.concatenate(y2s) 172 | answer_dict, _, noanswer_num = util.convert_tokens(eval_file, val_qid.tolist(), y1s.tolist(), 173 | y2s.tolist(), 174 | data_type=1) 175 | metrics = util.evaluate(eval_file, answer_dict) 176 | ems.append(metrics['exact_match']) 177 | f1s.append(metrics['f1']) 178 | 179 | if metrics['f1'] > best_f1: 180 | best_f1 = metrics['f1'] 181 | saver.save(sess, os.path.join('model', config['path'], 'model.ckpt'), 182 | global_step=(epoch + 1) * n_batch) 183 | 184 | print(last_val_str, 185 | " -EM: %.2f%%, -F1: %.2f%% -Noanswer: %d" % (metrics['exact_match'], metrics['f1'], noanswer_num), 186 | end=' ', flush=True) 187 | print('\n') 188 | 189 | result = pd.DataFrame([ems, f1s], index=['em', 'f1']).transpose() 190 | result.to_csv('log/result_' + config['path'] + '.csv', index=None) 191 | 192 | saver.save(sess, os.path.join('model', config['path'], 'model.ckpt'), global_step=config['epoch'] * n_batch) -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensor2tensor.layers.common_layers import conv1d 3 | from tensorflow.contrib.cudnn_rnn import CudnnLSTM 4 | from tensorflow.contrib.keras import backend 5 | from tensorflow.contrib.layers import variance_scaling_initializer, l2_regularizer 6 | 7 | initializer = lambda: variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32) 8 | initializer_relu = lambda: variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False, dtype=tf.float32) 9 | regularizer = l2_regularizer(scale=3e-7) 10 | 11 | 12 | # cudnnLSTM 13 | def BiLSTM(x, filters, dropout=0.0, name='BiLSTM', layers=1, return_state=False): 14 | cudnn_lstm = CudnnLSTM(layers, filters, direction='bidirectional', name=name) 15 | if type(x) == list: 16 | assert len(x) == 2 17 | x1, x2 = x 18 | # cudnn compatibility: time first, batch second 19 | x1 = tf.transpose(x1, [1, 0, 2]) 20 | x2 = tf.transpose(x2, [1, 0, 2]) 21 | x1, x1_state = cudnn_lstm(x1) # state:[2, bs, dim] 22 | x2, x2_state = cudnn_lstm(x2) 23 | x1 = tf.transpose(x1, [1, 0, 2]) 24 | x2 = tf.transpose(x2, [1, 0, 2]) 25 | x1_state = tf.concat(tf.unstack(x1_state[0], axis=0), axis=-1) 26 | x2_state = tf.concat(tf.unstack(x2_state[0], axis=0), axis=-1) 27 | if return_state: 28 | return tf.nn.dropout(x1_state, 1 - dropout), tf.nn.dropout(x2_state, 1 - dropout) 29 | else: 30 | return tf.nn.dropout(x1, 1 - dropout), tf.nn.dropout(x2, 1 - dropout) 31 | else: 32 | # cudnn compatibility: time first, batch second 33 | x = tf.transpose(x, [1, 0, 2]) 34 | x, x_state = cudnn_lstm(x) 35 | if return_state: 36 | x_state = tf.concat(tf.unstack(x_state[0], axis=0), axis=-1) 37 | return tf.nn.dropout(x_state, 1 - dropout) 38 | else: 39 | x = tf.transpose(x, [1, 0, 2]) 40 | return tf.nn.dropout(x, 1 - dropout) 41 | 42 | 43 | def exp_mask(inputs, mask, mask_value=-1e30): 44 | mask = tf.cast(mask, tf.float32) 45 | return inputs + mask_value * (1 - mask) 46 | 47 | 48 | def align_block(u, v, c_mask, q_mask, Lambda, filters=128, E_0=None, B_0=None, Z_0=None, dropout=0.0): 49 | with tf.variable_scope("Interactive_Alignment"): 50 | # attention 51 | u_ = tf.nn.relu(conv1d(u, filters, 1, name="Wu")) # [bs, len_c, dim] 52 | v_ = tf.nn.relu(conv1d(v, filters, 1, name="Wv")) # [bs, len_q, dim] 53 | E = tf.matmul(v_, u_, transpose_b=True) # [bs, len_q, len_c] 54 | if E_0 is not None: 55 | E += (Lambda 
* E_0) 56 | E_ = tf.nn.softmax(exp_mask(E, tf.expand_dims(q_mask, axis=-1)), axis=1) # [bs, len_q, len_c] 57 | v_E = tf.matmul(E_, v, transpose_a=True) # [bs, len_c, dim] 58 | 59 | # fusion 60 | uv = tf.concat([u, v_E, u * v_E, u - v_E], axis=-1) 61 | x = tf.nn.relu(conv1d(uv, filters, 1, name='Wr')) 62 | g = tf.nn.sigmoid(conv1d(uv, filters, 1, name='Wg')) 63 | h = g * x + (1 - g) * u # [bs, len_c, dim] 64 | 65 | with tf.variable_scope("Self_Alignment"): 66 | # attention 67 | h_1 = tf.nn.relu(conv1d(h, filters, 1, name='Wh1')) 68 | h_2 = tf.nn.relu(conv1d(h, filters, 1, name='Wh2')) 69 | B = tf.matmul(h_2, h_1, transpose_b=True) # [bs, len_c, len_c] 70 | if B_0 is not None: 71 | B += (Lambda * B_0) 72 | B_ = tf.nn.softmax(exp_mask(B, tf.expand_dims(c_mask, axis=-1)), axis=1) # [bs, len_c, len_c] 73 | h_B = tf.matmul(B_, h, transpose_a=True) 74 | 75 | # fusion 76 | hh = tf.concat([h, h_B, h * h_B, h - h_B], axis=-1) 77 | x = tf.nn.relu(conv1d(hh, filters, 1, name='Wr')) 78 | g = tf.nn.sigmoid(conv1d(hh, filters, 1, name='Wg')) 79 | Z = g * x + (1 - g) * h # [bs, len_c, dim] 80 | 81 | with tf.variable_scope("Evidence_Collection"): 82 | if Z_0 is not None: 83 | Z = tf.concat([Z, Z_0[0], Z_0[1]], axis=-1) 84 | R = BiLSTM(Z, filters // 2, name='bilstm', dropout=dropout) # [bs, len_c, dim] 85 | 86 | # return the E_t, B_t 87 | E_t = tf.nn.softmax(exp_mask(E, tf.expand_dims(c_mask, axis=1)), axis=-1) # [bs, len_q, len_c] 88 | E_t = tf.matmul(E_t, B_) 89 | B_t = tf.nn.softmax(exp_mask(B, tf.expand_dims(c_mask, axis=1)), axis=-1) # [bs, len_c, len_c] 90 | B_t = tf.matmul(B_t, B_) 91 | 92 | return R, Z, E_t, B_t 93 | 94 | 95 | def summary_vector(q_emb, c_maxlen, mask): 96 | with tf.variable_scope("Question_Summary"): 97 | alpha = tf.nn.softmax(exp_mask(tf.squeeze(conv1d(q_emb, 1, 1), axis=-1), mask)) 98 | s = tf.expand_dims(alpha, axis=-1) * q_emb 99 | s = tf.reduce_sum(s, axis=1, keepdims=True) # [bs, 1, dim] 100 | s = tf.tile(s, [1, c_maxlen, 1]) # [bs, len_c, dim] 101 | return s 102 | 103 | 104 | def start_logits(R, s, mask, filters=128, name='Start_Pointer'): 105 | with tf.variable_scope(name): 106 | if R.get_shape()[-1] == s.get_shape()[-1]: 107 | logits1 = tf.concat([R, s, R * s, R - s], axis=-1) 108 | else: 109 | logits1 = tf.concat([R, s], axis=-1) 110 | logits1 = tf.nn.tanh(conv1d(logits1, filters, 1, name='Wt')) 111 | logits1 = tf.squeeze(conv1d(logits1, 1, 1, name='Wf'), axis=-1) 112 | logits1 = exp_mask(logits1, mask) 113 | return logits1 114 | 115 | 116 | def end_logits(R, logits1, s, mask, filters=128, name='End_Pointer'): 117 | with tf.variable_scope(name): 118 | l = R * tf.expand_dims(tf.nn.softmax(logits1, axis=-1), axis=-1) # [bs, len_c, dim] 119 | if s.get_shape()[-1] == l.get_shape()[-1]: 120 | s_ = tf.concat([s, l, s * l, s - l], axis=-1) 121 | else: 122 | s_ = tf.concat([s, l], axis=-1) 123 | x = tf.nn.relu(conv1d(s_, filters, 1, name='Wr')) # [bs, len_c, dim] 124 | g = tf.nn.sigmoid(conv1d(s_, filters, 1, name='Wg')) # [bs, len_c, dim] 125 | s_ = g * x + (1 - g) * s # [bs, len_c, dim] 126 | 127 | if R.get_shape()[-1] == s_.get_shape()[-1]: 128 | logits2 = tf.concat([R, s_, R * s_, R - s_], axis=-1) 129 | else: 130 | logits2 = tf.concat([R, s_], axis=-1) 131 | logits2 = tf.nn.tanh(conv1d(logits2, filters, 1, name='Wt')) 132 | logits2 = tf.squeeze(conv1d(logits2, 1, 1, name='Wf'), axis=-1) 133 | logits2 = exp_mask(logits2, mask) 134 | return logits2 135 | 136 | 137 | def ElmoCombineLayer(elmo_feats, name): # [bs, len, 3, 1024] 138 | n_lm_layers = int(elmo_feats.get_shape()[2]) # 
3 139 | W = tf.get_variable( 140 | '{}_ELMo_W'.format(name), 141 | shape=(n_lm_layers,), 142 | initializer=tf.zeros_initializer, 143 | regularizer=regularizer, 144 | trainable=True, 145 | ) 146 | normed_weights = tf.split(tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers) # [1]*3 147 | # split LM layers 148 | layers = tf.split(elmo_feats, n_lm_layers, axis=2) # [bs, len, 1, 1024]*3 149 | 150 | # compute the weighted, normalized LM activations 151 | pieces = [] 152 | for w, t in zip(normed_weights, layers): 153 | pieces.append(w * tf.squeeze(t, axis=2)) 154 | sum_pieces = tf.add_n(pieces) 155 | 156 | # scale the weighted sum by gamma 157 | gamma = tf.get_variable( 158 | '{}_ELMo_gamma'.format(name), 159 | shape=(1,), 160 | initializer=tf.ones_initializer, 161 | regularizer=None, 162 | trainable=True, 163 | ) 164 | return sum_pieces * gamma # [bs, len, 1024] 165 | 166 | 167 | def CoveCombineLayer(cove_feats, name): # [bs, len, 2, 600] 168 | n_lm_layers = int(cove_feats.get_shape()[2]) # 2 169 | W = tf.get_variable( 170 | '{}_Cove_W'.format(name), 171 | shape=(n_lm_layers,), 172 | initializer=tf.zeros_initializer, 173 | regularizer=regularizer, 174 | trainable=True, 175 | ) 176 | normed_weights = tf.split(tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers) # [1]*2 177 | # split LM layers 178 | layers = tf.split(cove_feats, n_lm_layers, axis=2) # [bs, len, 1, 600]*2 179 | 180 | # compute the weighted, normalized LM activations 181 | pieces = [] 182 | for w, t in zip(normed_weights, layers): 183 | pieces.append(w * tf.squeeze(t, axis=2)) 184 | sum_pieces = tf.add_n(pieces) 185 | 186 | # scale the weighted sum by gamma 187 | gamma = tf.get_variable( 188 | '{}_Cove_gamma'.format(name), 189 | shape=(1,), 190 | initializer=tf.ones_initializer, 191 | regularizer=None, 192 | trainable=True, 193 | ) 194 | return sum_pieces * gamma # [bs, len, 600] 195 | 196 | 197 | def optimized_trilinear_for_attention(args, c_maxlen, q_maxlen, input_keep_prob=1.0, scope='efficient_trilinear', 198 | bias_initializer=tf.zeros_initializer(), kernel_initializer=initializer()): 199 | assert len(args) == 2, "just use for computing attention with two input" 200 | arg0_shape = args[0].get_shape().as_list() 201 | arg1_shape = args[1].get_shape().as_list() 202 | if len(arg0_shape) != 3 or len(arg1_shape) != 3: 203 | raise ValueError("`args` must be 3 dims (batch_size, len, dimension)") 204 | if arg0_shape[2] != arg1_shape[2]: 205 | raise ValueError("the last dimension of `args` must equal") 206 | arg_size = arg0_shape[2] 207 | dtype = args[0].dtype 208 | droped_args = [tf.nn.dropout(arg, input_keep_prob) for arg in args] 209 | with tf.variable_scope(scope): 210 | weights4arg0 = tf.get_variable( 211 | "linear_kernel4arg0", [arg_size, 1], 212 | dtype=dtype, 213 | regularizer=regularizer, 214 | initializer=kernel_initializer) 215 | weights4arg1 = tf.get_variable( 216 | "linear_kernel4arg1", [arg_size, 1], 217 | dtype=dtype, 218 | regularizer=regularizer, 219 | initializer=kernel_initializer) 220 | weights4mlu = tf.get_variable( 221 | "linear_kernel4mul", [1, 1, arg_size], 222 | dtype=dtype, 223 | regularizer=regularizer, 224 | initializer=kernel_initializer) 225 | biases = tf.get_variable( 226 | "linear_bias", [1], 227 | dtype=dtype, 228 | regularizer=regularizer, 229 | initializer=bias_initializer) 230 | subres0 = tf.tile(backend.dot(droped_args[0], weights4arg0), [1, 1, q_maxlen]) 231 | subres1 = tf.tile(tf.transpose(backend.dot(droped_args[1], weights4arg1), perm=(0, 2, 1)), [1, c_maxlen, 1]) 232 | subres2 = 
backend.batch_dot(droped_args[0] * weights4mlu, tf.transpose(droped_args[1], perm=(0, 2, 1))) 233 | res = subres0 + subres1 + subres2 234 | res += biases 235 | return res 236 | 237 | 238 | def ElmoAttention(inputs, c_maxlen, q_maxlen, q_mask, dropout): 239 | c, q = inputs 240 | S = optimized_trilinear_for_attention([c, q], c_maxlen, q_maxlen, input_keep_prob=1. - dropout, 241 | scope='elmo_efficient_trilinear') 242 | q_mask = tf.expand_dims(q_mask, 1) 243 | S_ = tf.nn.softmax(exp_mask(S, mask=q_mask)) 244 | c2q = tf.matmul(S_, q) 245 | return tf.concat([c, c2q], axis=-1) 246 | 247 | 248 | def total_params(exclude=None): 249 | total_parameters = 0 250 | if exclude is not None: 251 | trainable_variables = list(set(tf.trainable_variables()) ^ set(tf.trainable_variables(exclude))) 252 | else: 253 | trainable_variables = tf.trainable_variables() 254 | for variable in trainable_variables: 255 | shape = variable.get_shape() 256 | variable_parametes = 1 257 | try: 258 | for dim in shape: 259 | variable_parametes *= dim.value 260 | total_parameters += variable_parametes 261 | except: 262 | print(shape, 'cudnn weights is unknown') 263 | print("Total number of trainable parameters: {}".format(total_parameters)) 264 | -------------------------------------------------------------------------------- /train_h5py.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import RMR_modelV6 as RMR 4 | import tensorflow.contrib.slim as slim 5 | from util.util import * 6 | import tensorflow as tf 7 | import pandas as pd 8 | from util.h5py_generator import Generator 9 | from util.log_wrapper import create_logger 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = '4' 12 | 13 | if __name__ == '__main__': 14 | 15 | data_source = 'dataset' 16 | 17 | config = { 18 | 'char_dim': 300, 19 | 'cont_limit': 400, 20 | 'ques_limit': 50, 21 | 'char_limit': 16, 22 | 'ans_limit': -1, 23 | 'filters': 128, 24 | 'char_filters': 100, 25 | 'dropout': 0.175, 26 | 'dropout_emb': 0.15, 27 | 'dropout_att': 0.2, 28 | 'dropout_rnn': 0.15, 29 | 'l2_norm': 3e-7, 30 | 'decay': 1, 31 | 'gamma_b': 0.3, 32 | 'gamma_c': 1.0, 33 | 'init_lambda': 3.0, 34 | 'learning_rate': 1e-3, 35 | 'shuffle_size': 25000, 36 | 'grad_clip': 5.0, 37 | 'use_elmo': 0, 38 | 'use_cove': 0, 39 | 'use_feat': True, 40 | 'use_rlloss': False, 41 | 'rlw': 0.0, 42 | 'rlw2': 0.8, 43 | 'optimizer': 'adam', 44 | 'cove_path': '../SAN_tf/Keras_CoVe_2layers.h5', 45 | 'elmo_weights_path': '../SAN_tf/elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 46 | 'elmo_options_path': '../SAN_tf/elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 47 | 'train_tfrecords': '../QANet_tf/tfrecords/train_pre_elmo_cove3.tfrecords', 48 | 'dev_tfrecords': '../QANet_tf/tfrecords/dev_pre_elmo_cove3.tfrecords', 49 | 'batch_size': 32, 50 | 'epoch': 30, 51 | 'origin_path': None, # not finetune 52 | 'path': 'RMRV102' 53 | } 54 | 55 | global logger 56 | logger = create_logger(__name__, to_disk=True, log_file='log/' + config['path'] + '.log') 57 | 58 | logger.info('loading data...') 59 | train_qid = np.load(data_source + '/train_qid.npy').astype(np.int32) 60 | dev_qid = np.load(data_source + '/dev_qid.npy').astype(np.int32) 61 | with open(data_source + '/test_eval.json', "r") as fh: 62 | eval_file = json.load(fh) 63 | 64 | # load embedding matrix 65 | logger.info('loading embedding...') 66 | word_mat = np.load(data_source + '/word_emb_mat.npy') 67 | char_mat_fix = np.load(data_source + 
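
The point of `optimized_trilinear_for_attention` is that the trilinear score `w_c·c_i + w_q·q_j + w_m·(c_i ⊙ q_j)` never needs the full `[len_c, len_q, dim]` tensor; each term is built from matrix products and broadcasting. A NumPy sketch with a brute-force check (illustration only, single example, no batch dimension):

```python
import numpy as np

def trilinear(c, q, w_c, w_q, w_m, b=0.0):
    # c: [len_c, d], q: [len_q, d]; score(i, j) = w_c·c_i + w_q·q_j + w_m·(c_i ⊙ q_j) + b
    part_c = (c @ w_c)[:, None]      # [len_c, 1], broadcast over question positions
    part_q = (q @ w_q)[None, :]      # [1, len_q], broadcast over context positions
    part_m = (c * w_m) @ q.T         # [len_c, len_q] without materializing c_i ⊙ q_j
    return part_c + part_q + part_m + b

rng = np.random.RandomState(0)
c, q = rng.randn(6, 8), rng.randn(4, 8)
w_c, w_q, w_m = rng.randn(8), rng.randn(8), rng.randn(8)
S = trilinear(c, q, w_c, w_q, w_m)
print(S.shape)   # (6, 4): one similarity score per (context, question) token pair

# brute-force check against the definition
i, j = 2, 3
ref = w_c @ c[i] + w_q @ q[j] + w_m @ (c[i] * q[j])
assert np.isclose(S[i, j], ref)
```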
'/char_emb_mat_fix.npy').astype(np.float32) 68 | char_mat_trainable = np.load(data_source + '/char_emb_mat_trainable.npy').astype(np.float32) 69 | 70 | logger.info('init model...') 71 | model = RMR.Model(config, word_mat=word_mat, char_mat_trainable=char_mat_trainable, char_mat_fix=char_mat_fix) 72 | sess_config = tf.ConfigProto(allow_soft_placement=True) 73 | sess_config.gpu_options.allow_growth = True 74 | best_f1 = 0 75 | best_em = 0 76 | f1s = [] 77 | ems = [] 78 | 79 | logger.info('init generator...') 80 | train_gen = Generator(data_source + '/train_data.h5', train_qid, batch_size=config['batch_size'], shuffle=True, 81 | use_elmo=config['use_elmo'], use_cove=config['use_cove'], 82 | elmo_path=data_source + '/train_ELMO_feats.h5', 83 | cove_path=data_source + '/train_COVE_feats.h5') 84 | dev_gen = Generator(data_source + '/dev_data.h5', dev_qid, batch_size=config['batch_size'], shuffle=False, 85 | use_elmo=config['use_elmo'], use_cove=config['use_cove'], 86 | elmo_path=data_source + '/dev_ELMO_feats.h5', cove_path=data_source + '/dev_COVE_feats.h5') 87 | 88 | logger.info('starting session...') 89 | with tf.Session(config=sess_config) as sess: 90 | sess.run(tf.global_variables_initializer()) 91 | # scope with trainable weights 92 | variables_to_restore = slim.get_variables_to_restore(include=['Input_Embedding_Mat', 93 | 'Input_Embedding_Layer', 94 | 'Iterative_Reattention_Aligner', 95 | 'Answer_Pointer', 96 | 'EMA_Weights']) 97 | saver = tf.train.Saver(variables_to_restore, max_to_keep=10) 98 | if config['origin_path'] is not None and os.path.exists( 99 | os.path.join('model', config['origin_path'], 'checkpoint')): 100 | saver.restore(sess, tf.train.latest_checkpoint(os.path.join('model', str(config['origin_path']) + '/'))) 101 | 102 | use_rl=False 103 | for i_epoch in range(config['epoch']): 104 | if (i_epoch + 1) % 8 == 0: 105 | config['learning_rate'] *= 0.5 106 | # logger.warning('learning rate reduce to:' + str(config['learning_rate'])) 107 | # if config['learning_rate'] <= 2.5e-4: 108 | # use_rl=True 109 | # logger.warning('rl loss start...') 110 | # config['rlw'] = config['rlw2'] 111 | 112 | sum_loss = 0 113 | for i_batch in range(train_gen.max_batch): 114 | assert i_batch == train_gen.i_batch 115 | # if use_rl: 116 | # config['rlw'] = min(config['rlw2'], config['rlw']+config['rlw2']/5000) 117 | if i_batch == 1: 118 | t_start = time.time() 119 | data_batch = next(train_gen) 120 | feed_dict_ = {model.contw_input: data_batch['context_ids'], model.quesw_input: data_batch['ques_ids'], 121 | model.contc_input: data_batch['context_char_ids'], 122 | model.quesc_input: data_batch['ques_char_ids'], 123 | model.y_start: data_batch['y1'], model.y_end: data_batch['y2'], 124 | # model.yp_start: data_batch['y1p'], model.yp_end: data_batch['y2p'], 125 | model.un_size: data_batch['context_ids'].shape[0], 126 | model.dropout: config['dropout'], 127 | model.dropout_emb: config['dropout_emb'], 128 | model.dropout_att: config['dropout_att'], 129 | model.dropout_rnn: config['dropout_rnn'], 130 | model.learning_rate: config['learning_rate'], 131 | model.rlw: config['rlw']} 132 | if config['use_feat']: 133 | feed_dict_[model.cont_feat] = data_batch['context_feat'] 134 | feed_dict_[model.ques_feat] = data_batch['ques_feat'] 135 | if config['use_elmo'] == 1: 136 | feed_dict_[model.elmo_cont] = data_batch['elmo_cont'] 137 | feed_dict_[model.elmo_ques] = data_batch['elmo_ques'] 138 | if config['use_cove'] == 1: 139 | feed_dict_[model.cove_cont] = data_batch['cove_cont'] 140 | feed_dict_[model.cove_ques] = 
data_batch['cove_ques'] 141 | if config['decay'] < 1: 142 | loss_value, _ = sess.run([model.loss, model.ema_train_op], feed_dict=feed_dict_) 143 | else: 144 | loss_value, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict_) 145 | char_mat = sess.run(model.char_mat) 146 | # ipdb.set_trace() 147 | char_mat[-char_mat_fix.shape[0]:,::] = char_mat_fix 148 | _ = sess.run(model.assign_char_mat, feed_dict={model.old_char_mat:char_mat}) 149 | sum_loss += loss_value 150 | 151 | # # check embedding 152 | # fix_feat, tra_feat = sess.run([model.char_mat[-93:, :], model.char_mat[0:1140, :]]) 153 | # fix_feat = np.sum(fix_feat) 154 | # tra_feat = np.sum(tra_feat) 155 | # print('fix:', fix_feat) 156 | # print('trainable:', tra_feat) 157 | 158 | last_train_str = "[epoch:%d/%d, steps:%d/%d] -loss:%.4f" % (i_epoch + 1, config['epoch'], i_batch + 1, 159 | train_gen.max_batch, 160 | sum_loss / (i_batch + 1)) 161 | if i_batch > 0: 162 | last_train_str += (' -ETA:%ds' % cal_ETA(t_start, i_batch, train_gen.max_batch)) 163 | if i_batch % 100 == 0: 164 | logger.info(last_train_str) 165 | logger.info(last_train_str) 166 | 167 | # validating step 168 | # # save the temp weights and do ema 169 | # if config['decay'] < 1.0: 170 | # saver.save(sess, os.path.join('model', config['path'], 'temp_model.ckpt')) 171 | # sess.run(model.assign_vars) 172 | # print('EMA over...') 173 | logger.info('validating...') 174 | sum_loss_val = 0 175 | y1s = [] 176 | y2s = [] 177 | dev_gen.reset() 178 | for i_batch in range(dev_gen.max_batch): 179 | assert i_batch == dev_gen.i_batch 180 | data_batch = next(dev_gen) 181 | feed_dict_ = {model.contw_input: data_batch['context_ids'], model.quesw_input: data_batch['ques_ids'], 182 | model.contc_input: data_batch['context_char_ids'], 183 | model.quesc_input: data_batch['ques_char_ids'], 184 | model.y_start: data_batch['y1'], model.y_end: data_batch['y2'], 185 | # model.yp_start: data_batch['y1p'], model.yp_end: data_batch['y2p'], 186 | model.un_size: data_batch['context_ids'].shape[0]} 187 | if config['use_feat']: 188 | feed_dict_[model.cont_feat] = data_batch['context_feat'] 189 | feed_dict_[model.ques_feat] = data_batch['ques_feat'] 190 | if config['use_elmo'] == 1: 191 | feed_dict_[model.elmo_cont] = data_batch['elmo_cont'] 192 | feed_dict_[model.elmo_ques] = data_batch['elmo_ques'] 193 | if config['use_cove'] == 1: 194 | feed_dict_[model.cove_cont] = data_batch['cove_cont'] 195 | feed_dict_[model.cove_ques] = data_batch['cove_ques'] 196 | 197 | loss_value, y1, y2 = sess.run([model.loss, model.mask_output1, model.mask_output2], 198 | feed_dict=feed_dict_) 199 | y1s.append(y1) 200 | y2s.append(y2) 201 | sum_loss_val += loss_value 202 | 203 | y1s = np.concatenate(y1s) 204 | y2s = np.concatenate(y2s) 205 | answer_dict, _, noanswer_num = convert_tokens(eval_file, dev_qid.tolist(), y1s.tolist(), 206 | y2s.tolist(), data_type=1) 207 | metrics = evaluate(eval_file, answer_dict) 208 | ems.append(metrics['exact_match']) 209 | f1s.append(metrics['f1']) 210 | 211 | # if metrics['f1'] < f1s[-1]: 212 | # config['learning_rate'] *= 0.5 213 | # logger.warning('learning rate reduce to:' + str(config['learning_rate'])) 214 | # if config['learning_rate'] <= 1e-4: 215 | # logger.warning('rl loss start...') 216 | # config['rlw'] = config['rlw2'] 217 | 218 | if ems[-1] > best_em: 219 | best_em = ems[-1] 220 | if f1s[-1] > best_f1: 221 | best_f1 = f1s[-1] 222 | saver.save(sess, os.path.join('model', config['path'], 'model.ckpt'), 223 | global_step=(i_epoch + 1) * train_gen.max_batch) 224 | 
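
The `char_mat` handling in this loop implements a partially trainable embedding: the optimizer updates the whole matrix, then the rows belonging to the fixed character vocabulary are overwritten with their original values via `assign_char_mat`. A toy NumPy simulation of that bookkeeping (my own illustration, not repo code):

```python
import numpy as np

rng = np.random.RandomState(0)
char_mat_fix = rng.randn(4, 8).astype(np.float32)          # rows that must stay fixed
char_mat = np.concatenate([rng.randn(10, 8).astype(np.float32), char_mat_fix], axis=0)

def fake_train_step(mat):
    # pretend the optimizer nudged every row (in the real loop this is sess.run(train_op))
    return mat + 0.01 * rng.randn(*mat.shape).astype(np.float32)

for _ in range(3):
    char_mat = fake_train_step(char_mat)
    # undo any update to the fixed block, mirroring the assign_char_mat op above
    char_mat[-char_mat_fix.shape[0]:, :] = char_mat_fix

assert np.allclose(char_mat[-4:], char_mat_fix)   # fixed rows untouched after training
```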
logger.warning("-loss: %.4f -EM:%.2f%% (best: %.2f%%), -F1:%.2f%% (best: %.2f%%) -Noanswer:%d" % 225 | (sum_loss_val / (dev_gen.max_batch + 1), metrics['exact_match'], best_em, metrics['f1'], 226 | best_f1, noanswer_num)) 227 | # metrics = evaluate_acc(eval_file, answer_dict) 228 | # logger.warning("Has answer acc:%.2f%%, No answer acc::%.2f%%" % ( 229 | # metrics['has_answer_acc'] * 100, metrics['hasno_answer_acc'] * 100)) 230 | result = pd.DataFrame([ems, f1s], index=['em', 'f1']).transpose() 231 | result.to_csv('results/result_' + config['path'] + '.csv', index=None) 232 | 233 | # # recover the model 234 | # if config['decay'] < 1.0: 235 | # saver.restore(sess, os.path.join('model', config['path'], 'temp_model.ckpt')) 236 | # print('recover weights over...') 237 | -------------------------------------------------------------------------------- /layersV0.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensor2tensor.layers.common_layers import conv1d, dense 3 | from tensorflow.contrib.cudnn_rnn import CudnnLSTM 4 | from tensorflow.contrib.keras import backend 5 | from tensorflow.contrib.layers import variance_scaling_initializer, l2_regularizer 6 | 7 | initializer = lambda: variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32) 8 | initializer_relu = lambda: variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False, dtype=tf.float32) 9 | regularizer = l2_regularizer(scale=3e-7) 10 | 11 | 12 | # cudnnLSTM 13 | def BiLSTM(x, filters, dropout=0.0, name='BiLSTM', layers=1, return_state=False): 14 | cudnn_lstm = CudnnLSTM(layers, filters, direction='bidirectional', name=name) 15 | if type(x) == list: 16 | assert len(x) == 2 17 | x1, x2 = x 18 | # cudnn compatibility: time first, batch second 19 | x1 = tf.transpose(x1, [1, 0, 2]) 20 | x2 = tf.transpose(x2, [1, 0, 2]) 21 | x1, x1_state = cudnn_lstm(x1) # state:[2, bs, dim] 22 | x2, x2_state = cudnn_lstm(x2) 23 | x1 = tf.transpose(x1, [1, 0, 2]) 24 | x2 = tf.transpose(x2, [1, 0, 2]) 25 | x1_state = tf.concat(tf.unstack(x1_state[0], axis=0), axis=-1) 26 | x2_state = tf.concat(tf.unstack(x2_state[0], axis=0), axis=-1) 27 | if return_state: 28 | return tf.nn.dropout(x1_state, 1 - dropout), tf.nn.dropout(x2_state, 1 - dropout) 29 | else: 30 | return tf.nn.dropout(x1, 1 - dropout), tf.nn.dropout(x2, 1 - dropout) 31 | else: 32 | # cudnn compatibility: time first, batch second 33 | x = tf.transpose(x, [1, 0, 2]) 34 | x, x_state = cudnn_lstm(x) 35 | if return_state: 36 | x_state = tf.concat(tf.unstack(x_state[0], axis=0), axis=-1) 37 | return tf.nn.dropout(x_state, 1 - dropout) 38 | else: 39 | x = tf.transpose(x, [1, 0, 2]) 40 | return tf.nn.dropout(x, 1 - dropout) 41 | 42 | 43 | def exp_mask(inputs, mask, mask_value=-1e30): 44 | mask = tf.cast(mask, tf.float32) 45 | return inputs + mask_value * (1 - mask) 46 | 47 | 48 | def align_block(u, v, c_mask, q_mask, filters=128, dropout=0.0): 49 | with tf.variable_scope("Interactive_Alignment"): 50 | # attention 51 | E = tf.matmul(v, u, transpose_b=True) # [bs, len_q, len_c] 52 | E_ = tf.nn.softmax(exp_mask(E, tf.expand_dims(q_mask, axis=-1)), axis=1) # [bs, len_q, len_c] 53 | v_E = tf.matmul(E_, v, transpose_a=True) # [bs, len_c, dim] 54 | 55 | # fusion 56 | uv = tf.concat([u, v_E, u * v_E, u - v_E], axis=-1) 57 | x = tf.nn.tanh(conv1d(uv, filters, 1, name='Wr')) 58 | g = tf.nn.sigmoid(conv1d(uv, filters, 1, name='Wg')) 59 | h = g * x + (1 - g) * u # [bs, len_c, dim] 60 | 61 | with 
tf.variable_scope("Self_Alignment"): 62 | # attention 63 | B = tf.matmul(h, h, transpose_b=True) # [bs, len_c, len_c] 64 | B = tf.matrix_set_diag(B, tf.zeros([tf.shape(B)[0], tf.shape(B)[-1]])) 65 | B_ = tf.nn.softmax(exp_mask(B, tf.expand_dims(c_mask, axis=-1)), axis=1) # [bs, len_c, len_c] 66 | h_B = tf.matmul(B_, h, transpose_a=True) 67 | 68 | # fusion 69 | hh = tf.concat([h, h_B, h * h_B, h - h_B], axis=-1) 70 | x = tf.nn.tanh(conv1d(hh, filters, 1, name='Wr')) 71 | g = tf.nn.sigmoid(conv1d(hh, filters, 1, name='Wg')) 72 | Z = g * x + (1 - g) * h # [bs, len_c, dim] 73 | 74 | with tf.variable_scope("Evidence_Collection"): 75 | R = BiLSTM(Z, filters // 2, name='bilstm', dropout=dropout) # [bs, len_c, dim] 76 | 77 | return R 78 | 79 | 80 | def feed_forward(x, name, filters=128, dropout=0.0): 81 | x = tf.nn.relu(conv1d(x, filters, 1, name=name+'_FF1')) 82 | x = conv1d(x, filters, 1, name=name+'FF2') 83 | x = tf.nn.dropout(x, 1 - dropout) 84 | return x 85 | 86 | 87 | def answer_block(R, z1, filters, c_mask, dropout=0.0, return_logits=True): 88 | # start 89 | z_s = tf.tile(tf.expand_dims(z1, axis=1), [1, tf.shape(R)[1], 1]) # [bs, 1*c_len, dim] 90 | s = feed_forward(tf.concat([R, z_s, R * z_s], axis=-1), 'st', filters, dropout) # [bs, c_len, dim] 91 | s_logits = exp_mask(tf.squeeze(conv1d(s, 1, 1, name='Ws'), axis=-1), c_mask) # [bs, c_len] 92 | s = tf.expand_dims(tf.nn.softmax(s_logits), axis=-1) # [bs, c_len]->[bs, c_len, 1] 93 | 94 | # get z2 95 | u = tf.squeeze(tf.matmul(R, s, transpose_a=True), axis=-1) # [bs, dim, 1]->[bs, dim] 96 | zu = tf.concat([z1, u, z1 * u, z1 - u], axis=-1) 97 | z_s_ = tf.nn.tanh(dense(zu, filters, name='Wru')) 98 | g = tf.nn.sigmoid(dense(zu, filters, name='Wgu')) 99 | z2 = g * z_s_ + (1 - g) * z1 # [bs, dim] 100 | 101 | # end 102 | z_e = tf.tile(tf.expand_dims(z2, axis=1), [1, tf.shape(R)[1], 1]) # [bs, 1*c_len, dim] 103 | e = feed_forward(tf.concat([R, z_e, R * z_e], axis=-1), 'ed', filters, dropout) 104 | e_logits = exp_mask(tf.squeeze(conv1d(e, 1, 1, name='We'), axis=-1), c_mask) # [bs, c_len] 105 | e = tf.expand_dims(tf.nn.softmax(e_logits), axis=-1) 106 | 107 | # get z3 108 | v = tf.squeeze(tf.matmul(R, e, transpose_a=True), axis=-1) 109 | zv = tf.concat([z2, v, z2 * v, z2 - v], axis=-1) 110 | z_e_ = tf.nn.tanh(dense(zv, filters, name='Wrv')) 111 | g = tf.nn.sigmoid(dense(zv, filters, name='Wgv')) 112 | z3 = g * z_e_ + (1 - g) * z2 # [bs, dim] 113 | 114 | if return_logits: 115 | return s_logits, e_logits 116 | else: 117 | return z3 118 | 119 | 120 | def summary_vector(q_emb, c_maxlen, mask): 121 | with tf.variable_scope("Question_Summary"): 122 | alpha = tf.nn.softmax(exp_mask(tf.squeeze(conv1d(q_emb, 1, 1), axis=-1), mask)) 123 | s = tf.expand_dims(alpha, axis=-1) * q_emb 124 | s = tf.reduce_sum(s, axis=1, keepdims=True) # [bs, 1, dim] 125 | s = tf.tile(s, [1, c_maxlen, 1]) # [bs, len_c, dim] 126 | return s 127 | 128 | 129 | def start_logits(R, s, mask, filters=128, name='Start_Pointer'): 130 | with tf.variable_scope(name): 131 | if R.get_shape()[-1] == s.get_shape()[-1]: 132 | logits1 = tf.concat([R, s, R * s, R - s], axis=-1) 133 | else: 134 | logits1 = tf.concat([R, s], axis=-1) 135 | logits1 = tf.nn.tanh(conv1d(logits1, filters, 1, name='Wt')) 136 | logits1 = tf.squeeze(conv1d(logits1, 1, 1, name='Wf'), axis=-1) 137 | logits1 = exp_mask(logits1, mask) 138 | return logits1 139 | 140 | 141 | def end_logits(R, logits1, s, mask, filters=128, name='End_Pointer'): 142 | with tf.variable_scope(name): 143 | l = R * tf.expand_dims(tf.nn.softmax(logits1, 
axis=-1), axis=-1) # [bs, len_c, dim] 144 | if s.get_shape()[-1] == l.get_shape()[-1]: 145 | s_ = tf.concat([s, l, s * l, s - l], axis=-1) 146 | else: 147 | s_ = tf.concat([s, l], axis=-1) 148 | x = tf.nn.relu(conv1d(s_, filters, 1, name='Wr')) # [bs, len_c, dim] 149 | g = tf.nn.sigmoid(conv1d(s_, filters, 1, name='Wg')) # [bs, len_c, dim] 150 | s_ = g * x + (1 - g) * s # [bs, len_c, dim] 151 | 152 | if R.get_shape()[-1] == s_.get_shape()[-1]: 153 | logits2 = tf.concat([R, s_, R * s_, R - s_], axis=-1) 154 | else: 155 | logits2 = tf.concat([R, s_], axis=-1) 156 | logits2 = tf.nn.tanh(conv1d(logits2, filters, 1, name='Wt')) 157 | logits2 = tf.squeeze(conv1d(logits2, 1, 1, name='Wf'), axis=-1) 158 | logits2 = exp_mask(logits2, mask) 159 | return logits2 160 | 161 | 162 | def ElmoCombineLayer(elmo_feats, name): # [bs, len, 3, 1024] 163 | n_lm_layers = int(elmo_feats.get_shape()[2]) # 3 164 | W = tf.get_variable( 165 | '{}_ELMo_W'.format(name), 166 | shape=(n_lm_layers,), 167 | initializer=tf.zeros_initializer, 168 | regularizer=regularizer, 169 | trainable=True, 170 | ) 171 | normed_weights = tf.split(tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers) # [1]*3 172 | # split LM layers 173 | layers = tf.split(elmo_feats, n_lm_layers, axis=2) # [bs, len, 1, 1024]*3 174 | 175 | # compute the weighted, normalized LM activations 176 | pieces = [] 177 | for w, t in zip(normed_weights, layers): 178 | pieces.append(w * tf.squeeze(t, axis=2)) 179 | sum_pieces = tf.add_n(pieces) 180 | 181 | # scale the weighted sum by gamma 182 | gamma = tf.get_variable( 183 | '{}_ELMo_gamma'.format(name), 184 | shape=(1,), 185 | initializer=tf.ones_initializer, 186 | regularizer=None, 187 | trainable=True, 188 | ) 189 | return sum_pieces * gamma # [bs, len, 1024] 190 | 191 | 192 | def CoveCombineLayer(cove_feats, name): # [bs, len, 2, 600] 193 | n_lm_layers = int(cove_feats.get_shape()[2]) # 2 194 | W = tf.get_variable( 195 | '{}_Cove_W'.format(name), 196 | shape=(n_lm_layers,), 197 | initializer=tf.zeros_initializer, 198 | regularizer=regularizer, 199 | trainable=True, 200 | ) 201 | normed_weights = tf.split(tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers) # [1]*2 202 | # split LM layers 203 | layers = tf.split(cove_feats, n_lm_layers, axis=2) # [bs, len, 1, 600]*2 204 | 205 | # compute the weighted, normalized LM activations 206 | pieces = [] 207 | for w, t in zip(normed_weights, layers): 208 | pieces.append(w * tf.squeeze(t, axis=2)) 209 | sum_pieces = tf.add_n(pieces) 210 | 211 | # scale the weighted sum by gamma 212 | gamma = tf.get_variable( 213 | '{}_Cove_gamma'.format(name), 214 | shape=(1,), 215 | initializer=tf.ones_initializer, 216 | regularizer=None, 217 | trainable=True, 218 | ) 219 | return sum_pieces * gamma # [bs, len, 600] 220 | 221 | 222 | def optimized_trilinear_for_attention(args, c_maxlen, q_maxlen, input_keep_prob=1.0, scope='efficient_trilinear', 223 | bias_initializer=tf.zeros_initializer(), kernel_initializer=initializer()): 224 | assert len(args) == 2, "just use for computing attention with two input" 225 | arg0_shape = args[0].get_shape().as_list() 226 | arg1_shape = args[1].get_shape().as_list() 227 | if len(arg0_shape) != 3 or len(arg1_shape) != 3: 228 | raise ValueError("`args` must be 3 dims (batch_size, len, dimension)") 229 | if arg0_shape[2] != arg1_shape[2]: 230 | raise ValueError("the last dimension of `args` must equal") 231 | arg_size = arg0_shape[2] 232 | dtype = args[0].dtype 233 | droped_args = [tf.nn.dropout(arg, input_keep_prob) for arg in args] 234 | with 
tf.variable_scope(scope): 235 | weights4arg0 = tf.get_variable( 236 | "linear_kernel4arg0", [arg_size, 1], 237 | dtype=dtype, 238 | regularizer=regularizer, 239 | initializer=kernel_initializer) 240 | weights4arg1 = tf.get_variable( 241 | "linear_kernel4arg1", [arg_size, 1], 242 | dtype=dtype, 243 | regularizer=regularizer, 244 | initializer=kernel_initializer) 245 | weights4mlu = tf.get_variable( 246 | "linear_kernel4mul", [1, 1, arg_size], 247 | dtype=dtype, 248 | regularizer=regularizer, 249 | initializer=kernel_initializer) 250 | biases = tf.get_variable( 251 | "linear_bias", [1], 252 | dtype=dtype, 253 | regularizer=regularizer, 254 | initializer=bias_initializer) 255 | subres0 = tf.tile(backend.dot(droped_args[0], weights4arg0), [1, 1, q_maxlen]) 256 | subres1 = tf.tile(tf.transpose(backend.dot(droped_args[1], weights4arg1), perm=(0, 2, 1)), [1, c_maxlen, 1]) 257 | subres2 = backend.batch_dot(droped_args[0] * weights4mlu, tf.transpose(droped_args[1], perm=(0, 2, 1))) 258 | res = subres0 + subres1 + subres2 259 | res += biases 260 | return res 261 | 262 | 263 | def ElmoAttention(inputs, c_maxlen, q_maxlen, q_mask, dropout): 264 | c, q = inputs 265 | S = optimized_trilinear_for_attention([c, q], c_maxlen, q_maxlen, input_keep_prob=1. - dropout, 266 | scope='elmo_efficient_trilinear') 267 | q_mask = tf.expand_dims(q_mask, 1) 268 | S_ = tf.nn.softmax(exp_mask(S, mask=q_mask)) 269 | c2q = tf.matmul(S_, q) 270 | return tf.concat([c, c2q], axis=-1) 271 | 272 | 273 | def total_params(exclude=None): 274 | total_parameters = 0 275 | if exclude is not None: 276 | trainable_variables = list(set(tf.trainable_variables()) ^ set(tf.trainable_variables(exclude))) 277 | else: 278 | trainable_variables = tf.trainable_variables() 279 | for variable in trainable_variables: 280 | shape = variable.get_shape() 281 | variable_parametes = 1 282 | try: 283 | for dim in shape: 284 | variable_parametes *= dim.value 285 | total_parameters += variable_parametes 286 | except: 287 | print(shape, 'cudnn weights is unknown') 288 | print("Total number of trainable parameters: {}".format(total_parameters)) 289 | -------------------------------------------------------------------------------- /util/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import Counter 3 | import string 4 | import time 5 | import tensorflow as tf 6 | 7 | ''' 8 | This file is taken and modified from R-Net by HKUST-KnowComp 9 | https://github.com/HKUST-KnowComp/R-Net 10 | ''' 11 | 12 | 13 | def get_record_parser(config): 14 | def parser(example): 15 | if not config['data_type']: 16 | config['data_type'] = 2 17 | char_limit = config['char_limit'] 18 | features_ = { 19 | "context_ids": tf.FixedLenFeature([], tf.string), 20 | "ques_ids": tf.FixedLenFeature([], tf.string), 21 | "context_char_ids": tf.FixedLenFeature([], tf.string), 22 | "ques_char_ids": tf.FixedLenFeature([], tf.string), 23 | 'context_feat': tf.FixedLenFeature([], tf.string), 24 | 'ques_feat': tf.FixedLenFeature([], tf.string), 25 | 'elmo_context_feat': tf.FixedLenFeature([], tf.string), 26 | 'elmo_question_feat': tf.FixedLenFeature([], tf.string), 27 | 'cove_context_feat': tf.FixedLenFeature([], tf.string), 28 | 'cove_question_feat': tf.FixedLenFeature([], tf.string), 29 | "y1": tf.FixedLenFeature([], tf.string), 30 | "y2": tf.FixedLenFeature([], tf.string), 31 | "qid": tf.FixedLenFeature([], tf.int64) 32 | } 33 | if config['data_type'] == 2: 34 | features_['y1p'] = tf.FixedLenFeature([], tf.string) 35 | 
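
For reference, `get_record_parser` recovers variable-length arrays that were serialized as raw bytes. A hedged TensorFlow 1.x sketch of that round trip, using a single illustrative feature name rather than the repo's full schema:

```python
import numpy as np
import tensorflow as tf

ids = np.arange(7, dtype=np.int32)
example = tf.train.Example(features=tf.train.Features(feature={
    "context_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ids.tobytes()])),
}))
serialized = example.SerializeToString()

features = tf.parse_single_example(serialized, features={
    "context_ids": tf.FixedLenFeature([], tf.string),
})
context_ids = tf.reshape(tf.decode_raw(features["context_ids"], tf.int32), [-1])

with tf.Session() as sess:
    print(sess.run(context_ids))   # [0 1 2 3 4 5 6]
```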
features_['y2p'] = tf.FixedLenFeature([], tf.string) 36 | 37 | features = tf.parse_single_example(example, features=features_) 38 | context_idxs = tf.reshape(tf.decode_raw(features["context_ids"], tf.int32), [-1]) 39 | ques_idxs = tf.reshape(tf.decode_raw(features["ques_ids"], tf.int32), [-1]) 40 | context_char_idxs = tf.reshape(tf.decode_raw(features["context_char_ids"], tf.int32), [-1, char_limit]) 41 | ques_char_idxs = tf.reshape(tf.decode_raw(features["ques_char_ids"], tf.int32), [-1, char_limit]) 42 | context_feat = tf.reshape(tf.decode_raw(features["context_feat"], tf.float32), [-1, 73]) 43 | ques_feat = tf.reshape(tf.decode_raw(features["ques_feat"], tf.float32), [-1, 73]) 44 | elmo_context_feat = tf.reshape(tf.decode_raw(features['elmo_context_feat'], tf.float32), [-1, 3, 1024]) 45 | elmo_question_feat = tf.reshape(tf.decode_raw(features['elmo_question_feat'], tf.float32), [-1, 3, 1024]) 46 | cove_context_feat = tf.reshape(tf.decode_raw(features['cove_context_feat'], tf.float32), [-1, 2, 600]) 47 | cove_question_feat = tf.reshape(tf.decode_raw(features['cove_question_feat'], tf.float32), [-1, 2, 600]) 48 | y1 = tf.reshape(tf.decode_raw(features["y1"], tf.int32), [-1]) 49 | y2 = tf.reshape(tf.decode_raw(features["y2"], tf.int32), [-1]) 50 | if config['data_type'] == 2: 51 | y1p = tf.reshape(tf.decode_raw(features["y1p"], tf.int32), [-1]) 52 | y2p = tf.reshape(tf.decode_raw(features["y2p"], tf.int32), [-1]) 53 | # qid = features["qid"] 54 | if config['data_type'] == 2: 55 | return context_idxs, ques_idxs, \ 56 | context_char_idxs, ques_char_idxs, \ 57 | context_feat, ques_feat, \ 58 | elmo_context_feat, elmo_question_feat, \ 59 | cove_context_feat, cove_question_feat, \ 60 | y1, y2, y1p, y2p 61 | else: 62 | return context_idxs, ques_idxs, \ 63 | context_char_idxs, ques_char_idxs, \ 64 | context_feat, ques_feat, \ 65 | elmo_context_feat, elmo_question_feat, \ 66 | cove_context_feat, cove_question_feat, \ 67 | y1, y2 68 | 69 | return parser 70 | 71 | 72 | def convert_tokens(eval_file, qa_id, pp1, pp2, unanswer_id=-1, data_type=2): 73 | answer_dict = {} 74 | remapped_dict = {} 75 | noanswer_num = 0 76 | for qid, p1, p2 in zip(qa_id, pp1, pp2): 77 | context = eval_file[str(qid)]["context"] 78 | spans = eval_file[str(qid)]["spans"] 79 | uuid = eval_file[str(qid)]["uuid"] 80 | if data_type == 2: 81 | if p1 == unanswer_id or p2 == unanswer_id or p1 >= len(spans) or p2 >= len( 82 | spans): # prediction has no answer 83 | noanswer_num += 1 84 | answer_dict[str(qid)] = '' 85 | remapped_dict[uuid] = '' 86 | else: 87 | start_idx = spans[min(p1, len(spans) - 1)][0] 88 | end_idx = spans[min(p2, len(spans) - 1)][1] 89 | answer_dict[str(qid)] = context[start_idx: end_idx] 90 | remapped_dict[uuid] = context[start_idx: end_idx] 91 | else: 92 | start_idx = spans[min(p1, len(spans) - 1)][0] 93 | end_idx = spans[min(p2, len(spans) - 1)][1] 94 | answer_dict[str(qid)] = context[start_idx: end_idx] 95 | remapped_dict[uuid] = context[start_idx: end_idx] 96 | return answer_dict, remapped_dict, noanswer_num 97 | 98 | 99 | def evaluate(eval_file, answer_dict): 100 | f1 = exact_match = total = 0 101 | for key, value in answer_dict.items(): 102 | total += 1 103 | ground_truths = eval_file[key]["answers"] 104 | prediction = value 105 | if len(ground_truths) == 0: # ground truth has no answer 106 | if prediction == '': 107 | exact_match += 1 108 | f1 += 1 109 | else: 110 | exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths) 111 | f1 += metric_max_over_ground_truths(f1_score, 
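
`convert_tokens` turns predicted start/end token indices back into an answer string using the per-token character spans stored in `eval_file`. A small self-contained example of that mapping (the sentence and spans below are made up for illustration):

```python
context = "The model was trained on SQuAD."
spans = [(0, 3), (4, 9), (10, 13), (14, 21), (22, 24), (25, 30), (30, 31)]

def span_to_answer(context, spans, p1, p2, unanswer_id=-1):
    if p1 == unanswer_id or p2 == unanswer_id or p1 >= len(spans) or p2 >= len(spans):
        return ''                                  # treated as "no answer" (SQuAD 2.0 style)
    start = spans[min(p1, len(spans) - 1)][0]
    end = spans[min(p2, len(spans) - 1)][1]
    return context[start:end]

print(span_to_answer(context, spans, 3, 5))        # "trained on SQuAD"
```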
prediction, ground_truths) 112 | exact_match = 100.0 * exact_match / total 113 | f1 = 100.0 * f1 / total 114 | return {'exact_match': exact_match, 'f1': f1} 115 | 116 | 117 | def evaluate_acc(eval_file, answer_dict): 118 | has_answer_acc = 0 119 | has_answer_total = 0 120 | hasno_answer_acc = 0 121 | hasno_answer_total = 0 122 | for key, value in answer_dict.items(): 123 | ground_truths = eval_file[key]["answers"] 124 | prediction = value 125 | if len(ground_truths) != 0: # ground truth has answers 126 | has_answer_total += 1 127 | if prediction != '': 128 | has_answer_acc += 1 129 | else: 130 | hasno_answer_total += 1 131 | if prediction == '': 132 | hasno_answer_acc += 1 133 | print(has_answer_acc, '/', has_answer_total, hasno_answer_acc, '/', hasno_answer_total) 134 | has_answer_acc /= has_answer_total 135 | hasno_answer_acc /= hasno_answer_total 136 | return {'has_answer_acc': has_answer_acc, 'hasno_answer_acc': hasno_answer_acc} 137 | 138 | 139 | def evaluate_max(eval_file, answer_dict_list): 140 | f1 = exact_match = total = 0 141 | for key, value in answer_dict_list[0].items(): 142 | total += 1 143 | ground_truths = eval_file[key]["answers"] 144 | f1_temp = 0 145 | em_temp = 0 146 | for answer_dict in answer_dict_list: 147 | prediction = answer_dict[key] 148 | if len(ground_truths) == 0: # ground truth has no answer 149 | if prediction == 'unanswerable': 150 | em_temp = 1 151 | f1_temp = 1 152 | else: 153 | em_temp = max(metric_max_over_ground_truths(exact_match_score, prediction, ground_truths), em_temp) 154 | f1_temp = max(metric_max_over_ground_truths(f1_score, prediction, ground_truths), f1_temp) 155 | exact_match += em_temp 156 | f1 += f1_temp 157 | exact_match = 100.0 * exact_match / total 158 | f1 = 100.0 * f1 / total 159 | return {'exact_match': exact_match, 'f1': f1} 160 | 161 | 162 | def normalize_answer(s): 163 | def remove_articles(text): 164 | return re.sub(r'\b(a|an|the)\b', ' ', text) 165 | 166 | def white_space_fix(text): 167 | return ' '.join(text.split()) 168 | 169 | def remove_punc(text): 170 | exclude = set(string.punctuation) 171 | return ''.join(ch for ch in text if ch not in exclude) 172 | 173 | def lower(text): 174 | return text.lower() 175 | 176 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 177 | 178 | 179 | def f1_score(prediction, ground_truth): 180 | prediction_tokens = normalize_answer(prediction).split() 181 | ground_truth_tokens = normalize_answer(ground_truth).split() 182 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 183 | num_same = sum(common.values()) 184 | if num_same == 0: 185 | return 0 186 | precision = 1.0 * num_same / len(prediction_tokens) 187 | recall = 1.0 * num_same / len(ground_truth_tokens) 188 | f1 = (2 * precision * recall) / (precision + recall) 189 | return f1 190 | 191 | 192 | def exact_match_score(prediction, ground_truth): 193 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 194 | 195 | 196 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 197 | scores_for_ground_truths = [] 198 | for ground_truth in ground_truths: 199 | score = metric_fn(prediction, ground_truth) 200 | scores_for_ground_truths.append(score) 201 | return max(scores_for_ground_truths) 202 | 203 | 204 | def cal_ETA(t_start, i, n_batch): 205 | t_temp = time.time() 206 | t_avg = float(int(t_temp) - int(t_start)) / float(i + 1) 207 | if n_batch - i - 1 > 0: 208 | return int((n_batch - i - 1) * t_avg) 209 | else: 210 | return int(t_temp) - int(t_start) 211 | 212 | 213 | import 
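
A quick usage example of the SQuAD-style normalization and token-overlap F1 implemented above (the helpers are restated so the snippet runs on its own; the printed values follow directly from the formula):

```python
import re
import string
from collections import Counter

def normalize(s):
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

def squad_f1(prediction, ground_truth):
    pred, gt = normalize(prediction).split(), normalize(ground_truth).split()
    common = Counter(pred) & Counter(gt)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred)
    recall = num_same / len(gt)
    return 2 * precision * recall / (precision + recall)

print(squad_f1("the French Army", "French army"))             # 1.0 after normalization
print(round(squad_f1("in the French Army", "French army"), 3))  # 0.8 (partial overlap)
```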
numpy as np 214 | import h5py 215 | 216 | 217 | def batchify_train(data): 218 | def padding(datas): 219 | max_len = max([d.shape[0] for d in datas]) 220 | paded_datas = np.zeros([len(datas), max_len] + list(datas[0].shape[1:]), dtype=datas[0].dtype) 221 | for i in range(len(datas)): 222 | paded_datas[i, 0:datas[i].shape[0]] = datas[i] 223 | return paded_datas 224 | 225 | cont_ids, cont_char_ids, ques_ids, ques_char_ids = [], [], [], [] 226 | cont_feat, ques_feat, y1s, y2s, y1ps, y2ps = [], [], [], [], [], [] 227 | elmo_cont_feat, elmo_ques_feat = [], [] 228 | cove_cont_feat, cove_ques_feat = [], [] 229 | # load elmo 230 | with h5py.File('dataset_pre3/train_ELMO_feats.h5', 'r') as elmo_h5f: 231 | with h5py.File('dataset_pre3/train_COVE_feats.h5', 'r') as cove_h5f: 232 | with h5py.File('dataset_pre3/train_data.h5', 'r') as h5f: 233 | for qid in data: 234 | group = h5f[str(qid)] 235 | # base feats 236 | cont_ids.append(group['context_ids'][:]) 237 | cont_char_ids.append(group['context_char_ids'][:]) 238 | cont_feat.append(group['context_feat'][:]) 239 | ques_ids.append(group['ques_ids'][:]) 240 | ques_char_ids.append(group['ques_char_ids'][:]) 241 | ques_feat.append(group['ques_feat'][:]) 242 | # elmo feats 243 | elmo_cont_feat.append(elmo_h5f[str(qid) + 'c'][:]) 244 | elmo_ques_feat.append(elmo_h5f[str(qid) + 'q'][:]) 245 | # cove feats 246 | cove_cont_feat.append(cove_h5f[str(qid) + 'c'][:]) 247 | cove_ques_feat.append(cove_h5f[str(qid) + 'q'][:]) 248 | cont_ids = padding(cont_ids) 249 | cont_char_ids = padding(cont_char_ids) 250 | ques_ids = padding(ques_ids) 251 | ques_char_ids = padding(ques_char_ids) 252 | elmo_cont_feat = padding(elmo_cont_feat) 253 | elmo_ques_feat = padding(elmo_ques_feat) 254 | cove_cont_feat = padding(cove_cont_feat) 255 | cove_ques_feat = padding(cove_ques_feat) 256 | 257 | return cont_ids, cont_char_ids, ques_ids, ques_char_ids, elmo_cont_feat, elmo_ques_feat, cove_cont_feat, cove_ques_feat 258 | 259 | 260 | def batchify_dev(data): 261 | def padding(datas): 262 | max_len = max([d.shape[0] for d in datas]) 263 | paded_datas = np.zeros([len(datas), max_len] + list(datas[0].shape[1:]), dtype=datas[0].dtype) 264 | for i in range(len(datas)): 265 | paded_datas[i, 0:datas[i].shape[0]] = datas[i] 266 | return paded_datas 267 | 268 | cont_ids, cont_char_ids, ques_ids, ques_char_ids = [], [], [], [] 269 | cont_feat, ques_feat, y1s, y2s, y1ps, y2ps = [], [], [], [], [], [] 270 | elmo_cont_feat, elmo_ques_feat = [], [] 271 | cove_cont_feat, cove_ques_feat = [], [] 272 | # load elmo 273 | with h5py.File('dataset_pre3/dev_ELMO_feats.h5', 'r') as elmo_h5f: 274 | with h5py.File('dataset_pre3/dev_COVE_feats.h5', 'r') as cove_h5f: 275 | with h5py.File('dataset_pre3/dev_data.h5', 'r') as h5f: 276 | for qid in data: 277 | group = h5f[str(qid)] 278 | # base feats 279 | cont_ids.append(group['context_ids'][:]) 280 | cont_char_ids.append(group['context_char_ids'][:]) 281 | cont_feat.append(group['context_feat'][:]) 282 | ques_ids.append(group['ques_ids'][:]) 283 | ques_char_ids.append(group['ques_char_ids'][:]) 284 | ques_feat.append(group['ques_feat'][:]) 285 | # elmo feats 286 | elmo_cont_feat.append(elmo_h5f[str(qid) + 'c'][:]) 287 | elmo_ques_feat.append(elmo_h5f[str(qid) + 'q'][:]) 288 | # cove feats 289 | cove_cont_feat.append(cove_h5f[str(qid) + 'c'][:]) 290 | cove_ques_feat.append(cove_h5f[str(qid) + 'q'][:]) 291 | cont_ids = padding(cont_ids) 292 | cont_char_ids = padding(cont_char_ids) 293 | ques_ids = padding(ques_ids) 294 | ques_char_ids = padding(ques_char_ids) 295 | 
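
The `padding` helper inside `batchify_train`/`batchify_dev` zero-pads every array in a batch up to the longest one. A standalone NumPy version with a tiny example:

```python
import numpy as np

def pad_batch(arrays):
    # pad a list of [len_i, ...] arrays with zeros up to the longest one in the batch
    max_len = max(a.shape[0] for a in arrays)
    out = np.zeros([len(arrays), max_len] + list(arrays[0].shape[1:]), dtype=arrays[0].dtype)
    for i, a in enumerate(arrays):
        out[i, :a.shape[0]] = a
    return out

batch = [np.arange(3), np.arange(5), np.arange(2)]
print(pad_batch(batch))
# [[0 1 2 0 0]
#  [0 1 2 3 4]
#  [0 1 0 0 0]]
```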
elmo_cont_feat = padding(elmo_cont_feat) 296 | elmo_ques_feat = padding(elmo_ques_feat) 297 | cove_cont_feat = padding(cove_cont_feat) 298 | cove_ques_feat = padding(cove_ques_feat) 299 | 300 | return cont_ids, cont_char_ids, ques_ids, ques_char_ids, elmo_cont_feat, elmo_ques_feat, cove_cont_feat, cove_ques_feat 301 | -------------------------------------------------------------------------------- /train_tfrecords.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import RMR_modelV6_squad2 as RMR 4 | import tensorflow.contrib.slim as slim 5 | from util.util import * 6 | import tensorflow as tf 7 | import pandas as pd 8 | from util.log_wrapper import create_logger 9 | 10 | os.environ["CUDA_VISIBLE_DEVICES"] = '6' 11 | 12 | if __name__ == '__main__': 13 | 14 | data_source = '../QANet_tf/dataset_pre3' 15 | 16 | config = { 17 | 'char_dim': 300, 18 | 'cont_limit': 400, 19 | 'ques_limit': 50, 20 | 'char_limit': 16, 21 | 'ans_limit': -1, 22 | 'filters': 300, 23 | 'dropout': 0.175, 24 | 'dropout_emb': 0.15, 25 | 'dropout_att': 0.2, 26 | 'dropout_rnn': 0.1, 27 | 'l2_norm': 3e-7, 28 | 'decay': 1, 29 | 'gamma_b': 0.3, 30 | 'gamma_c': 1.0, 31 | 'init_lambda': 3.0, 32 | 'learning_rate': 1e-3, 33 | 'shuffle_size': 25000, 34 | 'grad_clip': 5.0, 35 | 'use_elmo': 1, 36 | 'use_cove': 1, 37 | 'use_feat': True, 38 | 'use_rlloss': False, 39 | 'rlw': 0.0, 40 | 'rlw2': 0.8, 41 | 'optimizer': 'adam', 42 | 'cove_path': '../SAN_tf/Keras_CoVe_2layers.h5', 43 | 'elmo_weights_path': '../SAN_tf/elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 44 | 'elmo_options_path': '../SAN_tf/elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 45 | 'train_tfrecords': '../QANet_tf/tfrecords/train_pre_elmo_cove3.tfrecords', 46 | 'dev_tfrecords': '../QANet_tf/tfrecords/dev_pre_elmo_cove3.tfrecords', 47 | 'batch_size': 32, 48 | 'epoch': 25, 49 | 'origin_path': None, # not finetune 50 | 'path': 'RMR200' 51 | } 52 | 53 | global logger 54 | logger = create_logger(__name__, to_disk=True, log_file='log/' + config['path'] + '.log') 55 | 56 | logger.info('loading data...') 57 | train_qid = np.load(data_source + '/train_qid.npy').astype(np.int32) 58 | dev_qid = np.load(data_source + '/dev_qid.npy').astype(np.int32) 59 | with open(data_source + '/test_eval.json', "r") as fh: 60 | eval_file = json.load(fh) 61 | 62 | # load embedding matrix 63 | logger.info('loading embedding...') 64 | word_mat = np.load(data_source + '/word_emb_mat.npy') 65 | char_mat_fix = np.load(data_source + '/char_emb_mat_fix.npy').astype(np.float32) 66 | char_mat_trainable = np.load(data_source + '/char_emb_mat_trainable.npy').astype(np.float32) 67 | 68 | logger.info('generate train tfrecords...') 69 | train_dataset = tf.data.TFRecordDataset(config['train_tfrecords']) \ 70 | .map(get_record_parser(config), num_parallel_calls=8) \ 71 | .shuffle(config['shuffle_size']) \ 72 | .padded_batch(config['batch_size'], padded_shapes=([None], 73 | [None], 74 | [None, None], 75 | [None, None], 76 | [None, None], 77 | [None, None], 78 | [None, None, None], 79 | [None, None, None], 80 | [None, None, None], 81 | [None, None, None], 82 | [None], 83 | [None], 84 | [None], 85 | [None])) 86 | train_iterator = train_dataset.make_initializable_iterator() 87 | train_next_element = train_iterator.get_next() 88 | train_sum = 129941 89 | 90 | logger.info('generate dev tfrecords...') 91 | dev_dataset = tf.data.TFRecordDataset(config['dev_tfrecords']) \ 92 | 
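
`train_tfrecords.py` relies on the `tf.data` pattern of per-batch padding plus an initializable iterator that is drained once per epoch. A minimal, hedged TensorFlow 1.x sketch of that pattern, using a toy generator instead of real TFRecords:

```python
import numpy as np
import tensorflow as tf

def gen():
    for n in [3, 5, 2, 4]:
        yield np.arange(n, dtype=np.int32)

dataset = (tf.data.Dataset.from_generator(gen, tf.int32, tf.TensorShape([None]))
           .padded_batch(2, padded_shapes=[None]))   # pad each batch to its own max length
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    for epoch in range(2):
        sess.run(iterator.initializer)               # re-initialize once per epoch
        while True:
            try:
                print(epoch, sess.run(next_element))
            except tf.errors.OutOfRangeError:        # dataset exhausted -> next epoch
                break
```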
.map(get_record_parser(config), num_parallel_calls=8) \ 93 | .padded_batch(config['batch_size'], padded_shapes=([None], 94 | [None], 95 | [None, None], 96 | [None, None], 97 | [None, None], 98 | [None, None], 99 | [None, None, None], 100 | [None, None, None], 101 | [None, None, None], 102 | [None, None, None], 103 | [None], 104 | [None], 105 | [None], 106 | [None])) 107 | dev_iterator = dev_dataset.make_initializable_iterator() 108 | dev_next_element = dev_iterator.get_next() 109 | dev_sum = 11730 110 | 111 | logger.info('init model...') 112 | model = RMR.Model(config, word_mat=word_mat, char_mat_trainable=char_mat_trainable, char_mat_fix=char_mat_fix) 113 | sess_config = tf.ConfigProto(allow_soft_placement=True) 114 | sess_config.gpu_options.allow_growth = True 115 | best_f1 = 0 116 | best_em = 0 117 | f1s = [] 118 | ems = [] 119 | 120 | with tf.Session(config=sess_config) as sess: 121 | sess.run(tf.global_variables_initializer()) 122 | # scope with trainable weights 123 | variables_to_restore = slim.get_variables_to_restore(include=['Input_Embedding_Mat', 124 | 'Input_Embedding_Layer', 125 | 'Iterative_Reattention_Aligner', 126 | 'Answer_Pointer', 127 | 'EMA_Weights']) 128 | saver = tf.train.Saver(variables_to_restore, max_to_keep=10) 129 | if config['origin_path'] is not None and os.path.exists( 130 | os.path.join('model', config['origin_path'], 'checkpoint')): 131 | saver.restore(sess, tf.train.latest_checkpoint(os.path.join('model', str(config['origin_path']) + '/'))) 132 | 133 | for i_epoch in range(config['epoch']): 134 | sess.run(train_iterator.initializer) 135 | i_batch = 0 136 | train_n_batch = train_sum // config['batch_size'] + 1 137 | val_n_batch = dev_sum // config['batch_size'] + 1 138 | # if i_epoch + 1 >= 8 and (i_epoch + 1) % 8 == 0: 139 | # config['learning_rate'] *= 0.5 140 | sum_loss = 0 141 | while True: 142 | try: 143 | if i_batch == 1: 144 | t_start = time.time() 145 | context_idxs, ques_idxs, \ 146 | context_char_idxs, ques_char_idxs, \ 147 | context_feat, ques_feat, \ 148 | elmo_context_feat, elmo_question_feat, \ 149 | cove_context_feat, cove_question_feat, \ 150 | y1, y2, y1p, y2p = sess.run(train_next_element) 151 | feed_dict_ = {model.contw_input: context_idxs, model.quesw_input: ques_idxs, 152 | model.contc_input: context_char_idxs, model.quesc_input: ques_char_idxs, 153 | model.y_start: y1, model.y_end: y2, 154 | model.yp_start: y1p, model.yp_end: y2p, 155 | model.un_size: context_idxs.shape[0], 156 | model.dropout: config['dropout'], 157 | model.dropout_emb: config['dropout_emb'], 158 | model.dropout_att: config['dropout_att'], 159 | model.dropout_rnn: config['dropout_rnn'], 160 | model.learning_rate: config['learning_rate'], 161 | model.rlw: config['rlw']} 162 | if config['use_feat']: 163 | feed_dict_[model.cont_feat] = context_feat 164 | feed_dict_[model.ques_feat] = ques_feat 165 | if config['use_elmo'] == 1: 166 | feed_dict_[model.elmo_cont] = elmo_context_feat 167 | feed_dict_[model.elmo_ques] = elmo_question_feat 168 | if config['use_cove'] == 1: 169 | feed_dict_[model.cove_cont] = cove_context_feat 170 | feed_dict_[model.cove_ques] = cove_question_feat 171 | if config['decay'] < 1: 172 | loss_value, _ = sess.run([model.loss, model.ema_train_op], feed_dict=feed_dict_) 173 | else: 174 | loss_value, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict_) 175 | char_mat = sess.run(model.char_mat) 176 | char_mat[-char_mat_fix.shape[0]:, ::] = char_mat_fix 177 | _ = sess.run(model.assign_char_mat, feed_dict={model.old_char_mat: char_mat}) 178 | 
sum_loss += loss_value 179 | 180 | # check embedding 181 | fix_feat, tra_feat = sess.run([model.char_mat[-93:, :], model.char_mat[0:1140, :]]) 182 | fix_feat = np.sum(fix_feat) 183 | tra_feat = np.sum(tra_feat) 184 | print('fix:', fix_feat) 185 | print('trainable:', tra_feat) 186 | 187 | last_train_str = "[epoch:%d/%d, steps:%d/%d] -loss:%.4f" % ( 188 | i_epoch + 1, config['epoch'], i_batch + 1, 189 | train_n_batch, sum_loss / (i_batch + 1)) 190 | if i_batch > 0: 191 | last_train_str += (' -ETA:%ds' % cal_ETA(t_start, i_batch, train_n_batch)) 192 | if i_batch % 100 == 0: 193 | logger.info(last_train_str) 194 | i_batch += 1 195 | except tf.errors.OutOfRangeError: 196 | logger.info(last_train_str) 197 | break 198 | 199 | # validating step 200 | # # save the temp weights and do ema 201 | # if config['decay'] < 1.0: 202 | # saver.save(sess, os.path.join('model', config['path'], 'temp_model.ckpt')) 203 | # sess.run(model.assign_vars) 204 | # print('EMA over...') 205 | sess.run(dev_iterator.initializer) 206 | logger.info('validating...') 207 | sum_loss_val = 0 208 | y1s = [] 209 | y2s = [] 210 | i_batch = 0 211 | while True: 212 | try: 213 | context_idxs, ques_idxs, \ 214 | context_char_idxs, ques_char_idxs, \ 215 | context_feat, ques_feat, \ 216 | elmo_context_feat, elmo_question_feat, \ 217 | cove_context_feat, cove_question_feat, \ 218 | y1, y2, y1p, y2p = sess.run(dev_next_element) 219 | feed_dict_ = {model.contw_input: context_idxs, model.quesw_input: ques_idxs, 220 | model.contc_input: context_char_idxs, model.quesc_input: ques_char_idxs, 221 | model.y_start: y1, model.y_end: y2, 222 | model.yp_start: y1p, model.yp_end: y2p, 223 | model.un_size: context_idxs.shape[0]} 224 | if config['use_feat']: 225 | feed_dict_[model.cont_feat] = context_feat 226 | feed_dict_[model.ques_feat] = ques_feat 227 | if config['use_elmo'] == 1: 228 | feed_dict_[model.elmo_cont] = elmo_context_feat 229 | feed_dict_[model.elmo_ques] = elmo_question_feat 230 | if config['use_cove'] == 1: 231 | feed_dict_[model.cove_cont] = cove_context_feat 232 | feed_dict_[model.cove_ques] = cove_question_feat 233 | loss_value, y1, y2 = sess.run([model.loss, model.mask_output1, model.mask_output2], 234 | feed_dict=feed_dict_) 235 | y1s.append(y1) 236 | y2s.append(y2) 237 | sum_loss_val += loss_value 238 | i_batch += 1 239 | except tf.errors.OutOfRangeError: 240 | y1s = np.concatenate(y1s) 241 | y2s = np.concatenate(y2s) 242 | answer_dict, _, noanswer_num = convert_tokens(eval_file, dev_qid.tolist(), y1s.tolist(), 243 | y2s.tolist(), data_type=2) 244 | metrics = evaluate(eval_file, answer_dict) 245 | ems.append(metrics['exact_match']) 246 | f1s.append(metrics['f1']) 247 | 248 | # if metrics['f1'] < f1s[-1]: 249 | # config['learning_rate'] *= 0.5 250 | # logger.warning('learning rate reduce to:' + str(config['learning_rate'])) 251 | # if config['learning_rate'] <= 1e-4: 252 | # logger.warning('rl loss start...') 253 | # config['rlw'] = config['rlw2'] 254 | 255 | if ems[-1] > best_em: 256 | best_em = ems[-1] 257 | if f1s[-1] > best_f1: 258 | best_f1 = f1s[-1] 259 | saver.save(sess, os.path.join('model', config['path'], 'model.ckpt'), 260 | global_step=(i_epoch + 1) * train_n_batch) 261 | logger.warning("-loss: %.4f -EM:%.2f%% (best: %.2f%%), -F1:%.2f%% (best: %.2f%%) -Noanswer:%d" % 262 | (sum_loss_val / (i_batch + 1), metrics['exact_match'], best_em, metrics['f1'], 263 | best_f1, noanswer_num)) 264 | metrics = evaluate_acc(eval_file, answer_dict) 265 | logger.warning("Has answer acc:%.2f%%, No answer acc::%.2f%%" % ( 266 | 
metrics['has_answer_acc'] * 100, metrics['hasno_answer_acc'] * 100)) 267 | result = pd.DataFrame([ems, f1s], index=['em', 'f1']).transpose() 268 | result.to_csv('results/result_' + config['path'] + '.csv', index=None) 269 | 270 | # # recover the model 271 | # if config['decay'] < 1.0: 272 | # saver.restore(sess, os.path.join('model', config['path'], 'temp_model.ckpt')) 273 | # print('recover weights over...') 274 | break 275 | -------------------------------------------------------------------------------- /RMR_modelV3.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from layersV0 import total_params, align_block, summary_vector, start_logits, end_logits, BiLSTM, ElmoAttention, \ 3 | ElmoCombineLayer, CoveCombineLayer, answer_block 4 | from bilm import BidirectionalLanguageModel, all_layers 5 | from keras.models import load_model 6 | from loss import rl_loss 7 | import numpy as np 8 | 9 | 10 | class Model(object): 11 | def __init__(self, config, word_mat=None, char_mat_trainable=None, char_mat_fix=None, test=False): 12 | 13 | # hyper-parameter 14 | self.char_dim = config['char_dim'] 15 | self.cont_limit = config['cont_limit'] if not test else 1000 16 | self.ques_limit = config['ques_limit'] if not test else 50 17 | self.char_limit = config['char_limit'] 18 | self.ans_limit = config['ans_limit'] 19 | self.filters = config['filters'] 20 | self.char_filters = config['char_filters'] 21 | self.batch_size = config['batch_size'] 22 | self.l2_norm = config['l2_norm'] 23 | self.decay = config['decay'] 24 | self.learning_rate = config['learning_rate'] 25 | self.grad_clip = config['grad_clip'] 26 | self.init_lambda = config['init_lambda'] 27 | self.gamma_b = config['gamma_b'] 28 | self.gamma_c = config['gamma_c'] 29 | self.use_elmo = config['use_elmo'] 30 | self.use_cove = config['use_cove'] 31 | self.use_feat = config['use_feat'] 32 | self.use_rlloss = config['use_rlloss'] 33 | self.dropout = tf.placeholder_with_default(0.0, (), name="dropout") 34 | self.dropout_rnn = tf.placeholder_with_default(0.0, (), name="dropout_rnn") 35 | self.dropout_emb = tf.placeholder_with_default(0.0, (), name="dropout_emb") 36 | self.dropout_att = tf.placeholder_with_default(0.0, (), name="dropout_att") 37 | self.un_size = tf.placeholder_with_default(self.batch_size, (), name="un_size") 38 | self.rlw = tf.placeholder_with_default(0.0, (), name="rlloss_weights") 39 | 40 | # embedding layer 41 | self.word_mat = tf.get_variable("word_mat", initializer=tf.constant(word_mat, dtype=tf.float32), 42 | trainable=False) 43 | with tf.variable_scope("Input_Embedding_Mat"): 44 | self.char_mat = tf.get_variable("char_mat", 45 | initializer=np.concatenate([char_mat_trainable, char_mat_fix], axis=0), 46 | trainable=True) 47 | 48 | # input tensor 49 | self.contw_input = tf.placeholder(tf.int32, [None, None], "context_word") 50 | self.quesw_input = tf.placeholder(tf.int32, [None, None], "question_word") 51 | self.contc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "context_char") 52 | self.quesc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "question_char") 53 | self.y_start = tf.placeholder(tf.int32, [None, None], "answer_start_index") 54 | self.y_end = tf.placeholder(tf.int32, [None, None], "answer_end_index") 55 | self.contw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'contw_elmo_id') 56 | self.quesw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'quesw_elmo_id') 57 | if self.use_feat: 58 | self.cont_feat = 
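
The dropout knobs in the model are created with `placeholder_with_default(0.0, ...)`, so evaluation can simply omit them from the feed dict and get zero dropout. A tiny TensorFlow 1.x sketch of that behaviour:

```python
import tensorflow as tf

dropout = tf.placeholder_with_default(0.0, (), name="dropout")
keep_prob = 1.0 - dropout

with tf.Session() as sess:
    print(sess.run(keep_prob))                              # 1.0 -> evaluation, nothing fed
    print(sess.run(keep_prob, feed_dict={dropout: 0.2}))    # 0.8 -> training
```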
tf.placeholder(tf.float32, [None, None, 73], "cont_feat") 59 | self.ques_feat = tf.placeholder(tf.float32, [None, None, 73], "ques_feat") 60 | self.old_char_mat = tf.placeholder(tf.float32, [None, None], "old_char_mat") 61 | self.assign_char_mat = tf.assign(self.char_mat, self.old_char_mat) 62 | 63 | # get mask & length for words & chars 64 | self.c_mask = tf.cast(self.contw_input, tf.bool) 65 | self.q_mask = tf.cast(self.quesw_input, tf.bool) 66 | self.cont_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1) 67 | self.ques_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 68 | 69 | # slice for maxlen in each batch 70 | self.c_maxlen = tf.reduce_max(self.cont_len) 71 | self.q_maxlen = tf.reduce_max(self.ques_len) 72 | 73 | # elmo features 74 | if self.use_elmo == 2: 75 | options_file = config['elmo_options_path'] 76 | weight_file = config['elmo_weights_path'] 77 | bilm = BidirectionalLanguageModel(options_file, weight_file) 78 | self.elmo_cont = all_layers(bilm(self.contw_elmo_id)) # [bs, 3, len, 1024] 79 | self.elmo_cont = tf.transpose(self.elmo_cont, [0, 2, 1, 3]) # [bs, len, 3, 1024] 80 | self.elmo_ques = all_layers(bilm(self.quesw_elmo_id)) 81 | self.elmo_ques = tf.transpose(self.elmo_ques, [0, 2, 1, 3]) 82 | elif self.use_elmo == 1: 83 | self.elmo_cont = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_cont') 84 | self.elmo_ques = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_ques') 85 | 86 | if self.use_cove == 2: 87 | with tf.variable_scope('Cove_Layer'): 88 | self.cove_model = load_model(config['cove_path']) 89 | elif self.use_cove == 1: 90 | self.cove_cont = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_cont') 91 | self.cove_ques = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_ques') 92 | 93 | # lr schedule 94 | self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32, 95 | initializer=tf.constant_initializer(0), trainable=False) 96 | 97 | self.learning_rate = tf.placeholder_with_default(config['learning_rate'], (), name="learning_rate") 98 | self.lr = self.learning_rate 99 | # self.lr = tf.minimum(self.learning_rate, 100 | # self.learning_rate / tf.log(999.) 
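
Masks and lengths are derived directly from the zero-padded word-id tensors (the PAD id 0 casts to False). The same arithmetic in NumPy, for a toy batch:

```python
import numpy as np

# padded word-id batch: 0 is the PAD id, so casting to bool recovers the mask
contw_input = np.array([[5, 8, 2, 0, 0],
                        [7, 3, 9, 4, 1]], dtype=np.int32)

c_mask = contw_input.astype(bool)      # True on real tokens
cont_len = c_mask.sum(axis=1)          # [3 5]
c_maxlen = cont_len.max()              # 5 -> slice every tensor in the batch to this length

print(c_mask)
print(cont_len, c_maxlen)
```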
* tf.log(tf.cast(self.global_step, tf.float32) + 1)) 101 | 102 | # initial model & complie 103 | self.build_model() 104 | total_params() 105 | self.complie() 106 | 107 | def build_model(self): 108 | with tf.variable_scope("Input_Embedding_Layer"): 109 | with tf.variable_scope("Char_Embedding_Layer"): 110 | # char embedding 111 | ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.contc_input), 112 | [-1, self.char_limit, self.char_dim]) 113 | qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.quesc_input), 114 | [-1, self.char_limit, self.char_dim]) 115 | ch_emb = tf.nn.dropout(ch_emb, 1 - self.dropout_emb) 116 | qh_emb = tf.nn.dropout(qh_emb, 1 - self.dropout_emb) 117 | 118 | ch_emb, qh_emb = BiLSTM([ch_emb, qh_emb], self.char_filters // 2, dropout=self.dropout_rnn, 119 | name='char_lstm') 120 | ch_emb = tf.reduce_max(ch_emb, axis=1) 121 | qh_emb = tf.reduce_max(qh_emb, axis=1) 122 | ch_emb = tf.reshape(ch_emb, [-1, self.c_maxlen, self.char_filters]) 123 | qh_emb = tf.reshape(qh_emb, [-1, self.q_maxlen, self.char_filters]) 124 | 125 | with tf.variable_scope("Word_Embedding_Layer"): 126 | # word embedding 127 | c_emb = tf.nn.embedding_lookup(self.word_mat, self.contw_input) 128 | q_emb = tf.nn.embedding_lookup(self.word_mat, self.quesw_input) 129 | c_emb = tf.nn.dropout(c_emb, 1.0 - self.dropout_emb) 130 | q_emb = tf.nn.dropout(q_emb, 1.0 - self.dropout_emb) 131 | 132 | # cove features 133 | if self.use_cove != 0: 134 | if self.use_cove == 2: 135 | self.cove_cont = tf.stop_gradient(self.cove_model(c_emb)) # [bs, c_len, 2, 600] 136 | self.cove_ques = tf.stop_gradient(self.cove_model(q_emb)) # [bs, q_len, 2, 600] 137 | with tf.variable_scope('Cove_weights', reuse=tf.AUTO_REUSE): 138 | cove_context_input = CoveCombineLayer(self.cove_cont, 'input') 139 | cove_question_input = CoveCombineLayer(self.cove_ques, 'input') 140 | c_emb = tf.concat([c_emb, cove_context_input], axis=-1) 141 | q_emb = tf.concat([q_emb, cove_question_input], axis=-1) 142 | 143 | # elmo features 144 | if self.use_elmo != 0: 145 | with tf.variable_scope('ELMo_weights', reuse=tf.AUTO_REUSE): 146 | elmo_context_input = ElmoCombineLayer(self.elmo_cont, 'input') 147 | elmo_question_input = ElmoCombineLayer(self.elmo_ques, 'input') 148 | elmo_context_output = ElmoCombineLayer(self.elmo_cont, 'output') 149 | elmo_question_output = ElmoCombineLayer(self.elmo_ques, 'output') 150 | c_emb = tf.concat([c_emb, elmo_context_input], axis=-1) 151 | q_emb = tf.concat([q_emb, elmo_question_input], axis=-1) 152 | 153 | if self.use_feat: 154 | c_emb = tf.concat([c_emb, self.cont_feat], axis=-1) 155 | q_emb = tf.concat([q_emb, self.ques_feat], axis=-1) 156 | 157 | # combine embedding feats 158 | c_emb = tf.concat([c_emb, ch_emb], axis=-1) 159 | q_emb = tf.concat([q_emb, qh_emb], axis=-1) 160 | 161 | # BiLSTM Embedding 162 | with tf.variable_scope("BiLSTM_Embedding_Layer"): 163 | c_emb, q_emb = BiLSTM([c_emb, q_emb], self.filters // 2, dropout=self.dropout_rnn, name='encoder') 164 | 165 | with tf.variable_scope("Iterative_Reattention_Aligner"): 166 | with tf.variable_scope("Aligning_Block1"): 167 | R = align_block(u=c_emb, 168 | v=q_emb, 169 | c_mask=self.c_mask, 170 | q_mask=self.q_mask, 171 | filters=self.filters, 172 | dropout=self.dropout_rnn) 173 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 174 | with tf.variable_scope("Aligning_Block2"): 175 | R = align_block(u=R, 176 | v=q_emb, 177 | c_mask=self.c_mask, 178 | q_mask=self.q_mask, 179 | filters=self.filters, 180 | dropout=self.dropout_rnn) 181 | R = 
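
The character encoder above runs a BiLSTM over each word's characters and max-pools over the character axis to get one vector per word. A NumPy sketch of just the pooling and reshape step (random numbers stand in for the BiLSTM outputs):

```python
import numpy as np

rng = np.random.RandomState(0)
bs, c_len, char_limit, char_filters = 2, 6, 16, 100

# stand-in for the BiLSTM outputs over each word's characters: [bs * c_len, char_limit, dim]
ch_states = rng.randn(bs * c_len, char_limit, char_filters)

# max-pool over the character axis, then restore the [batch, word, feature] layout
ch_emb = ch_states.max(axis=1).reshape(bs, c_len, char_filters)
print(ch_emb.shape)   # (2, 6, 100)
```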
tf.nn.dropout(R, 1.0 - self.dropout_att) 182 | 183 | with tf.variable_scope("Answer_Pointer"): 184 | z = tf.squeeze(tf.slice(q_emb, [0, tf.shape(q_emb)[1]-1, 0], [-1, 1, -1]), axis=1) # [bs, 1, dim]->[bs, dim] 185 | # logits 186 | if self.use_elmo != 0: 187 | R = tf.concat([R, elmo_context_output], axis=-1) 188 | z = tf.concat([z, elmo_question_output], axis=-1) 189 | 190 | with tf.variable_scope('Answer_Block1'): 191 | z = answer_block(R, z, self.filters, self.c_mask, dropout=self.dropout, return_logits=False) 192 | with tf.variable_scope('Answer_Block2'): 193 | logits1, logits2 = answer_block(R, z, self.filters, self.c_mask, dropout=self.dropout, return_logits=True) 194 | 195 | with tf.variable_scope("Loss_Layer"): 196 | # maximum-likelihood (ML) loss for dataset V2.0 197 | start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1, labels=self.y_start) 198 | end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2, labels=self.y_end) 199 | self.loss = tf.reduce_mean(start_loss + end_loss) 200 | 201 | # l2 loss 202 | if self.l2_norm is not None: 203 | decay_costs = [] 204 | for var in tf.trainable_variables(): 205 | decay_costs.append(tf.nn.l2_loss(var)) 206 | self.loss += tf.multiply(self.l2_norm, tf.add_n(decay_costs)) 207 | 208 | # RL loss 209 | if self.use_rlloss: 210 | with tf.variable_scope("Reinforcement_Loss"): 211 | self.rl_loss, _, _ = rl_loss(logits1, logits2, self.y_start, self.y_end, self.c_maxlen) 212 | self.loss += (self.rlw * self.rl_loss) 213 | 214 | with tf.variable_scope('Output_Layer'): 215 | softmax_start_scores = tf.nn.softmax(logits1) 216 | softmax_end_scores = tf.nn.softmax(logits2) 217 | 218 | outer = tf.matmul(tf.expand_dims(softmax_start_scores, axis=2), 219 | tf.expand_dims(softmax_end_scores, axis=1)) 220 | outer = tf.matrix_band_part(outer, 0, self.ans_limit) 221 | 222 | def position_encoding(x): 223 | import math 224 | for i in range(x.shape[0]): 225 | for j in range(x.shape[1]): 226 | if j - i > 5: 227 | x[i][j] = float(1.0 / math.log(j - i + 1)) 228 | return x 229 | 230 | mask_mat = tf.ones((self.c_maxlen, self.c_maxlen)) 231 | mask_mat = tf.expand_dims(tf.py_func(position_encoding, [mask_mat], tf.float32), axis=0) 232 | mask_mat = tf.tile(mask_mat, [self.un_size, 1, 1]) 233 | 234 | outer_masked = outer * mask_mat 235 | self.mask_output1 = tf.argmax(tf.reduce_max(outer_masked, axis=2), axis=1) 236 | self.mask_output2 = tf.argmax(tf.reduce_max(outer_masked, axis=1), axis=1) 237 | 238 | def complie(self): 239 | # self.opt = AdaMaxOptimizer(learning_rate=self.lr, beta1=0.9, beta2=0.999, epsilon=1e-7) 240 | self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7) 241 | grads = self.opt.compute_gradients(self.loss) 242 | gradients, variables = zip(*grads) 243 | capped_grads, _ = tf.clip_by_global_norm(gradients, self.grad_clip) 244 | self.train_op = self.opt.apply_gradients(zip(capped_grads, variables), global_step=self.global_step) 245 | 246 | # EMA 247 | with tf.variable_scope("EMA_Weights"): 248 | if self.decay is not None and self.decay < 1.: 249 | self.var_ema = tf.train.ExponentialMovingAverage(self.decay) 250 | with tf.control_dependencies([self.train_op]): 251 | self.ema_train_op = self.var_ema.apply( 252 | list(set(tf.trainable_variables()) ^ set(tf.trainable_variables('Cove_Layer')))) 253 | # assign ema weights 254 | self.assign_vars = [] 255 | for var in tf.global_variables(): 256 | v = self.var_ema.average(var) 257 | if v is not None: 258 | self.assign_vars.append(tf.assign(var, v)) 259 
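
The output layer decodes a span from the outer product of the start and end distributions: the band/upper triangle keeps end >= start, and spans longer than 5 tokens are damped by 1/log(length), as in the `position_encoding` mask above. A NumPy re-implementation of that decoding for a single example (illustration only):

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def decode_span(logits1, logits2, penalty_from=5):
    p_start, p_end = softmax(logits1), softmax(logits2)
    outer = np.outer(p_start, p_end)        # joint score for every (start, end) pair
    outer = np.triu(outer)                  # keep end >= start (band_part with ans_limit=-1)
    L = len(logits1)
    penalty = np.ones((L, L))
    for i in range(L):
        for j in range(L):
            if j - i > penalty_from:
                penalty[i, j] = 1.0 / np.log(j - i + 1)   # damp very long spans
    outer = outer * penalty
    start = int(outer.max(axis=1).argmax())
    end = int(outer.max(axis=0).argmax())
    return start, end

rng = np.random.RandomState(0)
s, e = decode_span(rng.randn(12), rng.randn(12))
print(s, e)   # start <= end by construction
```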
| 260 | # import numpy as np 261 | # 262 | # config = { 263 | # 'char_dim': 64, 264 | # 'cont_limit': 400, 265 | # 'ques_limit': 50, 266 | # 'char_limit': 16, 267 | # 'ans_limit': -1, 268 | # 'filters': 256, 269 | # 'dropout': 0.1, 270 | # 'dropout_emb': 0.1, 271 | # 'l2_norm': 3e-7, 272 | # 'decay': 0.9999, 273 | # 'gamma_c': 1.0, 274 | # 'gamma_b': 0.3, 275 | # 'learning_rate': 1e-3, 276 | # 'grad_clip': 5.0, 277 | # 'init_lambda': 3.0, 278 | # 'loss_type': 'use_plausible', 279 | # 'use_elmo': 0, 280 | # 'use_cove': 0, 281 | # 'use_feat': True, 282 | # 'optimizer': 'adam', 283 | # 'use_rlloss': False, 284 | # 'cove_path': 'Keras_CoVe_2layers.h5', 285 | # 'elmo_weights_path': 'elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 286 | # 'elmo_options_path': 'elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 287 | # 'train_tfrecords': 'tfrecords/train_pre_elmo_cove.tfrecords', 288 | # 'dev_tfrecords': 'tfrecords/dev_pre_elmo_cove.tfrecords', 289 | # 'batch_size': 24, 290 | # 'epoch': 40, 291 | # 'origin_path': None, # not finetune 292 | # 'path': 'QANetV253' 293 | # } 294 | # word_mat = np.random.random((90950, 300)).astype(np.float32) 295 | # char_mat2 = np.random.random((94, 300)).astype(np.float32) 296 | # char_mat = np.random.random((1171, 300)).astype(np.float32) 297 | # model = Model(config, word_mat, char_mat, char_mat2) 298 | -------------------------------------------------------------------------------- /RMR_modelV6.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from layers import total_params, align_block, summary_vector, start_logits, end_logits, BiLSTM, ElmoAttention, \ 3 | ElmoCombineLayer, CoveCombineLayer 4 | from bilm import BidirectionalLanguageModel, all_layers 5 | from keras.models import load_model 6 | from loss import rl_loss 7 | import numpy as np 8 | from util.Adamax import AdaMaxOptimizer 9 | 10 | class Model(object): 11 | def __init__(self, config, word_mat=None, char_mat_trainable=None, char_mat_fix=None, test=False): 12 | 13 | # hyper-parameter 14 | self.char_dim = config['char_dim'] 15 | self.cont_limit = config['cont_limit'] if not test else 1000 16 | self.ques_limit = config['ques_limit'] if not test else 50 17 | self.char_limit = config['char_limit'] 18 | self.ans_limit = config['ans_limit'] 19 | self.filters = config['filters'] 20 | self.char_filters = config['char_filters'] 21 | self.batch_size = config['batch_size'] 22 | self.l2_norm = config['l2_norm'] 23 | self.decay = config['decay'] 24 | self.learning_rate = config['learning_rate'] 25 | self.grad_clip = config['grad_clip'] 26 | self.init_lambda = config['init_lambda'] 27 | self.gamma_b = config['gamma_b'] 28 | self.gamma_c = config['gamma_c'] 29 | self.use_elmo = config['use_elmo'] 30 | self.use_cove = config['use_cove'] 31 | self.use_feat = config['use_feat'] 32 | self.use_rlloss = config['use_rlloss'] 33 | self.dropout = tf.placeholder_with_default(0.0, (), name="dropout") 34 | self.dropout_rnn = tf.placeholder_with_default(0.0, (), name="dropout_rnn") 35 | self.dropout_emb = tf.placeholder_with_default(0.0, (), name="dropout_emb") 36 | self.dropout_att = tf.placeholder_with_default(0.0, (), name="dropout_att") 37 | self.un_size = tf.placeholder_with_default(self.batch_size, (), name="un_size") 38 | self.rlw = tf.placeholder_with_default(0.0, (), name="rlloss_weights") 39 | 40 | # embedding layer 41 | self.word_mat = tf.get_variable("word_mat", initializer=tf.constant(word_mat, 
dtype=tf.float32), trainable=False) 42 | with tf.variable_scope("Input_Embedding_Mat"): 43 | self.char_mat = tf.get_variable("char_mat", initializer=np.concatenate([char_mat_trainable, char_mat_fix], axis=0), trainable=True) 44 | 45 | # input tensor 46 | self.contw_input = tf.placeholder(tf.int32, [None, None], "context_word") 47 | self.quesw_input = tf.placeholder(tf.int32, [None, None], "question_word") 48 | self.contc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "context_char") 49 | self.quesc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "question_char") 50 | self.y_start = tf.placeholder(tf.int32, [None, None], "answer_start_index") 51 | self.y_end = tf.placeholder(tf.int32, [None, None], "answer_end_index") 52 | self.contw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'contw_elmo_id') 53 | self.quesw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'quesw_elmo_id') 54 | if self.use_feat: 55 | self.cont_feat = tf.placeholder(tf.float32, [None, None, 73], "cont_feat") 56 | self.ques_feat = tf.placeholder(tf.float32, [None, None, 73], "ques_feat") 57 | self.old_char_mat = tf.placeholder(tf.float32, [None, None], "old_char_mat") 58 | self.assign_char_mat = tf.assign(self.char_mat, self.old_char_mat) 59 | 60 | # get mask & length for words & chars 61 | self.c_mask = tf.cast(self.contw_input, tf.bool) 62 | self.q_mask = tf.cast(self.quesw_input, tf.bool) 63 | self.cont_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1) 64 | self.ques_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 65 | 66 | # slice for maxlen in each batch 67 | self.c_maxlen = tf.reduce_max(self.cont_len) 68 | self.q_maxlen = tf.reduce_max(self.ques_len) 69 | 70 | # elmo features 71 | if self.use_elmo == 2: 72 | options_file = config['elmo_options_path'] 73 | weight_file = config['elmo_weights_path'] 74 | bilm = BidirectionalLanguageModel(options_file, weight_file) 75 | self.elmo_cont = all_layers(bilm(self.contw_elmo_id)) # [bs, 3, len, 1024] 76 | self.elmo_cont = tf.transpose(self.elmo_cont, [0, 2, 1, 3]) # [bs, len, 3, 1024] 77 | self.elmo_ques = all_layers(bilm(self.quesw_elmo_id)) 78 | self.elmo_ques = tf.transpose(self.elmo_ques, [0, 2, 1, 3]) 79 | elif self.use_elmo == 1: 80 | self.elmo_cont = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_cont') 81 | self.elmo_ques = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_ques') 82 | 83 | if self.use_cove == 2: 84 | with tf.variable_scope('Cove_Layer'): 85 | self.cove_model = load_model(config['cove_path']) 86 | elif self.use_cove == 1: 87 | self.cove_cont = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_cont') 88 | self.cove_ques = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_ques') 89 | 90 | # lr schedule 91 | self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32, 92 | initializer=tf.constant_initializer(0), trainable=False) 93 | 94 | self.learning_rate = tf.placeholder_with_default(config['learning_rate'], (), name="learning_rate") 95 | self.lr = self.learning_rate 96 | # self.lr = tf.minimum(self.learning_rate, 97 | # self.learning_rate / tf.log(999.) 
* tf.log(tf.cast(self.global_step, tf.float32) + 1)) 98 | 99 | # initial model & complie 100 | self.build_model() 101 | total_params() 102 | self.complie() 103 | 104 | def build_model(self): 105 | with tf.variable_scope("Input_Embedding_Layer"): 106 | with tf.variable_scope("Char_Embedding_Layer"): 107 | # char embedding 108 | ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.contc_input), [-1, self.char_limit, self.char_dim]) 109 | qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.quesc_input), [-1, self.char_limit, self.char_dim]) 110 | ch_emb = tf.nn.dropout(ch_emb, 1 - self.dropout_emb) 111 | qh_emb = tf.nn.dropout(qh_emb, 1 - self.dropout_emb) 112 | 113 | ch_emb, qh_emb = BiLSTM([ch_emb, qh_emb], self.char_filters // 2, dropout=self.dropout_rnn, name='char_lstm') 114 | ch_emb = tf.reduce_max(ch_emb, axis=1) 115 | qh_emb = tf.reduce_max(qh_emb, axis=1) 116 | ch_emb = tf.reshape(ch_emb, [-1, self.c_maxlen, self.char_filters]) 117 | qh_emb = tf.reshape(qh_emb, [-1, self.q_maxlen, self.char_filters]) 118 | 119 | with tf.variable_scope("Word_Embedding_Layer"): 120 | # word embedding 121 | c_emb = tf.nn.embedding_lookup(self.word_mat, self.contw_input) 122 | q_emb = tf.nn.embedding_lookup(self.word_mat, self.quesw_input) 123 | c_emb = tf.nn.dropout(c_emb, 1.0 - self.dropout_emb) 124 | q_emb = tf.nn.dropout(q_emb, 1.0 - self.dropout_emb) 125 | 126 | # cove features 127 | if self.use_cove != 0: 128 | if self.use_cove == 2: 129 | self.cove_cont = tf.stop_gradient(self.cove_model(c_emb)) # [bs, c_len, 2, 600] 130 | self.cove_ques = tf.stop_gradient(self.cove_model(q_emb)) # [bs, q_len, 2, 600] 131 | with tf.variable_scope('Cove_weights', reuse=tf.AUTO_REUSE): 132 | cove_context_input = CoveCombineLayer(self.cove_cont, 'input') 133 | cove_question_input = CoveCombineLayer(self.cove_ques, 'input') 134 | c_emb = tf.concat([c_emb, cove_context_input], axis=-1) 135 | q_emb = tf.concat([q_emb, cove_question_input], axis=-1) 136 | 137 | # elmo features 138 | if self.use_elmo != 0: 139 | with tf.variable_scope('ELMo_weights', reuse=tf.AUTO_REUSE): 140 | elmo_context_input = ElmoCombineLayer(self.elmo_cont, 'input') 141 | elmo_question_input = ElmoCombineLayer(self.elmo_ques, 'input') 142 | elmo_context_output = ElmoCombineLayer(self.elmo_cont, 'output') 143 | elmo_question_output = ElmoCombineLayer(self.elmo_ques, 'output') 144 | c_emb = tf.concat([c_emb, elmo_context_input], axis=-1) 145 | q_emb = tf.concat([q_emb, elmo_question_input], axis=-1) 146 | 147 | if self.use_feat: 148 | c_emb = tf.concat([c_emb, self.cont_feat], axis=-1) 149 | q_emb = tf.concat([q_emb, self.ques_feat], axis=-1) 150 | 151 | # combine embedding feats 152 | c_emb = tf.concat([c_emb, ch_emb], axis=-1) 153 | q_emb = tf.concat([q_emb, qh_emb], axis=-1) 154 | 155 | # BiLSTM Embedding 156 | with tf.variable_scope("BiLSTM_Embedding_Layer"): 157 | c_emb, q_emb = BiLSTM([c_emb, q_emb], self.filters // 2, dropout=self.dropout_rnn, name='encoder') 158 | 159 | with tf.variable_scope("Iterative_Reattention_Aligner"): 160 | self.Lambda = tf.get_variable('Lambda', dtype=tf.float32, initializer=self.init_lambda) 161 | with tf.variable_scope("Aligning_Block1"): 162 | R, Z1, E, B = align_block(u=c_emb, 163 | v=q_emb, 164 | c_mask=self.c_mask, 165 | q_mask=self.q_mask, 166 | Lambda=self.Lambda, 167 | filters=self.filters, 168 | dropout=self.dropout_rnn) 169 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 170 | with tf.variable_scope("Aligning_Block2"): 171 | R, Z2, E, B = align_block(u=R, 172 | v=q_emb, 173 | 
c_mask=self.c_mask, 174 | q_mask=self.q_mask, 175 | E_0=E, 176 | B_0=B, 177 | Lambda=self.Lambda, 178 | filters=self.filters, 179 | dropout=self.dropout_rnn) 180 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 181 | with tf.variable_scope("Aligning_Block3"): 182 | R, Z3, E, B = align_block(u=R, 183 | v=q_emb, 184 | c_mask=self.c_mask, 185 | q_mask=self.q_mask, 186 | E_0=E, 187 | B_0=B, 188 | Z_0=[Z1, Z2], 189 | Lambda=self.Lambda, 190 | filters=self.filters, 191 | dropout=self.dropout_rnn) 192 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 193 | 194 | with tf.variable_scope("Answer_Pointer"): 195 | # logits 196 | if self.use_elmo != 0: 197 | elmo_output_feats = ElmoAttention([elmo_context_output, elmo_question_output], 198 | self.c_maxlen, self.q_maxlen, self.q_mask, self.dropout) 199 | R = tf.concat([R, elmo_output_feats], axis=-1) 200 | s = summary_vector(q_emb, self.c_maxlen, mask=self.q_mask) 201 | s = tf.nn.dropout(s, 1 - self.dropout) 202 | logits1 = start_logits(R, s, mask=self.c_mask, filters=self.filters, name='Start_Pointer') # [bs, c_len] 203 | logits2 = end_logits(R, logits1, s, mask=self.c_mask, filters=self.filters, name='End_Pointer') # [bs, c_len] 204 | 205 | with tf.variable_scope("Loss_Layer"): 206 | # maximum-likelihood (ML) loss for dataset V2.0 207 | start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1, labels=self.y_start) 208 | end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2, labels=self.y_end) 209 | self.loss = tf.reduce_mean(start_loss + end_loss) 210 | 211 | # l2 loss 212 | if self.l2_norm is not None: 213 | decay_costs = [] 214 | for var in tf.trainable_variables(): 215 | decay_costs.append(tf.nn.l2_loss(var)) 216 | self.loss += tf.multiply(self.l2_norm, tf.add_n(decay_costs)) 217 | 218 | # RL loss 219 | if self.use_rlloss: 220 | with tf.variable_scope("Reinforcement_Loss"): 221 | self.rl_loss, _, _ = rl_loss(logits1, logits2, self.y_start, self.y_end, self.c_maxlen) 222 | self.loss += (self.rlw * self.rl_loss) 223 | 224 | with tf.variable_scope('Output_Layer'): 225 | softmax_start_scores = tf.nn.softmax(logits1) 226 | softmax_end_scores = tf.nn.softmax(logits2) 227 | 228 | outer = tf.matmul(tf.expand_dims(softmax_start_scores, axis=2), 229 | tf.expand_dims(softmax_end_scores, axis=1)) 230 | outer = tf.matrix_band_part(outer, 0, self.ans_limit) 231 | 232 | def position_encoding(x): 233 | import math 234 | for i in range(x.shape[0]): 235 | for j in range(x.shape[1]): 236 | if j - i > 5: 237 | x[i][j] = float(1.0 / math.log(j - i + 1)) 238 | return x 239 | 240 | mask_mat = tf.ones((self.c_maxlen, self.c_maxlen)) 241 | mask_mat = tf.expand_dims(tf.py_func(position_encoding, [mask_mat], tf.float32), axis=0) 242 | mask_mat = tf.tile(mask_mat, [self.un_size, 1, 1]) 243 | 244 | outer_masked = outer * mask_mat 245 | self.mask_output1 = tf.argmax(tf.reduce_max(outer_masked, axis=2), axis=1) 246 | self.mask_output2 = tf.argmax(tf.reduce_max(outer_masked, axis=1), axis=1) 247 | 248 | def complie(self): 249 | # self.opt = AdaMaxOptimizer(learning_rate=self.lr, beta1=0.9, beta2=0.999, epsilon=1e-7) 250 | self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7) 251 | grads = self.opt.compute_gradients(self.loss) 252 | gradients, variables = zip(*grads) 253 | capped_grads, _ = tf.clip_by_global_norm(gradients, self.grad_clip) 254 | self.train_op = self.opt.apply_gradients(zip(capped_grads, variables), global_step=self.global_step) 255 | 256 | # EMA 257 | with tf.variable_scope("EMA_Weights"): 258 | if 
self.decay is not None and self.decay < 1.: 259 | self.var_ema = tf.train.ExponentialMovingAverage(self.decay) 260 | with tf.control_dependencies([self.train_op]): 261 | self.ema_train_op = self.var_ema.apply( 262 | list(set(tf.trainable_variables()) ^ set(tf.trainable_variables('Cove_Layer')))) 263 | # assign ema weights 264 | self.assign_vars = [] 265 | for var in tf.global_variables(): 266 | v = self.var_ema.average(var) 267 | if v is not None: 268 | self.assign_vars.append(tf.assign(var, v)) 269 | 270 | 271 | # import numpy as np 272 | # 273 | # config = { 274 | # 'char_dim': 64, 275 | # 'cont_limit': 400, 276 | # 'ques_limit': 50, 277 | # 'char_limit': 16, 278 | # 'ans_limit': -1, 279 | # 'filters': 256, 280 | # 'dropout': 0.1, 281 | # 'dropout_emb': 0.1, 282 | # 'l2_norm': 3e-7, 283 | # 'decay': 0.9999, 284 | # 'gamma_c': 1.0, 285 | # 'gamma_b': 0.3, 286 | # 'learning_rate': 1e-3, 287 | # 'grad_clip': 5.0, 288 | # 'init_lambda': 3.0, 289 | # 'loss_type': 'use_plausible', 290 | # 'use_elmo': 0, 291 | # 'use_cove': 0, 292 | # 'use_feat': True, 293 | # 'optimizer': 'adam', 294 | # 'use_rlloss': False, 295 | # 'cove_path': 'Keras_CoVe_2layers.h5', 296 | # 'elmo_weights_path': 'elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 297 | # 'elmo_options_path': 'elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 298 | # 'train_tfrecords': 'tfrecords/train_pre_elmo_cove.tfrecords', 299 | # 'dev_tfrecords': 'tfrecords/dev_pre_elmo_cove.tfrecords', 300 | # 'batch_size': 24, 301 | # 'epoch': 40, 302 | # 'origin_path': None, # not finetune 303 | # 'path': 'QANetV253' 304 | # } 305 | # word_mat = np.random.random((90950, 300)).astype(np.float32) 306 | # char_mat2 = np.random.random((94, 300)).astype(np.float32) 307 | # char_mat = np.random.random((1171, 300)).astype(np.float32) 308 | # model = Model(config, word_mat, char_mat, char_mat2) 309 | -------------------------------------------------------------------------------- /bilm/data.py: -------------------------------------------------------------------------------- 1 | # originally based on https://github.com/tensorflow/models/tree/master/lm_1b 2 | import glob 3 | import random 4 | 5 | import numpy as np 6 | 7 | from typing import List 8 | 9 | 10 | class Vocabulary(object): 11 | ''' 12 | A token vocabulary. Holds a map from token to ids and provides 13 | a method for encoding text to a sequence of ids. 14 | ''' 15 | def __init__(self, filename, validate_file=False): 16 | ''' 17 | filename = the vocabulary file. It is a flat text file with one 18 | (normalized) token per line. In addition, the file should also 19 | contain the special tokens , , (case sensitive). 
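In the upstream bilm-tf vocabulary format these special tokens are the
literal strings <S>, </S> and <UNK>, each on its own line.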
20 | ''' 21 | self._id_to_word = [] 22 | self._word_to_id = {} 23 | self._unk = -1 24 | self._bos = -1 25 | self._eos = -1 26 | 27 | with open(filename) as f: 28 | idx = 0 29 | for line in f: 30 | word_name = line.strip() 31 | if word_name == '': 32 | self._bos = idx 33 | elif word_name == '': 34 | self._eos = idx 35 | elif word_name == '': 36 | self._unk = idx 37 | if word_name == '!!!MAXTERMID': 38 | continue 39 | 40 | self._id_to_word.append(word_name) 41 | self._word_to_id[word_name] = idx 42 | idx += 1 43 | 44 | # check to ensure file has special tokens 45 | if validate_file: 46 | if self._bos == -1 or self._eos == -1 or self._unk == -1: 47 | raise ValueError("Ensure the vocabulary file has " 48 | ", , tokens") 49 | 50 | @property 51 | def bos(self): 52 | return self._bos 53 | 54 | @property 55 | def eos(self): 56 | return self._eos 57 | 58 | @property 59 | def unk(self): 60 | return self._unk 61 | 62 | @property 63 | def size(self): 64 | return len(self._id_to_word) 65 | 66 | def word_to_id(self, word): 67 | if word in self._word_to_id: 68 | return self._word_to_id[word] 69 | return self.unk 70 | 71 | def id_to_word(self, cur_id): 72 | return self._id_to_word[cur_id] 73 | 74 | def decode(self, cur_ids): 75 | """Convert a list of ids to a sentence, with space inserted.""" 76 | return ' '.join([self.id_to_word(cur_id) for cur_id in cur_ids]) 77 | 78 | def encode(self, sentence, reverse=False, split=True): 79 | """Convert a sentence to a list of ids, with special tokens added. 80 | Sentence is a single string with tokens separated by whitespace. 81 | 82 | If reverse, then the sentence is assumed to be reversed, and 83 | this method will swap the BOS/EOS tokens appropriately.""" 84 | 85 | if split: 86 | word_ids = [ 87 | self.word_to_id(cur_word) for cur_word in sentence.split() 88 | ] 89 | else: 90 | word_ids = [self.word_to_id(cur_word) for cur_word in sentence] 91 | 92 | if reverse: 93 | return np.array([self.eos] + word_ids + [self.bos], dtype=np.int32) 94 | else: 95 | return np.array([self.bos] + word_ids + [self.eos], dtype=np.int32) 96 | 97 | 98 | class UnicodeCharsVocabulary(Vocabulary): 99 | """Vocabulary containing character-level and word level information. 100 | 101 | Has a word vocabulary that is used to lookup word ids and 102 | a character id that is used to map words to arrays of character ids. 103 | 104 | The character ids are defined by ord(c) for c in word.encode('utf-8') 105 | This limits the total number of possible char ids to 256. 106 | To this we add 5 additional special ids: begin sentence, end sentence, 107 | begin word, end word and padding. 108 | 109 | WARNING: for prediction, we add +1 to the output ids from this 110 | class to create a special padding id (=0). As a result, we suggest 111 | you use the `Batcher`, `TokenBatcher`, and `LMDataset` classes instead 112 | of this lower level class. If you are using this lower level class, 113 | then be sure to add the +1 appropriately, otherwise embeddings computed 114 | from the pre-trained model will be useless. 
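For example, with max_word_length=5 the word 'hi' is mapped by
_convert_word_to_char_ids below to [258, 104, 105, 259, 260]
(begin-of-word, byte 'h', byte 'i', end-of-word, padding); the Batcher then
adds 1 to every id so that 0 is free to act as the mask value.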
115 | """ 116 | def __init__(self, filename, max_word_length, **kwargs): 117 | super(UnicodeCharsVocabulary, self).__init__(filename, **kwargs) 118 | self._max_word_length = max_word_length 119 | 120 | # char ids 0-255 come from utf-8 encoding bytes 121 | # assign 256-300 to special chars 122 | self.bos_char = 256 # 123 | self.eos_char = 257 # 124 | self.bow_char = 258 # 125 | self.eow_char = 259 # 126 | self.pad_char = 260 # 127 | 128 | num_words = len(self._id_to_word) 129 | 130 | self._word_char_ids = np.zeros([num_words, max_word_length], 131 | dtype=np.int32) 132 | 133 | # the charcter representation of the begin/end of sentence characters 134 | def _make_bos_eos(c): 135 | r = np.zeros([self.max_word_length], dtype=np.int32) 136 | r[:] = self.pad_char 137 | r[0] = self.bow_char 138 | r[1] = c 139 | r[2] = self.eow_char 140 | return r 141 | self.bos_chars = _make_bos_eos(self.bos_char) 142 | self.eos_chars = _make_bos_eos(self.eos_char) 143 | 144 | for i, word in enumerate(self._id_to_word): 145 | self._word_char_ids[i] = self._convert_word_to_char_ids(word) 146 | 147 | self._word_char_ids[self.bos] = self.bos_chars 148 | self._word_char_ids[self.eos] = self.eos_chars 149 | # TODO: properly handle 150 | 151 | @property 152 | def word_char_ids(self): 153 | return self._word_char_ids 154 | 155 | @property 156 | def max_word_length(self): 157 | return self._max_word_length 158 | 159 | def _convert_word_to_char_ids(self, word): 160 | code = np.zeros([self.max_word_length], dtype=np.int32) 161 | code[:] = self.pad_char 162 | 163 | word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length-2)] 164 | code[0] = self.bow_char 165 | k=0 166 | for k, chr_id in enumerate(word_encoded, start=1): 167 | code[k] = chr_id 168 | code[k + 1] = self.eow_char 169 | 170 | return code 171 | 172 | def word_to_char_ids(self, word): 173 | if word in self._word_to_id: 174 | return self._word_char_ids[self._word_to_id[word]] 175 | else: 176 | return self._convert_word_to_char_ids(word) 177 | 178 | def encode_chars(self, sentence, reverse=False, split=True): 179 | ''' 180 | Encode the sentence as a white space delimited string of tokens. 181 | ''' 182 | if split: 183 | chars_ids = [self.word_to_char_ids(cur_word) 184 | for cur_word in sentence.split()] 185 | else: 186 | chars_ids = [self.word_to_char_ids(cur_word) 187 | for cur_word in sentence] 188 | if reverse: 189 | return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars]) 190 | else: 191 | return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars]) 192 | 193 | 194 | class Batcher(object): 195 | ''' 196 | Batch sentences of tokenized text into character id matrices. 197 | ''' 198 | def __init__(self, lm_vocab_file: str, max_token_length: int): 199 | ''' 200 | lm_vocab_file = the language model vocabulary file (one line per 201 | token) 202 | max_token_length = the maximum number of characters in each token 203 | ''' 204 | self._lm_vocab = UnicodeCharsVocabulary( 205 | lm_vocab_file, max_token_length 206 | ) 207 | self._max_token_length = max_token_length 208 | 209 | def batch_sentences(self, sentences: List[List[str]]): 210 | ''' 211 | Batch the sentences as character ids 212 | Each sentence is a list of tokens without or , e.g. 
213 | [['The', 'first', 'sentence', '.'], ['Second', '.']] 214 | ''' 215 | n_sentences = len(sentences) 216 | max_length = max(len(sentence) for sentence in sentences) + 2 217 | 218 | X_char_ids = np.zeros( 219 | (n_sentences, max_length, self._max_token_length), 220 | dtype=np.int64 221 | ) 222 | 223 | for k, sent in enumerate(sentences): 224 | length = len(sent) + 2 225 | char_ids_without_mask = self._lm_vocab.encode_chars( 226 | sent, split=False) 227 | # add one so that 0 is the mask value 228 | X_char_ids[k, :length, :] = char_ids_without_mask + 1 229 | 230 | return X_char_ids 231 | 232 | 233 | class TokenBatcher(object): 234 | ''' 235 | Batch sentences of tokenized text into token id matrices. 236 | ''' 237 | def __init__(self, lm_vocab_file: str): 238 | ''' 239 | lm_vocab_file = the language model vocabulary file (one line per 240 | token) 241 | ''' 242 | self._lm_vocab = Vocabulary(lm_vocab_file) 243 | 244 | def batch_sentences(self, sentences: List[List[str]]): 245 | ''' 246 | Batch the sentences as character ids 247 | Each sentence is a list of tokens without or , e.g. 248 | [['The', 'first', 'sentence', '.'], ['Second', '.']] 249 | ''' 250 | n_sentences = len(sentences) 251 | max_length = max(len(sentence) for sentence in sentences) + 2 252 | 253 | X_ids = np.zeros((n_sentences, max_length), dtype=np.int64) 254 | 255 | for k, sent in enumerate(sentences): 256 | length = len(sent) + 2 257 | ids_without_mask = self._lm_vocab.encode(sent, split=False) 258 | # add one so that 0 is the mask value 259 | X_ids[k, :length] = ids_without_mask + 1 260 | 261 | return X_ids 262 | 263 | 264 | ##### for training 265 | def _get_batch(generator, batch_size, num_steps, max_word_length): 266 | """Read batches of input.""" 267 | cur_stream = [None] * batch_size 268 | 269 | no_more_data = False 270 | while True: 271 | inputs = np.zeros([batch_size, num_steps], np.int32) 272 | if max_word_length is not None: 273 | char_inputs = np.zeros([batch_size, num_steps, max_word_length], 274 | np.int32) 275 | else: 276 | char_inputs = None 277 | targets = np.zeros([batch_size, num_steps], np.int32) 278 | 279 | for i in range(batch_size): 280 | cur_pos = 0 281 | 282 | while cur_pos < num_steps: 283 | if cur_stream[i] is None or len(cur_stream[i][0]) <= 1: 284 | try: 285 | cur_stream[i] = list(next(generator)) 286 | except StopIteration: 287 | # No more data, exhaust current streams and quit 288 | no_more_data = True 289 | break 290 | 291 | how_many = min(len(cur_stream[i][0]) - 1, num_steps - cur_pos) 292 | next_pos = cur_pos + how_many 293 | 294 | inputs[i, cur_pos:next_pos] = cur_stream[i][0][:how_many] 295 | if max_word_length is not None: 296 | char_inputs[i, cur_pos:next_pos] = cur_stream[i][1][ 297 | :how_many] 298 | targets[i, cur_pos:next_pos] = cur_stream[i][0][1:how_many+1] 299 | 300 | cur_pos = next_pos 301 | 302 | cur_stream[i][0] = cur_stream[i][0][how_many:] 303 | if max_word_length is not None: 304 | cur_stream[i][1] = cur_stream[i][1][how_many:] 305 | 306 | if no_more_data: 307 | # There is no more data. Note: this will not return data 308 | # for the incomplete batch 309 | break 310 | 311 | X = {'token_ids': inputs, 'tokens_characters': char_inputs, 312 | 'next_token_id': targets} 313 | 314 | yield X 315 | 316 | class LMDataset(object): 317 | """ 318 | Hold a language model dataset. 319 | 320 | A dataset is a list of tokenized files. Each file contains one sentence 321 | per line. Each sentence is pre-tokenized and white space joined. 
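For example, one line of a shard file could read:
    The quick brown fox jumped over the lazy dog .
with tokens separated by single spaces, one sentence per line.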
322 | """ 323 | def __init__(self, filepattern, vocab, reverse=False, test=False, 324 | shuffle_on_load=False): 325 | ''' 326 | filepattern = a glob string that specifies the list of files. 327 | vocab = an instance of Vocabulary or UnicodeCharsVocabulary 328 | reverse = if True, then iterate over tokens in each sentence in reverse 329 | test = if True, then iterate through all data once then stop. 330 | Otherwise, iterate forever. 331 | shuffle_on_load = if True, then shuffle the sentences after loading. 332 | ''' 333 | self._vocab = vocab 334 | self._all_shards = glob.glob(filepattern) 335 | print('Found %d shards at %s' % (len(self._all_shards), filepattern)) 336 | self._shards_to_choose = [] 337 | 338 | self._reverse = reverse 339 | self._test = test 340 | self._shuffle_on_load = shuffle_on_load 341 | self._use_char_inputs = hasattr(vocab, 'encode_chars') 342 | 343 | self._ids = self._load_random_shard() 344 | 345 | def _choose_random_shard(self): 346 | if len(self._shards_to_choose) == 0: 347 | self._shards_to_choose = list(self._all_shards) 348 | random.shuffle(self._shards_to_choose) 349 | shard_name = self._shards_to_choose.pop() 350 | return shard_name 351 | 352 | def _load_random_shard(self): 353 | """Randomly select a file and read it.""" 354 | if self._test: 355 | if len(self._all_shards) == 0: 356 | # we've loaded all the data 357 | # this will propogate up to the generator in get_batch 358 | # and stop iterating 359 | raise StopIteration 360 | else: 361 | shard_name = self._all_shards.pop() 362 | else: 363 | # just pick a random shard 364 | shard_name = self._choose_random_shard() 365 | 366 | ids = self._load_shard(shard_name) 367 | self._i = 0 368 | self._nids = len(ids) 369 | return ids 370 | 371 | def _load_shard(self, shard_name): 372 | """Read one file and convert to ids. 373 | 374 | Args: 375 | shard_name: file path. 376 | 377 | Returns: 378 | list of (id, char_id) tuples. 379 | """ 380 | print('Loading data from: %s' % shard_name) 381 | with open(shard_name) as f: 382 | sentences_raw = f.readlines() 383 | 384 | if self._reverse: 385 | sentences = [] 386 | for sentence in sentences_raw: 387 | splitted = sentence.split() 388 | splitted.reverse() 389 | sentences.append(' '.join(splitted)) 390 | else: 391 | sentences = sentences_raw 392 | 393 | if self._shuffle_on_load: 394 | random.shuffle(sentences) 395 | 396 | ids = [self.vocab.encode(sentence, self._reverse) 397 | for sentence in sentences] 398 | if self._use_char_inputs: 399 | chars_ids = [self.vocab.encode_chars(sentence, self._reverse) 400 | for sentence in sentences] 401 | else: 402 | chars_ids = [None] * len(ids) 403 | 404 | print('Loaded %d sentences.' 
% len(ids)) 405 | print('Finished loading') 406 | return list(zip(ids, chars_ids)) 407 | 408 | def get_sentence(self): 409 | while True: 410 | if self._i == self._nids: 411 | self._ids = self._load_random_shard() 412 | ret = self._ids[self._i] 413 | self._i += 1 414 | yield ret 415 | 416 | @property 417 | def max_word_length(self): 418 | if self._use_char_inputs: 419 | return self._vocab.max_word_length 420 | else: 421 | return None 422 | 423 | def iter_batches(self, batch_size, num_steps): 424 | for X in _get_batch(self.get_sentence(), batch_size, num_steps, 425 | self.max_word_length): 426 | 427 | # token_ids = (batch_size, num_steps) 428 | # char_inputs = (batch_size, num_steps, 50) of character ids 429 | # targets = word ID of next word (batch_size, num_steps) 430 | yield X 431 | 432 | @property 433 | def vocab(self): 434 | return self._vocab 435 | 436 | class BidirectionalLMDataset(object): 437 | def __init__(self, filepattern, vocab, test=False, shuffle_on_load=False): 438 | ''' 439 | bidirectional version of LMDataset 440 | ''' 441 | self._data_forward = LMDataset( 442 | filepattern, vocab, reverse=False, test=test, 443 | shuffle_on_load=shuffle_on_load) 444 | self._data_reverse = LMDataset( 445 | filepattern, vocab, reverse=True, test=test, 446 | shuffle_on_load=shuffle_on_load) 447 | 448 | def iter_batches(self, batch_size, num_steps): 449 | max_word_length = self._data_forward.max_word_length 450 | 451 | for X, Xr in zip( 452 | _get_batch(self._data_forward.get_sentence(), batch_size, 453 | num_steps, max_word_length), 454 | _get_batch(self._data_reverse.get_sentence(), batch_size, 455 | num_steps, max_word_length) 456 | ): 457 | 458 | for k, v in Xr.items(): 459 | X[k + '_reverse'] = v 460 | 461 | yield X 462 | 463 | 464 | class InvalidNumberOfCharacters(Exception): 465 | pass 466 | 467 | -------------------------------------------------------------------------------- /RMR_modelV6_squad2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from layers import total_params, align_block, summary_vector, start_logits, end_logits, BiLSTM, ElmoAttention, \ 3 | ElmoCombineLayer, CoveCombineLayer 4 | from bilm import BidirectionalLanguageModel, all_layers 5 | from keras.models import load_model 6 | from loss import rl_loss 7 | import numpy as np 8 | 9 | 10 | class Model(object): 11 | def __init__(self, config, word_mat=None, char_mat_trainable=None, char_mat_fix=None, test=False): 12 | 13 | # hyper-parameter 14 | self.char_dim = config['char_dim'] 15 | self.cont_limit = config['cont_limit'] if not test else 1000 16 | self.ques_limit = config['ques_limit'] if not test else 50 17 | self.char_limit = config['char_limit'] 18 | self.ans_limit = config['ans_limit'] 19 | self.filters = config['filters'] 20 | self.batch_size = config['batch_size'] 21 | self.l2_norm = config['l2_norm'] 22 | self.decay = config['decay'] 23 | self.learning_rate = config['learning_rate'] 24 | self.grad_clip = config['grad_clip'] 25 | self.init_lambda = config['init_lambda'] 26 | self.gamma_b = config['gamma_b'] 27 | self.gamma_c = config['gamma_c'] 28 | self.use_elmo = config['use_elmo'] 29 | self.use_cove = config['use_cove'] 30 | self.use_feat = config['use_feat'] 31 | self.use_rlloss = config['use_rlloss'] 32 | self.dropout = tf.placeholder_with_default(0.0, (), name="dropout") 33 | self.dropout_rnn = tf.placeholder_with_default(0.0, (), name="dropout_rnn") 34 | self.dropout_emb = tf.placeholder_with_default(0.0, (), name="dropout_emb") 35 | 
self.dropout_att = tf.placeholder_with_default(0.0, (), name="dropout_att") 36 | self.un_size = tf.placeholder_with_default(self.batch_size, (), name="un_size") 37 | self.rlw = tf.placeholder_with_default(0.0, (), name="rlloss_weights") 38 | 39 | # embedding layer 40 | self.word_mat = tf.get_variable("word_mat", initializer=tf.constant(word_mat, dtype=tf.float32), 41 | trainable=False) 42 | with tf.variable_scope("Input_Embedding_Mat"): 43 | self.char_mat = tf.get_variable("char_mat", 44 | initializer=np.concatenate([char_mat_trainable, char_mat_fix], axis=0), 45 | trainable=True) 46 | 47 | # input tensor 48 | self.contw_input = tf.placeholder(tf.int32, [None, None], "context_word") 49 | self.quesw_input = tf.placeholder(tf.int32, [None, None], "question_word") 50 | self.contc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "context_char") 51 | self.quesc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "question_char") 52 | self.y_start = tf.placeholder(tf.int32, [None, None], "answer_start_index") 53 | self.y_end = tf.placeholder(tf.int32, [None, None], "answer_end_index") 54 | self.yp_start = tf.placeholder(tf.int32, [None, None], "plausible_answer_start_index") 55 | self.yp_end = tf.placeholder(tf.int32, [None, None], "plausible_answer_end_index") 56 | self.contw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'contw_elmo_id') 57 | self.quesw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'quesw_elmo_id') 58 | if self.use_feat: 59 | self.cont_feat = tf.placeholder(tf.float32, [None, None, 73], "cont_feat") 60 | self.ques_feat = tf.placeholder(tf.float32, [None, None, 73], "ques_feat") 61 | self.old_char_mat = tf.placeholder(tf.float32, [None, None], "old_char_mat") 62 | self.assign_char_mat = tf.assign(self.char_mat, self.old_char_mat) 63 | 64 | # get mask & length for words & chars 65 | self.c_mask = tf.cast(self.contw_input, tf.bool) 66 | self.q_mask = tf.cast(self.quesw_input, tf.bool) 67 | self.cont_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1) 68 | self.ques_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 69 | 70 | # slice for maxlen in each batch 71 | self.c_maxlen = tf.reduce_max(self.cont_len) 72 | self.q_maxlen = tf.reduce_max(self.ques_len) 73 | 74 | # elmo features 75 | if self.use_elmo == 2: 76 | options_file = config['elmo_options_path'] 77 | weight_file = config['elmo_weights_path'] 78 | bilm = BidirectionalLanguageModel(options_file, weight_file) 79 | self.elmo_cont = all_layers(bilm(self.contw_elmo_id)) # [bs, 3, len, 1024] 80 | self.elmo_cont = tf.transpose(self.elmo_cont, [0, 2, 1, 3]) # [bs, len, 3, 1024] 81 | self.elmo_ques = all_layers(bilm(self.quesw_elmo_id)) 82 | self.elmo_ques = tf.transpose(self.elmo_ques, [0, 2, 1, 3]) 83 | elif self.use_elmo == 1: 84 | self.elmo_cont = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_cont') 85 | self.elmo_ques = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_ques') 86 | 87 | if self.use_cove == 2: 88 | with tf.variable_scope('Cove_Layer'): 89 | self.cove_model = load_model(config['cove_path']) 90 | elif self.use_cove == 1: 91 | self.cove_cont = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_cont') 92 | self.cove_ques = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_ques') 93 | 94 | # lr schedule 95 | self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32, 96 | initializer=tf.constant_initializer(0), trainable=False) 97 | 98 | self.learning_rate = tf.placeholder_with_default(config['learning_rate'], (), 
name="learning_rate") 99 | self.lr = tf.minimum(self.learning_rate, 100 | self.learning_rate / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1)) 101 | 102 | # initial model & complie 103 | self.build_model() 104 | total_params() 105 | self.complie() 106 | 107 | def build_model(self): 108 | with tf.variable_scope("Input_Embedding_Layer"): 109 | with tf.variable_scope("Char_Embedding_Layer"): 110 | # char embedding 111 | ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.contc_input), 112 | [-1, self.char_limit, self.char_dim]) 113 | qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.quesc_input), 114 | [-1, self.char_limit, self.char_dim]) 115 | ch_emb = tf.nn.dropout(ch_emb, 1 - self.dropout_emb) 116 | qh_emb = tf.nn.dropout(qh_emb, 1 - self.dropout_emb) 117 | 118 | ch_emb, qh_emb = BiLSTM([ch_emb, qh_emb], self.char_dim // 2, dropout=self.dropout_rnn, 119 | name='char_lstm', return_state=True) 120 | ch_emb = tf.reshape(ch_emb, [-1, self.c_maxlen, self.char_dim]) 121 | qh_emb = tf.reshape(qh_emb, [-1, self.q_maxlen, self.char_dim]) 122 | 123 | with tf.variable_scope("Word_Embedding_Layer"): 124 | # word embedding 125 | c_emb = tf.nn.embedding_lookup(self.word_mat, self.contw_input) 126 | q_emb = tf.nn.embedding_lookup(self.word_mat, self.quesw_input) 127 | c_emb = tf.nn.dropout(c_emb, 1.0 - self.dropout_emb) 128 | q_emb = tf.nn.dropout(q_emb, 1.0 - self.dropout_emb) 129 | 130 | # cove features 131 | if self.use_cove != 0: 132 | if self.use_cove == 2: 133 | self.cove_cont = tf.stop_gradient(self.cove_model(c_emb)) # [bs, c_len, 2, 600] 134 | self.cove_ques = tf.stop_gradient(self.cove_model(q_emb)) # [bs, q_len, 2, 600] 135 | with tf.variable_scope('Cove_weights', reuse=tf.AUTO_REUSE): 136 | cove_context_input = CoveCombineLayer(self.cove_cont, 'input') 137 | cove_question_input = CoveCombineLayer(self.cove_ques, 'input') 138 | c_emb = tf.concat([c_emb, cove_context_input], axis=-1) 139 | q_emb = tf.concat([q_emb, cove_question_input], axis=-1) 140 | 141 | # elmo features 142 | if self.use_elmo != 0: 143 | with tf.variable_scope('ELMo_weights', reuse=tf.AUTO_REUSE): 144 | elmo_context_input = ElmoCombineLayer(self.elmo_cont, 'input') 145 | elmo_question_input = ElmoCombineLayer(self.elmo_ques, 'input') 146 | elmo_context_output = ElmoCombineLayer(self.elmo_cont, 'output') 147 | elmo_question_output = ElmoCombineLayer(self.elmo_ques, 'output') 148 | c_emb = tf.concat([c_emb, elmo_context_input], axis=-1) 149 | q_emb = tf.concat([q_emb, elmo_question_input], axis=-1) 150 | 151 | if self.use_feat: 152 | c_emb = tf.concat([c_emb, self.cont_feat], axis=-1) 153 | q_emb = tf.concat([q_emb, self.ques_feat], axis=-1) 154 | 155 | # combine embedding feats 156 | c_emb = tf.concat([c_emb, ch_emb], axis=-1) 157 | q_emb = tf.concat([q_emb, qh_emb], axis=-1) 158 | 159 | # BiLSTM Embedding 160 | with tf.variable_scope("BiLSTM_Embedding_Layer"): 161 | c_emb, q_emb = BiLSTM([c_emb, q_emb], self.filters // 2, dropout=self.dropout_rnn, name='encoder') 162 | 163 | with tf.variable_scope("Iterative_Reattention_Aligner"): 164 | self.Lambda = tf.get_variable('Lambda', dtype=tf.float32, initializer=self.init_lambda) 165 | with tf.variable_scope("Aligning_Block1"): 166 | R, Z1, E, B = align_block(u=c_emb, 167 | v=q_emb, 168 | c_mask=self.c_mask, 169 | q_mask=self.q_mask, 170 | Lambda=self.Lambda, 171 | filters=self.filters, 172 | dropout=self.dropout_rnn) 173 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 174 | with tf.variable_scope("Aligning_Block2"): 175 | R, Z2, E, B = 
align_block(u=R, 176 | v=q_emb, 177 | c_mask=self.c_mask, 178 | q_mask=self.q_mask, 179 | E_0=E, 180 | B_0=B, 181 | Lambda=self.Lambda, 182 | filters=self.filters, 183 | dropout=self.dropout_rnn) 184 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 185 | with tf.variable_scope("Aligning_Block3"): 186 | R, Z3, E, B = align_block(u=R, 187 | v=q_emb, 188 | c_mask=self.c_mask, 189 | q_mask=self.q_mask, 190 | E_0=E, 191 | B_0=B, 192 | Z_0=[Z1, Z2], 193 | Lambda=self.Lambda, 194 | filters=self.filters, 195 | dropout=self.dropout_rnn) 196 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 197 | 198 | with tf.variable_scope("Answer_Pointer"): 199 | # logits 200 | if self.use_elmo != 0: 201 | elmo_output_feats = ElmoAttention([elmo_context_output, elmo_question_output], 202 | self.c_maxlen, self.q_maxlen, self.q_mask, self.dropout) 203 | R = tf.concat([R, elmo_output_feats], axis=-1) 204 | s = summary_vector(q_emb, self.c_maxlen, mask=self.q_mask) 205 | s = tf.nn.dropout(s, 1 - self.dropout) 206 | logits1 = start_logits(R, s, mask=self.c_mask, filters=self.filters, name='Start_Pointer') # [bs, c_len] 207 | logits2 = end_logits(R, logits1, s, mask=self.c_mask, filters=self.filters, 208 | name='End_Pointer') # [bs, c_len] 209 | self.unanswer_bias = tf.get_variable("unanswer_bias", [1], initializer=tf.zeros_initializer()) 210 | self.unanswer_bias = tf.reshape(tf.tile(self.unanswer_bias, [self.un_size]), [-1, 1]) 211 | logits1 = tf.concat((self.unanswer_bias, logits1), axis=-1) 212 | logits2 = tf.concat((self.unanswer_bias, logits2), axis=-1) 213 | 214 | logits1p = start_logits(R, s, mask=self.c_mask, filters=self.filters, name='Start_Pointer2') # [bs, c_len] 215 | logits2p = end_logits(R, logits1p, s, mask=self.c_mask, filters=self.filters, 216 | name='End_Pointer2') # [bs, c_len] 217 | 218 | with tf.variable_scope("Loss_Layer"): 219 | # maximum-likelihood (ML) loss for dataset V2.0 220 | # loss a 221 | start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1, labels=self.y_start) 222 | end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2, labels=self.y_end) 223 | self.loss = tf.reduce_mean(start_loss + end_loss) 224 | 225 | # loss b 226 | pstart_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1p, labels=self.yp_start) 227 | pend_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2p, labels=self.yp_end) 228 | self.loss += self.gamma_b * tf.reduce_mean(pstart_loss + pend_loss) 229 | 230 | # loss c 231 | answer_exist_label = tf.cast(tf.slice(self.y_start, [0, 0], [-1, 1]), tf.float32) 232 | self.loss += self.gamma_c * tf.reduce_mean( 233 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self.unanswer_bias, labels=answer_exist_label)) 234 | 235 | # l2 loss 236 | if self.l2_norm is not None: 237 | decay_costs = [] 238 | for var in tf.trainable_variables(): 239 | decay_costs.append(tf.nn.l2_loss(var)) 240 | self.loss += tf.multiply(self.l2_norm, tf.add_n(decay_costs)) 241 | 242 | # RL loss 243 | if self.use_rlloss: 244 | with tf.variable_scope("Reinforcement_Loss"): 245 | self.rl_loss_a, _, _ = rl_loss(logits1, logits2, self.y_start, self.y_end, self.c_maxlen + 1) 246 | self.rl_loss_b, _, _ = rl_loss(logits1p, logits2p, self.yp_start, self.yp_end, self.c_maxlen) 247 | self.loss += (self.rlw * (self.rl_loss_a + self.gamma_b * self.rl_loss_b)) 248 | 249 | with tf.variable_scope('Output_Layer'): 250 | softmax_start_scores = tf.nn.softmax(tf.slice(logits1, [0, 1], [-1, -1])) 251 | softmax_end_scores = tf.nn.softmax(tf.slice(logits2, [0, 1], [-1, -1])) 252 | 253 
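# No-answer decoding: index 0 of logits1/logits2 is the unanswer_bias slot, so an
# argmax of 0 over the full softmax means "no answer". The masks below are 1 when
# an answer exists and 0 otherwise, and the *_move terms shift the final predicted
# indices to -1 for unanswerable questions.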
| unanswer_mask1 = tf.cast(tf.argmax(tf.nn.softmax(logits1), axis=-1), tf.int64) 254 | unanswer_mask1 = tf.cast(tf.cast(unanswer_mask1, tf.bool), tf.int64) # [bs,] has answer=1 no answer=0 255 | unanswer_move1 = unanswer_mask1 - 1 # [bs,] has answer=0 no answer=-1 256 | unanswer_mask2 = tf.cast(tf.argmax(tf.nn.softmax(logits2), axis=-1), tf.int64) 257 | unanswer_mask2 = tf.cast(tf.cast(unanswer_mask2, tf.bool), tf.int64) # [bs,] 258 | unanswer_move2 = unanswer_mask2 - 1 259 | 260 | softmax_start_p = tf.nn.softmax(logits2p) 261 | softmax_end_p = tf.nn.softmax(logits2p) 262 | softmax_start_scores = (1 - self.gamma_b) * softmax_start_scores + self.gamma_b * softmax_start_p 263 | softmax_end_scores = (1 - self.gamma_b) * softmax_end_scores + self.gamma_b * softmax_end_p 264 | 265 | outer = tf.matmul(tf.expand_dims(softmax_start_scores, axis=2), 266 | tf.expand_dims(softmax_end_scores, axis=1)) 267 | outer = tf.matrix_band_part(outer, 0, self.ans_limit) 268 | 269 | def position_encoding(x): 270 | import math 271 | for i in range(x.shape[0]): 272 | for j in range(x.shape[1]): 273 | if j - i > 5: 274 | x[i][j] = float(1.0 / math.log(j - i + 1)) 275 | return x 276 | 277 | mask_mat = tf.ones((self.c_maxlen, self.c_maxlen)) 278 | mask_mat = tf.expand_dims(tf.py_func(position_encoding, [mask_mat], tf.float32), axis=0) 279 | mask_mat = tf.tile(mask_mat, [self.un_size, 1, 1]) 280 | 281 | outer_masked = outer * mask_mat 282 | self.mask_output1 = tf.argmax(tf.reduce_max(outer_masked, axis=2), 283 | axis=1) * unanswer_mask1 + unanswer_move1 284 | self.mask_output2 = tf.argmax(tf.reduce_max(outer_masked, axis=1), 285 | axis=1) * unanswer_mask2 + unanswer_move2 286 | 287 | def complie(self): 288 | self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7) 289 | grads = self.opt.compute_gradients(self.loss) 290 | gradients, variables = zip(*grads) 291 | capped_grads, _ = tf.clip_by_global_norm(gradients, self.grad_clip) 292 | self.train_op = self.opt.apply_gradients(zip(capped_grads, variables), global_step=self.global_step) 293 | 294 | # EMA 295 | with tf.variable_scope("EMA_Weights"): 296 | if self.decay is not None and self.decay < 1.: 297 | self.var_ema = tf.train.ExponentialMovingAverage(self.decay) 298 | with tf.control_dependencies([self.train_op]): 299 | self.ema_train_op = self.var_ema.apply( 300 | list(set(tf.trainable_variables()) ^ set(tf.trainable_variables('Cove_Layer')))) 301 | # assign ema weights 302 | self.assign_vars = [] 303 | for var in tf.global_variables(): 304 | v = self.var_ema.average(var) 305 | if v is not None: 306 | self.assign_vars.append(tf.assign(var, v)) 307 | 308 | 309 | # import numpy as np 310 | # 311 | # config = { 312 | # 'char_dim': 64, 313 | # 'cont_limit': 400, 314 | # 'ques_limit': 50, 315 | # 'char_limit': 16, 316 | # 'ans_limit': -1, 317 | # 'filters': 256, 318 | # 'dropout': 0.1, 319 | # 'dropout_emb': 0.1, 320 | # 'l2_norm': 3e-7, 321 | # 'decay': 0.9999, 322 | # 'gamma_c': 1.0, 323 | # 'gamma_b': 0.3, 324 | # 'learning_rate': 1e-3, 325 | # 'grad_clip': 5.0, 326 | # 'init_lambda': 3.0, 327 | # 'loss_type': 'use_plausible', 328 | # 'use_elmo': 0, 329 | # 'use_cove': 0, 330 | # 'use_feat': True, 331 | # 'optimizer': 'adam', 332 | # 'cove_path': 'Keras_CoVe_2layers.h5', 333 | # 'elmo_weights_path': 'elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 334 | # 'elmo_options_path': 'elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 335 | # 'train_tfrecords': 
'tfrecords/train_pre_elmo_cove.tfrecords', 336 | # 'dev_tfrecords': 'tfrecords/dev_pre_elmo_cove.tfrecords', 337 | # 'batch_size': 24, 338 | # 'epoch': 40, 339 | # 'origin_path': None, # not finetune 340 | # 'path': 'QANetV253' 341 | # } 342 | # word_mat = np.random.random((90950, 300)).astype(np.float32) 343 | # char_mat2 = np.random.random((94, 300)).astype(np.float32) 344 | # char_mat = np.random.random((1171, 300)).astype(np.float32) 345 | # model = Model(config, word_mat, char_mat, char_mat2) 346 | -------------------------------------------------------------------------------- /bilm/model.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import tensorflow as tf 4 | import h5py 5 | import json 6 | import re 7 | 8 | from .data import UnicodeCharsVocabulary, Batcher 9 | 10 | DTYPE = 'float32' 11 | DTYPE_INT = 'int64' 12 | 13 | 14 | class BidirectionalLanguageModel(object): 15 | def __init__( 16 | self, 17 | options_file: str, 18 | weight_file: str, 19 | use_character_inputs=True, 20 | embedding_weight_file=None, 21 | max_batch_size=128, 22 | ): 23 | ''' 24 | Creates the language model computational graph and loads weights 25 | 26 | Two options for input type: 27 | (1) To use character inputs (paired with Batcher) 28 | pass use_character_inputs=True, and ids_placeholder 29 | of shape (None, None, max_characters_per_token) 30 | to __call__ 31 | (2) To use token ids as input (paired with TokenBatcher), 32 | pass use_character_inputs=False and ids_placeholder 33 | of shape (None, None) to __call__. 34 | In this case, embedding_weight_file is also required input 35 | 36 | options_file: location of the json formatted file with 37 | LM hyperparameters 38 | weight_file: location of the hdf5 file with LM weights 39 | use_character_inputs: if True, then use character ids as input, 40 | otherwise use token ids 41 | max_batch_size: the maximum allowable batch size 42 | ''' 43 | with open(options_file, 'r') as fin: 44 | options = json.load(fin) 45 | 46 | if not use_character_inputs: 47 | if embedding_weight_file is None: 48 | raise ValueError( 49 | "embedding_weight_file is required input with " 50 | "not use_character_inputs" 51 | ) 52 | 53 | self._options = options 54 | self._weight_file = weight_file 55 | self._embedding_weight_file = embedding_weight_file 56 | self._use_character_inputs = use_character_inputs 57 | self._max_batch_size = max_batch_size 58 | 59 | self._ops = {} 60 | self._graphs = {} 61 | 62 | def __call__(self, ids_placeholder): 63 | ''' 64 | Given the input character ids (or token ids), returns a dictionary 65 | with tensorflow ops: 66 | 67 | {'lm_embeddings': embedding_op, 68 | 'lengths': sequence_lengths_op, 69 | 'mask': op to compute mask} 70 | 71 | embedding_op computes the LM embeddings and is shape 72 | (None, 3, None, 1024) 73 | lengths_op computes the sequence lengths and is shape (None, ) 74 | mask computes the sequence mask and is shape (None, None) 75 | 76 | ids_placeholder: a tf.placeholder of type int32. 
77 | If use_character_inputs=True, it is shape 78 | (None, None, max_characters_per_token) and holds the input 79 | character ids for a batch 80 | If use_character_input=False, it is shape (None, None) and 81 | holds the input token ids for a batch 82 | ''' 83 | if ids_placeholder in self._ops: 84 | # have already created ops for this placeholder, just return them 85 | ret = self._ops[ids_placeholder] 86 | 87 | else: 88 | # need to create the graph 89 | if len(self._ops) == 0: 90 | # first time creating the graph, don't reuse variables 91 | lm_graph = BidirectionalLanguageModelGraph( 92 | self._options, 93 | self._weight_file, 94 | ids_placeholder, 95 | embedding_weight_file=self._embedding_weight_file, 96 | use_character_inputs=self._use_character_inputs, 97 | max_batch_size=self._max_batch_size) 98 | else: 99 | with tf.variable_scope('', reuse=True): 100 | lm_graph = BidirectionalLanguageModelGraph( 101 | self._options, 102 | self._weight_file, 103 | ids_placeholder, 104 | embedding_weight_file=self._embedding_weight_file, 105 | use_character_inputs=self._use_character_inputs, 106 | max_batch_size=self._max_batch_size) 107 | 108 | ops = self._build_ops(lm_graph) 109 | self._ops[ids_placeholder] = ops 110 | self._graphs[ids_placeholder] = lm_graph 111 | ret = ops 112 | 113 | return ret 114 | 115 | def _build_ops(self, lm_graph): 116 | with tf.control_dependencies([lm_graph.update_state_op]): 117 | # get the LM embeddings 118 | token_embeddings = lm_graph.embedding 119 | layers = [ 120 | tf.concat([token_embeddings, token_embeddings], axis=2) 121 | ] 122 | 123 | n_lm_layers = len(lm_graph.lstm_outputs['forward']) 124 | for i in range(n_lm_layers): 125 | layers.append( 126 | tf.concat( 127 | [lm_graph.lstm_outputs['forward'][i], 128 | lm_graph.lstm_outputs['backward'][i]], 129 | axis=-1 130 | ) 131 | ) 132 | 133 | # The layers include the BOS/EOS tokens. Remove them 134 | sequence_length_wo_bos_eos = lm_graph.sequence_lengths - 2 135 | layers_without_bos_eos = [] 136 | for layer in layers: 137 | layer_wo_bos_eos = layer[:, 1:, :] 138 | layer_wo_bos_eos = tf.reverse_sequence( 139 | layer_wo_bos_eos, 140 | lm_graph.sequence_lengths - 1, 141 | seq_axis=1, 142 | batch_axis=0, 143 | ) 144 | layer_wo_bos_eos = layer_wo_bos_eos[:, 1:, :] 145 | layer_wo_bos_eos = tf.reverse_sequence( 146 | layer_wo_bos_eos, 147 | sequence_length_wo_bos_eos, 148 | seq_axis=1, 149 | batch_axis=0, 150 | ) 151 | layers_without_bos_eos.append(layer_wo_bos_eos) 152 | 153 | # concatenate the layers 154 | lm_embeddings = tf.concat( 155 | [tf.expand_dims(t, axis=1) for t in layers_without_bos_eos], 156 | axis=1 157 | ) 158 | 159 | # get the mask op without bos/eos. 
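# (Same double-reverse trick as for the layers above: the trailing EOS sits at a
# different position in each example, so the code drops the leading BOS, reverses
# each sequence so the old last step comes first, drops that step, then reverses
# back to the original order.)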
160 | # tf doesn't support reversing boolean tensors, so cast 161 | # to int then back 162 | mask_wo_bos_eos = tf.cast(lm_graph.mask[:, 1:], 'int32') 163 | mask_wo_bos_eos = tf.reverse_sequence( 164 | mask_wo_bos_eos, 165 | lm_graph.sequence_lengths - 1, 166 | seq_axis=1, 167 | batch_axis=0, 168 | ) 169 | mask_wo_bos_eos = mask_wo_bos_eos[:, 1:] 170 | mask_wo_bos_eos = tf.reverse_sequence( 171 | mask_wo_bos_eos, 172 | sequence_length_wo_bos_eos, 173 | seq_axis=1, 174 | batch_axis=0, 175 | ) 176 | mask_wo_bos_eos = tf.cast(mask_wo_bos_eos, 'bool') 177 | 178 | return { 179 | 'lm_embeddings': lm_embeddings, 180 | 'lengths': sequence_length_wo_bos_eos, 181 | 'token_embeddings': lm_graph.embedding, 182 | 'mask': mask_wo_bos_eos, 183 | } 184 | 185 | 186 | def _pretrained_initializer(varname, weight_file, embedding_weight_file=None): 187 | ''' 188 | We'll stub out all the initializers in the pretrained LM with 189 | a function that loads the weights from the file 190 | ''' 191 | weight_name_map = {} 192 | for i in range(2): 193 | for j in range(8): # if we decide to add more layers 194 | root = 'RNN_{}/RNN/MultiRNNCell/Cell{}'.format(i, j) 195 | weight_name_map[root + '/rnn/lstm_cell/kernel'] = \ 196 | root + '/LSTMCell/W_0' 197 | weight_name_map[root + '/rnn/lstm_cell/bias'] = \ 198 | root + '/LSTMCell/B' 199 | weight_name_map[root + '/rnn/lstm_cell/projection/kernel'] = \ 200 | root + '/LSTMCell/W_P_0' 201 | 202 | # convert the graph name to that in the checkpoint 203 | varname_in_file = varname[5:] 204 | if varname_in_file.startswith('RNN'): 205 | varname_in_file = weight_name_map[varname_in_file] 206 | 207 | if varname_in_file == 'embedding': 208 | with h5py.File(embedding_weight_file, 'r') as fin: 209 | # Have added a special 0 index for padding not present 210 | # in the original model. 211 | embed_weights = fin[varname_in_file][...] 212 | weights = np.zeros( 213 | (embed_weights.shape[0] + 1, embed_weights.shape[1]), 214 | dtype=DTYPE 215 | ) 216 | weights[1:, :] = embed_weights 217 | else: 218 | with h5py.File(weight_file, 'r') as fin: 219 | if varname_in_file == 'char_embed': 220 | # Have added a special 0 index for padding not present 221 | # in the original model. 222 | char_embed_weights = fin[varname_in_file][...] 223 | weights = np.zeros( 224 | (char_embed_weights.shape[0] + 1, 225 | char_embed_weights.shape[1]), 226 | dtype=DTYPE 227 | ) 228 | weights[1:, :] = char_embed_weights 229 | else: 230 | weights = fin[varname_in_file][...] 
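# At this point `weights` holds the numpy array stored in the HDF5 checkpoint for
# this variable; e.g. the graph variable
# 'bilm/RNN_0/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/kernel' is read from the
# checkpoint key 'RNN_0/RNN/MultiRNNCell/Cell0/LSTMCell/W_0' via weight_name_map
# above.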
231 | 232 | # Tensorflow initializers are callables that accept a shape parameter 233 | # and some optional kwargs 234 | def ret(shape, **kwargs): 235 | if list(shape) != list(weights.shape): 236 | raise ValueError( 237 | "Invalid shape initializing {0}, got {1}, expected {2}".format( 238 | varname_in_file, shape, weights.shape) 239 | ) 240 | return weights 241 | 242 | return ret 243 | 244 | 245 | class BidirectionalLanguageModelGraph(object): 246 | ''' 247 | Creates the computational graph and holds the ops necessary for runnint 248 | a bidirectional language model 249 | ''' 250 | def __init__(self, options, weight_file, ids_placeholder, 251 | use_character_inputs=True, embedding_weight_file=None, 252 | max_batch_size=128): 253 | 254 | self.options = options 255 | self._max_batch_size = max_batch_size 256 | self.ids_placeholder = ids_placeholder 257 | self.use_character_inputs = use_character_inputs 258 | 259 | # this custom_getter will make all variables not trainable and 260 | # override the default initializer 261 | def custom_getter(getter, name, *args, **kwargs): 262 | kwargs['trainable'] = False 263 | kwargs['initializer'] = _pretrained_initializer( 264 | name, weight_file, embedding_weight_file 265 | ) 266 | return getter(name, *args, **kwargs) 267 | 268 | if embedding_weight_file is not None: 269 | # get the vocab size 270 | with h5py.File(embedding_weight_file, 'r') as fin: 271 | # +1 for padding 272 | self._n_tokens_vocab = fin['embedding'].shape[0] + 1 273 | else: 274 | self._n_tokens_vocab = None 275 | 276 | with tf.variable_scope('bilm', custom_getter=custom_getter): 277 | self._build() 278 | 279 | def _build(self): 280 | if self.use_character_inputs: 281 | self._build_word_char_embeddings() 282 | else: 283 | self._build_word_embeddings() 284 | self._build_lstms() 285 | 286 | def _build_word_char_embeddings(self): 287 | ''' 288 | options contains key 'char_cnn': { 289 | 290 | 'n_characters': 262, 291 | 292 | # includes the start / end characters 293 | 'max_characters_per_token': 50, 294 | 295 | 'filters': [ 296 | [1, 32], 297 | [2, 32], 298 | [3, 64], 299 | [4, 128], 300 | [5, 256], 301 | [6, 512], 302 | [7, 1024] 303 | ], 304 | 'activation': 'tanh', 305 | 306 | # for the character embedding 307 | 'embedding': {'dim': 16} 308 | 309 | # for highway layers 310 | # if omitted, then no highway layers 311 | 'n_highway': 2, 312 | } 313 | ''' 314 | projection_dim = self.options['lstm']['projection_dim'] 315 | 316 | cnn_options = self.options['char_cnn'] 317 | filters = cnn_options['filters'] 318 | n_filters = sum(f[1] for f in filters) 319 | max_chars = cnn_options['max_characters_per_token'] 320 | char_embed_dim = cnn_options['embedding']['dim'] 321 | n_chars = cnn_options['n_characters'] 322 | if n_chars != 262: 323 | raise InvalidNumberOfCharacters( 324 | "Set n_characters=262 after training see the README.md" 325 | ) 326 | if cnn_options['activation'] == 'tanh': 327 | activation = tf.nn.tanh 328 | elif cnn_options['activation'] == 'relu': 329 | activation = tf.nn.relu 330 | 331 | # the character embeddings 332 | with tf.device("/cpu:0"): 333 | self.embedding_weights = tf.get_variable( 334 | "char_embed", [n_chars, char_embed_dim], 335 | dtype=DTYPE, 336 | initializer=tf.random_uniform_initializer(-1.0, 1.0) 337 | ) 338 | # shape (batch_size, unroll_steps, max_chars, embed_dim) 339 | self.char_embedding = tf.nn.embedding_lookup(self.embedding_weights, 340 | self.ids_placeholder) 341 | 342 | # the convolutions 343 | def make_convolutions(inp): 344 | # inp: [bs, len_seq, len_char, 
345 |             with tf.variable_scope('CNN') as scope:
346 |                 convolutions = []
347 |                 for i, (width, num) in enumerate(filters):
348 |                     if cnn_options['activation'] == 'relu':
349 |                         # He initialization for ReLU activation
350 |                         # with char embeddings init between -1 and 1
351 |                         #w_init = tf.random_normal_initializer(
352 |                         #    mean=0.0,
353 |                         #    stddev=np.sqrt(2.0 / (width * char_embed_dim))
354 |                         #)
355 |
356 |                         # Kim et al 2015, +/- 0.05
357 |                         w_init = tf.random_uniform_initializer(
358 |                             minval=-0.05, maxval=0.05)
359 |                     elif cnn_options['activation'] == 'tanh':
360 |                         # glorot init
361 |                         w_init = tf.random_normal_initializer(
362 |                             mean=0.0,
363 |                             stddev=np.sqrt(1.0 / (width * char_embed_dim))
364 |                         )
365 |                     w = tf.get_variable(
366 |                         "W_cnn_%s" % i,
367 |                         [1, width, char_embed_dim, num],
368 |                         initializer=w_init,
369 |                         dtype=DTYPE)
370 |                     b = tf.get_variable(
371 |                         "b_cnn_%s" % i, [num], dtype=DTYPE,
372 |                         initializer=tf.constant_initializer(0.0))
373 |
374 |                     conv = tf.nn.conv2d(
375 |                         inp, w,
376 |                         strides=[1, 1, 1, 1],
377 |                         padding="VALID") + b
378 |                     # conv [bs, len_seq, len_char-width+1, filters]
379 |                     # now max pool
380 |                     conv = tf.nn.max_pool(
381 |                         conv, [1, 1, max_chars-width+1, 1],
382 |                         [1, 1, 1, 1], 'VALID')
383 |
384 |                     # activation
385 |                     conv = activation(conv)
386 |                     conv = tf.squeeze(conv, squeeze_dims=[2])
387 |                     # [bs, len_seq, filters]
388 |
389 |                     convolutions.append(conv)
390 |
391 |             return tf.concat(convolutions, 2)
392 |
393 |         embedding = make_convolutions(self.char_embedding)
394 |
395 |         # for highway and projection layers
396 |         n_highway = cnn_options.get('n_highway')
397 |         use_highway = n_highway is not None and n_highway > 0
398 |         use_proj = n_filters != projection_dim
399 |
400 |         if use_highway or use_proj:
401 |             # reshape from (batch_size, n_tokens, dim) to (-1, dim)
402 |             batch_size_n_tokens = tf.shape(embedding)[0:2]
403 |             embedding = tf.reshape(embedding, [-1, n_filters])
404 |
405 |         # set up weights for projection
406 |         if use_proj:
407 |             assert n_filters > projection_dim
408 |             with tf.variable_scope('CNN_proj') as scope:
409 |                 W_proj_cnn = tf.get_variable(
410 |                     "W_proj", [n_filters, projection_dim],
411 |                     initializer=tf.random_normal_initializer(
412 |                         mean=0.0, stddev=np.sqrt(1.0 / n_filters)),
413 |                     dtype=DTYPE)
414 |                 b_proj_cnn = tf.get_variable(
415 |                     "b_proj", [projection_dim],
416 |                     initializer=tf.constant_initializer(0.0),
417 |                     dtype=DTYPE)
418 |
419 |         # apply highway layers
420 |         def high(x, ww_carry, bb_carry, ww_tr, bb_tr):
421 |             carry_gate = tf.nn.sigmoid(tf.matmul(x, ww_carry) + bb_carry)
422 |             transform_gate = tf.nn.relu(tf.matmul(x, ww_tr) + bb_tr)
423 |             return carry_gate * transform_gate + (1.0 - carry_gate) * x
424 |
425 |         if use_highway:
426 |             highway_dim = n_filters
427 |
428 |             for i in range(n_highway):
429 |                 with tf.variable_scope('CNN_high_%s' % i) as scope:
430 |                     W_carry = tf.get_variable(
431 |                         'W_carry', [highway_dim, highway_dim],
432 |                         # glorot init
433 |                         initializer=tf.random_normal_initializer(
434 |                             mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
435 |                         dtype=DTYPE)
436 |                     b_carry = tf.get_variable(
437 |                         'b_carry', [highway_dim],
438 |                         initializer=tf.constant_initializer(-2.0),
439 |                         dtype=DTYPE)
440 |                     W_transform = tf.get_variable(
441 |                         'W_transform', [highway_dim, highway_dim],
442 |                         initializer=tf.random_normal_initializer(
443 |                             mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
444 |                         dtype=DTYPE)
445 |                     b_transform = tf.get_variable(
446 |                         'b_transform', [highway_dim],
447 |                         initializer=tf.constant_initializer(0.0),
448 |                         dtype=DTYPE)
449 |
450 |                 embedding = high(embedding, W_carry, b_carry,
451 |                                  W_transform, b_transform)
452 |
453 |         # finally project down if needed
454 |         if use_proj:
455 |             embedding = tf.matmul(embedding, W_proj_cnn) + b_proj_cnn
456 |
457 |         # reshape back to (batch_size, tokens, dim)
458 |         if use_highway or use_proj:
459 |             shp = tf.concat([batch_size_n_tokens, [projection_dim]], axis=0)
460 |             embedding = tf.reshape(embedding, shp)
461 |
462 |         # at last assign attributes for remainder of the model
463 |         self.embedding = embedding
464 |
465 |
466 |     def _build_word_embeddings(self):
467 |         projection_dim = self.options['lstm']['projection_dim']
468 |
469 |         # the word embeddings
470 |         with tf.device("/cpu:0"):
471 |             self.embedding_weights = tf.get_variable(
472 |                 "embedding", [self._n_tokens_vocab, projection_dim],
473 |                 dtype=DTYPE,
474 |             )
475 |             self.embedding = tf.nn.embedding_lookup(self.embedding_weights,
476 |                                                     self.ids_placeholder)
477 |
478 |
479 |     def _build_lstms(self):
480 |         # now the LSTMs
481 |         # these will collect the initial states for the forward
482 |         # (and reverse LSTMs if we are doing bidirectional)
483 |
484 |         # parse the options
485 |         lstm_dim = self.options['lstm']['dim']
486 |         projection_dim = self.options['lstm']['projection_dim']
487 |         n_lstm_layers = self.options['lstm'].get('n_layers', 1)
488 |         cell_clip = self.options['lstm'].get('cell_clip')
489 |         proj_clip = self.options['lstm'].get('proj_clip')
490 |         use_skip_connections = self.options['lstm']['use_skip_connections']
491 |         if use_skip_connections:
492 |             print("USING SKIP CONNECTIONS")
493 |         else:
494 |             print("NOT USING SKIP CONNECTIONS")
495 |
496 |         # the sequence lengths from input mask
497 |         if self.use_character_inputs:
498 |             mask = tf.reduce_any(self.ids_placeholder > 0, axis=2)
499 |         else:
500 |             mask = self.ids_placeholder > 0
501 |         sequence_lengths = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
502 |         batch_size = tf.shape(sequence_lengths)[0]
503 |
504 |         # for each direction, we'll store tensors for each layer
505 |         self.lstm_outputs = {'forward': [], 'backward': []}
506 |         self.lstm_state_sizes = {'forward': [], 'backward': []}
507 |         self.lstm_init_states = {'forward': [], 'backward': []}
508 |         self.lstm_final_states = {'forward': [], 'backward': []}
509 |
510 |         update_ops = []
511 |         for direction in ['forward', 'backward']:
512 |             if direction == 'forward':
513 |                 layer_input = self.embedding
514 |             else:
515 |                 layer_input = tf.reverse_sequence(
516 |                     self.embedding,
517 |                     sequence_lengths,
518 |                     seq_axis=1,
519 |                     batch_axis=0
520 |                 )
521 |             for i in range(n_lstm_layers):
522 |                 if projection_dim < lstm_dim:
523 |                     # we are projecting the output down to projection_dim
524 |                     lstm_cell = tf.nn.rnn_cell.LSTMCell(
525 |                         lstm_dim, num_proj=projection_dim,
526 |                         cell_clip=cell_clip, proj_clip=proj_clip)
527 |                 else:
528 |                     lstm_cell = tf.nn.rnn_cell.LSTMCell(
529 |                         lstm_dim,
530 |                         cell_clip=cell_clip, proj_clip=proj_clip)
531 |
532 |                 if use_skip_connections:
533 |                     # ResidualWrapper adds inputs to outputs
534 |                     if i == 0:
535 |                         # don't add skip connection from token embedding to
536 |                         # 1st layer output
537 |                         pass
538 |                     else:
539 |                         # add a skip connection
540 |                         lstm_cell = tf.nn.rnn_cell.ResidualWrapper(lstm_cell)
541 |
542 |                 # collect the input state, run the dynamic rnn, collect
543 |                 # the output
544 |                 state_size = lstm_cell.state_size
545 |                 # the LSTMs are stateful. To support multiple batch sizes,
546 |                 # we'll allocate size for states up to max_batch_size,
547 |                 # then use the first batch_size entries for each batch
548 |                 init_states = [
549 |                     tf.Variable(
550 |                         tf.zeros([self._max_batch_size, dim]),
551 |                         trainable=False
552 |                     )
553 |                     for dim in lstm_cell.state_size
554 |                 ]
555 |                 batch_init_states = [
556 |                     state[:batch_size, :] for state in init_states
557 |                 ]
558 |
559 |                 if direction == 'forward':
560 |                     i_direction = 0
561 |                 else:
562 |                     i_direction = 1
563 |                 variable_scope_name = 'RNN_{0}/RNN/MultiRNNCell/Cell{1}'.format(
564 |                     i_direction, i)
565 |                 with tf.variable_scope(variable_scope_name):
566 |                     layer_output, final_state = tf.nn.dynamic_rnn(
567 |                         lstm_cell,
568 |                         layer_input,
569 |                         sequence_length=sequence_lengths,
570 |                         initial_state=tf.nn.rnn_cell.LSTMStateTuple(
571 |                             *batch_init_states),
572 |                     )
573 |
574 |                 self.lstm_state_sizes[direction].append(lstm_cell.state_size)
575 |                 self.lstm_init_states[direction].append(init_states)
576 |                 self.lstm_final_states[direction].append(final_state)
577 |                 if direction == 'forward':
578 |                     self.lstm_outputs[direction].append(layer_output)
579 |                 else:
580 |                     self.lstm_outputs[direction].append(
581 |                         tf.reverse_sequence(
582 |                             layer_output,
583 |                             sequence_lengths,
584 |                             seq_axis=1,
585 |                             batch_axis=0
586 |                         )
587 |                     )
588 |
589 |                 with tf.control_dependencies([layer_output]):
590 |                     # update the initial states
591 |                     for i in range(2):
592 |                         new_state = tf.concat(
593 |                             [final_state[i][:batch_size, :],
594 |                              init_states[i][batch_size:, :]], axis=0)
595 |                         state_update_op = tf.assign(init_states[i], new_state)
596 |                         update_ops.append(state_update_op)
597 |
598 |                 layer_input = layer_output
599 |
600 |         self.mask = mask
601 |         self.sequence_lengths = sequence_lengths
602 |         self.update_state_op = tf.group(*update_ops)
603 |
604 |
605 | def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
606 |     '''
607 |     Given an input vocabulary file, dump all the token embeddings to the
608 |     outfile. The result can be used as the embedding_weight_file when
609 |     constructing a BidirectionalLanguageModel.
610 |     '''
611 |     with open(options_file, 'r') as fin:
612 |         options = json.load(fin)
613 |     max_word_length = options['char_cnn']['max_characters_per_token']
614 |
615 |     vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
616 |     batcher = Batcher(vocab_file, max_word_length)
617 |
618 |     ids_placeholder = tf.placeholder('int32',
619 |                                      shape=(None, None, max_word_length)
620 |     )
621 |     model = BidirectionalLanguageModel(options_file, weight_file)
622 |     embedding_op = model(ids_placeholder)['token_embeddings']
623 |
624 |     n_tokens = vocab.size
625 |     embed_dim = int(embedding_op.shape[2])
626 |
627 |     embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)
628 |
629 |     config = tf.ConfigProto(allow_soft_placement=True)
630 |     with tf.Session(config=config) as sess:
631 |         sess.run(tf.global_variables_initializer())
632 |         for k in range(n_tokens):
633 |             token = vocab.id_to_word(k)
634 |             char_ids = batcher.batch_sentences([[token]])[0, 1, :].reshape(
635 |                 1, 1, -1)
636 |             embeddings[k, :] = sess.run(
637 |                 embedding_op, feed_dict={ids_placeholder: char_ids}
638 |             )
639 |
640 |     with h5py.File(outfile, 'w') as fout:
641 |         ds = fout.create_dataset(
642 |             'embedding', embeddings.shape, dtype='float32', data=embeddings
643 |         )
644 |
645 | def dump_bilm_embeddings(vocab_file, dataset_file, options_file,
646 |                          weight_file, outfile):
647 |     with open(options_file, 'r') as fin:
648 |         options = json.load(fin)
649 |     max_word_length = options['char_cnn']['max_characters_per_token']
650 |
651 |     vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
652 |     batcher = Batcher(vocab_file, max_word_length)
653 |
654 |     ids_placeholder = tf.placeholder('int32',
655 |                                      shape=(None, None, max_word_length)
656 |     )
657 |     model = BidirectionalLanguageModel(options_file, weight_file)
658 |     ops = model(ids_placeholder)
659 |
660 |     config = tf.ConfigProto(allow_soft_placement=True)
661 |     with tf.Session(config=config) as sess:
662 |         sess.run(tf.global_variables_initializer())
663 |         sentence_id = 0
664 |         with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
665 |             for line in fin:
666 |                 sentence = line.strip().split()
667 |                 char_ids = batcher.batch_sentences([sentence])
668 |                 embeddings = sess.run(
669 |                     ops['lm_embeddings'], feed_dict={ids_placeholder: char_ids}
670 |                 )
671 |                 ds = fout.create_dataset(
672 |                     '{}'.format(sentence_id),
673 |                     embeddings.shape[1:], dtype='float32',
674 |                     data=embeddings[0, :, :, :]
675 |                 )
676 |
677 |                 sentence_id += 1
678 |
679 |
--------------------------------------------------------------------------------
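A minimal usage sketch for `dump_token_embeddings` above, showing how the cached file feeds back into a token-level biLM. This is illustrative only: the file paths (`vocab.txt`, `options.json`, `lm_weights.hdf5`, `token_embeddings.hdf5`) are placeholders, and it assumes `BidirectionalLanguageModel` forwards the same `use_character_inputs` / `embedding_weight_file` keyword arguments as `BidirectionalLanguageModelGraph`, and that `weight_layers` returns its usual `weighted_op` tensor.

```python
import tensorflow as tf
from bilm import (TokenBatcher, BidirectionalLanguageModel,
                  dump_token_embeddings, weight_layers)

# Placeholder paths -- substitute the real vocab/options/weight files.
vocab_file = 'vocab.txt'
options_file = 'options.json'
weight_file = 'lm_weights.hdf5'
token_embedding_file = 'token_embeddings.hdf5'

# 1. Run the char-CNN once per vocabulary token and cache the result.
dump_token_embeddings(vocab_file, options_file, weight_file,
                      token_embedding_file)
tf.reset_default_graph()

# 2. Build a token-level biLM that reads the cached embeddings instead of
#    recomputing the character convolutions for every batch (assumes these
#    kwargs mirror BidirectionalLanguageModelGraph above).
token_ids = tf.placeholder('int32', shape=(None, None))
bilm = BidirectionalLanguageModel(
    options_file, weight_file,
    use_character_inputs=False,
    embedding_weight_file=token_embedding_file)
ops = bilm(token_ids)

# 3. Mix the biLM layers into a single ELMo representation.
elmo = weight_layers('elmo', ops, l2_coef=0.0)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batcher = TokenBatcher(vocab_file)
    batch = batcher.batch_sentences([['Reinforced', 'Mnemonic', 'Reader']])
    vectors = sess.run(elmo['weighted_op'], feed_dict={token_ids: batch})
```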