├── pics └── result1.jpg ├── bilm ├── __init__.py ├── elmo.py ├── data.py └── model.py ├── README.md ├── util ├── log_wrapper.py ├── tokenizer.py ├── h5py_generator.py ├── spacy_tokenizer.py ├── get_tfrecords.py └── util.py ├── loss.py ├── test.py ├── train_finetune.py ├── layers.py ├── train_h5py.py ├── layersV0.py ├── train_tfrecords.py ├── RMR_modelV3.py ├── RMR_modelV6.py └── RMR_modelV6_squad2.py /pics/result1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ewrfcas/Reinforced-Mnemonic-Reader/HEAD/pics/result1.jpg -------------------------------------------------------------------------------- /bilm/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .data import Batcher, TokenBatcher 3 | from .model import BidirectionalLanguageModel, dump_token_embeddings, \ 4 | dump_bilm_embeddings 5 | from .elmo import weight_layers, all_layers 6 | 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Reinforced Mnemonic Reader in TensorFlow 2 | RMR: https://arxiv.org/abs/1705.02798 3 | 4 | ## Pipeline 5 | 1. Run ``preprocess.ipynb`` to build the input datasets. 6 | 2. Run ``train_h5py.py`` to start training. ELMo and CoVe are currently not usable. 7 | 8 | ### Notes 9 | 1. `conv1d` from `tensor2tensor` is used instead of the matrix multiplication (fully connected) operation in the RMR model (see the sketch after these notes). 10 | 2. Feel free to discuss any problem about this project (especially the RL loss). 11 | 3. The reinforcement loss should only be applied after the cross-entropy loss has converged. 12 | 4. RMR_modelV3 is based on version 3 and RMR_modelV6 on version 6 of [https://arxiv.org/abs/1705.02798v3]. v3 seems to perform better than v6.
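A minimal sketch of note 1 (my own illustration, not code from this repository, using plain `tf.layers.conv1d` rather than the `tensor2tensor` wrapper the repo imports): a kernel-size-1 `conv1d` over a `[batch, length, dim]` tensor applies the same linear projection independently at every position, so it behaves exactly like the fully connected matmul it replaces.

```python
# Illustrative sketch only (plain TF 1.x, not code from this repository).
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, None, 128])  # [batch, len, dim], e.g. aligned context states

# A kernel-size-1 conv1d and a dense layer both project the last axis
# position by position, so their outputs have the same shape [batch, len, 300].
proj_conv = tf.layers.conv1d(x, filters=300, kernel_size=1, name='proj_conv')
proj_dense = tf.layers.dense(x, units=300, name='proj_dense')

print(proj_conv.shape, proj_dense.shape)  # (?, ?, 300) (?, ?, 300)
```

Regarding note 3, the training/evaluation scripts keep the RL loss weight `rlw` at 0.0 at first and only raise it towards `rlw2` once the learning rate (and with it the cross-entropy loss) has dropped far enough; see the `learning_rate` check in `test.py` and the commented-out block in `train_h5py.py`.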
13 | 14 | ## Updates 15 | - [x] Init for the RMR model (without dynamic-critical reinforcement learning, DCRL) 16 | - [x] Add the self-critical sequence training (SCST) (no test) 17 | - [x] Update cuDNN LSTM and SQuAD 2.0 18 | - [x] Update v3 in modelV0 19 | - [ ] Test the RL loss 20 | 21 | ## Results 22 | 23 | ### Results on the dev set of SQuAD 1.1 24 | EM:71.17% F1:79.56% (no ELMo, no CoVe, paper v6) 25 | 26 | EM:74.37% F1:82.67% (hidden size=256, +ELMo, v6) 27 | 28 | EM:72.08% F1:80.51% (no ELMo, no CoVe, paper v3) 29 | 30 | EM:72.87% F1:81.47% (PyTorch version, based on paper v3) 31 | 32 | ### Results on the dev set of SQuAD 2.0 33 | EM:64.89% F1:67.81% (+ELMo +CoVe, v3) -------------------------------------------------------------------------------- /util/log_wrapper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | from time import gmtime, strftime 4 | from colorlog import ColoredFormatter 5 | 6 | def create_logger(name, silent=False, to_disk=False, log_file=None, prefix=None): 7 | """Logger wrapper 8 | by xiaodl 9 | """ 10 | # setup logger 11 | log = logging.getLogger(name) 12 | log.setLevel(logging.DEBUG) 13 | formatter = ColoredFormatter( 14 | "%(asctime)s %(log_color)s%(levelname)-8s%(reset)s [%(blue)s%(message)s%(reset)s]", 15 | datefmt='%Y-%m-%d %I:%M:%S', 16 | reset=True, 17 | log_colors={ 18 | 'DEBUG': 'cyan', 19 | 'INFO': 'green', 20 | 'WARNING': 'yellow', 21 | 'ERROR': 'red', 22 | 'CRITICAL': 'red,bg_white', 23 | }, 24 | secondary_log_colors={}, 25 | style='%' 26 | ) 27 | fformatter = logging.Formatter( 28 | "%(asctime)s [%(funcName)-12s] %(levelname)-8s [%(message)s]", 29 | datefmt='%Y-%m-%d %I:%M:%S', 30 | style='%' 31 | ) 32 | if not silent: 33 | ch = logging.StreamHandler(sys.stdout) 34 | ch.setLevel(logging.INFO) 35 | ch.setFormatter(formatter) 36 | log.addHandler(ch) 37 | if to_disk: 38 | prefix = prefix if prefix is not None else 'my_log' 39 | log_file = log_file if log_file is not None else strftime('{}-%Y-%m-%d-%H-%M-%S.log'.format(prefix), gmtime()) 40 | fh = logging.FileHandler(log_file) 41 | fh.setLevel(logging.DEBUG) 42 | fh.setFormatter(fformatter) 43 | log.addHandler(fh) 44 | # disable elmo info 45 | log.propagate = False 46 | return log 47 | -------------------------------------------------------------------------------- /util/tokenizer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created October, 2017 3 | Author: xiaodl@microsoft.com 4 | ''' 5 | import re 6 | import warnings 7 | import spacy 8 | import tqdm 9 | import logging 10 | import unicodedata 11 | from collections import Counter 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | PAD = '<PAD>' 17 | UNK = '<UNK>' 18 | STA = '<STA>' 19 | END = '<END>' 20 | 21 | PAD_ID = 0 22 | UNK_ID = 1 23 | STA_ID = 2 24 | END_ID = 3 25 | 26 | DigitsMapper = {'0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten', 27 | 'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7','eight': '8', 'nine': '9', 'ten': '10'} 28 | 29 | def normal_query(query, document): 30 | """ normalize digits 31 | """ 32 | nq = [] 33 | for w in query: 34 | if w in DigitsMapper and w not in document: 35 | if DigitsMapper[w] in document: 36 | w = DigitsMapper[w] 37 | nq.append(w) 38 | return nq 39 | 40 | 41 | def normalize_text(text): 42 | return unicodedata.normalize('NFD', text) 43 | 44 | def token_extend(reg_rules):
45 | return ' ' + reg_rules.group(0) + ' ' 46 | 47 | def reform_text(text): 48 | text = re.sub(u'-|¢|¥|€|£|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015|%|\[|\]|:|\(|\)|/', token_extend, text) 49 | text = text.strip(' \n') 50 | text = re.sub('\s+', ' ', text) 51 | return text 52 | 53 | def reform_simple(text): 54 | text = text.strip(' \n') 55 | text = re.sub('\s+', ' ', text) 56 | return text 57 | 58 | class Vocabulary(object): 59 | INIT_LEN = 4 60 | def __init__(self, neat=False): 61 | self.neat = neat 62 | if not neat: 63 | self.tok2ind = {PAD: PAD_ID, UNK: UNK_ID, STA: STA_ID, END: END_ID} 64 | self.ind2tok = {PAD_ID: PAD, UNK_ID: UNK, STA_ID: STA, END_ID:END} 65 | else: 66 | self.tok2ind = {} 67 | self.ind2tok = {} 68 | 69 | def __len__(self): 70 | return len(self.tok2ind) 71 | 72 | def __iter__(self): 73 | return iter(self.tok2ind) 74 | 75 | def __contains__(self, key): 76 | if type(key) == int: 77 | return key in self.ind2tok 78 | elif type(key) == str: 79 | return key in self.tok2ind 80 | 81 | def __getitem__(self, key): 82 | if type(key) == int: 83 | return self.ind2tok.get(key, -1) if self.neat else self.ind2tok.get(key, UNK) 84 | if type(key) == str: 85 | return self.tok2ind.get(key, None) if self.neat else self.tok2ind.get(key,self.tok2ind.get(UNK)) 86 | 87 | def __setitem__(self, key, item): 88 | if type(key) == int and type(item) == str: 89 | self.ind2tok[key] = item 90 | elif type(key) == str and type(item) == int: 91 | self.tok2ind[key] = item 92 | else: 93 | raise RuntimeError('Invalid (key, item) types.') 94 | 95 | def add(self, token): 96 | if token not in self.tok2ind: 97 | index = len(self.tok2ind) 98 | self.tok2ind[token] = index 99 | self.ind2tok[index] = token 100 | 101 | def get_vocab_list(self, with_order=True): 102 | if with_order: 103 | words = [self[k] for k in range(0, len(self))] 104 | else: 105 | words = [k for k in self.tok2ind.keys() 106 | if k not in {PAD, UNK, STA, END}] 107 | return words 108 | 109 | def toidx(self, tokens): 110 | return [self[tok] for tok in tokens] 111 | 112 | def copy(self): 113 | new_vocab = Vocabulary(self.neat) 114 | for w in self: 115 | new_vocab.add(w) 116 | return new_vocab 117 | 118 | def build(words, neat=False): 119 | vocab = Vocabulary(neat) 120 | for w in words: vocab.add(w) 121 | return vocab 122 | -------------------------------------------------------------------------------- /util/h5py_generator.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import time 4 | 5 | class Generator(): 6 | def __init__(self, data_path, qid, batch_size=32, shuffle=True, padding_value=0, data_keys=None, use_elmo=0, 7 | use_cove=0, elmo_path=None, cove_path=None): 8 | self.batch_size = batch_size 9 | if isinstance(qid, str): 10 | self.qid = np.load(qid) 11 | else: 12 | self.qid = qid 13 | self.length = len(self.qid) 14 | self.shuffle = shuffle 15 | self.data_path = data_path 16 | self.max_batch = self.length // self.batch_size 17 | if self.length % self.batch_size != 0: 18 | self.max_batch += 1 19 | self.padding_value = padding_value 20 | if self.shuffle: 21 | self.run_shuffle() 22 | print('Loaded {} samples'.format(self.length)) 23 | self.i_batch = 0 24 | self.data_keys = data_keys 25 | self.use_elmo = use_elmo 26 | self.use_cove = use_cove 27 | if use_elmo == 1: 28 | assert elmo_path is not None 29 | self.elmo_path = elmo_path 30 | if use_cove == 1: 31 | assert cove_path is not None 32 | self.cove_path = cove_path 33 | 34 | self.get_time = 0 35 | self.pad_time = 0 36 
| 37 | def reset(self): 38 | self.i_batch = 0 39 | self.run_shuffle() 40 | 41 | def run_shuffle(self): 42 | if self.shuffle: 43 | np.random.shuffle(self.qid) 44 | else: 45 | pass 46 | 47 | def padding(self, datas): 48 | max_len = max([d.shape[0] for d in datas]) 49 | paded_datas = np.zeros([len(datas), max_len] + list(datas[0].shape[1:]), dtype=datas[0].dtype) 50 | for i in range(len(datas)): 51 | paded_datas[i, 0:datas[i].shape[0]] = datas[i] 52 | return paded_datas 53 | 54 | def __len__(self): 55 | return self.length 56 | 57 | def __next__(self): 58 | batch_data = {} 59 | if self.use_elmo == 1: 60 | elmo_h5f = h5py.File(self.elmo_path, 'r') 61 | if self.use_cove == 1: 62 | cove_h5f = h5py.File(self.cove_path, 'r') 63 | # st = time.time() 64 | with h5py.File(self.data_path, 'r') as h5f: 65 | qid_batch = self.qid[self.i_batch * self.batch_size:(self.i_batch + 1) * self.batch_size] 66 | for id in qid_batch: 67 | group = h5f[str(id)] 68 | # normal features 69 | if self.data_keys is None: 70 | self.data_keys = list(group.keys()) 71 | for k in self.data_keys: 72 | if k not in batch_data: 73 | batch_data[k] = [group[k][:]] 74 | else: 75 | batch_data[k].append(group[k][:]) 76 | # elmo features 77 | if self.use_elmo == 1: 78 | if 'elmo_cont' not in batch_data: 79 | batch_data['elmo_cont'] = [elmo_h5f[str(id) + 'c'][:]] 80 | else: 81 | batch_data['elmo_cont'].append(elmo_h5f[str(id) + 'c'][:]) 82 | if 'elmo_ques' not in batch_data: 83 | batch_data['elmo_ques'] = [elmo_h5f[str(id) + 'q'][:]] 84 | else: 85 | batch_data['elmo_ques'].append(elmo_h5f[str(id) + 'q'][:]) 86 | # cove features 87 | if self.use_cove == 1: 88 | if 'cove_cont' not in batch_data: 89 | batch_data['cove_cont'] = [cove_h5f[str(id) + 'c'][:]] 90 | else: 91 | batch_data['cove_cont'].append(cove_h5f[str(id) + 'c'][:]) 92 | if 'cove_ques' not in batch_data: 93 | batch_data['cove_ques'] = [cove_h5f[str(id) + 'q'][:]] 94 | else: 95 | batch_data['cove_ques'].append(cove_h5f[str(id) + 'q'][:]) 96 | if self.use_elmo == 1: 97 | elmo_h5f.close() 98 | if self.use_cove == 1: 99 | cove_h5f.close() 100 | # ed = time.time() 101 | # self.get_time += float(ed - st) 102 | 103 | # st = time.time() 104 | for k in batch_data: 105 | batch_data[k] = self.padding(batch_data[k]) 106 | # ed = time.time() 107 | # self.pad_time += float(ed - st) 108 | # print('get_time:', self.get_time) 109 | # print('pad_time:', self.pad_time) 110 | self.i_batch += 1 111 | if self.i_batch == self.max_batch: 112 | self.i_batch = 0 113 | self.run_shuffle() 114 | return batch_data 115 | -------------------------------------------------------------------------------- /bilm/elmo.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def weight_layers(name, bilm_ops, l2_coef=None, 5 | use_top_only=False, do_layer_norm=False): 6 | ''' 7 | Weight the layers of a biLM with trainable scalar weights to 8 | compute ELMo representations. 9 | 10 | For each output layer, this returns two ops. The first computes 11 | a layer specific weighted average of the biLM layers, and 12 | the second the l2 regularizer loss term. 13 | The regularization terms are also add to tf.GraphKeys.REGULARIZATION_LOSSES 14 | 15 | Input: 16 | name = a string prefix used for the trainable variable names 17 | bilm_ops = the tensorflow ops returned to compute internal 18 | representations from a biLM. This is the return value 19 | from BidirectionalLanguageModel(...)(ids_placeholder) 20 | l2_coef: the l2 regularization coefficient $\lambda$. 
21 | Pass None or 0.0 for no regularization. 22 | use_top_only: if True, then only use the top layer. 23 | do_layer_norm: if True, then apply layer normalization to each biLM 24 | layer before normalizing 25 | 26 | Output: 27 | { 28 | 'weighted_op': op to compute weighted average for output, 29 | 'regularization_op': op to compute regularization term 30 | } 31 | ''' 32 | 33 | def _l2_regularizer(weights): 34 | if l2_coef is not None: 35 | return l2_coef * tf.reduce_sum(tf.square(weights)) 36 | else: 37 | return 0.0 38 | 39 | # Get ops for computing LM embeddings and mask 40 | lm_embeddings = bilm_ops['lm_embeddings'] 41 | mask = bilm_ops['mask'] 42 | 43 | n_lm_layers = int(lm_embeddings.get_shape()[1]) 44 | lm_dim = int(lm_embeddings.get_shape()[3]) 45 | 46 | with tf.control_dependencies([lm_embeddings, mask]): 47 | # Cast the mask and broadcast for layer use. 48 | mask_float = tf.cast(mask, 'float32') 49 | broadcast_mask = tf.expand_dims(mask_float, axis=-1) 50 | 51 | def _do_ln(x): 52 | # do layer normalization excluding the mask 53 | x_masked = x * broadcast_mask 54 | N = tf.reduce_sum(mask_float) * lm_dim 55 | mean = tf.reduce_sum(x_masked) / N 56 | variance = tf.reduce_sum(((x_masked - mean) * broadcast_mask) ** 2 57 | ) / N 58 | return tf.nn.batch_normalization( 59 | x, mean, variance, None, None, 1E-12 60 | ) 61 | 62 | if use_top_only: 63 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 64 | # just the top layer 65 | sum_pieces = tf.squeeze(layers[-1], squeeze_dims=1) 66 | # no regularization 67 | reg = 0.0 68 | else: 69 | W = tf.get_variable( 70 | '{}_ELMo_W'.format(name), 71 | shape=(n_lm_layers,), 72 | initializer=tf.zeros_initializer, 73 | regularizer=_l2_regularizer, 74 | trainable=True, 75 | ) 76 | 77 | # normalize the weights 78 | normed_weights = tf.split( 79 | tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers 80 | ) 81 | # split LM layers 82 | layers = tf.split(lm_embeddings, n_lm_layers, axis=1) 83 | 84 | # compute the weighted, normalized LM activations 85 | pieces = [] 86 | for w, t in zip(normed_weights, layers): 87 | if do_layer_norm: 88 | pieces.append(w * _do_ln(tf.squeeze(t, squeeze_dims=1))) 89 | else: 90 | pieces.append(w * tf.squeeze(t, squeeze_dims=1)) 91 | sum_pieces = tf.add_n(pieces) 92 | 93 | # get the regularizer 94 | reg = [ 95 | r for r in tf.get_collection( 96 | tf.GraphKeys.REGULARIZATION_LOSSES) 97 | if r.name.find('{}_ELMo_W/'.format(name)) >= 0 98 | ] 99 | if len(reg) != 1: 100 | raise ValueError 101 | 102 | # scale the weighted sum by gamma 103 | gamma = tf.get_variable( 104 | '{}_ELMo_gamma'.format(name), 105 | shape=(1,), 106 | initializer=tf.ones_initializer, 107 | regularizer=None, 108 | trainable=True, 109 | ) 110 | weighted_lm_layers = sum_pieces * gamma 111 | 112 | ret = {'weighted_op': weighted_lm_layers, 'regularization_op': reg} 113 | 114 | return ret 115 | 116 | 117 | def all_layers(bilm_ops): 118 | # Get ops for computing LM embeddings and mask 119 | lm_embeddings = bilm_ops['lm_embeddings'] 120 | 121 | return lm_embeddings 122 | 123 | -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def mask_to_start(score, start, score_mask_value=-1e30): 5 | score_mask = tf.cast(tf.ones_like(start) - tf.cumsum(start, axis=-1), tf.float32) 6 | return score + score_mask * score_mask_value 7 | 8 | 9 | def mask_to_topk(score, topk_ind, c_maxlen, score_mask_value=-1e30): 10 | score_mask = 
tf.reduce_sum(tf.one_hot(topk_ind, depth=c_maxlen), axis=-2) # [bs, topk]->[bs, topk, c_len]->[bs, c_len] 11 | score_mask = tf.cast(tf.ones_like(score_mask) - score_mask, tf.float32) 12 | return score + score_mask * score_mask_value 13 | 14 | 15 | def get_tf_f1(y_pred, y_true): 16 | y_true = tf.cast(y_true, tf.float32) 17 | y_union = tf.clip_by_value(y_pred + y_true, 0, 1) # [bs, c_maxlen] 18 | y_diff = tf.abs(y_pred - y_true) # [bs, c_maxlen] 19 | num_same = tf.cast(tf.reduce_sum(y_union, axis=-1) - tf.reduce_sum(y_diff, axis=-1), tf.float32) # [bs,] 20 | y_precision = num_same / (tf.cast(tf.reduce_sum(y_pred, axis=-1), tf.float32) + 1e-7) # [bs,] 21 | y_recall = num_same / (tf.cast(tf.reduce_sum(y_true, axis=-1), tf.float32) + 1e-7) # [bs,] 22 | y_f1 = (2.0 * y_precision * y_recall) / (tf.cast(y_precision + y_recall, tf.float32) + 1e-7) # [bs,] 23 | return tf.clip_by_value(y_f1, 0, 1) 24 | 25 | 26 | def rl_loss(logits_start, logits_end, y_start, y_end, c_maxlen, rl_loss_type = 'topk_DCRL', topk=3): 27 | assert rl_loss_type == 'DCRL' or rl_loss_type == 'SCST' or rl_loss_type == 'topk_DCRL' 28 | # get ground truth prediction 29 | # s:[0,1,0,0,0], e:[0,0,0,1,0]->[0,1,1,1,1]-[0,0,0,1,1]->[0,1,1,0,0]+e:[0,0,0,1,0]->pred:[0,1,1,1,0] 30 | y_start_cumsum = tf.cumsum(y_start, axis=-1) 31 | y_end_cumsum = tf.cumsum(y_end, axis=-1) 32 | ground_truth = y_start_cumsum - y_end_cumsum + y_end # [bs, c_maxlen] 33 | 34 | # get greedy prediction 35 | greedy_start = tf.one_hot(tf.argmax(logits_start, axis=-1), c_maxlen, 36 | axis=-1) # [bs, c_maxlen]->[bs,]->[bs, c_maxlen] 37 | masked_logits_end = mask_to_start(logits_end, greedy_start) 38 | greedy_end = tf.one_hot(tf.argmax(masked_logits_end, axis=-1), c_maxlen, axis=-1) 39 | greedy_start_cumsum = tf.cumsum(greedy_start, axis=-1) 40 | greedy_end_cumsum = tf.cumsum(greedy_end, axis=-1) 41 | greedy_prediction = greedy_start_cumsum - greedy_end_cumsum + greedy_end # [bs, c_maxlen] 42 | # get greedy f1 43 | greedy_f1 = get_tf_f1(greedy_prediction, ground_truth) 44 | 45 | # get sampled prediction (use tf.multinomial) 46 | if rl_loss_type == 'topk_DCRL': 47 | start_topk_ind = tf.nn.top_k(logits_start, topk).indices # [bs, topk_size] 48 | masked_logits_start = mask_to_topk(logits_start, start_topk_ind, c_maxlen) 49 | else: 50 | masked_logits_start = logits_start 51 | sampled_start_ind = tf.squeeze(tf.multinomial(tf.log(tf.nn.softmax(masked_logits_start)), 1), 52 | axis=-1) # [bs, c_maxlen]->[bs, 1]->[bs,] 53 | sampled_start = tf.one_hot(sampled_start_ind, c_maxlen, axis=-1) # [bs, c_maxlen]->[bs,]->[bs, c_maxlen] 54 | masked_logits_end = mask_to_start(logits_end, sampled_start) 55 | if rl_loss_type == 'topk_DCRL': 56 | end_topk_ind = tf.nn.top_k(masked_logits_end, topk).indices # [bs, topk_size] 57 | masked_logits_end = mask_to_topk(masked_logits_end, end_topk_ind, c_maxlen) 58 | sampled_end_ind = tf.squeeze(tf.multinomial(tf.log(tf.nn.softmax(masked_logits_end)), 1), axis=-1) 59 | sampled_end = tf.one_hot(sampled_end_ind, c_maxlen, axis=-1) 60 | sampled_start_cumsum = tf.cumsum(sampled_start, axis=-1) 61 | sampled_end_cumsum = tf.cumsum(sampled_end, axis=-1) 62 | sampled_prediction = sampled_start_cumsum - sampled_end_cumsum + sampled_end # [bs, c_maxlen] 63 | # get sampled f1 64 | sampled_f1 = get_tf_f1(sampled_prediction, ground_truth) 65 | 66 | reward = tf.stop_gradient(sampled_f1 - greedy_f1) # (sampled - baseline) 67 | sampled_start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_start, labels=sampled_start) 68 | sampled_end_loss = 
tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_end, labels=sampled_end) 69 | 70 | if rl_loss_type == 'DCRL' or rl_loss_type == 'topk_DCRL': 71 | reward = tf.clip_by_value(reward, 0., 1e7) 72 | reward_greedy = tf.clip_by_value(tf.stop_gradient(greedy_f1 - sampled_f1), 0., 1e7) 73 | greedy_start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_start, labels=greedy_start) 74 | greedy_end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_end, labels=greedy_end) 75 | return tf.reduce_mean(reward * (sampled_start_loss + sampled_end_loss) + reward_greedy * ( 76 | greedy_start_loss + greedy_end_loss)), sampled_f1, greedy_f1 77 | elif rl_loss_type == 'SCST': 78 | return tf.reduce_mean(reward * (sampled_start_loss + sampled_end_loss)), sampled_f1, greedy_f1 79 | -------------------------------------------------------------------------------- /util/spacy_tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2018-present, HKUST-KnowComp. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | """Tokenizer that is backed by spaCy (spacy.io). 8 | Requires spaCy package and the spaCy english model. 9 | """ 10 | 11 | import spacy 12 | from tqdm import tqdm 13 | import copy 14 | 15 | 16 | class Tokens(object): 17 | """A class to represent a list of tokenized text.""" 18 | TEXT = 0 19 | CHAR = 1 20 | TEXT_WS = 2 21 | SPAN = 3 22 | POS = 4 23 | LEMMA = 5 24 | NER = 6 25 | 26 | def __init__(self, data, annotators, opts=None): 27 | self.data = data 28 | self.annotators = annotators 29 | self.opts = opts or {} 30 | 31 | def __len__(self): 32 | """The number of tokens.""" 33 | return len(self.data) 34 | 35 | def slice(self, i=None, j=None): 36 | """Return a view of the list of tokens from [i, j).""" 37 | new_tokens = copy.copy(self) 38 | new_tokens.data = self.data[i: j] 39 | return new_tokens 40 | 41 | def untokenize(self): 42 | """Returns the original text (with whitespace reinserted).""" 43 | return ''.join([t[self.TEXT_WS] for t in self.data]).strip() 44 | 45 | def chars(self, uncased=False): 46 | """Returns a list of the first character of each token 47 | Args: 48 | uncased: lower cases characters 49 | """ 50 | if uncased: 51 | return [t[self.CHAR].lower() for t in self.data] 52 | else: 53 | return [t[self.CHAR] for t in self.data] 54 | 55 | def words(self, uncased=False): 56 | """Returns a list of the text of each token 57 | Args: 58 | uncased: lower cases text 59 | """ 60 | if uncased: 61 | return [t[self.TEXT].lower() for t in self.data] 62 | else: 63 | return [t[self.TEXT] for t in self.data] 64 | 65 | def offsets(self): 66 | """Returns a list of [start, end) character offsets of each token.""" 67 | return [t[self.SPAN] for t in self.data] 68 | 69 | def pos(self): 70 | """Returns a list of part-of-speech tags of each token. 71 | Returns None if this annotation was not included. 72 | """ 73 | if 'pos' not in self.annotators: 74 | return None 75 | return [t[self.POS] for t in self.data] 76 | 77 | def lemmas(self): 78 | """Returns a list of the lemmatized text of each token. 79 | Returns None if this annotation was not included. 80 | """ 81 | if 'lemma' not in self.annotators: 82 | return None 83 | return [t[self.LEMMA] for t in self.data] 84 | 85 | def entities(self): 86 | """Returns a list of named-entity-recognition tags of each token. 
87 | Returns None if this annotation was not included. 88 | """ 89 | if 'ner' not in self.annotators: 90 | return None 91 | return [t[self.NER] for t in self.data] 92 | 93 | def ngrams(self, n=1, uncased=False, filter_fn=None, as_strings=True): 94 | """Returns a list of all ngrams from length 1 to n. 95 | Args: 96 | n: upper limit of ngram length 97 | uncased: lower cases text 98 | filter_fn: user function that takes in an ngram list and returns 99 | True or False to keep or not keep the ngram 100 | as_string: return the ngram as a string vs list 101 | """ 102 | 103 | def _skip(gram): 104 | if not filter_fn: 105 | return False 106 | return filter_fn(gram) 107 | 108 | words = self.words(uncased) 109 | ngrams = [(s, e + 1) 110 | for s in range(len(words)) 111 | for e in range(s, min(s + n, len(words))) 112 | if not _skip(words[s:e + 1])] 113 | 114 | # Concatenate into strings 115 | if as_strings: 116 | ngrams = ['{}'.format(' '.join(words[s:e])) for (s, e) in ngrams] 117 | 118 | return ngrams 119 | 120 | def entity_groups(self): 121 | """Group consecutive entity tokens with the same NER tag.""" 122 | entities = self.entities() 123 | if not entities: 124 | return None 125 | non_ent = self.opts.get('non_ent', 'O') 126 | groups = [] 127 | idx = 0 128 | while idx < len(entities): 129 | ner_tag = entities[idx] 130 | # Check for entity tag 131 | if ner_tag != non_ent: 132 | # Chomp the sequence 133 | start = idx 134 | while (idx < len(entities) and entities[idx] == ner_tag): 135 | idx += 1 136 | groups.append((self.slice(start, idx).untokenize(), ner_tag)) 137 | else: 138 | idx += 1 139 | return groups 140 | 141 | 142 | class SpacyTokenizer(object): 143 | 144 | def __init__(self, **kwargs): 145 | """ 146 | Args: 147 | annotators: set that can include pos, lemma, and ner. 148 | model: spaCy model to use (either path, or keyword like 'en'). 149 | """ 150 | model = kwargs.get('model', 'en') 151 | self.annotators = copy.deepcopy(kwargs.get('annotators', set())) 152 | self.nlp = spacy.load(model) 153 | self.nlp.remove_pipe('parser') 154 | if not any([p in self.annotators for p in ['lemma', 'pos', 'ner']]): 155 | self.nlp.remove_pipe('tagger') 156 | if 'ner' not in self.annotators: 157 | self.nlp.remove_pipe('ner') 158 | 159 | def tokenize(self, text): 160 | # We don't treat new lines as tokens. 
161 | clean_text = text.replace('\n', ' ') 162 | tokens = self.nlp(clean_text) 163 | 164 | data = [] 165 | for i in range(len(tokens)): 166 | # Get whitespace 167 | start_ws = tokens[i].idx 168 | if i + 1 < len(tokens): 169 | end_ws = tokens[i + 1].idx 170 | else: 171 | end_ws = tokens[i].idx + len(tokens[i].text) 172 | 173 | data.append(( 174 | tokens[i].text, 175 | tokens[i].text[0] if len(tokens[i].text) > 0 else '', 176 | text[start_ws: end_ws], 177 | (tokens[i].idx, tokens[i].idx + len(tokens[i].text)), 178 | tokens[i].tag_, 179 | tokens[i].lemma_, 180 | tokens[i].ent_type_, 181 | )) 182 | 183 | # Set special option for non-entity tag: '' vs 'O' in spaCy 184 | return Tokens(data, self.annotators, opts={'non_ent': ''}) 185 | 186 | def shutdown(self): 187 | pass 188 | 189 | def __del__(self): 190 | self.shutdown() -------------------------------------------------------------------------------- /util/get_tfrecords.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from tqdm import tqdm 4 | import random 5 | import h5py 6 | 7 | data_type = 'train' 8 | data_source = 'dataset_pre3' 9 | 10 | # load trainset 11 | qid = np.load(data_source + '/' + data_type + '_qid.npy').astype(np.int32) 12 | print(data_type + 'data loading over...') 13 | 14 | length = qid.shape[0] 15 | print(length) 16 | index = [i for i in range(0, length)] 17 | random.shuffle(index) 18 | print(index[0:10]) 19 | 20 | qid = qid[index] 21 | tfrecords_filename = 'tfrecords/' + data_type + '_pre_elmo_cove3.tfrecords' 22 | writer = tf.python_io.TFRecordWriter(tfrecords_filename) 23 | 24 | with h5py.File(data_source + '/train_ELMO_feats.h5', 'r') as f1: 25 | with h5py.File(data_source + '/train_COVE_feats.h5', 'r') as f2: 26 | with h5py.File(data_source + '/train_data.h5', 'r') as f: 27 | for i in tqdm(range(len(qid))): 28 | elmo_context_feat = f1[str(qid[i]) + 'c'][:] 29 | elmo_question_feat = f1[str(qid[i]) + 'q'][:] 30 | cove_context_feat = f2[str(qid[i]) + 'c'][:] 31 | cove_question_feat = f2[str(qid[i]) + 'q'][:] 32 | 33 | data_simple = f[str(qid[i])] 34 | context_ids = data_simple['context_ids'][:] 35 | ques_ids = data_simple['ques_ids'][:] 36 | context_char_ids = data_simple['context_char_ids'][:] 37 | ques_char_ids = data_simple['ques_char_ids'][:] 38 | y1 = data_simple['y1'][:] 39 | y2 = data_simple['y2'][:] 40 | y1p = data_simple['y1p'][:] 41 | y2p = data_simple['y2p'][:] 42 | context_feat = data_simple['context_feat'][:] 43 | ques_feat = data_simple['ques_feat'][:] 44 | 45 | record = tf.train.Example(features=tf.train.Features(feature={ 46 | "context_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_ids.tostring()])), 47 | "ques_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_ids.tostring()])), 48 | "context_char_ids": tf.train.Feature( 49 | bytes_list=tf.train.BytesList(value=[context_char_ids.tostring()])), 50 | "ques_char_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_char_ids.tostring()])), 51 | "context_feat": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_feat.tostring()])), 52 | "ques_feat": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_feat.tostring()])), 53 | 'elmo_context_feat': tf.train.Feature( 54 | bytes_list=tf.train.BytesList(value=[elmo_context_feat.tostring()])), 55 | 'elmo_question_feat': tf.train.Feature( 56 | bytes_list=tf.train.BytesList(value=[elmo_question_feat.tostring()])), 57 | 'cove_context_feat': tf.train.Feature( 58 | 
bytes_list=tf.train.BytesList(value=[cove_context_feat.tostring()])), 59 | 'cove_question_feat': tf.train.Feature( 60 | bytes_list=tf.train.BytesList(value=[cove_question_feat.tostring()])), 61 | "y1": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1.tostring()])), 62 | "y2": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2.tostring()])), 63 | "y1p": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1p.tostring()])), 64 | "y2p": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2p.tostring()])), 65 | "qid": tf.train.Feature(int64_list=tf.train.Int64List(value=[qid[i]])) 66 | })) 67 | writer.write(record.SerializeToString()) 68 | writer.close() 69 | 70 | data_type = 'dev' 71 | data_source = 'dataset_pre3' 72 | 73 | # load trainset 74 | qid = np.load(data_source + '/' + data_type + '_qid.npy').astype(np.int32) 75 | print(data_type + 'data loading over...') 76 | 77 | tfrecords_filename = 'tfrecords/' + data_type + '_pre_elmo_cove3.tfrecords' 78 | writer = tf.python_io.TFRecordWriter(tfrecords_filename) 79 | 80 | with h5py.File(data_source + '/dev_ELMO_feats.h5', 'r') as f1: 81 | with h5py.File(data_source + '/dev_COVE_feats.h5', 'r') as f2: 82 | with h5py.File(data_source + '/dev_data.h5', 'r') as f: 83 | for i in tqdm(range(len(qid))): 84 | elmo_context_feat = f1[str(qid[i]) + 'c'][:] 85 | elmo_question_feat = f1[str(qid[i]) + 'q'][:] 86 | cove_context_feat = f2[str(qid[i]) + 'c'][:] 87 | cove_question_feat = f2[str(qid[i]) + 'q'][:] 88 | 89 | data_simple = f[str(qid[i])] 90 | context_ids = data_simple['context_ids'][:] 91 | ques_ids = data_simple['ques_ids'][:] 92 | context_char_ids = data_simple['context_char_ids'][:] 93 | ques_char_ids = data_simple['ques_char_ids'][:] 94 | y1 = data_simple['y1'][:] 95 | y2 = data_simple['y2'][:] 96 | y1p = data_simple['y1p'][:] 97 | y2p = data_simple['y2p'][:] 98 | context_feat = data_simple['context_feat'][:] 99 | ques_feat = data_simple['ques_feat'][:] 100 | 101 | record = tf.train.Example(features=tf.train.Features(feature={ 102 | "context_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_ids.tostring()])), 103 | "ques_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_ids.tostring()])), 104 | "context_char_ids": tf.train.Feature( 105 | bytes_list=tf.train.BytesList(value=[context_char_ids.tostring()])), 106 | "ques_char_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_char_ids.tostring()])), 107 | "context_feat": tf.train.Feature(bytes_list=tf.train.BytesList(value=[context_feat.tostring()])), 108 | "ques_feat": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ques_feat.tostring()])), 109 | 'elmo_context_feat': tf.train.Feature( 110 | bytes_list=tf.train.BytesList(value=[elmo_context_feat.tostring()])), 111 | 'elmo_question_feat': tf.train.Feature( 112 | bytes_list=tf.train.BytesList(value=[elmo_question_feat.tostring()])), 113 | 'cove_context_feat': tf.train.Feature( 114 | bytes_list=tf.train.BytesList(value=[cove_context_feat.tostring()])), 115 | 'cove_question_feat': tf.train.Feature( 116 | bytes_list=tf.train.BytesList(value=[cove_question_feat.tostring()])), 117 | "y1": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1.tostring()])), 118 | "y2": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2.tostring()])), 119 | "y1p": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y1p.tostring()])), 120 | "y2p": tf.train.Feature(bytes_list=tf.train.BytesList(value=[y2p.tostring()])), 121 | "qid": tf.train.Feature(int64_list=tf.train.Int64List(value=[qid[i]])) 
122 | })) 123 | writer.write(record.SerializeToString()) 124 | writer.close() 125 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import json 4 | import RMR_modelV6_squad2 as RMR 5 | import tensorflow.contrib.slim as slim 6 | from util.util import * 7 | import tensorflow as tf 8 | import pandas as pd 9 | from util.log_wrapper import create_logger 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = '4' 12 | 13 | if __name__ == '__main__': 14 | 15 | data_source = '../QANet_tf/dataset_pre3' 16 | 17 | config = { 18 | 'char_dim': 300, 19 | 'cont_limit': 400, 20 | 'ques_limit': 50, 21 | 'char_limit': 16, 22 | 'ans_limit': -1, 23 | 'filters': 300, 24 | 'dropout': 0.175, 25 | 'dropout_emb': 0.15, 26 | 'dropout_att': 0.2, 27 | 'dropout_rnn': 0.1, 28 | 'l2_norm': 3e-7, 29 | 'decay': 1, 30 | 'gamma_b': 0.3, 31 | 'gamma_c': 1.0, 32 | 'init_lambda': 3.0, 33 | 'learning_rate': 8e-4, 34 | 'shuffle_size': 25000, 35 | 'grad_clip': 5.0, 36 | 'use_elmo': 0, 37 | 'use_cove': 0, 38 | 'use_feat': True, 39 | 'use_rlloss': True, 40 | 'rlw': 0.0, 41 | 'rlw2': 0.8, 42 | 'optimizer': 'adam', 43 | 'cove_path': '../SAN_tf/Keras_CoVe_2layers.h5', 44 | 'elmo_weights_path': '../SAN_tf/elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 45 | 'elmo_options_path': '../SAN_tf/elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 46 | 'train_tfrecords': '../QANet_tf/tfrecords/train_pre_elmo_cove3.tfrecords', 47 | 'dev_tfrecords': '../QANet_tf/tfrecords/dev_pre_elmo_cove3.tfrecords', 48 | 'batch_size': 32, 49 | 'epoch': 25, 50 | 'origin_path': None, # not finetune 51 | 'path': 'RMR005' 52 | } 53 | 54 | global logger 55 | logger = create_logger(__name__, to_disk=False) 56 | 57 | logger.info('loading data...') 58 | dev_qid = np.load(data_source + '/dev_qid.npy').astype(np.int32) 59 | with open(data_source + '/test_eval.json', "r") as fh: 60 | eval_file = json.load(fh) 61 | 62 | # load embedding matrix 63 | logger.info('loading embedding...') 64 | word_mat = np.load(data_source + '/word_emb_mat.npy') 65 | char_mat_fix = np.load(data_source + '/char_emb_mat_fix.npy').astype(np.float32) 66 | char_mat_trainable = np.load(data_source + '/char_emb_mat_trainable.npy').astype(np.float32) 67 | 68 | 69 | logger.info('generate dev tfrecords...') 70 | dev_dataset = tf.data.TFRecordDataset(config['dev_tfrecords']) \ 71 | .map(get_record_parser(config), num_parallel_calls=8) \ 72 | .padded_batch(config['batch_size'], padded_shapes=([None], 73 | [None], 74 | [None, None], 75 | [None, None], 76 | [None, None], 77 | [None, None], 78 | [None, None, None], 79 | [None, None, None], 80 | [None, None, None], 81 | [None, None, None], 82 | [None], 83 | [None], 84 | [None], 85 | [None])) 86 | dev_iterator = dev_dataset.make_initializable_iterator() 87 | dev_next_element = dev_iterator.get_next() 88 | dev_sum = 11730 89 | 90 | logger.info('init model...') 91 | model = RMR.Model(config, word_mat=word_mat, char_mat_trainable=char_mat_trainable, char_mat_fix=char_mat_fix) 92 | sess_config = tf.ConfigProto(allow_soft_placement=True) 93 | sess_config.gpu_options.allow_growth = True 94 | best_f1 = 0 95 | best_em = 0 96 | f1s = [] 97 | ems = [] 98 | 99 | with tf.Session(config=sess_config) as sess: 100 | sess.run(tf.global_variables_initializer()) 101 | # scope with trainable weights 102 | variables_to_restore = 
slim.get_variables_to_restore(include=['Input_Embedding_Mat', 103 | 'Input_Embedding_Layer', 104 | 'Iterative_Reattention_Aligner', 105 | 'Answer_Pointer', 106 | 'EMA_Weights']) 107 | saver = tf.train.Saver(variables_to_restore, max_to_keep=10) 108 | if config['origin_path'] is not None and os.path.exists( 109 | os.path.join('model', config['origin_path'], 'checkpoint')): 110 | saver.restore(sess, tf.train.latest_checkpoint(os.path.join('model', str(config['origin_path']) + '/'))) 111 | 112 | i_batch = 0 113 | val_n_batch = dev_sum // config['batch_size'] + 1 114 | sum_loss = 0 115 | 116 | # validating step 117 | # save the temp weights and do ema 118 | if config['decay'] < 1.0: 119 | sess.run(model.assign_vars) 120 | print('EMA over...') 121 | sess.run(dev_iterator.initializer) 122 | logger.info('validating...') 123 | sum_loss_val = 0 124 | y1s = [] 125 | y2s = [] 126 | i_batch = 0 127 | while True: 128 | try: 129 | context_idxs, ques_idxs, \ 130 | context_char_idxs, ques_char_idxs, \ 131 | context_feat, ques_feat, \ 132 | elmo_context_feat, elmo_question_feat, \ 133 | cove_context_feat, cove_question_feat, \ 134 | y1, y2, y1p, y2p = sess.run(dev_next_element) 135 | feed_dict_ = {model.contw_input: context_idxs, model.quesw_input: ques_idxs, 136 | model.contc_input: context_char_idxs, model.quesc_input: ques_char_idxs, 137 | model.y_start: y1, model.y_end: y2, 138 | model.yp_start: y1p, model.yp_end: y2p, 139 | model.un_size: context_idxs.shape[0]} 140 | if config['use_feat']: 141 | feed_dict_[model.cont_feat] = context_feat 142 | feed_dict_[model.ques_feat] = ques_feat 143 | if config['use_elmo'] == 1: 144 | feed_dict_[model.elmo_cont] = elmo_context_feat 145 | feed_dict_[model.elmo_ques] = elmo_question_feat 146 | if config['use_cove'] == 1: 147 | feed_dict_[model.cove_cont] = cove_context_feat 148 | feed_dict_[model.cove_ques] = cove_question_feat 149 | loss_value, y1, y2 = sess.run([model.loss, model.mask_output1, model.mask_output2], 150 | feed_dict=feed_dict_) 151 | y1s.append(y1) 152 | y2s.append(y2) 153 | sum_loss_val += loss_value 154 | i_batch += 1 155 | except tf.errors.OutOfRangeError: 156 | y1s = np.concatenate(y1s) 157 | y2s = np.concatenate(y2s) 158 | answer_dict, _, noanswer_num = convert_tokens(eval_file, dev_qid.tolist(), y1s.tolist(), 159 | y2s.tolist(), data_type=2) 160 | metrics = evaluate(eval_file, answer_dict) 161 | ems.append(metrics['exact_match']) 162 | f1s.append(metrics['f1']) 163 | 164 | if metrics['f1'] < f1s[-1]: 165 | config['learning_rate'] *= 0.5 166 | logger.warning('learning rate reduce to:' + str(config['learning_rate'])) 167 | if config['learning_rate'] <= 1e-4: 168 | logger.warning('rl loss start...') 169 | config['rlw'] = config['rlw2'] 170 | 171 | if ems[-1] > best_em: 172 | best_em = ems[-1] 173 | if f1s[-1] > best_f1: 174 | best_f1 = f1s[-1] 175 | logger.warning("-loss: %.4f -EM:%.2f%% (best: %.2f%%), -F1:%.2f%% (best: %.2f%%) -Noanswer:%d" % 176 | (sum_loss_val / (i_batch + 1), metrics['exact_match'], best_em, metrics['f1'], 177 | best_f1, noanswer_num)) 178 | metrics = evaluate_acc(eval_file, answer_dict) 179 | logger.warning("Has answer acc:%.2f%%, No answer acc::%.2f%%" % ( 180 | metrics['has_answer_acc'] * 100, metrics['hasno_answer_acc'] * 100)) 181 | 182 | break 183 | -------------------------------------------------------------------------------- /train_finetune.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import RMR_modelV6 4 | import tensorflow 
as tf 5 | import json 6 | import os 7 | import util 8 | import time 9 | import tensorflow.contrib.slim as slim 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 12 | 13 | 14 | def training_shuffle(data, seed=None): 15 | if seed is not None: 16 | np.random.seed(seed) 17 | index = np.arange(data[0].shape[0]) 18 | np.random.shuffle(index) 19 | for i, d in enumerate(data): 20 | if len(d.shape) > 1: 21 | data[i] = data[i][index, ::] 22 | else: 23 | data[i] = data[i][index] 24 | return data 25 | 26 | 27 | def next_batch(data, batch_size, iteration): 28 | data_temp = [] 29 | start_index = iteration * batch_size 30 | end_index = (iteration + 1) * batch_size 31 | for i, d in enumerate(data): 32 | data_temp.append(data[i][start_index: end_index, ::]) 33 | return data_temp 34 | 35 | 36 | def cal_ETA(t_start, i, n_batch): 37 | t_temp = time.time() 38 | t_avg = float(int(t_temp) - int(t_start)) / float(i + 1) 39 | if n_batch - i - 1 > 0: 40 | return int((n_batch - i - 1) * t_avg) 41 | else: 42 | return int(t_temp) - int(t_start) 43 | 44 | 45 | # load trainset 46 | context_word = np.load('../QANet_tf/dataset1.0/train_contw_input.npy').astype(np.int32) 47 | question_word = np.load('../QANet_tf/dataset1.0/train_quesw_input.npy').astype(np.int32) 48 | context_char = np.load('../QANet_tf/dataset1.0/train_contc_input.npy').astype(np.int32) 49 | question_char = np.load('../QANet_tf/dataset1.0/train_quesc_input.npy').astype(np.int32) 50 | start_label = np.load('../QANet_tf/dataset1.0/train_y_start.npy').astype(np.int32) 51 | end_label = np.load('../QANet_tf/dataset1.0/train_y_end.npy').astype(np.int32) 52 | context_string = np.load('../QANet_tf/dataset1.0/train_contw_strings.npy') 53 | ques_string = np.load('../QANet_tf/dataset1.0/train_quesw_strings.npy') 54 | 55 | # load valset 56 | val_context_word = np.load('../QANet_tf/dataset1.0/dev_contw_input.npy').astype(np.int32) 57 | val_question_word = np.load('../QANet_tf/dataset1.0/dev_quesw_input.npy').astype(np.int32) 58 | val_context_char = np.load('../QANet_tf/dataset1.0/dev_contc_input.npy').astype(np.int32) 59 | val_question_char = np.load('../QANet_tf/dataset1.0/dev_quesc_input.npy').astype(np.int32) 60 | val_start_label = np.load('../QANet_tf/dataset1.0/dev_y_start.npy').astype(np.int32) 61 | val_end_label = np.load('../QANet_tf/dataset1.0/dev_y_end.npy').astype(np.int32) 62 | val_qid = np.load('../QANet_tf/dataset1.0/dev_qid.npy').astype(np.int32) 63 | val_context_string = np.load('../QANet_tf/dataset1.0/dev_contw_strings.npy') 64 | val_ques_string = np.load('../QANet_tf/dataset1.0/dev_quesw_strings.npy') 65 | 66 | with open('../QANet_tf/dataset1.0/test_eval.json', "r") as fh: 67 | eval_file = json.load(fh) 68 | 69 | # load embedding matrix 70 | word_mat = np.load('../QANet_tf/dataset1.0/word_emb_mat.npy') 71 | char_mat = np.load('../QANet_tf/dataset1.0/char_emb_mat.npy') 72 | 73 | train_set = [context_word, question_word, context_char, question_char, context_string, ques_string, start_label, 74 | end_label] 75 | val_set = [val_context_word, val_question_word, val_context_char, val_question_char, val_context_string, 76 | val_ques_string, val_start_label, val_end_label] 77 | 78 | config = { 79 | 'char_dim': 64, 80 | 'cont_limit': 400, 81 | 'ques_limit': 50, 82 | 'char_limit': 16, 83 | 'ans_limit': 50, 84 | 'filters': 100, 85 | 'dropout': 0.3, 86 | 'l2_norm': 3e-7, 87 | 'decay': 0.9999, 88 | 'learning_rate': 1e-4, 89 | 'grad_clip': 5.0, 90 | 'batch_size': 32, 91 | 'epoch': 20, 92 | 'per_steps': 500, 93 | 'init_lambda': 3.0, 94 | 'rl_loss_type': 'DCRL', # 
['SCTC', 'DCRL', 'topk_DCRL', None] 95 | 'origin_path': 'RMRV0', 96 | 'path': 'RMRV0_f' 97 | } 98 | 99 | model = RMR_modelV6.Model(config, word_mat=word_mat, char_mat=char_mat, elmo_path="../QANet_tf/tfhub_elmo") 100 | sess_config = tf.ConfigProto(allow_soft_placement=True) 101 | sess_config.gpu_options.allow_growth = True 102 | 103 | best_f1 = 0 104 | best_em = 0 105 | f1s = [] 106 | ems = [] 107 | 108 | with tf.Session(config=sess_config) as sess: 109 | if not os.path.exists(os.path.join('model', config['path'])): 110 | os.mkdir(os.path.join('model', config['path'])) 111 | sess.run(tf.global_variables_initializer()) 112 | variables_to_restore = slim.get_variables_to_restore(include=['Input_Embedding_Layer', 113 | 'Iterative_Reattention_Aligner', 114 | 'Answer_Pointer']) 115 | saver = tf.train.Saver(variables_to_restore) 116 | if os.path.exists(os.path.join('model',config['origin_path'],'checkpoint')): 117 | saver.restore(sess, tf.train.latest_checkpoint(os.path.join('model',config['origin_path']))) 118 | n_batch = context_word.shape[0] // config['batch_size'] 119 | n_batch_val = val_context_word.shape[0] // config['batch_size'] 120 | 121 | # during the finetune with rl_loss we validate the result per 500 steps 122 | for epoch in range(config['epoch']): 123 | train_set = training_shuffle(train_set) 124 | last_train_str = "\r" 125 | # training step 126 | sum_loss = 0 127 | sum_rl_loss = 0 128 | for i in range(n_batch): 129 | contw_input, quesw_input, contc_input, quesc_input, contw_string, quesw_string, y_start, y_end \ 130 | = next_batch(train_set, config['batch_size'], i) 131 | loss_value, rl_loss_value, theta_a, theta_b, sampled_f1, greedy_f1, _ = sess.run([model.loss, model.rl_loss, model.theta_a, model.theta_b, 132 | model.sampled_f1, model.greedy_f1, model.train_op], 133 | feed_dict={model.contw_input_: contw_input, model.quesw_input_: quesw_input, 134 | model.contc_input_: contc_input, model.quesc_input_: quesc_input, 135 | model.contw_strings: contw_string, model.quesw_strings: quesw_string, 136 | model.y_start_: y_start, model.y_end_: y_end, 137 | model.dropout: config['dropout']}) 138 | sum_loss += loss_value 139 | sum_rl_loss += rl_loss_value 140 | last_train_str = "\r[epoch:%d/%d, steps:%d/%d] loss:%.4f rl_loss:%.4f" % ( 141 | epoch + 1, config['epoch'], i + 1, n_batch, sum_loss/(i+1), rl_loss_value) 142 | print(last_train_str, end=' ', flush=True) 143 | # print('sf1:',sampled_f1) 144 | # print('gf1:',greedy_f1) 145 | if (i+1)%config['per_steps']==0 or i+1==n_batch: 146 | # validating step 147 | sum_loss_val = 0 148 | sum_rl_loss_val = 0 149 | y1s = [] 150 | y2s = [] 151 | last_val_str = "\r" 152 | for i in range(n_batch_val): 153 | contw_input, quesw_input, contc_input, quesc_input, contw_string, quesw_string, y_start, y_end \ 154 | = next_batch(val_set, config['batch_size'], i) 155 | loss_value, rl_loss_value, y1, y2 = sess.run([model.loss, model.rl_loss, model.output1, model.output2], 156 | feed_dict={model.contw_input_: contw_input, 157 | model.quesw_input_: quesw_input, 158 | model.contc_input_: contc_input, 159 | model.quesc_input_: quesc_input, 160 | model.contw_strings: contw_string, 161 | model.quesw_strings: quesw_string, 162 | model.y_start_: y_start, model.y_end_: y_end}) 163 | y1s.append(y1) 164 | y2s.append(y2) 165 | sum_loss_val += loss_value 166 | sum_rl_loss_val += rl_loss_value 167 | last_val_str = last_train_str + " [validate:%d/%d] loss:%.4f rl_loss:%.4f" % ( 168 | i + 1, n_batch_val, sum_loss_val / (i + 1), rl_loss_value) 169 | print(last_val_str, end=' ', 
flush=True) 170 | y1s = np.concatenate(y1s) 171 | y2s = np.concatenate(y2s) 172 | answer_dict, _, noanswer_num = util.convert_tokens(eval_file, val_qid.tolist(), y1s.tolist(), 173 | y2s.tolist(), 174 | data_type=1) 175 | metrics = util.evaluate(eval_file, answer_dict) 176 | ems.append(metrics['exact_match']) 177 | f1s.append(metrics['f1']) 178 | 179 | if metrics['f1'] > best_f1: 180 | best_f1 = metrics['f1'] 181 | saver.save(sess, os.path.join('model', config['path'], 'model.ckpt'), 182 | global_step=(epoch + 1) * n_batch) 183 | 184 | print(last_val_str, 185 | " -EM: %.2f%%, -F1: %.2f%% -Noanswer: %d" % (metrics['exact_match'], metrics['f1'], noanswer_num), 186 | end=' ', flush=True) 187 | print('\n') 188 | 189 | result = pd.DataFrame([ems, f1s], index=['em', 'f1']).transpose() 190 | result.to_csv('log/result_' + config['path'] + '.csv', index=None) 191 | 192 | saver.save(sess, os.path.join('model', config['path'], 'model.ckpt'), global_step=config['epoch'] * n_batch) -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensor2tensor.layers.common_layers import conv1d 3 | from tensorflow.contrib.cudnn_rnn import CudnnLSTM 4 | from tensorflow.contrib.keras import backend 5 | from tensorflow.contrib.layers import variance_scaling_initializer, l2_regularizer 6 | 7 | initializer = lambda: variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32) 8 | initializer_relu = lambda: variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False, dtype=tf.float32) 9 | regularizer = l2_regularizer(scale=3e-7) 10 | 11 | 12 | # cudnnLSTM 13 | def BiLSTM(x, filters, dropout=0.0, name='BiLSTM', layers=1, return_state=False): 14 | cudnn_lstm = CudnnLSTM(layers, filters, direction='bidirectional', name=name) 15 | if type(x) == list: 16 | assert len(x) == 2 17 | x1, x2 = x 18 | # cudnn compatibility: time first, batch second 19 | x1 = tf.transpose(x1, [1, 0, 2]) 20 | x2 = tf.transpose(x2, [1, 0, 2]) 21 | x1, x1_state = cudnn_lstm(x1) # state:[2, bs, dim] 22 | x2, x2_state = cudnn_lstm(x2) 23 | x1 = tf.transpose(x1, [1, 0, 2]) 24 | x2 = tf.transpose(x2, [1, 0, 2]) 25 | x1_state = tf.concat(tf.unstack(x1_state[0], axis=0), axis=-1) 26 | x2_state = tf.concat(tf.unstack(x2_state[0], axis=0), axis=-1) 27 | if return_state: 28 | return tf.nn.dropout(x1_state, 1 - dropout), tf.nn.dropout(x2_state, 1 - dropout) 29 | else: 30 | return tf.nn.dropout(x1, 1 - dropout), tf.nn.dropout(x2, 1 - dropout) 31 | else: 32 | # cudnn compatibility: time first, batch second 33 | x = tf.transpose(x, [1, 0, 2]) 34 | x, x_state = cudnn_lstm(x) 35 | if return_state: 36 | x_state = tf.concat(tf.unstack(x_state[0], axis=0), axis=-1) 37 | return tf.nn.dropout(x_state, 1 - dropout) 38 | else: 39 | x = tf.transpose(x, [1, 0, 2]) 40 | return tf.nn.dropout(x, 1 - dropout) 41 | 42 | 43 | def exp_mask(inputs, mask, mask_value=-1e30): 44 | mask = tf.cast(mask, tf.float32) 45 | return inputs + mask_value * (1 - mask) 46 | 47 | 48 | def align_block(u, v, c_mask, q_mask, Lambda, filters=128, E_0=None, B_0=None, Z_0=None, dropout=0.0): 49 | with tf.variable_scope("Interactive_Alignment"): 50 | # attention 51 | u_ = tf.nn.relu(conv1d(u, filters, 1, name="Wu")) # [bs, len_c, dim] 52 | v_ = tf.nn.relu(conv1d(v, filters, 1, name="Wv")) # [bs, len_q, dim] 53 | E = tf.matmul(v_, u_, transpose_b=True) # [bs, len_q, len_c] 54 | if E_0 is not None: 55 | E += (Lambda 
* E_0) 56 | E_ = tf.nn.softmax(exp_mask(E, tf.expand_dims(q_mask, axis=-1)), axis=1) # [bs, len_q, len_c] 57 | v_E = tf.matmul(E_, v, transpose_a=True) # [bs, len_c, dim] 58 | 59 | # fusion 60 | uv = tf.concat([u, v_E, u * v_E, u - v_E], axis=-1) 61 | x = tf.nn.relu(conv1d(uv, filters, 1, name='Wr')) 62 | g = tf.nn.sigmoid(conv1d(uv, filters, 1, name='Wg')) 63 | h = g * x + (1 - g) * u # [bs, len_c, dim] 64 | 65 | with tf.variable_scope("Self_Alignment"): 66 | # attention 67 | h_1 = tf.nn.relu(conv1d(h, filters, 1, name='Wh1')) 68 | h_2 = tf.nn.relu(conv1d(h, filters, 1, name='Wh2')) 69 | B = tf.matmul(h_2, h_1, transpose_b=True) # [bs, len_c, len_c] 70 | if B_0 is not None: 71 | B += (Lambda * B_0) 72 | B_ = tf.nn.softmax(exp_mask(B, tf.expand_dims(c_mask, axis=-1)), axis=1) # [bs, len_c, len_c] 73 | h_B = tf.matmul(B_, h, transpose_a=True) 74 | 75 | # fusion 76 | hh = tf.concat([h, h_B, h * h_B, h - h_B], axis=-1) 77 | x = tf.nn.relu(conv1d(hh, filters, 1, name='Wr')) 78 | g = tf.nn.sigmoid(conv1d(hh, filters, 1, name='Wg')) 79 | Z = g * x + (1 - g) * h # [bs, len_c, dim] 80 | 81 | with tf.variable_scope("Evidence_Collection"): 82 | if Z_0 is not None: 83 | Z = tf.concat([Z, Z_0[0], Z_0[1]], axis=-1) 84 | R = BiLSTM(Z, filters // 2, name='bilstm', dropout=dropout) # [bs, len_c, dim] 85 | 86 | # return the E_t, B_t 87 | E_t = tf.nn.softmax(exp_mask(E, tf.expand_dims(c_mask, axis=1)), axis=-1) # [bs, len_q, len_c] 88 | E_t = tf.matmul(E_t, B_) 89 | B_t = tf.nn.softmax(exp_mask(B, tf.expand_dims(c_mask, axis=1)), axis=-1) # [bs, len_c, len_c] 90 | B_t = tf.matmul(B_t, B_) 91 | 92 | return R, Z, E_t, B_t 93 | 94 | 95 | def summary_vector(q_emb, c_maxlen, mask): 96 | with tf.variable_scope("Question_Summary"): 97 | alpha = tf.nn.softmax(exp_mask(tf.squeeze(conv1d(q_emb, 1, 1), axis=-1), mask)) 98 | s = tf.expand_dims(alpha, axis=-1) * q_emb 99 | s = tf.reduce_sum(s, axis=1, keepdims=True) # [bs, 1, dim] 100 | s = tf.tile(s, [1, c_maxlen, 1]) # [bs, len_c, dim] 101 | return s 102 | 103 | 104 | def start_logits(R, s, mask, filters=128, name='Start_Pointer'): 105 | with tf.variable_scope(name): 106 | if R.get_shape()[-1] == s.get_shape()[-1]: 107 | logits1 = tf.concat([R, s, R * s, R - s], axis=-1) 108 | else: 109 | logits1 = tf.concat([R, s], axis=-1) 110 | logits1 = tf.nn.tanh(conv1d(logits1, filters, 1, name='Wt')) 111 | logits1 = tf.squeeze(conv1d(logits1, 1, 1, name='Wf'), axis=-1) 112 | logits1 = exp_mask(logits1, mask) 113 | return logits1 114 | 115 | 116 | def end_logits(R, logits1, s, mask, filters=128, name='End_Pointer'): 117 | with tf.variable_scope(name): 118 | l = R * tf.expand_dims(tf.nn.softmax(logits1, axis=-1), axis=-1) # [bs, len_c, dim] 119 | if s.get_shape()[-1] == l.get_shape()[-1]: 120 | s_ = tf.concat([s, l, s * l, s - l], axis=-1) 121 | else: 122 | s_ = tf.concat([s, l], axis=-1) 123 | x = tf.nn.relu(conv1d(s_, filters, 1, name='Wr')) # [bs, len_c, dim] 124 | g = tf.nn.sigmoid(conv1d(s_, filters, 1, name='Wg')) # [bs, len_c, dim] 125 | s_ = g * x + (1 - g) * s # [bs, len_c, dim] 126 | 127 | if R.get_shape()[-1] == s_.get_shape()[-1]: 128 | logits2 = tf.concat([R, s_, R * s_, R - s_], axis=-1) 129 | else: 130 | logits2 = tf.concat([R, s_], axis=-1) 131 | logits2 = tf.nn.tanh(conv1d(logits2, filters, 1, name='Wt')) 132 | logits2 = tf.squeeze(conv1d(logits2, 1, 1, name='Wf'), axis=-1) 133 | logits2 = exp_mask(logits2, mask) 134 | return logits2 135 | 136 | 137 | def ElmoCombineLayer(elmo_feats, name): # [bs, len, 3, 1024] 138 | n_lm_layers = int(elmo_feats.get_shape()[2]) # 
3 139 | W = tf.get_variable( 140 | '{}_ELMo_W'.format(name), 141 | shape=(n_lm_layers,), 142 | initializer=tf.zeros_initializer, 143 | regularizer=regularizer, 144 | trainable=True, 145 | ) 146 | normed_weights = tf.split(tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers) # [1]*3 147 | # split LM layers 148 | layers = tf.split(elmo_feats, n_lm_layers, axis=2) # [bs, len, 1, 1024]*3 149 | 150 | # compute the weighted, normalized LM activations 151 | pieces = [] 152 | for w, t in zip(normed_weights, layers): 153 | pieces.append(w * tf.squeeze(t, axis=2)) 154 | sum_pieces = tf.add_n(pieces) 155 | 156 | # scale the weighted sum by gamma 157 | gamma = tf.get_variable( 158 | '{}_ELMo_gamma'.format(name), 159 | shape=(1,), 160 | initializer=tf.ones_initializer, 161 | regularizer=None, 162 | trainable=True, 163 | ) 164 | return sum_pieces * gamma # [bs, len, 1024] 165 | 166 | 167 | def CoveCombineLayer(cove_feats, name): # [bs, len, 2, 600] 168 | n_lm_layers = int(cove_feats.get_shape()[2]) # 2 169 | W = tf.get_variable( 170 | '{}_Cove_W'.format(name), 171 | shape=(n_lm_layers,), 172 | initializer=tf.zeros_initializer, 173 | regularizer=regularizer, 174 | trainable=True, 175 | ) 176 | normed_weights = tf.split(tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers) # [1]*2 177 | # split LM layers 178 | layers = tf.split(cove_feats, n_lm_layers, axis=2) # [bs, len, 1, 600]*2 179 | 180 | # compute the weighted, normalized LM activations 181 | pieces = [] 182 | for w, t in zip(normed_weights, layers): 183 | pieces.append(w * tf.squeeze(t, axis=2)) 184 | sum_pieces = tf.add_n(pieces) 185 | 186 | # scale the weighted sum by gamma 187 | gamma = tf.get_variable( 188 | '{}_Cove_gamma'.format(name), 189 | shape=(1,), 190 | initializer=tf.ones_initializer, 191 | regularizer=None, 192 | trainable=True, 193 | ) 194 | return sum_pieces * gamma # [bs, len, 600] 195 | 196 | 197 | def optimized_trilinear_for_attention(args, c_maxlen, q_maxlen, input_keep_prob=1.0, scope='efficient_trilinear', 198 | bias_initializer=tf.zeros_initializer(), kernel_initializer=initializer()): 199 | assert len(args) == 2, "just use for computing attention with two input" 200 | arg0_shape = args[0].get_shape().as_list() 201 | arg1_shape = args[1].get_shape().as_list() 202 | if len(arg0_shape) != 3 or len(arg1_shape) != 3: 203 | raise ValueError("`args` must be 3 dims (batch_size, len, dimension)") 204 | if arg0_shape[2] != arg1_shape[2]: 205 | raise ValueError("the last dimension of `args` must equal") 206 | arg_size = arg0_shape[2] 207 | dtype = args[0].dtype 208 | droped_args = [tf.nn.dropout(arg, input_keep_prob) for arg in args] 209 | with tf.variable_scope(scope): 210 | weights4arg0 = tf.get_variable( 211 | "linear_kernel4arg0", [arg_size, 1], 212 | dtype=dtype, 213 | regularizer=regularizer, 214 | initializer=kernel_initializer) 215 | weights4arg1 = tf.get_variable( 216 | "linear_kernel4arg1", [arg_size, 1], 217 | dtype=dtype, 218 | regularizer=regularizer, 219 | initializer=kernel_initializer) 220 | weights4mlu = tf.get_variable( 221 | "linear_kernel4mul", [1, 1, arg_size], 222 | dtype=dtype, 223 | regularizer=regularizer, 224 | initializer=kernel_initializer) 225 | biases = tf.get_variable( 226 | "linear_bias", [1], 227 | dtype=dtype, 228 | regularizer=regularizer, 229 | initializer=bias_initializer) 230 | subres0 = tf.tile(backend.dot(droped_args[0], weights4arg0), [1, 1, q_maxlen]) 231 | subres1 = tf.tile(tf.transpose(backend.dot(droped_args[1], weights4arg1), perm=(0, 2, 1)), [1, c_maxlen, 1]) 232 | subres2 = 
backend.batch_dot(droped_args[0] * weights4mlu, tf.transpose(droped_args[1], perm=(0, 2, 1))) 233 | res = subres0 + subres1 + subres2 234 | res += biases 235 | return res 236 | 237 | 238 | def ElmoAttention(inputs, c_maxlen, q_maxlen, q_mask, dropout): 239 | c, q = inputs 240 | S = optimized_trilinear_for_attention([c, q], c_maxlen, q_maxlen, input_keep_prob=1. - dropout, 241 | scope='elmo_efficient_trilinear') 242 | q_mask = tf.expand_dims(q_mask, 1) 243 | S_ = tf.nn.softmax(exp_mask(S, mask=q_mask)) 244 | c2q = tf.matmul(S_, q) 245 | return tf.concat([c, c2q], axis=-1) 246 | 247 | 248 | def total_params(exclude=None): 249 | total_parameters = 0 250 | if exclude is not None: 251 | trainable_variables = list(set(tf.trainable_variables()) ^ set(tf.trainable_variables(exclude))) 252 | else: 253 | trainable_variables = tf.trainable_variables() 254 | for variable in trainable_variables: 255 | shape = variable.get_shape() 256 | variable_parametes = 1 257 | try: 258 | for dim in shape: 259 | variable_parametes *= dim.value 260 | total_parameters += variable_parametes 261 | except: 262 | print(shape, 'cudnn weights is unknown') 263 | print("Total number of trainable parameters: {}".format(total_parameters)) 264 | -------------------------------------------------------------------------------- /train_h5py.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import RMR_modelV6 as RMR 4 | import tensorflow.contrib.slim as slim 5 | from util.util import * 6 | import tensorflow as tf 7 | import pandas as pd 8 | from util.h5py_generator import Generator 9 | from util.log_wrapper import create_logger 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = '4' 12 | 13 | if __name__ == '__main__': 14 | 15 | data_source = 'dataset' 16 | 17 | config = { 18 | 'char_dim': 300, 19 | 'cont_limit': 400, 20 | 'ques_limit': 50, 21 | 'char_limit': 16, 22 | 'ans_limit': -1, 23 | 'filters': 128, 24 | 'char_filters': 100, 25 | 'dropout': 0.175, 26 | 'dropout_emb': 0.15, 27 | 'dropout_att': 0.2, 28 | 'dropout_rnn': 0.15, 29 | 'l2_norm': 3e-7, 30 | 'decay': 1, 31 | 'gamma_b': 0.3, 32 | 'gamma_c': 1.0, 33 | 'init_lambda': 3.0, 34 | 'learning_rate': 1e-3, 35 | 'shuffle_size': 25000, 36 | 'grad_clip': 5.0, 37 | 'use_elmo': 0, 38 | 'use_cove': 0, 39 | 'use_feat': True, 40 | 'use_rlloss': False, 41 | 'rlw': 0.0, 42 | 'rlw2': 0.8, 43 | 'optimizer': 'adam', 44 | 'cove_path': '../SAN_tf/Keras_CoVe_2layers.h5', 45 | 'elmo_weights_path': '../SAN_tf/elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 46 | 'elmo_options_path': '../SAN_tf/elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 47 | 'train_tfrecords': '../QANet_tf/tfrecords/train_pre_elmo_cove3.tfrecords', 48 | 'dev_tfrecords': '../QANet_tf/tfrecords/dev_pre_elmo_cove3.tfrecords', 49 | 'batch_size': 32, 50 | 'epoch': 30, 51 | 'origin_path': None, # not finetune 52 | 'path': 'RMRV102' 53 | } 54 | 55 | global logger 56 | logger = create_logger(__name__, to_disk=True, log_file='log/' + config['path'] + '.log') 57 | 58 | logger.info('loading data...') 59 | train_qid = np.load(data_source + '/train_qid.npy').astype(np.int32) 60 | dev_qid = np.load(data_source + '/dev_qid.npy').astype(np.int32) 61 | with open(data_source + '/test_eval.json', "r") as fh: 62 | eval_file = json.load(fh) 63 | 64 | # load embedding matrix 65 | logger.info('loading embedding...') 66 | word_mat = np.load(data_source + '/word_emb_mat.npy') 67 | char_mat_fix = np.load(data_source + 
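
The point of `optimized_trilinear_for_attention` is that the trilinear score `w_c·c_i + w_q·q_j + w_m·(c_i ⊙ q_j)` never needs the full `[len_c, len_q, dim]` tensor; each term is built from matrix products and broadcasting. A NumPy sketch with a brute-force check (illustration only, single example, no batch dimension):

```python
import numpy as np

def trilinear(c, q, w_c, w_q, w_m, b=0.0):
    # c: [len_c, d], q: [len_q, d]; score(i, j) = w_c·c_i + w_q·q_j + w_m·(c_i ⊙ q_j) + b
    part_c = (c @ w_c)[:, None]      # [len_c, 1], broadcast over question positions
    part_q = (q @ w_q)[None, :]      # [1, len_q], broadcast over context positions
    part_m = (c * w_m) @ q.T         # [len_c, len_q] without materializing c_i ⊙ q_j
    return part_c + part_q + part_m + b

rng = np.random.RandomState(0)
c, q = rng.randn(6, 8), rng.randn(4, 8)
w_c, w_q, w_m = rng.randn(8), rng.randn(8), rng.randn(8)
S = trilinear(c, q, w_c, w_q, w_m)
print(S.shape)   # (6, 4): one similarity score per (context, question) token pair

# brute-force check against the definition
i, j = 2, 3
ref = w_c @ c[i] + w_q @ q[j] + w_m @ (c[i] * q[j])
assert np.isclose(S[i, j], ref)
```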
'/char_emb_mat_fix.npy').astype(np.float32) 68 | char_mat_trainable = np.load(data_source + '/char_emb_mat_trainable.npy').astype(np.float32) 69 | 70 | logger.info('init model...') 71 | model = RMR.Model(config, word_mat=word_mat, char_mat_trainable=char_mat_trainable, char_mat_fix=char_mat_fix) 72 | sess_config = tf.ConfigProto(allow_soft_placement=True) 73 | sess_config.gpu_options.allow_growth = True 74 | best_f1 = 0 75 | best_em = 0 76 | f1s = [] 77 | ems = [] 78 | 79 | logger.info('init generator...') 80 | train_gen = Generator(data_source + '/train_data.h5', train_qid, batch_size=config['batch_size'], shuffle=True, 81 | use_elmo=config['use_elmo'], use_cove=config['use_cove'], 82 | elmo_path=data_source + '/train_ELMO_feats.h5', 83 | cove_path=data_source + '/train_COVE_feats.h5') 84 | dev_gen = Generator(data_source + '/dev_data.h5', dev_qid, batch_size=config['batch_size'], shuffle=False, 85 | use_elmo=config['use_elmo'], use_cove=config['use_cove'], 86 | elmo_path=data_source + '/dev_ELMO_feats.h5', cove_path=data_source + '/dev_COVE_feats.h5') 87 | 88 | logger.info('starting session...') 89 | with tf.Session(config=sess_config) as sess: 90 | sess.run(tf.global_variables_initializer()) 91 | # scope with trainable weights 92 | variables_to_restore = slim.get_variables_to_restore(include=['Input_Embedding_Mat', 93 | 'Input_Embedding_Layer', 94 | 'Iterative_Reattention_Aligner', 95 | 'Answer_Pointer', 96 | 'EMA_Weights']) 97 | saver = tf.train.Saver(variables_to_restore, max_to_keep=10) 98 | if config['origin_path'] is not None and os.path.exists( 99 | os.path.join('model', config['origin_path'], 'checkpoint')): 100 | saver.restore(sess, tf.train.latest_checkpoint(os.path.join('model', str(config['origin_path']) + '/'))) 101 | 102 | use_rl=False 103 | for i_epoch in range(config['epoch']): 104 | if (i_epoch + 1) % 8 == 0: 105 | config['learning_rate'] *= 0.5 106 | # logger.warning('learning rate reduce to:' + str(config['learning_rate'])) 107 | # if config['learning_rate'] <= 2.5e-4: 108 | # use_rl=True 109 | # logger.warning('rl loss start...') 110 | # config['rlw'] = config['rlw2'] 111 | 112 | sum_loss = 0 113 | for i_batch in range(train_gen.max_batch): 114 | assert i_batch == train_gen.i_batch 115 | # if use_rl: 116 | # config['rlw'] = min(config['rlw2'], config['rlw']+config['rlw2']/5000) 117 | if i_batch == 1: 118 | t_start = time.time() 119 | data_batch = next(train_gen) 120 | feed_dict_ = {model.contw_input: data_batch['context_ids'], model.quesw_input: data_batch['ques_ids'], 121 | model.contc_input: data_batch['context_char_ids'], 122 | model.quesc_input: data_batch['ques_char_ids'], 123 | model.y_start: data_batch['y1'], model.y_end: data_batch['y2'], 124 | # model.yp_start: data_batch['y1p'], model.yp_end: data_batch['y2p'], 125 | model.un_size: data_batch['context_ids'].shape[0], 126 | model.dropout: config['dropout'], 127 | model.dropout_emb: config['dropout_emb'], 128 | model.dropout_att: config['dropout_att'], 129 | model.dropout_rnn: config['dropout_rnn'], 130 | model.learning_rate: config['learning_rate'], 131 | model.rlw: config['rlw']} 132 | if config['use_feat']: 133 | feed_dict_[model.cont_feat] = data_batch['context_feat'] 134 | feed_dict_[model.ques_feat] = data_batch['ques_feat'] 135 | if config['use_elmo'] == 1: 136 | feed_dict_[model.elmo_cont] = data_batch['elmo_cont'] 137 | feed_dict_[model.elmo_ques] = data_batch['elmo_ques'] 138 | if config['use_cove'] == 1: 139 | feed_dict_[model.cove_cont] = data_batch['cove_cont'] 140 | feed_dict_[model.cove_ques] = 
data_batch['cove_ques'] 141 | if config['decay'] < 1: 142 | loss_value, _ = sess.run([model.loss, model.ema_train_op], feed_dict=feed_dict_) 143 | else: 144 | loss_value, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict_) 145 | char_mat = sess.run(model.char_mat) 146 | # ipdb.set_trace() 147 | char_mat[-char_mat_fix.shape[0]:,::] = char_mat_fix 148 | _ = sess.run(model.assign_char_mat, feed_dict={model.old_char_mat:char_mat}) 149 | sum_loss += loss_value 150 | 151 | # # check embedding 152 | # fix_feat, tra_feat = sess.run([model.char_mat[-93:, :], model.char_mat[0:1140, :]]) 153 | # fix_feat = np.sum(fix_feat) 154 | # tra_feat = np.sum(tra_feat) 155 | # print('fix:', fix_feat) 156 | # print('trainable:', tra_feat) 157 | 158 | last_train_str = "[epoch:%d/%d, steps:%d/%d] -loss:%.4f" % (i_epoch + 1, config['epoch'], i_batch + 1, 159 | train_gen.max_batch, 160 | sum_loss / (i_batch + 1)) 161 | if i_batch > 0: 162 | last_train_str += (' -ETA:%ds' % cal_ETA(t_start, i_batch, train_gen.max_batch)) 163 | if i_batch % 100 == 0: 164 | logger.info(last_train_str) 165 | logger.info(last_train_str) 166 | 167 | # validating step 168 | # # save the temp weights and do ema 169 | # if config['decay'] < 1.0: 170 | # saver.save(sess, os.path.join('model', config['path'], 'temp_model.ckpt')) 171 | # sess.run(model.assign_vars) 172 | # print('EMA over...') 173 | logger.info('validating...') 174 | sum_loss_val = 0 175 | y1s = [] 176 | y2s = [] 177 | dev_gen.reset() 178 | for i_batch in range(dev_gen.max_batch): 179 | assert i_batch == dev_gen.i_batch 180 | data_batch = next(dev_gen) 181 | feed_dict_ = {model.contw_input: data_batch['context_ids'], model.quesw_input: data_batch['ques_ids'], 182 | model.contc_input: data_batch['context_char_ids'], 183 | model.quesc_input: data_batch['ques_char_ids'], 184 | model.y_start: data_batch['y1'], model.y_end: data_batch['y2'], 185 | # model.yp_start: data_batch['y1p'], model.yp_end: data_batch['y2p'], 186 | model.un_size: data_batch['context_ids'].shape[0]} 187 | if config['use_feat']: 188 | feed_dict_[model.cont_feat] = data_batch['context_feat'] 189 | feed_dict_[model.ques_feat] = data_batch['ques_feat'] 190 | if config['use_elmo'] == 1: 191 | feed_dict_[model.elmo_cont] = data_batch['elmo_cont'] 192 | feed_dict_[model.elmo_ques] = data_batch['elmo_ques'] 193 | if config['use_cove'] == 1: 194 | feed_dict_[model.cove_cont] = data_batch['cove_cont'] 195 | feed_dict_[model.cove_ques] = data_batch['cove_ques'] 196 | 197 | loss_value, y1, y2 = sess.run([model.loss, model.mask_output1, model.mask_output2], 198 | feed_dict=feed_dict_) 199 | y1s.append(y1) 200 | y2s.append(y2) 201 | sum_loss_val += loss_value 202 | 203 | y1s = np.concatenate(y1s) 204 | y2s = np.concatenate(y2s) 205 | answer_dict, _, noanswer_num = convert_tokens(eval_file, dev_qid.tolist(), y1s.tolist(), 206 | y2s.tolist(), data_type=1) 207 | metrics = evaluate(eval_file, answer_dict) 208 | ems.append(metrics['exact_match']) 209 | f1s.append(metrics['f1']) 210 | 211 | # if metrics['f1'] < f1s[-1]: 212 | # config['learning_rate'] *= 0.5 213 | # logger.warning('learning rate reduce to:' + str(config['learning_rate'])) 214 | # if config['learning_rate'] <= 1e-4: 215 | # logger.warning('rl loss start...') 216 | # config['rlw'] = config['rlw2'] 217 | 218 | if ems[-1] > best_em: 219 | best_em = ems[-1] 220 | if f1s[-1] > best_f1: 221 | best_f1 = f1s[-1] 222 | saver.save(sess, os.path.join('model', config['path'], 'model.ckpt'), 223 | global_step=(i_epoch + 1) * train_gen.max_batch) 224 | 
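
The `char_mat` handling in this loop implements a partially trainable embedding: the optimizer updates the whole matrix, then the rows belonging to the fixed character vocabulary are overwritten with their original values via `assign_char_mat`. A toy NumPy simulation of that bookkeeping (my own illustration, not repo code):

```python
import numpy as np

rng = np.random.RandomState(0)
char_mat_fix = rng.randn(4, 8).astype(np.float32)          # rows that must stay fixed
char_mat = np.concatenate([rng.randn(10, 8).astype(np.float32), char_mat_fix], axis=0)

def fake_train_step(mat):
    # pretend the optimizer nudged every row (in the real loop this is sess.run(train_op))
    return mat + 0.01 * rng.randn(*mat.shape).astype(np.float32)

for _ in range(3):
    char_mat = fake_train_step(char_mat)
    # undo any update to the fixed block, mirroring the assign_char_mat op above
    char_mat[-char_mat_fix.shape[0]:, :] = char_mat_fix

assert np.allclose(char_mat[-4:], char_mat_fix)   # fixed rows untouched after training
```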
logger.warning("-loss: %.4f -EM:%.2f%% (best: %.2f%%), -F1:%.2f%% (best: %.2f%%) -Noanswer:%d" % 225 | (sum_loss_val / (dev_gen.max_batch + 1), metrics['exact_match'], best_em, metrics['f1'], 226 | best_f1, noanswer_num)) 227 | # metrics = evaluate_acc(eval_file, answer_dict) 228 | # logger.warning("Has answer acc:%.2f%%, No answer acc::%.2f%%" % ( 229 | # metrics['has_answer_acc'] * 100, metrics['hasno_answer_acc'] * 100)) 230 | result = pd.DataFrame([ems, f1s], index=['em', 'f1']).transpose() 231 | result.to_csv('results/result_' + config['path'] + '.csv', index=None) 232 | 233 | # # recover the model 234 | # if config['decay'] < 1.0: 235 | # saver.restore(sess, os.path.join('model', config['path'], 'temp_model.ckpt')) 236 | # print('recover weights over...') 237 | -------------------------------------------------------------------------------- /layersV0.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensor2tensor.layers.common_layers import conv1d, dense 3 | from tensorflow.contrib.cudnn_rnn import CudnnLSTM 4 | from tensorflow.contrib.keras import backend 5 | from tensorflow.contrib.layers import variance_scaling_initializer, l2_regularizer 6 | 7 | initializer = lambda: variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32) 8 | initializer_relu = lambda: variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False, dtype=tf.float32) 9 | regularizer = l2_regularizer(scale=3e-7) 10 | 11 | 12 | # cudnnLSTM 13 | def BiLSTM(x, filters, dropout=0.0, name='BiLSTM', layers=1, return_state=False): 14 | cudnn_lstm = CudnnLSTM(layers, filters, direction='bidirectional', name=name) 15 | if type(x) == list: 16 | assert len(x) == 2 17 | x1, x2 = x 18 | # cudnn compatibility: time first, batch second 19 | x1 = tf.transpose(x1, [1, 0, 2]) 20 | x2 = tf.transpose(x2, [1, 0, 2]) 21 | x1, x1_state = cudnn_lstm(x1) # state:[2, bs, dim] 22 | x2, x2_state = cudnn_lstm(x2) 23 | x1 = tf.transpose(x1, [1, 0, 2]) 24 | x2 = tf.transpose(x2, [1, 0, 2]) 25 | x1_state = tf.concat(tf.unstack(x1_state[0], axis=0), axis=-1) 26 | x2_state = tf.concat(tf.unstack(x2_state[0], axis=0), axis=-1) 27 | if return_state: 28 | return tf.nn.dropout(x1_state, 1 - dropout), tf.nn.dropout(x2_state, 1 - dropout) 29 | else: 30 | return tf.nn.dropout(x1, 1 - dropout), tf.nn.dropout(x2, 1 - dropout) 31 | else: 32 | # cudnn compatibility: time first, batch second 33 | x = tf.transpose(x, [1, 0, 2]) 34 | x, x_state = cudnn_lstm(x) 35 | if return_state: 36 | x_state = tf.concat(tf.unstack(x_state[0], axis=0), axis=-1) 37 | return tf.nn.dropout(x_state, 1 - dropout) 38 | else: 39 | x = tf.transpose(x, [1, 0, 2]) 40 | return tf.nn.dropout(x, 1 - dropout) 41 | 42 | 43 | def exp_mask(inputs, mask, mask_value=-1e30): 44 | mask = tf.cast(mask, tf.float32) 45 | return inputs + mask_value * (1 - mask) 46 | 47 | 48 | def align_block(u, v, c_mask, q_mask, filters=128, dropout=0.0): 49 | with tf.variable_scope("Interactive_Alignment"): 50 | # attention 51 | E = tf.matmul(v, u, transpose_b=True) # [bs, len_q, len_c] 52 | E_ = tf.nn.softmax(exp_mask(E, tf.expand_dims(q_mask, axis=-1)), axis=1) # [bs, len_q, len_c] 53 | v_E = tf.matmul(E_, v, transpose_a=True) # [bs, len_c, dim] 54 | 55 | # fusion 56 | uv = tf.concat([u, v_E, u * v_E, u - v_E], axis=-1) 57 | x = tf.nn.tanh(conv1d(uv, filters, 1, name='Wr')) 58 | g = tf.nn.sigmoid(conv1d(uv, filters, 1, name='Wg')) 59 | h = g * x + (1 - g) * u # [bs, len_c, dim] 60 | 61 | with 
tf.variable_scope("Self_Alignment"): 62 | # attention 63 | B = tf.matmul(h, h, transpose_b=True) # [bs, len_c, len_c] 64 | B = tf.matrix_set_diag(B, tf.zeros([tf.shape(B)[0], tf.shape(B)[-1]])) 65 | B_ = tf.nn.softmax(exp_mask(B, tf.expand_dims(c_mask, axis=-1)), axis=1) # [bs, len_c, len_c] 66 | h_B = tf.matmul(B_, h, transpose_a=True) 67 | 68 | # fusion 69 | hh = tf.concat([h, h_B, h * h_B, h - h_B], axis=-1) 70 | x = tf.nn.tanh(conv1d(hh, filters, 1, name='Wr')) 71 | g = tf.nn.sigmoid(conv1d(hh, filters, 1, name='Wg')) 72 | Z = g * x + (1 - g) * h # [bs, len_c, dim] 73 | 74 | with tf.variable_scope("Evidence_Collection"): 75 | R = BiLSTM(Z, filters // 2, name='bilstm', dropout=dropout) # [bs, len_c, dim] 76 | 77 | return R 78 | 79 | 80 | def feed_forward(x, name, filters=128, dropout=0.0): 81 | x = tf.nn.relu(conv1d(x, filters, 1, name=name+'_FF1')) 82 | x = conv1d(x, filters, 1, name=name+'FF2') 83 | x = tf.nn.dropout(x, 1 - dropout) 84 | return x 85 | 86 | 87 | def answer_block(R, z1, filters, c_mask, dropout=0.0, return_logits=True): 88 | # start 89 | z_s = tf.tile(tf.expand_dims(z1, axis=1), [1, tf.shape(R)[1], 1]) # [bs, 1*c_len, dim] 90 | s = feed_forward(tf.concat([R, z_s, R * z_s], axis=-1), 'st', filters, dropout) # [bs, c_len, dim] 91 | s_logits = exp_mask(tf.squeeze(conv1d(s, 1, 1, name='Ws'), axis=-1), c_mask) # [bs, c_len] 92 | s = tf.expand_dims(tf.nn.softmax(s_logits), axis=-1) # [bs, c_len]->[bs, c_len, 1] 93 | 94 | # get z2 95 | u = tf.squeeze(tf.matmul(R, s, transpose_a=True), axis=-1) # [bs, dim, 1]->[bs, dim] 96 | zu = tf.concat([z1, u, z1 * u, z1 - u], axis=-1) 97 | z_s_ = tf.nn.tanh(dense(zu, filters, name='Wru')) 98 | g = tf.nn.sigmoid(dense(zu, filters, name='Wgu')) 99 | z2 = g * z_s_ + (1 - g) * z1 # [bs, dim] 100 | 101 | # end 102 | z_e = tf.tile(tf.expand_dims(z2, axis=1), [1, tf.shape(R)[1], 1]) # [bs, 1*c_len, dim] 103 | e = feed_forward(tf.concat([R, z_e, R * z_e], axis=-1), 'ed', filters, dropout) 104 | e_logits = exp_mask(tf.squeeze(conv1d(e, 1, 1, name='We'), axis=-1), c_mask) # [bs, c_len] 105 | e = tf.expand_dims(tf.nn.softmax(e_logits), axis=-1) 106 | 107 | # get z3 108 | v = tf.squeeze(tf.matmul(R, e, transpose_a=True), axis=-1) 109 | zv = tf.concat([z2, v, z2 * v, z2 - v], axis=-1) 110 | z_e_ = tf.nn.tanh(dense(zv, filters, name='Wrv')) 111 | g = tf.nn.sigmoid(dense(zv, filters, name='Wgv')) 112 | z3 = g * z_e_ + (1 - g) * z2 # [bs, dim] 113 | 114 | if return_logits: 115 | return s_logits, e_logits 116 | else: 117 | return z3 118 | 119 | 120 | def summary_vector(q_emb, c_maxlen, mask): 121 | with tf.variable_scope("Question_Summary"): 122 | alpha = tf.nn.softmax(exp_mask(tf.squeeze(conv1d(q_emb, 1, 1), axis=-1), mask)) 123 | s = tf.expand_dims(alpha, axis=-1) * q_emb 124 | s = tf.reduce_sum(s, axis=1, keepdims=True) # [bs, 1, dim] 125 | s = tf.tile(s, [1, c_maxlen, 1]) # [bs, len_c, dim] 126 | return s 127 | 128 | 129 | def start_logits(R, s, mask, filters=128, name='Start_Pointer'): 130 | with tf.variable_scope(name): 131 | if R.get_shape()[-1] == s.get_shape()[-1]: 132 | logits1 = tf.concat([R, s, R * s, R - s], axis=-1) 133 | else: 134 | logits1 = tf.concat([R, s], axis=-1) 135 | logits1 = tf.nn.tanh(conv1d(logits1, filters, 1, name='Wt')) 136 | logits1 = tf.squeeze(conv1d(logits1, 1, 1, name='Wf'), axis=-1) 137 | logits1 = exp_mask(logits1, mask) 138 | return logits1 139 | 140 | 141 | def end_logits(R, logits1, s, mask, filters=128, name='End_Pointer'): 142 | with tf.variable_scope(name): 143 | l = R * tf.expand_dims(tf.nn.softmax(logits1, 
axis=-1), axis=-1) # [bs, len_c, dim] 144 | if s.get_shape()[-1] == l.get_shape()[-1]: 145 | s_ = tf.concat([s, l, s * l, s - l], axis=-1) 146 | else: 147 | s_ = tf.concat([s, l], axis=-1) 148 | x = tf.nn.relu(conv1d(s_, filters, 1, name='Wr')) # [bs, len_c, dim] 149 | g = tf.nn.sigmoid(conv1d(s_, filters, 1, name='Wg')) # [bs, len_c, dim] 150 | s_ = g * x + (1 - g) * s # [bs, len_c, dim] 151 | 152 | if R.get_shape()[-1] == s_.get_shape()[-1]: 153 | logits2 = tf.concat([R, s_, R * s_, R - s_], axis=-1) 154 | else: 155 | logits2 = tf.concat([R, s_], axis=-1) 156 | logits2 = tf.nn.tanh(conv1d(logits2, filters, 1, name='Wt')) 157 | logits2 = tf.squeeze(conv1d(logits2, 1, 1, name='Wf'), axis=-1) 158 | logits2 = exp_mask(logits2, mask) 159 | return logits2 160 | 161 | 162 | def ElmoCombineLayer(elmo_feats, name): # [bs, len, 3, 1024] 163 | n_lm_layers = int(elmo_feats.get_shape()[2]) # 3 164 | W = tf.get_variable( 165 | '{}_ELMo_W'.format(name), 166 | shape=(n_lm_layers,), 167 | initializer=tf.zeros_initializer, 168 | regularizer=regularizer, 169 | trainable=True, 170 | ) 171 | normed_weights = tf.split(tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers) # [1]*3 172 | # split LM layers 173 | layers = tf.split(elmo_feats, n_lm_layers, axis=2) # [bs, len, 1, 1024]*3 174 | 175 | # compute the weighted, normalized LM activations 176 | pieces = [] 177 | for w, t in zip(normed_weights, layers): 178 | pieces.append(w * tf.squeeze(t, axis=2)) 179 | sum_pieces = tf.add_n(pieces) 180 | 181 | # scale the weighted sum by gamma 182 | gamma = tf.get_variable( 183 | '{}_ELMo_gamma'.format(name), 184 | shape=(1,), 185 | initializer=tf.ones_initializer, 186 | regularizer=None, 187 | trainable=True, 188 | ) 189 | return sum_pieces * gamma # [bs, len, 1024] 190 | 191 | 192 | def CoveCombineLayer(cove_feats, name): # [bs, len, 2, 600] 193 | n_lm_layers = int(cove_feats.get_shape()[2]) # 2 194 | W = tf.get_variable( 195 | '{}_Cove_W'.format(name), 196 | shape=(n_lm_layers,), 197 | initializer=tf.zeros_initializer, 198 | regularizer=regularizer, 199 | trainable=True, 200 | ) 201 | normed_weights = tf.split(tf.nn.softmax(W + 1.0 / n_lm_layers), n_lm_layers) # [1]*2 202 | # split LM layers 203 | layers = tf.split(cove_feats, n_lm_layers, axis=2) # [bs, len, 1, 600]*2 204 | 205 | # compute the weighted, normalized LM activations 206 | pieces = [] 207 | for w, t in zip(normed_weights, layers): 208 | pieces.append(w * tf.squeeze(t, axis=2)) 209 | sum_pieces = tf.add_n(pieces) 210 | 211 | # scale the weighted sum by gamma 212 | gamma = tf.get_variable( 213 | '{}_Cove_gamma'.format(name), 214 | shape=(1,), 215 | initializer=tf.ones_initializer, 216 | regularizer=None, 217 | trainable=True, 218 | ) 219 | return sum_pieces * gamma # [bs, len, 600] 220 | 221 | 222 | def optimized_trilinear_for_attention(args, c_maxlen, q_maxlen, input_keep_prob=1.0, scope='efficient_trilinear', 223 | bias_initializer=tf.zeros_initializer(), kernel_initializer=initializer()): 224 | assert len(args) == 2, "just use for computing attention with two input" 225 | arg0_shape = args[0].get_shape().as_list() 226 | arg1_shape = args[1].get_shape().as_list() 227 | if len(arg0_shape) != 3 or len(arg1_shape) != 3: 228 | raise ValueError("`args` must be 3 dims (batch_size, len, dimension)") 229 | if arg0_shape[2] != arg1_shape[2]: 230 | raise ValueError("the last dimension of `args` must equal") 231 | arg_size = arg0_shape[2] 232 | dtype = args[0].dtype 233 | droped_args = [tf.nn.dropout(arg, input_keep_prob) for arg in args] 234 | with 
tf.variable_scope(scope): 235 | weights4arg0 = tf.get_variable( 236 | "linear_kernel4arg0", [arg_size, 1], 237 | dtype=dtype, 238 | regularizer=regularizer, 239 | initializer=kernel_initializer) 240 | weights4arg1 = tf.get_variable( 241 | "linear_kernel4arg1", [arg_size, 1], 242 | dtype=dtype, 243 | regularizer=regularizer, 244 | initializer=kernel_initializer) 245 | weights4mlu = tf.get_variable( 246 | "linear_kernel4mul", [1, 1, arg_size], 247 | dtype=dtype, 248 | regularizer=regularizer, 249 | initializer=kernel_initializer) 250 | biases = tf.get_variable( 251 | "linear_bias", [1], 252 | dtype=dtype, 253 | regularizer=regularizer, 254 | initializer=bias_initializer) 255 | subres0 = tf.tile(backend.dot(droped_args[0], weights4arg0), [1, 1, q_maxlen]) 256 | subres1 = tf.tile(tf.transpose(backend.dot(droped_args[1], weights4arg1), perm=(0, 2, 1)), [1, c_maxlen, 1]) 257 | subres2 = backend.batch_dot(droped_args[0] * weights4mlu, tf.transpose(droped_args[1], perm=(0, 2, 1))) 258 | res = subres0 + subres1 + subres2 259 | res += biases 260 | return res 261 | 262 | 263 | def ElmoAttention(inputs, c_maxlen, q_maxlen, q_mask, dropout): 264 | c, q = inputs 265 | S = optimized_trilinear_for_attention([c, q], c_maxlen, q_maxlen, input_keep_prob=1. - dropout, 266 | scope='elmo_efficient_trilinear') 267 | q_mask = tf.expand_dims(q_mask, 1) 268 | S_ = tf.nn.softmax(exp_mask(S, mask=q_mask)) 269 | c2q = tf.matmul(S_, q) 270 | return tf.concat([c, c2q], axis=-1) 271 | 272 | 273 | def total_params(exclude=None): 274 | total_parameters = 0 275 | if exclude is not None: 276 | trainable_variables = list(set(tf.trainable_variables()) ^ set(tf.trainable_variables(exclude))) 277 | else: 278 | trainable_variables = tf.trainable_variables() 279 | for variable in trainable_variables: 280 | shape = variable.get_shape() 281 | variable_parametes = 1 282 | try: 283 | for dim in shape: 284 | variable_parametes *= dim.value 285 | total_parameters += variable_parametes 286 | except: 287 | print(shape, 'cudnn weights is unknown') 288 | print("Total number of trainable parameters: {}".format(total_parameters)) 289 | -------------------------------------------------------------------------------- /util/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import Counter 3 | import string 4 | import time 5 | import tensorflow as tf 6 | 7 | ''' 8 | This file is taken and modified from R-Net by HKUST-KnowComp 9 | https://github.com/HKUST-KnowComp/R-Net 10 | ''' 11 | 12 | 13 | def get_record_parser(config): 14 | def parser(example): 15 | if not config['data_type']: 16 | config['data_type'] = 2 17 | char_limit = config['char_limit'] 18 | features_ = { 19 | "context_ids": tf.FixedLenFeature([], tf.string), 20 | "ques_ids": tf.FixedLenFeature([], tf.string), 21 | "context_char_ids": tf.FixedLenFeature([], tf.string), 22 | "ques_char_ids": tf.FixedLenFeature([], tf.string), 23 | 'context_feat': tf.FixedLenFeature([], tf.string), 24 | 'ques_feat': tf.FixedLenFeature([], tf.string), 25 | 'elmo_context_feat': tf.FixedLenFeature([], tf.string), 26 | 'elmo_question_feat': tf.FixedLenFeature([], tf.string), 27 | 'cove_context_feat': tf.FixedLenFeature([], tf.string), 28 | 'cove_question_feat': tf.FixedLenFeature([], tf.string), 29 | "y1": tf.FixedLenFeature([], tf.string), 30 | "y2": tf.FixedLenFeature([], tf.string), 31 | "qid": tf.FixedLenFeature([], tf.int64) 32 | } 33 | if config['data_type'] == 2: 34 | features_['y1p'] = tf.FixedLenFeature([], tf.string) 35 | 
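
For reference, `get_record_parser` recovers variable-length arrays that were serialized as raw bytes. A hedged TensorFlow 1.x sketch of that round trip, using a single illustrative feature name rather than the repo's full schema:

```python
import numpy as np
import tensorflow as tf

ids = np.arange(7, dtype=np.int32)
example = tf.train.Example(features=tf.train.Features(feature={
    "context_ids": tf.train.Feature(bytes_list=tf.train.BytesList(value=[ids.tobytes()])),
}))
serialized = example.SerializeToString()

features = tf.parse_single_example(serialized, features={
    "context_ids": tf.FixedLenFeature([], tf.string),
})
context_ids = tf.reshape(tf.decode_raw(features["context_ids"], tf.int32), [-1])

with tf.Session() as sess:
    print(sess.run(context_ids))   # [0 1 2 3 4 5 6]
```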
features_['y2p'] = tf.FixedLenFeature([], tf.string) 36 | 37 | features = tf.parse_single_example(example, features=features_) 38 | context_idxs = tf.reshape(tf.decode_raw(features["context_ids"], tf.int32), [-1]) 39 | ques_idxs = tf.reshape(tf.decode_raw(features["ques_ids"], tf.int32), [-1]) 40 | context_char_idxs = tf.reshape(tf.decode_raw(features["context_char_ids"], tf.int32), [-1, char_limit]) 41 | ques_char_idxs = tf.reshape(tf.decode_raw(features["ques_char_ids"], tf.int32), [-1, char_limit]) 42 | context_feat = tf.reshape(tf.decode_raw(features["context_feat"], tf.float32), [-1, 73]) 43 | ques_feat = tf.reshape(tf.decode_raw(features["ques_feat"], tf.float32), [-1, 73]) 44 | elmo_context_feat = tf.reshape(tf.decode_raw(features['elmo_context_feat'], tf.float32), [-1, 3, 1024]) 45 | elmo_question_feat = tf.reshape(tf.decode_raw(features['elmo_question_feat'], tf.float32), [-1, 3, 1024]) 46 | cove_context_feat = tf.reshape(tf.decode_raw(features['cove_context_feat'], tf.float32), [-1, 2, 600]) 47 | cove_question_feat = tf.reshape(tf.decode_raw(features['cove_question_feat'], tf.float32), [-1, 2, 600]) 48 | y1 = tf.reshape(tf.decode_raw(features["y1"], tf.int32), [-1]) 49 | y2 = tf.reshape(tf.decode_raw(features["y2"], tf.int32), [-1]) 50 | if config['data_type'] == 2: 51 | y1p = tf.reshape(tf.decode_raw(features["y1p"], tf.int32), [-1]) 52 | y2p = tf.reshape(tf.decode_raw(features["y2p"], tf.int32), [-1]) 53 | # qid = features["qid"] 54 | if config['data_type'] == 2: 55 | return context_idxs, ques_idxs, \ 56 | context_char_idxs, ques_char_idxs, \ 57 | context_feat, ques_feat, \ 58 | elmo_context_feat, elmo_question_feat, \ 59 | cove_context_feat, cove_question_feat, \ 60 | y1, y2, y1p, y2p 61 | else: 62 | return context_idxs, ques_idxs, \ 63 | context_char_idxs, ques_char_idxs, \ 64 | context_feat, ques_feat, \ 65 | elmo_context_feat, elmo_question_feat, \ 66 | cove_context_feat, cove_question_feat, \ 67 | y1, y2 68 | 69 | return parser 70 | 71 | 72 | def convert_tokens(eval_file, qa_id, pp1, pp2, unanswer_id=-1, data_type=2): 73 | answer_dict = {} 74 | remapped_dict = {} 75 | noanswer_num = 0 76 | for qid, p1, p2 in zip(qa_id, pp1, pp2): 77 | context = eval_file[str(qid)]["context"] 78 | spans = eval_file[str(qid)]["spans"] 79 | uuid = eval_file[str(qid)]["uuid"] 80 | if data_type == 2: 81 | if p1 == unanswer_id or p2 == unanswer_id or p1 >= len(spans) or p2 >= len( 82 | spans): # prediction has no answer 83 | noanswer_num += 1 84 | answer_dict[str(qid)] = '' 85 | remapped_dict[uuid] = '' 86 | else: 87 | start_idx = spans[min(p1, len(spans) - 1)][0] 88 | end_idx = spans[min(p2, len(spans) - 1)][1] 89 | answer_dict[str(qid)] = context[start_idx: end_idx] 90 | remapped_dict[uuid] = context[start_idx: end_idx] 91 | else: 92 | start_idx = spans[min(p1, len(spans) - 1)][0] 93 | end_idx = spans[min(p2, len(spans) - 1)][1] 94 | answer_dict[str(qid)] = context[start_idx: end_idx] 95 | remapped_dict[uuid] = context[start_idx: end_idx] 96 | return answer_dict, remapped_dict, noanswer_num 97 | 98 | 99 | def evaluate(eval_file, answer_dict): 100 | f1 = exact_match = total = 0 101 | for key, value in answer_dict.items(): 102 | total += 1 103 | ground_truths = eval_file[key]["answers"] 104 | prediction = value 105 | if len(ground_truths) == 0: # ground truth has no answer 106 | if prediction == '': 107 | exact_match += 1 108 | f1 += 1 109 | else: 110 | exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths) 111 | f1 += metric_max_over_ground_truths(f1_score, 
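
`convert_tokens` turns predicted start/end token indices back into an answer string using the per-token character spans stored in `eval_file`. A small self-contained example of that mapping (the sentence and spans below are made up for illustration):

```python
context = "The model was trained on SQuAD."
spans = [(0, 3), (4, 9), (10, 13), (14, 21), (22, 24), (25, 30), (30, 31)]

def span_to_answer(context, spans, p1, p2, unanswer_id=-1):
    if p1 == unanswer_id or p2 == unanswer_id or p1 >= len(spans) or p2 >= len(spans):
        return ''                                  # treated as "no answer" (SQuAD 2.0 style)
    start = spans[min(p1, len(spans) - 1)][0]
    end = spans[min(p2, len(spans) - 1)][1]
    return context[start:end]

print(span_to_answer(context, spans, 3, 5))        # "trained on SQuAD"
```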
prediction, ground_truths) 112 | exact_match = 100.0 * exact_match / total 113 | f1 = 100.0 * f1 / total 114 | return {'exact_match': exact_match, 'f1': f1} 115 | 116 | 117 | def evaluate_acc(eval_file, answer_dict): 118 | has_answer_acc = 0 119 | has_answer_total = 0 120 | hasno_answer_acc = 0 121 | hasno_answer_total = 0 122 | for key, value in answer_dict.items(): 123 | ground_truths = eval_file[key]["answers"] 124 | prediction = value 125 | if len(ground_truths) != 0: # ground truth has answers 126 | has_answer_total += 1 127 | if prediction != '': 128 | has_answer_acc += 1 129 | else: 130 | hasno_answer_total += 1 131 | if prediction == '': 132 | hasno_answer_acc += 1 133 | print(has_answer_acc, '/', has_answer_total, hasno_answer_acc, '/', hasno_answer_total) 134 | has_answer_acc /= has_answer_total 135 | hasno_answer_acc /= hasno_answer_total 136 | return {'has_answer_acc': has_answer_acc, 'hasno_answer_acc': hasno_answer_acc} 137 | 138 | 139 | def evaluate_max(eval_file, answer_dict_list): 140 | f1 = exact_match = total = 0 141 | for key, value in answer_dict_list[0].items(): 142 | total += 1 143 | ground_truths = eval_file[key]["answers"] 144 | f1_temp = 0 145 | em_temp = 0 146 | for answer_dict in answer_dict_list: 147 | prediction = answer_dict[key] 148 | if len(ground_truths) == 0: # ground truth has no answer 149 | if prediction == 'unanswerable': 150 | em_temp = 1 151 | f1_temp = 1 152 | else: 153 | em_temp = max(metric_max_over_ground_truths(exact_match_score, prediction, ground_truths), em_temp) 154 | f1_temp = max(metric_max_over_ground_truths(f1_score, prediction, ground_truths), f1_temp) 155 | exact_match += em_temp 156 | f1 += f1_temp 157 | exact_match = 100.0 * exact_match / total 158 | f1 = 100.0 * f1 / total 159 | return {'exact_match': exact_match, 'f1': f1} 160 | 161 | 162 | def normalize_answer(s): 163 | def remove_articles(text): 164 | return re.sub(r'\b(a|an|the)\b', ' ', text) 165 | 166 | def white_space_fix(text): 167 | return ' '.join(text.split()) 168 | 169 | def remove_punc(text): 170 | exclude = set(string.punctuation) 171 | return ''.join(ch for ch in text if ch not in exclude) 172 | 173 | def lower(text): 174 | return text.lower() 175 | 176 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 177 | 178 | 179 | def f1_score(prediction, ground_truth): 180 | prediction_tokens = normalize_answer(prediction).split() 181 | ground_truth_tokens = normalize_answer(ground_truth).split() 182 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 183 | num_same = sum(common.values()) 184 | if num_same == 0: 185 | return 0 186 | precision = 1.0 * num_same / len(prediction_tokens) 187 | recall = 1.0 * num_same / len(ground_truth_tokens) 188 | f1 = (2 * precision * recall) / (precision + recall) 189 | return f1 190 | 191 | 192 | def exact_match_score(prediction, ground_truth): 193 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 194 | 195 | 196 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 197 | scores_for_ground_truths = [] 198 | for ground_truth in ground_truths: 199 | score = metric_fn(prediction, ground_truth) 200 | scores_for_ground_truths.append(score) 201 | return max(scores_for_ground_truths) 202 | 203 | 204 | def cal_ETA(t_start, i, n_batch): 205 | t_temp = time.time() 206 | t_avg = float(int(t_temp) - int(t_start)) / float(i + 1) 207 | if n_batch - i - 1 > 0: 208 | return int((n_batch - i - 1) * t_avg) 209 | else: 210 | return int(t_temp) - int(t_start) 211 | 212 | 213 | import 
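
A quick usage example of the SQuAD-style normalization and token-overlap F1 implemented above (the helpers are restated so the snippet runs on its own; the printed values follow directly from the formula):

```python
import re
import string
from collections import Counter

def normalize(s):
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

def squad_f1(prediction, ground_truth):
    pred, gt = normalize(prediction).split(), normalize(ground_truth).split()
    common = Counter(pred) & Counter(gt)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred)
    recall = num_same / len(gt)
    return 2 * precision * recall / (precision + recall)

print(squad_f1("the French Army", "French army"))             # 1.0 after normalization
print(round(squad_f1("in the French Army", "French army"), 3))  # 0.8 (partial overlap)
```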
numpy as np 214 | import h5py 215 | 216 | 217 | def batchify_train(data): 218 | def padding(datas): 219 | max_len = max([d.shape[0] for d in datas]) 220 | paded_datas = np.zeros([len(datas), max_len] + list(datas[0].shape[1:]), dtype=datas[0].dtype) 221 | for i in range(len(datas)): 222 | paded_datas[i, 0:datas[i].shape[0]] = datas[i] 223 | return paded_datas 224 | 225 | cont_ids, cont_char_ids, ques_ids, ques_char_ids = [], [], [], [] 226 | cont_feat, ques_feat, y1s, y2s, y1ps, y2ps = [], [], [], [], [], [] 227 | elmo_cont_feat, elmo_ques_feat = [], [] 228 | cove_cont_feat, cove_ques_feat = [], [] 229 | # load elmo 230 | with h5py.File('dataset_pre3/train_ELMO_feats.h5', 'r') as elmo_h5f: 231 | with h5py.File('dataset_pre3/train_COVE_feats.h5', 'r') as cove_h5f: 232 | with h5py.File('dataset_pre3/train_data.h5', 'r') as h5f: 233 | for qid in data: 234 | group = h5f[str(qid)] 235 | # base feats 236 | cont_ids.append(group['context_ids'][:]) 237 | cont_char_ids.append(group['context_char_ids'][:]) 238 | cont_feat.append(group['context_feat'][:]) 239 | ques_ids.append(group['ques_ids'][:]) 240 | ques_char_ids.append(group['ques_char_ids'][:]) 241 | ques_feat.append(group['ques_feat'][:]) 242 | # elmo feats 243 | elmo_cont_feat.append(elmo_h5f[str(qid) + 'c'][:]) 244 | elmo_ques_feat.append(elmo_h5f[str(qid) + 'q'][:]) 245 | # cove feats 246 | cove_cont_feat.append(cove_h5f[str(qid) + 'c'][:]) 247 | cove_ques_feat.append(cove_h5f[str(qid) + 'q'][:]) 248 | cont_ids = padding(cont_ids) 249 | cont_char_ids = padding(cont_char_ids) 250 | ques_ids = padding(ques_ids) 251 | ques_char_ids = padding(ques_char_ids) 252 | elmo_cont_feat = padding(elmo_cont_feat) 253 | elmo_ques_feat = padding(elmo_ques_feat) 254 | cove_cont_feat = padding(cove_cont_feat) 255 | cove_ques_feat = padding(cove_ques_feat) 256 | 257 | return cont_ids, cont_char_ids, ques_ids, ques_char_ids, elmo_cont_feat, elmo_ques_feat, cove_cont_feat, cove_ques_feat 258 | 259 | 260 | def batchify_dev(data): 261 | def padding(datas): 262 | max_len = max([d.shape[0] for d in datas]) 263 | paded_datas = np.zeros([len(datas), max_len] + list(datas[0].shape[1:]), dtype=datas[0].dtype) 264 | for i in range(len(datas)): 265 | paded_datas[i, 0:datas[i].shape[0]] = datas[i] 266 | return paded_datas 267 | 268 | cont_ids, cont_char_ids, ques_ids, ques_char_ids = [], [], [], [] 269 | cont_feat, ques_feat, y1s, y2s, y1ps, y2ps = [], [], [], [], [], [] 270 | elmo_cont_feat, elmo_ques_feat = [], [] 271 | cove_cont_feat, cove_ques_feat = [], [] 272 | # load elmo 273 | with h5py.File('dataset_pre3/dev_ELMO_feats.h5', 'r') as elmo_h5f: 274 | with h5py.File('dataset_pre3/dev_COVE_feats.h5', 'r') as cove_h5f: 275 | with h5py.File('dataset_pre3/dev_data.h5', 'r') as h5f: 276 | for qid in data: 277 | group = h5f[str(qid)] 278 | # base feats 279 | cont_ids.append(group['context_ids'][:]) 280 | cont_char_ids.append(group['context_char_ids'][:]) 281 | cont_feat.append(group['context_feat'][:]) 282 | ques_ids.append(group['ques_ids'][:]) 283 | ques_char_ids.append(group['ques_char_ids'][:]) 284 | ques_feat.append(group['ques_feat'][:]) 285 | # elmo feats 286 | elmo_cont_feat.append(elmo_h5f[str(qid) + 'c'][:]) 287 | elmo_ques_feat.append(elmo_h5f[str(qid) + 'q'][:]) 288 | # cove feats 289 | cove_cont_feat.append(cove_h5f[str(qid) + 'c'][:]) 290 | cove_ques_feat.append(cove_h5f[str(qid) + 'q'][:]) 291 | cont_ids = padding(cont_ids) 292 | cont_char_ids = padding(cont_char_ids) 293 | ques_ids = padding(ques_ids) 294 | ques_char_ids = padding(ques_char_ids) 295 | 
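
The `padding` helper inside `batchify_train`/`batchify_dev` zero-pads every array in a batch up to the longest one. A standalone NumPy version with a tiny example:

```python
import numpy as np

def pad_batch(arrays):
    # pad a list of [len_i, ...] arrays with zeros up to the longest one in the batch
    max_len = max(a.shape[0] for a in arrays)
    out = np.zeros([len(arrays), max_len] + list(arrays[0].shape[1:]), dtype=arrays[0].dtype)
    for i, a in enumerate(arrays):
        out[i, :a.shape[0]] = a
    return out

batch = [np.arange(3), np.arange(5), np.arange(2)]
print(pad_batch(batch))
# [[0 1 2 0 0]
#  [0 1 2 3 4]
#  [0 1 0 0 0]]
```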
elmo_cont_feat = padding(elmo_cont_feat) 296 | elmo_ques_feat = padding(elmo_ques_feat) 297 | cove_cont_feat = padding(cove_cont_feat) 298 | cove_ques_feat = padding(cove_ques_feat) 299 | 300 | return cont_ids, cont_char_ids, ques_ids, ques_char_ids, elmo_cont_feat, elmo_ques_feat, cove_cont_feat, cove_ques_feat 301 | -------------------------------------------------------------------------------- /train_tfrecords.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import RMR_modelV6_squad2 as RMR 4 | import tensorflow.contrib.slim as slim 5 | from util.util import * 6 | import tensorflow as tf 7 | import pandas as pd 8 | from util.log_wrapper import create_logger 9 | 10 | os.environ["CUDA_VISIBLE_DEVICES"] = '6' 11 | 12 | if __name__ == '__main__': 13 | 14 | data_source = '../QANet_tf/dataset_pre3' 15 | 16 | config = { 17 | 'char_dim': 300, 18 | 'cont_limit': 400, 19 | 'ques_limit': 50, 20 | 'char_limit': 16, 21 | 'ans_limit': -1, 22 | 'filters': 300, 23 | 'dropout': 0.175, 24 | 'dropout_emb': 0.15, 25 | 'dropout_att': 0.2, 26 | 'dropout_rnn': 0.1, 27 | 'l2_norm': 3e-7, 28 | 'decay': 1, 29 | 'gamma_b': 0.3, 30 | 'gamma_c': 1.0, 31 | 'init_lambda': 3.0, 32 | 'learning_rate': 1e-3, 33 | 'shuffle_size': 25000, 34 | 'grad_clip': 5.0, 35 | 'use_elmo': 1, 36 | 'use_cove': 1, 37 | 'use_feat': True, 38 | 'use_rlloss': False, 39 | 'rlw': 0.0, 40 | 'rlw2': 0.8, 41 | 'optimizer': 'adam', 42 | 'cove_path': '../SAN_tf/Keras_CoVe_2layers.h5', 43 | 'elmo_weights_path': '../SAN_tf/elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 44 | 'elmo_options_path': '../SAN_tf/elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 45 | 'train_tfrecords': '../QANet_tf/tfrecords/train_pre_elmo_cove3.tfrecords', 46 | 'dev_tfrecords': '../QANet_tf/tfrecords/dev_pre_elmo_cove3.tfrecords', 47 | 'batch_size': 32, 48 | 'epoch': 25, 49 | 'origin_path': None, # not finetune 50 | 'path': 'RMR200' 51 | } 52 | 53 | global logger 54 | logger = create_logger(__name__, to_disk=True, log_file='log/' + config['path'] + '.log') 55 | 56 | logger.info('loading data...') 57 | train_qid = np.load(data_source + '/train_qid.npy').astype(np.int32) 58 | dev_qid = np.load(data_source + '/dev_qid.npy').astype(np.int32) 59 | with open(data_source + '/test_eval.json', "r") as fh: 60 | eval_file = json.load(fh) 61 | 62 | # load embedding matrix 63 | logger.info('loading embedding...') 64 | word_mat = np.load(data_source + '/word_emb_mat.npy') 65 | char_mat_fix = np.load(data_source + '/char_emb_mat_fix.npy').astype(np.float32) 66 | char_mat_trainable = np.load(data_source + '/char_emb_mat_trainable.npy').astype(np.float32) 67 | 68 | logger.info('generate train tfrecords...') 69 | train_dataset = tf.data.TFRecordDataset(config['train_tfrecords']) \ 70 | .map(get_record_parser(config), num_parallel_calls=8) \ 71 | .shuffle(config['shuffle_size']) \ 72 | .padded_batch(config['batch_size'], padded_shapes=([None], 73 | [None], 74 | [None, None], 75 | [None, None], 76 | [None, None], 77 | [None, None], 78 | [None, None, None], 79 | [None, None, None], 80 | [None, None, None], 81 | [None, None, None], 82 | [None], 83 | [None], 84 | [None], 85 | [None])) 86 | train_iterator = train_dataset.make_initializable_iterator() 87 | train_next_element = train_iterator.get_next() 88 | train_sum = 129941 89 | 90 | logger.info('generate dev tfrecords...') 91 | dev_dataset = tf.data.TFRecordDataset(config['dev_tfrecords']) \ 92 | 
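
`train_tfrecords.py` relies on the `tf.data` pattern of per-batch padding plus an initializable iterator that is drained once per epoch. A minimal, hedged TensorFlow 1.x sketch of that pattern, using a toy generator instead of real TFRecords:

```python
import numpy as np
import tensorflow as tf

def gen():
    for n in [3, 5, 2, 4]:
        yield np.arange(n, dtype=np.int32)

dataset = (tf.data.Dataset.from_generator(gen, tf.int32, tf.TensorShape([None]))
           .padded_batch(2, padded_shapes=[None]))   # pad each batch to its own max length
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    for epoch in range(2):
        sess.run(iterator.initializer)               # re-initialize once per epoch
        while True:
            try:
                print(epoch, sess.run(next_element))
            except tf.errors.OutOfRangeError:        # dataset exhausted -> next epoch
                break
```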
.map(get_record_parser(config), num_parallel_calls=8) \ 93 | .padded_batch(config['batch_size'], padded_shapes=([None], 94 | [None], 95 | [None, None], 96 | [None, None], 97 | [None, None], 98 | [None, None], 99 | [None, None, None], 100 | [None, None, None], 101 | [None, None, None], 102 | [None, None, None], 103 | [None], 104 | [None], 105 | [None], 106 | [None])) 107 | dev_iterator = dev_dataset.make_initializable_iterator() 108 | dev_next_element = dev_iterator.get_next() 109 | dev_sum = 11730 110 | 111 | logger.info('init model...') 112 | model = RMR.Model(config, word_mat=word_mat, char_mat_trainable=char_mat_trainable, char_mat_fix=char_mat_fix) 113 | sess_config = tf.ConfigProto(allow_soft_placement=True) 114 | sess_config.gpu_options.allow_growth = True 115 | best_f1 = 0 116 | best_em = 0 117 | f1s = [] 118 | ems = [] 119 | 120 | with tf.Session(config=sess_config) as sess: 121 | sess.run(tf.global_variables_initializer()) 122 | # scope with trainable weights 123 | variables_to_restore = slim.get_variables_to_restore(include=['Input_Embedding_Mat', 124 | 'Input_Embedding_Layer', 125 | 'Iterative_Reattention_Aligner', 126 | 'Answer_Pointer', 127 | 'EMA_Weights']) 128 | saver = tf.train.Saver(variables_to_restore, max_to_keep=10) 129 | if config['origin_path'] is not None and os.path.exists( 130 | os.path.join('model', config['origin_path'], 'checkpoint')): 131 | saver.restore(sess, tf.train.latest_checkpoint(os.path.join('model', str(config['origin_path']) + '/'))) 132 | 133 | for i_epoch in range(config['epoch']): 134 | sess.run(train_iterator.initializer) 135 | i_batch = 0 136 | train_n_batch = train_sum // config['batch_size'] + 1 137 | val_n_batch = dev_sum // config['batch_size'] + 1 138 | # if i_epoch + 1 >= 8 and (i_epoch + 1) % 8 == 0: 139 | # config['learning_rate'] *= 0.5 140 | sum_loss = 0 141 | while True: 142 | try: 143 | if i_batch == 1: 144 | t_start = time.time() 145 | context_idxs, ques_idxs, \ 146 | context_char_idxs, ques_char_idxs, \ 147 | context_feat, ques_feat, \ 148 | elmo_context_feat, elmo_question_feat, \ 149 | cove_context_feat, cove_question_feat, \ 150 | y1, y2, y1p, y2p = sess.run(train_next_element) 151 | feed_dict_ = {model.contw_input: context_idxs, model.quesw_input: ques_idxs, 152 | model.contc_input: context_char_idxs, model.quesc_input: ques_char_idxs, 153 | model.y_start: y1, model.y_end: y2, 154 | model.yp_start: y1p, model.yp_end: y2p, 155 | model.un_size: context_idxs.shape[0], 156 | model.dropout: config['dropout'], 157 | model.dropout_emb: config['dropout_emb'], 158 | model.dropout_att: config['dropout_att'], 159 | model.dropout_rnn: config['dropout_rnn'], 160 | model.learning_rate: config['learning_rate'], 161 | model.rlw: config['rlw']} 162 | if config['use_feat']: 163 | feed_dict_[model.cont_feat] = context_feat 164 | feed_dict_[model.ques_feat] = ques_feat 165 | if config['use_elmo'] == 1: 166 | feed_dict_[model.elmo_cont] = elmo_context_feat 167 | feed_dict_[model.elmo_ques] = elmo_question_feat 168 | if config['use_cove'] == 1: 169 | feed_dict_[model.cove_cont] = cove_context_feat 170 | feed_dict_[model.cove_ques] = cove_question_feat 171 | if config['decay'] < 1: 172 | loss_value, _ = sess.run([model.loss, model.ema_train_op], feed_dict=feed_dict_) 173 | else: 174 | loss_value, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict_) 175 | char_mat = sess.run(model.char_mat) 176 | char_mat[-char_mat_fix.shape[0]:, ::] = char_mat_fix 177 | _ = sess.run(model.assign_char_mat, feed_dict={model.old_char_mat: char_mat}) 178 | 
sum_loss += loss_value 179 | 180 | # check embedding 181 | fix_feat, tra_feat = sess.run([model.char_mat[-93:, :], model.char_mat[0:1140, :]]) 182 | fix_feat = np.sum(fix_feat) 183 | tra_feat = np.sum(tra_feat) 184 | print('fix:', fix_feat) 185 | print('trainable:', tra_feat) 186 | 187 | last_train_str = "[epoch:%d/%d, steps:%d/%d] -loss:%.4f" % ( 188 | i_epoch + 1, config['epoch'], i_batch + 1, 189 | train_n_batch, sum_loss / (i_batch + 1)) 190 | if i_batch > 0: 191 | last_train_str += (' -ETA:%ds' % cal_ETA(t_start, i_batch, train_n_batch)) 192 | if i_batch % 100 == 0: 193 | logger.info(last_train_str) 194 | i_batch += 1 195 | except tf.errors.OutOfRangeError: 196 | logger.info(last_train_str) 197 | break 198 | 199 | # validating step 200 | # # save the temp weights and do ema 201 | # if config['decay'] < 1.0: 202 | # saver.save(sess, os.path.join('model', config['path'], 'temp_model.ckpt')) 203 | # sess.run(model.assign_vars) 204 | # print('EMA over...') 205 | sess.run(dev_iterator.initializer) 206 | logger.info('validating...') 207 | sum_loss_val = 0 208 | y1s = [] 209 | y2s = [] 210 | i_batch = 0 211 | while True: 212 | try: 213 | context_idxs, ques_idxs, \ 214 | context_char_idxs, ques_char_idxs, \ 215 | context_feat, ques_feat, \ 216 | elmo_context_feat, elmo_question_feat, \ 217 | cove_context_feat, cove_question_feat, \ 218 | y1, y2, y1p, y2p = sess.run(dev_next_element) 219 | feed_dict_ = {model.contw_input: context_idxs, model.quesw_input: ques_idxs, 220 | model.contc_input: context_char_idxs, model.quesc_input: ques_char_idxs, 221 | model.y_start: y1, model.y_end: y2, 222 | model.yp_start: y1p, model.yp_end: y2p, 223 | model.un_size: context_idxs.shape[0]} 224 | if config['use_feat']: 225 | feed_dict_[model.cont_feat] = context_feat 226 | feed_dict_[model.ques_feat] = ques_feat 227 | if config['use_elmo'] == 1: 228 | feed_dict_[model.elmo_cont] = elmo_context_feat 229 | feed_dict_[model.elmo_ques] = elmo_question_feat 230 | if config['use_cove'] == 1: 231 | feed_dict_[model.cove_cont] = cove_context_feat 232 | feed_dict_[model.cove_ques] = cove_question_feat 233 | loss_value, y1, y2 = sess.run([model.loss, model.mask_output1, model.mask_output2], 234 | feed_dict=feed_dict_) 235 | y1s.append(y1) 236 | y2s.append(y2) 237 | sum_loss_val += loss_value 238 | i_batch += 1 239 | except tf.errors.OutOfRangeError: 240 | y1s = np.concatenate(y1s) 241 | y2s = np.concatenate(y2s) 242 | answer_dict, _, noanswer_num = convert_tokens(eval_file, dev_qid.tolist(), y1s.tolist(), 243 | y2s.tolist(), data_type=2) 244 | metrics = evaluate(eval_file, answer_dict) 245 | ems.append(metrics['exact_match']) 246 | f1s.append(metrics['f1']) 247 | 248 | # if metrics['f1'] < f1s[-1]: 249 | # config['learning_rate'] *= 0.5 250 | # logger.warning('learning rate reduce to:' + str(config['learning_rate'])) 251 | # if config['learning_rate'] <= 1e-4: 252 | # logger.warning('rl loss start...') 253 | # config['rlw'] = config['rlw2'] 254 | 255 | if ems[-1] > best_em: 256 | best_em = ems[-1] 257 | if f1s[-1] > best_f1: 258 | best_f1 = f1s[-1] 259 | saver.save(sess, os.path.join('model', config['path'], 'model.ckpt'), 260 | global_step=(i_epoch + 1) * train_n_batch) 261 | logger.warning("-loss: %.4f -EM:%.2f%% (best: %.2f%%), -F1:%.2f%% (best: %.2f%%) -Noanswer:%d" % 262 | (sum_loss_val / (i_batch + 1), metrics['exact_match'], best_em, metrics['f1'], 263 | best_f1, noanswer_num)) 264 | metrics = evaluate_acc(eval_file, answer_dict) 265 | logger.warning("Has answer acc:%.2f%%, No answer acc::%.2f%%" % ( 266 | 
metrics['has_answer_acc'] * 100, metrics['hasno_answer_acc'] * 100)) 267 | result = pd.DataFrame([ems, f1s], index=['em', 'f1']).transpose() 268 | result.to_csv('results/result_' + config['path'] + '.csv', index=None) 269 | 270 | # # recover the model 271 | # if config['decay'] < 1.0: 272 | # saver.restore(sess, os.path.join('model', config['path'], 'temp_model.ckpt')) 273 | # print('recover weights over...') 274 | break 275 | -------------------------------------------------------------------------------- /RMR_modelV3.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from layersV0 import total_params, align_block, summary_vector, start_logits, end_logits, BiLSTM, ElmoAttention, \ 3 | ElmoCombineLayer, CoveCombineLayer, answer_block 4 | from bilm import BidirectionalLanguageModel, all_layers 5 | from keras.models import load_model 6 | from loss import rl_loss 7 | import numpy as np 8 | 9 | 10 | class Model(object): 11 | def __init__(self, config, word_mat=None, char_mat_trainable=None, char_mat_fix=None, test=False): 12 | 13 | # hyper-parameter 14 | self.char_dim = config['char_dim'] 15 | self.cont_limit = config['cont_limit'] if not test else 1000 16 | self.ques_limit = config['ques_limit'] if not test else 50 17 | self.char_limit = config['char_limit'] 18 | self.ans_limit = config['ans_limit'] 19 | self.filters = config['filters'] 20 | self.char_filters = config['char_filters'] 21 | self.batch_size = config['batch_size'] 22 | self.l2_norm = config['l2_norm'] 23 | self.decay = config['decay'] 24 | self.learning_rate = config['learning_rate'] 25 | self.grad_clip = config['grad_clip'] 26 | self.init_lambda = config['init_lambda'] 27 | self.gamma_b = config['gamma_b'] 28 | self.gamma_c = config['gamma_c'] 29 | self.use_elmo = config['use_elmo'] 30 | self.use_cove = config['use_cove'] 31 | self.use_feat = config['use_feat'] 32 | self.use_rlloss = config['use_rlloss'] 33 | self.dropout = tf.placeholder_with_default(0.0, (), name="dropout") 34 | self.dropout_rnn = tf.placeholder_with_default(0.0, (), name="dropout_rnn") 35 | self.dropout_emb = tf.placeholder_with_default(0.0, (), name="dropout_emb") 36 | self.dropout_att = tf.placeholder_with_default(0.0, (), name="dropout_att") 37 | self.un_size = tf.placeholder_with_default(self.batch_size, (), name="un_size") 38 | self.rlw = tf.placeholder_with_default(0.0, (), name="rlloss_weights") 39 | 40 | # embedding layer 41 | self.word_mat = tf.get_variable("word_mat", initializer=tf.constant(word_mat, dtype=tf.float32), 42 | trainable=False) 43 | with tf.variable_scope("Input_Embedding_Mat"): 44 | self.char_mat = tf.get_variable("char_mat", 45 | initializer=np.concatenate([char_mat_trainable, char_mat_fix], axis=0), 46 | trainable=True) 47 | 48 | # input tensor 49 | self.contw_input = tf.placeholder(tf.int32, [None, None], "context_word") 50 | self.quesw_input = tf.placeholder(tf.int32, [None, None], "question_word") 51 | self.contc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "context_char") 52 | self.quesc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "question_char") 53 | self.y_start = tf.placeholder(tf.int32, [None, None], "answer_start_index") 54 | self.y_end = tf.placeholder(tf.int32, [None, None], "answer_end_index") 55 | self.contw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'contw_elmo_id') 56 | self.quesw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'quesw_elmo_id') 57 | if self.use_feat: 58 | self.cont_feat = 
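
The dropout knobs in the model are created with `placeholder_with_default(0.0, ...)`, so evaluation can simply omit them from the feed dict and get zero dropout. A tiny TensorFlow 1.x sketch of that behaviour:

```python
import tensorflow as tf

dropout = tf.placeholder_with_default(0.0, (), name="dropout")
keep_prob = 1.0 - dropout

with tf.Session() as sess:
    print(sess.run(keep_prob))                              # 1.0 -> evaluation, nothing fed
    print(sess.run(keep_prob, feed_dict={dropout: 0.2}))    # 0.8 -> training
```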
tf.placeholder(tf.float32, [None, None, 73], "cont_feat") 59 | self.ques_feat = tf.placeholder(tf.float32, [None, None, 73], "ques_feat") 60 | self.old_char_mat = tf.placeholder(tf.float32, [None, None], "old_char_mat") 61 | self.assign_char_mat = tf.assign(self.char_mat, self.old_char_mat) 62 | 63 | # get mask & length for words & chars 64 | self.c_mask = tf.cast(self.contw_input, tf.bool) 65 | self.q_mask = tf.cast(self.quesw_input, tf.bool) 66 | self.cont_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1) 67 | self.ques_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 68 | 69 | # slice for maxlen in each batch 70 | self.c_maxlen = tf.reduce_max(self.cont_len) 71 | self.q_maxlen = tf.reduce_max(self.ques_len) 72 | 73 | # elmo features 74 | if self.use_elmo == 2: 75 | options_file = config['elmo_options_path'] 76 | weight_file = config['elmo_weights_path'] 77 | bilm = BidirectionalLanguageModel(options_file, weight_file) 78 | self.elmo_cont = all_layers(bilm(self.contw_elmo_id)) # [bs, 3, len, 1024] 79 | self.elmo_cont = tf.transpose(self.elmo_cont, [0, 2, 1, 3]) # [bs, len, 3, 1024] 80 | self.elmo_ques = all_layers(bilm(self.quesw_elmo_id)) 81 | self.elmo_ques = tf.transpose(self.elmo_ques, [0, 2, 1, 3]) 82 | elif self.use_elmo == 1: 83 | self.elmo_cont = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_cont') 84 | self.elmo_ques = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_ques') 85 | 86 | if self.use_cove == 2: 87 | with tf.variable_scope('Cove_Layer'): 88 | self.cove_model = load_model(config['cove_path']) 89 | elif self.use_cove == 1: 90 | self.cove_cont = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_cont') 91 | self.cove_ques = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_ques') 92 | 93 | # lr schedule 94 | self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32, 95 | initializer=tf.constant_initializer(0), trainable=False) 96 | 97 | self.learning_rate = tf.placeholder_with_default(config['learning_rate'], (), name="learning_rate") 98 | self.lr = self.learning_rate 99 | # self.lr = tf.minimum(self.learning_rate, 100 | # self.learning_rate / tf.log(999.) 
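
Masks and lengths are derived directly from the zero-padded word-id tensors (the PAD id 0 casts to False). The same arithmetic in NumPy, for a toy batch:

```python
import numpy as np

# padded word-id batch: 0 is the PAD id, so casting to bool recovers the mask
contw_input = np.array([[5, 8, 2, 0, 0],
                        [7, 3, 9, 4, 1]], dtype=np.int32)

c_mask = contw_input.astype(bool)      # True on real tokens
cont_len = c_mask.sum(axis=1)          # [3 5]
c_maxlen = cont_len.max()              # 5 -> slice every tensor in the batch to this length

print(c_mask)
print(cont_len, c_maxlen)
```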
* tf.log(tf.cast(self.global_step, tf.float32) + 1)) 101 | 102 | # initial model & complie 103 | self.build_model() 104 | total_params() 105 | self.complie() 106 | 107 | def build_model(self): 108 | with tf.variable_scope("Input_Embedding_Layer"): 109 | with tf.variable_scope("Char_Embedding_Layer"): 110 | # char embedding 111 | ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.contc_input), 112 | [-1, self.char_limit, self.char_dim]) 113 | qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.quesc_input), 114 | [-1, self.char_limit, self.char_dim]) 115 | ch_emb = tf.nn.dropout(ch_emb, 1 - self.dropout_emb) 116 | qh_emb = tf.nn.dropout(qh_emb, 1 - self.dropout_emb) 117 | 118 | ch_emb, qh_emb = BiLSTM([ch_emb, qh_emb], self.char_filters // 2, dropout=self.dropout_rnn, 119 | name='char_lstm') 120 | ch_emb = tf.reduce_max(ch_emb, axis=1) 121 | qh_emb = tf.reduce_max(qh_emb, axis=1) 122 | ch_emb = tf.reshape(ch_emb, [-1, self.c_maxlen, self.char_filters]) 123 | qh_emb = tf.reshape(qh_emb, [-1, self.q_maxlen, self.char_filters]) 124 | 125 | with tf.variable_scope("Word_Embedding_Layer"): 126 | # word embedding 127 | c_emb = tf.nn.embedding_lookup(self.word_mat, self.contw_input) 128 | q_emb = tf.nn.embedding_lookup(self.word_mat, self.quesw_input) 129 | c_emb = tf.nn.dropout(c_emb, 1.0 - self.dropout_emb) 130 | q_emb = tf.nn.dropout(q_emb, 1.0 - self.dropout_emb) 131 | 132 | # cove features 133 | if self.use_cove != 0: 134 | if self.use_cove == 2: 135 | self.cove_cont = tf.stop_gradient(self.cove_model(c_emb)) # [bs, c_len, 2, 600] 136 | self.cove_ques = tf.stop_gradient(self.cove_model(q_emb)) # [bs, q_len, 2, 600] 137 | with tf.variable_scope('Cove_weights', reuse=tf.AUTO_REUSE): 138 | cove_context_input = CoveCombineLayer(self.cove_cont, 'input') 139 | cove_question_input = CoveCombineLayer(self.cove_ques, 'input') 140 | c_emb = tf.concat([c_emb, cove_context_input], axis=-1) 141 | q_emb = tf.concat([q_emb, cove_question_input], axis=-1) 142 | 143 | # elmo features 144 | if self.use_elmo != 0: 145 | with tf.variable_scope('ELMo_weights', reuse=tf.AUTO_REUSE): 146 | elmo_context_input = ElmoCombineLayer(self.elmo_cont, 'input') 147 | elmo_question_input = ElmoCombineLayer(self.elmo_ques, 'input') 148 | elmo_context_output = ElmoCombineLayer(self.elmo_cont, 'output') 149 | elmo_question_output = ElmoCombineLayer(self.elmo_ques, 'output') 150 | c_emb = tf.concat([c_emb, elmo_context_input], axis=-1) 151 | q_emb = tf.concat([q_emb, elmo_question_input], axis=-1) 152 | 153 | if self.use_feat: 154 | c_emb = tf.concat([c_emb, self.cont_feat], axis=-1) 155 | q_emb = tf.concat([q_emb, self.ques_feat], axis=-1) 156 | 157 | # combine embedding feats 158 | c_emb = tf.concat([c_emb, ch_emb], axis=-1) 159 | q_emb = tf.concat([q_emb, qh_emb], axis=-1) 160 | 161 | # BiLSTM Embedding 162 | with tf.variable_scope("BiLSTM_Embedding_Layer"): 163 | c_emb, q_emb = BiLSTM([c_emb, q_emb], self.filters // 2, dropout=self.dropout_rnn, name='encoder') 164 | 165 | with tf.variable_scope("Iterative_Reattention_Aligner"): 166 | with tf.variable_scope("Aligning_Block1"): 167 | R = align_block(u=c_emb, 168 | v=q_emb, 169 | c_mask=self.c_mask, 170 | q_mask=self.q_mask, 171 | filters=self.filters, 172 | dropout=self.dropout_rnn) 173 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 174 | with tf.variable_scope("Aligning_Block2"): 175 | R = align_block(u=R, 176 | v=q_emb, 177 | c_mask=self.c_mask, 178 | q_mask=self.q_mask, 179 | filters=self.filters, 180 | dropout=self.dropout_rnn) 181 | R = 
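
The character encoder above runs a BiLSTM over each word's characters and max-pools over the character axis to get one vector per word. A NumPy sketch of just the pooling and reshape step (random numbers stand in for the BiLSTM outputs):

```python
import numpy as np

rng = np.random.RandomState(0)
bs, c_len, char_limit, char_filters = 2, 6, 16, 100

# stand-in for the BiLSTM outputs over each word's characters: [bs * c_len, char_limit, dim]
ch_states = rng.randn(bs * c_len, char_limit, char_filters)

# max-pool over the character axis, then restore the [batch, word, feature] layout
ch_emb = ch_states.max(axis=1).reshape(bs, c_len, char_filters)
print(ch_emb.shape)   # (2, 6, 100)
```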
tf.nn.dropout(R, 1.0 - self.dropout_att) 182 | 183 | with tf.variable_scope("Answer_Pointer"): 184 | z = tf.squeeze(tf.slice(q_emb, [0, tf.shape(q_emb)[1]-1, 0], [-1, 1, -1]), axis=1) # [bs, 1, dim]->[bs, dim] 185 | # logits 186 | if self.use_elmo != 0: 187 | R = tf.concat([R, elmo_context_output], axis=-1) 188 | z = tf.concat([z, elmo_question_output], axis=-1) 189 | 190 | with tf.variable_scope('Answer_Block1'): 191 | z = answer_block(R, z, self.filters, self.c_mask, dropout=self.dropout, return_logits=False) 192 | with tf.variable_scope('Answer_Block2'): 193 | logits1, logits2 = answer_block(R, z, self.filters, self.c_mask, dropout=self.dropout, return_logits=True) 194 | 195 | with tf.variable_scope("Loss_Layer"): 196 | # maximum-likelihood (ML) loss for dataset V2.0 197 | start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1, labels=self.y_start) 198 | end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2, labels=self.y_end) 199 | self.loss = tf.reduce_mean(start_loss + end_loss) 200 | 201 | # l2 loss 202 | if self.l2_norm is not None: 203 | decay_costs = [] 204 | for var in tf.trainable_variables(): 205 | decay_costs.append(tf.nn.l2_loss(var)) 206 | self.loss += tf.multiply(self.l2_norm, tf.add_n(decay_costs)) 207 | 208 | # RL loss 209 | if self.use_rlloss: 210 | with tf.variable_scope("Reinforcement_Loss"): 211 | self.rl_loss, _, _ = rl_loss(logits1, logits2, self.y_start, self.y_end, self.c_maxlen) 212 | self.loss += (self.rlw * self.rl_loss) 213 | 214 | with tf.variable_scope('Output_Layer'): 215 | softmax_start_scores = tf.nn.softmax(logits1) 216 | softmax_end_scores = tf.nn.softmax(logits2) 217 | 218 | outer = tf.matmul(tf.expand_dims(softmax_start_scores, axis=2), 219 | tf.expand_dims(softmax_end_scores, axis=1)) 220 | outer = tf.matrix_band_part(outer, 0, self.ans_limit) 221 | 222 | def position_encoding(x): 223 | import math 224 | for i in range(x.shape[0]): 225 | for j in range(x.shape[1]): 226 | if j - i > 5: 227 | x[i][j] = float(1.0 / math.log(j - i + 1)) 228 | return x 229 | 230 | mask_mat = tf.ones((self.c_maxlen, self.c_maxlen)) 231 | mask_mat = tf.expand_dims(tf.py_func(position_encoding, [mask_mat], tf.float32), axis=0) 232 | mask_mat = tf.tile(mask_mat, [self.un_size, 1, 1]) 233 | 234 | outer_masked = outer * mask_mat 235 | self.mask_output1 = tf.argmax(tf.reduce_max(outer_masked, axis=2), axis=1) 236 | self.mask_output2 = tf.argmax(tf.reduce_max(outer_masked, axis=1), axis=1) 237 | 238 | def complie(self): 239 | # self.opt = AdaMaxOptimizer(learning_rate=self.lr, beta1=0.9, beta2=0.999, epsilon=1e-7) 240 | self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7) 241 | grads = self.opt.compute_gradients(self.loss) 242 | gradients, variables = zip(*grads) 243 | capped_grads, _ = tf.clip_by_global_norm(gradients, self.grad_clip) 244 | self.train_op = self.opt.apply_gradients(zip(capped_grads, variables), global_step=self.global_step) 245 | 246 | # EMA 247 | with tf.variable_scope("EMA_Weights"): 248 | if self.decay is not None and self.decay < 1.: 249 | self.var_ema = tf.train.ExponentialMovingAverage(self.decay) 250 | with tf.control_dependencies([self.train_op]): 251 | self.ema_train_op = self.var_ema.apply( 252 | list(set(tf.trainable_variables()) ^ set(tf.trainable_variables('Cove_Layer')))) 253 | # assign ema weights 254 | self.assign_vars = [] 255 | for var in tf.global_variables(): 256 | v = self.var_ema.average(var) 257 | if v is not None: 258 | self.assign_vars.append(tf.assign(var, v)) 259 
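
The output layer decodes a span from the outer product of the start and end distributions: the band/upper triangle keeps end >= start, and spans longer than 5 tokens are damped by 1/log(length), as in the `position_encoding` mask above. A NumPy re-implementation of that decoding for a single example (illustration only):

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def decode_span(logits1, logits2, penalty_from=5):
    p_start, p_end = softmax(logits1), softmax(logits2)
    outer = np.outer(p_start, p_end)        # joint score for every (start, end) pair
    outer = np.triu(outer)                  # keep end >= start (band_part with ans_limit=-1)
    L = len(logits1)
    penalty = np.ones((L, L))
    for i in range(L):
        for j in range(L):
            if j - i > penalty_from:
                penalty[i, j] = 1.0 / np.log(j - i + 1)   # damp very long spans
    outer = outer * penalty
    start = int(outer.max(axis=1).argmax())
    end = int(outer.max(axis=0).argmax())
    return start, end

rng = np.random.RandomState(0)
s, e = decode_span(rng.randn(12), rng.randn(12))
print(s, e)   # start <= end by construction
```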
| 260 | # import numpy as np 261 | # 262 | # config = { 263 | # 'char_dim': 64, 264 | # 'cont_limit': 400, 265 | # 'ques_limit': 50, 266 | # 'char_limit': 16, 267 | # 'ans_limit': -1, 268 | # 'filters': 256, 269 | # 'dropout': 0.1, 270 | # 'dropout_emb': 0.1, 271 | # 'l2_norm': 3e-7, 272 | # 'decay': 0.9999, 273 | # 'gamma_c': 1.0, 274 | # 'gamma_b': 0.3, 275 | # 'learning_rate': 1e-3, 276 | # 'grad_clip': 5.0, 277 | # 'init_lambda': 3.0, 278 | # 'loss_type': 'use_plausible', 279 | # 'use_elmo': 0, 280 | # 'use_cove': 0, 281 | # 'use_feat': True, 282 | # 'optimizer': 'adam', 283 | # 'use_rlloss': False, 284 | # 'cove_path': 'Keras_CoVe_2layers.h5', 285 | # 'elmo_weights_path': 'elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 286 | # 'elmo_options_path': 'elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 287 | # 'train_tfrecords': 'tfrecords/train_pre_elmo_cove.tfrecords', 288 | # 'dev_tfrecords': 'tfrecords/dev_pre_elmo_cove.tfrecords', 289 | # 'batch_size': 24, 290 | # 'epoch': 40, 291 | # 'origin_path': None, # not finetune 292 | # 'path': 'QANetV253' 293 | # } 294 | # word_mat = np.random.random((90950, 300)).astype(np.float32) 295 | # char_mat2 = np.random.random((94, 300)).astype(np.float32) 296 | # char_mat = np.random.random((1171, 300)).astype(np.float32) 297 | # model = Model(config, word_mat, char_mat, char_mat2) 298 | -------------------------------------------------------------------------------- /RMR_modelV6.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from layers import total_params, align_block, summary_vector, start_logits, end_logits, BiLSTM, ElmoAttention, \ 3 | ElmoCombineLayer, CoveCombineLayer 4 | from bilm import BidirectionalLanguageModel, all_layers 5 | from keras.models import load_model 6 | from loss import rl_loss 7 | import numpy as np 8 | from util.Adamax import AdaMaxOptimizer 9 | 10 | class Model(object): 11 | def __init__(self, config, word_mat=None, char_mat_trainable=None, char_mat_fix=None, test=False): 12 | 13 | # hyper-parameter 14 | self.char_dim = config['char_dim'] 15 | self.cont_limit = config['cont_limit'] if not test else 1000 16 | self.ques_limit = config['ques_limit'] if not test else 50 17 | self.char_limit = config['char_limit'] 18 | self.ans_limit = config['ans_limit'] 19 | self.filters = config['filters'] 20 | self.char_filters = config['char_filters'] 21 | self.batch_size = config['batch_size'] 22 | self.l2_norm = config['l2_norm'] 23 | self.decay = config['decay'] 24 | self.learning_rate = config['learning_rate'] 25 | self.grad_clip = config['grad_clip'] 26 | self.init_lambda = config['init_lambda'] 27 | self.gamma_b = config['gamma_b'] 28 | self.gamma_c = config['gamma_c'] 29 | self.use_elmo = config['use_elmo'] 30 | self.use_cove = config['use_cove'] 31 | self.use_feat = config['use_feat'] 32 | self.use_rlloss = config['use_rlloss'] 33 | self.dropout = tf.placeholder_with_default(0.0, (), name="dropout") 34 | self.dropout_rnn = tf.placeholder_with_default(0.0, (), name="dropout_rnn") 35 | self.dropout_emb = tf.placeholder_with_default(0.0, (), name="dropout_emb") 36 | self.dropout_att = tf.placeholder_with_default(0.0, (), name="dropout_att") 37 | self.un_size = tf.placeholder_with_default(self.batch_size, (), name="un_size") 38 | self.rlw = tf.placeholder_with_default(0.0, (), name="rlloss_weights") 39 | 40 | # embedding layer 41 | self.word_mat = tf.get_variable("word_mat", initializer=tf.constant(word_mat, 
dtype=tf.float32), trainable=False) 42 | with tf.variable_scope("Input_Embedding_Mat"): 43 | self.char_mat = tf.get_variable("char_mat", initializer=np.concatenate([char_mat_trainable, char_mat_fix], axis=0), trainable=True) 44 | 45 | # input tensor 46 | self.contw_input = tf.placeholder(tf.int32, [None, None], "context_word") 47 | self.quesw_input = tf.placeholder(tf.int32, [None, None], "question_word") 48 | self.contc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "context_char") 49 | self.quesc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "question_char") 50 | self.y_start = tf.placeholder(tf.int32, [None, None], "answer_start_index") 51 | self.y_end = tf.placeholder(tf.int32, [None, None], "answer_end_index") 52 | self.contw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'contw_elmo_id') 53 | self.quesw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'quesw_elmo_id') 54 | if self.use_feat: 55 | self.cont_feat = tf.placeholder(tf.float32, [None, None, 73], "cont_feat") 56 | self.ques_feat = tf.placeholder(tf.float32, [None, None, 73], "ques_feat") 57 | self.old_char_mat = tf.placeholder(tf.float32, [None, None], "old_char_mat") 58 | self.assign_char_mat = tf.assign(self.char_mat, self.old_char_mat) 59 | 60 | # get mask & length for words & chars 61 | self.c_mask = tf.cast(self.contw_input, tf.bool) 62 | self.q_mask = tf.cast(self.quesw_input, tf.bool) 63 | self.cont_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1) 64 | self.ques_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 65 | 66 | # slice for maxlen in each batch 67 | self.c_maxlen = tf.reduce_max(self.cont_len) 68 | self.q_maxlen = tf.reduce_max(self.ques_len) 69 | 70 | # elmo features 71 | if self.use_elmo == 2: 72 | options_file = config['elmo_options_path'] 73 | weight_file = config['elmo_weights_path'] 74 | bilm = BidirectionalLanguageModel(options_file, weight_file) 75 | self.elmo_cont = all_layers(bilm(self.contw_elmo_id)) # [bs, 3, len, 1024] 76 | self.elmo_cont = tf.transpose(self.elmo_cont, [0, 2, 1, 3]) # [bs, len, 3, 1024] 77 | self.elmo_ques = all_layers(bilm(self.quesw_elmo_id)) 78 | self.elmo_ques = tf.transpose(self.elmo_ques, [0, 2, 1, 3]) 79 | elif self.use_elmo == 1: 80 | self.elmo_cont = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_cont') 81 | self.elmo_ques = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_ques') 82 | 83 | if self.use_cove == 2: 84 | with tf.variable_scope('Cove_Layer'): 85 | self.cove_model = load_model(config['cove_path']) 86 | elif self.use_cove == 1: 87 | self.cove_cont = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_cont') 88 | self.cove_ques = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_ques') 89 | 90 | # lr schedule 91 | self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32, 92 | initializer=tf.constant_initializer(0), trainable=False) 93 | 94 | self.learning_rate = tf.placeholder_with_default(config['learning_rate'], (), name="learning_rate") 95 | self.lr = self.learning_rate 96 | # self.lr = tf.minimum(self.learning_rate, 97 | # self.learning_rate / tf.log(999.) 
* tf.log(tf.cast(self.global_step, tf.float32) + 1)) 98 | 99 | # initial model & complie 100 | self.build_model() 101 | total_params() 102 | self.complie() 103 | 104 | def build_model(self): 105 | with tf.variable_scope("Input_Embedding_Layer"): 106 | with tf.variable_scope("Char_Embedding_Layer"): 107 | # char embedding 108 | ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.contc_input), [-1, self.char_limit, self.char_dim]) 109 | qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.quesc_input), [-1, self.char_limit, self.char_dim]) 110 | ch_emb = tf.nn.dropout(ch_emb, 1 - self.dropout_emb) 111 | qh_emb = tf.nn.dropout(qh_emb, 1 - self.dropout_emb) 112 | 113 | ch_emb, qh_emb = BiLSTM([ch_emb, qh_emb], self.char_filters // 2, dropout=self.dropout_rnn, name='char_lstm') 114 | ch_emb = tf.reduce_max(ch_emb, axis=1) 115 | qh_emb = tf.reduce_max(qh_emb, axis=1) 116 | ch_emb = tf.reshape(ch_emb, [-1, self.c_maxlen, self.char_filters]) 117 | qh_emb = tf.reshape(qh_emb, [-1, self.q_maxlen, self.char_filters]) 118 | 119 | with tf.variable_scope("Word_Embedding_Layer"): 120 | # word embedding 121 | c_emb = tf.nn.embedding_lookup(self.word_mat, self.contw_input) 122 | q_emb = tf.nn.embedding_lookup(self.word_mat, self.quesw_input) 123 | c_emb = tf.nn.dropout(c_emb, 1.0 - self.dropout_emb) 124 | q_emb = tf.nn.dropout(q_emb, 1.0 - self.dropout_emb) 125 | 126 | # cove features 127 | if self.use_cove != 0: 128 | if self.use_cove == 2: 129 | self.cove_cont = tf.stop_gradient(self.cove_model(c_emb)) # [bs, c_len, 2, 600] 130 | self.cove_ques = tf.stop_gradient(self.cove_model(q_emb)) # [bs, q_len, 2, 600] 131 | with tf.variable_scope('Cove_weights', reuse=tf.AUTO_REUSE): 132 | cove_context_input = CoveCombineLayer(self.cove_cont, 'input') 133 | cove_question_input = CoveCombineLayer(self.cove_ques, 'input') 134 | c_emb = tf.concat([c_emb, cove_context_input], axis=-1) 135 | q_emb = tf.concat([q_emb, cove_question_input], axis=-1) 136 | 137 | # elmo features 138 | if self.use_elmo != 0: 139 | with tf.variable_scope('ELMo_weights', reuse=tf.AUTO_REUSE): 140 | elmo_context_input = ElmoCombineLayer(self.elmo_cont, 'input') 141 | elmo_question_input = ElmoCombineLayer(self.elmo_ques, 'input') 142 | elmo_context_output = ElmoCombineLayer(self.elmo_cont, 'output') 143 | elmo_question_output = ElmoCombineLayer(self.elmo_ques, 'output') 144 | c_emb = tf.concat([c_emb, elmo_context_input], axis=-1) 145 | q_emb = tf.concat([q_emb, elmo_question_input], axis=-1) 146 | 147 | if self.use_feat: 148 | c_emb = tf.concat([c_emb, self.cont_feat], axis=-1) 149 | q_emb = tf.concat([q_emb, self.ques_feat], axis=-1) 150 | 151 | # combine embedding feats 152 | c_emb = tf.concat([c_emb, ch_emb], axis=-1) 153 | q_emb = tf.concat([q_emb, qh_emb], axis=-1) 154 | 155 | # BiLSTM Embedding 156 | with tf.variable_scope("BiLSTM_Embedding_Layer"): 157 | c_emb, q_emb = BiLSTM([c_emb, q_emb], self.filters // 2, dropout=self.dropout_rnn, name='encoder') 158 | 159 | with tf.variable_scope("Iterative_Reattention_Aligner"): 160 | self.Lambda = tf.get_variable('Lambda', dtype=tf.float32, initializer=self.init_lambda) 161 | with tf.variable_scope("Aligning_Block1"): 162 | R, Z1, E, B = align_block(u=c_emb, 163 | v=q_emb, 164 | c_mask=self.c_mask, 165 | q_mask=self.q_mask, 166 | Lambda=self.Lambda, 167 | filters=self.filters, 168 | dropout=self.dropout_rnn) 169 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 170 | with tf.variable_scope("Aligning_Block2"): 171 | R, Z2, E, B = align_block(u=R, 172 | v=q_emb, 173 | 
c_mask=self.c_mask, 174 | q_mask=self.q_mask, 175 | E_0=E, 176 | B_0=B, 177 | Lambda=self.Lambda, 178 | filters=self.filters, 179 | dropout=self.dropout_rnn) 180 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 181 | with tf.variable_scope("Aligning_Block3"): 182 | R, Z3, E, B = align_block(u=R, 183 | v=q_emb, 184 | c_mask=self.c_mask, 185 | q_mask=self.q_mask, 186 | E_0=E, 187 | B_0=B, 188 | Z_0=[Z1, Z2], 189 | Lambda=self.Lambda, 190 | filters=self.filters, 191 | dropout=self.dropout_rnn) 192 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 193 | 194 | with tf.variable_scope("Answer_Pointer"): 195 | # logits 196 | if self.use_elmo != 0: 197 | elmo_output_feats = ElmoAttention([elmo_context_output, elmo_question_output], 198 | self.c_maxlen, self.q_maxlen, self.q_mask, self.dropout) 199 | R = tf.concat([R, elmo_output_feats], axis=-1) 200 | s = summary_vector(q_emb, self.c_maxlen, mask=self.q_mask) 201 | s = tf.nn.dropout(s, 1 - self.dropout) 202 | logits1 = start_logits(R, s, mask=self.c_mask, filters=self.filters, name='Start_Pointer') # [bs, c_len] 203 | logits2 = end_logits(R, logits1, s, mask=self.c_mask, filters=self.filters, name='End_Pointer') # [bs, c_len] 204 | 205 | with tf.variable_scope("Loss_Layer"): 206 | # maximum-likelihood (ML) loss for dataset V2.0 207 | start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1, labels=self.y_start) 208 | end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2, labels=self.y_end) 209 | self.loss = tf.reduce_mean(start_loss + end_loss) 210 | 211 | # l2 loss 212 | if self.l2_norm is not None: 213 | decay_costs = [] 214 | for var in tf.trainable_variables(): 215 | decay_costs.append(tf.nn.l2_loss(var)) 216 | self.loss += tf.multiply(self.l2_norm, tf.add_n(decay_costs)) 217 | 218 | # RL loss 219 | if self.use_rlloss: 220 | with tf.variable_scope("Reinforcement_Loss"): 221 | self.rl_loss, _, _ = rl_loss(logits1, logits2, self.y_start, self.y_end, self.c_maxlen) 222 | self.loss += (self.rlw * self.rl_loss) 223 | 224 | with tf.variable_scope('Output_Layer'): 225 | softmax_start_scores = tf.nn.softmax(logits1) 226 | softmax_end_scores = tf.nn.softmax(logits2) 227 | 228 | outer = tf.matmul(tf.expand_dims(softmax_start_scores, axis=2), 229 | tf.expand_dims(softmax_end_scores, axis=1)) 230 | outer = tf.matrix_band_part(outer, 0, self.ans_limit) 231 | 232 | def position_encoding(x): 233 | import math 234 | for i in range(x.shape[0]): 235 | for j in range(x.shape[1]): 236 | if j - i > 5: 237 | x[i][j] = float(1.0 / math.log(j - i + 1)) 238 | return x 239 | 240 | mask_mat = tf.ones((self.c_maxlen, self.c_maxlen)) 241 | mask_mat = tf.expand_dims(tf.py_func(position_encoding, [mask_mat], tf.float32), axis=0) 242 | mask_mat = tf.tile(mask_mat, [self.un_size, 1, 1]) 243 | 244 | outer_masked = outer * mask_mat 245 | self.mask_output1 = tf.argmax(tf.reduce_max(outer_masked, axis=2), axis=1) 246 | self.mask_output2 = tf.argmax(tf.reduce_max(outer_masked, axis=1), axis=1) 247 | 248 | def complie(self): 249 | # self.opt = AdaMaxOptimizer(learning_rate=self.lr, beta1=0.9, beta2=0.999, epsilon=1e-7) 250 | self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7) 251 | grads = self.opt.compute_gradients(self.loss) 252 | gradients, variables = zip(*grads) 253 | capped_grads, _ = tf.clip_by_global_norm(gradients, self.grad_clip) 254 | self.train_op = self.opt.apply_gradients(zip(capped_grads, variables), global_step=self.global_step) 255 | 256 | # EMA 257 | with tf.variable_scope("EMA_Weights"): 258 | if 
self.decay is not None and self.decay < 1.: 259 | self.var_ema = tf.train.ExponentialMovingAverage(self.decay) 260 | with tf.control_dependencies([self.train_op]): 261 | self.ema_train_op = self.var_ema.apply( 262 | list(set(tf.trainable_variables()) ^ set(tf.trainable_variables('Cove_Layer')))) 263 | # assign ema weights 264 | self.assign_vars = [] 265 | for var in tf.global_variables(): 266 | v = self.var_ema.average(var) 267 | if v is not None: 268 | self.assign_vars.append(tf.assign(var, v)) 269 | 270 | 271 | # import numpy as np 272 | # 273 | # config = { 274 | # 'char_dim': 64, 275 | # 'cont_limit': 400, 276 | # 'ques_limit': 50, 277 | # 'char_limit': 16, 278 | # 'ans_limit': -1, 279 | # 'filters': 256, 280 | # 'dropout': 0.1, 281 | # 'dropout_emb': 0.1, 282 | # 'l2_norm': 3e-7, 283 | # 'decay': 0.9999, 284 | # 'gamma_c': 1.0, 285 | # 'gamma_b': 0.3, 286 | # 'learning_rate': 1e-3, 287 | # 'grad_clip': 5.0, 288 | # 'init_lambda': 3.0, 289 | # 'loss_type': 'use_plausible', 290 | # 'use_elmo': 0, 291 | # 'use_cove': 0, 292 | # 'use_feat': True, 293 | # 'optimizer': 'adam', 294 | # 'use_rlloss': False, 295 | # 'cove_path': 'Keras_CoVe_2layers.h5', 296 | # 'elmo_weights_path': 'elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 297 | # 'elmo_options_path': 'elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 298 | # 'train_tfrecords': 'tfrecords/train_pre_elmo_cove.tfrecords', 299 | # 'dev_tfrecords': 'tfrecords/dev_pre_elmo_cove.tfrecords', 300 | # 'batch_size': 24, 301 | # 'epoch': 40, 302 | # 'origin_path': None, # not finetune 303 | # 'path': 'QANetV253' 304 | # } 305 | # word_mat = np.random.random((90950, 300)).astype(np.float32) 306 | # char_mat2 = np.random.random((94, 300)).astype(np.float32) 307 | # char_mat = np.random.random((1171, 300)).astype(np.float32) 308 | # model = Model(config, word_mat, char_mat, char_mat2) 309 | -------------------------------------------------------------------------------- /bilm/data.py: -------------------------------------------------------------------------------- 1 | # originally based on https://github.com/tensorflow/models/tree/master/lm_1b 2 | import glob 3 | import random 4 | 5 | import numpy as np 6 | 7 | from typing import List 8 | 9 | 10 | class Vocabulary(object): 11 | ''' 12 | A token vocabulary. Holds a map from token to ids and provides 13 | a method for encoding text to a sequence of ids. 14 | ''' 15 | def __init__(self, filename, validate_file=False): 16 | ''' 17 | filename = the vocabulary file. It is a flat text file with one 18 | (normalized) token per line. In addition, the file should also 19 | contain the special tokens , , (case sensitive). 
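In the upstream bilm-tf vocabulary format these special tokens are the
literal strings <S>, </S> and <UNK>, each on its own line.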
20 | ''' 21 | self._id_to_word = [] 22 | self._word_to_id = {} 23 | self._unk = -1 24 | self._bos = -1 25 | self._eos = -1 26 | 27 | with open(filename) as f: 28 | idx = 0 29 | for line in f: 30 | word_name = line.strip() 31 | if word_name == '': 32 | self._bos = idx 33 | elif word_name == '': 34 | self._eos = idx 35 | elif word_name == '': 36 | self._unk = idx 37 | if word_name == '!!!MAXTERMID': 38 | continue 39 | 40 | self._id_to_word.append(word_name) 41 | self._word_to_id[word_name] = idx 42 | idx += 1 43 | 44 | # check to ensure file has special tokens 45 | if validate_file: 46 | if self._bos == -1 or self._eos == -1 or self._unk == -1: 47 | raise ValueError("Ensure the vocabulary file has " 48 | ", , tokens") 49 | 50 | @property 51 | def bos(self): 52 | return self._bos 53 | 54 | @property 55 | def eos(self): 56 | return self._eos 57 | 58 | @property 59 | def unk(self): 60 | return self._unk 61 | 62 | @property 63 | def size(self): 64 | return len(self._id_to_word) 65 | 66 | def word_to_id(self, word): 67 | if word in self._word_to_id: 68 | return self._word_to_id[word] 69 | return self.unk 70 | 71 | def id_to_word(self, cur_id): 72 | return self._id_to_word[cur_id] 73 | 74 | def decode(self, cur_ids): 75 | """Convert a list of ids to a sentence, with space inserted.""" 76 | return ' '.join([self.id_to_word(cur_id) for cur_id in cur_ids]) 77 | 78 | def encode(self, sentence, reverse=False, split=True): 79 | """Convert a sentence to a list of ids, with special tokens added. 80 | Sentence is a single string with tokens separated by whitespace. 81 | 82 | If reverse, then the sentence is assumed to be reversed, and 83 | this method will swap the BOS/EOS tokens appropriately.""" 84 | 85 | if split: 86 | word_ids = [ 87 | self.word_to_id(cur_word) for cur_word in sentence.split() 88 | ] 89 | else: 90 | word_ids = [self.word_to_id(cur_word) for cur_word in sentence] 91 | 92 | if reverse: 93 | return np.array([self.eos] + word_ids + [self.bos], dtype=np.int32) 94 | else: 95 | return np.array([self.bos] + word_ids + [self.eos], dtype=np.int32) 96 | 97 | 98 | class UnicodeCharsVocabulary(Vocabulary): 99 | """Vocabulary containing character-level and word level information. 100 | 101 | Has a word vocabulary that is used to lookup word ids and 102 | a character id that is used to map words to arrays of character ids. 103 | 104 | The character ids are defined by ord(c) for c in word.encode('utf-8') 105 | This limits the total number of possible char ids to 256. 106 | To this we add 5 additional special ids: begin sentence, end sentence, 107 | begin word, end word and padding. 108 | 109 | WARNING: for prediction, we add +1 to the output ids from this 110 | class to create a special padding id (=0). As a result, we suggest 111 | you use the `Batcher`, `TokenBatcher`, and `LMDataset` classes instead 112 | of this lower level class. If you are using this lower level class, 113 | then be sure to add the +1 appropriately, otherwise embeddings computed 114 | from the pre-trained model will be useless. 
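For example, with max_word_length=5 the word 'hi' is mapped by
_convert_word_to_char_ids below to [258, 104, 105, 259, 260]
(begin-of-word, byte 'h', byte 'i', end-of-word, padding); the Batcher then
adds 1 to every id so that 0 is free to act as the mask value.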
115 | """ 116 | def __init__(self, filename, max_word_length, **kwargs): 117 | super(UnicodeCharsVocabulary, self).__init__(filename, **kwargs) 118 | self._max_word_length = max_word_length 119 | 120 | # char ids 0-255 come from utf-8 encoding bytes 121 | # assign 256-300 to special chars 122 | self.bos_char = 256 # 123 | self.eos_char = 257 # 124 | self.bow_char = 258 # 125 | self.eow_char = 259 # 126 | self.pad_char = 260 # 127 | 128 | num_words = len(self._id_to_word) 129 | 130 | self._word_char_ids = np.zeros([num_words, max_word_length], 131 | dtype=np.int32) 132 | 133 | # the charcter representation of the begin/end of sentence characters 134 | def _make_bos_eos(c): 135 | r = np.zeros([self.max_word_length], dtype=np.int32) 136 | r[:] = self.pad_char 137 | r[0] = self.bow_char 138 | r[1] = c 139 | r[2] = self.eow_char 140 | return r 141 | self.bos_chars = _make_bos_eos(self.bos_char) 142 | self.eos_chars = _make_bos_eos(self.eos_char) 143 | 144 | for i, word in enumerate(self._id_to_word): 145 | self._word_char_ids[i] = self._convert_word_to_char_ids(word) 146 | 147 | self._word_char_ids[self.bos] = self.bos_chars 148 | self._word_char_ids[self.eos] = self.eos_chars 149 | # TODO: properly handle 150 | 151 | @property 152 | def word_char_ids(self): 153 | return self._word_char_ids 154 | 155 | @property 156 | def max_word_length(self): 157 | return self._max_word_length 158 | 159 | def _convert_word_to_char_ids(self, word): 160 | code = np.zeros([self.max_word_length], dtype=np.int32) 161 | code[:] = self.pad_char 162 | 163 | word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length-2)] 164 | code[0] = self.bow_char 165 | k=0 166 | for k, chr_id in enumerate(word_encoded, start=1): 167 | code[k] = chr_id 168 | code[k + 1] = self.eow_char 169 | 170 | return code 171 | 172 | def word_to_char_ids(self, word): 173 | if word in self._word_to_id: 174 | return self._word_char_ids[self._word_to_id[word]] 175 | else: 176 | return self._convert_word_to_char_ids(word) 177 | 178 | def encode_chars(self, sentence, reverse=False, split=True): 179 | ''' 180 | Encode the sentence as a white space delimited string of tokens. 181 | ''' 182 | if split: 183 | chars_ids = [self.word_to_char_ids(cur_word) 184 | for cur_word in sentence.split()] 185 | else: 186 | chars_ids = [self.word_to_char_ids(cur_word) 187 | for cur_word in sentence] 188 | if reverse: 189 | return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars]) 190 | else: 191 | return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars]) 192 | 193 | 194 | class Batcher(object): 195 | ''' 196 | Batch sentences of tokenized text into character id matrices. 197 | ''' 198 | def __init__(self, lm_vocab_file: str, max_token_length: int): 199 | ''' 200 | lm_vocab_file = the language model vocabulary file (one line per 201 | token) 202 | max_token_length = the maximum number of characters in each token 203 | ''' 204 | self._lm_vocab = UnicodeCharsVocabulary( 205 | lm_vocab_file, max_token_length 206 | ) 207 | self._max_token_length = max_token_length 208 | 209 | def batch_sentences(self, sentences: List[List[str]]): 210 | ''' 211 | Batch the sentences as character ids 212 | Each sentence is a list of tokens without or , e.g. 
213 | [['The', 'first', 'sentence', '.'], ['Second', '.']] 214 | ''' 215 | n_sentences = len(sentences) 216 | max_length = max(len(sentence) for sentence in sentences) + 2 217 | 218 | X_char_ids = np.zeros( 219 | (n_sentences, max_length, self._max_token_length), 220 | dtype=np.int64 221 | ) 222 | 223 | for k, sent in enumerate(sentences): 224 | length = len(sent) + 2 225 | char_ids_without_mask = self._lm_vocab.encode_chars( 226 | sent, split=False) 227 | # add one so that 0 is the mask value 228 | X_char_ids[k, :length, :] = char_ids_without_mask + 1 229 | 230 | return X_char_ids 231 | 232 | 233 | class TokenBatcher(object): 234 | ''' 235 | Batch sentences of tokenized text into token id matrices. 236 | ''' 237 | def __init__(self, lm_vocab_file: str): 238 | ''' 239 | lm_vocab_file = the language model vocabulary file (one line per 240 | token) 241 | ''' 242 | self._lm_vocab = Vocabulary(lm_vocab_file) 243 | 244 | def batch_sentences(self, sentences: List[List[str]]): 245 | ''' 246 | Batch the sentences as character ids 247 | Each sentence is a list of tokens without or , e.g. 248 | [['The', 'first', 'sentence', '.'], ['Second', '.']] 249 | ''' 250 | n_sentences = len(sentences) 251 | max_length = max(len(sentence) for sentence in sentences) + 2 252 | 253 | X_ids = np.zeros((n_sentences, max_length), dtype=np.int64) 254 | 255 | for k, sent in enumerate(sentences): 256 | length = len(sent) + 2 257 | ids_without_mask = self._lm_vocab.encode(sent, split=False) 258 | # add one so that 0 is the mask value 259 | X_ids[k, :length] = ids_without_mask + 1 260 | 261 | return X_ids 262 | 263 | 264 | ##### for training 265 | def _get_batch(generator, batch_size, num_steps, max_word_length): 266 | """Read batches of input.""" 267 | cur_stream = [None] * batch_size 268 | 269 | no_more_data = False 270 | while True: 271 | inputs = np.zeros([batch_size, num_steps], np.int32) 272 | if max_word_length is not None: 273 | char_inputs = np.zeros([batch_size, num_steps, max_word_length], 274 | np.int32) 275 | else: 276 | char_inputs = None 277 | targets = np.zeros([batch_size, num_steps], np.int32) 278 | 279 | for i in range(batch_size): 280 | cur_pos = 0 281 | 282 | while cur_pos < num_steps: 283 | if cur_stream[i] is None or len(cur_stream[i][0]) <= 1: 284 | try: 285 | cur_stream[i] = list(next(generator)) 286 | except StopIteration: 287 | # No more data, exhaust current streams and quit 288 | no_more_data = True 289 | break 290 | 291 | how_many = min(len(cur_stream[i][0]) - 1, num_steps - cur_pos) 292 | next_pos = cur_pos + how_many 293 | 294 | inputs[i, cur_pos:next_pos] = cur_stream[i][0][:how_many] 295 | if max_word_length is not None: 296 | char_inputs[i, cur_pos:next_pos] = cur_stream[i][1][ 297 | :how_many] 298 | targets[i, cur_pos:next_pos] = cur_stream[i][0][1:how_many+1] 299 | 300 | cur_pos = next_pos 301 | 302 | cur_stream[i][0] = cur_stream[i][0][how_many:] 303 | if max_word_length is not None: 304 | cur_stream[i][1] = cur_stream[i][1][how_many:] 305 | 306 | if no_more_data: 307 | # There is no more data. Note: this will not return data 308 | # for the incomplete batch 309 | break 310 | 311 | X = {'token_ids': inputs, 'tokens_characters': char_inputs, 312 | 'next_token_id': targets} 313 | 314 | yield X 315 | 316 | class LMDataset(object): 317 | """ 318 | Hold a language model dataset. 319 | 320 | A dataset is a list of tokenized files. Each file contains one sentence 321 | per line. Each sentence is pre-tokenized and white space joined. 
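For example, one line of a shard file could read:
    The quick brown fox jumped over the lazy dog .
with tokens separated by single spaces, one sentence per line.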
322 | """ 323 | def __init__(self, filepattern, vocab, reverse=False, test=False, 324 | shuffle_on_load=False): 325 | ''' 326 | filepattern = a glob string that specifies the list of files. 327 | vocab = an instance of Vocabulary or UnicodeCharsVocabulary 328 | reverse = if True, then iterate over tokens in each sentence in reverse 329 | test = if True, then iterate through all data once then stop. 330 | Otherwise, iterate forever. 331 | shuffle_on_load = if True, then shuffle the sentences after loading. 332 | ''' 333 | self._vocab = vocab 334 | self._all_shards = glob.glob(filepattern) 335 | print('Found %d shards at %s' % (len(self._all_shards), filepattern)) 336 | self._shards_to_choose = [] 337 | 338 | self._reverse = reverse 339 | self._test = test 340 | self._shuffle_on_load = shuffle_on_load 341 | self._use_char_inputs = hasattr(vocab, 'encode_chars') 342 | 343 | self._ids = self._load_random_shard() 344 | 345 | def _choose_random_shard(self): 346 | if len(self._shards_to_choose) == 0: 347 | self._shards_to_choose = list(self._all_shards) 348 | random.shuffle(self._shards_to_choose) 349 | shard_name = self._shards_to_choose.pop() 350 | return shard_name 351 | 352 | def _load_random_shard(self): 353 | """Randomly select a file and read it.""" 354 | if self._test: 355 | if len(self._all_shards) == 0: 356 | # we've loaded all the data 357 | # this will propogate up to the generator in get_batch 358 | # and stop iterating 359 | raise StopIteration 360 | else: 361 | shard_name = self._all_shards.pop() 362 | else: 363 | # just pick a random shard 364 | shard_name = self._choose_random_shard() 365 | 366 | ids = self._load_shard(shard_name) 367 | self._i = 0 368 | self._nids = len(ids) 369 | return ids 370 | 371 | def _load_shard(self, shard_name): 372 | """Read one file and convert to ids. 373 | 374 | Args: 375 | shard_name: file path. 376 | 377 | Returns: 378 | list of (id, char_id) tuples. 379 | """ 380 | print('Loading data from: %s' % shard_name) 381 | with open(shard_name) as f: 382 | sentences_raw = f.readlines() 383 | 384 | if self._reverse: 385 | sentences = [] 386 | for sentence in sentences_raw: 387 | splitted = sentence.split() 388 | splitted.reverse() 389 | sentences.append(' '.join(splitted)) 390 | else: 391 | sentences = sentences_raw 392 | 393 | if self._shuffle_on_load: 394 | random.shuffle(sentences) 395 | 396 | ids = [self.vocab.encode(sentence, self._reverse) 397 | for sentence in sentences] 398 | if self._use_char_inputs: 399 | chars_ids = [self.vocab.encode_chars(sentence, self._reverse) 400 | for sentence in sentences] 401 | else: 402 | chars_ids = [None] * len(ids) 403 | 404 | print('Loaded %d sentences.' 
% len(ids)) 405 | print('Finished loading') 406 | return list(zip(ids, chars_ids)) 407 | 408 | def get_sentence(self): 409 | while True: 410 | if self._i == self._nids: 411 | self._ids = self._load_random_shard() 412 | ret = self._ids[self._i] 413 | self._i += 1 414 | yield ret 415 | 416 | @property 417 | def max_word_length(self): 418 | if self._use_char_inputs: 419 | return self._vocab.max_word_length 420 | else: 421 | return None 422 | 423 | def iter_batches(self, batch_size, num_steps): 424 | for X in _get_batch(self.get_sentence(), batch_size, num_steps, 425 | self.max_word_length): 426 | 427 | # token_ids = (batch_size, num_steps) 428 | # char_inputs = (batch_size, num_steps, 50) of character ids 429 | # targets = word ID of next word (batch_size, num_steps) 430 | yield X 431 | 432 | @property 433 | def vocab(self): 434 | return self._vocab 435 | 436 | class BidirectionalLMDataset(object): 437 | def __init__(self, filepattern, vocab, test=False, shuffle_on_load=False): 438 | ''' 439 | bidirectional version of LMDataset 440 | ''' 441 | self._data_forward = LMDataset( 442 | filepattern, vocab, reverse=False, test=test, 443 | shuffle_on_load=shuffle_on_load) 444 | self._data_reverse = LMDataset( 445 | filepattern, vocab, reverse=True, test=test, 446 | shuffle_on_load=shuffle_on_load) 447 | 448 | def iter_batches(self, batch_size, num_steps): 449 | max_word_length = self._data_forward.max_word_length 450 | 451 | for X, Xr in zip( 452 | _get_batch(self._data_forward.get_sentence(), batch_size, 453 | num_steps, max_word_length), 454 | _get_batch(self._data_reverse.get_sentence(), batch_size, 455 | num_steps, max_word_length) 456 | ): 457 | 458 | for k, v in Xr.items(): 459 | X[k + '_reverse'] = v 460 | 461 | yield X 462 | 463 | 464 | class InvalidNumberOfCharacters(Exception): 465 | pass 466 | 467 | -------------------------------------------------------------------------------- /RMR_modelV6_squad2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from layers import total_params, align_block, summary_vector, start_logits, end_logits, BiLSTM, ElmoAttention, \ 3 | ElmoCombineLayer, CoveCombineLayer 4 | from bilm import BidirectionalLanguageModel, all_layers 5 | from keras.models import load_model 6 | from loss import rl_loss 7 | import numpy as np 8 | 9 | 10 | class Model(object): 11 | def __init__(self, config, word_mat=None, char_mat_trainable=None, char_mat_fix=None, test=False): 12 | 13 | # hyper-parameter 14 | self.char_dim = config['char_dim'] 15 | self.cont_limit = config['cont_limit'] if not test else 1000 16 | self.ques_limit = config['ques_limit'] if not test else 50 17 | self.char_limit = config['char_limit'] 18 | self.ans_limit = config['ans_limit'] 19 | self.filters = config['filters'] 20 | self.batch_size = config['batch_size'] 21 | self.l2_norm = config['l2_norm'] 22 | self.decay = config['decay'] 23 | self.learning_rate = config['learning_rate'] 24 | self.grad_clip = config['grad_clip'] 25 | self.init_lambda = config['init_lambda'] 26 | self.gamma_b = config['gamma_b'] 27 | self.gamma_c = config['gamma_c'] 28 | self.use_elmo = config['use_elmo'] 29 | self.use_cove = config['use_cove'] 30 | self.use_feat = config['use_feat'] 31 | self.use_rlloss = config['use_rlloss'] 32 | self.dropout = tf.placeholder_with_default(0.0, (), name="dropout") 33 | self.dropout_rnn = tf.placeholder_with_default(0.0, (), name="dropout_rnn") 34 | self.dropout_emb = tf.placeholder_with_default(0.0, (), name="dropout_emb") 35 | 
self.dropout_att = tf.placeholder_with_default(0.0, (), name="dropout_att") 36 | self.un_size = tf.placeholder_with_default(self.batch_size, (), name="un_size") 37 | self.rlw = tf.placeholder_with_default(0.0, (), name="rlloss_weights") 38 | 39 | # embedding layer 40 | self.word_mat = tf.get_variable("word_mat", initializer=tf.constant(word_mat, dtype=tf.float32), 41 | trainable=False) 42 | with tf.variable_scope("Input_Embedding_Mat"): 43 | self.char_mat = tf.get_variable("char_mat", 44 | initializer=np.concatenate([char_mat_trainable, char_mat_fix], axis=0), 45 | trainable=True) 46 | 47 | # input tensor 48 | self.contw_input = tf.placeholder(tf.int32, [None, None], "context_word") 49 | self.quesw_input = tf.placeholder(tf.int32, [None, None], "question_word") 50 | self.contc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "context_char") 51 | self.quesc_input = tf.placeholder(tf.int32, [None, None, self.char_limit], "question_char") 52 | self.y_start = tf.placeholder(tf.int32, [None, None], "answer_start_index") 53 | self.y_end = tf.placeholder(tf.int32, [None, None], "answer_end_index") 54 | self.yp_start = tf.placeholder(tf.int32, [None, None], "plausible_answer_start_index") 55 | self.yp_end = tf.placeholder(tf.int32, [None, None], "plausible_answer_end_index") 56 | self.contw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'contw_elmo_id') 57 | self.quesw_elmo_id = tf.placeholder(tf.int32, [None, None, 50], 'quesw_elmo_id') 58 | if self.use_feat: 59 | self.cont_feat = tf.placeholder(tf.float32, [None, None, 73], "cont_feat") 60 | self.ques_feat = tf.placeholder(tf.float32, [None, None, 73], "ques_feat") 61 | self.old_char_mat = tf.placeholder(tf.float32, [None, None], "old_char_mat") 62 | self.assign_char_mat = tf.assign(self.char_mat, self.old_char_mat) 63 | 64 | # get mask & length for words & chars 65 | self.c_mask = tf.cast(self.contw_input, tf.bool) 66 | self.q_mask = tf.cast(self.quesw_input, tf.bool) 67 | self.cont_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1) 68 | self.ques_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 69 | 70 | # slice for maxlen in each batch 71 | self.c_maxlen = tf.reduce_max(self.cont_len) 72 | self.q_maxlen = tf.reduce_max(self.ques_len) 73 | 74 | # elmo features 75 | if self.use_elmo == 2: 76 | options_file = config['elmo_options_path'] 77 | weight_file = config['elmo_weights_path'] 78 | bilm = BidirectionalLanguageModel(options_file, weight_file) 79 | self.elmo_cont = all_layers(bilm(self.contw_elmo_id)) # [bs, 3, len, 1024] 80 | self.elmo_cont = tf.transpose(self.elmo_cont, [0, 2, 1, 3]) # [bs, len, 3, 1024] 81 | self.elmo_ques = all_layers(bilm(self.quesw_elmo_id)) 82 | self.elmo_ques = tf.transpose(self.elmo_ques, [0, 2, 1, 3]) 83 | elif self.use_elmo == 1: 84 | self.elmo_cont = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_cont') 85 | self.elmo_ques = tf.placeholder(tf.float32, [None, None, 3, 1024], 'elmo_ques') 86 | 87 | if self.use_cove == 2: 88 | with tf.variable_scope('Cove_Layer'): 89 | self.cove_model = load_model(config['cove_path']) 90 | elif self.use_cove == 1: 91 | self.cove_cont = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_cont') 92 | self.cove_ques = tf.placeholder(tf.float32, [None, None, 2, 600], 'cove_ques') 93 | 94 | # lr schedule 95 | self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32, 96 | initializer=tf.constant_initializer(0), trainable=False) 97 | 98 | self.learning_rate = tf.placeholder_with_default(config['learning_rate'], (), 
name="learning_rate") 99 | self.lr = tf.minimum(self.learning_rate, 100 | self.learning_rate / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1)) 101 | 102 | # initial model & complie 103 | self.build_model() 104 | total_params() 105 | self.complie() 106 | 107 | def build_model(self): 108 | with tf.variable_scope("Input_Embedding_Layer"): 109 | with tf.variable_scope("Char_Embedding_Layer"): 110 | # char embedding 111 | ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.contc_input), 112 | [-1, self.char_limit, self.char_dim]) 113 | qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.quesc_input), 114 | [-1, self.char_limit, self.char_dim]) 115 | ch_emb = tf.nn.dropout(ch_emb, 1 - self.dropout_emb) 116 | qh_emb = tf.nn.dropout(qh_emb, 1 - self.dropout_emb) 117 | 118 | ch_emb, qh_emb = BiLSTM([ch_emb, qh_emb], self.char_dim // 2, dropout=self.dropout_rnn, 119 | name='char_lstm', return_state=True) 120 | ch_emb = tf.reshape(ch_emb, [-1, self.c_maxlen, self.char_dim]) 121 | qh_emb = tf.reshape(qh_emb, [-1, self.q_maxlen, self.char_dim]) 122 | 123 | with tf.variable_scope("Word_Embedding_Layer"): 124 | # word embedding 125 | c_emb = tf.nn.embedding_lookup(self.word_mat, self.contw_input) 126 | q_emb = tf.nn.embedding_lookup(self.word_mat, self.quesw_input) 127 | c_emb = tf.nn.dropout(c_emb, 1.0 - self.dropout_emb) 128 | q_emb = tf.nn.dropout(q_emb, 1.0 - self.dropout_emb) 129 | 130 | # cove features 131 | if self.use_cove != 0: 132 | if self.use_cove == 2: 133 | self.cove_cont = tf.stop_gradient(self.cove_model(c_emb)) # [bs, c_len, 2, 600] 134 | self.cove_ques = tf.stop_gradient(self.cove_model(q_emb)) # [bs, q_len, 2, 600] 135 | with tf.variable_scope('Cove_weights', reuse=tf.AUTO_REUSE): 136 | cove_context_input = CoveCombineLayer(self.cove_cont, 'input') 137 | cove_question_input = CoveCombineLayer(self.cove_ques, 'input') 138 | c_emb = tf.concat([c_emb, cove_context_input], axis=-1) 139 | q_emb = tf.concat([q_emb, cove_question_input], axis=-1) 140 | 141 | # elmo features 142 | if self.use_elmo != 0: 143 | with tf.variable_scope('ELMo_weights', reuse=tf.AUTO_REUSE): 144 | elmo_context_input = ElmoCombineLayer(self.elmo_cont, 'input') 145 | elmo_question_input = ElmoCombineLayer(self.elmo_ques, 'input') 146 | elmo_context_output = ElmoCombineLayer(self.elmo_cont, 'output') 147 | elmo_question_output = ElmoCombineLayer(self.elmo_ques, 'output') 148 | c_emb = tf.concat([c_emb, elmo_context_input], axis=-1) 149 | q_emb = tf.concat([q_emb, elmo_question_input], axis=-1) 150 | 151 | if self.use_feat: 152 | c_emb = tf.concat([c_emb, self.cont_feat], axis=-1) 153 | q_emb = tf.concat([q_emb, self.ques_feat], axis=-1) 154 | 155 | # combine embedding feats 156 | c_emb = tf.concat([c_emb, ch_emb], axis=-1) 157 | q_emb = tf.concat([q_emb, qh_emb], axis=-1) 158 | 159 | # BiLSTM Embedding 160 | with tf.variable_scope("BiLSTM_Embedding_Layer"): 161 | c_emb, q_emb = BiLSTM([c_emb, q_emb], self.filters // 2, dropout=self.dropout_rnn, name='encoder') 162 | 163 | with tf.variable_scope("Iterative_Reattention_Aligner"): 164 | self.Lambda = tf.get_variable('Lambda', dtype=tf.float32, initializer=self.init_lambda) 165 | with tf.variable_scope("Aligning_Block1"): 166 | R, Z1, E, B = align_block(u=c_emb, 167 | v=q_emb, 168 | c_mask=self.c_mask, 169 | q_mask=self.q_mask, 170 | Lambda=self.Lambda, 171 | filters=self.filters, 172 | dropout=self.dropout_rnn) 173 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 174 | with tf.variable_scope("Aligning_Block2"): 175 | R, Z2, E, B = 
align_block(u=R, 176 | v=q_emb, 177 | c_mask=self.c_mask, 178 | q_mask=self.q_mask, 179 | E_0=E, 180 | B_0=B, 181 | Lambda=self.Lambda, 182 | filters=self.filters, 183 | dropout=self.dropout_rnn) 184 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 185 | with tf.variable_scope("Aligning_Block3"): 186 | R, Z3, E, B = align_block(u=R, 187 | v=q_emb, 188 | c_mask=self.c_mask, 189 | q_mask=self.q_mask, 190 | E_0=E, 191 | B_0=B, 192 | Z_0=[Z1, Z2], 193 | Lambda=self.Lambda, 194 | filters=self.filters, 195 | dropout=self.dropout_rnn) 196 | R = tf.nn.dropout(R, 1.0 - self.dropout_att) 197 | 198 | with tf.variable_scope("Answer_Pointer"): 199 | # logits 200 | if self.use_elmo != 0: 201 | elmo_output_feats = ElmoAttention([elmo_context_output, elmo_question_output], 202 | self.c_maxlen, self.q_maxlen, self.q_mask, self.dropout) 203 | R = tf.concat([R, elmo_output_feats], axis=-1) 204 | s = summary_vector(q_emb, self.c_maxlen, mask=self.q_mask) 205 | s = tf.nn.dropout(s, 1 - self.dropout) 206 | logits1 = start_logits(R, s, mask=self.c_mask, filters=self.filters, name='Start_Pointer') # [bs, c_len] 207 | logits2 = end_logits(R, logits1, s, mask=self.c_mask, filters=self.filters, 208 | name='End_Pointer') # [bs, c_len] 209 | self.unanswer_bias = tf.get_variable("unanswer_bias", [1], initializer=tf.zeros_initializer()) 210 | self.unanswer_bias = tf.reshape(tf.tile(self.unanswer_bias, [self.un_size]), [-1, 1]) 211 | logits1 = tf.concat((self.unanswer_bias, logits1), axis=-1) 212 | logits2 = tf.concat((self.unanswer_bias, logits2), axis=-1) 213 | 214 | logits1p = start_logits(R, s, mask=self.c_mask, filters=self.filters, name='Start_Pointer2') # [bs, c_len] 215 | logits2p = end_logits(R, logits1p, s, mask=self.c_mask, filters=self.filters, 216 | name='End_Pointer2') # [bs, c_len] 217 | 218 | with tf.variable_scope("Loss_Layer"): 219 | # maximum-likelihood (ML) loss for dataset V2.0 220 | # loss a 221 | start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1, labels=self.y_start) 222 | end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2, labels=self.y_end) 223 | self.loss = tf.reduce_mean(start_loss + end_loss) 224 | 225 | # loss b 226 | pstart_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1p, labels=self.yp_start) 227 | pend_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2p, labels=self.yp_end) 228 | self.loss += self.gamma_b * tf.reduce_mean(pstart_loss + pend_loss) 229 | 230 | # loss c 231 | answer_exist_label = tf.cast(tf.slice(self.y_start, [0, 0], [-1, 1]), tf.float32) 232 | self.loss += self.gamma_c * tf.reduce_mean( 233 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self.unanswer_bias, labels=answer_exist_label)) 234 | 235 | # l2 loss 236 | if self.l2_norm is not None: 237 | decay_costs = [] 238 | for var in tf.trainable_variables(): 239 | decay_costs.append(tf.nn.l2_loss(var)) 240 | self.loss += tf.multiply(self.l2_norm, tf.add_n(decay_costs)) 241 | 242 | # RL loss 243 | if self.use_rlloss: 244 | with tf.variable_scope("Reinforcement_Loss"): 245 | self.rl_loss_a, _, _ = rl_loss(logits1, logits2, self.y_start, self.y_end, self.c_maxlen + 1) 246 | self.rl_loss_b, _, _ = rl_loss(logits1p, logits2p, self.yp_start, self.yp_end, self.c_maxlen) 247 | self.loss += (self.rlw * (self.rl_loss_a + self.gamma_b * self.rl_loss_b)) 248 | 249 | with tf.variable_scope('Output_Layer'): 250 | softmax_start_scores = tf.nn.softmax(tf.slice(logits1, [0, 1], [-1, -1])) 251 | softmax_end_scores = tf.nn.softmax(tf.slice(logits2, [0, 1], [-1, -1])) 252 | 253 
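# No-answer decoding: index 0 of logits1/logits2 is the unanswer_bias slot, so an
# argmax of 0 over the full softmax means "no answer". The masks below are 1 when
# an answer exists and 0 otherwise, and the *_move terms shift the final predicted
# indices to -1 for unanswerable questions.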
| unanswer_mask1 = tf.cast(tf.argmax(tf.nn.softmax(logits1), axis=-1), tf.int64) 254 | unanswer_mask1 = tf.cast(tf.cast(unanswer_mask1, tf.bool), tf.int64) # [bs,] has answer=1 no answer=0 255 | unanswer_move1 = unanswer_mask1 - 1 # [bs,] has answer=0 no answer=-1 256 | unanswer_mask2 = tf.cast(tf.argmax(tf.nn.softmax(logits2), axis=-1), tf.int64) 257 | unanswer_mask2 = tf.cast(tf.cast(unanswer_mask2, tf.bool), tf.int64) # [bs,] 258 | unanswer_move2 = unanswer_mask2 - 1 259 | 260 | softmax_start_p = tf.nn.softmax(logits2p) 261 | softmax_end_p = tf.nn.softmax(logits2p) 262 | softmax_start_scores = (1 - self.gamma_b) * softmax_start_scores + self.gamma_b * softmax_start_p 263 | softmax_end_scores = (1 - self.gamma_b) * softmax_end_scores + self.gamma_b * softmax_end_p 264 | 265 | outer = tf.matmul(tf.expand_dims(softmax_start_scores, axis=2), 266 | tf.expand_dims(softmax_end_scores, axis=1)) 267 | outer = tf.matrix_band_part(outer, 0, self.ans_limit) 268 | 269 | def position_encoding(x): 270 | import math 271 | for i in range(x.shape[0]): 272 | for j in range(x.shape[1]): 273 | if j - i > 5: 274 | x[i][j] = float(1.0 / math.log(j - i + 1)) 275 | return x 276 | 277 | mask_mat = tf.ones((self.c_maxlen, self.c_maxlen)) 278 | mask_mat = tf.expand_dims(tf.py_func(position_encoding, [mask_mat], tf.float32), axis=0) 279 | mask_mat = tf.tile(mask_mat, [self.un_size, 1, 1]) 280 | 281 | outer_masked = outer * mask_mat 282 | self.mask_output1 = tf.argmax(tf.reduce_max(outer_masked, axis=2), 283 | axis=1) * unanswer_mask1 + unanswer_move1 284 | self.mask_output2 = tf.argmax(tf.reduce_max(outer_masked, axis=1), 285 | axis=1) * unanswer_mask2 + unanswer_move2 286 | 287 | def complie(self): 288 | self.opt = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.8, beta2=0.999, epsilon=1e-7) 289 | grads = self.opt.compute_gradients(self.loss) 290 | gradients, variables = zip(*grads) 291 | capped_grads, _ = tf.clip_by_global_norm(gradients, self.grad_clip) 292 | self.train_op = self.opt.apply_gradients(zip(capped_grads, variables), global_step=self.global_step) 293 | 294 | # EMA 295 | with tf.variable_scope("EMA_Weights"): 296 | if self.decay is not None and self.decay < 1.: 297 | self.var_ema = tf.train.ExponentialMovingAverage(self.decay) 298 | with tf.control_dependencies([self.train_op]): 299 | self.ema_train_op = self.var_ema.apply( 300 | list(set(tf.trainable_variables()) ^ set(tf.trainable_variables('Cove_Layer')))) 301 | # assign ema weights 302 | self.assign_vars = [] 303 | for var in tf.global_variables(): 304 | v = self.var_ema.average(var) 305 | if v is not None: 306 | self.assign_vars.append(tf.assign(var, v)) 307 | 308 | 309 | # import numpy as np 310 | # 311 | # config = { 312 | # 'char_dim': 64, 313 | # 'cont_limit': 400, 314 | # 'ques_limit': 50, 315 | # 'char_limit': 16, 316 | # 'ans_limit': -1, 317 | # 'filters': 256, 318 | # 'dropout': 0.1, 319 | # 'dropout_emb': 0.1, 320 | # 'l2_norm': 3e-7, 321 | # 'decay': 0.9999, 322 | # 'gamma_c': 1.0, 323 | # 'gamma_b': 0.3, 324 | # 'learning_rate': 1e-3, 325 | # 'grad_clip': 5.0, 326 | # 'init_lambda': 3.0, 327 | # 'loss_type': 'use_plausible', 328 | # 'use_elmo': 0, 329 | # 'use_cove': 0, 330 | # 'use_feat': True, 331 | # 'optimizer': 'adam', 332 | # 'cove_path': 'Keras_CoVe_2layers.h5', 333 | # 'elmo_weights_path': 'elmo_tf/models/squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5', 334 | # 'elmo_options_path': 'elmo_tf/models/options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json', 335 | # 'train_tfrecords': 
'tfrecords/train_pre_elmo_cove.tfrecords', 336 | # 'dev_tfrecords': 'tfrecords/dev_pre_elmo_cove.tfrecords', 337 | # 'batch_size': 24, 338 | # 'epoch': 40, 339 | # 'origin_path': None, # not finetune 340 | # 'path': 'QANetV253' 341 | # } 342 | # word_mat = np.random.random((90950, 300)).astype(np.float32) 343 | # char_mat2 = np.random.random((94, 300)).astype(np.float32) 344 | # char_mat = np.random.random((1171, 300)).astype(np.float32) 345 | # model = Model(config, word_mat, char_mat, char_mat2) 346 | -------------------------------------------------------------------------------- /bilm/model.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import tensorflow as tf 4 | import h5py 5 | import json 6 | import re 7 | 8 | from .data import UnicodeCharsVocabulary, Batcher 9 | 10 | DTYPE = 'float32' 11 | DTYPE_INT = 'int64' 12 | 13 | 14 | class BidirectionalLanguageModel(object): 15 | def __init__( 16 | self, 17 | options_file: str, 18 | weight_file: str, 19 | use_character_inputs=True, 20 | embedding_weight_file=None, 21 | max_batch_size=128, 22 | ): 23 | ''' 24 | Creates the language model computational graph and loads weights 25 | 26 | Two options for input type: 27 | (1) To use character inputs (paired with Batcher) 28 | pass use_character_inputs=True, and ids_placeholder 29 | of shape (None, None, max_characters_per_token) 30 | to __call__ 31 | (2) To use token ids as input (paired with TokenBatcher), 32 | pass use_character_inputs=False and ids_placeholder 33 | of shape (None, None) to __call__. 34 | In this case, embedding_weight_file is also required input 35 | 36 | options_file: location of the json formatted file with 37 | LM hyperparameters 38 | weight_file: location of the hdf5 file with LM weights 39 | use_character_inputs: if True, then use character ids as input, 40 | otherwise use token ids 41 | max_batch_size: the maximum allowable batch size 42 | ''' 43 | with open(options_file, 'r') as fin: 44 | options = json.load(fin) 45 | 46 | if not use_character_inputs: 47 | if embedding_weight_file is None: 48 | raise ValueError( 49 | "embedding_weight_file is required input with " 50 | "not use_character_inputs" 51 | ) 52 | 53 | self._options = options 54 | self._weight_file = weight_file 55 | self._embedding_weight_file = embedding_weight_file 56 | self._use_character_inputs = use_character_inputs 57 | self._max_batch_size = max_batch_size 58 | 59 | self._ops = {} 60 | self._graphs = {} 61 | 62 | def __call__(self, ids_placeholder): 63 | ''' 64 | Given the input character ids (or token ids), returns a dictionary 65 | with tensorflow ops: 66 | 67 | {'lm_embeddings': embedding_op, 68 | 'lengths': sequence_lengths_op, 69 | 'mask': op to compute mask} 70 | 71 | embedding_op computes the LM embeddings and is shape 72 | (None, 3, None, 1024) 73 | lengths_op computes the sequence lengths and is shape (None, ) 74 | mask computes the sequence mask and is shape (None, None) 75 | 76 | ids_placeholder: a tf.placeholder of type int32. 
77 | If use_character_inputs=True, it is shape 78 | (None, None, max_characters_per_token) and holds the input 79 | character ids for a batch 80 | If use_character_input=False, it is shape (None, None) and 81 | holds the input token ids for a batch 82 | ''' 83 | if ids_placeholder in self._ops: 84 | # have already created ops for this placeholder, just return them 85 | ret = self._ops[ids_placeholder] 86 | 87 | else: 88 | # need to create the graph 89 | if len(self._ops) == 0: 90 | # first time creating the graph, don't reuse variables 91 | lm_graph = BidirectionalLanguageModelGraph( 92 | self._options, 93 | self._weight_file, 94 | ids_placeholder, 95 | embedding_weight_file=self._embedding_weight_file, 96 | use_character_inputs=self._use_character_inputs, 97 | max_batch_size=self._max_batch_size) 98 | else: 99 | with tf.variable_scope('', reuse=True): 100 | lm_graph = BidirectionalLanguageModelGraph( 101 | self._options, 102 | self._weight_file, 103 | ids_placeholder, 104 | embedding_weight_file=self._embedding_weight_file, 105 | use_character_inputs=self._use_character_inputs, 106 | max_batch_size=self._max_batch_size) 107 | 108 | ops = self._build_ops(lm_graph) 109 | self._ops[ids_placeholder] = ops 110 | self._graphs[ids_placeholder] = lm_graph 111 | ret = ops 112 | 113 | return ret 114 | 115 | def _build_ops(self, lm_graph): 116 | with tf.control_dependencies([lm_graph.update_state_op]): 117 | # get the LM embeddings 118 | token_embeddings = lm_graph.embedding 119 | layers = [ 120 | tf.concat([token_embeddings, token_embeddings], axis=2) 121 | ] 122 | 123 | n_lm_layers = len(lm_graph.lstm_outputs['forward']) 124 | for i in range(n_lm_layers): 125 | layers.append( 126 | tf.concat( 127 | [lm_graph.lstm_outputs['forward'][i], 128 | lm_graph.lstm_outputs['backward'][i]], 129 | axis=-1 130 | ) 131 | ) 132 | 133 | # The layers include the BOS/EOS tokens. Remove them 134 | sequence_length_wo_bos_eos = lm_graph.sequence_lengths - 2 135 | layers_without_bos_eos = [] 136 | for layer in layers: 137 | layer_wo_bos_eos = layer[:, 1:, :] 138 | layer_wo_bos_eos = tf.reverse_sequence( 139 | layer_wo_bos_eos, 140 | lm_graph.sequence_lengths - 1, 141 | seq_axis=1, 142 | batch_axis=0, 143 | ) 144 | layer_wo_bos_eos = layer_wo_bos_eos[:, 1:, :] 145 | layer_wo_bos_eos = tf.reverse_sequence( 146 | layer_wo_bos_eos, 147 | sequence_length_wo_bos_eos, 148 | seq_axis=1, 149 | batch_axis=0, 150 | ) 151 | layers_without_bos_eos.append(layer_wo_bos_eos) 152 | 153 | # concatenate the layers 154 | lm_embeddings = tf.concat( 155 | [tf.expand_dims(t, axis=1) for t in layers_without_bos_eos], 156 | axis=1 157 | ) 158 | 159 | # get the mask op without bos/eos. 
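# (Same double-reverse trick as for the layers above: the trailing EOS sits at a
# different position in each example, so the code drops the leading BOS, reverses
# each sequence so the old last step comes first, drops that step, then reverses
# back to the original order.)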
160 | # tf doesn't support reversing boolean tensors, so cast 161 | # to int then back 162 | mask_wo_bos_eos = tf.cast(lm_graph.mask[:, 1:], 'int32') 163 | mask_wo_bos_eos = tf.reverse_sequence( 164 | mask_wo_bos_eos, 165 | lm_graph.sequence_lengths - 1, 166 | seq_axis=1, 167 | batch_axis=0, 168 | ) 169 | mask_wo_bos_eos = mask_wo_bos_eos[:, 1:] 170 | mask_wo_bos_eos = tf.reverse_sequence( 171 | mask_wo_bos_eos, 172 | sequence_length_wo_bos_eos, 173 | seq_axis=1, 174 | batch_axis=0, 175 | ) 176 | mask_wo_bos_eos = tf.cast(mask_wo_bos_eos, 'bool') 177 | 178 | return { 179 | 'lm_embeddings': lm_embeddings, 180 | 'lengths': sequence_length_wo_bos_eos, 181 | 'token_embeddings': lm_graph.embedding, 182 | 'mask': mask_wo_bos_eos, 183 | } 184 | 185 | 186 | def _pretrained_initializer(varname, weight_file, embedding_weight_file=None): 187 | ''' 188 | We'll stub out all the initializers in the pretrained LM with 189 | a function that loads the weights from the file 190 | ''' 191 | weight_name_map = {} 192 | for i in range(2): 193 | for j in range(8): # if we decide to add more layers 194 | root = 'RNN_{}/RNN/MultiRNNCell/Cell{}'.format(i, j) 195 | weight_name_map[root + '/rnn/lstm_cell/kernel'] = \ 196 | root + '/LSTMCell/W_0' 197 | weight_name_map[root + '/rnn/lstm_cell/bias'] = \ 198 | root + '/LSTMCell/B' 199 | weight_name_map[root + '/rnn/lstm_cell/projection/kernel'] = \ 200 | root + '/LSTMCell/W_P_0' 201 | 202 | # convert the graph name to that in the checkpoint 203 | varname_in_file = varname[5:] 204 | if varname_in_file.startswith('RNN'): 205 | varname_in_file = weight_name_map[varname_in_file] 206 | 207 | if varname_in_file == 'embedding': 208 | with h5py.File(embedding_weight_file, 'r') as fin: 209 | # Have added a special 0 index for padding not present 210 | # in the original model. 211 | embed_weights = fin[varname_in_file][...] 212 | weights = np.zeros( 213 | (embed_weights.shape[0] + 1, embed_weights.shape[1]), 214 | dtype=DTYPE 215 | ) 216 | weights[1:, :] = embed_weights 217 | else: 218 | with h5py.File(weight_file, 'r') as fin: 219 | if varname_in_file == 'char_embed': 220 | # Have added a special 0 index for padding not present 221 | # in the original model. 222 | char_embed_weights = fin[varname_in_file][...] 223 | weights = np.zeros( 224 | (char_embed_weights.shape[0] + 1, 225 | char_embed_weights.shape[1]), 226 | dtype=DTYPE 227 | ) 228 | weights[1:, :] = char_embed_weights 229 | else: 230 | weights = fin[varname_in_file][...] 
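# At this point `weights` holds the numpy array stored in the HDF5 checkpoint for
# this variable; e.g. the graph variable
# 'bilm/RNN_0/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/kernel' is read from the
# checkpoint key 'RNN_0/RNN/MultiRNNCell/Cell0/LSTMCell/W_0' via weight_name_map
# above.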
231 | 232 | # Tensorflow initializers are callables that accept a shape parameter 233 | # and some optional kwargs 234 | def ret(shape, **kwargs): 235 | if list(shape) != list(weights.shape): 236 | raise ValueError( 237 | "Invalid shape initializing {0}, got {1}, expected {2}".format( 238 | varname_in_file, shape, weights.shape) 239 | ) 240 | return weights 241 | 242 | return ret 243 | 244 | 245 | class BidirectionalLanguageModelGraph(object): 246 | ''' 247 | Creates the computational graph and holds the ops necessary for runnint 248 | a bidirectional language model 249 | ''' 250 | def __init__(self, options, weight_file, ids_placeholder, 251 | use_character_inputs=True, embedding_weight_file=None, 252 | max_batch_size=128): 253 | 254 | self.options = options 255 | self._max_batch_size = max_batch_size 256 | self.ids_placeholder = ids_placeholder 257 | self.use_character_inputs = use_character_inputs 258 | 259 | # this custom_getter will make all variables not trainable and 260 | # override the default initializer 261 | def custom_getter(getter, name, *args, **kwargs): 262 | kwargs['trainable'] = False 263 | kwargs['initializer'] = _pretrained_initializer( 264 | name, weight_file, embedding_weight_file 265 | ) 266 | return getter(name, *args, **kwargs) 267 | 268 | if embedding_weight_file is not None: 269 | # get the vocab size 270 | with h5py.File(embedding_weight_file, 'r') as fin: 271 | # +1 for padding 272 | self._n_tokens_vocab = fin['embedding'].shape[0] + 1 273 | else: 274 | self._n_tokens_vocab = None 275 | 276 | with tf.variable_scope('bilm', custom_getter=custom_getter): 277 | self._build() 278 | 279 | def _build(self): 280 | if self.use_character_inputs: 281 | self._build_word_char_embeddings() 282 | else: 283 | self._build_word_embeddings() 284 | self._build_lstms() 285 | 286 | def _build_word_char_embeddings(self): 287 | ''' 288 | options contains key 'char_cnn': { 289 | 290 | 'n_characters': 262, 291 | 292 | # includes the start / end characters 293 | 'max_characters_per_token': 50, 294 | 295 | 'filters': [ 296 | [1, 32], 297 | [2, 32], 298 | [3, 64], 299 | [4, 128], 300 | [5, 256], 301 | [6, 512], 302 | [7, 1024] 303 | ], 304 | 'activation': 'tanh', 305 | 306 | # for the character embedding 307 | 'embedding': {'dim': 16} 308 | 309 | # for highway layers 310 | # if omitted, then no highway layers 311 | 'n_highway': 2, 312 | } 313 | ''' 314 | projection_dim = self.options['lstm']['projection_dim'] 315 | 316 | cnn_options = self.options['char_cnn'] 317 | filters = cnn_options['filters'] 318 | n_filters = sum(f[1] for f in filters) 319 | max_chars = cnn_options['max_characters_per_token'] 320 | char_embed_dim = cnn_options['embedding']['dim'] 321 | n_chars = cnn_options['n_characters'] 322 | if n_chars != 262: 323 | raise InvalidNumberOfCharacters( 324 | "Set n_characters=262 after training see the README.md" 325 | ) 326 | if cnn_options['activation'] == 'tanh': 327 | activation = tf.nn.tanh 328 | elif cnn_options['activation'] == 'relu': 329 | activation = tf.nn.relu 330 | 331 | # the character embeddings 332 | with tf.device("/cpu:0"): 333 | self.embedding_weights = tf.get_variable( 334 | "char_embed", [n_chars, char_embed_dim], 335 | dtype=DTYPE, 336 | initializer=tf.random_uniform_initializer(-1.0, 1.0) 337 | ) 338 | # shape (batch_size, unroll_steps, max_chars, embed_dim) 339 | self.char_embedding = tf.nn.embedding_lookup(self.embedding_weights, 340 | self.ids_placeholder) 341 | 342 | # the convolutions 343 | def make_convolutions(inp): 344 | # inp: [bs, len_seq, len_char, 
345 |             with tf.variable_scope('CNN') as scope:
346 |                 convolutions = []
347 |                 for i, (width, num) in enumerate(filters):
348 |                     if cnn_options['activation'] == 'relu':
349 |                         # He initialization for ReLU activation
350 |                         # with char embeddings init between -1 and 1
351 |                         #w_init = tf.random_normal_initializer(
352 |                         #    mean=0.0,
353 |                         #    stddev=np.sqrt(2.0 / (width * char_embed_dim))
354 |                         #)
355 |
356 |                         # Kim et al 2015, +/- 0.05
357 |                         w_init = tf.random_uniform_initializer(
358 |                             minval=-0.05, maxval=0.05)
359 |                     elif cnn_options['activation'] == 'tanh':
360 |                         # glorot init
361 |                         w_init = tf.random_normal_initializer(
362 |                             mean=0.0,
363 |                             stddev=np.sqrt(1.0 / (width * char_embed_dim))
364 |                         )
365 |                     w = tf.get_variable(
366 |                         "W_cnn_%s" % i,
367 |                         [1, width, char_embed_dim, num],
368 |                         initializer=w_init,
369 |                         dtype=DTYPE)
370 |                     b = tf.get_variable(
371 |                         "b_cnn_%s" % i, [num], dtype=DTYPE,
372 |                         initializer=tf.constant_initializer(0.0))
373 |
374 |                     conv = tf.nn.conv2d(
375 |                         inp, w,
376 |                         strides=[1, 1, 1, 1],
377 |                         padding="VALID") + b
378 |                     # conv [bs, len_seq, len_char-width+1, filters]
379 |                     # now max pool
380 |                     conv = tf.nn.max_pool(
381 |                         conv, [1, 1, max_chars-width+1, 1],
382 |                         [1, 1, 1, 1], 'VALID')
383 |
384 |                     # activation
385 |                     conv = activation(conv)
386 |                     conv = tf.squeeze(conv, squeeze_dims=[2])
387 |                     # [bs, len_seq, filters]
388 |
389 |                     convolutions.append(conv)
390 |
391 |             return tf.concat(convolutions, 2)
392 |
393 |         embedding = make_convolutions(self.char_embedding)
394 |
395 |         # for highway and projection layers
396 |         n_highway = cnn_options.get('n_highway')
397 |         use_highway = n_highway is not None and n_highway > 0
398 |         use_proj = n_filters != projection_dim
399 |
400 |         if use_highway or use_proj:
401 |             # reshape from (batch_size, n_tokens, dim) to (-1, dim)
402 |             batch_size_n_tokens = tf.shape(embedding)[0:2]
403 |             embedding = tf.reshape(embedding, [-1, n_filters])
404 |
405 |         # set up weights for projection
406 |         if use_proj:
407 |             assert n_filters > projection_dim
408 |             with tf.variable_scope('CNN_proj') as scope:
409 |                 W_proj_cnn = tf.get_variable(
410 |                     "W_proj", [n_filters, projection_dim],
411 |                     initializer=tf.random_normal_initializer(
412 |                         mean=0.0, stddev=np.sqrt(1.0 / n_filters)),
413 |                     dtype=DTYPE)
414 |                 b_proj_cnn = tf.get_variable(
415 |                     "b_proj", [projection_dim],
416 |                     initializer=tf.constant_initializer(0.0),
417 |                     dtype=DTYPE)
418 |
419 |         # apply highway layers
420 |         def high(x, ww_carry, bb_carry, ww_tr, bb_tr):
421 |             carry_gate = tf.nn.sigmoid(tf.matmul(x, ww_carry) + bb_carry)
422 |             transform_gate = tf.nn.relu(tf.matmul(x, ww_tr) + bb_tr)
423 |             return carry_gate * transform_gate + (1.0 - carry_gate) * x
424 |
425 |         if use_highway:
426 |             highway_dim = n_filters
427 |
428 |             for i in range(n_highway):
429 |                 with tf.variable_scope('CNN_high_%s' % i) as scope:
430 |                     W_carry = tf.get_variable(
431 |                         'W_carry', [highway_dim, highway_dim],
432 |                         # glorot init
433 |                         initializer=tf.random_normal_initializer(
434 |                             mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
435 |                         dtype=DTYPE)
436 |                     b_carry = tf.get_variable(
437 |                         'b_carry', [highway_dim],
438 |                         initializer=tf.constant_initializer(-2.0),
439 |                         dtype=DTYPE)
440 |                     W_transform = tf.get_variable(
441 |                         'W_transform', [highway_dim, highway_dim],
442 |                         initializer=tf.random_normal_initializer(
443 |                             mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
444 |                         dtype=DTYPE)
445 |                     b_transform = tf.get_variable(
446 |                         'b_transform', [highway_dim],
447 |                         initializer=tf.constant_initializer(0.0),
448 |                         dtype=DTYPE)
449 |
450 |                 embedding = high(embedding, W_carry, b_carry,
451 |                                  W_transform, b_transform)
452 |
453 |         # finally project down if needed
454 |         if use_proj:
455 |             embedding = tf.matmul(embedding, W_proj_cnn) + b_proj_cnn
456 |
457 |         # reshape back to (batch_size, tokens, dim)
458 |         if use_highway or use_proj:
459 |             shp = tf.concat([batch_size_n_tokens, [projection_dim]], axis=0)
460 |             embedding = tf.reshape(embedding, shp)
461 |
462 |         # at last assign attributes for remainder of the model
463 |         self.embedding = embedding
464 |
465 |
466 |     def _build_word_embeddings(self):
467 |         projection_dim = self.options['lstm']['projection_dim']
468 |
469 |         # the word embeddings
470 |         with tf.device("/cpu:0"):
471 |             self.embedding_weights = tf.get_variable(
472 |                 "embedding", [self._n_tokens_vocab, projection_dim],
473 |                 dtype=DTYPE,
474 |             )
475 |             self.embedding = tf.nn.embedding_lookup(self.embedding_weights,
476 |                                                     self.ids_placeholder)
477 |
478 |
479 |     def _build_lstms(self):
480 |         # now the LSTMs
481 |         # these will collect the initial states for the forward
482 |         # (and reverse LSTMs if we are doing bidirectional)
483 |
484 |         # parse the options
485 |         lstm_dim = self.options['lstm']['dim']
486 |         projection_dim = self.options['lstm']['projection_dim']
487 |         n_lstm_layers = self.options['lstm'].get('n_layers', 1)
488 |         cell_clip = self.options['lstm'].get('cell_clip')
489 |         proj_clip = self.options['lstm'].get('proj_clip')
490 |         use_skip_connections = self.options['lstm']['use_skip_connections']
491 |         if use_skip_connections:
492 |             print("USING SKIP CONNECTIONS")
493 |         else:
494 |             print("NOT USING SKIP CONNECTIONS")
495 |
496 |         # the sequence lengths from input mask
497 |         if self.use_character_inputs:
498 |             mask = tf.reduce_any(self.ids_placeholder > 0, axis=2)
499 |         else:
500 |             mask = self.ids_placeholder > 0
501 |         sequence_lengths = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
502 |         batch_size = tf.shape(sequence_lengths)[0]
503 |
504 |         # for each direction, we'll store tensors for each layer
505 |         self.lstm_outputs = {'forward': [], 'backward': []}
506 |         self.lstm_state_sizes = {'forward': [], 'backward': []}
507 |         self.lstm_init_states = {'forward': [], 'backward': []}
508 |         self.lstm_final_states = {'forward': [], 'backward': []}
509 |
510 |         update_ops = []
511 |         for direction in ['forward', 'backward']:
512 |             if direction == 'forward':
513 |                 layer_input = self.embedding
514 |             else:
515 |                 layer_input = tf.reverse_sequence(
516 |                     self.embedding,
517 |                     sequence_lengths,
518 |                     seq_axis=1,
519 |                     batch_axis=0
520 |                 )
521 |             for i in range(n_lstm_layers):
522 |                 if projection_dim < lstm_dim:
523 |                     # we are projecting the output down to projection_dim
524 |                     lstm_cell = tf.nn.rnn_cell.LSTMCell(
525 |                         lstm_dim, num_proj=projection_dim,
526 |                         cell_clip=cell_clip, proj_clip=proj_clip)
527 |                 else:
528 |                     lstm_cell = tf.nn.rnn_cell.LSTMCell(
529 |                         lstm_dim,
530 |                         cell_clip=cell_clip, proj_clip=proj_clip)
531 |
532 |                 if use_skip_connections:
533 |                     # ResidualWrapper adds inputs to outputs
534 |                     if i == 0:
535 |                         # don't add skip connection from token embedding to
536 |                         # 1st layer output
537 |                         pass
538 |                     else:
539 |                         # add a skip connection
540 |                         lstm_cell = tf.nn.rnn_cell.ResidualWrapper(lstm_cell)
541 |
542 |                 # collect the input state, run the dynamic rnn, collect
543 |                 # the output
544 |                 state_size = lstm_cell.state_size
545 |                 # the LSTMs are stateful. To support multiple batch sizes,
546 |                 # we'll allocate size for states up to max_batch_size,
547 |                 # then use the first batch_size entries for each batch
548 |                 init_states = [
549 |                     tf.Variable(
550 |                         tf.zeros([self._max_batch_size, dim]),
551 |                         trainable=False
552 |                     )
553 |                     for dim in lstm_cell.state_size
554 |                 ]
555 |                 batch_init_states = [
556 |                     state[:batch_size, :] for state in init_states
557 |                 ]
558 |
559 |                 if direction == 'forward':
560 |                     i_direction = 0
561 |                 else:
562 |                     i_direction = 1
563 |                 variable_scope_name = 'RNN_{0}/RNN/MultiRNNCell/Cell{1}'.format(
564 |                     i_direction, i)
565 |                 with tf.variable_scope(variable_scope_name):
566 |                     layer_output, final_state = tf.nn.dynamic_rnn(
567 |                         lstm_cell,
568 |                         layer_input,
569 |                         sequence_length=sequence_lengths,
570 |                         initial_state=tf.nn.rnn_cell.LSTMStateTuple(
571 |                             *batch_init_states),
572 |                     )
573 |
574 |                 self.lstm_state_sizes[direction].append(lstm_cell.state_size)
575 |                 self.lstm_init_states[direction].append(init_states)
576 |                 self.lstm_final_states[direction].append(final_state)
577 |                 if direction == 'forward':
578 |                     self.lstm_outputs[direction].append(layer_output)
579 |                 else:
580 |                     self.lstm_outputs[direction].append(
581 |                         tf.reverse_sequence(
582 |                             layer_output,
583 |                             sequence_lengths,
584 |                             seq_axis=1,
585 |                             batch_axis=0
586 |                         )
587 |                     )
588 |
589 |                 with tf.control_dependencies([layer_output]):
590 |                     # update the initial states
591 |                     for i in range(2):
592 |                         new_state = tf.concat(
593 |                             [final_state[i][:batch_size, :],
594 |                              init_states[i][batch_size:, :]], axis=0)
595 |                         state_update_op = tf.assign(init_states[i], new_state)
596 |                         update_ops.append(state_update_op)
597 |
598 |                 layer_input = layer_output
599 |
600 |         self.mask = mask
601 |         self.sequence_lengths = sequence_lengths
602 |         self.update_state_op = tf.group(*update_ops)
603 |
604 |
605 | def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
606 |     '''
607 |     Given an input vocabulary file, dump all the token embeddings to the
608 |     outfile. The result can be used as the embedding_weight_file when
609 |     constructing a BidirectionalLanguageModel.
610 |     '''
611 |     with open(options_file, 'r') as fin:
612 |         options = json.load(fin)
613 |     max_word_length = options['char_cnn']['max_characters_per_token']
614 |
615 |     vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
616 |     batcher = Batcher(vocab_file, max_word_length)
617 |
618 |     ids_placeholder = tf.placeholder('int32',
619 |                                      shape=(None, None, max_word_length)
620 |     )
621 |     model = BidirectionalLanguageModel(options_file, weight_file)
622 |     embedding_op = model(ids_placeholder)['token_embeddings']
623 |
624 |     n_tokens = vocab.size
625 |     embed_dim = int(embedding_op.shape[2])
626 |
627 |     embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)
628 |
629 |     config = tf.ConfigProto(allow_soft_placement=True)
630 |     with tf.Session(config=config) as sess:
631 |         sess.run(tf.global_variables_initializer())
632 |         for k in range(n_tokens):
633 |             token = vocab.id_to_word(k)
634 |             char_ids = batcher.batch_sentences([[token]])[0, 1, :].reshape(
635 |                 1, 1, -1)
636 |             embeddings[k, :] = sess.run(
637 |                 embedding_op, feed_dict={ids_placeholder: char_ids}
638 |             )
639 |
640 |     with h5py.File(outfile, 'w') as fout:
641 |         ds = fout.create_dataset(
642 |             'embedding', embeddings.shape, dtype='float32', data=embeddings
643 |         )
644 |
645 | def dump_bilm_embeddings(vocab_file, dataset_file, options_file,
646 |                          weight_file, outfile):
647 |     with open(options_file, 'r') as fin:
648 |         options = json.load(fin)
649 |     max_word_length = options['char_cnn']['max_characters_per_token']
650 |
651 |     vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
652 |     batcher = Batcher(vocab_file, max_word_length)
653 |
654 |     ids_placeholder = tf.placeholder('int32',
655 |                                      shape=(None, None, max_word_length)
656 |     )
657 |     model = BidirectionalLanguageModel(options_file, weight_file)
658 |     ops = model(ids_placeholder)
659 |
660 |     config = tf.ConfigProto(allow_soft_placement=True)
661 |     with tf.Session(config=config) as sess:
662 |         sess.run(tf.global_variables_initializer())
663 |         sentence_id = 0
664 |         with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
665 |             for line in fin:
666 |                 sentence = line.strip().split()
667 |                 char_ids = batcher.batch_sentences([sentence])
668 |                 embeddings = sess.run(
669 |                     ops['lm_embeddings'], feed_dict={ids_placeholder: char_ids}
670 |                 )
671 |                 ds = fout.create_dataset(
672 |                     '{}'.format(sentence_id),
673 |                     embeddings.shape[1:], dtype='float32',
674 |                     data=embeddings[0, :, :, :]
675 |                 )
676 |
677 |                 sentence_id += 1
678 |
679 |
--------------------------------------------------------------------------------
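A minimal usage sketch for `dump_token_embeddings` above, showing how the cached file feeds back into a token-level biLM. This is illustrative only: the file paths (`vocab.txt`, `options.json`, `lm_weights.hdf5`, `token_embeddings.hdf5`) are placeholders, and it assumes `BidirectionalLanguageModel` forwards the same `use_character_inputs` / `embedding_weight_file` keyword arguments as `BidirectionalLanguageModelGraph`, and that `weight_layers` returns its usual `weighted_op` tensor.

```python
import tensorflow as tf
from bilm import (TokenBatcher, BidirectionalLanguageModel,
                  dump_token_embeddings, weight_layers)

# Placeholder paths -- substitute the real vocab/options/weight files.
vocab_file = 'vocab.txt'
options_file = 'options.json'
weight_file = 'lm_weights.hdf5'
token_embedding_file = 'token_embeddings.hdf5'

# 1. Run the char-CNN once per vocabulary token and cache the result.
dump_token_embeddings(vocab_file, options_file, weight_file,
                      token_embedding_file)
tf.reset_default_graph()

# 2. Build a token-level biLM that reads the cached embeddings instead of
#    recomputing the character convolutions for every batch (assumes these
#    kwargs mirror BidirectionalLanguageModelGraph above).
token_ids = tf.placeholder('int32', shape=(None, None))
bilm = BidirectionalLanguageModel(
    options_file, weight_file,
    use_character_inputs=False,
    embedding_weight_file=token_embedding_file)
ops = bilm(token_ids)

# 3. Mix the biLM layers into a single ELMo representation.
elmo = weight_layers('elmo', ops, l2_coef=0.0)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batcher = TokenBatcher(vocab_file)
    batch = batcher.batch_sentences([['Reinforced', 'Mnemonic', 'Reader']])
    vectors = sess.run(elmo['weighted_op'], feed_dict={token_ids: batch})
```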