├── data
│   ├── fetch_and_preprocess.sh
│   ├── download.py
│   └── preprocess_data.py
├── scripts
│   └── enc_nli
│       ├── test.sh
│       ├── train.sh
│       ├── train.py
│       ├── gen.py
│       ├── data_iterator.py
│       └── main.py
├── Readme.md
└── LICENSE

/data/fetch_and_preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | python download.py
4 | python preprocess_data.py
5 | 
--------------------------------------------------------------------------------
/scripts/enc_nli/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export THEANO_FLAGS='mode=FAST_RUN,device=gpu0,floatX=float32,optimizer_including=cudnn,warn_float64=warn,lib.cnmem=0.9'
4 | 
5 | # export THEANO_FLAGS=device=cpu,floatX=float32
6 | 
7 | python -u ./gen.py > log_test.txt 2>&1 &
8 | 
9 | 
10 | 
11 | 
12 | 
--------------------------------------------------------------------------------
/scripts/enc_nli/train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # GPU
4 | export THEANO_FLAGS='mode=FAST_RUN,device=gpu0,floatX=float32,optimizer_including=cudnn,warn_float64=warn,lib.cnmem=0.9'
5 | 
6 | # CPU
7 | # export THEANO_FLAGS='mode=FAST_RUN,device=cpu,floatX=float32'
8 | 
9 | python -u ./train.py > log.txt 2>&1 &
10 | 
11 | 
12 | 
13 | 
14 | 
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
1 | # LSTM Encoder for Natural Language Inference
2 | 
3 | ## Dependencies
4 | To run this code, you will need:
5 | * Python 2.7
6 | * Theano 0.8.2
7 | 
8 | ## Running the Script
9 | 1. Download and preprocess the data (GloVe vectors and the MultiNLI corpus)
10 | ```
11 | cd data
12 | bash fetch_and_preprocess.sh
13 | ```
14 | 
15 | 2. Train and test the model
16 | ```
17 | cd scripts/enc_nli
18 | bash train.sh
19 | ```
20 | 
21 | The results are written to the `log.txt` file.
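
3. (Optional) Re-run evaluation with a trained model
```
cd scripts/enc_nli
bash test.sh
```

This runs `gen.py`, which reloads the saved `enc_nli.npz` model, reports accuracy on the matched and mismatched dev sets, and writes its output to `log_test.txt`.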
22 | -------------------------------------------------------------------------------- /data/download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads the following: 3 | - Glove vectors 4 | - Stanford Natural Language Inference (SNLI) Corpus 5 | 6 | """ 7 | 8 | import sys 9 | import os 10 | import zipfile 11 | import gzip 12 | 13 | def download(url, dirpath): 14 | filename = url.split('/')[-1] 15 | filepath = os.path.join(dirpath, filename) 16 | os.system('wget {} -O {}'.format(url, filepath)) 17 | return filepath 18 | 19 | def unzip(filepath): 20 | print("Extracting: " + filepath) 21 | dirpath = os.path.dirname(filepath) 22 | with zipfile.ZipFile(filepath) as zf: 23 | zf.extractall(dirpath) 24 | os.remove(filepath) 25 | 26 | def download_wordvecs(dirpath): 27 | if os.path.exists(dirpath): 28 | print('Found Glove vectors - skip') 29 | return 30 | else: 31 | os.makedirs(dirpath) 32 | url = 'http://www-nlp.stanford.edu/data/glove.840B.300d.zip' 33 | unzip(download(url, dirpath)) 34 | 35 | def download_multinli(dirpath): 36 | if os.path.exists(dirpath): 37 | print('Found MultiNLI dataset - skip') 38 | return 39 | else: 40 | os.makedirs(dirpath) 41 | url = 'http://www.nyu.edu/projects/bowman/multinli/multinli_0.9.zip' 42 | unzip(download(url, dirpath)) 43 | 44 | 45 | if __name__ == '__main__': 46 | base_dir = os.path.dirname(os.path.realpath(__file__)) 47 | multinli_dir = os.path.join(base_dir, 'multinli') 48 | wordvec_dir = os.path.join(base_dir, 'glove') 49 | download_multinli(multinli_dir) 50 | download_wordvecs(wordvec_dir) 51 | 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Qian Chen 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /scripts/enc_nli/train.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | from main import train 5 | 6 | if __name__ == '__main__': 7 | model_name = os.path.basename(os.path.dirname(os.path.realpath(__file__))) 8 | train( 9 | saveto = './{}.npz'.format(model_name), 10 | reload_ = False, 11 | dim_word = 300, 12 | dim = 600, 13 | patience = 7, 14 | n_words = 100140, 15 | decay_c = 0., 16 | clip_c = 10., 17 | lrate = 0.0004, 18 | optimizer = 'adam', 19 | maxlen = 450, 20 | batch_size = 32, 21 | valid_batch_size = 32, 22 | dispFreq = 100, 23 | validFreq = int(392702/32+1), 24 | saveFreq = int(392702/32+1), 25 | use_dropout = True, 26 | verbose = False, 27 | datasets = ['../../data/word_sequence/premise_multinli_0.9_train.txt', 28 | '../../data/word_sequence/hypothesis_multinli_0.9_train.txt', 29 | '../../data/word_sequence/label_multinli_0.9_train.txt'], 30 | valid_datasets = ['../../data/word_sequence/premise_multinli_0.9_dev_matched.txt', 31 | '../../data/word_sequence/hypothesis_multinli_0.9_dev_matched.txt', 32 | '../../data/word_sequence/label_multinli_0.9_dev_matched.txt'], 33 | test_datasets = ['../../data/word_sequence/premise_multinli_0.9_dev_mismatched.txt', 34 | '../../data/word_sequence/hypothesis_multinli_0.9_dev_mismatched.txt', 35 | '../../data/word_sequence/label_multinli_0.9_dev_mismatched.txt'], 36 | dictionary = '../../data/word_sequence/vocab_cased.pkl', 37 | embedding = '../../data/glove/glove.840B.300d.txt', 38 | ) 39 | 40 | -------------------------------------------------------------------------------- /data/preprocess_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import os 4 | import numpy 5 | import cPickle as pkl 6 | 7 | from collections import OrderedDict 8 | 9 | dic = {'entailment': '0', 'neutral': '1', 'contradiction': '2'} 10 | 11 | def build_dictionary(filepaths, dst_path, lowercase=False): 12 | word_freqs = OrderedDict() 13 | for filepath in filepaths: 14 | print 'Processing', filepath 15 | with open(filepath, 'r') as f: 16 | for line in f: 17 | if lowercase: 18 | line = line.lower() 19 | words_in = line.strip().split(' ') 20 | for w in words_in: 21 | if w not in word_freqs: 22 | word_freqs[w] = 0 23 | word_freqs[w] += 1 24 | 25 | words = word_freqs.keys() 26 | freqs = word_freqs.values() 27 | 28 | sorted_idx = numpy.argsort(freqs) 29 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 30 | 31 | worddict = OrderedDict() 32 | worddict['_PAD_'] = 0 # default, padding 33 | worddict['_UNK_'] = 1 # out-of-vocabulary 34 | worddict['_BOS_'] = 2 # begin of sentence token 35 | worddict['_EOS_'] = 3 # end of sentence token 36 | 37 | for ii, ww in enumerate(sorted_words): 38 | worddict[ww] = ii + 4 39 | 40 | with open(dst_path, 'wb') as f: 41 | pkl.dump(worddict, f) 42 | 43 | print 'Dict size', len(worddict) 44 | print 'Done' 45 | 46 | 47 | def build_sequence(filepath, dst_dir, isTest=False): 48 | filename = os.path.basename(filepath) 49 | print filename 50 | len_p = [] 51 | len_h = [] 52 | with open(filepath) as f, \ 53 | open(os.path.join(dst_dir, 'premise_%s'%filename), 'w') as f1, \ 54 | open(os.path.join(dst_dir, 'hypothesis_%s'%filename), 'w') as f2, \ 55 | open(os.path.join(dst_dir, 'label_%s'%filename), 'w') as f3: 56 | next(f) # skip the header row 57 | for line in f: 58 | sents = line.strip().split('\t') 59 | if 
sents[0] == '-':
60 |                 continue
61 | 
62 |             words_in = sents[1].strip().split(' ')
63 |             words_in = [x for x in words_in if x not in ('(',')')]
64 |             f1.write(' '.join(words_in) + '\n')
65 |             len_p.append(len(words_in))
66 | 
67 |             words_in = sents[2].strip().split(' ')
68 |             words_in = [x for x in words_in if x not in ('(',')')]
69 |             f2.write(' '.join(words_in) + '\n')
70 |             len_h.append(len(words_in))
71 |             if isTest:
72 |                 f3.write('0' + '\n')
73 |             else:
74 |                 f3.write(dic[sents[0]] + '\n')
75 | 
76 |     print 'max min len premise', max(len_p), min(len_p)
77 |     print 'max min len hypothesis', max(len_h), min(len_h)
78 | 
79 | 
80 | def make_dirs(dirs):
81 |     for d in dirs:
82 |         if not os.path.exists(d):
83 |             os.makedirs(d)
84 | 
85 | if __name__ == '__main__':
86 |     print('=' * 80)
87 |     print('Preprocessing multinli_0.9 dataset')
88 |     print('=' * 80)
89 |     base_dir = os.path.dirname(os.path.realpath(__file__))
90 |     dst_dir = os.path.join(base_dir, 'word_sequence')
91 |     multinli_dir = os.path.join(base_dir, 'multinli/multinli_0.9')
92 |     make_dirs([dst_dir])
93 | 
94 |     build_sequence(os.path.join(multinli_dir, 'multinli_0.9_train.txt'), dst_dir)
95 |     build_sequence(os.path.join(multinli_dir, 'multinli_0.9_dev_matched.txt'), dst_dir)
96 |     build_sequence(os.path.join(multinli_dir, 'multinli_0.9_dev_mismatched.txt'), dst_dir)
97 |     # build_sequence(os.path.join(multinli_dir, 'multinli_0.9_test_matched_unlabeled.txt'), dst_dir, isTest=True)
98 |     # build_sequence(os.path.join(multinli_dir, 'multinli_0.9_test_mismatched_unlabeled.txt'), dst_dir, isTest=True)
99 | 
100 |     build_dictionary([os.path.join(dst_dir, 'premise_multinli_0.9_train.txt'),
101 |                       os.path.join(dst_dir, 'hypothesis_multinli_0.9_train.txt')],
102 |                      os.path.join(dst_dir, 'vocab_cased.pkl'))
103 | 
--------------------------------------------------------------------------------
/scripts/enc_nli/gen.py:
--------------------------------------------------------------------------------
1 | '''
2 | Generate
3 | '''
4 | import argparse
5 | import theano
6 | import numpy
7 | import cPickle as pkl
8 | import os
9 | from data_iterator import TextIterator
10 | 
11 | from main import (build_model, pred_probs, prepare_data, pred_acc, load_params,
12 |                   init_params, init_tparams)
13 | 
14 | def main():
15 |     dic = {'0':'entailment', '1':'neutral', '2':'contradiction'}
16 | 
17 |     dev_matched_datasets=['../../data/word_sequence/premise_multinli_0.9_dev_matched.txt',
18 |                           '../../data/word_sequence/hypothesis_multinli_0.9_dev_matched.txt',
19 |                           '../../data/word_sequence/label_multinli_0.9_dev_matched.txt']
20 |     dev_mismatched_datasets=['../../data/word_sequence/premise_multinli_0.9_dev_mismatched.txt',
21 |                              '../../data/word_sequence/hypothesis_multinli_0.9_dev_mismatched.txt',
22 |                              '../../data/word_sequence/label_multinli_0.9_dev_mismatched.txt']
23 |     dictionary='../../data/word_sequence/vocab_cased.pkl'
24 | 
25 |     # load model options
26 |     model_name = os.path.basename(os.path.dirname(os.path.realpath(__file__)))
27 |     model = './{}.npz'.format(model_name)
28 |     with open('%s.pkl' % model, 'rb') as f:
29 |         options = pkl.load(f)
30 | 
31 |     print options
32 | 
33 |     # load dictionary and invert
34 |     with open(dictionary, 'rb') as f:
35 |         word_dict = pkl.load(f)
36 |     word_idict = dict()
37 |     for kk, vv in word_dict.iteritems():
38 |         word_idict[vv] = kk
39 | 
40 |     dev_matched = TextIterator(dev_matched_datasets[0], dev_matched_datasets[1], dev_matched_datasets[2],
41 |                                dictionary,
42 |                                n_words=options['n_words'],
43 |                                batch_size=options['valid_batch_size'],
44 |                                shuffle=False)
45 | 
dev_mismatched = TextIterator(dev_mismatched_datasets[0], dev_mismatched_datasets[1], dev_mismatched_datasets[2], 46 | dictionary, 47 | n_words=options['n_words'], 48 | batch_size=options['valid_batch_size'], 49 | shuffle=False) 50 | 51 | # allocate model parameters 52 | params = init_params(options, word_dict) 53 | 54 | # load model parameters and set theano shared variables 55 | params = load_params(model, params) 56 | tparams = init_tparams(params) 57 | 58 | trng, use_noise, \ 59 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y, \ 60 | opt_ret, \ 61 | cost, \ 62 | f_pred, f_prods = \ 63 | build_model(tparams, options) 64 | 65 | use_noise.set_value(0.) 66 | dev_matched_acc = pred_acc(f_pred, prepare_data, options, dev_matched, word_idict) 67 | dev_mismatched_acc = pred_acc(f_pred, prepare_data, options, dev_mismatched, word_idict) 68 | 69 | print 'dev_matched accuracy', dev_matched_acc 70 | print 'dev_mismatched accuracy', dev_mismatched_acc 71 | 72 | predict_labels_dev_matched = pred_label(f_prods, prepare_data, options, dev_matched, word_idict) 73 | predict_labels_dev_mismatched = pred_label(f_prods, prepare_data, options, dev_mismatched, word_idict) 74 | 75 | with open('./dev_matched_output.txt', 'w') as fw: 76 | with open(dev_matched_datasets[0], 'r') as f1: 77 | with open(dev_matched_datasets[1], 'r') as f2: 78 | with open(dev_matched_datasets[2], 'r') as f3: 79 | for a, b, c, d in zip(predict_labels_dev_matched, f3, f1, f2): 80 | fw.write(str(a) + '\t' + b.rstrip() + '\t' + c.rstrip() + '\t' + d.rstrip() + '\n') 81 | 82 | with open('./dev_dismatched_output.txt', 'w') as fw: 83 | with open(dev_mismatched_datasets[0], 'r') as f1: 84 | with open(dev_mismatched_datasets[1], 'r') as f2: 85 | with open(dev_mismatched_datasets[2], 'r') as f3: 86 | for a, b, c, d in zip(predict_labels_dev_mismatched, f3, f1, f2): 87 | fw.write(str(a) + '\t' + b.rstrip() + '\t' + c.rstrip() + '\t' + d.rstrip() + '\n') 88 | 89 | print 'Done' 90 | 91 | def pred_label(f_prods, prepare_data, options, iterator, word_idict): 92 | labels = [] 93 | valid_acc = 0 94 | n_done = 0 95 | for x1_, x2_, y_ in iterator: 96 | n_done += len(x1_) 97 | lengths_x1 = [len(s) for s in x1_] 98 | lengths_x2 = [len(s) for s in x2_] 99 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y = prepare_data(x1_, x2_, y_, word_idict) 100 | inps = [x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask] 101 | prods = f_prods(*inps) 102 | preds = prods.argmax(axis=1) 103 | valid_acc += (preds == y).sum() 104 | labels = labels + preds.tolist() 105 | 106 | valid_acc = 1.0 * valid_acc / n_done 107 | print "total sampel", n_done 108 | print "Acc", valid_acc 109 | 110 | return labels 111 | 112 | if __name__ == "__main__": 113 | parser = argparse.ArgumentParser() 114 | args = parser.parse_args() 115 | main() 116 | -------------------------------------------------------------------------------- /scripts/enc_nli/data_iterator.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import gzip 3 | import numpy 4 | import random 5 | import math 6 | 7 | def fopen(filename, mode='r'): 8 | if filename.endswith('.gz'): 9 | return gzip.open(filename, mode) 10 | return open(filename, mode) 11 | 12 | class TextIterator: 13 | """Simple Bitext iterator.""" 14 | def __init__(self, source, target, label, 15 | dict, 16 | batch_size=128, 17 | n_words=-1, 18 | shuffle=True): 19 | self.source = fopen(source, 'r') 20 | self.target = 
fopen(target, 'r') 21 | self.label = fopen(label, 'r') 22 | with open(dict, 'rb') as f: 23 | self.dict = pkl.load(f) 24 | self.batch_size = batch_size 25 | self.n_words = n_words 26 | self.shuffle = shuffle 27 | self.end_of_data = False 28 | 29 | self.source_buffer = [] 30 | self.target_buffer = [] 31 | self.label_buffer = [] 32 | self.k = batch_size * 20 33 | 34 | def __iter__(self): 35 | return self 36 | 37 | def reset(self): 38 | self.source.seek(0) 39 | self.target.seek(0) 40 | self.label.seek(0) 41 | 42 | def next(self): 43 | if self.end_of_data: 44 | self.end_of_data = False 45 | self.reset() 46 | raise StopIteration 47 | 48 | source = [] 49 | target = [] 50 | label = [] 51 | 52 | # fill buffer, if it's empty 53 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 54 | assert len(self.source_buffer) == len(self.label_buffer), 'Buffer size mismatch!' 55 | 56 | if len(self.source_buffer) == 0: 57 | for k_ in xrange(self.k): 58 | ss = self.source.readline() 59 | if ss == "": 60 | break 61 | tt = self.target.readline() 62 | if tt == "": 63 | break 64 | ll = self.label.readline() 65 | if ll == "": 66 | break 67 | 68 | self.source_buffer.append(ss.strip().split()) 69 | self.target_buffer.append(tt.strip().split()) 70 | self.label_buffer.append(ll.strip()) 71 | 72 | if self.shuffle: 73 | # sort by target buffer 74 | tlen = numpy.array([len(t) for t in self.target_buffer]) 75 | tidx = tlen.argsort() 76 | # shuffle mini-batch 77 | tindex = [] 78 | small_index = range(int(math.ceil(len(tidx)*1./self.batch_size))) 79 | random.shuffle(small_index) 80 | for i in small_index: 81 | if (i+1)*self.batch_size > len(tidx): 82 | tindex.extend(tidx[i*self.batch_size:]) 83 | else: 84 | tindex.extend(tidx[i*self.batch_size:(i+1)*self.batch_size]) 85 | 86 | tidx = tindex 87 | 88 | _sbuf = [self.source_buffer[i] for i in tidx] 89 | _tbuf = [self.target_buffer[i] for i in tidx] 90 | _lbuf = [self.label_buffer[i] for i in tidx] 91 | 92 | self.source_buffer = _sbuf 93 | self.target_buffer = _tbuf 94 | self.label_buffer = _lbuf 95 | 96 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0 or len(self.label_buffer) == 0: 97 | self.end_of_data = False 98 | self.reset() 99 | raise StopIteration 100 | 101 | try: 102 | 103 | # actual work here 104 | while True: 105 | 106 | # read from source file and map to word index 107 | try: 108 | ss = self.source_buffer.pop(0) 109 | except IndexError: 110 | break 111 | 112 | ss.insert(0, '_BOS_') 113 | ss.append('_EOS_') 114 | ss = [self.dict[w] if w in self.dict else 1 115 | for w in ss] 116 | if self.n_words > 0: 117 | ss = [w if w < self.n_words else 1 for w in ss] 118 | 119 | # read from source file and map to word index 120 | tt = self.target_buffer.pop(0) 121 | tt.insert(0, '_BOS_') 122 | tt.append('_EOS_') 123 | tt = [self.dict[w] if w in self.dict else 1 124 | for w in tt] 125 | if self.n_words > 0: 126 | tt = [w if w < self.n_words else 1 for w in tt] 127 | 128 | # read label 129 | ll = self.label_buffer.pop(0) 130 | 131 | source.append(ss) 132 | target.append(tt) 133 | label.append(ll) 134 | 135 | if len(source) >= self.batch_size or \ 136 | len(target) >= self.batch_size or \ 137 | len(label) >= self.batch_size: 138 | break 139 | except IOError: 140 | self.end_of_data = True 141 | 142 | if len(source) <= 0 or len(target) <= 0 or len(label) <= 0: 143 | self.end_of_data = False 144 | self.reset() 145 | raise StopIteration 146 | 147 | return source, target, label 148 | 
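# Usage sketch (illustrative only -- mirrors how train.py and gen.py build their
# iterators; the file names below are placeholders, not paths shipped with the repo):
#
#   it = TextIterator('premise.txt', 'hypothesis.txt', 'label.txt',
#                     'vocab_cased.pkl',
#                     n_words=100140, batch_size=32, shuffle=False)
#   for source, target, label in it:
#       # source/target are lists of word-index sequences (with _BOS_/_EOS_ added
#       # and out-of-vocabulary words mapped to index 1); label is a list of strings.
#       pass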
-------------------------------------------------------------------------------- /scripts/enc_nli/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a Gated-Attenion BiLSTM model for Natural Language Inference 3 | ''' 4 | import theano 5 | import theano.tensor as tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle as pkl 9 | import pdb 10 | import numpy 11 | import copy 12 | 13 | import os 14 | import warnings 15 | import sys 16 | import time 17 | import pprint 18 | import logging 19 | 20 | from collections import OrderedDict 21 | from data_iterator import TextIterator 22 | 23 | profile = False 24 | logger = logging.getLogger(__name__) 25 | 26 | def str2list(s): 27 | alphabet = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}" 28 | l = len(s) 29 | ans = [] 30 | for i in range(0, l): 31 | a = alphabet.find(s[i]) 32 | if a >= 0: 33 | ans.append(a) 34 | else: 35 | ans.append(0) 36 | #print(s[i]) 37 | return ans 38 | 39 | # push parameters to Theano shared variables 40 | def zipp(params, tparams): 41 | for kk, vv in params.iteritems(): 42 | tparams[kk].set_value(vv) 43 | 44 | 45 | # pull parameters from Theano shared variables 46 | def unzip(zipped): 47 | new_params = OrderedDict() 48 | for kk, vv in zipped.iteritems(): 49 | new_params[kk] = vv.get_value() 50 | return new_params 51 | 52 | 53 | # get the list of parameters: Note that tparams must be OrderedDict 54 | def itemlist(tparams): 55 | return [vv for kk, vv in tparams.iteritems()] 56 | 57 | 58 | # dropout 59 | def dropout_layer(state_before, use_noise, trng): 60 | """ 61 | tensor switch is like an if statement that checks the 62 | value of the theano shared variable (use_noise), before 63 | either dropping out the state_before tensor or 64 | computing the appropriate activation. During training/testing 65 | use_noise is toggled on and off. 66 | """ 67 | proj = tensor.switch( 68 | use_noise, 69 | state_before * trng.binomial(state_before.shape, p=0.5, n=1, 70 | dtype=state_before.dtype), 71 | state_before * 0.5) 72 | return proj 73 | 74 | 75 | # make prefix-appended name 76 | def _p(pp, name): 77 | return '%s_%s' % (pp, name) 78 | 79 | 80 | # initialize Theano shared variables according to the initial parameters 81 | def init_tparams(params): 82 | tparams = OrderedDict() 83 | for kk, pp in params.iteritems(): 84 | tparams[kk] = theano.shared(params[kk], name=kk) 85 | print kk, pp.shape 86 | return tparams 87 | 88 | 89 | # load parameters 90 | def load_params(path, params): 91 | pp = numpy.load(path) 92 | for kk, vv in params.iteritems(): 93 | if kk not in pp: 94 | warnings.warn('%s is not in the archive' % kk) 95 | continue 96 | params[kk] = pp[kk] 97 | 98 | return params 99 | 100 | 101 | """ 102 | Neural network layer definitions. 103 | 104 | The life-cycle of each of these layers is as follows 105 | 1) The param_init of the layer is called, which creates 106 | the weights of the network. 107 | 2) The feedforward is called which builds that part of the Theano graph 108 | using the weights created in step 1). This automatically links 109 | these variables to the graph. 110 | 111 | Each prefix is used like a key and should be unique 112 | to avoid naming conflicts when building the graph. 
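
For example, a feedforward layer is created and then applied like this
(an illustrative sketch mirroring init_params and build_model below; 'inp'
stands in for any input tensor):

    params = get_layer('ff')[0](options, params, prefix='ff_layer_1',
                                nin=options['dim'] * 24, nout=options['dim'])
    out = get_layer('ff')[1](tparams, inp, options,
                             prefix='ff_layer_1', activ='relu')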
113 | """ 114 | # layers: 'name': ('parameter initializer', 'feedforward') 115 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 116 | 'lstm': ('param_init_lstm', 'lstm_layer'), 117 | } 118 | 119 | 120 | def get_layer(name): 121 | fns = layers[name] 122 | return (eval(fns[0]), eval(fns[1])) 123 | 124 | 125 | # some utilities 126 | def ortho_weight(ndim): 127 | """ 128 | Random orthogonal weights 129 | 130 | Used by norm_weights(below), in which case, we 131 | are ensuring that the rows are orthogonal 132 | (i.e W = U \Sigma V, U has the same 133 | # of rows, V has the same # of cols) 134 | """ 135 | W = numpy.random.randn(ndim, ndim) 136 | u, s, v = numpy.linalg.svd(W) 137 | return u.astype('float32') 138 | 139 | 140 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 141 | """ 142 | Random weights drawn from a Gaussian 143 | """ 144 | if nout is None: 145 | nout = nin 146 | if nout == nin and ortho: 147 | W = ortho_weight(nin) 148 | else: 149 | W = scale * numpy.random.randn(nin, nout) 150 | return W.astype('float32') 151 | 152 | 153 | # some useful shorthands 154 | def tanh(x): 155 | return tensor.tanh(x) 156 | 157 | def relu(x): 158 | return tensor.nnet.relu(x) 159 | 160 | def linear(x): 161 | return x 162 | 163 | 164 | def concatenate(tensor_list, axis=0): 165 | """ 166 | Alternative implementation of `theano.tensor.concatenate`. 167 | This function does exactly the same thing, but contrary to Theano's own 168 | implementation, the gradient is implemented on the GPU. 169 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 170 | because the inverse operation (splitting) needs to be done on the CPU. 171 | This implementation does not have that problem. 172 | :usage: 173 | >>> x, y = theano.tensor.matrices('x', 'y') 174 | >>> c = concatenate([x, y], axis=1) 175 | :parameters: 176 | - tensor_list : list 177 | list of Theano tensor expressions that should be concatenated. 178 | - axis : int 179 | the tensors will be joined along this axis. 180 | :returns: 181 | - out : tensor 182 | the concatenated tensor expression. 
183 | """ 184 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 185 | 186 | output_shape = () 187 | for k in range(axis): 188 | output_shape += (tensor_list[0].shape[k],) 189 | output_shape += (concat_size,) 190 | for k in range(axis + 1, tensor_list[0].ndim): 191 | output_shape += (tensor_list[0].shape[k],) 192 | 193 | out = tensor.zeros(output_shape) 194 | offset = 0 195 | for tt in tensor_list: 196 | indices = () 197 | for k in range(axis): 198 | indices += (slice(None),) 199 | indices += (slice(offset, offset + tt.shape[axis]),) 200 | for k in range(axis + 1, tensor_list[0].ndim): 201 | indices += (slice(None),) 202 | 203 | out = tensor.set_subtensor(out[indices], tt) 204 | offset += tt.shape[axis] 205 | 206 | return out 207 | 208 | def prepare_data(seqs_x, seqs_y, labels, worddicts_r, maxlen=None): 209 | # x: a list of sentences 210 | lengths_x = [len(s) for s in seqs_x] 211 | lengths_y = [len(s) for s in seqs_y] 212 | 213 | if maxlen is not None: 214 | new_seqs_x = [] 215 | new_seqs_y = [] 216 | new_lengths_x = [] 217 | new_lengths_y = [] 218 | new_labels = [] 219 | for l_x, s_x, l_y, s_y, l in zip(lengths_x, seqs_x, lengths_y, seqs_y, labels): 220 | if l_x < maxlen and l_y < maxlen: 221 | new_seqs_x.append(s_x) 222 | new_lengths_x.append(l_x) 223 | new_seqs_y.append(s_y) 224 | new_lengths_y.append(l_y) 225 | new_labels.append(l) 226 | lengths_x = new_lengths_x 227 | seqs_x = new_seqs_x 228 | lengths_y = new_lengths_y 229 | seqs_y = new_seqs_y 230 | labels = new_labels 231 | 232 | if len(lengths_x) < 1 or len(lengths_y) < 1: 233 | return None 234 | 235 | max_char_len_x = 0 236 | max_char_len_y = 0 237 | seqs_x_char = [] 238 | l_seqs_x_char = [] 239 | seqs_y_char = [] 240 | l_seqs_y_char = [] 241 | 242 | for idx, [s_x, s_y, s_l] in enumerate(zip(seqs_x, seqs_y, labels)): 243 | temp_seqs_x_char = [] 244 | temp_l_seqs_x_char = [] 245 | temp_seqs_y_char = [] 246 | temp_l_seqs_y_char = [] 247 | for w_x in s_x: 248 | word = worddicts_r[w_x] 249 | word_list = str2list(word) 250 | l_word_list = len(word_list) 251 | temp_seqs_x_char.append(word_list) 252 | temp_l_seqs_x_char.append(l_word_list) 253 | if l_word_list >= max_char_len_x: 254 | max_char_len_x = l_word_list 255 | for w_y in s_y: 256 | word = worddicts_r[w_y] 257 | word_list = str2list(word) 258 | l_word_list = len(word_list) 259 | temp_seqs_y_char.append(word_list) 260 | temp_l_seqs_y_char.append(l_word_list) 261 | if l_word_list >= max_char_len_y: 262 | max_char_len_y = l_word_list 263 | 264 | seqs_x_char.append(temp_seqs_x_char) 265 | l_seqs_x_char.append(temp_l_seqs_x_char) 266 | seqs_y_char.append(temp_seqs_y_char) 267 | l_seqs_y_char.append(temp_l_seqs_y_char) 268 | 269 | 270 | n_samples = len(seqs_x) 271 | maxlen_x = numpy.max(lengths_x) 272 | maxlen_y = numpy.max(lengths_y) 273 | 274 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 275 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 276 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 277 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 278 | l = numpy.zeros((n_samples,)).astype('int64') 279 | char_x = numpy.zeros((maxlen_x, n_samples, max_char_len_x)).astype('int64') 280 | char_x_mask = numpy.zeros((maxlen_x, n_samples, max_char_len_x)).astype('float32') 281 | char_y = numpy.zeros((maxlen_y, n_samples, max_char_len_y)).astype('int64') 282 | char_y_mask = numpy.zeros((maxlen_y, n_samples, max_char_len_y)).astype('float32') 283 | 284 | for idx, [s_x, s_y, ll] in enumerate(zip(seqs_x, seqs_y, labels)): 285 | 
x[:lengths_x[idx], idx] = s_x 286 | x_mask[:lengths_x[idx], idx] = 1. 287 | y[:lengths_y[idx], idx] = s_y 288 | y_mask[:lengths_y[idx], idx] = 1. 289 | l[idx] = ll 290 | 291 | for j in range(0, lengths_x[idx]): 292 | char_x[j, idx, :l_seqs_x_char[idx][j]] = seqs_x_char[idx][j] 293 | char_x_mask[j, idx, :l_seqs_x_char[idx][j]] = 1. 294 | for j in range(0, lengths_y[idx]): 295 | char_y[j, idx, :l_seqs_y_char[idx][j]] = seqs_y_char[idx][j] 296 | char_y_mask[j, idx, :l_seqs_y_char[idx][j]] = 1. 297 | 298 | return x, x_mask, char_x, char_x_mask, y, y_mask, char_y, char_y_mask, l 299 | 300 | 301 | # feedforward layer: affine transformation + point-wise nonlinearity 302 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, 303 | ortho=True): 304 | if nin is None: 305 | nin = options['dim'] 306 | if nout is None: 307 | nout = options['dim'] 308 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho) 309 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') 310 | 311 | return params 312 | 313 | 314 | def fflayer(tparams, state_below, options, prefix='rconv', 315 | activ='lambda x: tensor.tanh(x)', **kwargs): 316 | return eval(activ)( 317 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 318 | tparams[_p(prefix, 'b')]) 319 | 320 | # LSTM layer 321 | def param_init_lstm(options, params, prefix='lstm', nin=None, dim=None): 322 | if nin is None: 323 | nin = options['dim'] 324 | if dim is None: 325 | dim = options['dim'] 326 | """ 327 | Stack the weight matricies for all the gates 328 | for much cleaner code and slightly faster dot-prods 329 | """ 330 | # input weights 331 | W = numpy.concatenate([norm_weight(nin,dim), 332 | norm_weight(nin,dim), 333 | norm_weight(nin,dim), 334 | norm_weight(nin,dim)], axis=1) 335 | params[_p(prefix,'W')] = W 336 | # for the previous hidden activation 337 | U = numpy.concatenate([ortho_weight(dim), 338 | ortho_weight(dim), 339 | ortho_weight(dim), 340 | ortho_weight(dim)], axis=1) 341 | params[_p(prefix,'U')] = U 342 | params[_p(prefix,'b')] = numpy.zeros((4 * dim,)).astype('float32') 343 | 344 | return params 345 | 346 | # This function implements the lstm fprop 347 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None, **kwargs): 348 | nsteps = state_below.shape[0] 349 | dim = tparams[_p(prefix,'U')].shape[0] 350 | 351 | n_samples = state_below.shape[1] 352 | init_state = tensor.alloc(0., n_samples, dim) 353 | init_memory = tensor.alloc(0., n_samples, dim) 354 | 355 | # if we have no mask, we assume all the inputs are valid 356 | if mask == None: 357 | mask = tensor.alloc(1., state_below.shape[0], 1) 358 | 359 | # use the slice to calculate all the different gates 360 | def _slice(_x, n, dim): 361 | if _x.ndim == 3: 362 | return _x[:, :, n*dim:(n+1)*dim] 363 | elif _x.ndim == 2: 364 | return _x[:, n*dim:(n+1)*dim] 365 | return _x[n*dim:(n+1)*dim] 366 | 367 | # one time step of the lstm 368 | def _step(m_, x_, h_, c_): 369 | preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) 370 | preact += x_ 371 | 372 | i = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 373 | f = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 374 | o = tensor.nnet.sigmoid(_slice(preact, 2, dim)) 375 | c = tensor.tanh(_slice(preact, 3, dim)) 376 | 377 | c = f * c_ + i * c 378 | c = m_[:,None] * c + (1. - m_)[:,None] * c_ 379 | 380 | h = o * tensor.tanh(c) 381 | h = m_[:,None] * h + (1. 
- m_)[:,None] * h_ 382 | 383 | return h, c, i, f, o, preact 384 | 385 | state_below = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 386 | 387 | rval, updates = theano.scan(_step, 388 | sequences=[mask, state_below], 389 | outputs_info=[init_state, init_memory, None, None, None, None], 390 | name=_p(prefix, '_layers'), 391 | n_steps=nsteps, profile=False) 392 | return rval 393 | 394 | 395 | # initialize all parameters 396 | def init_params(options, worddicts): 397 | params = OrderedDict() 398 | 399 | # embedding 400 | params['Wemb'] = norm_weight(options['n_words'], options['dim_word']) 401 | # read embedding from GloVe 402 | if options['embedding']: 403 | with open(options['embedding'], 'r') as f: 404 | for line in f: 405 | tmp = line.split() 406 | word = tmp[0] 407 | vector = tmp[1:] 408 | if word in worddicts and worddicts[word] < options['n_words']: 409 | params['Wemb'][worddicts[word], :] = vector 410 | 411 | params['Charemb'] = norm_weight(options['l_alphabet']+1, options['dim_char_emb']) 412 | for char_k_rows in options['char_k_rows']: 413 | w_shp = (options['char_nout'], 1, char_k_rows, options['char_k_cols']) 414 | w_bound = numpy.sqrt(3 * char_k_rows * options['char_k_cols']) 415 | params['filter_{}'.format(char_k_rows)] = numpy.random.uniform(low=-1.0 / w_bound, high=1.0 / w_bound, size=w_shp).astype('float32') 416 | 417 | dim_emb = options['dim_word']+3*options['char_nout'] 418 | 419 | params = get_layer(options['encoder'])[0](options, params, 420 | prefix='encoder_1', 421 | nin=dim_emb, 422 | dim=options['dim']) 423 | 424 | params = get_layer(options['encoder'])[0](options, params, 425 | prefix='encoder_r_1', 426 | nin=dim_emb, 427 | dim=options['dim']) 428 | 429 | params = get_layer(options['encoder'])[0](options, params, 430 | prefix='encoder_2', 431 | nin=options['dim']*2+dim_emb, 432 | dim=options['dim']) 433 | 434 | params = get_layer(options['encoder'])[0](options, params, 435 | prefix='encoder_r_2', 436 | nin=options['dim']*2+dim_emb, 437 | dim=options['dim']) 438 | 439 | params = get_layer(options['encoder'])[0](options, params, 440 | prefix='encoder_3', 441 | nin=options['dim']*2+dim_emb, 442 | dim=options['dim']) 443 | 444 | params = get_layer(options['encoder'])[0](options, params, 445 | prefix='encoder_r_3', 446 | nin=options['dim']*2+dim_emb, 447 | dim=options['dim']) 448 | 449 | # classifier 450 | params = get_layer('ff')[0](options, params, prefix='ff_layer_1', 451 | nin=options['dim'] * 24, nout=options['dim'], ortho=False) 452 | params = get_layer('ff')[0](options, params, prefix='ff_layer_2', 453 | nin=options['dim'] * 25, nout=options['dim'], ortho=False) 454 | params = get_layer('ff')[0](options, params, prefix='ff_layer_output', 455 | nin=options['dim'], nout=3, ortho=False) 456 | 457 | return params 458 | 459 | 460 | # build a training model 461 | def build_model(tparams, options): 462 | """ Builds the entire computational graph used for training 463 | """ 464 | opt_ret = dict() 465 | 466 | trng = RandomStreams(1234) 467 | use_noise = theano.shared(numpy.float32(0.)) 468 | 469 | # description string: #words x #samples 470 | x1 = tensor.matrix('x1', dtype='int64') 471 | x1_mask = tensor.matrix('x1_mask', dtype='float32') 472 | x2 = tensor.matrix('x2', dtype='int64') 473 | x2_mask = tensor.matrix('x2_mask', dtype='float32') 474 | y = tensor.vector('y', dtype='int64') 475 | 476 | xr1 = x1[::-1] 477 | xr1_mask = x1_mask[::-1] 478 | xr2 = x2[::-1] 479 | xr2_mask = x2_mask[::-1] 480 | 481 | n_timesteps_x1 = x1.shape[0] 482 | 
n_timesteps_x2 = x2.shape[0] 483 | n_samples = x1.shape[1] 484 | 485 | char_x1 = theano.tensor.tensor3('char_x1', dtype='int64') 486 | char_x1_mask = theano.tensor.tensor3('char_x1_mask', dtype='float32') 487 | char_x2 = theano.tensor.tensor3('char_x2', dtype='int64') 488 | char_x2_mask = theano.tensor.tensor3('char_x2_mask', dtype='float32') 489 | 490 | emb_char1 = tparams['Charemb'][char_x1.flatten()].reshape([n_timesteps_x1, n_samples, char_x1.shape[2], options['dim_char_emb']]) 491 | emb_char1 = emb_char1 * char_x1_mask[:,:,:,None] 492 | emb_char_inp1 = emb_char1.reshape([n_timesteps_x1*n_samples, 1, char_x1.shape[2], options['dim_char_emb']]) 493 | 494 | emb_char1s = [] 495 | for num in options['char_k_rows']: 496 | emb_char1 = tensor.nnet.conv.conv2d(emb_char_inp1, tparams['filter_{}'.format(num)], border_mode='valid') 497 | emb_char1 = tensor.nnet.nnet.relu(emb_char1) 498 | emb_char1 = emb_char1.reshape([n_timesteps_x1*n_samples, options['char_nout'], emb_char1.shape[2]]) 499 | emb_char1 = emb_char1.max(2) 500 | emb_char1 = emb_char1.reshape([n_timesteps_x1, n_samples, options['char_nout']]) 501 | emb_char1s.append(emb_char1) 502 | 503 | emb_char1 = concatenate(emb_char1s, axis = 2) 504 | 505 | emb_char2 = tparams['Charemb'][char_x2.flatten()].reshape([n_timesteps_x2, n_samples, char_x2.shape[2], options['dim_char_emb']]) 506 | emb_char2 = emb_char2 * char_x2_mask[:,:,:,None] 507 | emb_char_inp2 = emb_char2.reshape([n_timesteps_x2*n_samples, 1, char_x2.shape[2], options['dim_char_emb']]) 508 | 509 | emb_char2s = [] 510 | for num in options['char_k_rows']: 511 | emb_char2 = tensor.nnet.conv.conv2d(emb_char_inp2, tparams['filter_{}'.format(num)], border_mode='valid') 512 | emb_char2 = tensor.nnet.nnet.relu(emb_char2) 513 | emb_char2 = emb_char2.reshape([n_timesteps_x2*n_samples, options['char_nout'], emb_char2.shape[2]]) 514 | emb_char2 = emb_char2.max(2) 515 | emb_char2 = emb_char2.reshape([n_timesteps_x2, n_samples, options['char_nout']]) 516 | emb_char2s.append(emb_char2) 517 | 518 | emb_char2 = concatenate(emb_char2s, axis = 2) 519 | 520 | # word embedding 521 | emb1 = tparams['Wemb'][x1.flatten()].reshape([n_timesteps_x1, n_samples, options['dim_word']]) 522 | emb1 = concatenate([emb1, emb_char1], axis = 2) 523 | if options['use_dropout']: 524 | emb1 = dropout_layer(emb1, use_noise, trng) 525 | 526 | emb2 = tparams['Wemb'][x2.flatten()].reshape([n_timesteps_x2, n_samples, options['dim_word']]) 527 | emb2 = concatenate([emb2, emb_char2], axis = 2) 528 | if options['use_dropout']: 529 | emb2 = dropout_layer(emb2, use_noise, trng) 530 | 531 | for l in range(3): 532 | if l == 0: 533 | ctx1 = emb1 534 | ctx2 = emb2 535 | else: 536 | ctx1 = concatenate([ctx1, emb1], axis=2) 537 | ctx2 = concatenate([ctx2, emb2], axis=2) 538 | 539 | ctxr1 = ctx1[::-1] 540 | ctxr2 = ctx2[::-1] 541 | proj1 = get_layer(options['encoder'])[1](tparams, ctx1, options, 542 | prefix='encoder_{}'.format(str(l+1)), 543 | mask=x1_mask) 544 | projr1 = get_layer(options['encoder'])[1](tparams, ctxr1, options, 545 | prefix='encoder_r_{}'.format(str(l+1)), 546 | mask=xr1_mask) 547 | proj2 = get_layer(options['encoder'])[1](tparams, ctx2, options, 548 | prefix='encoder_{}'.format(str(l+1)), 549 | mask=x2_mask) 550 | projr2 = get_layer(options['encoder'])[1](tparams, ctxr2, options, 551 | prefix='encoder_r_{}'.format(str(l+1)), 552 | mask=xr2_mask) 553 | ctx1 = concatenate([proj1[0], projr1[0][::-1]], axis=proj1[0].ndim-1) 554 | ctx2 = concatenate([proj2[0], projr2[0][::-1]], axis=proj2[0].ndim-1) 555 | 556 | # step x 
sample x dim 557 | inp_gate1 = concatenate([proj1[2], projr1[2][::-1]], axis=proj1[2].ndim-1) 558 | inp_gate2 = concatenate([proj2[2], projr2[2][::-1]], axis=proj2[2].ndim-1) 559 | 560 | inp_gate1 = inp_gate1.norm(2, axis=2) 561 | inp_gate2 = inp_gate2.norm(2, axis=2) 562 | 563 | mean_1 = (ctx1 * x1_mask[:, :, None]).sum(0) / x1_mask.sum(0)[:, None] 564 | max_1 = (ctx1 * x1_mask[:, :, None]).max(0) 565 | gate_1 = (ctx1 * inp_gate1[:, :, None] * x1_mask[:, :, None]).sum(0) / (inp_gate1[:, :, None] * x1_mask[:, :, None]).sum(0) 566 | 567 | mean_2 = (ctx2 * x2_mask[:, :, None]).sum(0) / x2_mask.sum(0)[:, None] 568 | max_2 = (ctx2 * x2_mask[:, :, None]).max(0) 569 | gate_2 = (ctx2 * inp_gate2[:, :, None] * x2_mask[:, :, None]).sum(0) / (inp_gate2[:, :, None] * x2_mask[:, :, None]).sum(0) 570 | 571 | rep1 = concatenate([mean_1, max_1, gate_1], axis=1) 572 | rep2 = concatenate([mean_2, max_2, gate_2], axis=1) 573 | 574 | logit_0 = concatenate([rep1, rep2, abs(rep1-rep2), rep1*rep2], axis=1) 575 | 576 | logit = get_layer('ff')[1](tparams, logit_0, options, 577 | prefix='ff_layer_1', activ='relu') 578 | if options['use_dropout']: 579 | logit = dropout_layer(logit, use_noise, trng) 580 | logit = concatenate([logit_0, logit], axis=1) 581 | logit = get_layer('ff')[1](tparams, logit, options, 582 | prefix='ff_layer_2', activ='relu') 583 | if options['use_dropout']: 584 | logit = dropout_layer(logit, use_noise, trng) 585 | logit = get_layer('ff')[1](tparams, logit, options, 586 | prefix='ff_layer_output', activ='linear') 587 | probs = tensor.nnet.softmax(logit) 588 | cost = tensor.nnet.categorical_crossentropy(probs, y) 589 | 590 | f_pred = theano.function([x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask], probs.argmax(axis=1), name='f_pred') 591 | f_prods = theano.function([x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask], probs, name='f_prods') 592 | opt_ret['rep1'] = rep1 593 | opt_ret['rep2'] = rep2 594 | 595 | return trng, use_noise, x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y, opt_ret, cost, f_pred, f_prods 596 | 597 | 598 | # calculate the log probablities on a given corpus using translation model 599 | def pred_probs(f_log_probs, prepare_data, options, iterator, worddicts_r, verbose=False): 600 | probs = [] 601 | n_done = 0 602 | 603 | for x1, x2, y in iterator: 604 | n_done += len(x1) 605 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y = prepare_data(x1, x2, y, worddicts_r) 606 | 607 | pprobs = f_log_probs(x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y) 608 | for pp in pprobs: 609 | probs.append(pp) 610 | 611 | if numpy.isnan(numpy.mean(probs)): 612 | ipdb.set_trace() 613 | 614 | if verbose: 615 | print >>sys.stderr, '%d samples computed' % (n_done) 616 | 617 | return numpy.array(probs) 618 | 619 | def pred_acc(f_pred, prepare_data, options, iterator, worddicts_r, verbose=False): 620 | """ 621 | Just compute the accuracy 622 | f_pred: Theano fct computing the prediction 623 | prepare_data: usual prepare_data for that dataset. 
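iterator: TextIterator yielding (premise, hypothesis, label) batches
worddicts_r: inverted word dictionary (index -> word), used by prepare_data
    to build the character-level inputs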
624 | """ 625 | valid_acc = 0 626 | n_done = 0 627 | 628 | for x1, x2, y in iterator: 629 | n_done += len(x1) 630 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y = prepare_data(x1, x2, y, worddicts_r) 631 | preds = f_pred(x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask) 632 | valid_acc += (preds == y).sum() 633 | 634 | valid_acc = 1.0 * valid_acc / n_done 635 | 636 | return valid_acc 637 | 638 | 639 | # optimizers 640 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 641 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8): 642 | 643 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 644 | for k, p in tparams.iteritems()] 645 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 646 | 647 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile) 648 | 649 | updates = [] 650 | 651 | t_prev = theano.shared(numpy.float32(0.)) 652 | t = t_prev + 1. 653 | lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t) 654 | 655 | for p, g in zip(tparams.values(), gshared): 656 | m = theano.shared(p.get_value() * 0., p.name + '_mean') 657 | v = theano.shared(p.get_value() * 0., p.name + '_variance') 658 | m_t = beta1 * m + (1. - beta1) * g 659 | v_t = beta2 * v + (1. - beta2) * g**2 660 | step = lr_t * m_t / (tensor.sqrt(v_t) + e) 661 | p_t = p - step 662 | updates.append((m, m_t)) 663 | updates.append((v, v_t)) 664 | updates.append((p, p_t)) 665 | updates.append((t_prev, t)) 666 | 667 | f_update = theano.function([lr], [], updates=updates, 668 | on_unused_input='ignore', profile=profile) 669 | 670 | return f_grad_shared, f_update 671 | 672 | 673 | def adadelta(lr, tparams, grads, inp, cost, epsilon = 1e-6, rho = 0.95): 674 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 675 | name='%s_grad' % k) 676 | for k, p in tparams.iteritems()] 677 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 678 | name='%s_rup2' % k) 679 | for k, p in tparams.iteritems()] 680 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 681 | name='%s_rgrad2' % k) 682 | for k, p in tparams.iteritems()] 683 | 684 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 685 | rg2up = [(rg2, rho * rg2 + (1 - rho) * (g ** 2)) 686 | for rg2, g in zip(running_grads2, grads)] 687 | 688 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 689 | profile=profile) 690 | 691 | updir = [-tensor.sqrt(ru2 + epsilon) / tensor.sqrt(rg2 + epsilon) * zg 692 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 693 | running_grads2)] 694 | ru2up = [(ru2, rho * ru2 + (1 - rho) * (ud ** 2)) 695 | for ru2, ud in zip(running_up2, updir)] 696 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 697 | 698 | f_update = theano.function([lr], [], updates=ru2up+param_up, 699 | on_unused_input='ignore', profile=profile) 700 | 701 | return f_grad_shared, f_update 702 | 703 | 704 | def rmsprop(lr, tparams, grads, inp, cost): 705 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 706 | name='%s_grad' % k) 707 | for k, p in tparams.iteritems()] 708 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 709 | name='%s_rgrad' % k) 710 | for k, p in tparams.iteritems()] 711 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 712 | name='%s_rgrad2' % k) 713 | for k, p in tparams.iteritems()] 714 | 715 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 716 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in 
zip(running_grads, grads)] 717 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 718 | for rg2, g in zip(running_grads2, grads)] 719 | 720 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 721 | profile=profile) 722 | 723 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 724 | name='%s_updir' % k) 725 | for k, p in tparams.iteritems()] 726 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 727 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 728 | running_grads2)] 729 | param_up = [(p, p + udn[1]) 730 | for p, udn in zip(itemlist(tparams), updir_new)] 731 | f_update = theano.function([lr], [], updates=updir_new+param_up, 732 | on_unused_input='ignore', profile=profile) 733 | 734 | return f_grad_shared, f_update 735 | 736 | 737 | def sgd(lr, tparams, grads, inp, cost): 738 | gshared = [theano.shared(p.get_value() * 0., 739 | name='%s_grad' % k) 740 | for k, p in tparams.iteritems()] 741 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 742 | 743 | f_grad_shared = theano.function(inp, cost, updates=gsup, 744 | profile=profile) 745 | 746 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 747 | f_update = theano.function([lr], [], updates=pup, profile=profile) 748 | 749 | return f_grad_shared, f_update 750 | 751 | """Note: all the hyperparameters are stored in a dictionary model_options (or options outside train). 752 | train() then proceeds to do the following: 753 | 1. The params are initialized (or reloaded) 754 | 2. The computations graph is built symbolically using Theano. 755 | 3. A cost is defined, then gradient are obtained automatically with tensor.grad 756 | 4. With some helper functions, gradient descent + periodic saving/printing proceeds 757 | """ 758 | def train( 759 | dim_word = 100, # word vector dimensionality 760 | dim = 100, # the number of GRU units 761 | encoder = 'lstm', # encoder model 762 | decoder = 'lstm', # decoder model 763 | patience = 10, # early stopping patience 764 | max_epochs = 5000, 765 | finish_after = 10000000, # finish after this many updates 766 | decay_c = 0., # L2 regularization penalty 767 | clip_c = -1., # gradient clipping threshold 768 | lrate = 0.01, # learning rate 769 | n_words = 100000, # vocabulary size 770 | maxlen = 100, # maximum length of the description 771 | optimizer = 'adadelta', 772 | batch_size = 16, 773 | valid_batch_size = 16, 774 | saveto = 'model.npz', 775 | dispFreq = 100, 776 | validFreq = 1000, 777 | saveFreq = 1000, # save the parameters after every saveFreq updates 778 | use_dropout = False, 779 | reload_ = False, 780 | verbose = False, # print verbose information for debug but slow speed 781 | datasets = [], 782 | valid_datasets = [], 783 | test_datasets = [], 784 | dictionary = '', 785 | embedding = '', # pretrain embedding file, such as word2vec, GLOVE 786 | ): 787 | 788 | logging.basicConfig(level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") 789 | # Model options 790 | model_options = locals().copy() 791 | 792 | model_options['alphabet'] = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}" 793 | model_options['l_alphabet'] = len(model_options['alphabet']) 794 | model_options['dim_char_emb'] = 15 795 | model_options['char_nout'] = 100 796 | model_options['char_k_rows'] = [1, 3, 5] 797 | model_options['char_k_cols'] = model_options['dim_char_emb'] 798 | 799 | # load dictionary and invert them 800 | with open(dictionary, 'rb') as f: 801 | worddicts = 
pkl.load(f) 802 | worddicts_r = dict() 803 | for kk, vv in worddicts.iteritems(): 804 | worddicts_r[vv] = kk 805 | 806 | # reload options 807 | if reload_ and os.path.exists(saveto): 808 | print 'Reload options' 809 | with open('%s.pkl' % saveto, 'rb') as f: 810 | model_options = pkl.load(f) 811 | 812 | logger.debug(pprint.pformat(model_options)) 813 | 814 | print 'Loading data' 815 | train = TextIterator(datasets[0], datasets[1], datasets[2], 816 | dictionary, 817 | n_words=n_words, 818 | batch_size=batch_size) 819 | train_valid = TextIterator(datasets[0], datasets[1], datasets[2], 820 | dictionary, 821 | n_words=n_words, 822 | batch_size=valid_batch_size, 823 | shuffle=False) 824 | valid = TextIterator(valid_datasets[0], valid_datasets[1], valid_datasets[2], 825 | dictionary, 826 | n_words=n_words, 827 | batch_size=valid_batch_size, 828 | shuffle=False) 829 | test = TextIterator(test_datasets[0], test_datasets[1], test_datasets[2], 830 | dictionary, 831 | n_words=n_words, 832 | batch_size=valid_batch_size, 833 | shuffle=False) 834 | 835 | # Initialize (or reload) the parameters using 'model_options' 836 | # then build the Theano graph 837 | print 'Building model' 838 | params = init_params(model_options, worddicts) 839 | # reload parameters 840 | if reload_ and os.path.exists(saveto): 841 | print 'Reload parameters' 842 | params = load_params(saveto, params) 843 | 844 | # numpy arrays -> theano shared variables 845 | tparams = init_tparams(params) 846 | 847 | trng, use_noise, \ 848 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y, \ 849 | opt_ret, \ 850 | cost, \ 851 | f_pred, f_prods = \ 852 | build_model(tparams, model_options) 853 | inps = [x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y] 854 | 855 | # before any regularizer 856 | print 'Building f_log_probs...', 857 | f_log_probs = theano.function(inps, cost, profile=profile) 858 | print 'Done' 859 | 860 | cost = cost.mean() 861 | 862 | # apply L2 regularization on weights 863 | if decay_c > 0.: 864 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 865 | weight_decay = 0. 866 | for kk, vv in tparams.iteritems(): 867 | weight_decay += (vv ** 2).sum() 868 | weight_decay *= decay_c 869 | cost += weight_decay 870 | 871 | # after all regularizers - compile the computational graph for cost 872 | print 'Building f_cost...', 873 | f_cost = theano.function(inps, cost, profile=profile) 874 | print 'Done' 875 | 876 | updated_params = OrderedDict([(key,value) for (key,value) in tparams.iteritems() if not key.startswith('Wemb')]) 877 | 878 | print 'Computing gradient...', 879 | grads = tensor.grad(cost, wrt=itemlist(updated_params)) 880 | print 'Done' 881 | 882 | # apply gradient clipping here 883 | if clip_c > 0.: 884 | g2 = 0. 
885 | for g in grads: 886 | g2 += (g**2).sum() 887 | new_grads = [] 888 | for g in grads: 889 | new_grads.append(tensor.switch(g2 > (clip_c**2), 890 | g / tensor.sqrt(g2) * clip_c, 891 | g)) 892 | grads = new_grads 893 | if verbose: 894 | print 'Building function of gradient\'s norm' 895 | f_norm_g = theano.function(inps, tensor.sqrt(g2)) 896 | 897 | 898 | # compile the optimizer, the actual computational graph is compiled here 899 | lr = tensor.scalar(name='lr') 900 | print 'Building optimizers...', 901 | f_grad_shared, f_update = eval(optimizer)(lr, updated_params, grads, inps, cost) 902 | print 'Done' 903 | 904 | print 'Optimization' 905 | 906 | history_errs = [] 907 | # reload history 908 | if reload_ and os.path.exists(saveto): 909 | print 'Reload history error' 910 | history_errs = list(numpy.load(saveto)['history_errs']) 911 | best_p = None 912 | bad_counter = 0 913 | 914 | if validFreq == -1: 915 | validFreq = len(train[0])/batch_size 916 | if saveFreq == -1: 917 | saveFreq = len(train[0])/batch_size 918 | 919 | uidx = 0 920 | estop = False 921 | valid_acc_record = [] 922 | test_acc_record = [] 923 | best_epoch_num = 0 924 | lr_change_list = [] 925 | wait_counter = 0 926 | wait_N = 1 927 | for eidx in xrange(max_epochs): 928 | n_samples = 0 929 | for x1, x2, y in train: 930 | n_samples += len(x1) 931 | uidx += 1 932 | use_noise.set_value(1.) 933 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y = prepare_data(x1, x2, y, worddicts_r, maxlen=maxlen) 934 | 935 | if x1 is None: 936 | print 'Minibatch with zero sample under length ', maxlen 937 | uidx -= 1 938 | continue 939 | 940 | ud_start = time.time() 941 | 942 | # compute cost, grads and copy grads to shared variables 943 | cost = f_grad_shared(x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y) 944 | if verbose: 945 | if clip_c > 0.: 946 | norm_g = f_norm_g(x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y) 947 | 948 | # do the update on parameters 949 | f_update(lrate) 950 | ud = time.time() - ud_start 951 | # check for bad numbers, usually we remove non-finite elements 952 | # and continue training - but not done here 953 | if numpy.isnan(cost) or numpy.isinf(cost): 954 | print 'NaN detected' 955 | return None 956 | 957 | # verbose 958 | if numpy.mod(uidx, dispFreq) == 0: 959 | logger.debug('Epoch {0} Update {1} Cost {2} UD {3}'.format(eidx, uidx, cost, ud)) 960 | if verbose: 961 | if clip_c > 0.: 962 | logger.debug('Grad {0}'.format(norm_g)) 963 | 964 | # save the best model so far 965 | if numpy.mod(uidx, saveFreq) == 0: 966 | print 'Saving...', 967 | if best_p is not None: 968 | params = best_p 969 | else: 970 | params = unzip(tparams) 971 | numpy.savez(saveto, history_errs=history_errs, **params) 972 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) 973 | print 'Done' 974 | 975 | # validate model on validation set and early stop if necessary 976 | if numpy.mod(uidx, validFreq) == 0: 977 | use_noise.set_value(0.) 
978 | valid_cost = pred_probs(f_log_probs, prepare_data, model_options, valid, worddicts_r).mean() 979 | valid_acc = pred_acc(f_pred, prepare_data, model_options, valid, worddicts_r) 980 | valid_err = 1.0 - valid_acc 981 | history_errs.append(valid_err) 982 | test_cost = pred_probs(f_log_probs, prepare_data, model_options, test, worddicts_r).mean() 983 | test_acc = pred_acc(f_pred, prepare_data, model_options, test, worddicts_r) 984 | 985 | print 'Valid cost', valid_cost 986 | print 'Valid accuracy', valid_acc 987 | print 'Test cost', test_cost 988 | print 'Test accuracy', test_acc 989 | print 'lrate:', lrate 990 | 991 | valid_acc_record.append(valid_acc) 992 | test_acc_record.append(test_acc) 993 | 994 | if uidx == 0 or valid_err <= numpy.array(history_errs).min(): 995 | best_p = unzip(tparams) 996 | best_epoch_num = eidx 997 | wait_counter = 0 998 | 999 | if valid_err > numpy.array(history_errs).min(): 1000 | wait_counter += 1 1001 | 1002 | if wait_counter >= wait_N: 1003 | print 'wait_counter max, need to half the lr' 1004 | bad_counter += 1 1005 | wait_counter = 0 1006 | print 'bad_counter: '+str(bad_counter) 1007 | lrate=lrate*0.5 1008 | lr_change_list.append(eidx) 1009 | print 'lrate change to: ' + str(lrate) 1010 | zipp(best_p, tparams) 1011 | 1012 | if bad_counter > patience: 1013 | print 'Early Stop!' 1014 | estop = True 1015 | break 1016 | 1017 | if numpy.isnan(valid_err): 1018 | pdb.set_trace() 1019 | 1020 | # finish after this many updates 1021 | if uidx >= finish_after: 1022 | print 'Finishing after %d iterations!' % uidx 1023 | estop = True 1024 | break 1025 | 1026 | print 'Seen %d samples' % n_samples 1027 | 1028 | if estop: 1029 | break 1030 | 1031 | if best_p is not None: 1032 | zipp(best_p, tparams) 1033 | 1034 | with open('record.csv', 'w') as f: 1035 | f.write(str(best_epoch_num) + '\n') 1036 | f.write(','.join(map(str,lr_change_list)) + '\n') 1037 | f.write(','.join(map(str,valid_acc_record)) + '\n') 1038 | f.write(','.join(map(str,test_acc_record)) + '\n') 1039 | 1040 | use_noise.set_value(0.) 1041 | 1042 | print '=' * 80 1043 | print 'Final Result' 1044 | print '=' * 80 1045 | train_cost = pred_probs(f_log_probs, prepare_data, model_options, train_valid, worddicts_r).mean() 1046 | train_acc = pred_acc(f_pred, prepare_data, model_options, train_valid, worddicts_r) 1047 | print 'Train cost', train_cost 1048 | print 'Train accuracy', train_acc 1049 | valid_cost = pred_probs(f_log_probs, prepare_data, model_options, valid, worddicts_r).mean() 1050 | valid_acc = pred_acc(f_pred, prepare_data, model_options, valid, worddicts_r) 1051 | print 'Valid cost', valid_cost 1052 | print 'Valid accuracy', valid_acc 1053 | test_cost = pred_probs(f_log_probs, prepare_data, model_options, test, worddicts_r).mean() 1054 | test_acc = pred_acc(f_pred, prepare_data, model_options, test, worddicts_r) 1055 | print 'Test cost', test_cost 1056 | print 'Test accuracy', test_acc 1057 | params = copy.copy(best_p) 1058 | numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) 1059 | logger.debug('Done') 1060 | 1061 | return None 1062 | 1063 | if __name__ == '__main__': 1064 | pass 1065 | --------------------------------------------------------------------------------