├── data
│   ├── fetch_and_preprocess.sh
│   ├── download.py
│   └── preprocess_data.py
├── scripts
│   └── enc_nli
│       ├── test.sh
│       ├── train.sh
│       ├── train.py
│       ├── gen.py
│       ├── data_iterator.py
│       └── main.py
├── Readme.md
└── LICENSE

/data/fetch_and_preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | python download.py
4 | python preprocess_data.py
5 | 
--------------------------------------------------------------------------------
/scripts/enc_nli/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export THEANO_FLAGS='mode=FAST_RUN,device=gpu0,floatX=float32,optimizer_including=cudnn,warn_float64=warn,lib.cnmem=0.9'
4 | 
5 | # export THEANO_FLAGS=device=cpu,floatX=float32
6 | 
7 | python -u ./gen.py > log_test.txt 2>&1 &
8 | 
9 | 
10 | 
11 | 
12 | 
--------------------------------------------------------------------------------
/scripts/enc_nli/train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # GPU
4 | export THEANO_FLAGS='mode=FAST_RUN,device=gpu0,floatX=float32,optimizer_including=cudnn,warn_float64=warn,lib.cnmem=0.9'
5 | 
6 | # CPU
7 | # export THEANO_FLAGS='mode=FAST_RUN,device=cpu,floatX=float32'
8 | 
9 | python -u ./train.py > log.txt 2>&1 &
10 | 
11 | 
12 | 
13 | 
14 | 
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
1 | # LSTM Encoder for Natural Language Inference
2 | 
3 | ## Dependencies
4 | To run this code, you will need:
5 | * Python 2.7
6 | * Theano 0.8.2
7 | 
8 | ## Running the Script
9 | 1. Download and preprocess the data (GloVe vectors and the MultiNLI corpus)
10 | ```
11 | cd data
12 | bash fetch_and_preprocess.sh
13 | ```
14 | 
15 | 2. Train and test the model
16 | ```
17 | cd scripts/enc_nli
18 | bash train.sh
19 | ```
20 | 
21 | The results are written to the `log.txt` file.
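
3. (Optional) Re-run evaluation with a trained model
```
cd scripts/enc_nli
bash test.sh
```

This runs `gen.py`, which reloads the saved `enc_nli.npz` model, reports accuracy on the matched and mismatched dev sets, and writes its output to `log_test.txt`.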
22 | -------------------------------------------------------------------------------- /data/download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads the following: 3 | - Glove vectors 4 | - Stanford Natural Language Inference (SNLI) Corpus 5 | 6 | """ 7 | 8 | import sys 9 | import os 10 | import zipfile 11 | import gzip 12 | 13 | def download(url, dirpath): 14 | filename = url.split('/')[-1] 15 | filepath = os.path.join(dirpath, filename) 16 | os.system('wget {} -O {}'.format(url, filepath)) 17 | return filepath 18 | 19 | def unzip(filepath): 20 | print("Extracting: " + filepath) 21 | dirpath = os.path.dirname(filepath) 22 | with zipfile.ZipFile(filepath) as zf: 23 | zf.extractall(dirpath) 24 | os.remove(filepath) 25 | 26 | def download_wordvecs(dirpath): 27 | if os.path.exists(dirpath): 28 | print('Found Glove vectors - skip') 29 | return 30 | else: 31 | os.makedirs(dirpath) 32 | url = 'http://www-nlp.stanford.edu/data/glove.840B.300d.zip' 33 | unzip(download(url, dirpath)) 34 | 35 | def download_multinli(dirpath): 36 | if os.path.exists(dirpath): 37 | print('Found MultiNLI dataset - skip') 38 | return 39 | else: 40 | os.makedirs(dirpath) 41 | url = 'http://www.nyu.edu/projects/bowman/multinli/multinli_0.9.zip' 42 | unzip(download(url, dirpath)) 43 | 44 | 45 | if __name__ == '__main__': 46 | base_dir = os.path.dirname(os.path.realpath(__file__)) 47 | multinli_dir = os.path.join(base_dir, 'multinli') 48 | wordvec_dir = os.path.join(base_dir, 'glove') 49 | download_multinli(multinli_dir) 50 | download_wordvecs(wordvec_dir) 51 | 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Qian Chen 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /scripts/enc_nli/train.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | from main import train 5 | 6 | if __name__ == '__main__': 7 | model_name = os.path.basename(os.path.dirname(os.path.realpath(__file__))) 8 | train( 9 | saveto = './{}.npz'.format(model_name), 10 | reload_ = False, 11 | dim_word = 300, 12 | dim = 600, 13 | patience = 7, 14 | n_words = 100140, 15 | decay_c = 0., 16 | clip_c = 10., 17 | lrate = 0.0004, 18 | optimizer = 'adam', 19 | maxlen = 450, 20 | batch_size = 32, 21 | valid_batch_size = 32, 22 | dispFreq = 100, 23 | validFreq = int(392702/32+1), 24 | saveFreq = int(392702/32+1), 25 | use_dropout = True, 26 | verbose = False, 27 | datasets = ['../../data/word_sequence/premise_multinli_0.9_train.txt', 28 | '../../data/word_sequence/hypothesis_multinli_0.9_train.txt', 29 | '../../data/word_sequence/label_multinli_0.9_train.txt'], 30 | valid_datasets = ['../../data/word_sequence/premise_multinli_0.9_dev_matched.txt', 31 | '../../data/word_sequence/hypothesis_multinli_0.9_dev_matched.txt', 32 | '../../data/word_sequence/label_multinli_0.9_dev_matched.txt'], 33 | test_datasets = ['../../data/word_sequence/premise_multinli_0.9_dev_mismatched.txt', 34 | '../../data/word_sequence/hypothesis_multinli_0.9_dev_mismatched.txt', 35 | '../../data/word_sequence/label_multinli_0.9_dev_mismatched.txt'], 36 | dictionary = '../../data/word_sequence/vocab_cased.pkl', 37 | embedding = '../../data/glove/glove.840B.300d.txt', 38 | ) 39 | 40 | -------------------------------------------------------------------------------- /data/preprocess_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import os 4 | import numpy 5 | import cPickle as pkl 6 | 7 | from collections import OrderedDict 8 | 9 | dic = {'entailment': '0', 'neutral': '1', 'contradiction': '2'} 10 | 11 | def build_dictionary(filepaths, dst_path, lowercase=False): 12 | word_freqs = OrderedDict() 13 | for filepath in filepaths: 14 | print 'Processing', filepath 15 | with open(filepath, 'r') as f: 16 | for line in f: 17 | if lowercase: 18 | line = line.lower() 19 | words_in = line.strip().split(' ') 20 | for w in words_in: 21 | if w not in word_freqs: 22 | word_freqs[w] = 0 23 | word_freqs[w] += 1 24 | 25 | words = word_freqs.keys() 26 | freqs = word_freqs.values() 27 | 28 | sorted_idx = numpy.argsort(freqs) 29 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 30 | 31 | worddict = OrderedDict() 32 | worddict['_PAD_'] = 0 # default, padding 33 | worddict['_UNK_'] = 1 # out-of-vocabulary 34 | worddict['_BOS_'] = 2 # begin of sentence token 35 | worddict['_EOS_'] = 3 # end of sentence token 36 | 37 | for ii, ww in enumerate(sorted_words): 38 | worddict[ww] = ii + 4 39 | 40 | with open(dst_path, 'wb') as f: 41 | pkl.dump(worddict, f) 42 | 43 | print 'Dict size', len(worddict) 44 | print 'Done' 45 | 46 | 47 | def build_sequence(filepath, dst_dir, isTest=False): 48 | filename = os.path.basename(filepath) 49 | print filename 50 | len_p = [] 51 | len_h = [] 52 | with open(filepath) as f, \ 53 | open(os.path.join(dst_dir, 'premise_%s'%filename), 'w') as f1, \ 54 | open(os.path.join(dst_dir, 'hypothesis_%s'%filename), 'w') as f2, \ 55 | open(os.path.join(dst_dir, 'label_%s'%filename), 'w') as f3: 56 | next(f) # skip the header row 57 | for line in f: 58 | sents = line.strip().split('\t') 59 | if 
sents[0] == '-':
60 |                 continue
61 | 
62 |             words_in = sents[1].strip().split(' ')
63 |             words_in = [x for x in words_in if x not in ('(',')')]
64 |             f1.write(' '.join(words_in) + '\n')
65 |             len_p.append(len(words_in))
66 | 
67 |             words_in = sents[2].strip().split(' ')
68 |             words_in = [x for x in words_in if x not in ('(',')')]
69 |             f2.write(' '.join(words_in) + '\n')
70 |             len_h.append(len(words_in))
71 |             if isTest:
72 |                 f3.write('0' + '\n')
73 |             else:
74 |                 f3.write(dic[sents[0]] + '\n')
75 | 
76 |     print 'max min len premise', max(len_p), min(len_p)
77 |     print 'max min len hypothesis', max(len_h), min(len_h)
78 | 
79 | 
80 | def make_dirs(dirs):
81 |     for d in dirs:
82 |         if not os.path.exists(d):
83 |             os.makedirs(d)
84 | 
85 | if __name__ == '__main__':
86 |     print('=' * 80)
87 |     print('Preprocessing multinli_0.9 dataset')
88 |     print('=' * 80)
89 |     base_dir = os.path.dirname(os.path.realpath(__file__))
90 |     dst_dir = os.path.join(base_dir, 'word_sequence')
91 |     multinli_dir = os.path.join(base_dir, 'multinli/multinli_0.9')
92 |     make_dirs([dst_dir])
93 | 
94 |     build_sequence(os.path.join(multinli_dir, 'multinli_0.9_train.txt'), dst_dir)
95 |     build_sequence(os.path.join(multinli_dir, 'multinli_0.9_dev_matched.txt'), dst_dir)
96 |     build_sequence(os.path.join(multinli_dir, 'multinli_0.9_dev_mismatched.txt'), dst_dir)
97 |     # build_sequence(os.path.join(multinli_dir, 'multinli_0.9_test_matched_unlabeled.txt'), dst_dir, isTest=True)
98 |     # build_sequence(os.path.join(multinli_dir, 'multinli_0.9_test_mismatched_unlabeled.txt'), dst_dir, isTest=True)
99 | 
100 |     build_dictionary([os.path.join(dst_dir, 'premise_multinli_0.9_train.txt'),
101 |                       os.path.join(dst_dir, 'hypothesis_multinli_0.9_train.txt')],
102 |                      os.path.join(dst_dir, 'vocab_cased.pkl'))
103 | 
--------------------------------------------------------------------------------
/scripts/enc_nli/gen.py:
--------------------------------------------------------------------------------
1 | '''
2 | Generate
3 | '''
4 | import argparse
5 | import theano
6 | import numpy
7 | import cPickle as pkl
8 | import os
9 | from data_iterator import TextIterator
10 | 
11 | from main import (build_model, pred_probs, prepare_data, pred_acc, load_params,
12 |                   init_params, init_tparams)
13 | 
14 | def main():
15 |     dic = {'0':'entailment', '1':'neutral', '2':'contradiction'}
16 | 
17 |     dev_matched_datasets=['../../data/word_sequence/premise_multinli_0.9_dev_matched.txt',
18 |                           '../../data/word_sequence/hypothesis_multinli_0.9_dev_matched.txt',
19 |                           '../../data/word_sequence/label_multinli_0.9_dev_matched.txt']
20 |     dev_mismatched_datasets=['../../data/word_sequence/premise_multinli_0.9_dev_mismatched.txt',
21 |                              '../../data/word_sequence/hypothesis_multinli_0.9_dev_mismatched.txt',
22 |                              '../../data/word_sequence/label_multinli_0.9_dev_mismatched.txt']
23 |     dictionary='../../data/word_sequence/vocab_cased.pkl'
24 | 
25 |     # load model options
26 |     model_name = os.path.basename(os.path.dirname(os.path.realpath(__file__)))
27 |     model = './{}.npz'.format(model_name)
28 |     with open('%s.pkl' % model, 'rb') as f:
29 |         options = pkl.load(f)
30 | 
31 |     print options
32 | 
33 |     # load dictionary and invert
34 |     with open(dictionary, 'rb') as f:
35 |         word_dict = pkl.load(f)
36 |     word_idict = dict()
37 |     for kk, vv in word_dict.iteritems():
38 |         word_idict[vv] = kk
39 | 
40 |     dev_matched = TextIterator(dev_matched_datasets[0], dev_matched_datasets[1], dev_matched_datasets[2],
41 |                                dictionary,
42 |                                n_words=options['n_words'],
43 |                                batch_size=options['valid_batch_size'],
44 |                                shuffle=False)
45 | 
dev_mismatched = TextIterator(dev_mismatched_datasets[0], dev_mismatched_datasets[1], dev_mismatched_datasets[2], 46 | dictionary, 47 | n_words=options['n_words'], 48 | batch_size=options['valid_batch_size'], 49 | shuffle=False) 50 | 51 | # allocate model parameters 52 | params = init_params(options, word_dict) 53 | 54 | # load model parameters and set theano shared variables 55 | params = load_params(model, params) 56 | tparams = init_tparams(params) 57 | 58 | trng, use_noise, \ 59 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y, \ 60 | opt_ret, \ 61 | cost, \ 62 | f_pred, f_prods = \ 63 | build_model(tparams, options) 64 | 65 | use_noise.set_value(0.) 66 | dev_matched_acc = pred_acc(f_pred, prepare_data, options, dev_matched, word_idict) 67 | dev_mismatched_acc = pred_acc(f_pred, prepare_data, options, dev_mismatched, word_idict) 68 | 69 | print 'dev_matched accuracy', dev_matched_acc 70 | print 'dev_mismatched accuracy', dev_mismatched_acc 71 | 72 | predict_labels_dev_matched = pred_label(f_prods, prepare_data, options, dev_matched, word_idict) 73 | predict_labels_dev_mismatched = pred_label(f_prods, prepare_data, options, dev_mismatched, word_idict) 74 | 75 | with open('./dev_matched_output.txt', 'w') as fw: 76 | with open(dev_matched_datasets[0], 'r') as f1: 77 | with open(dev_matched_datasets[1], 'r') as f2: 78 | with open(dev_matched_datasets[2], 'r') as f3: 79 | for a, b, c, d in zip(predict_labels_dev_matched, f3, f1, f2): 80 | fw.write(str(a) + '\t' + b.rstrip() + '\t' + c.rstrip() + '\t' + d.rstrip() + '\n') 81 | 82 | with open('./dev_dismatched_output.txt', 'w') as fw: 83 | with open(dev_mismatched_datasets[0], 'r') as f1: 84 | with open(dev_mismatched_datasets[1], 'r') as f2: 85 | with open(dev_mismatched_datasets[2], 'r') as f3: 86 | for a, b, c, d in zip(predict_labels_dev_mismatched, f3, f1, f2): 87 | fw.write(str(a) + '\t' + b.rstrip() + '\t' + c.rstrip() + '\t' + d.rstrip() + '\n') 88 | 89 | print 'Done' 90 | 91 | def pred_label(f_prods, prepare_data, options, iterator, word_idict): 92 | labels = [] 93 | valid_acc = 0 94 | n_done = 0 95 | for x1_, x2_, y_ in iterator: 96 | n_done += len(x1_) 97 | lengths_x1 = [len(s) for s in x1_] 98 | lengths_x2 = [len(s) for s in x2_] 99 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y = prepare_data(x1_, x2_, y_, word_idict) 100 | inps = [x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask] 101 | prods = f_prods(*inps) 102 | preds = prods.argmax(axis=1) 103 | valid_acc += (preds == y).sum() 104 | labels = labels + preds.tolist() 105 | 106 | valid_acc = 1.0 * valid_acc / n_done 107 | print "total sampel", n_done 108 | print "Acc", valid_acc 109 | 110 | return labels 111 | 112 | if __name__ == "__main__": 113 | parser = argparse.ArgumentParser() 114 | args = parser.parse_args() 115 | main() 116 | -------------------------------------------------------------------------------- /scripts/enc_nli/data_iterator.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import gzip 3 | import numpy 4 | import random 5 | import math 6 | 7 | def fopen(filename, mode='r'): 8 | if filename.endswith('.gz'): 9 | return gzip.open(filename, mode) 10 | return open(filename, mode) 11 | 12 | class TextIterator: 13 | """Simple Bitext iterator.""" 14 | def __init__(self, source, target, label, 15 | dict, 16 | batch_size=128, 17 | n_words=-1, 18 | shuffle=True): 19 | self.source = fopen(source, 'r') 20 | self.target = 
fopen(target, 'r') 21 | self.label = fopen(label, 'r') 22 | with open(dict, 'rb') as f: 23 | self.dict = pkl.load(f) 24 | self.batch_size = batch_size 25 | self.n_words = n_words 26 | self.shuffle = shuffle 27 | self.end_of_data = False 28 | 29 | self.source_buffer = [] 30 | self.target_buffer = [] 31 | self.label_buffer = [] 32 | self.k = batch_size * 20 33 | 34 | def __iter__(self): 35 | return self 36 | 37 | def reset(self): 38 | self.source.seek(0) 39 | self.target.seek(0) 40 | self.label.seek(0) 41 | 42 | def next(self): 43 | if self.end_of_data: 44 | self.end_of_data = False 45 | self.reset() 46 | raise StopIteration 47 | 48 | source = [] 49 | target = [] 50 | label = [] 51 | 52 | # fill buffer, if it's empty 53 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 54 | assert len(self.source_buffer) == len(self.label_buffer), 'Buffer size mismatch!' 55 | 56 | if len(self.source_buffer) == 0: 57 | for k_ in xrange(self.k): 58 | ss = self.source.readline() 59 | if ss == "": 60 | break 61 | tt = self.target.readline() 62 | if tt == "": 63 | break 64 | ll = self.label.readline() 65 | if ll == "": 66 | break 67 | 68 | self.source_buffer.append(ss.strip().split()) 69 | self.target_buffer.append(tt.strip().split()) 70 | self.label_buffer.append(ll.strip()) 71 | 72 | if self.shuffle: 73 | # sort by target buffer 74 | tlen = numpy.array([len(t) for t in self.target_buffer]) 75 | tidx = tlen.argsort() 76 | # shuffle mini-batch 77 | tindex = [] 78 | small_index = range(int(math.ceil(len(tidx)*1./self.batch_size))) 79 | random.shuffle(small_index) 80 | for i in small_index: 81 | if (i+1)*self.batch_size > len(tidx): 82 | tindex.extend(tidx[i*self.batch_size:]) 83 | else: 84 | tindex.extend(tidx[i*self.batch_size:(i+1)*self.batch_size]) 85 | 86 | tidx = tindex 87 | 88 | _sbuf = [self.source_buffer[i] for i in tidx] 89 | _tbuf = [self.target_buffer[i] for i in tidx] 90 | _lbuf = [self.label_buffer[i] for i in tidx] 91 | 92 | self.source_buffer = _sbuf 93 | self.target_buffer = _tbuf 94 | self.label_buffer = _lbuf 95 | 96 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0 or len(self.label_buffer) == 0: 97 | self.end_of_data = False 98 | self.reset() 99 | raise StopIteration 100 | 101 | try: 102 | 103 | # actual work here 104 | while True: 105 | 106 | # read from source file and map to word index 107 | try: 108 | ss = self.source_buffer.pop(0) 109 | except IndexError: 110 | break 111 | 112 | ss.insert(0, '_BOS_') 113 | ss.append('_EOS_') 114 | ss = [self.dict[w] if w in self.dict else 1 115 | for w in ss] 116 | if self.n_words > 0: 117 | ss = [w if w < self.n_words else 1 for w in ss] 118 | 119 | # read from source file and map to word index 120 | tt = self.target_buffer.pop(0) 121 | tt.insert(0, '_BOS_') 122 | tt.append('_EOS_') 123 | tt = [self.dict[w] if w in self.dict else 1 124 | for w in tt] 125 | if self.n_words > 0: 126 | tt = [w if w < self.n_words else 1 for w in tt] 127 | 128 | # read label 129 | ll = self.label_buffer.pop(0) 130 | 131 | source.append(ss) 132 | target.append(tt) 133 | label.append(ll) 134 | 135 | if len(source) >= self.batch_size or \ 136 | len(target) >= self.batch_size or \ 137 | len(label) >= self.batch_size: 138 | break 139 | except IOError: 140 | self.end_of_data = True 141 | 142 | if len(source) <= 0 or len(target) <= 0 or len(label) <= 0: 143 | self.end_of_data = False 144 | self.reset() 145 | raise StopIteration 146 | 147 | return source, target, label 148 | 
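# Usage sketch (illustrative only -- mirrors how train.py and gen.py build their
# iterators; the file names below are placeholders, not paths shipped with the repo):
#
#   it = TextIterator('premise.txt', 'hypothesis.txt', 'label.txt',
#                     'vocab_cased.pkl',
#                     n_words=100140, batch_size=32, shuffle=False)
#   for source, target, label in it:
#       # source/target are lists of word-index sequences (with _BOS_/_EOS_ added
#       # and out-of-vocabulary words mapped to index 1); label is a list of strings.
#       pass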
-------------------------------------------------------------------------------- /scripts/enc_nli/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a Gated-Attenion BiLSTM model for Natural Language Inference 3 | ''' 4 | import theano 5 | import theano.tensor as tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle as pkl 9 | import pdb 10 | import numpy 11 | import copy 12 | 13 | import os 14 | import warnings 15 | import sys 16 | import time 17 | import pprint 18 | import logging 19 | 20 | from collections import OrderedDict 21 | from data_iterator import TextIterator 22 | 23 | profile = False 24 | logger = logging.getLogger(__name__) 25 | 26 | def str2list(s): 27 | alphabet = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}" 28 | l = len(s) 29 | ans = [] 30 | for i in range(0, l): 31 | a = alphabet.find(s[i]) 32 | if a >= 0: 33 | ans.append(a) 34 | else: 35 | ans.append(0) 36 | #print(s[i]) 37 | return ans 38 | 39 | # push parameters to Theano shared variables 40 | def zipp(params, tparams): 41 | for kk, vv in params.iteritems(): 42 | tparams[kk].set_value(vv) 43 | 44 | 45 | # pull parameters from Theano shared variables 46 | def unzip(zipped): 47 | new_params = OrderedDict() 48 | for kk, vv in zipped.iteritems(): 49 | new_params[kk] = vv.get_value() 50 | return new_params 51 | 52 | 53 | # get the list of parameters: Note that tparams must be OrderedDict 54 | def itemlist(tparams): 55 | return [vv for kk, vv in tparams.iteritems()] 56 | 57 | 58 | # dropout 59 | def dropout_layer(state_before, use_noise, trng): 60 | """ 61 | tensor switch is like an if statement that checks the 62 | value of the theano shared variable (use_noise), before 63 | either dropping out the state_before tensor or 64 | computing the appropriate activation. During training/testing 65 | use_noise is toggled on and off. 66 | """ 67 | proj = tensor.switch( 68 | use_noise, 69 | state_before * trng.binomial(state_before.shape, p=0.5, n=1, 70 | dtype=state_before.dtype), 71 | state_before * 0.5) 72 | return proj 73 | 74 | 75 | # make prefix-appended name 76 | def _p(pp, name): 77 | return '%s_%s' % (pp, name) 78 | 79 | 80 | # initialize Theano shared variables according to the initial parameters 81 | def init_tparams(params): 82 | tparams = OrderedDict() 83 | for kk, pp in params.iteritems(): 84 | tparams[kk] = theano.shared(params[kk], name=kk) 85 | print kk, pp.shape 86 | return tparams 87 | 88 | 89 | # load parameters 90 | def load_params(path, params): 91 | pp = numpy.load(path) 92 | for kk, vv in params.iteritems(): 93 | if kk not in pp: 94 | warnings.warn('%s is not in the archive' % kk) 95 | continue 96 | params[kk] = pp[kk] 97 | 98 | return params 99 | 100 | 101 | """ 102 | Neural network layer definitions. 103 | 104 | The life-cycle of each of these layers is as follows 105 | 1) The param_init of the layer is called, which creates 106 | the weights of the network. 107 | 2) The feedforward is called which builds that part of the Theano graph 108 | using the weights created in step 1). This automatically links 109 | these variables to the graph. 110 | 111 | Each prefix is used like a key and should be unique 112 | to avoid naming conflicts when building the graph. 
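
For example, a feedforward layer is created and then applied like this
(an illustrative sketch mirroring init_params and build_model below; 'inp'
stands in for any input tensor):

    params = get_layer('ff')[0](options, params, prefix='ff_layer_1',
                                nin=options['dim'] * 24, nout=options['dim'])
    out = get_layer('ff')[1](tparams, inp, options,
                             prefix='ff_layer_1', activ='relu')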
113 | """ 114 | # layers: 'name': ('parameter initializer', 'feedforward') 115 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 116 | 'lstm': ('param_init_lstm', 'lstm_layer'), 117 | } 118 | 119 | 120 | def get_layer(name): 121 | fns = layers[name] 122 | return (eval(fns[0]), eval(fns[1])) 123 | 124 | 125 | # some utilities 126 | def ortho_weight(ndim): 127 | """ 128 | Random orthogonal weights 129 | 130 | Used by norm_weights(below), in which case, we 131 | are ensuring that the rows are orthogonal 132 | (i.e W = U \Sigma V, U has the same 133 | # of rows, V has the same # of cols) 134 | """ 135 | W = numpy.random.randn(ndim, ndim) 136 | u, s, v = numpy.linalg.svd(W) 137 | return u.astype('float32') 138 | 139 | 140 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 141 | """ 142 | Random weights drawn from a Gaussian 143 | """ 144 | if nout is None: 145 | nout = nin 146 | if nout == nin and ortho: 147 | W = ortho_weight(nin) 148 | else: 149 | W = scale * numpy.random.randn(nin, nout) 150 | return W.astype('float32') 151 | 152 | 153 | # some useful shorthands 154 | def tanh(x): 155 | return tensor.tanh(x) 156 | 157 | def relu(x): 158 | return tensor.nnet.relu(x) 159 | 160 | def linear(x): 161 | return x 162 | 163 | 164 | def concatenate(tensor_list, axis=0): 165 | """ 166 | Alternative implementation of `theano.tensor.concatenate`. 167 | This function does exactly the same thing, but contrary to Theano's own 168 | implementation, the gradient is implemented on the GPU. 169 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 170 | because the inverse operation (splitting) needs to be done on the CPU. 171 | This implementation does not have that problem. 172 | :usage: 173 | >>> x, y = theano.tensor.matrices('x', 'y') 174 | >>> c = concatenate([x, y], axis=1) 175 | :parameters: 176 | - tensor_list : list 177 | list of Theano tensor expressions that should be concatenated. 178 | - axis : int 179 | the tensors will be joined along this axis. 180 | :returns: 181 | - out : tensor 182 | the concatenated tensor expression. 
183 | """ 184 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 185 | 186 | output_shape = () 187 | for k in range(axis): 188 | output_shape += (tensor_list[0].shape[k],) 189 | output_shape += (concat_size,) 190 | for k in range(axis + 1, tensor_list[0].ndim): 191 | output_shape += (tensor_list[0].shape[k],) 192 | 193 | out = tensor.zeros(output_shape) 194 | offset = 0 195 | for tt in tensor_list: 196 | indices = () 197 | for k in range(axis): 198 | indices += (slice(None),) 199 | indices += (slice(offset, offset + tt.shape[axis]),) 200 | for k in range(axis + 1, tensor_list[0].ndim): 201 | indices += (slice(None),) 202 | 203 | out = tensor.set_subtensor(out[indices], tt) 204 | offset += tt.shape[axis] 205 | 206 | return out 207 | 208 | def prepare_data(seqs_x, seqs_y, labels, worddicts_r, maxlen=None): 209 | # x: a list of sentences 210 | lengths_x = [len(s) for s in seqs_x] 211 | lengths_y = [len(s) for s in seqs_y] 212 | 213 | if maxlen is not None: 214 | new_seqs_x = [] 215 | new_seqs_y = [] 216 | new_lengths_x = [] 217 | new_lengths_y = [] 218 | new_labels = [] 219 | for l_x, s_x, l_y, s_y, l in zip(lengths_x, seqs_x, lengths_y, seqs_y, labels): 220 | if l_x < maxlen and l_y < maxlen: 221 | new_seqs_x.append(s_x) 222 | new_lengths_x.append(l_x) 223 | new_seqs_y.append(s_y) 224 | new_lengths_y.append(l_y) 225 | new_labels.append(l) 226 | lengths_x = new_lengths_x 227 | seqs_x = new_seqs_x 228 | lengths_y = new_lengths_y 229 | seqs_y = new_seqs_y 230 | labels = new_labels 231 | 232 | if len(lengths_x) < 1 or len(lengths_y) < 1: 233 | return None 234 | 235 | max_char_len_x = 0 236 | max_char_len_y = 0 237 | seqs_x_char = [] 238 | l_seqs_x_char = [] 239 | seqs_y_char = [] 240 | l_seqs_y_char = [] 241 | 242 | for idx, [s_x, s_y, s_l] in enumerate(zip(seqs_x, seqs_y, labels)): 243 | temp_seqs_x_char = [] 244 | temp_l_seqs_x_char = [] 245 | temp_seqs_y_char = [] 246 | temp_l_seqs_y_char = [] 247 | for w_x in s_x: 248 | word = worddicts_r[w_x] 249 | word_list = str2list(word) 250 | l_word_list = len(word_list) 251 | temp_seqs_x_char.append(word_list) 252 | temp_l_seqs_x_char.append(l_word_list) 253 | if l_word_list >= max_char_len_x: 254 | max_char_len_x = l_word_list 255 | for w_y in s_y: 256 | word = worddicts_r[w_y] 257 | word_list = str2list(word) 258 | l_word_list = len(word_list) 259 | temp_seqs_y_char.append(word_list) 260 | temp_l_seqs_y_char.append(l_word_list) 261 | if l_word_list >= max_char_len_y: 262 | max_char_len_y = l_word_list 263 | 264 | seqs_x_char.append(temp_seqs_x_char) 265 | l_seqs_x_char.append(temp_l_seqs_x_char) 266 | seqs_y_char.append(temp_seqs_y_char) 267 | l_seqs_y_char.append(temp_l_seqs_y_char) 268 | 269 | 270 | n_samples = len(seqs_x) 271 | maxlen_x = numpy.max(lengths_x) 272 | maxlen_y = numpy.max(lengths_y) 273 | 274 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 275 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 276 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 277 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 278 | l = numpy.zeros((n_samples,)).astype('int64') 279 | char_x = numpy.zeros((maxlen_x, n_samples, max_char_len_x)).astype('int64') 280 | char_x_mask = numpy.zeros((maxlen_x, n_samples, max_char_len_x)).astype('float32') 281 | char_y = numpy.zeros((maxlen_y, n_samples, max_char_len_y)).astype('int64') 282 | char_y_mask = numpy.zeros((maxlen_y, n_samples, max_char_len_y)).astype('float32') 283 | 284 | for idx, [s_x, s_y, ll] in enumerate(zip(seqs_x, seqs_y, labels)): 285 | 
x[:lengths_x[idx], idx] = s_x 286 | x_mask[:lengths_x[idx], idx] = 1. 287 | y[:lengths_y[idx], idx] = s_y 288 | y_mask[:lengths_y[idx], idx] = 1. 289 | l[idx] = ll 290 | 291 | for j in range(0, lengths_x[idx]): 292 | char_x[j, idx, :l_seqs_x_char[idx][j]] = seqs_x_char[idx][j] 293 | char_x_mask[j, idx, :l_seqs_x_char[idx][j]] = 1. 294 | for j in range(0, lengths_y[idx]): 295 | char_y[j, idx, :l_seqs_y_char[idx][j]] = seqs_y_char[idx][j] 296 | char_y_mask[j, idx, :l_seqs_y_char[idx][j]] = 1. 297 | 298 | return x, x_mask, char_x, char_x_mask, y, y_mask, char_y, char_y_mask, l 299 | 300 | 301 | # feedforward layer: affine transformation + point-wise nonlinearity 302 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, 303 | ortho=True): 304 | if nin is None: 305 | nin = options['dim'] 306 | if nout is None: 307 | nout = options['dim'] 308 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho) 309 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') 310 | 311 | return params 312 | 313 | 314 | def fflayer(tparams, state_below, options, prefix='rconv', 315 | activ='lambda x: tensor.tanh(x)', **kwargs): 316 | return eval(activ)( 317 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 318 | tparams[_p(prefix, 'b')]) 319 | 320 | # LSTM layer 321 | def param_init_lstm(options, params, prefix='lstm', nin=None, dim=None): 322 | if nin is None: 323 | nin = options['dim'] 324 | if dim is None: 325 | dim = options['dim'] 326 | """ 327 | Stack the weight matricies for all the gates 328 | for much cleaner code and slightly faster dot-prods 329 | """ 330 | # input weights 331 | W = numpy.concatenate([norm_weight(nin,dim), 332 | norm_weight(nin,dim), 333 | norm_weight(nin,dim), 334 | norm_weight(nin,dim)], axis=1) 335 | params[_p(prefix,'W')] = W 336 | # for the previous hidden activation 337 | U = numpy.concatenate([ortho_weight(dim), 338 | ortho_weight(dim), 339 | ortho_weight(dim), 340 | ortho_weight(dim)], axis=1) 341 | params[_p(prefix,'U')] = U 342 | params[_p(prefix,'b')] = numpy.zeros((4 * dim,)).astype('float32') 343 | 344 | return params 345 | 346 | # This function implements the lstm fprop 347 | def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None, **kwargs): 348 | nsteps = state_below.shape[0] 349 | dim = tparams[_p(prefix,'U')].shape[0] 350 | 351 | n_samples = state_below.shape[1] 352 | init_state = tensor.alloc(0., n_samples, dim) 353 | init_memory = tensor.alloc(0., n_samples, dim) 354 | 355 | # if we have no mask, we assume all the inputs are valid 356 | if mask == None: 357 | mask = tensor.alloc(1., state_below.shape[0], 1) 358 | 359 | # use the slice to calculate all the different gates 360 | def _slice(_x, n, dim): 361 | if _x.ndim == 3: 362 | return _x[:, :, n*dim:(n+1)*dim] 363 | elif _x.ndim == 2: 364 | return _x[:, n*dim:(n+1)*dim] 365 | return _x[n*dim:(n+1)*dim] 366 | 367 | # one time step of the lstm 368 | def _step(m_, x_, h_, c_): 369 | preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) 370 | preact += x_ 371 | 372 | i = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 373 | f = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 374 | o = tensor.nnet.sigmoid(_slice(preact, 2, dim)) 375 | c = tensor.tanh(_slice(preact, 3, dim)) 376 | 377 | c = f * c_ + i * c 378 | c = m_[:,None] * c + (1. - m_)[:,None] * c_ 379 | 380 | h = o * tensor.tanh(c) 381 | h = m_[:,None] * h + (1. 
- m_)[:,None] * h_ 382 | 383 | return h, c, i, f, o, preact 384 | 385 | state_below = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 386 | 387 | rval, updates = theano.scan(_step, 388 | sequences=[mask, state_below], 389 | outputs_info=[init_state, init_memory, None, None, None, None], 390 | name=_p(prefix, '_layers'), 391 | n_steps=nsteps, profile=False) 392 | return rval 393 | 394 | 395 | # initialize all parameters 396 | def init_params(options, worddicts): 397 | params = OrderedDict() 398 | 399 | # embedding 400 | params['Wemb'] = norm_weight(options['n_words'], options['dim_word']) 401 | # read embedding from GloVe 402 | if options['embedding']: 403 | with open(options['embedding'], 'r') as f: 404 | for line in f: 405 | tmp = line.split() 406 | word = tmp[0] 407 | vector = tmp[1:] 408 | if word in worddicts and worddicts[word] < options['n_words']: 409 | params['Wemb'][worddicts[word], :] = vector 410 | 411 | params['Charemb'] = norm_weight(options['l_alphabet']+1, options['dim_char_emb']) 412 | for char_k_rows in options['char_k_rows']: 413 | w_shp = (options['char_nout'], 1, char_k_rows, options['char_k_cols']) 414 | w_bound = numpy.sqrt(3 * char_k_rows * options['char_k_cols']) 415 | params['filter_{}'.format(char_k_rows)] = numpy.random.uniform(low=-1.0 / w_bound, high=1.0 / w_bound, size=w_shp).astype('float32') 416 | 417 | dim_emb = options['dim_word']+3*options['char_nout'] 418 | 419 | params = get_layer(options['encoder'])[0](options, params, 420 | prefix='encoder_1', 421 | nin=dim_emb, 422 | dim=options['dim']) 423 | 424 | params = get_layer(options['encoder'])[0](options, params, 425 | prefix='encoder_r_1', 426 | nin=dim_emb, 427 | dim=options['dim']) 428 | 429 | params = get_layer(options['encoder'])[0](options, params, 430 | prefix='encoder_2', 431 | nin=options['dim']*2+dim_emb, 432 | dim=options['dim']) 433 | 434 | params = get_layer(options['encoder'])[0](options, params, 435 | prefix='encoder_r_2', 436 | nin=options['dim']*2+dim_emb, 437 | dim=options['dim']) 438 | 439 | params = get_layer(options['encoder'])[0](options, params, 440 | prefix='encoder_3', 441 | nin=options['dim']*2+dim_emb, 442 | dim=options['dim']) 443 | 444 | params = get_layer(options['encoder'])[0](options, params, 445 | prefix='encoder_r_3', 446 | nin=options['dim']*2+dim_emb, 447 | dim=options['dim']) 448 | 449 | # classifier 450 | params = get_layer('ff')[0](options, params, prefix='ff_layer_1', 451 | nin=options['dim'] * 24, nout=options['dim'], ortho=False) 452 | params = get_layer('ff')[0](options, params, prefix='ff_layer_2', 453 | nin=options['dim'] * 25, nout=options['dim'], ortho=False) 454 | params = get_layer('ff')[0](options, params, prefix='ff_layer_output', 455 | nin=options['dim'], nout=3, ortho=False) 456 | 457 | return params 458 | 459 | 460 | # build a training model 461 | def build_model(tparams, options): 462 | """ Builds the entire computational graph used for training 463 | """ 464 | opt_ret = dict() 465 | 466 | trng = RandomStreams(1234) 467 | use_noise = theano.shared(numpy.float32(0.)) 468 | 469 | # description string: #words x #samples 470 | x1 = tensor.matrix('x1', dtype='int64') 471 | x1_mask = tensor.matrix('x1_mask', dtype='float32') 472 | x2 = tensor.matrix('x2', dtype='int64') 473 | x2_mask = tensor.matrix('x2_mask', dtype='float32') 474 | y = tensor.vector('y', dtype='int64') 475 | 476 | xr1 = x1[::-1] 477 | xr1_mask = x1_mask[::-1] 478 | xr2 = x2[::-1] 479 | xr2_mask = x2_mask[::-1] 480 | 481 | n_timesteps_x1 = x1.shape[0] 482 | 
n_timesteps_x2 = x2.shape[0] 483 | n_samples = x1.shape[1] 484 | 485 | char_x1 = theano.tensor.tensor3('char_x1', dtype='int64') 486 | char_x1_mask = theano.tensor.tensor3('char_x1_mask', dtype='float32') 487 | char_x2 = theano.tensor.tensor3('char_x2', dtype='int64') 488 | char_x2_mask = theano.tensor.tensor3('char_x2_mask', dtype='float32') 489 | 490 | emb_char1 = tparams['Charemb'][char_x1.flatten()].reshape([n_timesteps_x1, n_samples, char_x1.shape[2], options['dim_char_emb']]) 491 | emb_char1 = emb_char1 * char_x1_mask[:,:,:,None] 492 | emb_char_inp1 = emb_char1.reshape([n_timesteps_x1*n_samples, 1, char_x1.shape[2], options['dim_char_emb']]) 493 | 494 | emb_char1s = [] 495 | for num in options['char_k_rows']: 496 | emb_char1 = tensor.nnet.conv.conv2d(emb_char_inp1, tparams['filter_{}'.format(num)], border_mode='valid') 497 | emb_char1 = tensor.nnet.nnet.relu(emb_char1) 498 | emb_char1 = emb_char1.reshape([n_timesteps_x1*n_samples, options['char_nout'], emb_char1.shape[2]]) 499 | emb_char1 = emb_char1.max(2) 500 | emb_char1 = emb_char1.reshape([n_timesteps_x1, n_samples, options['char_nout']]) 501 | emb_char1s.append(emb_char1) 502 | 503 | emb_char1 = concatenate(emb_char1s, axis = 2) 504 | 505 | emb_char2 = tparams['Charemb'][char_x2.flatten()].reshape([n_timesteps_x2, n_samples, char_x2.shape[2], options['dim_char_emb']]) 506 | emb_char2 = emb_char2 * char_x2_mask[:,:,:,None] 507 | emb_char_inp2 = emb_char2.reshape([n_timesteps_x2*n_samples, 1, char_x2.shape[2], options['dim_char_emb']]) 508 | 509 | emb_char2s = [] 510 | for num in options['char_k_rows']: 511 | emb_char2 = tensor.nnet.conv.conv2d(emb_char_inp2, tparams['filter_{}'.format(num)], border_mode='valid') 512 | emb_char2 = tensor.nnet.nnet.relu(emb_char2) 513 | emb_char2 = emb_char2.reshape([n_timesteps_x2*n_samples, options['char_nout'], emb_char2.shape[2]]) 514 | emb_char2 = emb_char2.max(2) 515 | emb_char2 = emb_char2.reshape([n_timesteps_x2, n_samples, options['char_nout']]) 516 | emb_char2s.append(emb_char2) 517 | 518 | emb_char2 = concatenate(emb_char2s, axis = 2) 519 | 520 | # word embedding 521 | emb1 = tparams['Wemb'][x1.flatten()].reshape([n_timesteps_x1, n_samples, options['dim_word']]) 522 | emb1 = concatenate([emb1, emb_char1], axis = 2) 523 | if options['use_dropout']: 524 | emb1 = dropout_layer(emb1, use_noise, trng) 525 | 526 | emb2 = tparams['Wemb'][x2.flatten()].reshape([n_timesteps_x2, n_samples, options['dim_word']]) 527 | emb2 = concatenate([emb2, emb_char2], axis = 2) 528 | if options['use_dropout']: 529 | emb2 = dropout_layer(emb2, use_noise, trng) 530 | 531 | for l in range(3): 532 | if l == 0: 533 | ctx1 = emb1 534 | ctx2 = emb2 535 | else: 536 | ctx1 = concatenate([ctx1, emb1], axis=2) 537 | ctx2 = concatenate([ctx2, emb2], axis=2) 538 | 539 | ctxr1 = ctx1[::-1] 540 | ctxr2 = ctx2[::-1] 541 | proj1 = get_layer(options['encoder'])[1](tparams, ctx1, options, 542 | prefix='encoder_{}'.format(str(l+1)), 543 | mask=x1_mask) 544 | projr1 = get_layer(options['encoder'])[1](tparams, ctxr1, options, 545 | prefix='encoder_r_{}'.format(str(l+1)), 546 | mask=xr1_mask) 547 | proj2 = get_layer(options['encoder'])[1](tparams, ctx2, options, 548 | prefix='encoder_{}'.format(str(l+1)), 549 | mask=x2_mask) 550 | projr2 = get_layer(options['encoder'])[1](tparams, ctxr2, options, 551 | prefix='encoder_r_{}'.format(str(l+1)), 552 | mask=xr2_mask) 553 | ctx1 = concatenate([proj1[0], projr1[0][::-1]], axis=proj1[0].ndim-1) 554 | ctx2 = concatenate([proj2[0], projr2[0][::-1]], axis=proj2[0].ndim-1) 555 | 556 | # step x 
sample x dim 557 | inp_gate1 = concatenate([proj1[2], projr1[2][::-1]], axis=proj1[2].ndim-1) 558 | inp_gate2 = concatenate([proj2[2], projr2[2][::-1]], axis=proj2[2].ndim-1) 559 | 560 | inp_gate1 = inp_gate1.norm(2, axis=2) 561 | inp_gate2 = inp_gate2.norm(2, axis=2) 562 | 563 | mean_1 = (ctx1 * x1_mask[:, :, None]).sum(0) / x1_mask.sum(0)[:, None] 564 | max_1 = (ctx1 * x1_mask[:, :, None]).max(0) 565 | gate_1 = (ctx1 * inp_gate1[:, :, None] * x1_mask[:, :, None]).sum(0) / (inp_gate1[:, :, None] * x1_mask[:, :, None]).sum(0) 566 | 567 | mean_2 = (ctx2 * x2_mask[:, :, None]).sum(0) / x2_mask.sum(0)[:, None] 568 | max_2 = (ctx2 * x2_mask[:, :, None]).max(0) 569 | gate_2 = (ctx2 * inp_gate2[:, :, None] * x2_mask[:, :, None]).sum(0) / (inp_gate2[:, :, None] * x2_mask[:, :, None]).sum(0) 570 | 571 | rep1 = concatenate([mean_1, max_1, gate_1], axis=1) 572 | rep2 = concatenate([mean_2, max_2, gate_2], axis=1) 573 | 574 | logit_0 = concatenate([rep1, rep2, abs(rep1-rep2), rep1*rep2], axis=1) 575 | 576 | logit = get_layer('ff')[1](tparams, logit_0, options, 577 | prefix='ff_layer_1', activ='relu') 578 | if options['use_dropout']: 579 | logit = dropout_layer(logit, use_noise, trng) 580 | logit = concatenate([logit_0, logit], axis=1) 581 | logit = get_layer('ff')[1](tparams, logit, options, 582 | prefix='ff_layer_2', activ='relu') 583 | if options['use_dropout']: 584 | logit = dropout_layer(logit, use_noise, trng) 585 | logit = get_layer('ff')[1](tparams, logit, options, 586 | prefix='ff_layer_output', activ='linear') 587 | probs = tensor.nnet.softmax(logit) 588 | cost = tensor.nnet.categorical_crossentropy(probs, y) 589 | 590 | f_pred = theano.function([x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask], probs.argmax(axis=1), name='f_pred') 591 | f_prods = theano.function([x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask], probs, name='f_prods') 592 | opt_ret['rep1'] = rep1 593 | opt_ret['rep2'] = rep2 594 | 595 | return trng, use_noise, x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y, opt_ret, cost, f_pred, f_prods 596 | 597 | 598 | # calculate the log probablities on a given corpus using translation model 599 | def pred_probs(f_log_probs, prepare_data, options, iterator, worddicts_r, verbose=False): 600 | probs = [] 601 | n_done = 0 602 | 603 | for x1, x2, y in iterator: 604 | n_done += len(x1) 605 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y = prepare_data(x1, x2, y, worddicts_r) 606 | 607 | pprobs = f_log_probs(x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y) 608 | for pp in pprobs: 609 | probs.append(pp) 610 | 611 | if numpy.isnan(numpy.mean(probs)): 612 | ipdb.set_trace() 613 | 614 | if verbose: 615 | print >>sys.stderr, '%d samples computed' % (n_done) 616 | 617 | return numpy.array(probs) 618 | 619 | def pred_acc(f_pred, prepare_data, options, iterator, worddicts_r, verbose=False): 620 | """ 621 | Just compute the accuracy 622 | f_pred: Theano fct computing the prediction 623 | prepare_data: usual prepare_data for that dataset. 
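iterator: TextIterator yielding (premise, hypothesis, label) batches
worddicts_r: inverted word dictionary (index -> word), used by prepare_data
    to build the character-level inputs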
624 | """ 625 | valid_acc = 0 626 | n_done = 0 627 | 628 | for x1, x2, y in iterator: 629 | n_done += len(x1) 630 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y = prepare_data(x1, x2, y, worddicts_r) 631 | preds = f_pred(x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask) 632 | valid_acc += (preds == y).sum() 633 | 634 | valid_acc = 1.0 * valid_acc / n_done 635 | 636 | return valid_acc 637 | 638 | 639 | # optimizers 640 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 641 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8): 642 | 643 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 644 | for k, p in tparams.iteritems()] 645 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 646 | 647 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile) 648 | 649 | updates = [] 650 | 651 | t_prev = theano.shared(numpy.float32(0.)) 652 | t = t_prev + 1. 653 | lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t) 654 | 655 | for p, g in zip(tparams.values(), gshared): 656 | m = theano.shared(p.get_value() * 0., p.name + '_mean') 657 | v = theano.shared(p.get_value() * 0., p.name + '_variance') 658 | m_t = beta1 * m + (1. - beta1) * g 659 | v_t = beta2 * v + (1. - beta2) * g**2 660 | step = lr_t * m_t / (tensor.sqrt(v_t) + e) 661 | p_t = p - step 662 | updates.append((m, m_t)) 663 | updates.append((v, v_t)) 664 | updates.append((p, p_t)) 665 | updates.append((t_prev, t)) 666 | 667 | f_update = theano.function([lr], [], updates=updates, 668 | on_unused_input='ignore', profile=profile) 669 | 670 | return f_grad_shared, f_update 671 | 672 | 673 | def adadelta(lr, tparams, grads, inp, cost, epsilon = 1e-6, rho = 0.95): 674 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 675 | name='%s_grad' % k) 676 | for k, p in tparams.iteritems()] 677 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 678 | name='%s_rup2' % k) 679 | for k, p in tparams.iteritems()] 680 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 681 | name='%s_rgrad2' % k) 682 | for k, p in tparams.iteritems()] 683 | 684 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 685 | rg2up = [(rg2, rho * rg2 + (1 - rho) * (g ** 2)) 686 | for rg2, g in zip(running_grads2, grads)] 687 | 688 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 689 | profile=profile) 690 | 691 | updir = [-tensor.sqrt(ru2 + epsilon) / tensor.sqrt(rg2 + epsilon) * zg 692 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 693 | running_grads2)] 694 | ru2up = [(ru2, rho * ru2 + (1 - rho) * (ud ** 2)) 695 | for ru2, ud in zip(running_up2, updir)] 696 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 697 | 698 | f_update = theano.function([lr], [], updates=ru2up+param_up, 699 | on_unused_input='ignore', profile=profile) 700 | 701 | return f_grad_shared, f_update 702 | 703 | 704 | def rmsprop(lr, tparams, grads, inp, cost): 705 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 706 | name='%s_grad' % k) 707 | for k, p in tparams.iteritems()] 708 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 709 | name='%s_rgrad' % k) 710 | for k, p in tparams.iteritems()] 711 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 712 | name='%s_rgrad2' % k) 713 | for k, p in tparams.iteritems()] 714 | 715 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 716 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in 
zip(running_grads, grads)] 717 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 718 | for rg2, g in zip(running_grads2, grads)] 719 | 720 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 721 | profile=profile) 722 | 723 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 724 | name='%s_updir' % k) 725 | for k, p in tparams.iteritems()] 726 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 727 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 728 | running_grads2)] 729 | param_up = [(p, p + udn[1]) 730 | for p, udn in zip(itemlist(tparams), updir_new)] 731 | f_update = theano.function([lr], [], updates=updir_new+param_up, 732 | on_unused_input='ignore', profile=profile) 733 | 734 | return f_grad_shared, f_update 735 | 736 | 737 | def sgd(lr, tparams, grads, inp, cost): 738 | gshared = [theano.shared(p.get_value() * 0., 739 | name='%s_grad' % k) 740 | for k, p in tparams.iteritems()] 741 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 742 | 743 | f_grad_shared = theano.function(inp, cost, updates=gsup, 744 | profile=profile) 745 | 746 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 747 | f_update = theano.function([lr], [], updates=pup, profile=profile) 748 | 749 | return f_grad_shared, f_update 750 | 751 | """Note: all the hyperparameters are stored in a dictionary model_options (or options outside train). 752 | train() then proceeds to do the following: 753 | 1. The params are initialized (or reloaded) 754 | 2. The computations graph is built symbolically using Theano. 755 | 3. A cost is defined, then gradient are obtained automatically with tensor.grad 756 | 4. With some helper functions, gradient descent + periodic saving/printing proceeds 757 | """ 758 | def train( 759 | dim_word = 100, # word vector dimensionality 760 | dim = 100, # the number of GRU units 761 | encoder = 'lstm', # encoder model 762 | decoder = 'lstm', # decoder model 763 | patience = 10, # early stopping patience 764 | max_epochs = 5000, 765 | finish_after = 10000000, # finish after this many updates 766 | decay_c = 0., # L2 regularization penalty 767 | clip_c = -1., # gradient clipping threshold 768 | lrate = 0.01, # learning rate 769 | n_words = 100000, # vocabulary size 770 | maxlen = 100, # maximum length of the description 771 | optimizer = 'adadelta', 772 | batch_size = 16, 773 | valid_batch_size = 16, 774 | saveto = 'model.npz', 775 | dispFreq = 100, 776 | validFreq = 1000, 777 | saveFreq = 1000, # save the parameters after every saveFreq updates 778 | use_dropout = False, 779 | reload_ = False, 780 | verbose = False, # print verbose information for debug but slow speed 781 | datasets = [], 782 | valid_datasets = [], 783 | test_datasets = [], 784 | dictionary = '', 785 | embedding = '', # pretrain embedding file, such as word2vec, GLOVE 786 | ): 787 | 788 | logging.basicConfig(level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") 789 | # Model options 790 | model_options = locals().copy() 791 | 792 | model_options['alphabet'] = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}" 793 | model_options['l_alphabet'] = len(model_options['alphabet']) 794 | model_options['dim_char_emb'] = 15 795 | model_options['char_nout'] = 100 796 | model_options['char_k_rows'] = [1, 3, 5] 797 | model_options['char_k_cols'] = model_options['dim_char_emb'] 798 | 799 | # load dictionary and invert them 800 | with open(dictionary, 'rb') as f: 801 | worddicts = 
pkl.load(f) 802 | worddicts_r = dict() 803 | for kk, vv in worddicts.iteritems(): 804 | worddicts_r[vv] = kk 805 | 806 | # reload options 807 | if reload_ and os.path.exists(saveto): 808 | print 'Reload options' 809 | with open('%s.pkl' % saveto, 'rb') as f: 810 | model_options = pkl.load(f) 811 | 812 | logger.debug(pprint.pformat(model_options)) 813 | 814 | print 'Loading data' 815 | train = TextIterator(datasets[0], datasets[1], datasets[2], 816 | dictionary, 817 | n_words=n_words, 818 | batch_size=batch_size) 819 | train_valid = TextIterator(datasets[0], datasets[1], datasets[2], 820 | dictionary, 821 | n_words=n_words, 822 | batch_size=valid_batch_size, 823 | shuffle=False) 824 | valid = TextIterator(valid_datasets[0], valid_datasets[1], valid_datasets[2], 825 | dictionary, 826 | n_words=n_words, 827 | batch_size=valid_batch_size, 828 | shuffle=False) 829 | test = TextIterator(test_datasets[0], test_datasets[1], test_datasets[2], 830 | dictionary, 831 | n_words=n_words, 832 | batch_size=valid_batch_size, 833 | shuffle=False) 834 | 835 | # Initialize (or reload) the parameters using 'model_options' 836 | # then build the Theano graph 837 | print 'Building model' 838 | params = init_params(model_options, worddicts) 839 | # reload parameters 840 | if reload_ and os.path.exists(saveto): 841 | print 'Reload parameters' 842 | params = load_params(saveto, params) 843 | 844 | # numpy arrays -> theano shared variables 845 | tparams = init_tparams(params) 846 | 847 | trng, use_noise, \ 848 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y, \ 849 | opt_ret, \ 850 | cost, \ 851 | f_pred, f_prods = \ 852 | build_model(tparams, model_options) 853 | inps = [x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y] 854 | 855 | # before any regularizer 856 | print 'Building f_log_probs...', 857 | f_log_probs = theano.function(inps, cost, profile=profile) 858 | print 'Done' 859 | 860 | cost = cost.mean() 861 | 862 | # apply L2 regularization on weights 863 | if decay_c > 0.: 864 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 865 | weight_decay = 0. 866 | for kk, vv in tparams.iteritems(): 867 | weight_decay += (vv ** 2).sum() 868 | weight_decay *= decay_c 869 | cost += weight_decay 870 | 871 | # after all regularizers - compile the computational graph for cost 872 | print 'Building f_cost...', 873 | f_cost = theano.function(inps, cost, profile=profile) 874 | print 'Done' 875 | 876 | updated_params = OrderedDict([(key,value) for (key,value) in tparams.iteritems() if not key.startswith('Wemb')]) 877 | 878 | print 'Computing gradient...', 879 | grads = tensor.grad(cost, wrt=itemlist(updated_params)) 880 | print 'Done' 881 | 882 | # apply gradient clipping here 883 | if clip_c > 0.: 884 | g2 = 0. 
885 | for g in grads: 886 | g2 += (g**2).sum() 887 | new_grads = [] 888 | for g in grads: 889 | new_grads.append(tensor.switch(g2 > (clip_c**2), 890 | g / tensor.sqrt(g2) * clip_c, 891 | g)) 892 | grads = new_grads 893 | if verbose: 894 | print 'Building function of gradient\'s norm' 895 | f_norm_g = theano.function(inps, tensor.sqrt(g2)) 896 | 897 | 898 | # compile the optimizer, the actual computational graph is compiled here 899 | lr = tensor.scalar(name='lr') 900 | print 'Building optimizers...', 901 | f_grad_shared, f_update = eval(optimizer)(lr, updated_params, grads, inps, cost) 902 | print 'Done' 903 | 904 | print 'Optimization' 905 | 906 | history_errs = [] 907 | # reload history 908 | if reload_ and os.path.exists(saveto): 909 | print 'Reload history error' 910 | history_errs = list(numpy.load(saveto)['history_errs']) 911 | best_p = None 912 | bad_counter = 0 913 | 914 | if validFreq == -1: 915 | validFreq = len(train[0])/batch_size 916 | if saveFreq == -1: 917 | saveFreq = len(train[0])/batch_size 918 | 919 | uidx = 0 920 | estop = False 921 | valid_acc_record = [] 922 | test_acc_record = [] 923 | best_epoch_num = 0 924 | lr_change_list = [] 925 | wait_counter = 0 926 | wait_N = 1 927 | for eidx in xrange(max_epochs): 928 | n_samples = 0 929 | for x1, x2, y in train: 930 | n_samples += len(x1) 931 | uidx += 1 932 | use_noise.set_value(1.) 933 | x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y = prepare_data(x1, x2, y, worddicts_r, maxlen=maxlen) 934 | 935 | if x1 is None: 936 | print 'Minibatch with zero sample under length ', maxlen 937 | uidx -= 1 938 | continue 939 | 940 | ud_start = time.time() 941 | 942 | # compute cost, grads and copy grads to shared variables 943 | cost = f_grad_shared(x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y) 944 | if verbose: 945 | if clip_c > 0.: 946 | norm_g = f_norm_g(x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y) 947 | 948 | # do the update on parameters 949 | f_update(lrate) 950 | ud = time.time() - ud_start 951 | # check for bad numbers, usually we remove non-finite elements 952 | # and continue training - but not done here 953 | if numpy.isnan(cost) or numpy.isinf(cost): 954 | print 'NaN detected' 955 | return None 956 | 957 | # verbose 958 | if numpy.mod(uidx, dispFreq) == 0: 959 | logger.debug('Epoch {0} Update {1} Cost {2} UD {3}'.format(eidx, uidx, cost, ud)) 960 | if verbose: 961 | if clip_c > 0.: 962 | logger.debug('Grad {0}'.format(norm_g)) 963 | 964 | # save the best model so far 965 | if numpy.mod(uidx, saveFreq) == 0: 966 | print 'Saving...', 967 | if best_p is not None: 968 | params = best_p 969 | else: 970 | params = unzip(tparams) 971 | numpy.savez(saveto, history_errs=history_errs, **params) 972 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) 973 | print 'Done' 974 | 975 | # validate model on validation set and early stop if necessary 976 | if numpy.mod(uidx, validFreq) == 0: 977 | use_noise.set_value(0.) 
978 | valid_cost = pred_probs(f_log_probs, prepare_data, model_options, valid, worddicts_r).mean() 979 | valid_acc = pred_acc(f_pred, prepare_data, model_options, valid, worddicts_r) 980 | valid_err = 1.0 - valid_acc 981 | history_errs.append(valid_err) 982 | test_cost = pred_probs(f_log_probs, prepare_data, model_options, test, worddicts_r).mean() 983 | test_acc = pred_acc(f_pred, prepare_data, model_options, test, worddicts_r) 984 | 985 | print 'Valid cost', valid_cost 986 | print 'Valid accuracy', valid_acc 987 | print 'Test cost', test_cost 988 | print 'Test accuracy', test_acc 989 | print 'lrate:', lrate 990 | 991 | valid_acc_record.append(valid_acc) 992 | test_acc_record.append(test_acc) 993 | 994 | if uidx == 0 or valid_err <= numpy.array(history_errs).min(): 995 | best_p = unzip(tparams) 996 | best_epoch_num = eidx 997 | wait_counter = 0 998 | 999 | if valid_err > numpy.array(history_errs).min(): 1000 | wait_counter += 1 1001 | 1002 | if wait_counter >= wait_N: 1003 | print 'wait_counter max, need to half the lr' 1004 | bad_counter += 1 1005 | wait_counter = 0 1006 | print 'bad_counter: '+str(bad_counter) 1007 | lrate=lrate*0.5 1008 | lr_change_list.append(eidx) 1009 | print 'lrate change to: ' + str(lrate) 1010 | zipp(best_p, tparams) 1011 | 1012 | if bad_counter > patience: 1013 | print 'Early Stop!' 1014 | estop = True 1015 | break 1016 | 1017 | if numpy.isnan(valid_err): 1018 | pdb.set_trace() 1019 | 1020 | # finish after this many updates 1021 | if uidx >= finish_after: 1022 | print 'Finishing after %d iterations!' % uidx 1023 | estop = True 1024 | break 1025 | 1026 | print 'Seen %d samples' % n_samples 1027 | 1028 | if estop: 1029 | break 1030 | 1031 | if best_p is not None: 1032 | zipp(best_p, tparams) 1033 | 1034 | with open('record.csv', 'w') as f: 1035 | f.write(str(best_epoch_num) + '\n') 1036 | f.write(','.join(map(str,lr_change_list)) + '\n') 1037 | f.write(','.join(map(str,valid_acc_record)) + '\n') 1038 | f.write(','.join(map(str,test_acc_record)) + '\n') 1039 | 1040 | use_noise.set_value(0.) 1041 | 1042 | print '=' * 80 1043 | print 'Final Result' 1044 | print '=' * 80 1045 | train_cost = pred_probs(f_log_probs, prepare_data, model_options, train_valid, worddicts_r).mean() 1046 | train_acc = pred_acc(f_pred, prepare_data, model_options, train_valid, worddicts_r) 1047 | print 'Train cost', train_cost 1048 | print 'Train accuracy', train_acc 1049 | valid_cost = pred_probs(f_log_probs, prepare_data, model_options, valid, worddicts_r).mean() 1050 | valid_acc = pred_acc(f_pred, prepare_data, model_options, valid, worddicts_r) 1051 | print 'Valid cost', valid_cost 1052 | print 'Valid accuracy', valid_acc 1053 | test_cost = pred_probs(f_log_probs, prepare_data, model_options, test, worddicts_r).mean() 1054 | test_acc = pred_acc(f_pred, prepare_data, model_options, test, worddicts_r) 1055 | print 'Test cost', test_cost 1056 | print 'Test accuracy', test_acc 1057 | params = copy.copy(best_p) 1058 | numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) 1059 | logger.debug('Done') 1060 | 1061 | return None 1062 | 1063 | if __name__ == '__main__': 1064 | pass 1065 | --------------------------------------------------------------------------------