├── src
│   ├── data_utils.pyc
│   ├── test.py
│   ├── Dropout.py
│   ├── driver.py
│   ├── guess.py
│   ├── embeddings.py
│   ├── data_utils.py
│   ├── optimizer.py
│   ├── model.py
│   ├── Dense.py
│   ├── CNN.py
│   ├── LSTM.py
│   └── pairwise.py
├── .gitattributes
└── Readme
/src/data_utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FudanNLP/NeuralSentenceOrdering/HEAD/src/data_utils.pyc
--------------------------------------------------------------------------------
/src/test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | np.random.seed(1234)
3 | n = 10
4 | idx_list = np.arange(n, dtype="int32")
5 | def shuffle(idx_list):
6 |     np.random.shuffle(idx_list)
7 |     return idx_list
8 | n = 10
9 | idx_list = np.arange(n, dtype="int32")
10 | idx_list = shuffle(idx_list)
11 | print idx_list
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | 
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
--------------------------------------------------------------------------------
/src/Dropout.py:
--------------------------------------------------------------------------------
1 | import theano
2 | from theano import tensor as T
3 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
4 | from theano.tensor.signal import downsample
5 | import numpy as np
6 | def dropout(x, level, seed=None):
7 |     if level < 0. or level >= 1:
8 |         raise Exception('Dropout level must be in interval [0, 1[.')
9 |     if seed is None:
10 |         seed = np.random.randint(10e6)
11 |     rng = RandomStreams(seed=seed)
12 |     retain_prob = 1. - level
13 |     x *= rng.binomial(x.shape, p=retain_prob, dtype=x.dtype)
14 |     x /= retain_prob
15 |     return x
--------------------------------------------------------------------------------
/Readme:
--------------------------------------------------------------------------------
1 | A Python (Theano-based) implementation of the paper "Neural sentence ordering".
2 | 
3 | The data is available at https://drive.google.com/drive/folders/0B-mnK8kniGAiNVB6WTQ4bmdyamc.
4 | 
5 | A sample of the processed data can be found at https://drive.google.com/file/d/0B-mnK8kniGAiSWhaR3gyalJyQm8/view?usp=sharing. Users should put this *.gz file into ./data/ to run the code.
6 | This processed data is organized in only a toy way, just enough to make the code run.
7 | 
8 | The entry point of the code is ./src/driver.py.
9 | 
10 | Any use of our code, data, or ideas should cite the paper "Neural sentence ordering" on arXiv.
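A rough sketch of how the processed data is consumed (the tuple layout below mirrors data_utils.load_data; '../data/all.pkl.gz' is the default path built by driver.py when run from ./src/):

    from data_utils import load_data
    src_train, src_valid, src_test, dic_w2idx, dic_idx2w, dic_w2embed, dic_idx2embed, embedding = \
        load_data(path='../data/all.pkl.gz')
    print len(src_train), len(src_valid), len(src_test)  # number of paragraphs per split
    print embedding.shape                                # (vocabulary size, word-vector dimension)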
11 | 12 | -------------------------------------------------------------------------------- /src/driver.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import time 4 | from pairwise import Pairwise 5 | from model import build_model 6 | import cPickle as pkl 7 | #from Activation import * 8 | if __name__ == '__main__': 9 | flag_toy_data = 0.1 10 | random_seed = 1234 11 | alpha = 0.2 12 | batch_size = 128 13 | dispFreq = 2048 14 | n_epochs = 600 15 | wordVecLen = 25 # useless 16 | flag_dropout = False 17 | size_hidden_layer = 100 18 | dropoutRates = 0.2 # for output of the embedding layer 19 | optimizer = 'adadelta' 20 | beam_size = 128 21 | dataset = 'all' 22 | datapath = '../data/%s.pkl.gz'%dataset 23 | result_path = './result/' 24 | sentence_modeling = 'CNN' # available: 'CBoW' 'LSTM' 'CNN' 25 | CNN_filter_length = 3 26 | LSTM_go_backwards = True 27 | 28 | flag_random_lookup_table = False 29 | 30 | pair_score = Pairwise(alpha = alpha, 31 | batch_size=batch_size, 32 | n_epochs=n_epochs, 33 | wordVecLen = wordVecLen, 34 | flag_dropout = flag_dropout, 35 | datapath=datapath, 36 | random_seed=random_seed, 37 | dropoutRates = dropoutRates, 38 | optimizer = optimizer, 39 | dispFreq = dispFreq, 40 | beam_size = beam_size, 41 | flag_random_lookup_table = flag_random_lookup_table, 42 | flag_toy_data = flag_toy_data, 43 | size_hidden_layer = size_hidden_layer, 44 | dataset = dataset, 45 | result_path = result_path, 46 | sentence_modeling = sentence_modeling, 47 | CNN_filter_length = CNN_filter_length, 48 | LSTM_go_backwards = LSTM_go_backwards 49 | ) 50 | 51 | 52 | -------------------------------------------------------------------------------- /src/guess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from data_utils import load_data 3 | 4 | def score_rank(sentence): 5 | n_total = 0 6 | n_correct = 0 7 | for i in range(len(sentence)): 8 | for j in range(i+1, len(sentence)): 9 | n_total += 1 10 | if sentence[i] < sentence[j]: n_correct += 1 11 | patial_correct = n_correct * 1.0 / n_total 12 | total_correct = 0.0 13 | if n_correct == n_total: total_correct = 1.0 14 | return patial_correct, total_correct 15 | def save_result(path,top1_res): 16 | fw = open(path,'w') 17 | for paragraph, cur_categories in top1_res: 18 | paragraph = np.asarray(paragraph) - np.min(paragraph) 19 | paragraph = list(paragraph) 20 | for sentence in paragraph: 21 | fw.write(str(sentence)) 22 | fw.write(' ') 23 | fw.write('#') 24 | for category in cur_categories: 25 | fw.write(category) 26 | fw.write(' ') 27 | fw.write('\n') 28 | fw.close() 29 | 30 | 31 | dataset = 'cs' 32 | datapath = '../data/%s.pkl.gz'%dataset 33 | src_train,src_valid,src_test,dic_w2idx, dic_idx2w, dic_w2embed, dic_idx2embed, embedding = load_data(path=datapath) 34 | 35 | res_order = [] 36 | res_eva = [] 37 | for paragraph, cur_categories in src_test: 38 | n = len(paragraph) 39 | candidates = [x for x in xrange(n)] 40 | guess_order = [] 41 | for i in xrange(n): 42 | idx = np.random.randint(n - i) 43 | guess_order.append(candidates[idx]) 44 | candidates.remove(candidates[idx]) 45 | res_order.append((guess_order, cur_categories)) 46 | patial_correct, total_correct = score_rank(guess_order) 47 | res_eva.append(np.asarray([patial_correct, total_correct])) 48 | res_eva = np.asarray(res_eva) 49 | 50 | print 'Guess result# patial_correct, total_correct: ', np.average(res_eva, axis = 0) 51 | result_path = 
'./result/guess_%s'%dataset 52 | 53 | save_result(result_path, res_order) -------------------------------------------------------------------------------- /src/embeddings.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from keras import backend as K 3 | from keras import activations, initializations, regularizers, constraints 4 | from keras.regularizers import ActivityRegularizer 5 | import numpy as np 6 | import theano 7 | import theano.tensor as T 8 | 9 | class Embedding(): 10 | '''Turn positive integers (indexes) into dense vectors of fixed size. 11 | eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]] 12 | 13 | This layer can only be used as the first layer in a model. 14 | 15 | # Input shape 16 | 2D tensor with shape: `(nb_samples, sequence_length)`. 17 | 18 | # Output shape 19 | 3D tensor with shape: `(nb_samples, sequence_length, output_dim)`. 20 | 21 | # Arguments 22 | input_dim: int >= 0. Size of the vocabulary, ie. 23 | 1 + maximum integer index occurring in the input data. 24 | output_dim: int >= 0. Dimension of the dense embedding. 25 | init: name of initialization function for the weights 26 | of the layer (see: [initializations](../initializations.md)), 27 | or alternatively, Theano function to use for weights initialization. 28 | This parameter is only relevant if you don't pass a `weights` argument. 29 | weights: list of numpy arrays to set as initial weights. 30 | The list should have 1 element, of shape `(input_dim, output_dim)`. 31 | W_regularizer: instance of the [regularizers](../regularizers.md) module 32 | (eg. L1 or L2 regularization), applied to the embedding matrix. 33 | W_constraint: instance of the [constraints](../constraints.md) module 34 | (eg. maxnorm, nonneg), applied to the embedding matrix. 35 | mask_zero: Whether or not the input value 0 is a special "padding" 36 | value that should be masked out. 37 | This is useful for [recurrent layers](recurrent.md) which may take 38 | variable length input. If this is `True` then all subsequent layers 39 | in the model need to support masking or an exception will be raised. 40 | input_length: Length of input sequences, when it is constant. 41 | This argument is required if you are going to connect 42 | `Flatten` then `Dense` layers upstream 43 | (without it, the shape of the dense outputs cannot be computed). 
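    # Example
        A minimal usage sketch, mirroring how model.py builds the lookup layer
        (`options['embedding']` is the pretrained matrix returned by data_utils.load_data):

            embed_layer = Embedding(input_dim=options['embedding'].shape[0],
                                    output_dim=options['embedding'].shape[1],
                                    weights=options['embedding'])
            sentence = T.imatrix('s1')                  # nb_samples * sequence_length
            embed_s = embed_layer.get_output(sentence)  # nb_samples * sequence_length * output_dim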
44 | ''' 45 | input_ndim = 2 46 | 47 | def __init__(self, input_dim, output_dim, init='uniform', 48 | weights=None): 49 | self.input_dim = input_dim 50 | self.output_dim = output_dim 51 | self.init = initializations.get(init) 52 | 53 | self.W = self.init((self.input_dim, self.output_dim)) 54 | #self.W_Tag = self.init((self.size_label_set, self.output_dim_tag)) 55 | 56 | 57 | if weights != None: 58 | self.W = theano.shared(value = np.asarray(weights, dtype = theano.config.floatX), borrow=True) 59 | self.params = [self.W] 60 | 61 | def get_output(self, train=False): 62 | X = train 63 | out = K.gather(self.W, X) 64 | return out 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /src/data_utils.py: -------------------------------------------------------------------------------- 1 | import cPickle 2 | import gzip 3 | import os 4 | import sys 5 | 6 | import numpy as np 7 | import theano 8 | 9 | def get_max_length(sentences): 10 | n = 0 11 | for sentence in sentences: 12 | l = len(sentence) 13 | if n < l: n = l 14 | return n 15 | def padding(sentences, max_len): 16 | res = np.zeros((len(sentences),max_len),dtype = np.int32) 17 | mask = np.zeros((len(sentences),max_len)) 18 | for s_id, sentence in enumerate(sentences): 19 | for w_id, word in enumerate(sentence): 20 | res[s_id][w_id] = word 21 | mask[s_id][w_id] = 1 22 | return res, mask 23 | 24 | def data_padding(batch_samples): 25 | s1 = [] 26 | s2 = [] 27 | y = [] 28 | for fir, sec, label in batch_samples: 29 | s1.append(fir) 30 | s2.append(sec) 31 | y.append(label) 32 | max_len1 = get_max_length(s1) 33 | max_len2 = get_max_length(s2) 34 | # s: 2d_array n_samples * max_len 35 | # mask: 2d_array n_samples * max_len 36 | s1, s1_mask = padding(s1, max_len1) 37 | s2, s2_mask = padding(s2, max_len2) 38 | y = np.asarray(y) 39 | return s1, s1_mask, s2, s2_mask, y 40 | 41 | def get_minibatches_idx(n, minibatch_size, shuffle=False): 42 | """ 43 | Used to shuffle the dataset at each iteration. 
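    Returns a list of (minibatch_index, array_of_example_indices) pairs; the last,
    possibly smaller minibatch keeps the leftover examples. A small sketch with
    n=5, minibatch_size=2 and shuffle=False:

        get_minibatches_idx(5, 2)  ->  [(0, [0, 1]), (1, [2, 3]), (2, [4])]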
44 | """ 45 | 46 | idx_list = np.arange(n, dtype="int32") 47 | 48 | if shuffle: 49 | np.random.shuffle(idx_list) 50 | 51 | minibatches = [] 52 | minibatch_start = 0 53 | for i in range(n // minibatch_size): 54 | minibatches.append(idx_list[minibatch_start:minibatch_start + minibatch_size]) 55 | minibatch_start += minibatch_size 56 | 57 | if minibatch_start != n: 58 | # Make a minibatch out of what is left 59 | minibatches.append(idx_list[minibatch_start:]) 60 | 61 | return zip(range(len(minibatches)), minibatches) 62 | 63 | def prepare_data(examples): 64 | data = [] # (s1, s2, y) 65 | pairdict = {} 66 | n_sentences = 0 67 | for paragraph, cur_categories in examples: 68 | for s1_id,s1 in enumerate(paragraph): 69 | for s2_id,s2 in enumerate(paragraph): 70 | if s1_id == s2_id: continue 71 | if s1_id < s2_id: 72 | data.append((s1, s2, 1)) 73 | else: 74 | data.append((s1, s2, 0)) 75 | pairdict[(n_sentences + s1_id, n_sentences + s2_id)] = len(data) - 1 76 | n_sentences += len(paragraph) 77 | return data, pairdict 78 | 79 | 80 | def load_data(path='tsp_test.pkl.gz'): 81 | data_dir, data_file = os.path.split(path) 82 | if data_dir == "" and not os.path.isfile(path): 83 | path = os.path.join( 84 | os.path.split(__file__)[0], 85 | "..", 86 | "data", 87 | path 88 | ) 89 | 90 | if path.endswith(".gz"): 91 | f = gzip.open(path, 'rb') 92 | else: 93 | f = open(path, 'rb') 94 | 95 | src_train,src_valid,src_test,dic_w2idx, dic_idx2w, dic_w2embed, dic_idx2embed, embedding = cPickle.load(f) 96 | f.close() 97 | return src_train,src_valid,src_test,dic_w2idx, dic_idx2w, dic_w2embed, dic_idx2embed, embedding 98 | 99 | 100 | if __name__ == '__main__': 101 | pass -------------------------------------------------------------------------------- /src/optimizer.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import cPickle as pkl 3 | import sys 4 | import time 5 | import argparse 6 | import copy 7 | 8 | import random 9 | import numpy 10 | import theano 11 | from theano import config 12 | import theano.tensor as tensor 13 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 14 | def numpy_floatX(data): 15 | return numpy.asarray(data, dtype=config.floatX) 16 | def sgd(lr, tparams, grads, sentence1,sentence1_mask,sentence2,sentence2_mask,y, cost): 17 | """ Stochastic Gradient Descent 18 | 19 | :note: A more complicated version of sgd then needed. This is 20 | done like that for adadelta and rmsprop. 21 | 22 | """ 23 | # New set of shared variable that will contain the gradient 24 | # for a mini-batch. 25 | gshared = [theano.shared(v.get_value() * 0., name='%s_grad' % k) 26 | for k, v in tparams.iteritems()] 27 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 28 | 29 | # Function that computes gradients for a mini-batch, but do not 30 | # updates the weights. 31 | f_grad_shared = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y], cost, updates=gsup, 32 | name='sgd_f_grad_shared') 33 | 34 | pup = [(v, v - lr * g) for v, g in zip(tparams.values(), gshared)] 35 | 36 | # Function that updates the weights from the previously computed 37 | # gradient. 
38 | f_update = theano.function([lr], [], updates=pup, name='sgd_f_update') 39 | 40 | return f_grad_shared, f_update 41 | 42 | 43 | def rmsprop(lr, tparams, grads, sentence1,sentence1_mask,sentence2,sentence2_mask,y, cost): 44 | zipped_grads = [theano.shared(q.get_value() * numpy_floatX(0.), name='%s_grad' % k) 45 | for k, q in tparams.iteritems()] 46 | running_grads = [theano.shared(q.get_value() * numpy_floatX(0.), name='%s_rgrad' % k) 47 | for k, q in tparams.iteritems()] 48 | running_grads2 = [theano.shared(q.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k) 49 | for k, q in tparams.iteritems()] 50 | 51 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 52 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 53 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 54 | for rg2, g in zip(running_grads2, grads)] 55 | 56 | f_grad_shared = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y], cost, 57 | updates=zgup + rgup + rg2up, 58 | name='rmsprop_f_grad_shared') 59 | 60 | updir = [theano.shared(q.get_value() * numpy_floatX(0.), name='%s_updir' % k) 61 | for k, q in tparams.iteritems()] 62 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 63 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 64 | running_grads2)] 65 | param_up = [(q, q + udn[1]) 66 | for q, udn in zip(tparams.values(), updir_new)] 67 | f_update = theano.function([lr], [], updates=updir_new + param_up, 68 | on_unused_input='ignore', 69 | name='rmsprop_f_update') 70 | 71 | return f_grad_shared, f_update 72 | 73 | 74 | def adadelta(lr, tparams, grads, sentence1,sentence1_mask,sentence2,sentence2_mask,y, cost): 75 | ''' 76 | zipped_grads = [theano.shared(q.get_value() * numpy_floatX(0.), name='%s_grad' % k) 77 | for k, q in tparams.iteritems()] 78 | running_up2 = [theano.shared(q.get_value() * numpy_floatX(0.),name='%s_rup2' % k) 79 | for k, q in tparams.iteritems()] 80 | running_grads2 = [theano.shared(q.get_value() * numpy_floatX(0.),name='%s_rgrad2' % k) 81 | for k, q in tparams.iteritems()] 82 | ''' 83 | zipped_grads = [theano.shared(q.get_value() * numpy_floatX(0.)) 84 | for q in tparams] 85 | running_up2 = [theano.shared(q.get_value() * numpy_floatX(0.)) 86 | for q in tparams] 87 | running_grads2 = [theano.shared(q.get_value() * numpy_floatX(0.)) 88 | for q in tparams] 89 | 90 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 91 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 92 | for rg2, g in zip(running_grads2, grads)] 93 | 94 | f_grad_shared = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y], cost, updates=zgup + rg2up, 95 | name='adadelta_f_grad_shared', allow_input_downcast=True) 96 | 97 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 98 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)] 99 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 100 | for ru2, ud in zip(running_up2, updir)] 101 | param_up = [(q, q + ud) for q, ud in zip(tparams, updir)] 102 | 103 | f_update = theano.function([lr], [], updates=ru2up + param_up, 104 | on_unused_input='ignore', 105 | name='adadelta_f_update', allow_input_downcast=True) 106 | 107 | return f_grad_shared, f_update -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import sys 4 | from embeddings import Embedding 5 | from Dropout import dropout 6 | import theano 7 
| import theano.tensor as T 8 | import pickle 9 | from collections import OrderedDict 10 | from Dense import Dense 11 | from keras import backend as K 12 | 13 | from CNN import Convolution1D 14 | from LSTM import LSTM 15 | def CNN_embed(embed_s,s_mask,sentence_encode_layer): 16 | s_mask = s_mask.reshape((s_mask.shape[0],s_mask.shape[1],1)) # n_samples * len_sentence * 1 17 | s_mask = s_mask.repeat(embed_s.shape[2],axis = 2) # n_samples * len_sentence * embed_dim 18 | embed_s = embed_s * s_mask # n_samples * len_sentence * embed_dim 19 | 20 | embed_s = sentence_encode_layer.get_output(embed_s) # n_samples * len_sentence- * embed_dim 21 | embed_s = T.max(embed_s,axis = 1) # n_samples * embed_dim 22 | return embed_s # n_samples * embed_dim 23 | 24 | def LSTM_embed(embed_s,s_mask,sentence_encode_layer, options): 25 | s_mask = s_mask.reshape((s_mask.shape[0],s_mask.shape[1],1)) # n_samples * len_sentence * 1 26 | s_mask = s_mask.repeat(embed_s.shape[2],axis = 2) # n_samples * len_sentence * embed_dim 27 | embed_s = embed_s * s_mask # n_samples * len_sentence * embed_dim 28 | 29 | embed_s = sentence_encode_layer.get_output(go_backwards = options['LSTM_go_backwards'], train = embed_s) # n_samples * len_sentence * embed_dim 30 | return embed_s[:,-1,:] # n_samples * embed_dim 31 | 32 | def ave_embed(embed_s,s_mask): 33 | n = s_mask.sum(axis = 1) # n_samples 34 | n = n.reshape((s_mask.shape[0],1)) # n_samples * 1 35 | n = n.repeat(embed_s.shape[2],axis = 1) # n_samples * embed_dim 36 | s_mask = s_mask.reshape((s_mask.shape[0],s_mask.shape[1],1)) # n_samples * len_sentence * 1 37 | s_mask = s_mask.repeat(embed_s.shape[2],axis = 2) # n_samples * len_sentence * embed_dim 38 | embed_s = embed_s * s_mask # n_samples * len_sentence * embed_dim 39 | return embed_s.sum(axis = 1) /n # n_samples * embed_dim 40 | def build_model(options): 41 | print('Build model...') 42 | sys.stdout.flush() 43 | weights = None 44 | if options['flag_random_lookup_table'] == False: weights = options['embedding'] 45 | embed_layer = Embedding(input_dim = options['embedding'].shape[0], 46 | output_dim = options['embedding'].shape[1], 47 | weights = weights) 48 | dense_layers = [] 49 | dense_layers.append(Dense(input_dim = options['embedding'].shape[1] * 2, output_dim = options['size_hidden_layer'], activation = 'tanh')) 50 | dense_layers.append(Dense(input_dim = options['size_hidden_layer'], output_dim = 1, activation = 'sigmoid')) 51 | 52 | # for training 53 | sentence1 = T.imatrix('s1') # sentence1, n_samples * len_sentence 54 | sentence1_mask = T.matrix('s1_mask') 55 | sentence2 = T.imatrix('s2') # sentence2, n_samples * len_sentence 56 | sentence2_mask = T.matrix('s2_mask') 57 | y = T.ivector('y1') # n_samples 58 | 59 | embed_s1 = embed_layer.get_output(sentence1) # n_samples * len_sentence * embed_dim 60 | embed_s2 = embed_layer.get_output(sentence2) # n_samples * len_sentence * embed_dim 61 | if options['sentence_modeling'] == 'CBoW': 62 | embed_s1 = ave_embed(embed_s1,sentence1_mask) # n_samples * embed_dim 63 | embed_s2 = ave_embed(embed_s2,sentence2_mask) # n_samples * embed_dim 64 | elif options['sentence_modeling'] == 'CNN': 65 | sentence_encode_layer = Convolution1D(input_dim = options['embedding'].shape[1], activation = 'tanh', 66 | nb_filter = options['embedding'].shape[1], filter_length = options['CNN_filter_length'], 67 | border_mode = 'same') 68 | embed_s1 = CNN_embed(embed_s1,sentence1_mask,sentence_encode_layer) # n_samples * embed_dim 69 | embed_s2 = CNN_embed(embed_s2,sentence2_mask,sentence_encode_layer) # 
n_samples * embed_dim 70 | elif options['sentence_modeling'] == 'LSTM': 71 | sentence_encode_layer = LSTM(input_dim = options['embedding'].shape[1], output_dim = options['embedding'].shape[1]) 72 | embed_s1 = LSTM_embed(embed_s1,sentence1_mask,sentence_encode_layer,options) # n_samples * embed_dim 73 | embed_s2 = LSTM_embed(embed_s2,sentence2_mask,sentence_encode_layer,options) # n_samples * embed_dim 74 | else: 75 | print 'Error: No model called %s available!' % options['sentence_modeling'] 76 | return 77 | 78 | output = T.concatenate([embed_s1,embed_s2],axis = -1) # n_samples * (embed_dim * 2) 79 | 80 | if options['flag_dropout'] == True: 81 | output = dropout(output, level=options['dropoutRates']) 82 | for dense_layer in dense_layers: 83 | output = dense_layer.get_output(output) 84 | f_pred = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask],output, allow_input_downcast=True) 85 | 86 | output = output.reshape((output.shape[0],)) 87 | #y = y.reshape((output.shape[0],1)) 88 | cost = T.nnet.binary_crossentropy(output, y).mean() 89 | f_debug = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y],[output,y,T.nnet.binary_crossentropy(output, y),cost], allow_input_downcast=True) 90 | tparams = [] 91 | tparams += embed_layer.params 92 | if options['sentence_modeling'] != 'CBoW': 93 | tparams += sentence_encode_layer.params 94 | for dense_layer in dense_layers: tparams += dense_layer.params 95 | return sentence1,sentence1_mask,sentence2,sentence2_mask,y,cost,f_pred,tparams,f_debug 96 | -------------------------------------------------------------------------------- /src/Dense.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | import numpy as np 5 | 6 | from collections import OrderedDict 7 | import copy 8 | from six.moves import zip 9 | 10 | from keras import backend as K 11 | from keras import activations, initializations, regularizers, constraints 12 | from keras.regularizers import ActivityRegularizer 13 | 14 | import marshal 15 | import types 16 | import sys 17 | class Dense(): 18 | '''Apply a same Dense layer for each dimension[1] (time_dimension) input. 19 | Especially useful after a recurrent network with 'return_sequence=True'. 20 | 21 | # Input shape 22 | 3D tensor with shape `(nb_sample, time_dimension, input_dim)`. 23 | 24 | # Output shape 25 | 3D tensor with shape `(nb_sample, time_dimension, output_dim)`. 26 | 27 | # Arguments 28 | output_dim: int > 0. 29 | init: name of initialization function for the weights of the layer 30 | (see [initializations](../initializations.md)), 31 | or alternatively, Theano function to use for weights 32 | initialization. This parameter is only relevant 33 | if you don't pass a `weights` argument. 34 | activation: name of activation function to use 35 | (see [activations](../activations.md)), 36 | or alternatively, elementwise Theano function. 37 | If you don't specify anything, no activation is applied 38 | (ie. "linear" activation: a(x) = x). 39 | weights: list of numpy arrays to set as initial weights. 40 | The list should have 1 element, of shape `(input_dim, output_dim)`. 41 | W_regularizer: instance of [WeightRegularizer](../regularizers.md) 42 | (eg. L1 or L2 regularization), applied to the main weights matrix. 43 | b_regularizer: instance of [WeightRegularizer](../regularizers.md), 44 | applied to the bias. 
45 | activity_regularizer: instance of [ActivityRegularizer](../regularizers.md), 46 | applied to the network output. 47 | W_constraint: instance of the [constraints](../constraints.md) module 48 | (eg. maxnorm, nonneg), applied to the main weights matrix. 49 | b_constraint: instance of the [constraints](../constraints.md) module, 50 | applied to the bias. 51 | input_dim: dimensionality of the input (integer). 52 | This argument (or alternatively, the keyword argument `input_shape`) 53 | is required when using this layer as the first layer in a model. 54 | ''' 55 | input_ndim = 3 56 | 57 | def __init__(self, input_dim, output_dim, 58 | init='glorot_uniform', activation='linear', weights=None, 59 | W_regularizer=None, b_regularizer=None, activity_regularizer=None, 60 | W_constraint=None, b_constraint=None): 61 | self.input_dim = input_dim 62 | self.output_dim = output_dim 63 | self.init = initializations.get(init) 64 | self.activation = activations.get(activation) 65 | ''' 66 | self.W_regularizer = regularizers.get(W_regularizer) 67 | self.b_regularizer = regularizers.get(b_regularizer) 68 | self.activity_regularizer = regularizers.get(activity_regularizer) 69 | 70 | self.W_constraint = constraints.get(W_constraint) 71 | self.b_constraint = constraints.get(b_constraint) 72 | self.constraints = [self.W_constraint, self.b_constraint] 73 | 74 | self.initial_weights = weights 75 | ''' 76 | 77 | #super(TimeDistributedDense, self).__init__(**kwargs) 78 | 79 | #def build(self): 80 | 81 | 82 | self.W = self.init((self.input_dim, self.output_dim)) 83 | self.b = K.zeros((self.output_dim,)) 84 | 85 | self.params = [self.W, self.b] 86 | ''' 87 | self.regularizers = [] 88 | 89 | if self.W_regularizer: 90 | self.W_regularizer.set_param(self.W) 91 | self.regularizers.append(self.W_regularizer) 92 | 93 | if self.b_regularizer: 94 | self.b_regularizer.set_param(self.b) 95 | self.regularizers.append(self.b_regularizer) 96 | 97 | if self.activity_regularizer: 98 | self.activity_regularizer.set_layer(self) 99 | self.regularizers.append(self.activity_regularizer) 100 | 101 | if self.initial_weights is not None: 102 | self.set_weights(self.initial_weights) 103 | del self.initial_weights 104 | 105 | ''' 106 | 107 | def get_output(self, X): 108 | output = self.activation(K.dot(X, self.W) + self.b) 109 | return output 110 | ''' 111 | def get_config(self): 112 | config = {'name': self.__class__.__name__, 113 | 'output_dim': self.output_dim, 114 | 'init': self.init.__name__, 115 | 'activation': self.activation.__name__, 116 | 'W_regularizer': self.W_regularizer.get_config() if self.W_regularizer else None, 117 | 'b_regularizer': self.b_regularizer.get_config() if self.b_regularizer else None, 118 | 'activity_regularizer': self.activity_regularizer.get_config() if self.activity_regularizer else None, 119 | 'W_constraint': self.W_constraint.get_config() if self.W_constraint else None, 120 | 'b_constraint': self.b_constraint.get_config() if self.b_constraint else None, 121 | 'input_dim': self.input_dim, 122 | 'input_length': self.input_length} 123 | #base_config = super(TimeDistributedDense, self).get_config() 124 | return dict(list(config.items())) 125 | ''' -------------------------------------------------------------------------------- /src/CNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | from keras import backend as K 5 | from keras import activations, initializations, regularizers, constraints 6 | 
from keras.regularizers import ActivityRegularizer 7 | from theano.tensor.signal import downsample 8 | 9 | 10 | def conv_output_length(input_length, filter_size, border_mode, stride): 11 | if input_length is None: 12 | return None 13 | assert border_mode in {'same', 'valid'} 14 | if border_mode == 'same': 15 | output_length = input_length 16 | elif border_mode == 'valid': 17 | output_length = input_length - filter_size + 1 18 | return (output_length + stride - 1) // stride 19 | 20 | 21 | class Convolution1D(): 22 | '''Convolution operator for filtering neighborhoods of one-dimensional inputs. 23 | When using this layer as the first layer in a model, 24 | either provide the keyword argument `input_dim` 25 | (int, e.g. 128 for sequences of 128-dimensional vectors), 26 | or `input_shape` (tuple of integers, e.g. (10, 128) for sequences 27 | of 10 vectors of 128-dimensional vectors). 28 | 29 | # Input shape 30 | 3D tensor with shape: `(samples, steps, input_dim)`. 31 | 32 | # Output shape 33 | 3D tensor with shape: `(samples, new_steps, nb_filter)`. 34 | `steps` value might have changed due to padding. 35 | 36 | # Arguments 37 | nb_filter: Number of convolution kernels to use 38 | (dimensionality of the output). 39 | filter_length: The extension (spatial or temporal) of each filter. 40 | init: name of initialization function for the weights of the layer 41 | (see [initializations](../initializations.md)), 42 | or alternatively, Theano function to use for weights initialization. 43 | This parameter is only relevant if you don't pass a `weights` argument. 44 | activation: name of activation function to use 45 | (see [activations](../activations.md)), 46 | or alternatively, elementwise Theano function. 47 | If you don't specify anything, no activation is applied 48 | (ie. "linear" activation: a(x) = x). 49 | weights: list of numpy arrays to set as initial weights. 50 | border_mode: 'valid' or 'same'. 51 | subsample_length: factor by which to subsample output. 52 | W_regularizer: instance of [WeightRegularizer](../regularizers.md) 53 | (eg. L1 or L2 regularization), applied to the main weights matrix. 54 | b_regularizer: instance of [WeightRegularizer](../regularizers.md), 55 | applied to the bias. 56 | activity_regularizer: instance of [ActivityRegularizer](../regularizers.md), 57 | applied to the network output. 58 | W_constraint: instance of the [constraints](../constraints.md) module 59 | (eg. maxnorm, nonneg), applied to the main weights matrix. 60 | b_constraint: instance of the [constraints](../constraints.md) module, 61 | applied to the bias. 62 | input_dim: Number of channels/dimensions in the input. 63 | Either this argument or the keyword argument `input_shape`must be 64 | provided when using this layer as the first layer in a model. 65 | input_length: Length of input sequences, when it is constant. 66 | This argument is required if you are going to connect 67 | `Flatten` then `Dense` layers upstream 68 | (without it, the shape of the dense outputs cannot be computed). 
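    # Example
        A minimal sketch of the sentence encoder built in model.py (`embed_dim` stands
        for options['embedding'].shape[1]; filter_length follows the CNN_filter_length = 3
        setting in driver.py):

            conv = Convolution1D(input_dim=embed_dim, nb_filter=embed_dim,
                                 filter_length=3, activation='tanh',
                                 border_mode='same')
            conv_out = conv.get_output(embed_s)  # n_samples * len_sentence * nb_filter
            sent_vec = T.max(conv_out, axis=1)   # max-over-time pooling: n_samples * nb_filter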
69 | ''' 70 | input_ndim = 3 71 | 72 | def __init__(self, nb_filter, filter_length, 73 | init='uniform', activation='linear', weights=None, 74 | border_mode='valid', subsample_length=1, 75 | input_dim=None): 76 | 77 | if border_mode not in {'valid', 'same'}: 78 | raise Exception('Invalid border mode for Convolution1D:', border_mode) 79 | self.nb_filter = nb_filter 80 | self.filter_length = filter_length 81 | self.init = initializations.get(init) 82 | self.activation = activations.get(activation) 83 | assert border_mode in {'valid', 'same'}, 'border_mode must be in {valid, same}' 84 | self.border_mode = border_mode 85 | self.subsample_length = subsample_length 86 | 87 | self.subsample = (subsample_length, 1) 88 | 89 | 90 | self.input_dim = input_dim 91 | 92 | 93 | input_dim = self.input_dim 94 | self.W_shape = (self.nb_filter, input_dim, self.filter_length, 1) 95 | self.W = self.init(self.W_shape) 96 | self.b = K.zeros((self.nb_filter,)) 97 | self.params = [self.W, self.b] 98 | 99 | def get_output(self, train=False): 100 | X = train 101 | X = K.expand_dims(X, -1) # add a dimension of the right 102 | X = K.permute_dimensions(X, (0, 2, 1, 3)) 103 | conv_out = K.conv2d(X, self.W, strides=self.subsample, 104 | border_mode=self.border_mode, 105 | dim_ordering='th') 106 | 107 | output = conv_out + K.reshape(self.b, (1, self.nb_filter, 1, 1)) 108 | output = self.activation(output) 109 | output = K.squeeze(output, 3) # remove the dummy 3rd dimension 110 | output = K.permute_dimensions(output, (0, 2, 1)) 111 | return output 112 | 113 | class MaxPooling1D(): 114 | '''Max pooling operation for temporal data. 115 | 116 | # Input shape 117 | 3D tensor with shape: `(samples, steps, features)`. 118 | 119 | # Output shape 120 | 3D tensor with shape: `(samples, downsampled_steps, features)`. 121 | 122 | # Arguments 123 | pool_length: factor by which to downscale. 2 will halve the input. 124 | stride: integer or None. Stride value. 125 | border_mode: 'valid' or 'same'. 126 | Note: 'same' will only work with TensorFlow for the time being. 127 | ''' 128 | def __init__(self, pool_length=2, stride=None, 129 | border_mode='valid'): 130 | self.pool_length = pool_length = 2 131 | self.border_mode = border_mode 132 | self.params = [] 133 | 134 | def get_output(self, train=False): 135 | #output = K.pool2d(x = train, pool_size = (self.pool_length,1), 136 | # border_mode = self.border_mode, pool_mode='max') 137 | pool_size = (self.pool_length, 1) 138 | strides = (self.pool_length, 1) 139 | ignore_border = True 140 | padding = (0, 0) 141 | output = downsample.max_pool_2d(train, ds=pool_size, st=strides, 142 | ignore_border=ignore_border, 143 | padding=padding, 144 | mode='max') 145 | return output 146 | -------------------------------------------------------------------------------- /src/LSTM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import numpy as np 4 | 5 | from keras import backend as K 6 | from keras import activations, initializations 7 | import theano 8 | import theano.tensor as T 9 | 10 | class LSTM(): 11 | '''Long-Short Term Memory unit - Hochreiter 1997. 12 | 13 | For a step-by-step description of the algorithm, see 14 | [this tutorial](http://deeplearning.net/tutorial/lstm.html). 15 | 16 | # Arguments 17 | output_dim: dimension of the internal projections and the final output. 18 | init: weight initialization function. 
19 | Can be the name of an existing function (str), 20 | or a Theano function (see: [initializations](../initializations.md)). 21 | inner_init: initialization function of the inner cells. 22 | forget_bias_init: initialization function for the bias of the forget gate. 23 | [Jozefowicz et al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) 24 | recommend initializing with ones. 25 | activation: activation function. 26 | Can be the name of an existing function (str), 27 | or a Theano function (see: [activations](../activations.md)). 28 | inner_activation: activation function for the inner cells. 29 | 30 | # References 31 | - [Long short-term memory](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf) (original 1997 paper) 32 | - [Learning to forget: Continual prediction with LSTM](http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015) 33 | - [Supervised sequence labelling with recurrent neural networks](http://www.cs.toronto.edu/~graves/preprint.pdf) 34 | ''' 35 | def __init__(self, input_dim, output_dim, 36 | init='glorot_uniform', inner_init='orthogonal', 37 | forget_bias_init='one', activation='tanh', 38 | inner_activation='hard_sigmoid'): 39 | #self.input_dim = input_dim 40 | self.output_dim = output_dim 41 | self.init = initializations.get(init) 42 | self.inner_init = initializations.get(inner_init) 43 | self.forget_bias_init = initializations.get(forget_bias_init) 44 | self.activation = activations.get(activation) 45 | self.inner_activation = activations.get(inner_activation) 46 | 47 | self.input_dim = input_dim 48 | #self.input = K.placeholder(input_shape) 49 | 50 | # initial states: 2 all-zero tensor of shape (output_dim) 51 | self.states = [None, None] 52 | 53 | self.W_i = self.init((input_dim, self.output_dim)) 54 | self.U_i = self.inner_init((self.output_dim, self.output_dim)) 55 | self.b_i = K.zeros((self.output_dim,)) 56 | 57 | self.W_f = self.init((input_dim, self.output_dim)) 58 | self.U_f = self.inner_init((self.output_dim, self.output_dim)) 59 | self.b_f = self.forget_bias_init((self.output_dim,)) 60 | 61 | self.W_c = self.init((input_dim, self.output_dim)) 62 | self.U_c = self.inner_init((self.output_dim, self.output_dim)) 63 | self.b_c = K.zeros((self.output_dim,)) 64 | 65 | self.W_o = self.init((input_dim, self.output_dim)) 66 | self.U_o = self.inner_init((self.output_dim, self.output_dim)) 67 | self.b_o = K.zeros((self.output_dim,)) 68 | 69 | self.params = [self.W_i, self.U_i, self.b_i, 70 | self.W_c, self.U_c, self.b_c, 71 | self.W_f, self.U_f, self.b_f, 72 | self.W_o, self.U_o, self.b_o] 73 | 74 | #if self.initial_weights is not None: 75 | # self.set_weights(self.initial_weights) 76 | # del self.initial_weights 77 | 78 | def numpy_floatX(self,data): 79 | return np.asarray(data, dtype=np.float32) 80 | def reset_states(self,batch_size): 81 | #self.states = [K.zeros((batch_size, self.output_dim)), 82 | # K.zeros((batch_size, self.output_dim))] 83 | 84 | self.states = [T.alloc(self.numpy_floatX(0.),batch_size,self.output_dim), 85 | T.alloc(self.numpy_floatX(0.),batch_size,self.output_dim)] 86 | 87 | def step(self, x, h_tm1, c_tm1): 88 | #assert len(states) == 2 89 | #h_tm1 = states[0] 90 | #c_tm1 = states[1] 91 | 92 | x_i = K.dot(x, self.W_i) + self.b_i 93 | x_f = K.dot(x, self.W_f) + self.b_f 94 | x_c = K.dot(x, self.W_c) + self.b_c 95 | x_o = K.dot(x, self.W_o) + self.b_o 96 | 97 | i = self.inner_activation(x_i + K.dot(h_tm1, self.U_i)) 98 | f = self.inner_activation(x_f + K.dot(h_tm1, self.U_f)) 99 | c = f * c_tm1 + i * 
self.activation(x_c + K.dot(h_tm1, self.U_c)) 100 | o = self.inner_activation(x_o + K.dot(h_tm1, self.U_o)) 101 | h = o * self.activation(c) 102 | return h, c 103 | 104 | def get_output(self, go_backwards = False, train = False): 105 | self.reset_states(train.shape[0]) 106 | inputs = train.dimshuffle((1, 0, 2)) 107 | results, _ = theano.scan( 108 | self.step, 109 | sequences=inputs, 110 | outputs_info=[self.states[0],self.states[1]], 111 | go_backwards=go_backwards) 112 | ''' 113 | # deal with Theano API inconsistency 114 | if type(results) is list: 115 | outputs = results[0] 116 | states = results[1:] 117 | else: 118 | outputs = results 119 | states = [] 120 | 121 | outputs = T.squeeze(outputs) 122 | last_output = outputs[-1] 123 | ''' 124 | 125 | #outputs = np.asarray(results)[:,0] 126 | #outputs = T.squeeze(outputs) 127 | #outputs = outputs.dimshuffle((1, 0, 2)) 128 | 129 | #states = [T.squeeze(state[-1]) for state in states] 130 | #return last_output, outputs, states 131 | 132 | outputs = results[0] 133 | outputs = T.squeeze(outputs) 134 | outputs = outputs.dimshuffle((1, 0, 2)) 135 | return outputs 136 | 137 | 138 | class BLSTM(): 139 | def __init__(self, input_dim, output_dim, 140 | init='glorot_uniform', inner_init='orthogonal', 141 | forget_bias_init='one', activation='tanh', 142 | inner_activation='hard_sigmoid'): 143 | #self.input_dim = input_dim 144 | self.output_dim = int(output_dim / 2) 145 | self.init = initializations.get(init) 146 | self.inner_init = initializations.get(inner_init) 147 | self.forget_bias_init = initializations.get(forget_bias_init) 148 | self.activation = activations.get(activation) 149 | self.inner_activation = activations.get(inner_activation) 150 | 151 | self.input_dim = input_dim 152 | #self.input = K.placeholder(input_shape) 153 | 154 | # initial states: 2 all-zero tensor of shape (output_dim) 155 | self.forward_lstm = LSTM(input_dim = input_dim, output_dim = self.output_dim) 156 | self.backward_lstm = LSTM(input_dim = input_dim, output_dim = self.output_dim) 157 | 158 | self.params = self.forward_lstm.params + self.backward_lstm.params 159 | 160 | #if self.initial_weights is not None: 161 | # self.set_weights(self.initial_weights) 162 | # del self.initial_weights 163 | 164 | 165 | 166 | def get_output(self, train = False): 167 | res_forward = self.forward_lstm.get_output(train) 168 | res_backward = self.backward_lstm.get_output(train[:,::-1,:]) 169 | outputs = T.concatenate([res_forward, res_backward[:,::-1,:]], axis = -1) 170 | return outputs 171 | -------------------------------------------------------------------------------- /src/pairwise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import time 4 | from collections import OrderedDict 5 | import theano 6 | import copy 7 | from data_utils import load_data, prepare_data, get_minibatches_idx, data_padding 8 | from model import build_model 9 | from optimizer import sgd,rmsprop,adadelta 10 | import theano.tensor as tensor 11 | class Pairwise(object): 12 | def get_score(self, pre_sentence_list, cur_sentence, preds, pairdict): 13 | score = 0.0 14 | for pre_sentence in pre_sentence_list: 15 | idx = pairdict[(pre_sentence, cur_sentence)] 16 | score += np.log(preds[idx]) 17 | return score 18 | def score_rank(self, sentence): 19 | n_total = 0 20 | n_correct = 0 21 | for i in range(len(sentence)): 22 | for j in range(i+1, len(sentence)): 23 | n_total += 1 24 | if sentence[i] < sentence[j]: n_correct += 1 25 | patial_correct = 
n_correct * 1.0 / n_total 26 | total_correct = 0.0 27 | if n_correct == n_total: total_correct = 1.0 28 | return patial_correct, total_correct 29 | def eva(self, 30 | f_pred, src_data, data, pairdict, kf, model_options): 31 | preds = [] 32 | for _, data_index in kf: 33 | batch_samples = [data[t] for t in data_index] 34 | sentence1,sentence1_mask,sentence2,sentence2_mask,y = data_padding(batch_samples) 35 | preds.append(f_pred(sentence1,sentence1_mask,sentence2,sentence2_mask)) 36 | preds = np.concatenate(preds, axis = 0) # 1d_array n_samples 37 | 38 | categories = [] 39 | n_sentences = 0 40 | data_beams = [] # n_paragraph * n_sentences (sentence, score) 41 | for paragraph, cur_categories in src_data: 42 | categories.append(cur_categories) 43 | beam = [] 44 | for s_id in xrange(len(paragraph)): 45 | beam.append(([s_id + n_sentences],0.0)) 46 | for nid in xrange(len(paragraph)-1): 47 | new_beam = [] 48 | for item in beam: 49 | for s_id in xrange(len(paragraph)): 50 | new_sentence = item[0] + [s_id + n_sentences] 51 | if len(set(new_sentence)) < nid + 2: continue # repeated elements occur 52 | new_score = item[1] + self.get_score(item[0], s_id + n_sentences, preds, pairdict) 53 | new_beam.append((new_sentence, new_score)) 54 | new_beam = sorted(new_beam, key=lambda item : -item[1]) #from high score to lower ones 55 | beam = new_beam[:model_options['beam_size']] 56 | data_beams.append(beam) 57 | n_sentences += len(paragraph) 58 | 59 | top1_res = [] # sentence_rank, paragraph_categories 60 | eva_res = np.zeros((len(src_data),model_options['beam_size'],2)) # n_paragraph * beam_size * 2 patial_correct, total_correct 61 | for id_paragraph, beam in enumerate(data_beams): 62 | top1_res.append((beam[0][0], categories[id_paragraph])) 63 | for idx, (sentence, _) in enumerate(beam): 64 | patial_correct, total_correct = self.score_rank(sentence) 65 | eva_res[id_paragraph][idx] = np.asarray([patial_correct, total_correct]) 66 | 67 | top = 1 68 | while top <= model_options['beam_size']: 69 | eva_res_top = np.max(eva_res[:,:top,:], axis = 1) # n_paragraph * 2 patial_correct, total_correct 70 | print 'Top %d beam ' % top 71 | average = np.average(eva_res_top, axis = 0) 72 | print 'patial_correct_rate: ', average[0] 73 | print 'total_correct_rate: ', average[1] 74 | top *= 2 75 | print '' 76 | 77 | return top1_res 78 | def save_result(self,path,top1_res): 79 | fw = open(path,'w') 80 | for paragraph, cur_categories in top1_res: 81 | paragraph = np.asarray(paragraph) - np.min(paragraph) 82 | paragraph = list(paragraph) 83 | for sentence in paragraph: 84 | fw.write(str(sentence)) 85 | fw.write(' ') 86 | fw.write('#') 87 | for category in cur_categories: 88 | fw.write(category) 89 | fw.write(' ') 90 | fw.write('\n') 91 | fw.close() 92 | 93 | def __init__(self, 94 | alpha, 95 | batch_size, 96 | n_epochs, 97 | wordVecLen, 98 | flag_dropout, 99 | datapath, 100 | random_seed, 101 | dropoutRates, 102 | optimizer, 103 | dispFreq, 104 | beam_size, 105 | flag_random_lookup_table, 106 | flag_toy_data, 107 | size_hidden_layer, 108 | dataset, 109 | result_path, 110 | sentence_modeling, 111 | CNN_filter_length, 112 | LSTM_go_backwards 113 | ): 114 | model_options = locals().copy() 115 | model_options['rng'] = np.random.RandomState(random_seed) 116 | print 'Loading data' 117 | src_train,src_valid,src_test,dic_w2idx, dic_idx2w, dic_w2embed, dic_idx2embed, embedding = load_data(path=datapath) 118 | if flag_toy_data == True: 119 | src_valid = src_valid[:10] 120 | src_test = src_test[:10] 121 | #src_train = copy.copy(src_valid) 
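            # Note on flag_toy_data (set in driver.py): True keeps only the first 10
            # paragraphs of each split, while a float in (0, 1) -- e.g. the 0.1 that
            # driver.py uses -- falls through to the elif branch below and keeps that
            # fraction of each split instead.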
122 | src_train = src_train[:10] 123 | elif flag_toy_data != False: 124 | valid_l = len(src_valid) * flag_toy_data 125 | test_l = len(src_test) * flag_toy_data 126 | train_l = len(src_train) * flag_toy_data 127 | src_valid = src_valid[:int(valid_l)] 128 | src_test = src_test[:int(test_l)] 129 | src_train = src_train[:int(train_l)] 130 | 131 | train,pairdict_train = prepare_data(src_train) 132 | valid,pairdict_valid = prepare_data(src_valid) 133 | test,pairdict_test = prepare_data(src_test) 134 | model_options['embedding'] = embedding 135 | 136 | (sentence1,sentence1_mask,sentence2,sentence2_mask,y,cost,f_pred,tparams,f_debug) = build_model(model_options) 137 | #f_cost = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y], cost, name='f_cost') 138 | 139 | #grads = tensor.grad(theano.gradient.grad_clip(cost, -2.0, 2.0), wrt=tparams.values()) 140 | grads = tensor.grad(theano.gradient.grad_clip(cost, -2.0, 2.0), wrt=tparams) 141 | # grads = tensor.grad(cost, wrt=tparams.values()) 142 | #f_grad = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y], grads, name='f_grad') 143 | 144 | lr = tensor.scalar(name='lr') 145 | if model_options['optimizer'] == 'sgd': optimizer = sgd 146 | elif model_options['optimizer'] == 'rmsprop': optimizer = rmsprop 147 | else: optimizer = adadelta 148 | f_grad_shared, f_update = optimizer(lr, tparams, grads, sentence1,sentence1_mask,sentence2,sentence2_mask,y, cost) 149 | 150 | 151 | print 'Optimization' 152 | 153 | kf_valid = get_minibatches_idx(len(valid), model_options['batch_size']) 154 | kf_test = get_minibatches_idx(len(test), model_options['batch_size']) 155 | 156 | print "%d train examples" % len(train) 157 | print "%d valid examples" % len(valid) 158 | print "%d test examples" % len(test) 159 | sys.stdout.flush() 160 | 161 | 162 | best_validation_score = -np.inf 163 | best_iter = 0 164 | uidx = 0 # the number of update done 165 | for epoch in xrange(model_options['n_epochs']): 166 | print ('Training on %d epoch' % epoch) 167 | sys.stdout.flush() 168 | kf = get_minibatches_idx(len(train), batch_size, shuffle=True) 169 | start_time = time.time() 170 | samples_seen = 0 171 | for _, train_index in kf: 172 | uidx += 1 173 | batch_samples = [train[t] for t in train_index] 174 | samples_seen += len(batch_samples) 175 | #print batch_samples 176 | sentence1,sentence1_mask,sentence2,sentence2_mask,y = data_padding(batch_samples) 177 | #print sentence1,sentence1_mask,sentence2,sentence2_mask,y 178 | #print sentence1.shape,sentence1_mask.shape,sentence2.shape,sentence2_mask.shape,y.shape 179 | #o = f_debug(sentence1,sentence1_mask,sentence2,sentence2_mask,y) 180 | #print o 181 | #print o[0].shape,o[1].shape,o[2].shape,o[3].shape 182 | cost = f_grad_shared(sentence1,sentence1_mask,sentence2,sentence2_mask,y) 183 | f_update(model_options['alpha']) 184 | if np.isnan(cost) or np.isinf(cost): 185 | print 'NaN detected' 186 | return 1., 1., 1. 
187 | 188 | if np.mod(uidx, dispFreq) == 0: 189 | print 'Epoch ', epoch, 'Update ', uidx, 'Cost ', cost, 'Samples_seen ', samples_seen 190 | sys.stdout.flush() 191 | print 'Epoch ', epoch, 'Update ', uidx, 'Cost ', cost, 'Samples_seen ', samples_seen 192 | sys.stdout.flush() 193 | ''' 194 | if epoch % 5 == 0: 195 | kf_train = get_minibatches_idx(len(train), batch_size) 196 | print ('Train_score:') 197 | self.eva(f_pred, src_train, train, pairdict_train, kf_train, model_options) 198 | sys.stdout.flush() 199 | ''' 200 | print ('Valid_score:') 201 | top1_res = self.eva(f_pred, src_valid, valid, pairdict_valid, kf_valid, model_options) 202 | self.save_result(model_options['result_path'] + 'dev.on.' + str(epoch) +'th_epoch_' + model_options['dataset'],top1_res) 203 | sys.stdout.flush() 204 | print ('Test_score:') 205 | top1_res = self.eva(f_pred, src_test, test, pairdict_test, kf_test, model_options) 206 | self.save_result(model_options['result_path'] + 'test.on.' + str(epoch) +'th_epoch_' + model_options['dataset'],top1_res) 207 | sys.stdout.flush() 208 | 209 | print ('%d epoch completed.' % epoch) 210 | sys.stdout.flush() 211 | ''' 212 | if(best_validation_score < valid_score): 213 | best_iter = epoch 214 | best_validation_score = valid_score 215 | print ('Current best_dev_F is %.2f, at %d epoch'%(best_validation_score,best_iter)) 216 | ''' 217 | 218 | end_time = time.time() 219 | minu = int((end_time - start_time)/60) 220 | sec = (end_time - start_time) - 60 * minu 221 | print ('Time: %d min %.2f sec' % (minu, sec)) 222 | sys.stdout.flush() 223 | print('Training completed!') 224 | sys.stdout.flush() 225 | 226 | --------------------------------------------------------------------------------
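A note on the evaluation metric: score_rank (defined in both src/guess.py and src/pairwise.py)
scores a predicted sentence order by the fraction of sentence pairs whose relative order is
correct, plus a 0/1 flag for a perfectly ordered paragraph. A small standalone sketch
(restated from the in-repo implementation), with a worked example:

    def score_rank(sentence):
        # sentence[k] is the true position of the sentence predicted at rank k
        n_total, n_correct = 0, 0
        for i in range(len(sentence)):
            for j in range(i + 1, len(sentence)):
                n_total += 1
                if sentence[i] < sentence[j]:
                    n_correct += 1
        partial_correct = n_correct * 1.0 / n_total
        total_correct = 1.0 if n_correct == n_total else 0.0
        return partial_correct, total_correct

    print score_rank([0, 2, 1, 3])  # (0.8333..., 0.0): 5 of the 6 pairs are ordered correctly
    print score_rank([0, 1, 2, 3])  # (1.0, 1.0): a perfectly ordered paragraph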