├── README.md ├── multi_view.py ├── multi_view_domain_embedding_memory_adversarial.py ├── preprocessing.py └── read_data.py /README.md: -------------------------------------------------------------------------------- 1 | multi-domain-sentiment 2 | ====== 3 | A framework for multi-domain sentiment analysis by learning domain-specific representations of input sentences using neural networks. 4 | 5 | Prerequisites 6 | ====== 7 | 1. TensorFlow 8 | 2. Google News Embeddings (https://code.google.com/archive/p/word2vec/) (rename the file to 'vectors.gz' and put it in the main folder) 9 | 3. Gensim 10 | 11 | Data Preparation 12 | ====== 13 | 1. Download datasets (e.g. laptops). We assume the datasets are preprocessed into the following format: 14 | 15 | The unit does everything it promises . I 've only used it once so far , but i 'm happy with it ||| 1 16 | 17 | 2. Randomly split each dataset into training (e.g. laptops/trn), development (e.g. laptops/dev) and testing (e.g. laptops/tst) sets. Put all datasets into a folder named 'dataset', so that the directory structure looks like dataset/laptops/trn. 18 | 19 | Preprocessing and Running the Demo 20 | ====== 21 | 22 | 1. Run `python preprocessing.py`. This script iterates through the 'dataset' folder and generates the dictionary, the embedding matrix and the transformed datasets. 23 | 24 | 2. Run `python multi_view_domain_embedding_memory_adversarial.py dataset_name1 dataset_name2 ...` to run the algorithm, where each argument is the name of a dataset folder (e.g. laptops). 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /multi_view.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import six.moves.cPickle as pickle 3 | from collections import OrderedDict 4 | import sys 5 | import time 6 | import numpy 7 | import tensorflow as tf 8 | import read_data 9 | from random import shuffle 10 | 11 | class EmbeddingModel(object): 12 | 13 | def __init__(self, is_training, config, session, trainable): 14 | batch_size = config.batch_size 15 | #number of LSTM time steps 16 | num_steps = config.num_steps 17 | hidden_size= config.hidden_size 18 | vocab_size = config.vocab_size 19 | 20 | #inputs: features, mask and labels 21 | self.input_data = tf.placeholder(tf.int32, [num_steps, batch_size], name="inputs") 22 | self.mask= tf.placeholder(tf.int64, [batch_size], name="mask") 23 | self.labels=tf.placeholder(tf.int64, [batch_size], name="labels") 24 | self.domains=tf.placeholder(tf.int64, [batch_size], name="domains") 25 | 26 | #word embedding layer 27 | with tf.device("/cpu:0"): 28 | self.embedding=embedding = tf.get_variable("embedding", [vocab_size, hidden_size], trainable=trainable) 29 | # num_steps* batch_size * embedding_size 30 | inputs = tf.nn.embedding_lookup(embedding, self.input_data) 31 | #add dropout to input units 32 | if is_training and config.keep_prob < 1: 33 | inputs = tf.nn.dropout(inputs, config.keep_prob) 34 | 35 | #add LSTM cell and dropout nodes 36 | with tf.variable_scope('forward'): 37 | fw_lstm = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0) 38 | if is_training and config.keep_prob < 1: 39 | fw_lstm = tf.contrib.rnn.DropoutWrapper(fw_lstm, output_keep_prob=config.keep_prob) 40 | 41 | with tf.variable_scope('backward'): 42 | bw_lstm = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0) 43 | if is_training and config.keep_prob < 1: 44 | bw_lstm = tf.contrib.rnn.DropoutWrapper(bw_lstm, output_keep_prob=config.keep_prob) 45 | 46 | #bidirectional rnn 47 |
lstm_output=tf.nn.bidirectional_dynamic_rnn(fw_lstm, bw_lstm, inputs=inputs, sequence_length=self.mask, time_major=True, dtype=tf.float32) 48 | #num_steps * batch_size * (hidden_size, hidden_size) 49 | self.lstm_output=lstm_output=tf.concat(lstm_output[0], 2) 50 | #final sentence embedding. batch_size * (2 * hidden_size) 51 | self.lstm_output=lstm_output=tf.reduce_mean(lstm_output, axis=0) 52 | 53 | class Combine_two_model: 54 | def __init__(self, share_model, config): 55 | self.share_model=share_model 56 | self.batch_size=batch_size=config.batch_size 57 | 58 | #combined_embedding=tf.concat([model.lstm_output, share_model.lstm_output],1) 59 | #softmax matrix 60 | softmax_w = tf.get_variable("softmax_w", [2*config.hidden_size, config.num_classes]) 61 | softmax_b = tf.get_variable("softmax_b", [config.num_classes]) 62 | logits = tf.matmul(share_model.lstm_output, softmax_w) + softmax_b 63 | #cross entropy loss 64 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=share_model.labels, logits=logits) 65 | self.entropy=cost = tf.reduce_sum(loss) 66 | #add regularization 67 | tvars = tf.trainable_variables() 68 | for var in tvars: 69 | if ('shared_model/bidirectional_rnn' in var.name and 'biases' not in var.name) \ 70 | or 'shared_model/embedding' in var.name or tf.get_variable_scope().name+'/embedding' in var.name: 71 | cost=tf.add(cost, get_lambda(var.name, config)*tf.nn.l2_loss(var)) 72 | self.cost= cost 73 | #operators for prediction 74 | self.prediction=prediction=tf.argmax(logits,1) 75 | correct_prediction = tf.equal(prediction, share_model.labels) 76 | self.accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32)) 77 | 78 | #operators for optimizer 79 | self.lr = tf.Variable(0.0, trainable=False) 80 | 81 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),config.max_grad_norm) 82 | self.grads=grads[4] 83 | optimizer = tf.train.AdagradOptimizer(self.lr) 84 | #optimizer = tf.train.AdamOptimizer(self.lr) 85 | #self.train_op = optimizer.minimize(cost) 86 | self.train_op = optimizer.apply_gradients(zip(grads, tvars)) 87 | 88 | #assign value to learning rate 89 | def assign_lr(self, session, lr_value): 90 | session.run(tf.assign(self.lr, lr_value)) 91 | 92 | class Config(object): 93 | vocab_size=10000 # Vocabulary size 94 | maxlen=100 # Sequences longer than this are ignored 95 | num_steps = maxlen 96 | batch_size=10 # The batch size during training. 97 | 98 | init_scale = 0.05 99 | learning_rate = 1 100 | max_grad_norm = 5 101 | hidden_size = 300 102 | max_epoch = 1 103 | max_max_epoch =30 104 | keep_prob = 0.40 105 | lr_decay = 0.90 106 | lambda_loss_m1=3e-6 107 | lambda_loss_m2=3e-6 108 | lambda_loss_share=3e-6 109 | valid_portion=0.1 110 | domain_size=2 111 | dataset='1' 112 | 113 | #get lambda for regularization 114 | def get_lambda(name, config): 115 | if "m1" in name: 116 | return config.lambda_loss_m1 117 | if "m2" in name: 118 | return config.lambda_loss_m2 119 | if "shared_model" in name: 120 | return config.lambda_loss_share 121 | def get_minibatches_idx(n, batch_size, shuffle=False): 122 | """ 123 | Used to shuffle the dataset at each iteration.
124 | """ 125 | idx_list = numpy.arange(n, dtype="int32") 126 | 127 | if shuffle: 128 | numpy.random.shuffle(idx_list) 129 | 130 | minibatches = [] 131 | minibatch_start = 0 132 | for i in range(n // batch_size): 133 | minibatches.append(idx_list[minibatch_start: 134 | minibatch_start + batch_size]) 135 | minibatch_start += batch_size 136 | 137 | if (minibatch_start != n): 138 | # Make a minibatch out of what is left 139 | minibatches.append(idx_list[-batch_size:]) 140 | return minibatches 141 | 142 | 143 | def run_epoch(session, m, data, eval_op, num=1000): 144 | n_samples = data[0].shape[1] 145 | print("Running %d samples:"%(n_samples)) 146 | minibatches = get_minibatches_idx(n_samples, m.batch_size, shuffle=False) 147 | 148 | correct = 0. 149 | total = 0 150 | #predictions 151 | p=[] 152 | total_entropy=0 153 | total_cost=0 154 | for inds in minibatches[:]: 155 | x = data[0][:,inds] 156 | mask = data[1][inds] 157 | y = data[2][inds] 158 | 159 | count, _, prediction,embedding, cost, entropy, grads= \ 160 | session.run([m.accuracy, eval_op, m.prediction, m.share_model.embedding, m.cost, m.entropy, m.grads],\ 161 | {m.share_model.input_data: x, m.share_model.mask: mask, m.share_model.labels: y,\ 162 | m.share_model.domains: numpy.array([num]*len(y))}) 163 | print(grads) 164 | correct += count 165 | total += len(inds) 166 | p+=prediction.tolist() 167 | total_entropy+=entropy 168 | total_cost+=cost 169 | 170 | print("Entropy loss") 171 | print(total_entropy) 172 | print("Total loss:") 173 | print(total_cost) 174 | accuracy = correct/total 175 | return (accuracy, p) 176 | 177 | def load_dataset(path, config): 178 | print('Loading data: '+ path) 179 | train, valid, test = read_data.load_data(path, n_words=config.vocab_size, \ 180 | valid_portion=0.15, maxlen=config.maxlen) 181 | train = read_data.prepare_data(train[0], train[1], maxlen=config.maxlen) 182 | valid = read_data.prepare_data(valid[0], valid[1], maxlen=config.maxlen) 183 | test = read_data.prepare_data(test[0], test[1], maxlen=config.maxlen) 184 | return (train, valid, test) 185 | 186 | def train_test_model(config, session, train_models, valid_models, test_models, trains, valids, tests): 187 | for i in range(config.max_max_epoch): 188 | #compute lr_decay 189 | lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0) 190 | model_list=list(zip(range(len(train_models)), train_models, valid_models, trains, valids)) 191 | if i%2==0: 192 | model_list=list(reversed(model_list)) #keep it a list so it can be iterated again below 193 | min_training=1.0 194 | number=-1 195 | for num, train_model, test_model, train, valid in model_list: 196 | #update learning rate 197 | train_model.assign_lr(session, config.learning_rate * lr_decay) 198 | print("") 199 | print("Model: "+str(num+1)) 200 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(train_model.lr))) 201 | start_time = time.time() 202 | if(train_model.__class__.__name__=='Combine_two_model'): 203 | train_acc = run_epoch(session, train_model, train, train_model.train_op, num=num) 204 | print("Training Accuracy = %.4f, time = %.3f seconds\n"%(train_acc[0], time.time()-start_time)) 205 | 206 | if train_acc[0] < 0.9 and train_acc[0]< min_training: 207 | number=num 208 | min_training=train_acc[0] 209 | 210 | 211 | valid_acc = run_epoch(session, test_model, valid, tf.no_op(), num=num) 212 | print("Valid Accuracy = %.4f\n" % valid_acc[0]) 213 | 214 | if number != -1: 215 | for num, train_model, test_model, train, valid in model_list: 216 | if num==number: 217 | print("Model: "+str(num+1)) 218 | print("Epoch: %d Learning rate: %.3f" % (i +
1, session.run(train_model.lr))) 219 | start_time = time.time() 220 | train_acc = run_epoch(session, train_model, train, train_model.train_op, num=num) 221 | print("Training Accuracy = %.4f, time = %.3f seconds\n"%(train_acc[0], time.time()-start_time)) 222 | 223 | 224 | #print(valid_acc[1]) 225 | for num, test_model, test in zip(range(len(test_models)),test_models, tests): 226 | test_acc = run_epoch(session, test_model, test, tf.no_op(),num=num) 227 | 228 | print(sys.argv[1+num]) 229 | print("Test Accuracy = %.4f\n" % test_acc[0]) 230 | 231 | with open("multi_result_final.txt", 'a') as f: 232 | f.write("final accuracy for dataset "+ sys.argv[num+1]+": "+str(test_acc[0])+"\n") 233 | 234 | 235 | #combine two datasets 236 | def combine(d1, d2): 237 | return numpy.concatenate([d1[0],d2[0]], axis=1),\ 238 | numpy.concatenate([d1[1],d2[1]]),numpy.concatenate([d1[2],d2[2]]) 239 | 240 | def word_to_vec(session,config, *args): 241 | f = open("vectors"+config.dataset, 'rb') 242 | #f = open("domainvectors", 'rb') 243 | matrix= numpy.array(pickle.load(f)) 244 | print("word2vec shape: ", matrix.shape) 245 | for model in args: 246 | session.run(tf.assign(model.embedding, matrix)) 247 | 248 | def extend(train, times): 249 | newtrain=train 250 | for i in range(times-1): 251 | newtrain=combine(newtrain, train) 252 | return newtrain 253 | 254 | #make dataset approximately the same size 255 | def extend_data(train, train1): 256 | if train[0].shape[0] > train1[0].shape[0]: 257 | if train[0].shape[0]/train1[0].shape[0]>1: 258 | train1=extend(train1, train[0].shape[0]/train1[0].shape[0]) 259 | elif float(train[0].shape[0])/train1[0].shape[0]>1.6: 260 | train1=extend(train1, 2) 261 | else: 262 | if train1[0].shape[0]/train[0].shape[0]>1: 263 | train=extend(train, train1[0].shape[0]/train[0].shape[0]) 264 | elif float(train1[0].shape[0])/train[0].shape[0]>1.6: 265 | train=extend(train, 2) 266 | return train, train1 267 | 268 | def count_labels(labels): 269 | return len(set(labels)) 270 | 271 | def main(unused_args): 272 | #configs 273 | config = Config() 274 | #domains to be processed 275 | domain_list=sys.argv[1:] 276 | domain_size=len(domain_list) 277 | if domain_size<=0: 278 | print("No dataset") 279 | exit(1) 280 | #load dataset 281 | train_datasets, valid_datasets, test_datasets=[],[],[] 282 | for domain in domain_list: 283 | train, valid, test = read_data.load_data(path='dataset'+config.dataset+'/'+domain+'/dataset',n_words=config.vocab_size, \ 284 | valid_portion=config.valid_portion, maxlen=config.maxlen) 285 | train_datasets.append(train) 286 | valid_datasets.append(valid) 287 | test_datasets.append(test) 288 | #transform dataset to matrix 289 | for index in range(domain_size): 290 | train = read_data.prepare_data(train_datasets[index][0], train_datasets[index][1], maxlen=config.maxlen, traindata=True) 291 | valid = read_data.prepare_data(valid_datasets[index][0], valid_datasets[index][1], maxlen=config.maxlen, traindata=False) 292 | test = read_data.prepare_data(test_datasets[index][0], test_datasets[index][1], maxlen=config.maxlen, traindata=False) 293 | train_datasets[index]=train 294 | valid_datasets[index]=valid 295 | test_datasets[index]=test 296 | 297 | config.num_classes = count_labels(train_datasets[0][2]) 298 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8) 299 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session: 300 | initializer = tf.random_normal_initializer(0, 0.05) 301 | 302 | #training model for shared weights 303 | with 
tf.variable_scope("shared_model", reuse=None, initializer=initializer): 304 | share_model_train = EmbeddingModel(is_training=True, config=config, session=session,trainable=True) 305 | #testing model for shared weights 306 | with tf.variable_scope("shared_model", reuse = True, initializer=initializer): 307 | share_model_test = EmbeddingModel(is_training=False, config=config, session=session, trainable=True) 308 | 309 | #build models 310 | train_models=[] 311 | test_models=[] 312 | for index in range(domain_size): 313 | with tf.variable_scope("m"+str(index), reuse = None, initializer=initializer): 314 | train_model = Combine_two_model(share_model_train, config) 315 | with tf.variable_scope("m"+str(index), reuse = True, initializer=initializer): 316 | test_model = Combine_two_model(share_model_test, config) 317 | train_models.append(train_model) 318 | test_models.append(test_model) 319 | 320 | init = tf.global_variables_initializer() 321 | session.run(init) 322 | 323 | #initialize share model's embedding with word2vec 324 | word_to_vec(session,config, share_model_train) 325 | #train test model 326 | train_test_model(config, session,\ 327 | train_models,test_models,test_models,\ 328 | train_datasets,valid_datasets,test_datasets) 329 | 330 | if __name__ == '__main__': 331 | tf.app.run() 332 | -------------------------------------------------------------------------------- /multi_view_domain_embedding_memory_adversarial.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import six.moves.cPickle as pickle 3 | from collections import OrderedDict 4 | import sys 5 | import time 6 | import numpy 7 | import tensorflow as tf 8 | import read_data 9 | from random import shuffle 10 | import random 11 | import numpy as np 12 | 13 | class EmbeddingModel(object): 14 | 15 | def __init__(self, is_training, config, session): 16 | batch_size = config.batch_size 17 | num_steps = config.num_steps 18 | hidden_size= config.hidden_size 19 | vocab_size = config.vocab_size 20 | 21 | #inputs: features, mask and labels 22 | self.input_data = tf.placeholder(tf.int32, [num_steps, batch_size], name="inputs") 23 | self.mask= tf.placeholder(tf.int64, [batch_size], name="mask") 24 | self.labels=tf.placeholder(tf.int64, [batch_size], name="labels") 25 | self.domains=tf.placeholder(tf.int64, [batch_size], name="domains") 26 | self.memory_location=tf.placeholder(tf.int64, [batch_size], name="memory_location") 27 | 28 | #word embedding layer 29 | with tf.device("/cpu:0"): 30 | self.embedding=embedding = tf.get_variable("embedding", [vocab_size, hidden_size]) 31 | # num_steps* batch_size * embedding_size 32 | inputs = tf.nn.embedding_lookup(embedding, self.input_data) 33 | #add dropout to input units 34 | if is_training and config.keep_prob < 1: 35 | inputs = tf.nn.dropout(inputs, config.keep_prob) 36 | 37 | #add LSTM cell and dropout nodes 38 | with tf.variable_scope('forward'): 39 | fw_lstm = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0) 40 | if is_training and config.keep_prob < 1: 41 | fw_lstm = tf.contrib.rnn.DropoutWrapper(fw_lstm, output_keep_prob=config.keep_prob) 42 | 43 | with tf.variable_scope('backward'): 44 | bw_lstm = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0) 45 | if is_training and config.keep_prob < 1: 46 | bw_lstm = tf.contrib.rnn.DropoutWrapper(bw_lstm, output_keep_prob=config.keep_prob) 47 | 48 | #bidirectional rnn 49 | lstm_output=tf.nn.bidirectional_dynamic_rnn(fw_lstm, bw_lstm, inputs=inputs, 
sequence_length=self.mask, time_major=True, dtype=tf.float32) 50 | #num_step * batch_size * (hidden_size, hidden_size) 51 | self.lstm_output=tf.concat(lstm_output[0], 2) 52 | 53 | class Domain_classifier: 54 | def __init__(self, share_model, weight1, bias1, weight2, bias2, config, is_adversarial=False): 55 | self.batch_size = config.batch_size 56 | self.share_model=share_model 57 | representation=tf.reduce_mean(share_model.lstm_output, axis=0) 58 | representation=tf.nn.relu(tf.matmul(representation, weight1) + bias1) 59 | logits=tf.matmul(representation, weight2) + bias2 60 | self.logits=logits 61 | 62 | 63 | #operators for prediction 64 | self.prediction=prediction=tf.argmax(logits,1) 65 | correct_prediction = tf.equal(prediction, share_model.domains) 66 | self.accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32)) 67 | 68 | #loss function 69 | global domain_size 70 | if is_adversarial: 71 | loss=tf.nn.softmax(logits)*tf.one_hot(share_model.domains, depth=domain_size, on_value=0.0, off_value=1.0) 72 | else: 73 | loss=tf.nn.softmax(logits)*tf.one_hot(share_model.domains, depth=domain_size, on_value=1.0, off_value=0.0) 74 | 75 | loss=tf.reduce_sum(loss,axis=1) 76 | loss=-tf.log(loss+1e-30) 77 | self.cost=cost =tf.reduce_sum(loss) 78 | 79 | #designate training variables 80 | tvars=tf.trainable_variables() 81 | if not is_adversarial: 82 | train_vars = [var for var in tvars if 'domain_classifier' in var.name] 83 | print("domain_classifier") 84 | else: 85 | train_vars = [var for var in tvars if 'shared_model/embedding' in var.name or 'bidirectional_rnn' in var.name] 86 | print("adversarial_domain_classifier") 87 | 88 | for tv in train_vars: 89 | print(tv.name) 90 | 91 | self.lr = tf.Variable(0.0, trainable=False) 92 | grads=tf.gradients(cost, train_vars) 93 | grads, _ = tf.clip_by_global_norm(grads,config.max_grad_norm) 94 | optimizer = tf.train.AdagradOptimizer(self.lr) 95 | self.train_op = optimizer.apply_gradients(zip(grads, train_vars)) 96 | 97 | def assign_lr(self, session, lr_value): 98 | session.run(tf.assign(self.lr, lr_value)) 99 | 100 | class Combine_two_model: 101 | def __init__(self, is_training,share_model, config, domain_embedding, num, memories, W_a, U_a, v_a,weight1, bias1, weight2, bias2, self_Q, self_K): 102 | self.share_model=share_model 103 | self.batch_size=batch_size=config.batch_size 104 | self.memory_location=memory_location= share_model.memory_location 105 | memory=memories[num] 106 | 107 | #domain embedding layer 108 | with tf.device("/cpu:0"): 109 | #batch_size * (2*hidden_size) 110 | domain_inputs = tf.nn.embedding_lookup(domain_embedding, share_model.domains) 111 | 112 | #self attention 113 | self.score=tf.nn.softmax(tf.matmul(tf.matmul(domain_inputs, self_Q),tf.transpose(tf.matmul(domain_embedding, self_K)))) 114 | self.domain_inputs= domain_inputs= tf.matmul(self.score, domain_embedding) 115 | 116 | #compute attention scores 117 | #domain queries 118 | query_vec=tf.matmul(domain_inputs, W_a) 119 | #replicate domain queries for num_steps and reshape 120 | query_vec=tf.reshape(tf.tile(tf.expand_dims(query_vec, dim=1), [1,config.num_steps,1]), [-1, 4*config.hidden_size]) 121 | 122 | #reshape LSTM outputs to two-dimensional 123 | lstm_output=tf.transpose(share_model.lstm_output, [1, 0, 2]) 124 | reshaped_lstm_output=tf.reshape(lstm_output, [-1, 2*config.hidden_size]) 125 | 126 | #compute unnormalized scores 127 | layer1=tf.tanh(tf.add(query_vec, tf.matmul(reshaped_lstm_output, U_a))) 128 | unnormalized_scores=tf.reshape(tf.squeeze(tf.matmul(layer1, 
v_a),axis=[1]), [-1, config.num_steps]) 129 | #in order to tackle variable length 130 | sequence_mask=tf.cast(tf.sequence_mask(share_model.mask, config.num_steps), tf.float32) 131 | minimize_softmax_score=sequence_mask*1e25-1e25 132 | unnormalized_scores=unnormalized_scores*sequence_mask+minimize_softmax_score 133 | #normalize the scores 134 | self.normalized_score=normalized_score=tf.nn.softmax(unnormalized_scores) 135 | 136 | #compute weighted vectors 137 | normalized_score=tf.expand_dims(normalized_score, dim=2) 138 | combine_vector=tf.reduce_sum(normalized_score*lstm_output, axis=1) 139 | 140 | #update op for memory network 141 | self.update_memory=tf.scatter_update(memory, memory_location, combine_vector) 142 | 143 | #attention on memory samples 144 | self.samples=samples=tf.nn.softmax(tf.matmul(combine_vector,tf.transpose(memory))) 145 | self.context_vector= context_vector= tf.matmul(samples, memory) 146 | 147 | #concat both vectors 148 | combine_vector=tf.concat([context_vector, combine_vector],axis=1) 149 | 150 | #softmax matrix 151 | softmax_w = tf.get_variable("softmax_w", [4*config.hidden_size, config.num_classes]) 152 | #softmax_w = tf.get_variable("softmax_w", [2*config.hidden_size, 2]) 153 | softmax_b = tf.get_variable("softmax_b", [config.num_classes]) 154 | 155 | #add dropout to combine_vector 156 | if is_training and config.keep_prob < 1: 157 | combine_vector = tf.nn.dropout(combine_vector, config.keep_prob) 158 | 159 | logits = tf.matmul(combine_vector, softmax_w) + softmax_b 160 | 161 | #operators for prediction 162 | self.prediction=prediction=tf.argmax(logits,1) 163 | correct_prediction = tf.equal(prediction, share_model.labels) 164 | self.accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32)) 165 | 166 | #cross entropy loss 167 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=share_model.labels, logits=logits) 168 | cost = tf.reduce_sum(loss) 169 | 170 | self.cost=cost 171 | #compute grads and update 172 | tvars=tf.trainable_variables() 173 | 174 | train_vars = [var for var in tvars if 'shared_model' in var.name or "m"+str(num) in var.name] 175 | 176 | print("m"+str(num)) 177 | for tv in train_vars: 178 | print(tv.name) 179 | 180 | self.lr = tf.Variable(0.0, trainable=False) 181 | grads=tf.gradients(cost, train_vars) 182 | grads, _ = tf.clip_by_global_norm(grads,config.max_grad_norm) 183 | optimizer = tf.train.AdagradOptimizer(self.lr) 184 | self.train_op = optimizer.apply_gradients(zip(grads, train_vars)) 185 | 186 | #assign value to learning rate 187 | def assign_lr(self, session, lr_value): 188 | session.run(tf.assign(self.lr, lr_value)) 189 | 190 | class Config(object): 191 | vocab_size=10000 192 | maxlen=100 193 | num_steps = maxlen 194 | max_grad_norm = 5 195 | init_scale = 0.05 196 | hidden_size = 300 197 | lr_decay = 0.95 198 | valid_portion=0.1 199 | dataset='' 200 | batch_size=10 201 | keep_prob = 0.4 202 | #0.05 203 | learning_rate = 0.1 204 | domain_learning_rate = 0.003 205 | max_epoch =2 206 | max_max_epoch =40 207 | 208 | def get_minibatches_idx(n, batch_size, shuffle=False): 209 | """ 210 | Used to shuffle the dataset at each iteration. 
211 | """ 212 | idx_list = numpy.arange(n, dtype="int32") 213 | 214 | if shuffle: 215 | numpy.random.shuffle(idx_list) 216 | 217 | minibatches = [] 218 | minibatch_start = 0 219 | for i in range(n // batch_size): 220 | minibatches.append(idx_list[minibatch_start: 221 | minibatch_start + batch_size]) 222 | minibatch_start += batch_size 223 | 224 | if (minibatch_start != n): 225 | # Make a minibatch out of what is left 226 | minibatches.append(idx_list[-batch_size:]) 227 | return minibatches 228 | 229 | def run_pre_epoch(session, m, data, num): 230 | n_samples = data[0].shape[1] 231 | print("Running %d samples:"%(n_samples)) 232 | minibatches = get_minibatches_idx(n_samples, m.batch_size, shuffle=False) 233 | 234 | for inds in minibatches[:]: 235 | x = data[0][:,inds] 236 | mask = data[1][inds] 237 | y = data[2][inds] 238 | memory_location= data[3][inds] 239 | 240 | memory_data=session.run([m.update_memory],\ 241 | {m.share_model.input_data: x, m.share_model.mask: mask, m.share_model.labels: y,\ 242 | m.share_model.domains: numpy.array([num]*len(y)), m.share_model.memory_location: memory_location}) 243 | 244 | 245 | def run_epoch(session, m, data, eval_op, num, is_training): 246 | n_samples = data[0].shape[1] 247 | print("Running %d samples:"%(n_samples)) 248 | minibatches = get_minibatches_idx(n_samples, m.batch_size, shuffle=False) 249 | 250 | correct = 0. 251 | total = 0 252 | total_cost=0 253 | for inds in minibatches[:]: 254 | x = data[0][:,inds] 255 | mask = data[1][inds] 256 | y = data[2][inds] 257 | 258 | count, _, cost= \ 259 | session.run([m.accuracy, eval_op,m.cost],\ 260 | {m.share_model.input_data: x, m.share_model.mask: mask,m.share_model.labels: y,\ 261 | m.share_model.domains: [num]*m.batch_size}) 262 | 263 | correct += count 264 | total += len(inds) 265 | total_cost+=cost 266 | 267 | print("Total loss:") 268 | print(total_cost) 269 | accuracy = correct/total 270 | return accuracy 271 | 272 | def run_domain_classifier_epoch(session, m, data, eval_op): 273 | n_samples = data[0].shape[1] 274 | print("Running %d samples:"%(n_samples)) 275 | minibatches = get_minibatches_idx(n_samples, m.batch_size, shuffle=True) 276 | 277 | correct = 0. 
278 | total = 0 279 | total_cost=0 280 | 281 | data[2] = np.array(data[2]) 282 | for inds in minibatches[:]: 283 | print(inds) 284 | x = data[0][:,inds] 285 | mask = data[1][inds] 286 | y = data[2][inds] 287 | 288 | count, _, prediction,cost, logits= \ 289 | session.run([m.accuracy, eval_op, m.prediction, m.cost, m.logits],\ 290 | {m.share_model.input_data: x, m.share_model.mask: mask, m.share_model.domains: y}) 291 | 292 | correct += count 293 | total += len(inds) 294 | total_cost+=cost 295 | 296 | print("Total loss:") 297 | print(total_cost) 298 | accuracy = correct/total 299 | return accuracy 300 | 301 | 302 | def train_test_model(config, session, train_models, valid_models, test_models, trains, valids, tests, domain_classifier, domain_classifier_adversarial,combined_data): 303 | for i in range(config.max_max_epoch): 304 | #compute lr_decay 305 | lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0) 306 | #zip the models and data 307 | model_list=list(zip(range(len(train_models)), train_models, valid_models, trains, valids)) 308 | #reverse order 309 | if i%2==1: 310 | model_list=list(reversed(model_list)) #keep it a list so it can be iterated again below 311 | 312 | #record which model has the minimum training accuracy 313 | min_training=1.0 314 | number=-1 315 | for num, train_model, test_model, train, valid in model_list: 316 | #update memory 317 | print("Updating Memories") 318 | run_pre_epoch(session, test_model, train, num=num) 319 | 320 | 321 | #update learning rate 322 | train_model.assign_lr(session, config.learning_rate * lr_decay) 323 | 324 | #training 325 | print() 326 | print("Model: "+str(num+1)) 327 | print("Epoch: %d Learning rate: %.5f" % (i + 1, session.run(train_model.lr))) 328 | start_time = time.time() 329 | train_acc = run_epoch(session, train_model, train, train_model.train_op, num=num, is_training=True) 330 | print("Training Accuracy = %.4f, time = %.3f seconds\n"%(train_acc, time.time()-start_time)) 331 | 332 | #record minimum training accuracy 333 | if train_acc< min_training: 334 | number=num 335 | min_training=train_acc 336 | 337 | #valid 338 | valid_acc = run_epoch(session, test_model, valid, tf.no_op(), num=num, is_training=False) 339 | print("Valid Accuracy = %.4f\n" % valid_acc) 340 | 341 | #run the model with the minimum training accuracy again 342 | if number != -1: 343 | for num, train_model, test_model, train, valid in model_list: 344 | if num==number: 345 | print("Model: "+str(num+1)) 346 | print("Epoch: %d Learning rate: %.5f" % (i + 1, session.run(train_model.lr))) 347 | start_time = time.time() 348 | train_acc = run_epoch(session, train_model, train, train_model.train_op, num=num, is_training=False) 349 | print("Training Accuracy = %.4f, time = %.3f seconds\n"%(train_acc, time.time()-start_time)) 350 | 351 | 352 | #testing 353 | for num, test_model, test in zip(range(len(test_models)),test_models, tests): 354 | print(sys.argv[1+num]) 355 | test_acc = run_epoch(session, test_model, test, tf.no_op(),num=num, is_training=False) 356 | print("Test Accuracy = %.4f\n" % test_acc) 357 | #write out accuracies 358 | with open("multi_view_domain.txt", 'a') as f: 359 | f.write("Accuracy for dataset "+ sys.argv[num+1]+": "+str(test_acc)+"\n") 360 | 361 | #domain classifier training 362 | print("Domain classifier Training:") 363 | domain_classifier.assign_lr(session, config.domain_learning_rate * lr_decay) 364 | start_time = time.time() 365 | domain_train_acc = run_domain_classifier_epoch(session, domain_classifier, combined_data, domain_classifier.train_op) 366 | print("Domain Training Accuracy = %.4f, time = %.3f
seconds\n"%(domain_train_acc, time.time()-start_time)) 367 | 368 | 369 | print("Domain adversarial classifier Training:") 370 | domain_classifier_adversarial.assign_lr(session, config.domain_learning_rate * lr_decay) 371 | start_time = time.time() 372 | domain_train_acc = run_domain_classifier_epoch(session, domain_classifier_adversarial, combined_data, domain_classifier_adversarial.train_op) 373 | print("Domain Training Accuracy = %.4f, time = %.3f seconds\n"%(domain_train_acc, time.time()-start_time)) 374 | 375 | def word_to_vec(session,config, *args): 376 | f = open("vectors"+config.dataset, 'rb') 377 | matrix= numpy.array(pickle.load(f)) 378 | print("word2vec shape: ", matrix.shape) 379 | for model in args: 380 | session.run(tf.assign(model.embedding, matrix)) 381 | 382 | 383 | #combine two datasets 384 | def combine(dataset): 385 | flag=False 386 | for single_dataset in dataset: 387 | if flag==False: 388 | flag=True 389 | combined_data=[single_dataset[0], single_dataset[1],single_dataset[4]] 390 | else: 391 | combined_data=[numpy.concatenate([combined_data[0],single_dataset[0]], axis=1),numpy.concatenate([combined_data[1],single_dataset[1]]),\ 392 | numpy.concatenate([combined_data[2],single_dataset[4]])] 393 | return combined_data 394 | 395 | def get_domains(): 396 | #domains to be processed 397 | domain_list=sys.argv[1:] 398 | domain_size=len(domain_list) 399 | print(domain_size) 400 | if domain_size<=0: 401 | print("No dataset") 402 | exit(1) 403 | return domain_size, domain_list 404 | 405 | def count_labels(labels): 406 | return len(set(labels)) 407 | 408 | if __name__ == "__main__": 409 | #configs 410 | config = Config() 411 | domain_size, domain_list=get_domains() 412 | 413 | #load dataset 414 | train_datasets, valid_datasets, test_datasets=[],[],[] 415 | for domain in domain_list: 416 | train, valid, test = read_data.load_data(path='dataset'+config.dataset+'/'+domain+'/dataset',n_words=config.vocab_size, \ 417 | valid_portion=config.valid_portion, maxlen=config.maxlen) 418 | train_datasets.append(train) 419 | valid_datasets.append(valid) 420 | test_datasets.append(test) 421 | 422 | #transform dataset to matrix 423 | for index in range(domain_size): 424 | train = read_data.prepare_data(train_datasets[index][0], train_datasets[index][1], maxlen=config.maxlen, traindata=True, index=index) 425 | valid = read_data.prepare_data(valid_datasets[index][0], valid_datasets[index][1], maxlen=config.maxlen, traindata=False, index=index) 426 | test = read_data.prepare_data(test_datasets[index][0], test_datasets[index][1], maxlen=config.maxlen, traindata=False, index=index) 427 | train_datasets[index]=train 428 | valid_datasets[index]=valid 429 | test_datasets[index]=test 430 | config.num_classes = count_labels(train_datasets[0][2]) 431 | combined_data=combine(train_datasets) 432 | 433 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9) 434 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session: 435 | initializer = tf.random_normal_initializer(0, 0.05) 436 | 437 | #attention weights 438 | with tf.variable_scope("shared_model"): 439 | #domain embedding 440 | domain_embedding = tf.Variable(tf.random_normal([domain_size, 2*config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="domain_embedding") 441 | W_a = tf.Variable(tf.random_normal([2*config.hidden_size, 4*config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="W_a") 442 | U_a = tf.Variable(tf.random_normal([2*config.hidden_size, 4*config.hidden_size], 
mean=0.0, stddev=0.1, dtype=tf.float32), name="U_a") 443 | v_a = tf.Variable(tf.random_normal([4*config.hidden_size, 1], mean=0.0, stddev=0.1, dtype=tf.float32), name="v_a") 444 | 445 | 446 | #domain self-attention weights 447 | with tf.variable_scope("self_attention"): 448 | self_Q = tf.Variable(tf.random_normal([2*config.hidden_size, 2*config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="Q") 449 | self_K = tf.Variable(tf.random_normal([2*config.hidden_size, 2*config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="K") 450 | 451 | 452 | #memory network 453 | memories=[] 454 | for index, train in enumerate(train_datasets): 455 | memory = tf.Variable(tf.random_normal([len(train[3]), 2*config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32),trainable=False ,name="memory"+str(index)) 456 | memories.append(memory) 457 | 458 | #weights for domain classifier (adversarial training) 459 | with tf.variable_scope('domain_classifier'): 460 | domain_classifier_weight1 = tf.Variable(tf.random_normal([2*config.hidden_size, config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="domain_classifier1") 461 | domain_classifier_bias1 = tf.Variable(tf.random_normal([config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="domain_classifier_bias1") 462 | 463 | domain_classifier_weight2 = tf.Variable(tf.random_normal([config.hidden_size, domain_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="domain_classifier2") 464 | domain_classifier_bias2 = tf.Variable(tf.random_normal([domain_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="domain_classifier_bias2") 465 | 466 | #print memory shape 467 | print("memory shape") 468 | for index,memory in enumerate(memories): 469 | print(sys.argv[1+index]) 470 | print(memory.get_shape()) 471 | 472 | #training model for shared weights 473 | with tf.variable_scope("shared_model", reuse=None, initializer=initializer): 474 | share_model_train = EmbeddingModel(True, config=config, session=session) 475 | #testing model for shared weights 476 | with tf.variable_scope("shared_model", reuse = True, initializer=initializer): 477 | share_model_test = EmbeddingModel(False, config=config, session=session) 478 | 479 | #domain classifier 480 | domain_classifier=Domain_classifier(share_model_train, domain_classifier_weight1, domain_classifier_bias1,domain_classifier_weight2, domain_classifier_bias2,config, False) 481 | domain_classifier_adversarial=Domain_classifier(share_model_train, domain_classifier_weight1, domain_classifier_bias1,domain_classifier_weight2, domain_classifier_bias2,config, True) 482 | 483 | #build models 484 | train_models=[] 485 | test_models=[] 486 | for index in range(domain_size): 487 | with tf.variable_scope("m"+str(index), reuse = None, initializer=initializer): 488 | train_model = Combine_two_model(True,share_model_train, config, domain_embedding, index, memories, W_a, U_a,v_a, domain_classifier_weight1,domain_classifier_bias1, domain_classifier_weight2,domain_classifier_bias2, self_Q, self_K) 489 | with tf.variable_scope("m"+str(index), reuse = True, initializer=initializer): 490 | test_model = Combine_two_model(False,share_model_test, config, domain_embedding, index, memories, W_a, U_a,v_a, domain_classifier_weight1,domain_classifier_bias1, domain_classifier_weight2,domain_classifier_bias2, self_Q, self_K) 491 | train_models.append(train_model) 492 | test_models.append(test_model) 493 | 494 | #print trainable variables 495 | for v in tf.trainable_variables(): 496 | print(v.name) 497 | 498 | #initialize 
499 | init = tf.global_variables_initializer() 500 | session.run(init) 501 | 502 | #initialize share model's embedding with word2vec 503 | word_to_vec(session,config, share_model_train) 504 | #train test model 505 | train_test_model(config, session,\ 506 | train_models,test_models,test_models,\ 507 | train_datasets,valid_datasets,test_datasets, domain_classifier,domain_classifier_adversarial,combined_data) 508 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import collections 3 | import pickle 4 | import gensim 5 | from gensim.models.keyedvectors import KeyedVectors 6 | import numpy as np 7 | import re 8 | 9 | 10 | #collect the words of a file 11 | def insert_word(f): 12 | global all_words 13 | for l in f: 14 | words=re.split('\s|-',l.lower().split("|||")[0].strip()) 15 | 16 | all_words+=words 17 | 18 | #convert words to numbers 19 | def convert_words_to_number(f, dataset, labels): 20 | global common_word 21 | for l in f: 22 | try: 23 | words=re.split('\s|-',l.lower().split("|||")[0].strip()) 24 | label=l.lower().split("|||")[1].strip('\n') 25 | words=[common_word[w] if w in common_word else 1 for w in words] 26 | dataset+=[words] 27 | labels+=[label] 28 | except: #skip malformed lines without a '|||' label 29 | continue 30 | vocab=10000 31 | gap=2 32 | vocab_size=vocab-2 #reserve ids 0 (padding) and 1 (unknown words) 33 | location='./dataset/' 34 | all_words=[] 35 | 36 | #iterate all files 37 | for file in os.listdir(location): 38 | if file != '.DS_Store': 39 | with open(location+file+"/trn") as f: 40 | insert_word(f) 41 | with open(location+file+"/dev") as f: 42 | insert_word(f) 43 | 44 | #keep the most frequent words 45 | counter=collections.Counter(all_words) 46 | common_word=dict(counter.most_common(vocab_size)) 47 | 48 | #assign ids starting from 2 49 | c=2 50 | for key in common_word: 51 | common_word[key]=c 52 | c+=1 53 | print(common_word) 54 | pickle.dump(common_word, open('dictionary', 'wb')) 55 | 56 | for file in os.listdir(location): 57 | 58 | if file != '.DS_Store': 59 | train=[] 60 | train_label=[] 61 | test=[] 62 | test_label=[] 63 | with open(location+file+"/trn") as f: 64 | convert_words_to_number(f, train, train_label) 65 | 66 | with open(location+file+"/dev") as f: 67 | convert_words_to_number(f, train, train_label) 68 | with open(location+file+"/tst") as f: convert_words_to_number(f, test, test_label) #fill the held-out test split so the pickled test set is not empty 69 | pickle.dump(((train,train_label) ,(test,test_label)), open(location+file+'/dataset', 'wb')) 70 | 71 | 72 | #create embedding vector matrix 73 | word_vectors = KeyedVectors.load_word2vec_format('vectors.gz', binary=True) 74 | word2vec=[[0]*300, [0]*300] #zero vectors for index 0 (padding) and index 1 (unknown words) 75 | for number, word in sorted(zip(common_word.values(), common_word.keys())): 76 | try: 77 | print(type(word_vectors.word_vec(word))) 78 | word2vec.append(word_vectors.word_vec(word).tolist()) 79 | except KeyError: 80 | print(word+ " not found") 81 | word2vec.append([0]*300) 82 | pickle.dump(word2vec, open('vectors', 'wb')) 83 | print(len(word2vec)) 84 | 85 | print(word_vectors.word_vec('laptop')) 86 | -------------------------------------------------------------------------------- /read_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from six.moves import xrange 3 | import six.moves.cPickle as pickle 4 | 5 | import gzip 6 | import os 7 | import numpy 8 | 9 | def prepare_data(seqs, labels, maxlen, traindata, index): 10 | """Create the matrices from the datasets. 11 | 12 | This pads each sequence to the same length: the length of the 13 | longest sequence or maxlen.
14 | 15 | If maxlen is set, sequences that reach this maximum 16 | length are dropped rather than truncated. 17 | 18 | This swaps the axes! 19 | """ 20 | # x: a list of sentences 21 | lengths = [len(s) for s in seqs] 22 | 23 | if maxlen is not None: 24 | new_seqs = [] 25 | new_labels = [] 26 | new_lengths = [] 27 | for l, s, y in zip(lengths, seqs, labels): 28 | if l < maxlen: 29 | new_seqs.append(s) 30 | new_labels.append(y) 31 | new_lengths.append(l) 32 | lengths = new_lengths 33 | labels = new_labels 34 | seqs = new_seqs 35 | 36 | if len(lengths) < 1: 37 | return None, None, None 38 | 39 | n_samples = len(seqs) 40 | #maxlen = numpy.max(lengths) 41 | 42 | x = numpy.zeros((maxlen, n_samples)).astype('int64') 43 | labels = numpy.array(labels).astype('int32') 44 | for idx, s in enumerate(seqs): 45 | x[:lengths[idx], idx] = s 46 | if not traindata: 47 | return [x, numpy.array(lengths).astype('int32'), labels] 48 | else: 49 | return [x, numpy.array(lengths).astype('int32'), labels, numpy.array(range(len(lengths))), [index]*len(lengths)] 50 | 51 | def load_data(path, n_words=10000, valid_portion=0.2, maxlen=None, 52 | sort_by_len=False): 53 | 54 | if path.endswith(".gz"): 55 | f = gzip.open(path, 'rb') 56 | else: 57 | f = open(path, 'rb') 58 | 59 | train_set, test_set= pickle.load(f) 60 | f.close() 61 | if maxlen: 62 | new_train_set_x = [] 63 | new_train_set_y = [] 64 | for x, y in zip(train_set[0], train_set[1]): 65 | if len(x) < maxlen: 66 | new_train_set_x.append(x) 67 | new_train_set_y.append(y) 68 | train_set = (new_train_set_x, new_train_set_y) 69 | del new_train_set_x, new_train_set_y 70 | 71 | # split off a validation set from the training set 72 | train_set_x, train_set_y = train_set 73 | n_samples = len(train_set_x) 74 | sidx = numpy.random.permutation(n_samples) 75 | n_train = int(numpy.round(n_samples * (1. - valid_portion))) 76 | valid_set_x = [train_set_x[s] for s in sidx[n_train:]] 77 | valid_set_y = [train_set_y[s] for s in sidx[n_train:]] 78 | train_set_x = [train_set_x[s] for s in sidx[:n_train]] 79 | train_set_y = [train_set_y[s] for s in sidx[:n_train]] 80 | 81 | train_set = (train_set_x, train_set_y) 82 | valid_set = (valid_set_x, valid_set_y) 83 | 84 | def remove_unk(x): 85 | return [[1 if w >= n_words else w for w in sen] for sen in x] 86 | 87 | test_set_x, test_set_y = test_set 88 | valid_set_x, valid_set_y = valid_set 89 | train_set_x, train_set_y = train_set 90 | 91 | train_set_x = remove_unk(train_set_x) 92 | valid_set_x = remove_unk(valid_set_x) 93 | test_set_x = remove_unk(test_set_x) 94 | 95 | def len_argsort(seq): 96 | return sorted(range(len(seq)), key=lambda x: len(seq[x])) 97 | 98 | if sort_by_len: 99 | sorted_index = len_argsort(test_set_x) 100 | test_set_x = [test_set_x[i] for i in sorted_index] 101 | test_set_y = [test_set_y[i] for i in sorted_index] 102 | 103 | sorted_index = len_argsort(valid_set_x) 104 | valid_set_x = [valid_set_x[i] for i in sorted_index] 105 | valid_set_y = [valid_set_y[i] for i in sorted_index] 106 | 107 | sorted_index = len_argsort(train_set_x) 108 | train_set_x = [train_set_x[i] for i in sorted_index] 109 | train_set_y = [train_set_y[i] for i in sorted_index] 110 | 111 | train = [train_set_x, train_set_y] 112 | valid = [valid_set_x, valid_set_y] 113 | test = [test_set_x, test_set_y] 114 | 115 | return train, valid, test --------------------------------------------------------------------------------
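Usage note (added for this write-up, not part of the original repository): the short sketch below shows what read_data.prepare_data returns on a couple of toy word-id sequences. It illustrates why the models above declare their input placeholders as [num_steps, batch_size]: the matrix is zero-padded to maxlen and the axes are swapped so that each column is one sentence. The toy sequences, labels and the maxlen value are invented for illustration.

import read_data

# two toy sentences encoded as word ids, with their sentiment labels
seqs = [[4, 7, 2], [9, 3]]
labels = [1, 0]

# traindata=False returns [x, lengths, labels]; traindata=True would additionally
# return the per-sample memory locations and the domain index used by the
# adversarial script.
x, lengths, y = read_data.prepare_data(seqs, labels, maxlen=5,
                                        traindata=False, index=0)

print(x.shape)   # (5, 2): one zero-padded column per sentence
print(lengths)   # [3 2]: true lengths, fed to the models' 'mask' placeholder
print(y)         # [1 0]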