├── README.md ├── multi_view.py ├── multi_view_domain_embedding_memory_adversarial.py ├── preprocessing.py └── read_data.py /README.md: -------------------------------------------------------------------------------- 1 | multi-domain-sentiment 2 | ====== 3 | A framework for multi-domain sentiment analysis by learning domain-specific representations of input sentences using neural networks. 4 | 5 | Prerequisites 6 | ====== 7 | 1. TensorFlow 8 | 2. Google News Embeddings (https://code.google.com/archive/p/word2vec/) (rename the file to 'vectors.gz' and put it in the main folder) 9 | 3. Gensim 10 | 11 | Data Preparation 12 | ====== 13 | 1. Download datasets (e.g. laptops). We assume the datasets are preprocessed into the following format: 14 | 15 | The unit does everything it promises . I 've only used it once so far , but i 'm happy with it ||| 1 16 | 17 | 2. Randomly split each dataset into training (e.g. laptops/trn), development (e.g. laptops/dev) and testing (e.g. laptops/tst) sets. Put all datasets into a folder named 'dataset', so that the directory structure looks like dataset/laptops/trn. 18 | 19 | Preprocessing and Running the Demo 20 | ====== 21 | 22 | 1. Run `python preprocessing.py`. This script iterates through the 'dataset' folder and generates the dictionary, the embedding matrix and the transformed datasets. 23 | 24 | 2. Run `python multi_view_domain_embedding_memory_adversarial.py dataset_name1 dataset_name2 ...` to run the algorithm, where each argument is the name of a dataset folder (e.g. laptops). 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /multi_view.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import six.moves.cPickle as pickle 3 | from collections import OrderedDict 4 | import sys 5 | import time 6 | import numpy 7 | import tensorflow as tf 8 | import read_data 9 | from random import shuffle 10 | 11 | class EmbeddingModel(object): 12 | 13 | def __init__(self, is_training, config, session, trainable): 14 | batch_size = config.batch_size 15 | #number of LSTM time steps 16 | num_steps = config.num_steps 17 | hidden_size= config.hidden_size 18 | vocab_size = config.vocab_size 19 | 20 | #inputs: features, mask and labels 21 | self.input_data = tf.placeholder(tf.int32, [num_steps, batch_size], name="inputs") 22 | self.mask= tf.placeholder(tf.int64, [batch_size], name="mask") 23 | self.labels=tf.placeholder(tf.int64, [batch_size], name="labels") 24 | self.domains=tf.placeholder(tf.int64, [batch_size], name="domains") 25 | 26 | #word embedding layer 27 | with tf.device("/cpu:0"): 28 | self.embedding=embedding = tf.get_variable("embedding", [vocab_size, hidden_size], trainable=trainable) 29 | # num_steps* batch_size * embedding_size 30 | inputs = tf.nn.embedding_lookup(embedding, self.input_data) 31 | #add dropout to input units 32 | if is_training and config.keep_prob < 1: 33 | inputs = tf.nn.dropout(inputs, config.keep_prob) 34 | 35 | #add LSTM cell and dropout nodes 36 | with tf.variable_scope('forward'): 37 | fw_lstm = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0) 38 | if is_training and config.keep_prob < 1: 39 | fw_lstm = tf.contrib.rnn.DropoutWrapper(fw_lstm, output_keep_prob=config.keep_prob) 40 | 41 | with tf.variable_scope('backward'): 42 | bw_lstm = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0) 43 | if is_training and config.keep_prob < 1: 44 | bw_lstm = tf.contrib.rnn.DropoutWrapper(bw_lstm, output_keep_prob=config.keep_prob) 45 | 46 | #bidirectional rnn 47 |
lstm_output=tf.nn.bidirectional_dynamic_rnn(fw_lstm, bw_lstm, inputs=inputs, sequence_length=self.mask, time_major=True, dtype=tf.float32) 48 | #num_steps * batch_size * (hidden_size, hidden_size) 49 | self.lstm_output=lstm_output=tf.concat(lstm_output[0], 2) 50 | #final sentence embedding. batch_size * (2 * hidden_size) 51 | self.lstm_output=lstm_output=tf.reduce_mean(lstm_output, axis=0) 52 | 53 | class Combine_two_model: 54 | def __init__(self, share_model, config): 55 | self.share_model=share_model 56 | self.batch_size=batch_size=config.batch_size 57 | 58 | #combined_embedding=tf.concat([model.lstm_output, share_model.lstm_output],1) 59 | #softmax matrix 60 | softmax_w = tf.get_variable("softmax_w", [2*config.hidden_size, config.num_classes]) 61 | softmax_b = tf.get_variable("softmax_b", [config.num_classes]) 62 | logits = tf.matmul(share_model.lstm_output, softmax_w) + softmax_b 63 | #cross entropy loss 64 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=share_model.labels, logits=logits) 65 | self.entropy=cost = tf.reduce_sum(loss) 66 | #add regularization 67 | tvars = tf.trainable_variables() 68 | for var in tvars: 69 | if ('shared_model/bidirectional_rnn' in var.name and 'biases' not in var.name) \ 70 | or 'shared_model/embedding' in var.name or tf.get_variable_scope().name+'/embedding' in var.name: 71 | cost=tf.add(cost, get_lambda(var.name, config)*tf.nn.l2_loss(var)) 72 | self.cost= cost 73 | #operators for prediction 74 | self.prediction=prediction=tf.argmax(logits,1) 75 | correct_prediction = tf.equal(prediction, share_model.labels) 76 | self.accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32)) 77 | 78 | #operators for optimizer 79 | self.lr = tf.Variable(0.0, trainable=False) 80 | 81 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),config.max_grad_norm) 82 | self.grads=grads[4] 83 | optimizer = tf.train.AdagradOptimizer(self.lr) 84 | #optimizer = tf.train.AdamOptimizer(self.lr) 85 | #self.train_op = optimizer.minimize(cost) 86 | self.train_op = optimizer.apply_gradients(zip(grads, tvars)) 87 | 88 | #assign value to learning rate 89 | def assign_lr(self, session, lr_value): 90 | session.run(tf.assign(self.lr, lr_value)) 91 | 92 | class Config(object): 93 | vocab_size=10000 # Vocabulary size 94 | maxlen=100 # Sequences longer than this are ignored 95 | num_steps = maxlen 96 | batch_size=10 # The batch size during training. 97 | 98 | init_scale = 0.05 99 | learning_rate = 1 100 | max_grad_norm = 5 101 | hidden_size = 300 102 | max_epoch = 1 103 | max_max_epoch =30 104 | keep_prob = 0.40 105 | lr_decay = 0.90 106 | lambda_loss_m1=3e-6 107 | lambda_loss_m2=3e-6 108 | lambda_loss_share=3e-6 109 | valid_portion=0.1 110 | domain_size=2 111 | dataset='1' 112 | 113 | #get lambda for regularization 114 | def get_lambda(name, config): 115 | if "m1" in name: 116 | return config.lambda_loss_m1 117 | if "m2" in name: 118 | return config.lambda_loss_m2 119 | if "shared_model" in name: 120 | return config.lambda_loss_share 121 | def get_minibatches_idx(n, batch_size, shuffle=False): 122 | """ 123 | Used to shuffle the dataset at each iteration.
124 | """ 125 | idx_list = numpy.arange(n, dtype="int32") 126 | 127 | if shuffle: 128 | numpy.random.shuffle(idx_list) 129 | 130 | minibatches = [] 131 | minibatch_start = 0 132 | for i in range(n // batch_size): 133 | minibatches.append(idx_list[minibatch_start: 134 | minibatch_start + batch_size]) 135 | minibatch_start += batch_size 136 | 137 | if (minibatch_start != n): 138 | # Make a minibatch out of what is left 139 | minibatches.append(idx_list[-batch_size:]) 140 | return minibatches 141 | 142 | 143 | def run_epoch(session, m, data, eval_op, num=1000): 144 | n_samples = data[0].shape[1] 145 | print("Running %d samples:"%(n_samples)) 146 | minibatches = get_minibatches_idx(n_samples, m.batch_size, shuffle=False) 147 | 148 | correct = 0. 149 | total = 0 150 | #predictions 151 | p=[] 152 | total_entropy=0 153 | total_cost=0 154 | for inds in minibatches[:]: 155 | x = data[0][:,inds] 156 | mask = data[1][inds] 157 | y = data[2][inds] 158 | 159 | count, _, prediction,embedding, cost, entropy, grads= \ 160 | session.run([m.accuracy, eval_op, m.prediction, m.share_model.embedding, m.cost, m.entropy, m.grads],\ 161 | {m.share_model.input_data: x, m.share_model.mask: mask, m.share_model.labels: y,\ 162 | m.share_model.domains: numpy.array([num]*len(y))}) 163 | print(grads) 164 | correct += count 165 | total += len(inds) 166 | p+=prediction.tolist() 167 | total_entropy+=entropy 168 | total_cost+=cost 169 | 170 | print("Entropy loss") 171 | print(total_entropy) 172 | print("Total loss:") 173 | print(total_cost) 174 | accuracy = correct/total 175 | return (accuracy, p) 176 | 177 | def load_dataset(path, config): 178 | print('Loading data: '+ path) 179 | train, valid, test = read_data.load_data(path, n_words=config.vocab_size, \ 180 | valid_portion=0.15, maxlen=config.maxlen) 181 | train = read_data.prepare_data(train[0], train[1], maxlen=config.maxlen) 182 | valid = read_data.prepare_data(valid[0], valid[1], maxlen=config.maxlen) 183 | test = read_data.prepare_data(test[0], test[1], maxlen=config.maxlen) 184 | return (train, valid, test) 185 | 186 | def train_test_model(config, session, train_models, valid_models, test_models, trains, valids, tests): 187 | for i in range(config.max_max_epoch): 188 | #compute lr_decay 189 | lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0) 190 | model_list=list(zip(range(len(train_models)), train_models, valid_models, trains, valids)) 191 | if i%2==0: 192 | model_list=list(reversed(model_list)) #keep it a list so it can be iterated again below 193 | min_training=1.0 194 | number=-1 195 | for num, train_model, test_model, train, valid in model_list: 196 | #update learning rate 197 | train_model.assign_lr(session, config.learning_rate * lr_decay) 198 | print("") 199 | print("Model: "+str(num+1)) 200 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(train_model.lr))) 201 | start_time = time.time() 202 | if(train_model.__class__.__name__=='Combine_two_model'): 203 | train_acc = run_epoch(session, train_model, train, train_model.train_op, num=num) 204 | print("Training Accuracy = %.4f, time = %.3f seconds\n"%(train_acc[0], time.time()-start_time)) 205 | 206 | if train_acc[0] < 0.9 and train_acc[0]< min_training: 207 | number=num 208 | min_training=train_acc[0] 209 | 210 | 211 | valid_acc = run_epoch(session, test_model, valid, tf.no_op(), num=num) 212 | print("Valid Accuracy = %.4f\n" % valid_acc[0]) 213 | 214 | if number != -1: 215 | for num, train_model, test_model, train, valid in model_list: 216 | if num==number: 217 | print("Model: "+str(num+1)) 218 | print("Epoch: %d Learning rate: %.3f" % (i +
1, session.run(train_model.lr))) 219 | start_time = time.time() 220 | train_acc = run_epoch(session, train_model, train, train_model.train_op, num=num) 221 | print("Training Accuracy = %.4f, time = %.3f seconds\n"%(train_acc[0], time.time()-start_time)) 222 | 223 | 224 | #print(valid_acc[1]) 225 | for num, test_model, test in zip(range(len(test_models)),test_models, tests): 226 | test_acc = run_epoch(session, test_model, test, tf.no_op(),num=num) 227 | 228 | print(sys.argv[1+num]) 229 | print("Test Accuracy = %.4f\n" % test_acc[0]) 230 | 231 | with open("multi_result_final.txt", 'a') as f: 232 | f.write("final accuracy for dataset "+ sys.argv[num+1]+": "+str(test_acc[0])+"\n") 233 | 234 | 235 | #combine two datasets 236 | def combine(d1, d2): 237 | return numpy.concatenate([d1[0],d2[0]], axis=1),\ 238 | numpy.concatenate([d1[1],d2[1]]),numpy.concatenate([d1[2],d2[2]]) 239 | 240 | def word_to_vec(session,config, *args): 241 | f = open("vectors"+config.dataset, 'rb') 242 | #f = open("domainvectors", 'rb') 243 | matrix= numpy.array(pickle.load(f)) 244 | print("word2vec shape: ", matrix.shape) 245 | for model in args: 246 | session.run(tf.assign(model.embedding, matrix)) 247 | 248 | def extend(train, times): 249 | newtrain=train 250 | for i in range(times-1): 251 | newtrain=combine(newtrain, train) 252 | return newtrain 253 | 254 | #make dataset approximately the same size 255 | def extend_data(train, train1): 256 | if train[0].shape[0] > train1[0].shape[0]: 257 | if train[0].shape[0]/train1[0].shape[0]>1: 258 | train1=extend(train1, train[0].shape[0]/train1[0].shape[0]) 259 | elif float(train[0].shape[0])/train1[0].shape[0]>1.6: 260 | train1=extend(train1, 2) 261 | else: 262 | if train1[0].shape[0]/train[0].shape[0]>1: 263 | train=extend(train, train1[0].shape[0]/train[0].shape[0]) 264 | elif float(train1[0].shape[0])/train[0].shape[0]>1.6: 265 | train=extend(train, 2) 266 | return train, train1 267 | 268 | def count_labels(labels): 269 | return len(set(labels)) 270 | 271 | def main(unused_args): 272 | #configs 273 | config = Config() 274 | #domains to be processed 275 | domain_list=sys.argv[1:] 276 | domain_size=len(domain_list) 277 | if domain_size<=0: 278 | print("No dataset") 279 | exit(1) 280 | #load dataset 281 | train_datasets, valid_datasets, test_datasets=[],[],[] 282 | for domain in domain_list: 283 | train, valid, test = read_data.load_data(path='dataset'+config.dataset+'/'+domain+'/dataset',n_words=config.vocab_size, \ 284 | valid_portion=config.valid_portion, maxlen=config.maxlen) 285 | train_datasets.append(train) 286 | valid_datasets.append(valid) 287 | test_datasets.append(test) 288 | #transform dataset to matrix 289 | for index in range(domain_size): 290 | train = read_data.prepare_data(train_datasets[index][0], train_datasets[index][1], maxlen=config.maxlen, traindata=True) 291 | valid = read_data.prepare_data(valid_datasets[index][0], valid_datasets[index][1], maxlen=config.maxlen, traindata=False) 292 | test = read_data.prepare_data(test_datasets[index][0], test_datasets[index][1], maxlen=config.maxlen, traindata=False) 293 | train_datasets[index]=train 294 | valid_datasets[index]=valid 295 | test_datasets[index]=test 296 | 297 | config.num_classes = count_labels(train_datasets[0][2]) 298 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8) 299 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session: 300 | initializer = tf.random_normal_initializer(0, 0.05) 301 | 302 | #training model for shared weights 303 | with 
tf.variable_scope("shared_model", reuse=None, initializer=initializer): 304 | share_model_train = EmbeddingModel(is_training=True, config=config, session=session,trainable=True) 305 | #testing model for shared weights 306 | with tf.variable_scope("shared_model", reuse = True, initializer=initializer): 307 | share_model_test = EmbeddingModel(is_training=False, config=config, session=session, trainable=True) 308 | 309 | #build models 310 | train_models=[] 311 | test_models=[] 312 | for index in range(domain_size): 313 | with tf.variable_scope("m"+str(index), reuse = None, initializer=initializer): 314 | train_model = Combine_two_model(share_model_train, config) 315 | with tf.variable_scope("m"+str(index), reuse = True, initializer=initializer): 316 | test_model = Combine_two_model(share_model_test, config) 317 | train_models.append(train_model) 318 | test_models.append(test_model) 319 | 320 | init = tf.global_variables_initializer() 321 | session.run(init) 322 | 323 | #initialize share model's embedding with word2vec 324 | word_to_vec(session,config, share_model_train) 325 | #train test model 326 | train_test_model(config, session,\ 327 | train_models,test_models,test_models,\ 328 | train_datasets,valid_datasets,test_datasets) 329 | 330 | if __name__ == '__main__': 331 | tf.app.run() 332 | -------------------------------------------------------------------------------- /multi_view_domain_embedding_memory_adversarial.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import six.moves.cPickle as pickle 3 | from collections import OrderedDict 4 | import sys 5 | import time 6 | import numpy 7 | import tensorflow as tf 8 | import read_data 9 | from random import shuffle 10 | import random 11 | import numpy as np 12 | 13 | class EmbeddingModel(object): 14 | 15 | def __init__(self, is_training, config, session): 16 | batch_size = config.batch_size 17 | num_steps = config.num_steps 18 | hidden_size= config.hidden_size 19 | vocab_size = config.vocab_size 20 | 21 | #inputs: features, mask and labels 22 | self.input_data = tf.placeholder(tf.int32, [num_steps, batch_size], name="inputs") 23 | self.mask= tf.placeholder(tf.int64, [batch_size], name="mask") 24 | self.labels=tf.placeholder(tf.int64, [batch_size], name="labels") 25 | self.domains=tf.placeholder(tf.int64, [batch_size], name="domains") 26 | self.memory_location=tf.placeholder(tf.int64, [batch_size], name="memory_location") 27 | 28 | #word embedding layer 29 | with tf.device("/cpu:0"): 30 | self.embedding=embedding = tf.get_variable("embedding", [vocab_size, hidden_size]) 31 | # num_steps* batch_size * embedding_size 32 | inputs = tf.nn.embedding_lookup(embedding, self.input_data) 33 | #add dropout to input units 34 | if is_training and config.keep_prob < 1: 35 | inputs = tf.nn.dropout(inputs, config.keep_prob) 36 | 37 | #add LSTM cell and dropout nodes 38 | with tf.variable_scope('forward'): 39 | fw_lstm = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0) 40 | if is_training and config.keep_prob < 1: 41 | fw_lstm = tf.contrib.rnn.DropoutWrapper(fw_lstm, output_keep_prob=config.keep_prob) 42 | 43 | with tf.variable_scope('backward'): 44 | bw_lstm = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0) 45 | if is_training and config.keep_prob < 1: 46 | bw_lstm = tf.contrib.rnn.DropoutWrapper(bw_lstm, output_keep_prob=config.keep_prob) 47 | 48 | #bidirectional rnn 49 | lstm_output=tf.nn.bidirectional_dynamic_rnn(fw_lstm, bw_lstm, inputs=inputs, 
sequence_length=self.mask, time_major=True, dtype=tf.float32) 50 | #num_step * batch_size * (hidden_size, hidden_size) 51 | self.lstm_output=tf.concat(lstm_output[0], 2) 52 | 53 | class Domain_classifier: 54 | def __init__(self, share_model, weight1, bias1, weight2, bias2, config, is_adversarial=False): 55 | self.batch_size = config.batch_size 56 | self.share_model=share_model 57 | representation=tf.reduce_mean(share_model.lstm_output, axis=0) 58 | representation=tf.nn.relu(tf.matmul(representation, weight1) + bias1) 59 | logits=tf.matmul(representation, weight2) + bias2 60 | self.logits=logits 61 | 62 | 63 | #operators for prediction 64 | self.prediction=prediction=tf.argmax(logits,1) 65 | correct_prediction = tf.equal(prediction, share_model.domains) 66 | self.accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32)) 67 | 68 | #loss function 69 | global domain_size 70 | if is_adversarial: 71 | loss=tf.nn.softmax(logits)*tf.one_hot(share_model.domains, depth=domain_size, on_value=0.0, off_value=1.0) 72 | else: 73 | loss=tf.nn.softmax(logits)*tf.one_hot(share_model.domains, depth=domain_size, on_value=1.0, off_value=0.0) 74 | 75 | loss=tf.reduce_sum(loss,axis=1) 76 | loss=-tf.log(loss+1e-30) 77 | self.cost=cost =tf.reduce_sum(loss) 78 | 79 | #designate training variables 80 | tvars=tf.trainable_variables() 81 | if not is_adversarial: 82 | train_vars = [var for var in tvars if 'domain_classifier' in var.name] 83 | print("domain_classifier") 84 | else: 85 | train_vars = [var for var in tvars if 'shared_model/embedding' in var.name or 'bidirectional_rnn' in var.name] 86 | print("adversarial_domain_classifier") 87 | 88 | for tv in train_vars: 89 | print(tv.name) 90 | 91 | self.lr = tf.Variable(0.0, trainable=False) 92 | grads=tf.gradients(cost, train_vars) 93 | grads, _ = tf.clip_by_global_norm(grads,config.max_grad_norm) 94 | optimizer = tf.train.AdagradOptimizer(self.lr) 95 | self.train_op = optimizer.apply_gradients(zip(grads, train_vars)) 96 | 97 | def assign_lr(self, session, lr_value): 98 | session.run(tf.assign(self.lr, lr_value)) 99 | 100 | class Combine_two_model: 101 | def __init__(self, is_training,share_model, config, domain_embedding, num, memories, W_a, U_a, v_a,weight1, bias1, weight2, bias2, self_Q, self_K): 102 | self.share_model=share_model 103 | self.batch_size=batch_size=config.batch_size 104 | self.memory_location=memory_location= share_model.memory_location 105 | memory=memories[num] 106 | 107 | #domain embedding layer 108 | with tf.device("/cpu:0"): 109 | #batch_size * (2*hidden_size) 110 | domain_inputs = tf.nn.embedding_lookup(domain_embedding, share_model.domains) 111 | 112 | #self attention 113 | self.score=tf.nn.softmax(tf.matmul(tf.matmul(domain_inputs, self_Q),tf.transpose(tf.matmul(domain_embedding, self_K)))) 114 | self.domain_inputs= domain_inputs= tf.matmul(self.score, domain_embedding) 115 | 116 | #compute attention scores 117 | #domain queries 118 | query_vec=tf.matmul(domain_inputs, W_a) 119 | #replicate domain queries for num_steps and reshape 120 | query_vec=tf.reshape(tf.tile(tf.expand_dims(query_vec, dim=1), [1,config.num_steps,1]), [-1, 4*config.hidden_size]) 121 | 122 | #reshape LSTM outputs to two-dimensional 123 | lstm_output=tf.transpose(share_model.lstm_output, [1, 0, 2]) 124 | reshaped_lstm_output=tf.reshape(lstm_output, [-1, 2*config.hidden_size]) 125 | 126 | #compute unnormalized scores 127 | layer1=tf.tanh(tf.add(query_vec, tf.matmul(reshaped_lstm_output, U_a))) 128 | unnormalized_scores=tf.reshape(tf.squeeze(tf.matmul(layer1, 
v_a),axis=[1]), [-1, config.num_steps]) 129 | #in order to tackle variable length 130 | sequence_mask=tf.cast(tf.sequence_mask(share_model.mask, config.num_steps), tf.float32) 131 | minimize_softmax_score=sequence_mask*1e25-1e25 132 | unnormalized_scores=unnormalized_scores*sequence_mask+minimize_softmax_score 133 | #normalize the scores 134 | self.normalized_score=normalized_score=tf.nn.softmax(unnormalized_scores) 135 | 136 | #compute weighted vectors 137 | normalized_score=tf.expand_dims(normalized_score, dim=2) 138 | combine_vector=tf.reduce_sum(normalized_score*lstm_output, axis=1) 139 | 140 | #update op for memory network 141 | self.update_memory=tf.scatter_update(memory, memory_location, combine_vector) 142 | 143 | #attention on memory samples 144 | self.samples=samples=tf.nn.softmax(tf.matmul(combine_vector,tf.transpose(memory))) 145 | self.context_vector= context_vector= tf.matmul(samples, memory) 146 | 147 | #concat both vectors 148 | combine_vector=tf.concat([context_vector, combine_vector],axis=1) 149 | 150 | #softmax matrix 151 | softmax_w = tf.get_variable("softmax_w", [4*config.hidden_size, config.num_classes]) 152 | #softmax_w = tf.get_variable("softmax_w", [2*config.hidden_size, 2]) 153 | softmax_b = tf.get_variable("softmax_b", [config.num_classes]) 154 | 155 | #add dropout to combine_vector 156 | if is_training and config.keep_prob < 1: 157 | combine_vector = tf.nn.dropout(combine_vector, config.keep_prob) 158 | 159 | logits = tf.matmul(combine_vector, softmax_w) + softmax_b 160 | 161 | #operators for prediction 162 | self.prediction=prediction=tf.argmax(logits,1) 163 | correct_prediction = tf.equal(prediction, share_model.labels) 164 | self.accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32)) 165 | 166 | #cross entropy loss 167 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=share_model.labels, logits=logits) 168 | cost = tf.reduce_sum(loss) 169 | 170 | self.cost=cost 171 | #compute grads and update 172 | tvars=tf.trainable_variables() 173 | 174 | train_vars = [var for var in tvars if 'shared_model' in var.name or "m"+str(num) in var.name] 175 | 176 | print("m"+str(num)) 177 | for tv in train_vars: 178 | print(tv.name) 179 | 180 | self.lr = tf.Variable(0.0, trainable=False) 181 | grads=tf.gradients(cost, train_vars) 182 | grads, _ = tf.clip_by_global_norm(grads,config.max_grad_norm) 183 | optimizer = tf.train.AdagradOptimizer(self.lr) 184 | self.train_op = optimizer.apply_gradients(zip(grads, train_vars)) 185 | 186 | #assign value to learning rate 187 | def assign_lr(self, session, lr_value): 188 | session.run(tf.assign(self.lr, lr_value)) 189 | 190 | class Config(object): 191 | vocab_size=10000 192 | maxlen=100 193 | num_steps = maxlen 194 | max_grad_norm = 5 195 | init_scale = 0.05 196 | hidden_size = 300 197 | lr_decay = 0.95 198 | valid_portion=0.1 199 | dataset='' 200 | batch_size=10 201 | keep_prob = 0.4 202 | #0.05 203 | learning_rate = 0.1 204 | domain_learning_rate = 0.003 205 | max_epoch =2 206 | max_max_epoch =40 207 | 208 | def get_minibatches_idx(n, batch_size, shuffle=False): 209 | """ 210 | Used to shuffle the dataset at each iteration. 
211 | """ 212 | idx_list = numpy.arange(n, dtype="int32") 213 | 214 | if shuffle: 215 | numpy.random.shuffle(idx_list) 216 | 217 | minibatches = [] 218 | minibatch_start = 0 219 | for i in range(n // batch_size): 220 | minibatches.append(idx_list[minibatch_start: 221 | minibatch_start + batch_size]) 222 | minibatch_start += batch_size 223 | 224 | if (minibatch_start != n): 225 | # Make a minibatch out of what is left 226 | minibatches.append(idx_list[-batch_size:]) 227 | return minibatches 228 | 229 | def run_pre_epoch(session, m, data, num): 230 | n_samples = data[0].shape[1] 231 | print("Running %d samples:"%(n_samples)) 232 | minibatches = get_minibatches_idx(n_samples, m.batch_size, shuffle=False) 233 | 234 | for inds in minibatches[:]: 235 | x = data[0][:,inds] 236 | mask = data[1][inds] 237 | y = data[2][inds] 238 | memory_location= data[3][inds] 239 | 240 | memory_data=session.run([m.update_memory],\ 241 | {m.share_model.input_data: x, m.share_model.mask: mask, m.share_model.labels: y,\ 242 | m.share_model.domains: numpy.array([num]*len(y)), m.share_model.memory_location: memory_location}) 243 | 244 | 245 | def run_epoch(session, m, data, eval_op, num, is_training): 246 | n_samples = data[0].shape[1] 247 | print("Running %d samples:"%(n_samples)) 248 | minibatches = get_minibatches_idx(n_samples, m.batch_size, shuffle=False) 249 | 250 | correct = 0. 251 | total = 0 252 | total_cost=0 253 | for inds in minibatches[:]: 254 | x = data[0][:,inds] 255 | mask = data[1][inds] 256 | y = data[2][inds] 257 | 258 | count, _, cost= \ 259 | session.run([m.accuracy, eval_op,m.cost],\ 260 | {m.share_model.input_data: x, m.share_model.mask: mask,m.share_model.labels: y,\ 261 | m.share_model.domains: [num]*m.batch_size}) 262 | 263 | correct += count 264 | total += len(inds) 265 | total_cost+=cost 266 | 267 | print("Total loss:") 268 | print(total_cost) 269 | accuracy = correct/total 270 | return accuracy 271 | 272 | def run_domain_classifier_epoch(session, m, data, eval_op): 273 | n_samples = data[0].shape[1] 274 | print("Running %d samples:"%(n_samples)) 275 | minibatches = get_minibatches_idx(n_samples, m.batch_size, shuffle=True) 276 | 277 | correct = 0. 
278 | total = 0 279 | total_cost=0 280 | 281 | data[2] = np.array(data[2]) 282 | for inds in minibatches[:]: 283 | print(inds) 284 | x = data[0][:,inds] 285 | mask = data[1][inds] 286 | y = data[2][inds] 287 | 288 | count, _, prediction,cost, logits= \ 289 | session.run([m.accuracy, eval_op, m.prediction, m.cost, m.logits],\ 290 | {m.share_model.input_data: x, m.share_model.mask: mask, m.share_model.domains: y}) 291 | 292 | correct += count 293 | total += len(inds) 294 | total_cost+=cost 295 | 296 | print("Total loss:") 297 | print(total_cost) 298 | accuracy = correct/total 299 | return accuracy 300 | 301 | 302 | def train_test_model(config, session, train_models, valid_models, test_models, trains, valids, tests, domain_classifier, domain_classifier_adversarial,combined_data): 303 | for i in range(config.max_max_epoch): 304 | #compute lr_decay 305 | lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0) 306 | #zip the models and data 307 | model_list=list(zip(range(len(train_models)), train_models, valid_models, trains, valids)) 308 | #reverse order 309 | if i%2==1: 310 | model_list=list(reversed(model_list)) #keep it a list so it can be iterated again below 311 | 312 | #record which model has the minimum training accuracy 313 | min_training=1.0 314 | number=-1 315 | for num, train_model, test_model, train, valid in model_list: 316 | #update memory 317 | print("Updating Memories") 318 | run_pre_epoch(session, test_model, train, num=num) 319 | 320 | 321 | #update learning rate 322 | train_model.assign_lr(session, config.learning_rate * lr_decay) 323 | 324 | #training 325 | print() 326 | print("Model: "+str(num+1)) 327 | print("Epoch: %d Learning rate: %.5f" % (i + 1, session.run(train_model.lr))) 328 | start_time = time.time() 329 | train_acc = run_epoch(session, train_model, train, train_model.train_op, num=num, is_training=True) 330 | print("Training Accuracy = %.4f, time = %.3f seconds\n"%(train_acc, time.time()-start_time)) 331 | 332 | #record minimum training accuracy 333 | if train_acc< min_training: 334 | number=num 335 | min_training=train_acc 336 | 337 | #valid 338 | valid_acc = run_epoch(session, test_model, valid, tf.no_op(), num=num, is_training=False) 339 | print("Valid Accuracy = %.4f\n" % valid_acc) 340 | 341 | #run the model with the minimum training accuracy again 342 | if number != -1: 343 | for num, train_model, test_model, train, valid in model_list: 344 | if num==number: 345 | print("Model: "+str(num+1)) 346 | print("Epoch: %d Learning rate: %.5f" % (i + 1, session.run(train_model.lr))) 347 | start_time = time.time() 348 | train_acc = run_epoch(session, train_model, train, train_model.train_op, num=num, is_training=False) 349 | print("Training Accuracy = %.4f, time = %.3f seconds\n"%(train_acc, time.time()-start_time)) 350 | 351 | 352 | #testing 353 | for num, test_model, test in zip(range(len(test_models)),test_models, tests): 354 | print(sys.argv[1+num]) 355 | test_acc = run_epoch(session, test_model, test, tf.no_op(),num=num, is_training=False) 356 | print("Test Accuracy = %.4f\n" % test_acc) 357 | #write out accuracies 358 | with open("multi_view_domain.txt", 'a') as f: 359 | f.write("Accuracy for dataset "+ sys.argv[num+1]+": "+str(test_acc)+"\n") 360 | 361 | #domain classifier training 362 | print("Domain classifier Training:") 363 | domain_classifier.assign_lr(session, config.domain_learning_rate * lr_decay) 364 | start_time = time.time() 365 | domain_train_acc = run_domain_classifier_epoch(session, domain_classifier, combined_data, domain_classifier.train_op) 366 | print("Domain Training Accuracy = %.4f, time = %.3f
seconds\n"%(domain_train_acc, time.time()-start_time)) 367 | 368 | 369 | print("Domain adversarial classifier Training:") 370 | domain_classifier_adversarial.assign_lr(session, config.domain_learning_rate * lr_decay) 371 | start_time = time.time() 372 | domain_train_acc = run_domain_classifier_epoch(session, domain_classifier_adversarial, combined_data, domain_classifier_adversarial.train_op) 373 | print("Domain Training Accuracy = %.4f, time = %.3f seconds\n"%(domain_train_acc, time.time()-start_time)) 374 | 375 | def word_to_vec(session,config, *args): 376 | f = open("vectors"+config.dataset, 'rb') 377 | matrix= numpy.array(pickle.load(f)) 378 | print("word2vec shape: ", matrix.shape) 379 | for model in args: 380 | session.run(tf.assign(model.embedding, matrix)) 381 | 382 | 383 | #combine two datasets 384 | def combine(dataset): 385 | flag=False 386 | for single_dataset in dataset: 387 | if flag==False: 388 | flag=True 389 | combined_data=[single_dataset[0], single_dataset[1],single_dataset[4]] 390 | else: 391 | combined_data=[numpy.concatenate([combined_data[0],single_dataset[0]], axis=1),numpy.concatenate([combined_data[1],single_dataset[1]]),\ 392 | numpy.concatenate([combined_data[2],single_dataset[4]])] 393 | return combined_data 394 | 395 | def get_domains(): 396 | #domains to be processed 397 | domain_list=sys.argv[1:] 398 | domain_size=len(domain_list) 399 | print(domain_size) 400 | if domain_size<=0: 401 | print("No dataset") 402 | exit(1) 403 | return domain_size, domain_list 404 | 405 | def count_labels(labels): 406 | return len(set(labels)) 407 | 408 | if __name__ == "__main__": 409 | #configs 410 | config = Config() 411 | domain_size, domain_list=get_domains() 412 | 413 | #load dataset 414 | train_datasets, valid_datasets, test_datasets=[],[],[] 415 | for domain in domain_list: 416 | train, valid, test = read_data.load_data(path='dataset'+config.dataset+'/'+domain+'/dataset',n_words=config.vocab_size, \ 417 | valid_portion=config.valid_portion, maxlen=config.maxlen) 418 | train_datasets.append(train) 419 | valid_datasets.append(valid) 420 | test_datasets.append(test) 421 | 422 | #transform dataset to matrix 423 | for index in range(domain_size): 424 | train = read_data.prepare_data(train_datasets[index][0], train_datasets[index][1], maxlen=config.maxlen, traindata=True, index=index) 425 | valid = read_data.prepare_data(valid_datasets[index][0], valid_datasets[index][1], maxlen=config.maxlen, traindata=False, index=index) 426 | test = read_data.prepare_data(test_datasets[index][0], test_datasets[index][1], maxlen=config.maxlen, traindata=False, index=index) 427 | train_datasets[index]=train 428 | valid_datasets[index]=valid 429 | test_datasets[index]=test 430 | config.num_classes = count_labels(train_datasets[0][2]) 431 | combined_data=combine(train_datasets) 432 | 433 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9) 434 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session: 435 | initializer = tf.random_normal_initializer(0, 0.05) 436 | 437 | #attention weights 438 | with tf.variable_scope("shared_model"): 439 | #domain embedding 440 | domain_embedding = tf.Variable(tf.random_normal([domain_size, 2*config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="domain_embedding") 441 | W_a = tf.Variable(tf.random_normal([2*config.hidden_size, 4*config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="W_a") 442 | U_a = tf.Variable(tf.random_normal([2*config.hidden_size, 4*config.hidden_size], 
mean=0.0, stddev=0.1, dtype=tf.float32), name="U_a") 443 | v_a = tf.Variable(tf.random_normal([4*config.hidden_size, 1], mean=0.0, stddev=0.1, dtype=tf.float32), name="v_a") 444 | 445 | 446 | #domain self-attention weights 447 | with tf.variable_scope("self_attention"): 448 | self_Q = tf.Variable(tf.random_normal([2*config.hidden_size, 2*config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="Q") 449 | self_K = tf.Variable(tf.random_normal([2*config.hidden_size, 2*config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="K") 450 | 451 | 452 | #memory network 453 | memories=[] 454 | for index, train in enumerate(train_datasets): 455 | memory = tf.Variable(tf.random_normal([len(train[3]), 2*config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32),trainable=False ,name="memory"+str(index)) 456 | memories.append(memory) 457 | 458 | #weights for domain classifier (adversarial training) 459 | with tf.variable_scope('domain_classifier'): 460 | domain_classifier_weight1 = tf.Variable(tf.random_normal([2*config.hidden_size, config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="domain_classifier1") 461 | domain_classifier_bias1 = tf.Variable(tf.random_normal([config.hidden_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="domain_classifier_bias1") 462 | 463 | domain_classifier_weight2 = tf.Variable(tf.random_normal([config.hidden_size, domain_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="domain_classifier2") 464 | domain_classifier_bias2 = tf.Variable(tf.random_normal([domain_size], mean=0.0, stddev=0.1, dtype=tf.float32), name="domain_classifier_bias2") 465 | 466 | #print memory shape 467 | print("memory shape") 468 | for index,memory in enumerate(memories): 469 | print(sys.argv[1+index]) 470 | print(memory.get_shape()) 471 | 472 | #training model for shared weights 473 | with tf.variable_scope("shared_model", reuse=None, initializer=initializer): 474 | share_model_train = EmbeddingModel(True, config=config, session=session) 475 | #testing model for shared weights 476 | with tf.variable_scope("shared_model", reuse = True, initializer=initializer): 477 | share_model_test = EmbeddingModel(False, config=config, session=session) 478 | 479 | #domain classifier 480 | domain_classifier=Domain_classifier(share_model_train, domain_classifier_weight1, domain_classifier_bias1,domain_classifier_weight2, domain_classifier_bias2,config, False) 481 | domain_classifier_adversarial=Domain_classifier(share_model_train, domain_classifier_weight1, domain_classifier_bias1,domain_classifier_weight2, domain_classifier_bias2,config, True) 482 | 483 | #build models 484 | train_models=[] 485 | test_models=[] 486 | for index in range(domain_size): 487 | with tf.variable_scope("m"+str(index), reuse = None, initializer=initializer): 488 | train_model = Combine_two_model(True,share_model_train, config, domain_embedding, index, memories, W_a, U_a,v_a, domain_classifier_weight1,domain_classifier_bias1, domain_classifier_weight2,domain_classifier_bias2, self_Q, self_K) 489 | with tf.variable_scope("m"+str(index), reuse = True, initializer=initializer): 490 | test_model = Combine_two_model(False,share_model_test, config, domain_embedding, index, memories, W_a, U_a,v_a, domain_classifier_weight1,domain_classifier_bias1, domain_classifier_weight2,domain_classifier_bias2, self_Q, self_K) 491 | train_models.append(train_model) 492 | test_models.append(test_model) 493 | 494 | #print trainable variables 495 | for v in tf.trainable_variables(): 496 | print(v.name) 497 | 498 | #initialize 
499 | init = tf.global_variables_initializer() 500 | session.run(init) 501 | 502 | #initialize share model's embedding with word2vec 503 | word_to_vec(session,config, share_model_train) 504 | #train test model 505 | train_test_model(config, session,\ 506 | train_models,test_models,test_models,\ 507 | train_datasets,valid_datasets,test_datasets, domain_classifier,domain_classifier_adversarial,combined_data) 508 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import collections 3 | import pickle 4 | import gensim 5 | from gensim.models.keyedvectors import KeyedVectors 6 | import numpy as np 7 | import re 8 | 9 | 10 | #collect the words of a file 11 | def insert_word(f): 12 | global all_words 13 | for l in f: 14 | words=re.split('\s|-',l.lower().split("|||")[0].strip()) 15 | 16 | all_words+=words 17 | 18 | #convert words to numbers 19 | def convert_words_to_number(f, dataset, labels): 20 | global common_word 21 | for l in f: 22 | try: 23 | words=re.split('\s|-',l.lower().split("|||")[0].strip()) 24 | label=l.lower().split("|||")[1].strip('\n') 25 | words=[common_word[w] if w in common_word else 1 for w in words] 26 | dataset+=[words] 27 | labels+=[label] 28 | except: #skip malformed lines without a '|||' label 29 | continue 30 | vocab=10000 31 | gap=2 32 | vocab_size=vocab-2 #reserve ids 0 (padding) and 1 (unknown words) 33 | location='./dataset/' 34 | all_words=[] 35 | 36 | #iterate all files 37 | for file in os.listdir(location): 38 | if file != '.DS_Store': 39 | with open(location+file+"/trn") as f: 40 | insert_word(f) 41 | with open(location+file+"/dev") as f: 42 | insert_word(f) 43 | 44 | #keep the most frequent words 45 | counter=collections.Counter(all_words) 46 | common_word=dict(counter.most_common(vocab_size)) 47 | 48 | #assign ids starting from 2 49 | c=2 50 | for key in common_word: 51 | common_word[key]=c 52 | c+=1 53 | print(common_word) 54 | pickle.dump(common_word, open('dictionary', 'wb')) 55 | 56 | for file in os.listdir(location): 57 | 58 | if file != '.DS_Store': 59 | train=[] 60 | train_label=[] 61 | test=[] 62 | test_label=[] 63 | with open(location+file+"/trn") as f: 64 | convert_words_to_number(f, train, train_label) 65 | 66 | with open(location+file+"/dev") as f: 67 | convert_words_to_number(f, train, train_label) 68 | with open(location+file+"/tst") as f: convert_words_to_number(f, test, test_label) #fill the held-out test split so the pickled test set is not empty 69 | pickle.dump(((train,train_label) ,(test,test_label)), open(location+file+'/dataset', 'wb')) 70 | 71 | 72 | #create embedding vector matrix 73 | word_vectors = KeyedVectors.load_word2vec_format('vectors.gz', binary=True) 74 | word2vec=[[0]*300, [0]*300] #zero vectors for index 0 (padding) and index 1 (unknown words) 75 | for number, word in sorted(zip(common_word.values(), common_word.keys())): 76 | try: 77 | print(type(word_vectors.word_vec(word))) 78 | word2vec.append(word_vectors.word_vec(word).tolist()) 79 | except KeyError: 80 | print(word+ " not found") 81 | word2vec.append([0]*300) 82 | pickle.dump(word2vec, open('vectors', 'wb')) 83 | print(len(word2vec)) 84 | 85 | print(word_vectors.word_vec('laptop')) 86 | -------------------------------------------------------------------------------- /read_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from six.moves import xrange 3 | import six.moves.cPickle as pickle 4 | 5 | import gzip 6 | import os 7 | import numpy 8 | 9 | def prepare_data(seqs, labels, maxlen, traindata, index): 10 | """Create the matrices from the datasets. 11 | 12 | This pads each sequence to the same length: the length of the 13 | longest sequence or maxlen.
14 | 15 | If maxlen is set, sequences that reach this maximum 16 | length are dropped rather than truncated. 17 | 18 | This swaps the axes! 19 | """ 20 | # x: a list of sentences 21 | lengths = [len(s) for s in seqs] 22 | 23 | if maxlen is not None: 24 | new_seqs = [] 25 | new_labels = [] 26 | new_lengths = [] 27 | for l, s, y in zip(lengths, seqs, labels): 28 | if l < maxlen: 29 | new_seqs.append(s) 30 | new_labels.append(y) 31 | new_lengths.append(l) 32 | lengths = new_lengths 33 | labels = new_labels 34 | seqs = new_seqs 35 | 36 | if len(lengths) < 1: 37 | return None, None, None 38 | 39 | n_samples = len(seqs) 40 | #maxlen = numpy.max(lengths) 41 | 42 | x = numpy.zeros((maxlen, n_samples)).astype('int64') 43 | labels = numpy.array(labels).astype('int32') 44 | for idx, s in enumerate(seqs): 45 | x[:lengths[idx], idx] = s 46 | if not traindata: 47 | return [x, numpy.array(lengths).astype('int32'), labels] 48 | else: 49 | return [x, numpy.array(lengths).astype('int32'), labels, numpy.array(range(len(lengths))), [index]*len(lengths)] 50 | 51 | def load_data(path, n_words=10000, valid_portion=0.2, maxlen=None, 52 | sort_by_len=False): 53 | 54 | if path.endswith(".gz"): 55 | f = gzip.open(path, 'rb') 56 | else: 57 | f = open(path, 'rb') 58 | 59 | train_set, test_set= pickle.load(f) 60 | f.close() 61 | if maxlen: 62 | new_train_set_x = [] 63 | new_train_set_y = [] 64 | for x, y in zip(train_set[0], train_set[1]): 65 | if len(x) < maxlen: 66 | new_train_set_x.append(x) 67 | new_train_set_y.append(y) 68 | train_set = (new_train_set_x, new_train_set_y) 69 | del new_train_set_x, new_train_set_y 70 | 71 | # split off a validation set from the training set 72 | train_set_x, train_set_y = train_set 73 | n_samples = len(train_set_x) 74 | sidx = numpy.random.permutation(n_samples) 75 | n_train = int(numpy.round(n_samples * (1. - valid_portion))) 76 | valid_set_x = [train_set_x[s] for s in sidx[n_train:]] 77 | valid_set_y = [train_set_y[s] for s in sidx[n_train:]] 78 | train_set_x = [train_set_x[s] for s in sidx[:n_train]] 79 | train_set_y = [train_set_y[s] for s in sidx[:n_train]] 80 | 81 | train_set = (train_set_x, train_set_y) 82 | valid_set = (valid_set_x, valid_set_y) 83 | 84 | def remove_unk(x): 85 | return [[1 if w >= n_words else w for w in sen] for sen in x] 86 | 87 | test_set_x, test_set_y = test_set 88 | valid_set_x, valid_set_y = valid_set 89 | train_set_x, train_set_y = train_set 90 | 91 | train_set_x = remove_unk(train_set_x) 92 | valid_set_x = remove_unk(valid_set_x) 93 | test_set_x = remove_unk(test_set_x) 94 | 95 | def len_argsort(seq): 96 | return sorted(range(len(seq)), key=lambda x: len(seq[x])) 97 | 98 | if sort_by_len: 99 | sorted_index = len_argsort(test_set_x) 100 | test_set_x = [test_set_x[i] for i in sorted_index] 101 | test_set_y = [test_set_y[i] for i in sorted_index] 102 | 103 | sorted_index = len_argsort(valid_set_x) 104 | valid_set_x = [valid_set_x[i] for i in sorted_index] 105 | valid_set_y = [valid_set_y[i] for i in sorted_index] 106 | 107 | sorted_index = len_argsort(train_set_x) 108 | train_set_x = [train_set_x[i] for i in sorted_index] 109 | train_set_y = [train_set_y[i] for i in sorted_index] 110 | 111 | train = [train_set_x, train_set_y] 112 | valid = [valid_set_x, valid_set_y] 113 | test = [test_set_x, test_set_y] 114 | 115 | return train, valid, test --------------------------------------------------------------------------------
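Usage note (added for this write-up, not part of the original repository): the short sketch below shows what read_data.prepare_data returns on a couple of toy word-id sequences. It illustrates why the models above declare their input placeholders as [num_steps, batch_size]: the matrix is zero-padded to maxlen and the axes are swapped so that each column is one sentence. The toy sequences, labels and the maxlen value are invented for illustration.

import read_data

# two toy sentences encoded as word ids, with their sentiment labels
seqs = [[4, 7, 2], [9, 3]]
labels = [1, 0]

# traindata=False returns [x, lengths, labels]; traindata=True would additionally
# return the per-sample memory locations and the domain index used by the
# adversarial script.
x, lengths, y = read_data.prepare_data(seqs, labels, maxlen=5,
                                        traindata=False, index=0)

print(x.shape)   # (5, 2): one zero-padded column per sentence
print(lengths)   # [3 2]: true lengths, fed to the models' 'mask' placeholder
print(y)         # [1 0]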