├── README.md
├── imdb.py
└── tf_lstm.py

/README.md:
--------------------------------------------------------------------------------
# tensorflowtrials
## Working sentiment analysis and other small experiments using TensorFlow

The module tf_lstm.py performs sentiment analysis as done in Theano's sentiment
analysis tutorial (http://deeplearning.net/tutorial/lstm.html). tf_lstm.py is a
modification of ptb_word_lm.py from TensorFlow's Language Modeling tutorial,
reworked to perform sentiment analysis. An accuracy of 80% is achieved after
~60 epochs.
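
A minimal usage sketch (assuming a TensorFlow version with the pre-1.0 API and
network access, since imdb.pkl is downloaded on first run):

    python tf_lstm.py

Training and validation accuracy are printed for each epoch, and test accuracy
at the end; the hyperparameters live in the Config class in tf_lstm.py.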

--------------------------------------------------------------------------------
/imdb.py:
--------------------------------------------------------------------------------
'''
Slight variation on imdb.py from the Theano LSTM sentiment analysis tutorial,
adapted for use in the TensorFlow sentiment analysis implementation.
'''
from __future__ import print_function
from six.moves import xrange
import six.moves.cPickle as pickle

import gzip
import os
import numpy


def prepare_data(seqs, labels, maxlen=None):
    """Create the matrices from the dataset.

    This pads each sequence to the same length: the length of the
    longest sequence or maxlen.

    If maxlen is set, all sequences are cut to this maximum length.

    This swaps the axes!
    """
    # x: a list of sentences
    lengths = [len(s) for s in seqs]

    if maxlen is not None:
        new_seqs = []
        new_labels = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, labels):
            if l < maxlen:
                new_seqs.append(s)
                new_labels.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        labels = new_labels
        seqs = new_seqs

    if len(lengths) < 1:
        return None, None, None

    n_samples = len(seqs)
    # maxlen = numpy.max(lengths)  # Causes an error: the placeholder input size must be fixed

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    labels = numpy.array(labels).astype('int32')
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = s
        x_mask[:lengths[idx], idx] = 1.

    return x, x_mask, labels


def get_dataset_file(dataset, default_dataset, origin):
    '''Look for the dataset as if it were a full path; if not found, try a
    local file, then the data directory.

    Download the dataset if it is not present.
    '''
    data_dir, data_file = os.path.split(dataset)

    # if data_dir == "" and not os.path.isfile(dataset):
    #     # Check if dataset is in the data directory.
    #     new_path = os.path.join(
    #         os.path.split(__file__)[0],
    #         "..",
    #         "data",
    #         dataset
    #     )
    #     if os.path.isfile(new_path) or data_file == default_dataset:
    #         dataset = new_path
    #
    # if (not os.path.isfile(dataset)) and data_file == default_dataset:
    #     from six.moves import urllib
    #     print('Downloading data from %s' % origin)
    #     print('dataset : %s' % dataset)
    #     urllib.request.urlretrieve(origin, dataset)
    # CHANGE dataset TO THE LOCATION FOR THE FILE TO BE DOWNLOADED TO

    if not os.path.isfile(dataset):
        if data_dir == "":
            dataset = os.path.join(os.path.split(__file__)[0], dataset)
        from six.moves import urllib
        print('Downloading data from %s' % origin)
        print('dataset : %s' % dataset)
        urllib.request.urlretrieve(origin, dataset)

    return dataset


def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
              sort_by_len=True):
    '''Loads the dataset.

    :type path: String
    :param path: The path to the dataset (here IMDB)
    :type n_words: int
    :param n_words: The number of words to keep in the vocabulary.
        All extra words are set to unknown (1).
    :type valid_portion: float
    :param valid_portion: The proportion of the full train set used for
        the validation set.
    :type maxlen: None or positive int
    :param maxlen: the max sequence length we use in the train/valid set.
    :type sort_by_len: bool
    :param sort_by_len: Sort by sequence length for the train,
        valid and test sets. This allows faster execution, as it causes
        less padding per minibatch. Another mechanism must be used to
        shuffle the train set at each epoch.
    '''

    #############
    # LOAD DATA #
    #############

    # Load the dataset
    path = get_dataset_file(
        path, "imdb.pkl",
        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")

    if path.endswith(".gz"):
        f = gzip.open(path, 'rb')
    else:
        f = open(path, 'rb')

    train_set = pickle.load(f)
    test_set = pickle.load(f)
    f.close()
    if maxlen:
        new_train_set_x = []
        new_train_set_y = []
        for x, y in zip(train_set[0], train_set[1]):
            if len(x) < maxlen:
                new_train_set_x.append(x)
                new_train_set_y.append(y)
        train_set = (new_train_set_x, new_train_set_y)
        del new_train_set_x, new_train_set_y

    # split the training set into train and validation sets
    train_set_x, train_set_y = train_set
    n_samples = len(train_set_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
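    # Illustrative note (added for clarity): with n_samples = 100 and
    # valid_portion = 0.1, n_train is 90, so the slicing below keeps the first
    # 90 shuffled indices for training and the last 10 for validation.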
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    train_set = (train_set_x, train_set_y)
    valid_set = (valid_set_x, valid_set_y)

    def remove_unk(x):
        return [[1 if w >= n_words else w for w in sen] for sen in x]

    test_set_x, test_set_y = test_set
    valid_set_x, valid_set_y = valid_set
    train_set_x, train_set_y = train_set

    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)

    return train, valid, test

--------------------------------------------------------------------------------
/tf_lstm.py:
--------------------------------------------------------------------------------
'''
A TensorFlow implementation of the Theano LSTM sentiment analysis tutorial.
The model is a variation on ptb_word_lm.py from TensorFlow's Language Modeling
tutorial, adapted to perform sentiment analysis on the IMDB dataset.
'''
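# Note (added for clarity): this file targets the pre-1.0 TensorFlow API, e.g.
# tf.nn.rnn_cell.BasicLSTMCell, tf.concat(dim, values) with the axis argument
# first, and tf.initialize_all_variables(); later TensorFlow releases renamed,
# moved, or changed the signatures of these functions.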

from __future__ import print_function
import six.moves.cPickle as pickle

from collections import OrderedDict
import sys
import time

import numpy
import tensorflow as tf
import imdb


class SentimentModel(object):

    def __init__(self, is_training, config):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        self.input_data = tf.placeholder(tf.int32, [num_steps, batch_size], name="inputs")
        self.mask = tf.placeholder(tf.float32, [num_steps, batch_size], name="mask")
        self.labels = tf.placeholder(tf.int64, [batch_size], name="labels")
        mask = tf.expand_dims(self.mask, -1)
        labels = self.labels
        # mask = tf.transpose(self._mask)
        # mask_expand = tf.tile(mask, tf.pack([1, 1, size]))
        # self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        # add the LSTM cell and dropout nodes
        cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
        if is_training and config.keep_prob < 1:
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=config.keep_prob)

        self.initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size])
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # add dropout to the input units
        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[time_step, :, :], state)
                outputs.append(tf.expand_dims(cell_output, 0))

        # mask out padded timesteps and average the LSTM outputs over the
        # real (unpadded) length of each sequence
        outputs = tf.concat(0, outputs) * mask
        mask_sum = tf.reduce_sum(mask, 0)
        proj = tf.reduce_sum(outputs, 0) / mask_sum
        # NOW proj has shape [batch_size, size]

        # Note: softmax_w keeps vocab_size output units (inherited from
        # ptb_word_lm.py), although the sentiment labels only take the
        # values 0 and 1.
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(proj, softmax_w) + softmax_b
        pred = tf.nn.softmax(logits)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)
        self.cost = cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state
        correct_prediction = tf.equal(tf.argmax(pred, 1), labels)
        self.accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32))

        if not is_training:
            return

        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), config.max_grad_norm)
        # optimizer = tf.train.GradientDescentOptimizer(self.lr)
        optimizer = tf.train.AdagradOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    def assign_lr(self, session, lr_value):
        session.run(tf.assign(self.lr, lr_value))
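
# Illustrative usage sketch (added for clarity; it mirrors main() below):
#
#     config = Config()
#     initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
#     with tf.variable_scope("model", reuse=None, initializer=initializer):
#         m = SentimentModel(is_training=True, config=config)
#
# The placeholders m.input_data, m.mask and m.labels are then fed with the
# time-major arrays produced by imdb.prepare_data (see run_epoch below).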


class Config(object):
    patience = 10  # Number of epochs to wait before early stopping if no progress
    dispFreq = 10  # Display the training progress to stdout every N updates
    decay_c = 0.  # Weight decay for the classifier applied to the U weights.
    lrate = 0.0001  # Learning rate for sgd (not used for adadelta and rmsprop)
    vocab_size = 10000  # Vocabulary size
    encoder = 'lstm'  # TODO: can be removed; must be lstm.
    saveto = 'lstm_model.npz'  # The best model will be saved there
    validFreq = 370  # Compute the validation error after this number of updates.
    saveFreq = 1110  # Save the parameters after every saveFreq updates
    maxlen = 100  # Sequences longer than this get ignored
    batch_size = 20  # The batch size during training.
    dataset = 'imdb'  # Parameter for extra option
    noise_std = 0.
    use_dropout = True  # If False, slightly faster but worse test error; this frequently needs a bigger model.
    reload_model = None  # Path to a saved model we want to start from.

    init_scale = 0.05
    learning_rate = 1.0
    max_grad_norm = 5
    num_layers = 1
    num_steps = 100
    hidden_size = 128
    max_epoch = 6
    max_max_epoch = 75
    keep_prob = 0.5
    lr_decay = 0.95


def get_minibatches_idx(n, batch_size, shuffle=False):
    """
    Used to shuffle the dataset at each iteration.
    """

    idx_list = numpy.arange(n, dtype="int32")

    if shuffle:
        numpy.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for i in range(n // batch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + batch_size])
        minibatch_start += batch_size

    if (minibatch_start != n):
        # Make a minibatch out of what is left
        minibatches.append(idx_list[minibatch_start:])

    return minibatches
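
# Illustrative sketch (added for clarity): get_minibatches_idx(7, 3) returns
#
#     [array([0, 1, 2]), array([3, 4, 5]), array([6])]
#
# i.e. n // batch_size full minibatches plus a smaller remainder batch.
# run_epoch below iterates over minibatches[:-1], dropping the remainder
# because the model's placeholders require a fixed batch_size.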


def run_epoch(session, m, data, eval_op, verbose=False):
    print("batch size", m.batch_size)
    state = m.initial_state.eval()
    n_samples = data[0].shape[1]
    print("Testing %d samples:" % (n_samples))

    minibatches = get_minibatches_idx(n_samples, m.batch_size, shuffle=True)
    n_batches = len(minibatches) - 1
    b_ind = 0
    correct = 0.
    total = 0

    for inds in minibatches[:-1]:
        print("\rbatch %d / %d" % (b_ind, n_batches), end="")
        sys.stdout.flush()

        x = data[0][:, inds]
        mask = data[1][:, inds]
        y = data[2][inds]

        cost, state, count, _ = session.run([m.cost, m.final_state, m.accuracy, eval_op],
                                            {m.input_data: x, m.mask: mask, m.labels: y,
                                             m.initial_state: state})
        correct += count
        total += len(inds)
        b_ind += 1

    print("")
    accuracy = correct / total
    return accuracy


def get_config():
    return Config()


def main(unused_args):

    maxlen = 100
    n_words = 10000

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05, maxlen=maxlen)

    train = imdb.prepare_data(train[0], train[1], maxlen=maxlen)
    valid = imdb.prepare_data(valid[0], valid[1], maxlen=maxlen)
    test = imdb.prepare_data(test[0], test[1], maxlen=maxlen)

    for data in [train, valid, test]:
        print(data[0].shape, data[1].shape, data[2].shape)

    config = get_config()
    eval_config = get_config()
    # eval_config.batch_size = 1
    # eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = SentimentModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = SentimentModel(is_training=False, config=config)
            mtest = SentimentModel(is_training=False, config=config)

        tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            start_time = time.time()
            train_acc = run_epoch(session, m, train, m.train_op)
            print("Training Accuracy = %.4f, time = %.3f seconds\n" % (train_acc, time.time() - start_time))
            valid_acc = run_epoch(session, mvalid, valid, tf.no_op())
            print("Valid Accuracy = %.4f\n" % valid_acc)

        test_acc = run_epoch(session, mtest, test, tf.no_op())
        print("Test Accuracy = %.4f\n" % test_acc)


if __name__ == '__main__':
    tf.app.run()

--------------------------------------------------------------------------------