├── README.md
├── imdb.py
└── tf_lstm.py

/README.md:
--------------------------------------------------------------------------------
# tensorflowtrials
## Working sentiment analysis and other small experiments using TensorFlow

The module tf_lstm.py performs sentiment analysis as done in Theano's sentiment
analysis tutorial (http://deeplearning.net/tutorial/lstm.html). tf_lstm.py is a
modification of ptb_word_lm.py from TensorFlow's Language Modeling tutorial,
reworked to perform sentiment analysis. An accuracy of 80% is achieved after
~60 epochs.
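
A minimal usage sketch (assuming a TensorFlow version with the pre-1.0 API and
network access, since imdb.pkl is downloaded on first run):

    python tf_lstm.py

Training and validation accuracy are printed for each epoch, and test accuracy
at the end; the hyperparameters live in the Config class in tf_lstm.py.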

--------------------------------------------------------------------------------
/imdb.py:
--------------------------------------------------------------------------------
'''
Slight variation on imdb.py from the Theano LSTM sentiment analysis tutorial,
adapted for use in the TensorFlow sentiment analysis implementation.
'''
from __future__ import print_function
from six.moves import xrange
import six.moves.cPickle as pickle

import gzip
import os
import numpy


def prepare_data(seqs, labels, maxlen=None):
    """Create the matrices from the dataset.

    This pads each sequence to the same length: the length of the
    longest sequence or maxlen.

    If maxlen is set, all sequences are cut to this maximum length.

    This swaps the axes!
    """
    # x: a list of sentences
    lengths = [len(s) for s in seqs]

    if maxlen is not None:
        new_seqs = []
        new_labels = []
        new_lengths = []
        for l, s, y in zip(lengths, seqs, labels):
            if l < maxlen:
                new_seqs.append(s)
                new_labels.append(y)
                new_lengths.append(l)
        lengths = new_lengths
        labels = new_labels
        seqs = new_seqs

    if len(lengths) < 1:
        return None, None, None

    n_samples = len(seqs)
    # maxlen = numpy.max(lengths)  # Causes an error: the placeholder input size must be fixed

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype('float32')
    labels = numpy.array(labels).astype('int32')
    for idx, s in enumerate(seqs):
        x[:lengths[idx], idx] = s
        x_mask[:lengths[idx], idx] = 1.

    return x, x_mask, labels


def get_dataset_file(dataset, default_dataset, origin):
    '''Look for the dataset as if it were a full path; if not found, try a
    local file, then the data directory.

    Download the dataset if it is not present.
    '''
    data_dir, data_file = os.path.split(dataset)

    # if data_dir == "" and not os.path.isfile(dataset):
    #     # Check if dataset is in the data directory.
    #     new_path = os.path.join(
    #         os.path.split(__file__)[0],
    #         "..",
    #         "data",
    #         dataset
    #     )
    #     if os.path.isfile(new_path) or data_file == default_dataset:
    #         dataset = new_path
    #
    # if (not os.path.isfile(dataset)) and data_file == default_dataset:
    #     from six.moves import urllib
    #     print('Downloading data from %s' % origin)
    #     print('dataset : %s' % dataset)
    #     urllib.request.urlretrieve(origin, dataset)
    # CHANGE dataset TO THE LOCATION FOR THE FILE TO BE DOWNLOADED TO

    if not os.path.isfile(dataset):
        if data_dir == "":
            dataset = os.path.join(os.path.split(__file__)[0], dataset)
        from six.moves import urllib
        print('Downloading data from %s' % origin)
        print('dataset : %s' % dataset)
        urllib.request.urlretrieve(origin, dataset)

    return dataset


def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
              sort_by_len=True):
    '''Loads the dataset.

    :type path: String
    :param path: The path to the dataset (here IMDB)
    :type n_words: int
    :param n_words: The number of words to keep in the vocabulary.
        All extra words are set to unknown (1).
    :type valid_portion: float
    :param valid_portion: The proportion of the full train set used for
        the validation set.
    :type maxlen: None or positive int
    :param maxlen: the max sequence length we use in the train/valid set.
    :type sort_by_len: bool
    :param sort_by_len: Sort by sequence length for the train,
        valid and test sets. This allows faster execution, as it causes
        less padding per minibatch. Another mechanism must be used to
        shuffle the train set at each epoch.
    '''

    #############
    # LOAD DATA #
    #############

    # Load the dataset
    path = get_dataset_file(
        path, "imdb.pkl",
        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")

    if path.endswith(".gz"):
        f = gzip.open(path, 'rb')
    else:
        f = open(path, 'rb')

    train_set = pickle.load(f)
    test_set = pickle.load(f)
    f.close()
    if maxlen:
        new_train_set_x = []
        new_train_set_y = []
        for x, y in zip(train_set[0], train_set[1]):
            if len(x) < maxlen:
                new_train_set_x.append(x)
                new_train_set_y.append(y)
        train_set = (new_train_set_x, new_train_set_y)
        del new_train_set_x, new_train_set_y

    # split the training set into train and validation sets
    train_set_x, train_set_y = train_set
    n_samples = len(train_set_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
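    # Illustrative note (added for clarity): with n_samples = 100 and
    # valid_portion = 0.1, n_train is 90, so the slicing below keeps the first
    # 90 shuffled indices for training and the last 10 for validation.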
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    train_set = (train_set_x, train_set_y)
    valid_set = (valid_set_x, valid_set_y)

    def remove_unk(x):
        return [[1 if w >= n_words else w for w in sen] for sen in x]

    test_set_x, test_set_y = test_set
    valid_set_x, valid_set_y = valid_set
    train_set_x, train_set_y = train_set

    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)

    return train, valid, test

--------------------------------------------------------------------------------
/tf_lstm.py:
--------------------------------------------------------------------------------
'''
A TensorFlow implementation of the Theano LSTM sentiment analysis tutorial.
The model is a variation on ptb_word_lm.py from TensorFlow's Language Modeling
tutorial, adapted to perform sentiment analysis on the IMDB dataset.
'''
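# Note (added for clarity): this file targets the pre-1.0 TensorFlow API, e.g.
# tf.nn.rnn_cell.BasicLSTMCell, tf.concat(dim, values) with the axis argument
# first, and tf.initialize_all_variables(); later TensorFlow releases renamed,
# moved, or changed the signatures of these functions.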

from __future__ import print_function
import six.moves.cPickle as pickle

from collections import OrderedDict
import sys
import time

import numpy
import tensorflow as tf
import imdb


class SentimentModel(object):

    def __init__(self, is_training, config):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        self.input_data = tf.placeholder(tf.int32, [num_steps, batch_size], name="inputs")
        self.mask = tf.placeholder(tf.float32, [num_steps, batch_size], name="mask")
        self.labels = tf.placeholder(tf.int64, [batch_size], name="labels")
        mask = tf.expand_dims(self.mask, -1)
        labels = self.labels
        # mask = tf.transpose(self._mask)
        # mask_expand = tf.tile(mask, tf.pack([1, 1, size]))
        # self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        # add the LSTM cell and dropout nodes
        cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
        if is_training and config.keep_prob < 1:
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=config.keep_prob)

        self.initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size])
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # add dropout to the input units
        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[time_step, :, :], state)
                outputs.append(tf.expand_dims(cell_output, 0))

        # mask out padded timesteps and average the LSTM outputs over the
        # real (unpadded) length of each sequence
        outputs = tf.concat(0, outputs) * mask
        mask_sum = tf.reduce_sum(mask, 0)
        proj = tf.reduce_sum(outputs, 0) / mask_sum
        # NOW proj has shape [batch_size, size]

        # Note: softmax_w keeps vocab_size output units (inherited from
        # ptb_word_lm.py), although the sentiment labels only take the
        # values 0 and 1.
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(proj, softmax_w) + softmax_b
        pred = tf.nn.softmax(logits)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)
        self.cost = cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state
        correct_prediction = tf.equal(tf.argmax(pred, 1), labels)
        self.accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32))

        if not is_training:
            return

        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), config.max_grad_norm)
        # optimizer = tf.train.GradientDescentOptimizer(self.lr)
        optimizer = tf.train.AdagradOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    def assign_lr(self, session, lr_value):
        session.run(tf.assign(self.lr, lr_value))
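
# Illustrative usage sketch (added for clarity; it mirrors main() below):
#
#     config = Config()
#     initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
#     with tf.variable_scope("model", reuse=None, initializer=initializer):
#         m = SentimentModel(is_training=True, config=config)
#
# The placeholders m.input_data, m.mask and m.labels are then fed with the
# time-major arrays produced by imdb.prepare_data (see run_epoch below).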


class Config(object):
    patience = 10  # Number of epochs to wait before early stopping if no progress
    dispFreq = 10  # Display the training progress to stdout every N updates
    decay_c = 0.  # Weight decay for the classifier applied to the U weights.
    lrate = 0.0001  # Learning rate for sgd (not used for adadelta and rmsprop)
    vocab_size = 10000  # Vocabulary size
    encoder = 'lstm'  # TODO: can be removed; must be lstm.
    saveto = 'lstm_model.npz'  # The best model will be saved there
    validFreq = 370  # Compute the validation error after this number of updates.
    saveFreq = 1110  # Save the parameters after every saveFreq updates
    maxlen = 100  # Sequences longer than this get ignored
    batch_size = 20  # The batch size during training.
    dataset = 'imdb'  # Parameter for extra option
    noise_std = 0.
    use_dropout = True  # If False, slightly faster but worse test error; this frequently needs a bigger model.
    reload_model = None  # Path to a saved model we want to start from.

    init_scale = 0.05
    learning_rate = 1.0
    max_grad_norm = 5
    num_layers = 1
    num_steps = 100
    hidden_size = 128
    max_epoch = 6
    max_max_epoch = 75
    keep_prob = 0.5
    lr_decay = 0.95


def get_minibatches_idx(n, batch_size, shuffle=False):
    """
    Used to shuffle the dataset at each iteration.
    """

    idx_list = numpy.arange(n, dtype="int32")

    if shuffle:
        numpy.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for i in range(n // batch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + batch_size])
        minibatch_start += batch_size

    if (minibatch_start != n):
        # Make a minibatch out of what is left
        minibatches.append(idx_list[minibatch_start:])

    return minibatches
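
# Illustrative sketch (added for clarity): get_minibatches_idx(7, 3) returns
#
#     [array([0, 1, 2]), array([3, 4, 5]), array([6])]
#
# i.e. n // batch_size full minibatches plus a smaller remainder batch.
# run_epoch below iterates over minibatches[:-1], dropping the remainder
# because the model's placeholders require a fixed batch_size.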


def run_epoch(session, m, data, eval_op, verbose=False):
    print("batch size", m.batch_size)
    state = m.initial_state.eval()
    n_samples = data[0].shape[1]
    print("Testing %d samples:" % (n_samples))

    minibatches = get_minibatches_idx(n_samples, m.batch_size, shuffle=True)
    n_batches = len(minibatches) - 1
    b_ind = 0
    correct = 0.
    total = 0

    for inds in minibatches[:-1]:
        print("\rbatch %d / %d" % (b_ind, n_batches), end="")
        sys.stdout.flush()

        x = data[0][:, inds]
        mask = data[1][:, inds]
        y = data[2][inds]

        cost, state, count, _ = session.run([m.cost, m.final_state, m.accuracy, eval_op],
                                            {m.input_data: x, m.mask: mask, m.labels: y,
                                             m.initial_state: state})
        correct += count
        total += len(inds)
        b_ind += 1

    print("")
    accuracy = correct / total
    return accuracy


def get_config():
    return Config()


def main(unused_args):

    maxlen = 100
    n_words = 10000

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05, maxlen=maxlen)

    train = imdb.prepare_data(train[0], train[1], maxlen=maxlen)
    valid = imdb.prepare_data(valid[0], valid[1], maxlen=maxlen)
    test = imdb.prepare_data(test[0], test[1], maxlen=maxlen)

    for data in [train, valid, test]:
        print(data[0].shape, data[1].shape, data[2].shape)

    config = get_config()
    eval_config = get_config()
    # eval_config.batch_size = 1
    # eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = SentimentModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = SentimentModel(is_training=False, config=config)
            mtest = SentimentModel(is_training=False, config=config)

        tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            start_time = time.time()
            train_acc = run_epoch(session, m, train, m.train_op)
            print("Training Accuracy = %.4f, time = %.3f seconds\n" % (train_acc, time.time() - start_time))
            valid_acc = run_epoch(session, mvalid, valid, tf.no_op())
            print("Valid Accuracy = %.4f\n" % valid_acc)

        test_acc = run_epoch(session, mtest, test, tf.no_op())
        print("Test Accuracy = %.4f\n" % test_acc)


if __name__ == '__main__':
    tf.app.run()

--------------------------------------------------------------------------------