├── data_read.py ├── neural_net.py └── readme.md /data_read.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | import csv 4 | from tqdm import tqdm 5 | import pickle 6 | from nltk.tokenize import word_tokenize 7 | from nltk.stem import WordNetLemmatizer 8 | 9 | lemmatizer = WordNetLemmatizer() 10 | import numpy as np 11 | 12 | def init_process(fin, fout): 13 | """ 14 | converts the raw sentiment140 data into required format. 15 | Format has 3 category labels and then the tweet 16 | the seperator used is '|' and any use of the character '|' in tweets has ben replaced with a space 17 | This had to be done instead of multi char seperator as both python's csv module and tensorflow's csv decoder require single character seperator 18 | Args: 19 | fin: string filename of input file 20 | fout: string filename of output file 21 | Returns: 22 | None 23 | Creates: 24 | csv file named fout 25 | """ 26 | outfile = open(fout, 'a') 27 | with open(fin, buffering=200000, encoding='latin-1') as f: 28 | csvreader = csv.reader(f, quotechar='"') 29 | for line in csvreader: 30 | #the tweet is in the last column 31 | #replace any " in the tweet with null 32 | line[-1] = line[-1].replace('"','') 33 | #replace any "|" in the tweet with space as "|" will be used as a delimiter later 34 | line[-1] = line[-1].replace("|"," ") 35 | #The category is in the first column 36 | initial_polarity = line[0] 37 | 38 | # As mentioned above, in the markdown, using one-hot vector does not work 39 | # However, I found later that the issue was that the record_defaults parameter in 40 | # tensorflow's decode_csv can't take a list 41 | # so we convert the one-hot list in the old csv to 3 columns showing the category as 1 or 0 42 | if initial_polarity == '0': 43 | initial_polarity = '1|0|0' 44 | elif initial_polarity == '2': 45 | initial_polarity = '0|1|0' 46 | elif initial_polarity == '4': 47 | initial_polarity = '0|0|1' 48 | 49 | tweet = line[-1] 50 | outline = initial_polarity +'|'+tweet+'\n' 51 | outfile.write(outline) 52 | 53 | outfile.close() 54 | 55 | def file_len(fname): 56 | with open(fname) as f: 57 | for i, l in enumerate(f): 58 | pass 59 | return i + 1 60 | 61 | def create_lexicon(fin): 62 | """ 63 | Creates lexocon 64 | """ 65 | lexicon = [] 66 | with open(fin, 'r', buffering=100000, encoding='latin-1') as f: 67 | try: 68 | counter = 1 69 | content = '' 70 | for line in tqdm(f,total=file_len(fin)): 71 | counter += 1 72 | if (counter/2500.0).is_integer(): 73 | tweet = line.split('|')[3] 74 | content += ' '+tweet 75 | words = word_tokenize(content) 76 | words = [lemmatizer.lemmatize(i) for i in words] 77 | lexicon = list(set(lexicon + words)) 78 | # print(counter, len(lexicon)) 79 | 80 | except Exception as e: 81 | print(str(e)) 82 | 83 | with open('lexicon.pickle','wb') as f: 84 | pickle.dump(lexicon,f) 85 | 86 | def read_to_vec(fin, lexicon): 87 | """generates a vector for a line(USE GENERATOR) 88 | this function is just for demonstration purposes, to show how the feature vector and category vector should be returned later in tensorflow. 89 | it works but it will be slower 90 | Args: 91 | fin: filename to read from 92 | lexicon: the lexicon, premade using training data and loaded from pickle file 93 | Returns: 94 | None 95 | Yields: 96 | labels: one hot labels from csv 97 | features: feature vector from csv 98 | """ 99 | with open(fin, 'r') as f: 100 | csvreader = csv.reader(f,delimiter='|') 101 | for line in csvreader: 102 | current_words = word_tokenize(line[-1].lower()) 103 | current_words = [lemmatizer.lemmatize(i) for i in current_words] 104 | features = np.zeros(len(lexicon)) 105 | for word in current_words: 106 | if word.lower() in lexicon: 107 | index_value = lexicon.index(word.lower()) 108 | features[index_value] += 1 109 | labels = list(int(val) for val in (line[0:3])) 110 | yield(labels,list(features)) #every time the function is called, return a label, features pair is returned 111 | 112 | 113 | def read_record(filename_queue): 114 | """ 115 | Outputs tensorflow ops to read one tweet and one set of labels 116 | Args: 117 | filename_queue: tensorflow queue that contains the files to be read 118 | lexicon: the lexicon, premade using training data and loaded from pickle file 119 | sess: the tensorflow session to run this in 120 | Returns: 121 | feature_op: tensorflow op that returns tweet from one record 122 | label_op: tensorflow op that returns one-hot label vector in numpy array of shape 3 from one record 123 | """ 124 | #filename_queue = tf.train.string_input_producer([fin]) 125 | 126 | reader = tf.TextLineReader() 127 | key, value = reader.read(filename_queue) 128 | #default value of positive label 129 | cat1_default = tf.constant([0]) 130 | #default value of neutral label 131 | cat2_default = tf.constant([1]) 132 | #default value of negative label 133 | cat3_default = tf.constant([0]) 134 | #default value of tweet 135 | tweet_default = tf.constant(['default tweet']) 136 | #combine the default values in a list 137 | record_defaults = [cat1_default,cat2_default,cat3_default, tweet_default] 138 | #get ops to read the values 139 | cat1, cat2, cat3, raw_tweet_op = tf.decode_csv(value, record_defaults=record_defaults,field_delim='|') 140 | #combine the labels into one list 141 | label_op = tf.stack([cat1,cat2,cat3]) 142 | 143 | return raw_tweet_op, label_op 144 | 145 | 146 | def get_vector(filename_queue,lexicon,sess,coord,threads): 147 | """ 148 | Does preprocessing to convert raw_tweet returned from the tensorflow reader to vector 149 | Made for the batch reading but does not currently work 150 | Args: 151 | filename_queue: tensorflow filename_queue that feeds in the files to process 152 | lexicon: the lexicon, premade using training data and loaded from pickle file 153 | Returns: 154 | features: feature vector showing if a word is in the tweet 155 | label: labels showing the sentiment polarity 156 | """ 157 | raw_tweet_op,label_op = read_record(filename_queue) 158 | with tf.Session() as sess: 159 | # sess.run(tf.initialize_local_variables()) 160 | # coord = tf.train.Coordinator() 161 | # threads = tf.train.start_queue_runners(coord = coord) 162 | sess.run(tf.initialize_local_variables()) 163 | raw_tweet, label = sess.run([raw_tweet_op, label_op]) 164 | # print(label) 165 | print(raw_tweet) 166 | coord.request_stop() 167 | coord.join(threads) 168 | 169 | # tflexicon = tf.constant(lexicon) 170 | tweet = str(raw_tweet,'utf-8') 171 | features = np.zeros(len(lexicon)) 172 | for word in tweet: 173 | if word.lower() in lexicon: 174 | index_value = lexicon.index(word.lower()) 175 | features[index_value] += 1 176 | # print(features) 177 | return list(features), label 178 | 179 | 180 | def input_pipeline(filename_queue, batch_size, lexicon,sess,coord,threads,num_epochs=None): 181 | """ 182 | Reads multiple lines of data and then creates a batch of size batch_size 183 | NOTE: does not currently work 184 | Args: 185 | filename_queue: the tensorflow queue from which to read the file 186 | batch_size: the size of the batch to output 187 | lexicon: the lexicon containing the words 188 | sess: the tensorflow session in which to run the operations 189 | coord: the coordinator used in the session 190 | threads: the threads used in the session 191 | num_epochs: the number of epochs to run this for 192 | Returns: 193 | example_batch: batch of example feature vectors 194 | label_batch: batch of labels 195 | """ 196 | # filename_queue = tf.train.string_input_producer(filenames, num_epochs=num_epochs, shuffle=True) 197 | #get one preprosed example and one label 198 | example, label = get_vector(filename_queue,lexicon,sess, coord, threads) 199 | # min_after_dequeue defines how big a buffer we will randomly sample 200 | # from -- bigger means better shuffling but slower start up and more 201 | # memory used. 202 | # capacity must be larger than min_after_dequeue and the amount larger 203 | # determines the maximum we will prefetch. Recommendation: 204 | # min_after_dequeue + (num_threads + a small safety margin) * batch_size 205 | min_after_dequeue = 10000 206 | capacity = min_after_dequeue + 3 * batch_size 207 | example_batch, label_batch = tf.train.shuffle_batch([example, label], batch_size=batch_size, capacity=capacity, 208 | min_after_dequeue=min_after_dequeue) 209 | 210 | return example_batch, label_batch 211 | 212 | 213 | import random 214 | def shuffle_data(fin): 215 | """ 216 | function to shuffle input data as it is ordered by default with all the negative tweets first so this is required to mix the tweet categories 217 | Args: 218 | fin: string filename of file to shuffle 219 | """ 220 | with open(fin,'r') as source: 221 | data = [ (random.random(), line) for line in source ] 222 | data.sort() 223 | with open('./data/shuffled_train_data.csv','w') as target: 224 | for _, line in data: 225 | target.write( line ) 226 | 227 | 228 | def main(): 229 | """uncomment to make the csv file""" 230 | # init_process('./data/training.1600000.processed.noemoticon.csv','./data/train_data.csv') 231 | # init_process('./data/testdata.manual.2009.06.14.csv','./data/test_data.csv') 232 | with open('./lexicon.pickle','rb') as f: 233 | lexicon = pickle.load(f) 234 | 235 | """uncomment to test the generator function""" 236 | # vec_getter = read_to_vec('train_data.csv',lexicon) 237 | # for i in range(10): 238 | # print(next(vec_getter)) 239 | 240 | """uncomment to test the tensorflow version of the generator""" 241 | """we use test data here as the training data is ordered and it is difficult to see if the whole dataset is fetched""" 242 | # filename_queue = tf.train.string_input_producer(['test_data.csv'],num_epochs=2) 243 | # try: 244 | # with tf.Session() as sess: 245 | # # sess.run(tf.initialize_all_variables()) 246 | # sess.run(tf.initialize_local_variables()) 247 | # coord = tf.train.Coordinator() 248 | # threads = tf.train.start_queue_runners(coord=coord) 249 | # 250 | # feature_op, label_op = read_record(filename_queue,lexicon,sess) 251 | # # feature_op, label_op = input_pipeline(filename_queue, 128, lexicon, sess) 252 | # count = 0 253 | # while not coord.should_stop(): 254 | # label = sess.run([label_op]) 255 | # print(label) 256 | # count +=1 257 | # print(count) 258 | # except tf.errors.OutOfRangeError: 259 | # print("Done training, epoch reached") 260 | # finally: 261 | # coord.request_stop() 262 | # coord.join(threads) 263 | 264 | """uncomment to test the batch generator""" 265 | # filename_queue = tf.train.string_input_producer(['test_data.csv'],num_epochs=1) 266 | # try: 267 | # with tf.Session() as sess: 268 | # sess.run(tf.initialize_all_variables()) 269 | # sess.run(tf.initialize_local_variables()) 270 | # coord = tf.train.Coordinator() 271 | # threads = tf.train.start_queue_runners(coord=coord) 272 | # feature_op, label_op = input_pipeline(filename_queue, 100, lexicon,sess,coord,threads,num_epochs=1) 273 | # count = 0 274 | # while not coord.should_stop(): 275 | # features = sess.run(feature_op) 276 | # # print(features) 277 | # print(str(features[0],'utf-8')) 278 | # count +=1 279 | # print(count) 280 | # except tf.errors.OutOfRangeError: 281 | # print("Done training, epoch reached") 282 | # finally: 283 | # coord.request_stop() 284 | # coord.join(threads) 285 | 286 | """uncomment to test the get_vector function""" 287 | # filename_queue = tf.train.string_input_producer(['test_data.csv'],num_epochs=None) 288 | # print(get_vector(filename_queue,lexicon)) 289 | 290 | 291 | """uncomment to shuffle the input file and check 10 records""" 292 | shuffle_data('./data/train_data.csv') 293 | with open('./data/shuffled_train_data.csv','r') as f: 294 | for i in range(10): 295 | print(f.readline()) 296 | 297 | if __name__ == '__main__': 298 | """ 299 | Read the comments and file and the main function to see how to use 300 | """ 301 | # import time 302 | # start = time.time() 303 | main() 304 | # fin = time.time() 305 | # print(fin-start) 306 | -------------------------------------------------------------------------------- /neural_net.py: -------------------------------------------------------------------------------- 1 | #import the read_record function from data_read file 2 | from data_read import read_record 3 | 4 | #the main imports 5 | import tensorflow as tf 6 | import pickle 7 | import numpy as np 8 | 9 | #set number of hidden nodes in each layer 10 | n_nodes_hl1 = 500 11 | n_nodes_hl2 = 500 12 | n_nodes_hl3 = 500 13 | #numner of classes. In our case it is positive, negative and neutral 14 | n_classes = 3 15 | 16 | batch_size=512 17 | #there are 1600000 lines in the training data 18 | total_batches = int(1600000/batch_size) 19 | #total number of epochs(epoch is number of times the whole data set is seen ) 20 | hm_epochs = 5 21 | #placeholders for the input and output 22 | # x = tf.placeholder(tf.float32,shape=[None, 2638]) 23 | x = tf.placeholder(tf.float32,shape=[None, 3007]) 24 | y = tf.placeholder(tf.float32) 25 | 26 | #Define the hidden layers' weights and biases 27 | #the 2638 is the number of words in the lexicon created using the training data 28 | hidden_1_layer = {'f_fum':n_nodes_hl1, 29 | # 'weight':tf.Variable(tf.random_normal([2638, n_nodes_hl1])), 30 | 'weight':tf.Variable(tf.random_normal([3007, n_nodes_hl1])), 31 | 'bias':tf.Variable(tf.random_normal([n_nodes_hl1]))} 32 | 33 | hidden_2_layer = {'f_fum':n_nodes_hl2, 34 | 'weight':tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])), 35 | 'bias':tf.Variable(tf.random_normal([n_nodes_hl2]))} 36 | 37 | hidden_3_layer = {'f_fum':n_nodes_hl3, 38 | 'weight':tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])), 39 | 'bias':tf.Variable(tf.random_normal([n_nodes_hl3]))} 40 | #define the output layers weights and biases 41 | output_layer = {'f_fum':None, 42 | 'weight':tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])), 43 | 'bias':tf.Variable(tf.random_normal([n_classes])),} 44 | 45 | #The saver object 46 | #this can only be made after declaring some tensorflow variables. 47 | #in our case, those are the layer weights and biases above 48 | saver = tf.train.Saver() 49 | #log file to store number of epochs completed 50 | tf_log = 'tf.log' 51 | 52 | def neural_network_model(data): 53 | """ 54 | Function to define the neural network model 55 | In this version it is just a feedforward multi layer preceptrion 56 | Args: 57 | data: tensorflow placeholder containing the feature vector/s 58 | Returns: 59 | output: Output tensor built after the network 60 | """ 61 | #all of the layers follow the standard M * X + b function where M is the weights and b is the bias 62 | #then the hidden layers are passed through the relu function 63 | l1 = tf.nn.relu(tf.add(tf.matmul(data, hidden_1_layer['weight']), hidden_1_layer['bias'])) 64 | l2 = tf.nn.relu(tf.add(tf.matmul(l1, hidden_2_layer['weight']), hidden_2_layer['bias'])) 65 | l3 = tf.nn.relu(tf.add(tf.matmul(l2, hidden_3_layer['weight']), hidden_3_layer['bias'])) 66 | output = tf.add(tf.matmul(l3, output_layer['weight']),output_layer['bias']) 67 | return output 68 | 69 | 70 | def train_neural_network(x): 71 | """ 72 | Function to train the neural network 73 | Args: 74 | x: the tensorflow placeholder for the input feature vector 75 | """ 76 | #the feed forward tensorflow operation 77 | prediction = neural_network_model(x) 78 | #cost operation 79 | #tf.nn.softmax_cross_entropy_with_logits is used instead of tf.nn.softmax when we have one-hot vectors as both the labels and the output from the network 80 | cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y)) 81 | 82 | global_step = tf.Variable(0, trainable=False) 83 | starter_learning_rate = 0.1 84 | learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,100000, 0.96, staircase=True) 85 | # Passing global_step to minimize() will increment it at each step. 86 | learning_step = (tf.train.AdamOptimizer(learning_rate).minimize(cost, global_step=global_step)) 87 | 88 | #Define the optimizer and the value to minimize 89 | #NOTE Following line is disabled to check decaying learning_rate 90 | # optimizer = tf.train.AdamOptimizer().minimize(cost) 91 | #queue of input files 92 | filename_queue = tf.train.string_input_producer(['./data/shuffled_train_data.csv'],num_epochs=hm_epochs) 93 | #get the lexicon built and saved previously from the training data 94 | with open('./lexicon.pickle', 'rb') as f: 95 | lexicon = pickle.load(f) 96 | #tensorflow operations to get one tweet and one set of one-hot labels from the input file 97 | tweet_op, label_op = read_record(filename_queue) 98 | 99 | #All tensorflow operations need to be run from a session 100 | with tf.Session() as sess: 101 | #the tensorflow variables(eg; weights and biases declared earlier) have to be initialized using this function 102 | #otherwise they are just tensors describing those variables 103 | sess.run(tf.global_variables_initializer()) 104 | #the epoch is counted internally in tensorflow. This counter needs to be initialized using this function 105 | sess.run(tf.local_variables_initializer()) 106 | #try to get the epoch number from the log file 107 | try: 108 | epoch =int(open(tf_log,'r').read().split('\n')[-2])+1 109 | print("Starting: Epoch %d" % epoch) 110 | #if the file does not open, assume we are on epoch 1 111 | except: 112 | epoch = 1 113 | 114 | try: 115 | #if some epochs have already been run, then restore from checkpoint file called 'model.ckpt' using the saver 116 | if epoch != 1: 117 | saver.restore(sess, "model.ckpt") 118 | epoch_loss=1 119 | #the batch of input vectors 120 | batch_x = [] 121 | #the batch of labels 122 | batch_y = [] 123 | #to keep track of the number of batches run in one epoch 124 | batches_run = 0 125 | #the following two lines are essencial for using tensorflow's data reader from the file queue (filename_queue) above 126 | coord = tf.train.Coordinator() 127 | threads = tf.train.start_queue_runners(coord = coord) 128 | #the should_stop function keeps running the following code until the epochs in the filename_queue are completed 129 | #the epoch counter is internal which is why we used the tf.initialize_local_variables earlier 130 | while not coord.should_stop(): 131 | #run the operation to get one tweer as a binary string and one set of one-hot labels 132 | raw_tweet, label = sess.run([tweet_op,label_op]) 133 | #convert the binary tweet to utf-8(python3's default string encoding) 134 | tweet = str(raw_tweet,'utf-8') 135 | #a numpy array of zeros representing, each representing one word in the lexicon 136 | features = np.zeros(len(lexicon)) 137 | #if any word in the tweet is in the lexicon, we increment its count in the feature vector 138 | for word in tweet: 139 | if word.lower() in lexicon: 140 | index_value = lexicon.index(word.lower()) 141 | features[index_value] += 1 142 | #add the tweet's feature vector to the batch 143 | batch_x.append(list(features)) 144 | batch_y.append(label) 145 | #when a batch is filled 146 | if len(batch_x)>=batch_size: 147 | #run the optimizer operation and the cost operation using the batch 148 | _,c = sess.run([learning_step, cost], feed_dict={x:np.array(batch_x), y:np.array(batch_y)}) 149 | #increment the epoch loss by the cost c of the batch 150 | epoch_loss += c 151 | #empty the batch 152 | batch_x = [] 153 | batch_y = [] 154 | #increment the batch counter 155 | batches_run += 1 156 | #print the label of the last example in the batch for debugging/sanity check 157 | # print('sample label: {}'.format(label)) 158 | #show the batch loss and the number of batches run in the given epoch 159 | print('Batch run: {}/{} | Epoch: {} | Batch Loss: {}'.format(batches_run,total_batches, epoch, c)) 160 | if batches_run == total_batches: #meaning one epoch completed 161 | print("saving model") 162 | saver.save(sess, 'model.ckpt') #save the variables at each epoch 163 | 164 | print('Epoch: {}, completed out of {}, Loss = {}'.format(epoch, hm_epochs, epoch_loss)) 165 | with open(tf_log, 'a') as f: 166 | f.write(str(epoch)+'\n') 167 | epoch +=1 168 | batches_run=0 #reset the counter 169 | except tf.errors.OutOfRangeError: 170 | print("Done training, epoch reached") 171 | finally: 172 | coord.request_stop() 173 | threads.join(coord) 174 | 175 | def test_neural_network(): 176 | """ 177 | Function to test the neural network 178 | 179 | """ 180 | #tensorflow op to get prediction 181 | prediction = neural_network_model(x) 182 | #tensorflow queue for the input file 183 | filename_queue = tf.train.string_input_producer(['./data/test_data.csv'],num_epochs=1) 184 | #tensorflow op to check if prediction is correct 185 | #the argmax function gives the index of the maximum value on the given axis. In this case axis 1 186 | #axis 1 meaning the row axis here 187 | correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1)) 188 | #tensorflow operation to get the accuracy of the classifier 189 | #it converts the result of the previous op from boolean to float32 and then takes the mean over the result 190 | accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) 191 | #lists for features and labels for the input 192 | feature_sets = [] 193 | labels = [] 194 | 195 | #get the lexicon 196 | with open('./lexicon.pickle','rb') as f: 197 | lexicon = pickle.load(f) 198 | #tensorflow operation to get one tweet and one set of labels 199 | tweet_op, label_op = read_record(filename_queue) 200 | with tf.Session() as sess: 201 | #initialize the saved variables 202 | sess.run(tf.global_variables_initializer()) 203 | #this is required to initialize the internal epoch counter used by the filename_queue 204 | sess.run(tf.local_variables_initializer()) 205 | #get the trained model or show an error 206 | try: 207 | # print(saver.latest_checkpoint()) 208 | saver.restore(sess,"model.ckpt") 209 | print('restored network') 210 | except Exception as e: 211 | print(str(e)) 212 | 213 | try: 214 | #tensorflow coordinator to coordinate the threads 215 | coord = tf.train.Coordinator() 216 | #tensorflow queue runner required to get input from the filename_queue 217 | threads = tf.train.start_queue_runners(coord = coord) 218 | #run the following until the number of epochs specified in filename_queue are completed 219 | while not coord.should_stop(): 220 | #get one tweet as binary string and one set of labels 221 | tweet_raw, label = sess.run([tweet_op,label_op]) 222 | #if you want to see the tweet, uncomment the line below 223 | # print(tweet_raw) 224 | #convert the tweet to utf-8, python3's default string encoding 225 | tweet = str(tweet_raw, 'utf-8') 226 | #uncomment below to see the converted tweet 227 | # print(tweet) 228 | #initialize a numpy array of zeros for each word in the lexicon 229 | features = np.zeros(len(lexicon)) 230 | #if a word in the tweet is in the lexicon, increment the count of that word in the feature vector 231 | for word in tweet: 232 | if word.lower() in lexicon: 233 | index_value = lexicon.index(word.lower()) 234 | features[index_value] +=1 235 | #append the feature vector and label to the input list 236 | feature_sets.append(features) 237 | labels.append(label) 238 | #since the test file is small, the whole file is processed in the above lines 239 | #when this is done we get an OutOfRangeError meaning the epochs specified have been completed 240 | #in this case, we only need 1 epoch and when it is complete we classify the test data 241 | except tf.errors.OutOfRangeError: 242 | #convert the lists to numpy arrays 243 | test_x = np.array(feature_sets) 244 | test_y = np.array(labels) 245 | #for debugging pruposes, the shape of the numpy array can be checked 246 | #the first dimension should equal the number of lines in the test csv 247 | # print(test_x.shape) 248 | # print(test_y.shape) 249 | #uncomment below if you want to see the actual boolean array that shows which examples were correctly classified 250 | # correct_example = sess.run(correct, feed_dict={x:feature_sets,y:labels}) 251 | # print(correct_example) 252 | # print(correct_example.shape) 253 | # get the accuracy by running the accuracy operation 254 | # accuracy.eval is just a different way of saying sess.run(accuracy, feed_dict={x:test_x, y:test_y}) 255 | print('Accuracy:',accuracy.eval({x:test_x, y:test_y})) 256 | print("Epoch complete") 257 | #stop the coordinator and join the threads 258 | finally: 259 | coord.request_stop() 260 | # threads.join(coord) 261 | 262 | if __name__ == '__main__': 263 | #uncomment to train the network 264 | # train_neural_network(x) 265 | test_neural_network() 266 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | #Introduction 2 | This tutorial uses the same technique as sentdex from youtube from [this video](https://www.youtube.com/watch?v=JeamFbHhmDo) 3 | with a written tutorial [here] (https://pythonprogramming.net/data-size-example-tensorflow-deep-learning-tutorial/) 4 | 5 | My version adds tensorflow's built-in reader instead of manual iteration used in the original. 6 | 7 | The reason for this is that for loops in python are known to be slow and manual iteration has to use them.\n 8 | Tensorflow, however, does this in C++ like all of its graph computation so theoretically, it should be much faster. 9 | I am making this both for my own testing and because, while tensorflow has many tutorial and a lot of documentation, I personally haven't found any that explains its data reading beyond feed_dict well. 10 | 11 | #The dataset 12 | The dataset used here is the sentiment 140 dataset which can be found [here] 13 | (http://help.sentiment140.com/for-students/) 14 | and the direct download link is [here] 15 | (http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip) 16 | with a [mirror link] (https://docs.google.com/file/d/0B04GJPshIjmPRnZManQwWEdTZjg/edit) 17 | You need to get this data before running the network 18 | 19 | #File descriptions 20 | I am using two main files here: 21 | #####data_read.py: 22 | Contains functions for reading in and preprocessing the data and creating the lexicon of words 23 | #####neural_net.py: 24 | Contains the main neural network model and testing function 25 | 26 | #Reading from tensorflow 27 | There are a few other differences between this and sentdex's version but the main one is using tensorflow's built-in data reader. 28 | Here is some description of it:\n 29 | Firstly you need to put the input files in a queue using this: 30 | filename_queue = tf.train.string_input_producer(['filenames'], num_epochs=None, shuffle=True) 31 | file name have to be in a list and multiple files can be passed in. This is useful for distributed environments as well as for datasets that use multiple files like the IMDB review dataset 32 | which has files called neg and pos for the two categories. 33 | 34 | The queue needs to be declared before running the tensorflow session(tf.Session() or tf.InteractiveSession()). 35 | If a number is passed in num_epochs then before running the reader operation, which I will mention later, you need to initialize local variables using tf.initialize_local_variables() 36 | as tensorflow counts the epochs using a local variable which are not initialized when callling tf.initialize_all_variables() as is normally done. 37 | 38 | tensorflow has many readers. In our case, we are reading a csv line by line so we use tf.TextLineReader() which is declared like this: 39 | reader = tf.TextLineReader() 40 | and a line reading operation is declared like this 41 | key, value = reader.read(filename_queue) where key is the line being read and is assigned by tensorflow automatically 42 | After this, we need to describe the data format in the csv and give default values for each column. These are also used to find the data type and convert automatically from raw string 43 | for example if a line contains a numeric value we use:\n 44 | ```python 45 | val_default = tf.constant([0]) 46 | ``` 47 | or if it contains a string we use:\n 48 | ```python 49 | str_default = tf.constant(['default string']) 50 | ``` 51 | note that lists can not be used in default values so 52 | ```python 53 | lst_dafault = tf.constant([[0][1][2]]) returns an error 54 | ``` 55 | for simplicity, all the default values can be put in a list before being passed to the csv decoder 56 | ```python 57 | record_defaults = [default1, default2, default3] 58 | ``` 59 | where each default represents one column separated by a given one character delimiter. Note that it has to be a one character delimiter due to the internal C operation requiring one char delimiter. 60 | The decoding can be done using the following function 61 | ```python 62 | tf.decode_csv(value, record_defaults=record_defaults,field_delim='|') 63 | ``` 64 | here, each default value in the record_defaults list results in a tensorflow operation. For example if we have 3 default values we have 65 | ```python 66 | default1_op, default2_op, default3_op = tf.decode_csv(value, record_defaults=record_defaults,field_delim='|') 67 | ``` 68 | where all the returned values are tensorflow operations that return the mentioned column when run 69 | Before running the operations from the tensorflow csv decoder, we need to use a queue runner which should be coordinated by a coordinator and this needs to be done within the tensorflow session 70 | ```python 71 | coord = tf.train.Coordinator() 72 | threads = tf.train.start_queue_runners(coord=coord) 73 | ``` 74 | after this, each time the operations from the decoder are run, they return their respective column from one line of the input files 75 | 76 | There are two ways to run the operations. 77 | If you want to get only some lines, then they can be run in a loop. 78 | ```python 79 | for _ in range(10): 80 | col1, col2 = sess.run([default1_op, default2_op]) 81 | ``` 82 | gets 10 lines. 83 | The other option is to read the whole data set by setting a number of epochs in the filename queue and then 84 | ```python 85 | while not coord.should_stop(): 86 | col1, col2 = sess.run([default1_op, default2_op]) 87 | ``` 88 | reads the whole dataset for the specified number of epochs 89 | 90 | There is also an option to read data in batches.However I did not use it in this example as the preprocessing here required converting each tweet into a feature vector which required running the operations from the tensorflow csv decoder instead of doing the whole preprocessing within tensorflow. While this could be possible, it caused errors in my case so I am reading the file line by line in this tutorial. 91 | I might use and describe this in a future tutorial but you can see the unfinished version I tried in the data_read.py file. 92 | 93 | #NOTE1 94 | the neural_net.py and data_read.py have been annotated to be usable for a tutorial in addition to this writeup. 95 | #NOTE2 96 | If you have any comments or issues with the tutorial, please feel free to contact me at arif_sohaib@outlook.com 97 | Additionally, I am currently available for freelance projects in tensorflow or caffe 98 | --------------------------------------------------------------------------------