├── .gitignore ├── 1709.05554.pdf ├── README.md ├── auto ├── data │ └── ag_news_csv │ │ ├── create_vali.py │ │ ├── load_batch.py │ │ ├── load_util.py │ │ ├── reformat.py │ │ └── stats.py ├── src │ ├── model │ │ ├── @ │ │ ├── ccrnn_model_gen.py │ │ ├── mcrnn_model.py │ │ ├── mcrnn_model_1_lstm.py │ │ ├── mcrnn_model_gen.py │ │ ├── mcrnn_model_gen2.py │ │ ├── mcrnn_model_gen_bi.py │ │ ├── mcrnn_model_gen_bi2.py │ │ ├── train.py │ │ ├── train_1_lstm.py │ │ ├── train_gen.py │ │ └── train_gen2.py │ └── util │ │ ├── 1q │ │ ├── hps.py │ │ ├── hps2.py │ │ ├── hps_script.sh │ │ ├── load_batch.py │ │ ├── load_batch2.py │ │ ├── load_batch_val.py │ │ ├── load_util.py │ │ ├── reformat.py │ │ ├── tf_utils.py │ │ ├── tf_utils_old.py │ │ └── tf_utils_reg.py └── src_final │ ├── model │ ├── ccrnn_model_gen.py │ ├── mcrnn_model.py │ ├── mcrnn_model_1_lstm.py │ ├── mcrnn_model_gen.py │ ├── mcrnn_model_gen2.py │ ├── mcrnn_model_gen_bi.py │ ├── mcrnn_model_gen_bi2.py │ ├── train.py │ ├── train_1_lstm.py │ ├── train_gen.py │ └── train_gen2.py │ └── util │ ├── 1q │ ├── hps.py │ ├── hps2.py │ ├── hps_script.sh │ ├── load_batch.py │ ├── load_batch2.py │ ├── load_batch_val.py │ ├── load_util.py │ ├── reformat.py │ ├── tf_utils.py │ ├── tf_utils_old.py │ └── tf_utils_reg.py ├── data ├── .gitkeep ├── helpX ├── test_Category.json.gz ├── test_Helpful.json.gz └── train.json.gz ├── logs └── .gitkeep ├── scripts ├── .gitkeep ├── killZk.sh ├── newTerminalMac.sh ├── startKafkaServer.sh ├── startNimbus.sh ├── startStormUI.sh ├── startSupervisor.sh ├── startZK.sh ├── startZKClient.sh ├── systemStartMac.sh ├── systemStartUbuntu.sh └── userRunAPI.sh └── src ├── .gitkeep ├── models ├── c2c_cooccurence.py ├── c2c_cooccurenceNonUniform.py ├── cascKeras.py ├── cascade.py ├── ccrnn.py ├── ccrnn_bn.py ├── ccrnn_drop.py ├── ccrnn_swap.py ├── cflstm.py ├── contextToContext.py ├── contextToContextNonUniform.py ├── mcrnn.py ├── mcrnn_bn.py ├── mtlKeras.py ├── tc2c.py ├── textToContext.py ├── tf_t2c.py ├── tweetnet.py └── tweetnet_lstm.py ├── storm ├── TwitterCleanerBolt.java ├── TwitterStorm.java ├── TwitterStreamSpout.java └── pom.xml └── utils ├── ReducedAsciiDictionary.py ├── checkTrainTestDup.py ├── dumpDedup.py ├── embeddingGeneration.py ├── getEnglishHashTweets.py ├── hashtagFrequency.py ├── loadData.py ├── loadDataNewModel.py ├── loadDataT2C.py ├── loadDataText2Hashtag.py ├── loadDataTweetMultiTask.py ├── loadData_lstm.py ├── loadKaggleHelpful.py ├── logger.py ├── mkMultiTaskTweet.py ├── predContext.py ├── prelimTest.py ├── preprocessor.py ├── tf_utils.py ├── tf_utils_reg.py ├── tweetGenerator.py ├── tweetGenerator_lstm.py └── visualizeData.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Mobile Tools for Java (J2ME) 4 | .mtj.tmp/ 5 | 6 | # Package Files # 7 | *.jar 8 | *.war 9 | *.ear 10 | *.html 11 | *.pkl 12 | *.pyc 13 | *.swp 14 | *.out 15 | *.hdf5 16 | *.txt 17 | *.csv 18 | *zookeeper* 19 | *.jpg 20 | *.png 21 | *storm-local* 22 | *.bmp 23 | runAPI.sh 24 | *.gz 25 | *.log 26 | # misc data files# 27 | *.html 28 | *.pkl 29 | *.pyc 30 | *.swp 31 | *.out 32 | *.hdf5 33 | *.txt 34 | *.csv 35 | 36 | # misc files generated by zk and storm 37 | *zookeeper* 38 | *storm-local* 39 | runAPI.sh 40 | 41 | # misc data files# 42 | *.html 43 | *.pkl 44 | *.pyc 45 | *.swp 46 | *.out 47 | *.hdf5 48 | *.txt 49 | *.csv 50 | 51 | # misc files generated by zk and storm 52 | *zookeeper* 53 | *storm-local* 54 | runAPI.sh 55 | 56 | # virtual machine crash logs, see 
http://www.java.com/en/download/help/error_hotspot.xml 57 | hs_err_pid* 58 | -------------------------------------------------------------------------------- /1709.05554.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/1709.05554.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Automated Multi-task Learning 2 | **Automated MTL** supports two generalized multi-tasking recurrent deep learning architectures. Automated MTL uses the statistical regularities within the original dataset itself to reinforce the representations learned for the primary task. Automated MTL comes in two flavors: the CRNN (Cascaded Recurrent Neural Network) and the MRNN (Multi-tasking Recurrent Neural Network). 3 | 4 | The automated MTL architectures have achieved state-of-the-art performance in sentiment analysis, topic prediction, and hashtag recommendation using a diverse set of text corpora including Twitter, Rotten Tomatoes, and IMDB. 5 | 6 | ## The Infinite Data Pipeline (∞DP): 7 | A side project of automated MTL resulted in the ***Infinite Data Pipeline***, which is built on Java, Apache Storm, Kafka, and the Twitter API. The Infinite Data Pipeline streams and preprocesses Twitter data online and directly injects the streamed data into a running TensorFlow topology. 8 | 9 | ## Requirements: 10 | 1. cuDNN (tested on cuDNN 5105) 11 | 2. CUDA drivers + an NVIDIA graphics card with compute capability 5.0+ (tested on a GTX 1080) 12 | 3. Apache Zookeeper (tested on version 3.4.6) 13 | 4. Apache Storm (tested on version 0.9.5) 14 | 5. Twitter API + Developer Credentials (tested on version 4.0.4) 15 | 6. Theano (tested on version 0.8.2) 16 | 7. Keras (tested on the latest version as of January 9, 2017) 17 | 8. Linux-based OS (tested on Ubuntu 16.04 LTS) 18 | 19 | ## Install Guide: 20 | 1. [Install CUDA and cuDNN](http://tleyden.github.io/blog/2015/11/22/cuda-7-dot-5-on-aws-gpu-instance-running-ubuntu-14-dot-04/) 21 | 2. [Apache Storm and Twitter API Setup](https://www.tutorialspoint.com/apache_storm/apache_storm_installation.htm) 22 | 3. [Install Keras and Theano](http://www.pyimagesearch.com/2016/07/18/installing-keras-for-deep-learning/) 23 | 4. [Download Kafka 2.10](https://www.apache.org/dyn/closer.cgi?path=/kafka/0.10.1.1/kafka_2.10-0.10.1.1.tgz) 24 | 25 | ## Data Miner Run Guide (MacOSX Local): 26 | 1. Run **systemStartMac.sh** to start your *Storm* instance. Make sure `KAFKAHOME` is set correctly in `scripts/startKafkaServer.sh`. 27 | 2. Edit `src/storm/pom.xml` with the appropriate Twitter credentials. Run `mvn install` inside `src/storm` to compile and `mvn exec:java` to start the data collection and streaming. 28 | 29 | ## Data Miner Run Guide (Ubuntu 16.04 Local): 30 | 1. Run **systemStartUbuntu.sh** to start your *Storm* instance. 31 | 2. Run **runAPI.sh** to open the *Twitter* stream and start collection. (This requires editing **runAPI.sh** with valid *Twitter* API credentials.) 32 | 33 | ## Tweetnet Run Guide: 34 | 1. Run **tweetnet.py**. 35 | 36 | ## Notes: 37 | 38 | **Note**: The system start script opens five new terminals: one each for *Apache Zookeeper*, the *Nimbus*, the *Supervisor*, *StormUI*, and the *Kafka* server. Each newly opened terminal requires **sudo** access and will prompt for the user's password. To view *StormUI*, navigate to *localhost:8080*.
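**Note**: For reference, a minimal MacOSX session might look like the following (a sketch only; it assumes you start from the repository root and have already set `KAFKAHOME` in `scripts/startKafkaServer.sh` and your Twitter credentials in `src/storm/pom.xml`):

    bash scripts/systemStartMac.sh
    cd src/storm
    mvn install
    mvn exec:java
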
39 | 40 | **Note**: In the CUDA setup, the section where you link cuda to cuda-7.5 is outdated. 41 | 42 | Instead of following this step: 43 | 44 | export CUDA_HOME=/usr/local/cuda-7.5 45 | 46 | Make sure you are using and linking *CUDA v8.0*: 47 | 48 | export CUDA_HOME=/usr/local/cuda-8.0 49 | 50 | **Note**: You will need to register for Twitter Developer credentials to run the data miner. 51 | -------------------------------------------------------------------------------- /auto/data/ag_news_csv/create_vali.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import expanduser 3 | import random 4 | import shutil 5 | from random import shuffle 6 | #Step 1: list all folders under the training data set: 7 | data_path = "~/tweetnet/automatedMTL/data/ag_news_csv" 8 | all_classes = os.listdir(expanduser(data_path + "/Train")) 9 | 10 | for c in all_classes: 11 | if c[0] != ".": 12 | print c 13 | files = os.listdir(expanduser(data_path+"/Train/"+c)) 14 | shuffle(files) 15 | if "Sports" in c: 16 | for f in files[0:4568]: 17 | shutil.move(expanduser(data_path+"/Train/"+c+"/"+f), expanduser(data_path+"/Validation/"+c+"/"+f)) 18 | else: 19 | for f in files[0:4569]: 20 | shutil.move(expanduser(data_path+"/Train/"+c+"/"+f), expanduser(data_path+"/Validation/"+c+"/"+f)) 21 | 22 | 23 | -------------------------------------------------------------------------------- /auto/data/ag_news_csv/load_batch.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | #from reformat import reformat_data 7 | from load_util import class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | else: 27 | identifiers = test_file 28 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 29 | 30 | batch_text = [] 31 | for idx in batch_identifiers: 32 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 33 | batch_text.append(text.read()) 34 | 35 | return batch_identifiers, batch_text 36 | 37 | def load_data(data_path): 38 | 39 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 40 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 41 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 42 | return all_classes, train_file, test_file 43 | 44 | def get_word2vec(data_path): 45 | # TO DO: download word2vec!
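# Added note: word2vec_dict.pkl is assumed here to be a plain dict mapping lowercase tokens
# to 300-dimensional numpy vectors, with extra entries for "UNK", "EOS" and "_" (the
# missing-word placeholder); the encode_* functions below rely on those keys.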
46 | word2vec_dic = pickle.load(open(expanduser(data_path))) 47 | return word2vec_dic 48 | 49 | # Unknown symbols are UNK 50 | # Missing word symbols are zeros 51 | # EOS are EOS 52 | 53 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 54 | sequence_by_word = sequence.split(" ") 55 | encoded_seq = np.zeros((max_len, encode_dim)) 56 | for i in range(len(sequence_by_word)): 57 | word = sequence_by_word[i] 58 | if word2vec_dic.get(word) == None: 59 | encoded_seq[i, :] = word2vec_dic["UNK"] 60 | else: 61 | if word != "REMOVE": 62 | encoded_seq[i, :] = word2vec_dic[word] 63 | else: 64 | encoded_seq[i, :] = word2vec_dic["_"] 65 | return encoded_seq, len(sequence_by_word) 66 | 67 | def encode_sequence_generation(word2vec_dic, sequence, encode_dim, max_len): 68 | sequence_by_word = sequence.split(" ") 69 | encoded_seq = np.zeros((max_len, encode_dim)) 70 | for i in range(1, len(sequence_by_word)): 71 | word = sequence_by_word[i] 72 | if word2vec_dic.get(word) == None: 73 | encoded_seq[i-1, :] = word2vec_dic["UNK"] 74 | else: 75 | encoded_seq[i-1, :] = word2vec_dic[word] 76 | encoded_seq[len(sequence_by_word)-1, :] = word2vec_dic["EOS"] 77 | context_target = sequence_by_word[1:len(sequence_by_word)] + ["EOS"] 78 | 79 | return encoded_seq, context_target, len(sequence_by_word) 80 | 81 | def oneHot(nclasses, idx): 82 | one_hot = np.zeros((nclasses)) 83 | one_hot[idx-1] = 1 84 | return one_hot 85 | 86 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size, automated_task): 87 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size) 88 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 89 | batch_classes = np.zeros((batch_size, n_classes)) 90 | batch_context_encoded = np.zeros((batch_size, encode_dim)) 91 | if automated_task == "word generation": batch_context_encoded = np.zeros((batch_size, max_len, encode_dim)) 92 | batch_context = [] 93 | batch_length = [] 94 | for i in range(batch_size): 95 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 96 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 97 | if automated_task != "word generation": 98 | batch_context_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 99 | batch_context.append(missing_word_dic[batch_identifiers[i]]) 100 | else: 101 | batch_context_encoded[i, :, :], context_target, text_length = encode_sequence_generation(word2vec_dic, batch_text[i], encode_dim, max_len) 102 | batch_context.append(context_target) 103 | batch_length.append(text_length) 104 | return encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifiers, batch_text, batch_length 105 | 106 | if __name__ == "__main__": 107 | data_path = "~/tweetnet/automatedMTL/data/ag_news_csv" 108 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 109 | n_classes, n_data, n_data_per_class, n_train_data, n_test_data, max_length = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['n_train_data'], data_stats['n_test_data'], data_stats['max_length'] 110 | print n_classes, n_data, n_data_per_class 111 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 112 | for epoch in range(3): 113 | dic = {} 114 | all_classes, train_file, test_file = load_data(data_path) 115 | start_idx = 0 116 | for 
minibatch in range(3): 117 | encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, {}, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 1, automated_task="word generation") 118 | start_idx += 1 119 | print batch_text 120 | print batch_classes 121 | print batch_context 122 | print encoded_batch.shape 123 | print batch_context_encoded.shape 124 | for i in batch_identifier: 125 | if dic.get(i) != None: print "Wrong" 126 | else: dic[i] = 1 127 | -------------------------------------------------------------------------------- /auto/data/ag_news_csv/load_util.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import os 3 | from os.path import expanduser 4 | from os.path import basename 5 | 6 | def class_look_up(data_path): 7 | out_train = open(expanduser(data_path+"/train_classes.txt"), "w") 8 | out_test = open(expanduser(data_path+"/test_classes.txt"), "w") 9 | train_folders = os.listdir(expanduser(data_path+"/Train/")) 10 | test_folders = os.listdir(expanduser(data_path+"/Test/")) 11 | if "rotten_tomato" not in data_path: 12 | validation_folders = os.listdir(expanduser(data_path+"/Validation/")) 13 | out_val = open(expanduser(data_path+"/validation_classes.txt"), "w") 14 | 15 | dict = {"World": 0, "Sports": 1, "Business":2, "Sci_Tech":3} 16 | cnt = 0 17 | file2class_dict = {} 18 | 19 | for i in train_folders: 20 | if i[0] != '.': 21 | #if dict.get(i) == None: 22 | # dict[i] = cnt 23 | # cnt += 1 24 | files = os.listdir(expanduser(data_path+"/Train/"+i)) 25 | for f in files: 26 | if f[0] == ".": continue 27 | out_train.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 28 | out_train.write("\n") 29 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 30 | 31 | for i in test_folders: 32 | if i[0] != '.': 33 | files = os.listdir(expanduser(data_path+"/Test/"+i)) 34 | for f in files: 35 | if f[0] == ".": continue 36 | out_test.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 37 | out_test.write("\n") 38 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 39 | 40 | 41 | if "rotten_tomato" not in data_path: 42 | for i in validation_folders: 43 | if i[0] != '.': 44 | files = os.listdir(expanduser(data_path+"/Validation/"+i)) 45 | for f in files: 46 | if f[0] == ".": continue 47 | out_val.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 48 | out_val.write("\n") 49 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 50 | out_val.close() 51 | out_train.close() 52 | out_test.close() 53 | pickle.dump(file2class_dict, open(expanduser(data_path+"/classes.pkl"), "w")) 54 | #print file2class_dict 55 | #print len(file2class_dict) 56 | 57 | if __name__ == "__main__": 58 | #class_look_up("~/automatedMTL/data/rotten_tomato") 59 | class_look_up("~/tweetnet/automatedMTL/data/ag_news_csv") 60 | -------------------------------------------------------------------------------- /auto/data/ag_news_csv/reformat.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import csv 3 | class_dict = {"1": "World", "2": "Sports", "3": "Business", "4": "Sci_Tech"} 4 | file_cnt = 0 5 | with open(expanduser("~/automatedMTL/data/ag_news_csv/train.csv")) as f: 6 | reader = csv.reader(f) 7 | for row in reader: 8 | class_ = row[0] 9 | content = " ".join(row[1:len(row)]) 10 | with 
open(expanduser("~/automatedMTL/data/ag_news_csv/Train_raw/"+class_dict[class_]+"/"+str(file_cnt)+".txt"), "w") as f: 11 | f.write(content) 12 | f.close() 13 | file_cnt += 1 14 | 15 | print file_cnt 16 | with open(expanduser("~/automatedMTL/data/ag_news_csv/test.csv")) as f: 17 | reader = csv.reader(f) 18 | for row in reader: 19 | class_ = row[0] 20 | content = " ".join(row[1:len(row)]) 21 | with open(expanduser("~/automatedMTL/data/ag_news_csv/Test_raw/"+class_dict[class_]+"/"+str(file_cnt)+".txt"), "w") as f: 22 | f.write(content) 23 | f.close() 24 | file_cnt += 1 25 | print file_cnt 26 | -------------------------------------------------------------------------------- /auto/data/ag_news_csv/stats.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cPickle as pickle 3 | from os.path import expanduser 4 | 5 | def dedup(data_path): 6 | classes = os.listdir(expanduser(data_path+"/Test_raw/")) 7 | n_data_per_class = {} 8 | n_train_data = 0 9 | n_test_data = 0 10 | file_cnt = 0 11 | for c in classes: 12 | print c 13 | if c[0] != ".": 14 | test_files = os.listdir(expanduser(data_path+"/Test_raw/"+c)) 15 | train_files = os.listdir(expanduser(data_path+"/Train_raw/"+c)) 16 | for t in test_files: 17 | if t[0] != ".": 18 | print file_cnt 19 | f = open(expanduser(data_path+"/Test_raw/"+c+"/"+t), "r") 20 | txt = f.read().lower() 21 | words = txt.split(" ") 22 | chrs = list(" ".join(words)) 23 | for i in range(len(chrs)): 24 | if ((ord(chrs[i]) < ord('a') or ord(chrs[i]) > ord('z'))) and chrs[i] != "'": 25 | chrs[i] = " " 26 | remove_long_txt = "".join(chrs) 27 | words = remove_long_txt.split() 28 | words.append("EOS") 29 | remove_long_txt = " ".join(words) 30 | with open(expanduser(data_path+"/Test/"+c+"/"+str(file_cnt)+".txt"), "w") as f: 31 | f.write(remove_long_txt) 32 | f.close() 33 | file_cnt += 1 34 | for t in train_files: 35 | if t[0] != ".": 36 | print file_cnt 37 | f = open(expanduser(data_path+"/Train_raw/"+c+"/"+t), "r") 38 | txt = f.read().lower() 39 | words = txt.split(" ") 40 | chrs = list(" ".join(words)) 41 | for i in range(len(chrs)): 42 | if ((ord(chrs[i]) < ord('a') or ord(chrs[i]) > ord('z'))) and chrs[i] != "'": 43 | chrs[i] = " " 44 | remove_long_txt = "".join(chrs) 45 | words = remove_long_txt.split() 46 | words.append("EOS") 47 | remove_long_txt = " ".join(words) 48 | with open(expanduser(data_path+"/Train/"+c+"/"+str(file_cnt)+".txt"), "w") as f: 49 | f.write(remove_long_txt) 50 | f.close() 51 | file_cnt += 1 52 | def stats(data_path): 53 | classes = os.listdir(expanduser(data_path+"/Test/")) 54 | n_data_per_class = {} 55 | n_train_data = 0 56 | n_test_data = 0 57 | length = [] 58 | for c in classes: 59 | if c[0] != ".": 60 | test_files = os.listdir(expanduser(data_path+"/Test/"+c)) 61 | train_files = os.listdir(expanduser(data_path+"/Train/"+c)) 62 | all_files = test_files + train_files 63 | for t in test_files: 64 | if t[0] != ".": 65 | if n_data_per_class.get(c) == None: 66 | n_data_per_class[c] = 1 67 | else: 68 | n_data_per_class[c] += 1 69 | n_test_data += 1 70 | with open(expanduser(data_path+"/Test/"+c+"/"+t), "r") as f: 71 | txt = f.read() 72 | words = txt.split() 73 | length.append(len(words)) 74 | f.close() 75 | for t in train_files: 76 | if t[0] != ".": 77 | if n_data_per_class.get(c) == None: 78 | n_data_per_class[c] = 1 79 | else: 80 | n_data_per_class[c] += 1 81 | n_train_data += 1 82 | with open(expanduser(data_path+"/Train/"+c+"/"+t), "r") as f: 83 | txt = f.read() 84 | words = txt.split() 85 | 
length.append(len(words)) 86 | f.close() 87 | length = sorted(length) 88 | print "Number of classes: ", len(n_data_per_class) 89 | print "Numbe of data per class: ", n_data_per_class 90 | print "Number of train data: ", n_train_data 91 | print "Number of test data: ", n_test_data 92 | print "Longest sequence: ", length[-1] 93 | print "Shortest sequence: ", length[0] 94 | print "Average sequence: ", sum(length) * 1.0 / len(length) 95 | print length[len(length)-200:len(length)] 96 | all_data = 0 97 | for i in n_data_per_class.keys(): 98 | all_data += n_data_per_class[i] 99 | print all_data 100 | data_stats={} 101 | data_stats['n_classes'] = len(n_data_per_class) 102 | data_stats['n_data'] = n_train_data + n_test_data 103 | data_stats['n_data_per_class'] = n_data_per_class 104 | data_stats['n_train_data'] = n_train_data 105 | data_stats['n_test_data'] = n_test_data 106 | data_stats['max_length'] = length[-1] 107 | print data_stats 108 | pickle.dump(data_stats, open(expanduser(data_path+"/stats.pkl"),"w")) 109 | if __name__ == "__main__": 110 | stats("~/automatedMTL/data/ag_news_csv") 111 | -------------------------------------------------------------------------------- /auto/src/model/mcrnn_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","util"))) 9 | from tf_utils import fcLayer, createLSTMCell, applyActivation, predictionLayer 10 | #from predContext import predContext, createHtDict 11 | 12 | class model(object): 13 | 14 | # Model params 15 | # 0 -- shared; 1 -- context; 2 -- task 16 | fc_activation = "tanh" 17 | output_activation = "tanh" 18 | dropout = 0.0 19 | body_lstm_size = 128 20 | context_lstm_size = 128 21 | task_lstm_size = 128 22 | body_n_layer = 1 23 | context_n_layer = 1 24 | task_n_layer = 1 25 | context_branch_fc = 512 26 | task_branch_fc = 512 27 | 28 | # Data params 29 | batch_size = 128 30 | max_length = 52 31 | feature_length = 300 32 | context_dim = 300 33 | task_dim = 2 34 | 35 | # Hyper- params 36 | lr = 0.001 37 | context_lr = lr 38 | n_epoch = 500 39 | topN = 4 40 | keep_prob_val = 1.0 41 | 42 | def buildModel(self, x, y_context, y_task, is_train, dropout, scope="multiTask"): 43 | 44 | # Assume the input shape is (batch_size, max_length, feature_length) 45 | 46 | #TASK = primary task, CONTEXT = secondary task 47 | 48 | # Create lstm cell for the shared layer 49 | body_lstm_cell, _ = createLSTMCell(self.batch_size, self.body_lstm_size, self.body_n_layer, forget_bias=0.0) 50 | # Create lstm cell for branch 1 51 | context_lstm_cell, _ = createLSTMCell(self.batch_size, self.context_lstm_size, self.context_n_layer, forget_bias=0.0) 52 | # Create lstm cells for branch 2 53 | task_lstm_cell, _ = createLSTMCell(self.batch_size, self.task_lstm_size, self.task_n_layer, forget_bias=0.0) 54 | 55 | context_cost = tf.constant(0) 56 | task_cost = tf.constant(0) 57 | 58 | with tf.variable_scope("shared_lstm"): 59 | body_cell_output, last_body_state = tf.nn.dynamic_rnn(cell = body_lstm_cell, dtype=tf.float32, sequence_length=self.length(x), inputs=x) 60 | 61 | with tf.variable_scope("context_branch"): 62 | context_cell_output, last_context_state = tf.nn.dynamic_rnn(cell = context_lstm_cell, dtype=tf.float32, sequence_length=self.length(body_cell_output), inputs=body_cell_output) 63 | 64 | # The output from LSTMs will 
be (batch_size, max_length, out_size) 65 | with tf.variable_scope("context_fc"): 66 | # Select the last output that is not generated by zero vectors 67 | last_context_output = self.last_relevant(context_cell_output, self.length(context_cell_output)) 68 | # feed the last output to the fc layer and make prediction 69 | context_fc_out = fcLayer(x=last_context_output, in_shape=self.context_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=self.dropout, is_train=is_train, scope="fc1") 70 | context_cost, context_output = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, out_shape=y_context.get_shape()[-1].value, activation=self.output_activation) 71 | 72 | with tf.variable_scope("task_branch"): 73 | task_cell_output, last_task_state = tf.nn.dynamic_rnn(cell = task_lstm_cell, dtype=tf.float32, sequence_length=self.length(body_cell_output), inputs=body_cell_output) 74 | 75 | with tf.variable_scope("task_fc"): 76 | # Select the last output that is not generated by zero vectors 77 | last_task_output = self.last_relevant(task_cell_output, self.length(task_cell_output)) 78 | # feed the last output to the fc layer and make prediction 79 | task_fc_out = fcLayer(x=last_task_output, in_shape=self.task_lstm_size, out_shape=self.task_branch_fc, activation=self.fc_activation, dropout=self.dropout, is_train=is_train, scope="fc2") 80 | task_cost, task_output = predictionLayer(x=task_fc_out, y=y_task, in_shape=self.context_branch_fc, out_shape=y_task.get_shape()[-1].value, activation=self.output_activation) 81 | 82 | return context_cost, task_cost, task_output, context_output 83 | 84 | # Flatten the output tensor to shape features in all examples x output size 85 | # construct an index into that by creating a tensor with the start indices for each example tf.range(0, batch_size) x max_length 86 | # and add the individual sequence lengths to it 87 | # tf.gather() then performs the acutal indexing. 
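# Illustration (added comment, not part of the original code): with batch_size=2,
# max_length=3 and sequence lengths [2, 3], the flat indices are 0*3 + (2-1) = 1 and
# 1*3 + (3-1) = 5, i.e. the last non-padded timestep of each example in the flattened
# (batch_size*max_length, out_size) tensor.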
88 | def last_relevant(self, output, length): 89 | index = tf.range(0, self.batch_size) * self.max_length + (length - 1) 90 | out_size = int(output.get_shape()[2]) 91 | flat = tf.reshape(output, [-1, out_size]) 92 | relevant = tf.gather(flat, index) 93 | return relevant 94 | 95 | # Assume that the sequences are padded with 0 vectors to have shape (batch_size, max_length, feature_length) 96 | 97 | def length(self, sequence): 98 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 99 | length = tf.reduce_sum(used, reduction_indices=1) 100 | length = tf.cast(length, tf.int32) 101 | print length.get_shape() 102 | return length 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /auto/src/model/mcrnn_model_1_lstm.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","util"))) 9 | from tf_utils import fcLayer, createLSTMCell, createGRUCell, applyActivation, predictionLayer, compute_cost 10 | #from predContext import predContext, createHtDict 11 | 12 | class model(object): 13 | 14 | # Task params 15 | is_multi_task = True 16 | secondary_task = "word generation" 17 | primary_task = "classification" 18 | 19 | # Model params 20 | # 0 -- shared; 1 -- context; 2 -- task 21 | fc_activation = "tanh" 22 | context_output_activation = "tanh" 23 | task_output_activation = "softmax" 24 | body_lstm_size = 1024 25 | body_n_layer = 1 26 | context_n_layer = 1 27 | task_n_layer = 1 28 | context_branch_fc = 512 29 | task_branch_fc = 30 30 | 31 | # Data params 32 | n_classes = 2 33 | batch_size = 64 34 | max_length = 52 35 | feature_length = 300 36 | context_dim = 300 37 | task_dim = n_classes 38 | 39 | # Hyper- params 40 | lr = 0.0001 #hp 41 | lr_mod = 1.0 #hp 42 | context_lr = lr_mod*lr 43 | n_epoch = 50 #hp 44 | 45 | def buildModel(self, x, y_context, y_task, is_train, dropout, scope="multiTask"): 46 | 47 | # Assume the input shape is (batch_size, max_length, feature_length) 48 | 49 | #TASK = primary task, CONTEXT = secondary task 50 | 51 | # Create lstm cell for the shared layer 52 | body_lstm_cell, _ = createLSTMCell(self.batch_size, self.body_lstm_size, self.body_n_layer, forget_bias=0.0) 53 | 54 | context_cost = tf.constant(0) 55 | task_cost = tf.constant(0.0, dtype=tf.float32) 56 | 57 | if not self.is_multi_task: context_output = tf.constant(0) 58 | 59 | with tf.variable_scope("shared_lstm"): 60 | body_cell_output, last_body_state = tf.nn.dynamic_rnn(cell = body_lstm_cell, dtype=tf.float32, sequence_length=self.length(x), inputs=x) 61 | 62 | if self.is_multi_task: 63 | with tf.variable_scope("context_branch"): 64 | # Select the last output that is not generated by zero vectors 65 | if self.secondary_task == "missing word": 66 | last_body_output = self.last_relevant(body_cell_output, self.length(body_cell_output)) 67 | # feed the last output to the fc layer and make prediction 68 | with tf.variable_scope("context_fc"): 69 | context_fc_out = fcLayer(x=last_body_output, in_shape=self.body_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, scope="fc1") 70 | with tf.variable_scope("context_pred"): 71 | context_output, context_logits = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, 
out_shape=y_context.get_shape()[-1].value, activation=self.context_output_activation) 72 | context_cost = compute_cost(logit=context_logits, y=y_context, out_type="last_only", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.feature_length, activation=self.context_output_activation) 73 | 74 | if self.secondary_task == "word generation": 75 | context_input = tf.transpose(body_cell_output, [1, 0, 2]) 76 | context_input = tf.reshape(context_input, [-1, self.body_lstm_size]) 77 | context_input_list = tf.split(context_input, self.max_length, 0) 78 | fc_output_list = [] 79 | with tf.variable_scope("context_fc"): 80 | for step in range(self.max_length): 81 | if step > 0: tf.get_variable_scope().reuse_variables() 82 | fc_out = fcLayer(x=context_input_list[step], in_shape=self.body_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, scope="fc1") 83 | fc_output_list.append(tf.expand_dims(fc_out, axis=1)) 84 | context_fc_out = tf.concat(fc_output_list, axis=1) 85 | with tf.variable_scope("context_pred"): 86 | context_output, context_logits = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, out_shape=y_context.get_shape()[-1].value, activation=self.context_output_activation) 87 | context_cost = compute_cost(logit=context_logits, y=y_context, out_type="sequential", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.feature_length,activation=self.context_output_activation) 88 | 89 | 90 | print "Context cost shape: ", context_cost.get_shape() 91 | 92 | with tf.variable_scope("task_branch"): 93 | with tf.variable_scope("task_fc"): 94 | # Select the last output that is not generated by zero vectors 95 | last_body_output = self.last_relevant(body_cell_output, self.length(body_cell_output)) 96 | # feed the last output to the fc layer and make prediction 97 | task_fc_out = fcLayer(x=last_body_output, in_shape=self.body_lstm_size, out_shape=self.task_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, scope="fc2") 98 | task_output, task_logits = predictionLayer(x=task_fc_out, y=y_task, in_shape=self.task_branch_fc, out_shape=y_task.get_shape()[-1].value, activation=self.task_output_activation) 99 | print "Task output shape: ", task_output.get_shape() 100 | task_cost = compute_cost(logit=task_logits, y=y_task, out_type="last_only", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.n_classes,activation=self.task_output_activation) 101 | 102 | return context_cost, task_cost, task_output, context_output 103 | 104 | # Flatten the output tensor to shape features in all examples x output size 105 | # construct an index into that by creating a tensor with the start indices for each example tf.range(0, batch_size) x max_length 106 | # and add the individual sequence lengths to it 107 | # tf.gather() then performs the acutal indexing. 
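# Added note: length() further down infers each example's true length from the zero
# padding; tf.sign(tf.reduce_max(tf.abs(x), 2)) is 1 for any timestep whose feature
# vector is non-zero and 0 for padded timesteps, so summing over time gives the
# unpadded sequence length.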
108 | def last_relevant(self, output, length): 109 | index = tf.range(0, self.batch_size) * self.max_length + (length - 1) 110 | out_size = int(output.get_shape()[2]) 111 | flat = tf.reshape(output, [-1, out_size]) 112 | relevant = tf.gather(flat, index) 113 | return relevant 114 | 115 | # Assume that the sequences are padded with 0 vectors to have shape (batch_size, max_length, feature_length) 116 | 117 | def length(self, sequence): 118 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 119 | length = tf.reduce_sum(used, reduction_indices=1) 120 | length = tf.cast(length, tf.int32) 121 | print length.get_shape() 122 | return length 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /auto/src/model/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | import mcrnn_model 8 | from mcrnn_model import model 9 | 10 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 11 | from tf_utils import fcLayer, createLSTMCell, applyActivation, predictionLayer 12 | from load_batch import get_file_identifiers, get_classes, load_data, get_word2vec, load_batch 13 | 14 | def get_data(data_path): 15 | data_stats = pickle.load(open(expanduser(data_path + "/rt_stats.pkl"))) 16 | max_length, nPos, nNeg, trainPercent, testPercent = data_stats["longest"], data_stats[0], data_stats[1], data_stats['trainPercent'], data_stats['testPercent'] 17 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 18 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 19 | nTest = int(testPercent*nPos) + int(testPercent*nNeg) 20 | nTrain = nPos + nNeg - nTest 21 | 22 | return max_length, nPos, nNeg, trainPercent, testPercent, word2vec_dic, missing_word_dic, nTest, nTrain 23 | 24 | 25 | def trainModel(): 26 | 27 | M = model() 28 | data_path = "~/automatedMTL/data/rotten_tomato" 29 | max_length, nPos, nNeg, trainPercent, testPercent, word2vec_dic, missing_word_dic, nTest, nTrain = get_data(data_path) 30 | 31 | x = tf.placeholder(tf.float32, shape=(None, M.max_length, M.feature_length)) 32 | y_context = tf.placeholder(tf.float32, shape=(None, M.context_dim)) 33 | y_task = tf.placeholder(tf.float32, shape=(None, M.task_dim)) 34 | 35 | optimizer1 = tf.train.AdamOptimizer(learning_rate=M.context_lr) 36 | optimizer2 = tf.train.AdamOptimizer(learning_rate=M.lr) 37 | is_train = tf.placeholder(tf.int32) 38 | n_train_batches = np.ceil(nTrain / M.batch_size).astype(int) 39 | keep_prob = tf.placeholder(tf.float32) 40 | 41 | context_cost, task_cost, task_output, context_output = M.buildModel(x, y_context, y_task, is_train, keep_prob) 42 | train_step1 = optimizer1.minimize(context_cost) 43 | train_step2 = optimizer2.minimize(task_cost) 44 | 45 | # Start running operations on the graph 46 | sess = tf.Session() 47 | sess.run(tf.initialize_all_variables()) 48 | 49 | with sess.as_default(): 50 | for epoch in range(100): 51 | taskCost = 0 52 | contextCost = 0 53 | 54 | all_classes, train_file, test_file = load_data(data_path) 55 | start_idx = 0 56 | for minibatch in range(n_train_batches): 57 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(word2vec_dic, missing_word_dic, M.feature_length, max_length, 
data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, M.batch_size) 58 | start_idx += M.batch_size 59 | 60 | feed_dict = {x: encoded_batch, y_context: batch_missing_word_encoded, y_task: batch_classes, is_train:1, keep_prob:0.5} 61 | 62 | train_step1.run(feed_dict=feed_dict) 63 | context_cost_val, _, _ = sess.run(fetches = [context_cost, task_cost, task_output], feed_dict=feed_dict) 64 | contextCost += context_cost_val 65 | 66 | train_step2.run(feed_dict=feed_dict) 67 | _, task_cost_val, _ = sess.run(fetches = [context_cost, task_cost, task_output], feed_dict=feed_dict) 68 | taskCost += task_cost_val 69 | 70 | #if minibatch !=0 and minibatch % 100 == 0: 71 | print "Minibatch ", minibatch, " Missing Word: ", contextCost , " Classification: ", taskCost 72 | contextCost = 0 73 | taskCost = 0 74 | 75 | start_idx = 0 76 | accuracy = 0 77 | 78 | for i in range(nTest): 79 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(word2vec_dic, missing_word_dic, M.feature_length, max_length, data_path+"/Test/", 0, train_file, test_file, all_classes, start_idx, 1) 80 | start_idx += 1 81 | feed_dict = {x:encoded_batch, y_context: batch_missing_word_encoded, y_task: batch_classes, is_train:0, keep_prob:0.5} 82 | task_output_val = sess.run(fetches = [task_output], feed_dict=feed_dict) 83 | accuracy += is_correct(batch_classes, task_output_val) 84 | print "The accuracy in epoch ", epoch, " is: ", accuracy * 1.0 / nTest 85 | 86 | def is_correct(target, output): 87 | prediction = np.argmax(output) 88 | target = np.argmax(target) 89 | #print prediction, target 90 | return prediction == target 91 | 92 | 93 | if __name__ == "__main__": 94 | trainModel() 95 | -------------------------------------------------------------------------------- /auto/src/util/1q: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_1_lstm import model 7 | from train_1_lstm import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | LR = [0.001] 11 | LR_MOD = [1.0] #4 12 | N_EPOCHS = [50] # 30 13 | N_EXPERIMENTS = [1] # 5 14 | KEEP_PROB_VAL = [1.0] 15 | CONTEXT_FC = [30] #1024 on AWS 16 | #3*3*3*30*5/60=67.5 hrs. 17 | experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. 
Learning rates: " 18 | for lr in LR: 19 | experiment = experiment + str(lr) + ", " 20 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 21 | for prob in KEEP_PROB_VAL: 22 | experiment = experiment + str(prob) + ", " 23 | experiment = experiment + " Context_fc: " 24 | for fc in CONTEXT_FC: 25 | experiment = experiment + str(fc) + ", " 26 | 27 | 28 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 29 | M= model() 30 | 31 | print M.is_multi_task 32 | if lr_mod == 0.0: 33 | M.is_multi_task = False 34 | else: 35 | M.is_multi_task = True 36 | print M.is_multi_task 37 | 38 | print M.lr 39 | M.lr = lr 40 | print M.lr 41 | 42 | print M.lr_mod 43 | M.lr_mod = lr_mod 44 | print M.lr_mod 45 | 46 | print M.n_epoch 47 | M.n_epoch = n_epoch 48 | print M.n_epoch 49 | 50 | print M.context_branch_fc 51 | M.context_branch_fc = context_fc 52 | print M.context_branch_fc 53 | 54 | maxAccList = []; 55 | for i in range(n_experiments): 56 | accuracyVec = TM(M, keep_prob_val)#INSERT CODE TO run for n epochs 57 | maxAcc = numpy.max(accuracyVec) 58 | maxAccList.append(maxAcc) 59 | expVal = numpy.mean(maxAccList) 60 | string_result = "lr = " + str(lr) + " lr_mod = "+ "self-annealing" + " avg_acc = " + str(expVal)+'\n' 61 | f1.write("") 62 | f1.write(string_result) 63 | f1.flush() 64 | print string_result 65 | 66 | 67 | 68 | f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 69 | f1.write(experiment) 70 | f1.write("\n") 71 | f1.flush() 72 | for lr in LR: 73 | for lr_mod in LR_MOD: 74 | for n_epoch in N_EPOCHS: 75 | for n_experiments in N_EXPERIMENTS: 76 | for keep_prob_val in KEEP_PROB_VAL: 77 | for context_fc in CONTEXT_FC: 78 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 79 | f1.close() 80 | -------------------------------------------------------------------------------- /auto/src/util/hps.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_1_lstm import model 7 | from train_1_lstm import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | LR = [0.01] 11 | LR_MOD = [1.0] #4 12 | N_EPOCHS = [30] # 30 13 | N_EXPERIMENTS = [10] # 5 14 | KEEP_PROB_VAL = [1.0] 15 | CONTEXT_FC = [30] #1024 on AWS 16 | #3*3*3*30*5/60=67.5 hrs. 17 | experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. 
Learning rates: " 18 | for lr in LR: 19 | experiment = experiment + str(lr) + ", " 20 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 21 | for prob in KEEP_PROB_VAL: 22 | experiment = experiment + str(prob) + ", " 23 | experiment = experiment + " Context_fc: " 24 | for fc in CONTEXT_FC: 25 | experiment = experiment + str(fc) + ", " 26 | 27 | epoch_ratio_list = [(0.1, 1.0), (0.5, 0.5), (1.0, 0.0)] 28 | 29 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 30 | M= model() 31 | 32 | print M.is_multi_task 33 | if lr_mod == 0.0: 34 | M.is_multi_task = False 35 | else: 36 | M.is_multi_task = True 37 | print M.is_multi_task 38 | 39 | print M.lr 40 | M.lr = lr 41 | print M.lr 42 | 43 | print M.lr_mod 44 | M.lr_mod = lr_mod 45 | print M.lr_mod 46 | 47 | print M.n_epoch 48 | M.n_epoch = n_epoch 49 | print M.n_epoch 50 | 51 | print M.context_branch_fc 52 | M.context_branch_fc = context_fc 53 | print M.context_branch_fc 54 | 55 | maxAccList = []; 56 | for i in range(n_experiments): 57 | accuracyVec = TM(M, keep_prob_val, epoch_ratio_list)#INSERT CODE TO run for n epochs 58 | maxAcc = numpy.max(accuracyVec) 59 | maxAccList.append(maxAcc) 60 | expVal = numpy.mean(maxAccList) 61 | string_result = "lr = " + str(lr) + " lr_mod = "+ "self-annealing" + " avg_acc = " + str(expVal)+'\n' 62 | f1.write("") 63 | f1.write(string_result) 64 | f1.flush() 65 | print string_result 66 | 67 | 68 | 69 | f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 70 | f1.write(experiment) 71 | f1.write("\n") 72 | f1.flush() 73 | for lr in LR: 74 | for lr_mod in LR_MOD: 75 | for n_epoch in N_EPOCHS: 76 | for n_experiments in N_EXPERIMENTS: 77 | for keep_prob_val in KEEP_PROB_VAL: 78 | for context_fc in CONTEXT_FC: 79 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 80 | f1.close() 81 | -------------------------------------------------------------------------------- /auto/src/util/hps2.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_gen2 import model 7 | from train_gen2 import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | 11 | LR = [0.001] 12 | LR_MOD = [0.0,1.0] #4 13 | N_EPOCHS = [30] # 30 14 | N_EXPERIMENTS = [5] # 5 15 | KEEP_PROB_VAL = [1.0] 16 | CONTEXT_FC = [128] 17 | #3*3*3*30*5/60=67.5 hrs. 18 | #experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. Learning rates: " 19 | 20 | dataset = "ag_news" # or "rotten_tomato" 21 | experiment = "N_epochs = 50. N_exp = 10. lstm: 512 for both. hidden fc: 512 for both. dropout: none." 
22 | 23 | for lr in LR: 24 | experiment = experiment + str(lr) + ", " 25 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 26 | for prob in KEEP_PROB_VAL: 27 | experiment = experiment + str(prob) + ", " 28 | experiment = experiment + " Context_fc: " 29 | for fc in CONTEXT_FC: 30 | experiment = experiment + str(fc) + ", " 31 | 32 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 33 | M= model() 34 | 35 | print M.dataset 36 | M.dataset = dataset 37 | print M.dataset 38 | 39 | print M.is_multi_task 40 | if lr_mod == 0.0: 41 | M.is_multi_task = False 42 | else: 43 | M.is_multi_task = True 44 | print M.is_multi_task 45 | 46 | print M.lr 47 | M.lr = lr 48 | print M.lr 49 | 50 | print M.lr_mod 51 | M.lr_mod = lr_mod 52 | print M.lr_mod 53 | 54 | print M.n_epoch 55 | M.n_epoch = n_epoch 56 | print M.n_epoch 57 | 58 | print M.context_branch_fc 59 | M.context_branch_fc = context_fc 60 | print M.context_branch_fc 61 | 62 | maxAccList = [] 63 | testResult = [] 64 | for i in range(n_experiments): 65 | accuracyVec, testAcc = TM(M)#INSERT CODE TO run for n epochs 66 | maxAcc = numpy.max(accuracyVec) 67 | maxAccList.append(maxAcc) 68 | maxIdx = numpy.argmax(accuracyVec) 69 | testResult.append(testAcc[maxIdx]) 70 | 71 | expVal = numpy.mean(maxAccList) 72 | testVal = numpy.mean(testResult) 73 | if lr_mod == 0.0: 74 | string_result = "lr = " + str(lr) + " lr_mod = "+ "none (lstm)" + " avg_val_acc = " + str(expVal) + " avg_test_acc = " + str(testVal) + '\n' 75 | else: 76 | string_result = "lr = " + str(lr) + " lr_mod = "+ "annealing" + " avg_val_acc = " + str(expVal) + " avg_test_acc = " + str(testVal) + '\n' 77 | 78 | f1.write("") 79 | f1.write(string_result) 80 | f1.flush() 81 | print string_result 82 | 83 | #f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 84 | f1 = open(expanduser("~/tweetnet/logs/hps_mrnn_ag_news.log"), "w+") 85 | f1.write(experiment) 86 | f1.write("\n") 87 | f1.flush() 88 | for lr in LR: 89 | for lr_mod in LR_MOD: 90 | for n_epoch in N_EPOCHS: 91 | for n_experiments in N_EXPERIMENTS: 92 | for keep_prob_val in KEEP_PROB_VAL: 93 | for context_fc in CONTEXT_FC: 94 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 95 | f1.close() 96 | -------------------------------------------------------------------------------- /auto/src/util/hps_script.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/auto/src/util/hps_script.sh -------------------------------------------------------------------------------- /auto/src/util/load_batch.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | from reformat import reformat_data 7 | from load_util import class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | else: 
27 | identifiers = test_file 28 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 29 | 30 | batch_text = [] 31 | for idx in batch_identifiers: 32 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 33 | batch_text.append(text.read()) 34 | 35 | return batch_identifiers, batch_text 36 | 37 | def load_data(data_path): 38 | 39 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 40 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 41 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 42 | return all_classes, train_file, test_file 43 | 44 | def get_word2vec(data_path): 45 | # TO DO: download word2vec! 46 | word2vec_dic = pickle.load(open(expanduser(data_path))) 47 | return word2vec_dic 48 | 49 | # Unknown symbols are UNK 50 | # Missing word symbols are zeros 51 | # EOS are EOS 52 | 53 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 54 | sequence_by_word = sequence.split(" ") 55 | encoded_seq = np.zeros((max_len, encode_dim)) 56 | for i in range(len(sequence_by_word)): 57 | word = sequence_by_word[i] 58 | if word2vec_dic.get(word) == None: 59 | encoded_seq[i, :] = word2vec_dic["UNK"] 60 | continue 61 | else: 62 | if word != "REMOVE": 63 | encoded_seq[i, :] = word2vec_dic[word] 64 | else: 65 | encoded_seq[i, :] = word2vec_dic["_"] 66 | return encoded_seq, len(sequence_by_word) 67 | 68 | def encode_sequence_generation(word2vec_dic, sequence, encode_dim, max_len): 69 | sequence_by_word = sequence.split(" ") 70 | encoded_seq = np.zeros((max_len, encode_dim)) 71 | for i in range(1, len(sequence_by_word)): 72 | word = sequence_by_word[i] 73 | if word2vec_dic.get(word) == None: 74 | encoded_seq[i-1, :] = word2vec_dic["UNK"] 75 | else: 76 | encoded_seq[i-1, :] = word2vec_dic[word] 77 | encoded_seq[len(sequence_by_word)-1, :] = word2vec_dic["EOS"] 78 | context_target = sequence_by_word[1:len(sequence_by_word)] + ["EOS"] 79 | 80 | return encoded_seq, context_target, len(sequence_by_word) 81 | 82 | def oneHot(nclasses, idx): 83 | one_hot = np.zeros((nclasses)) 84 | one_hot[idx] = 1 85 | return one_hot 86 | 87 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size, automated_task): 88 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size) 89 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 90 | batch_classes = np.zeros((batch_size, n_classes)) 91 | batch_context_encoded = np.zeros((batch_size, encode_dim)) 92 | if automated_task == "word generation": batch_context_encoded = np.zeros((batch_size, max_len, encode_dim)) 93 | batch_context = [] 94 | batch_length = [] 95 | for i in range(batch_size): 96 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 97 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 98 | if automated_task != "word generation": 99 | batch_context_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 100 | batch_context.append(missing_word_dic[batch_identifiers[i]]) 101 | else: 102 | batch_context_encoded[i, :, :], context_target, text_length = encode_sequence_generation(word2vec_dic, batch_text[i], encode_dim, max_len) 103 | batch_context.append(context_target) 104 | batch_length.append(text_length) 105 | return encoded_batch, batch_classes, batch_context_encoded, 
batch_context, batch_identifiers, batch_text, batch_length 106 | 107 | if __name__ == "__main__": 108 | data_path = "~/automatedMTL/data/rotten_tomato" 109 | max_length = reformat_data(data_path, False) 110 | class_look_up(data_path) 111 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 112 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['trainPercent'], data_stats['testPercent'] 113 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 114 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 115 | for epoch in range(3): 116 | dic = {} 117 | all_classes, train_file, test_file = load_data(data_path) 118 | start_idx = 0 119 | for minibatch in range(73): 120 | encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, missing_word_dic, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 128, automated_task="word generation") 121 | start_idx += 128 122 | print batch_context 123 | for i in batch_identifier: 124 | if dic.get(i) != None: print "Wrong" 125 | else: dic[i] = 1 126 | -------------------------------------------------------------------------------- /auto/src/util/load_batch2.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | from reformat import reformat_data 7 | from load_util import class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | else: 27 | identifiers = test_file 28 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 29 | 30 | batch_text = [] 31 | for idx in batch_identifiers: 32 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 33 | batch_text.append(text.read()) 34 | 35 | return batch_identifiers, batch_text 36 | 37 | def load_data(data_path): 38 | 39 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 40 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 41 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 42 | return all_classes, train_file, test_file 43 | 44 | def get_word2vec(data_path): 45 | # TO DO: download word2vec! 
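# Added note: as in load_batch.py, this pickle is assumed to map tokens to 300-d vectors
# and to contain the special "UNK" and "_" entries used by encode_sequence() below.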
46 | word2vec_dic = pickle.load(open(expanduser(data_path))) 47 | return word2vec_dic 48 | 49 | # Unknown symbols are UNK 50 | # Missing word symbols are zeros 51 | # EOS are EOS 52 | 53 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 54 | sequence_by_word = sequence.split(" ") 55 | encoded_seq = np.zeros((max_len, encode_dim)) 56 | for i in range(len(sequence_by_word)): 57 | word = sequence_by_word[i] 58 | if word2vec_dic.get(word) == None: 59 | encoded_seq[i, :] = word2vec_dic["UNK"] 60 | else: 61 | if word != "REMOVE": 62 | encoded_seq[i, :] = word2vec_dic[word] 63 | else: 64 | encoded_seq[i, :] = word2vec_dic["_"] 65 | return encoded_seq, len(sequence_by_word) 66 | 67 | def oneHot(nclasses, idx): 68 | one_hot = np.zeros((nclasses)) 69 | one_hot[idx] = 1 70 | return one_hot 71 | 72 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 73 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size) 74 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 75 | batch_classes = np.zeros((batch_size, 2)) 76 | batch_missing_word_encoded = np.zeros((batch_size, encode_dim)) 77 | batch_missing_word = [] 78 | batch_length = [] 79 | for i in range(batch_size): 80 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 81 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 82 | batch_missing_word_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 83 | batch_missing_word.append(missing_word_dic[batch_identifiers[i]]) 84 | batch_length.append(text_length) 85 | return encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifiers, batch_text, batch_length 86 | 87 | if __name__ == "__main__": 88 | data_path = "~/automatedMTL/data/rotten_tomato" 89 | max_length = reformat_data(data_path, False) 90 | class_look_up(data_path) 91 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 92 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['trainPercent'], data_stats['testPercent'] 93 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 94 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 95 | for epoch in range(3): 96 | dic = {} 97 | all_classes, train_file, test_file = load_data(data_path) 98 | start_idx = 0 99 | for minibatch in range(73): 100 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, missing_word_dic, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 128) 101 | start_idx += 128 102 | #print batch_text 103 | #print batch_missing_word 104 | #print batch_length 105 | for i in batch_identifier: 106 | if dic.get(i) != None: print "Wrong" 107 | else: dic[i] = 1 108 | -------------------------------------------------------------------------------- /auto/src/util/load_batch_val.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | #from reformat import reformat_data 7 | from load_util import class_look_up 8 | 
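# Added note: this module mirrors load_batch.py but adds a validation split;
# load_data() also reads validation_classes.txt, and get_text_by_batch()/load_batch()
# take is_val/val_file arguments so batches can be drawn from Train, Validation or Test.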
9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, is_val, train_file, test_file, val_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | elif is_val: 27 | identifiers = val_file 28 | else: 29 | identifiers = test_file 30 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 31 | 32 | batch_text = [] 33 | for idx in batch_identifiers: 34 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 35 | batch_text.append(text.read()) 36 | 37 | return batch_identifiers, batch_text 38 | 39 | def load_data(data_path): 40 | 41 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 42 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 43 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 44 | val_file = get_file_identifiers(data_path + "/validation_classes.txt") 45 | return all_classes, train_file, test_file, val_file 46 | 47 | def get_word2vec(data_path): 48 | # TO DO: download word2vec! 49 | word2vec_dic = pickle.load(open(expanduser(data_path))) 50 | return word2vec_dic 51 | 52 | # Unknown symbols are UNK 53 | # Missing word symbols are zeros 54 | # EOS are EOS 55 | 56 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 57 | sequence_by_word = sequence.split(" ") 58 | encoded_seq = np.zeros((max_len, encode_dim)) 59 | for i in range(len(sequence_by_word)): 60 | word = sequence_by_word[i] 61 | if word2vec_dic.get(word) == None: 62 | encoded_seq[i, :] = word2vec_dic["UNK"] 63 | else: 64 | if word != "REMOVE": 65 | encoded_seq[i, :] = word2vec_dic[word] 66 | else: 67 | encoded_seq[i, :] = word2vec_dic["_"] 68 | return encoded_seq, len(sequence_by_word) 69 | 70 | def encode_sequence_generation(word2vec_dic, sequence, encode_dim, max_len): 71 | sequence_by_word = sequence.split(" ") 72 | encoded_seq = np.zeros((max_len, encode_dim)) 73 | for i in range(1, len(sequence_by_word)): 74 | word = sequence_by_word[i] 75 | if word2vec_dic.get(word) == None: 76 | encoded_seq[i-1, :] = word2vec_dic["UNK"] 77 | else: 78 | encoded_seq[i-1, :] = word2vec_dic[word] 79 | encoded_seq[len(sequence_by_word)-1, :] = word2vec_dic["EOS"] 80 | context_target = sequence_by_word[1:len(sequence_by_word)] + ["EOS"] 81 | 82 | return encoded_seq, context_target, len(sequence_by_word) 83 | 84 | def oneHot(nclasses, idx): 85 | one_hot = np.zeros((nclasses)) 86 | one_hot[idx-1] = 1 87 | return one_hot 88 | 89 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, is_val, train_file, test_file, val_file,all_classes, start_idx, batch_size, automated_task): 90 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train,is_val, train_file, test_file, val_file, all_classes, start_idx, batch_size) 91 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 92 | batch_classes = np.zeros((batch_size, n_classes)) 93 | batch_context_encoded = np.zeros((batch_size, encode_dim)) 94 | if automated_task == "word generation": batch_context_encoded = np.zeros((batch_size, max_len, encode_dim)) 95 | batch_context = [] 96 | batch_length = 
[] 97 | for i in range(batch_size): 98 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 99 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 100 | if automated_task != "word generation": 101 | batch_context_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 102 | batch_context.append(missing_word_dic[batch_identifiers[i]]) 103 | else: 104 | batch_context_encoded[i, :, :], context_target, text_length = encode_sequence_generation(word2vec_dic, batch_text[i], encode_dim, max_len) 105 | batch_context.append(context_target) 106 | batch_length.append(text_length) 107 | return encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifiers, batch_text, batch_length 108 | 109 | if __name__ == "__main__": 110 | data_path = "~/tweetnet/automatedMTL/data/ag_news_csv" 111 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 112 | n_classes, n_data, n_data_per_class, n_train_data, n_test_data, max_length = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['n_train_data'], data_stats['n_test_data'], data_stats['max_length'] 113 | print n_classes, n_data, n_data_per_class 114 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 115 | for epoch in range(3): 116 | dic = {} 117 | all_classes, train_file, test_file, val_file = load_data(data_path) 118 | start_idx = 0 119 | for minibatch in range(3): 120 | encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, {}, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 1, automated_task="word generation") 121 | start_idx += 1 122 | print batch_text 123 | print batch_classes 124 | print batch_context 125 | print encoded_batch.shape 126 | print batch_context_encoded.shape 127 | for i in batch_identifier: 128 | if dic.get(i) != None: print "Wrong" 129 | else: dic[i] = 1 130 | -------------------------------------------------------------------------------- /auto/src/util/load_util.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import os 3 | from os.path import expanduser 4 | from os.path import basename 5 | 6 | def class_look_up(data_path): 7 | out_train = open(expanduser(data_path+"/train_classes.txt"), "w") 8 | out_test = open(expanduser(data_path+"/test_classes.txt"), "w") 9 | train_folders = os.listdir(expanduser(data_path+"/Train/")) 10 | test_folders = os.listdir(expanduser(data_path+"/Test/")) 11 | 12 | dict = {} 13 | cnt = 0 14 | file2class_dict = {} 15 | 16 | for i in train_folders: 17 | if i[0] != '.': 18 | if dict.get(i) == None: 19 | dict[i] = cnt 20 | cnt += 1 21 | files = os.listdir(expanduser(data_path+"/Train/"+i)) 22 | for f in files: 23 | if f[0] == ".": continue 24 | out_train.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 25 | out_train.write("\n") 26 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 27 | 28 | for i in test_folders: 29 | if i[0] != '.': 30 | files = os.listdir(expanduser(data_path+"/Test/"+i)) 31 | for f in files: 32 | if f[0] == ".": continue 33 | out_test.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 34 | out_test.write("\n") 35 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 36 | out_train.close() 37 | out_test.close() 38 | pickle.dump(file2class_dict, open(expanduser(data_path+"/classes.pkl"), "w")) 39 | #print 
file2class_dict 40 | #print len(file2class_dict) 41 | 42 | if __name__ == "__main__": 43 | class_look_up("~/automatedMTL/data/rotten_tomato") 44 | -------------------------------------------------------------------------------- /auto/src/util/reformat.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | from os.path import expanduser 5 | from path import Path 6 | import random 7 | from stop_words import get_stop_words 8 | 9 | # The dataset has 5331 positive and 5331 negative reviews 10 | # According to prev work, split inot 90% training (4998) and 10% testing (533) 11 | 12 | word2vec_dic = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"))) 13 | stop_words = get_stop_words('english') 14 | length = [] 15 | missing_word_dic = {} 16 | 17 | def get_dataset(dataset_path): 18 | data_stats = pickle.load(open(expanduser(dataset_path + "/stats.pkl"))) 19 | all_example = {} 20 | all_class_folders = os.listdir(expanduser(dataset_path+"/all_data/")) 21 | for class_folder in all_class_folders: 22 | if class_folder[0] != ".": 23 | all_example[class_folder] = open(expanduser(dataset_path+'/all_data/' + class_folder)) 24 | return all_example, data_stats 25 | 26 | 27 | def replace_missing_word(data_by_word): 28 | 29 | new_data = [] 30 | for i in range(len(data_by_word)): 31 | word = data_by_word[i] 32 | if word in stop_words and word2vec_dic.get(word) == None: 33 | continue 34 | else: 35 | new_data.append(word) 36 | 37 | idx = range(0, len(new_data)) 38 | random.shuffle(idx) 39 | removed = "" 40 | 41 | if len(new_data) == 1 and word2vec_dic.get(new_data[0]) != None: 42 | return new_data + ["-"], new_data[-1] 43 | elif len(new_data) == 1 and word2vec_dic.get(new_data[0]) == None: 44 | return [], "" 45 | 46 | valid = False 47 | for i in idx: 48 | word = new_data[i] 49 | if word not in stop_words and word2vec_dic.get(word)!= None: 50 | removed = new_data[i] 51 | data_by_word[i] = "REMOVE" 52 | valid = True 53 | break 54 | if not valid: 55 | print data_by_word 56 | return [], "" 57 | return data_by_word, removed 58 | 59 | 60 | def process_data(data, is_missing_word): 61 | 62 | d = list(data) 63 | for i in range(len(d)): 64 | if ord(d[i]) > ord('z') or ord(d[i]) < ord('a') and d[i] != "'": 65 | d[i] = " " 66 | string = "".join(d) 67 | 68 | if not is_missing_word: 69 | string = " ".join(string.split()) 70 | string = string + " " + "EOS" 71 | length.append(len(string.split())) 72 | return string, "_" 73 | 74 | string, removed = replace_missing_word(string.split()) 75 | if string == []: return [], "" 76 | string = " ".join(string) 77 | string = string + " " + "EOS" 78 | length.append(len(string.split())) 79 | return string, removed 80 | 81 | def reformat_data(dataset_path, is_missing_word): 82 | 83 | # Clean up the directory in train and test folder 84 | d_train, d_test = Path(expanduser(dataset_path+"/Train")), Path(expanduser(dataset_path+"/Test")) 85 | train_files, test_files = d_train.walk("*.txt"), d_test.walk("*.txt") 86 | for f in train_files: 87 | f.remove() 88 | for f in test_files: 89 | f.remove() 90 | 91 | all_example, data_stats = get_dataset(dataset_path) 92 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'], data_stats['trainPercent'], data_stats['testPercent'] 93 | all_idx = range(0,n_data) 94 | random.shuffle(all_idx) 95 | test_idx = all_idx[0:int(testPercent*n_data)] 96 | identifier = 0 97 
| 98 | for one_class in all_example.keys(): 99 | for p in all_example[one_class].readlines(): 100 | if p == "\n": 101 | continue 102 | else: 103 | if identifier in test_idx: 104 | file = open(expanduser(dataset_path + "/Test/" + one_class[0:len(one_class)-4] + "/" + str(identifier) + ".txt"), "w") 105 | else: 106 | file = open(expanduser(dataset_path + "/Train/" + one_class[0:len(one_class)-4] + "/" + str(identifier) + ".txt"), "w") 107 | 108 | string, removed = process_data(p, is_missing_word) 109 | if string != []: 110 | file.write(string) 111 | missing_word_dic[identifier] = removed 112 | identifier += 1 113 | file.close() 114 | else: print p 115 | pickle.dump(missing_word_dic, open(expanduser(dataset_path + "/missing_word_dic.pkl"),"w")) 116 | return sorted(length)[-1] 117 | 118 | if __name__ == "__main__": 119 | print reformat_data("~/automatedMTL/data/rotten_tomato", False) 120 | -------------------------------------------------------------------------------- /auto/src/util/tf_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createGRUCell(batch_size, lstm_size): 21 | gru_cell = tf.contrib.rnn.GRUCell(num_units=lstm_size, activation=tf.tanh) 22 | state=gru_cell.zero_state(batch_size, tf.float32) 23 | 24 | return gru_cell, state 25 | 26 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 27 | 28 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 29 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 30 | state = lstm_cell.zero_state(batch_size, tf.float32) 31 | 32 | return lstm_cell, state 33 | 34 | def applyActivation(x, activation): 35 | 36 | if activation == "tanh": 37 | return tf.nn.tanh(x) 38 | elif activation == "relu": 39 | return tf.nn.relu(x) 40 | elif activation == "sigmoid": 41 | return tf.nn.sigmoid(x) 42 | elif activation == "relu6": 43 | return tf.nn.relu6(x) 44 | elif activation == "softmax": 45 | return tf.nn.softmax(x) 46 | else: return None 47 | 48 | def length(sequence): 49 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 50 | length = tf.reduce_sum(used, reduction_indices=1) 51 | length = tf.cast(length, tf.int32) 52 | return length 53 | 54 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 55 | 56 | x = tf.reshape(x, [-1, in_shape]) 57 | 58 | with tf.variable_scope(scope): 59 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) 60 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 61 | logits = tf.add(tf.matmul(x, w), b) 62 | output = applyActivation(logits, activation) 63 | return output, logits 64 | 65 | def compute_cost(logit, y, 
out_type, max_length, batch_size, embed_dim, activation): 66 | if out_type=="last_only": 67 | cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit) 68 | cost = tf.reduce_mean(cost, reduction_indices=1) 69 | else: 70 | pred_out = applyActivation(logit, activation) 71 | pred_out = tf.reshape(pred_out, [batch_size, max_length, embed_dim]) 72 | mse = tf.reduce_mean(tf.square(tf.subtract(y, pred_out)), reduction_indices=2) 73 | mask = tf.sign(tf.reduce_max(tf.abs(y), reduction_indices=2)) 74 | mse *= mask 75 | mse = tf.reduce_sum(mse, reduction_indices=1) 76 | mse /= tf.cast(length(y), tf.float32) 77 | cost = mse 78 | cost = tf.reduce_mean(cost, reduction_indices=0) 79 | print "final cost shape: ", cost.get_shape() 80 | return cost 81 | -------------------------------------------------------------------------------- /auto/src/util/tf_utils_old.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | else: return None 39 | 40 | 41 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 42 | 43 | x = tf.reshape(x, [-1, in_shape]) 44 | 45 | with tf.variable_scope(scope): 46 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 47 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 48 | logits = tf.add(tf.matmul(x, w), b) 49 | output = applyActivation(logits, activation) 50 | # Compute the mean-squared-error 51 | cost = tf.reduce_mean(tf.square(tf.subtract(y , output))) 52 | 53 | return cost, output 54 | -------------------------------------------------------------------------------- /auto/src/util/tf_utils_reg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, reg_const, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, 
initializer=tf.truncated_normal_initializer(stddev=1e-2), regularizer=tf.contrib.layers.l2_regularizer(reg_const)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | elif activation == "softmax": 39 | return tf.nn.softmax(x) 40 | else: return None 41 | 42 | def length(sequence): 43 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 44 | length = tf.reduce_sum(used, reduction_indices=1) 45 | length = tf.cast(length, tf.int32) 46 | return length 47 | 48 | def predictionLayer(x, y, in_shape, out_shape, activation, reg_const, scope="prediction"): 49 | 50 | x = tf.reshape(x, [-1, in_shape]) 51 | 52 | with tf.variable_scope(scope): 53 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2), regularizer=tf.contrib.layers.l2_regularizer(reg_const)) 54 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 55 | logits = tf.add(tf.matmul(x, w), b) 56 | output = applyActivation(logits, activation) 57 | return output, logits 58 | 59 | def compute_cost(logit, y, out_type, max_length, batch_size, embed_dim, activation): 60 | if out_type=="last_only": 61 | cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit) 62 | cost = tf.reduce_mean(cost, reduction_indices=1) 63 | else: 64 | pred_out = applyActivation(logit, activation) 65 | pred_out = tf.reshape(pred_out, [batch_size, max_length, embed_dim]) 66 | mse = tf.reduce_mean(tf.square(tf.subtract(y, pred_out)), reduction_indices=2) 67 | mask = tf.sign(tf.reduce_max(tf.abs(y), reduction_indices=2)) 68 | mse *= mask 69 | mse = tf.reduce_sum(mse, reduction_indices=1) 70 | mse /= tf.cast(length(y), tf.float32) 71 | cost = mse 72 | cost = tf.reduce_mean(cost, reduction_indices=0) 73 | print "final cost shape: ", cost.get_shape() 74 | return cost 75 | -------------------------------------------------------------------------------- /auto/src_final/model/mcrnn_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","util"))) 9 | from tf_utils import fcLayer, createLSTMCell, applyActivation, predictionLayer 10 | #from predContext import predContext, createHtDict 11 | 12 | class model(object): 13 | 14 | # Model params 15 | # 0 -- shared; 1 -- context; 2 -- task 16 | fc_activation = "tanh" 17 | 
output_activation = "tanh" 18 | dropout = 0.0 19 | body_lstm_size = 128 20 | context_lstm_size = 128 21 | task_lstm_size = 128 22 | body_n_layer = 1 23 | context_n_layer = 1 24 | task_n_layer = 1 25 | context_branch_fc = 512 26 | task_branch_fc = 512 27 | 28 | # Data params 29 | batch_size = 128 30 | max_length = 52 31 | feature_length = 300 32 | context_dim = 300 33 | task_dim = 2 34 | 35 | # Hyper- params 36 | lr = 0.001 37 | context_lr = lr 38 | n_epoch = 500 39 | topN = 4 40 | keep_prob_val = 1.0 41 | 42 | def buildModel(self, x, y_context, y_task, is_train, dropout, scope="multiTask"): 43 | 44 | # Assume the input shape is (batch_size, max_length, feature_length) 45 | 46 | #TASK = primary task, CONTEXT = secondary task 47 | 48 | # Create lstm cell for the shared layer 49 | body_lstm_cell, _ = createLSTMCell(self.batch_size, self.body_lstm_size, self.body_n_layer, forget_bias=0.0) 50 | # Create lstm cell for branch 1 51 | context_lstm_cell, _ = createLSTMCell(self.batch_size, self.context_lstm_size, self.context_n_layer, forget_bias=0.0) 52 | # Create lstm cells for branch 2 53 | task_lstm_cell, _ = createLSTMCell(self.batch_size, self.task_lstm_size, self.task_n_layer, forget_bias=0.0) 54 | 55 | context_cost = tf.constant(0) 56 | task_cost = tf.constant(0) 57 | 58 | with tf.variable_scope("shared_lstm"): 59 | body_cell_output, last_body_state = tf.nn.dynamic_rnn(cell = body_lstm_cell, dtype=tf.float32, sequence_length=self.length(x), inputs=x) 60 | 61 | with tf.variable_scope("context_branch"): 62 | context_cell_output, last_context_state = tf.nn.dynamic_rnn(cell = context_lstm_cell, dtype=tf.float32, sequence_length=self.length(body_cell_output), inputs=body_cell_output) 63 | 64 | # The output from LSTMs will be (batch_size, max_length, out_size) 65 | with tf.variable_scope("context_fc"): 66 | # Select the last output that is not generated by zero vectors 67 | last_context_output = self.last_relevant(context_cell_output, self.length(context_cell_output)) 68 | # feed the last output to the fc layer and make prediction 69 | context_fc_out = fcLayer(x=last_context_output, in_shape=self.context_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=self.dropout, is_train=is_train, scope="fc1") 70 | context_cost, context_output = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, out_shape=y_context.get_shape()[-1].value, activation=self.output_activation) 71 | 72 | with tf.variable_scope("task_branch"): 73 | task_cell_output, last_task_state = tf.nn.dynamic_rnn(cell = task_lstm_cell, dtype=tf.float32, sequence_length=self.length(body_cell_output), inputs=body_cell_output) 74 | 75 | with tf.variable_scope("task_fc"): 76 | # Select the last output that is not generated by zero vectors 77 | last_task_output = self.last_relevant(task_cell_output, self.length(task_cell_output)) 78 | # feed the last output to the fc layer and make prediction 79 | task_fc_out = fcLayer(x=last_task_output, in_shape=self.task_lstm_size, out_shape=self.task_branch_fc, activation=self.fc_activation, dropout=self.dropout, is_train=is_train, scope="fc2") 80 | task_cost, task_output = predictionLayer(x=task_fc_out, y=y_task, in_shape=self.context_branch_fc, out_shape=y_task.get_shape()[-1].value, activation=self.output_activation) 81 | 82 | return context_cost, task_cost, task_output, context_output 83 | 84 | # Flatten the output tensor to shape features in all examples x output size 85 | # construct an index into that by creating a tensor with the start 
indices for each example tf.range(0, batch_size) x max_length 86 | # and add the individual sequence lengths to it 87 | # tf.gather() then performs the acutal indexing. 88 | def last_relevant(self, output, length): 89 | index = tf.range(0, self.batch_size) * self.max_length + (length - 1) 90 | out_size = int(output.get_shape()[2]) 91 | flat = tf.reshape(output, [-1, out_size]) 92 | relevant = tf.gather(flat, index) 93 | return relevant 94 | 95 | # Assume that the sequences are padded with 0 vectors to have shape (batch_size, max_length, feature_length) 96 | 97 | def length(self, sequence): 98 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 99 | length = tf.reduce_sum(used, reduction_indices=1) 100 | length = tf.cast(length, tf.int32) 101 | print length.get_shape() 102 | return length 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /auto/src_final/model/mcrnn_model_1_lstm.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","util"))) 9 | from tf_utils import fcLayer, createLSTMCell, createGRUCell, applyActivation, predictionLayer, compute_cost 10 | #from predContext import predContext, createHtDict 11 | 12 | class model(object): 13 | 14 | # Task params 15 | is_multi_task = True 16 | secondary_task = "word generation" 17 | primary_task = "classification" 18 | 19 | # Model params 20 | # 0 -- shared; 1 -- context; 2 -- task 21 | fc_activation = "tanh" 22 | context_output_activation = "tanh" 23 | task_output_activation = "softmax" 24 | body_lstm_size = 1024 25 | body_n_layer = 1 26 | context_n_layer = 1 27 | task_n_layer = 1 28 | context_branch_fc = 512 29 | task_branch_fc = 30 30 | 31 | # Data params 32 | n_classes = 2 33 | batch_size = 64 34 | max_length = 52 35 | feature_length = 300 36 | context_dim = 300 37 | task_dim = n_classes 38 | 39 | # Hyper- params 40 | lr = 0.0001 #hp 41 | lr_mod = 1.0 #hp 42 | context_lr = lr_mod*lr 43 | n_epoch = 50 #hp 44 | 45 | def buildModel(self, x, y_context, y_task, is_train, dropout, scope="multiTask"): 46 | 47 | # Assume the input shape is (batch_size, max_length, feature_length) 48 | 49 | #TASK = primary task, CONTEXT = secondary task 50 | 51 | # Create lstm cell for the shared layer 52 | body_lstm_cell, _ = createLSTMCell(self.batch_size, self.body_lstm_size, self.body_n_layer, forget_bias=0.0) 53 | 54 | context_cost = tf.constant(0) 55 | task_cost = tf.constant(0.0, dtype=tf.float32) 56 | 57 | if not self.is_multi_task: context_output = tf.constant(0) 58 | 59 | with tf.variable_scope("shared_lstm"): 60 | body_cell_output, last_body_state = tf.nn.dynamic_rnn(cell = body_lstm_cell, dtype=tf.float32, sequence_length=self.length(x), inputs=x) 61 | 62 | if self.is_multi_task: 63 | with tf.variable_scope("context_branch"): 64 | # Select the last output that is not generated by zero vectors 65 | if self.secondary_task == "missing word": 66 | last_body_output = self.last_relevant(body_cell_output, self.length(body_cell_output)) 67 | # feed the last output to the fc layer and make prediction 68 | with tf.variable_scope("context_fc"): 69 | context_fc_out = fcLayer(x=last_body_output, in_shape=self.body_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, 
scope="fc1") 70 | with tf.variable_scope("context_pred"): 71 | context_output, context_logits = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, out_shape=y_context.get_shape()[-1].value, activation=self.context_output_activation) 72 | context_cost = compute_cost(logit=context_logits, y=y_context, out_type="last_only", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.feature_length, activation=self.context_output_activation) 73 | 74 | if self.secondary_task == "word generation": 75 | context_input = tf.transpose(body_cell_output, [1, 0, 2]) 76 | context_input = tf.reshape(context_input, [-1, self.body_lstm_size]) 77 | context_input_list = tf.split(context_input, self.max_length, 0) 78 | fc_output_list = [] 79 | with tf.variable_scope("context_fc"): 80 | for step in range(self.max_length): 81 | if step > 0: tf.get_variable_scope().reuse_variables() 82 | fc_out = fcLayer(x=context_input_list[step], in_shape=self.body_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, scope="fc1") 83 | fc_output_list.append(tf.expand_dims(fc_out, axis=1)) 84 | context_fc_out = tf.concat(fc_output_list, axis=1) 85 | with tf.variable_scope("context_pred"): 86 | context_output, context_logits = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, out_shape=y_context.get_shape()[-1].value, activation=self.context_output_activation) 87 | context_cost = compute_cost(logit=context_logits, y=y_context, out_type="sequential", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.feature_length,activation=self.context_output_activation) 88 | 89 | 90 | print "Context cost shape: ", context_cost.get_shape() 91 | 92 | with tf.variable_scope("task_branch"): 93 | with tf.variable_scope("task_fc"): 94 | # Select the last output that is not generated by zero vectors 95 | last_body_output = self.last_relevant(body_cell_output, self.length(body_cell_output)) 96 | # feed the last output to the fc layer and make prediction 97 | task_fc_out = fcLayer(x=last_body_output, in_shape=self.body_lstm_size, out_shape=self.task_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, scope="fc2") 98 | task_output, task_logits = predictionLayer(x=task_fc_out, y=y_task, in_shape=self.task_branch_fc, out_shape=y_task.get_shape()[-1].value, activation=self.task_output_activation) 99 | print "Task output shape: ", task_output.get_shape() 100 | task_cost = compute_cost(logit=task_logits, y=y_task, out_type="last_only", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.n_classes,activation=self.task_output_activation) 101 | 102 | return context_cost, task_cost, task_output, context_output 103 | 104 | # Flatten the output tensor to shape features in all examples x output size 105 | # construct an index into that by creating a tensor with the start indices for each example tf.range(0, batch_size) x max_length 106 | # and add the individual sequence lengths to it 107 | # tf.gather() then performs the acutal indexing. 
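# A minimal worked sketch of last_relevant below (illustrative values only):
#   with batch_size=2, max_length=3 and per-example lengths [2, 3],
#   index = tf.range(0, 2) * 3 + ([2, 3] - 1) = [1, 5]; after flattening the
#   output to shape (2*3, out_size), tf.gather(flat, index) returns rows 1 and 5,
#   i.e. the last non-padded timestep of each example.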
108 | def last_relevant(self, output, length): 109 | index = tf.range(0, self.batch_size) * self.max_length + (length - 1) 110 | out_size = int(output.get_shape()[2]) 111 | flat = tf.reshape(output, [-1, out_size]) 112 | relevant = tf.gather(flat, index) 113 | return relevant 114 | 115 | # Assume that the sequences are padded with 0 vectors to have shape (batch_size, max_length, feature_length) 116 | 117 | def length(self, sequence): 118 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 119 | length = tf.reduce_sum(used, reduction_indices=1) 120 | length = tf.cast(length, tf.int32) 121 | print length.get_shape() 122 | return length 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /auto/src_final/model/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | import mcrnn_model 8 | from mcrnn_model import model 9 | 10 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 11 | from tf_utils import fcLayer, createLSTMCell, applyActivation, predictionLayer 12 | from load_batch import get_file_identifiers, get_classes, load_data, get_word2vec, load_batch 13 | 14 | def get_data(data_path): 15 | data_stats = pickle.load(open(expanduser(data_path + "/rt_stats.pkl"))) 16 | max_length, nPos, nNeg, trainPercent, testPercent = data_stats["longest"], data_stats[0], data_stats[1], data_stats['trainPercent'], data_stats['testPercent'] 17 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 18 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 19 | nTest = int(testPercent*nPos) + int(testPercent*nNeg) 20 | nTrain = nPos + nNeg - nTest 21 | 22 | return max_length, nPos, nNeg, trainPercent, testPercent, word2vec_dic, missing_word_dic, nTest, nTrain 23 | 24 | 25 | def trainModel(): 26 | 27 | M = model() 28 | data_path = "~/automatedMTL/data/rotten_tomato" 29 | max_length, nPos, nNeg, trainPercent, testPercent, word2vec_dic, missing_word_dic, nTest, nTrain = get_data(data_path) 30 | 31 | x = tf.placeholder(tf.float32, shape=(None, M.max_length, M.feature_length)) 32 | y_context = tf.placeholder(tf.float32, shape=(None, M.context_dim)) 33 | y_task = tf.placeholder(tf.float32, shape=(None, M.task_dim)) 34 | 35 | optimizer1 = tf.train.AdamOptimizer(learning_rate=M.context_lr) 36 | optimizer2 = tf.train.AdamOptimizer(learning_rate=M.lr) 37 | is_train = tf.placeholder(tf.int32) 38 | n_train_batches = np.ceil(nTrain / M.batch_size).astype(int) 39 | keep_prob = tf.placeholder(tf.float32) 40 | 41 | context_cost, task_cost, task_output, context_output = M.buildModel(x, y_context, y_task, is_train, keep_prob) 42 | train_step1 = optimizer1.minimize(context_cost) 43 | train_step2 = optimizer2.minimize(task_cost) 44 | 45 | # Start running operations on the graph 46 | sess = tf.Session() 47 | sess.run(tf.initialize_all_variables()) 48 | 49 | with sess.as_default(): 50 | for epoch in range(100): 51 | taskCost = 0 52 | contextCost = 0 53 | 54 | all_classes, train_file, test_file = load_data(data_path) 55 | start_idx = 0 56 | for minibatch in range(n_train_batches): 57 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(word2vec_dic, missing_word_dic, M.feature_length, max_length, 
data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, M.batch_size) 58 | start_idx += M.batch_size 59 | 60 | feed_dict = {x: encoded_batch, y_context: batch_missing_word_encoded, y_task: batch_classes, is_train:1, keep_prob:0.5} 61 | 62 | train_step1.run(feed_dict=feed_dict) 63 | context_cost_val, _, _ = sess.run(fetches = [context_cost, task_cost, task_output], feed_dict=feed_dict) 64 | contextCost += context_cost_val 65 | 66 | train_step2.run(feed_dict=feed_dict) 67 | _, task_cost_val, _ = sess.run(fetches = [context_cost, task_cost, task_output], feed_dict=feed_dict) 68 | taskCost += task_cost_val 69 | 70 | #if minibatch !=0 and minibatch % 100 == 0: 71 | print "Minibatch ", minibatch, " Missing Word: ", contextCost , " Classification: ", taskCost 72 | contextCost = 0 73 | taskCost = 0 74 | 75 | start_idx = 0 76 | accuracy = 0 77 | 78 | for i in range(nTest): 79 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(word2vec_dic, missing_word_dic, M.feature_length, max_length, data_path+"/Test/", 0, train_file, test_file, all_classes, start_idx, 1) 80 | start_idx += 1 81 | feed_dict = {x:encoded_batch, y_context: batch_missing_word_encoded, y_task: batch_classes, is_train:0, keep_prob:0.5} 82 | task_output_val = sess.run(fetches = [task_output], feed_dict=feed_dict) 83 | accuracy += is_correct(batch_classes, task_output_val) 84 | print "The accuracy in epoch ", epoch, " is: ", accuracy * 1.0 / nTest 85 | 86 | def is_correct(target, output): 87 | prediction = np.argmax(output) 88 | target = np.argmax(target) 89 | #print prediction, target 90 | return prediction == target 91 | 92 | 93 | if __name__ == "__main__": 94 | trainModel() 95 | -------------------------------------------------------------------------------- /auto/src_final/util/1q: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_1_lstm import model 7 | from train_1_lstm import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | LR = [0.001] 11 | LR_MOD = [1.0] #4 12 | N_EPOCHS = [50] # 30 13 | N_EXPERIMENTS = [1] # 5 14 | KEEP_PROB_VAL = [1.0] 15 | CONTEXT_FC = [30] #1024 on AWS 16 | #3*3*3*30*5/60=67.5 hrs. 17 | experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. 
Learning rates: " 18 | for lr in LR: 19 | experiment = experiment + str(lr) + ", " 20 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 21 | for prob in KEEP_PROB_VAL: 22 | experiment = experiment + str(prob) + ", " 23 | experiment = experiment + " Context_fc: " 24 | for fc in CONTEXT_FC: 25 | experiment = experiment + str(fc) + ", " 26 | 27 | 28 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 29 | M= model() 30 | 31 | print M.is_multi_task 32 | if lr_mod == 0.0: 33 | M.is_multi_task = False 34 | else: 35 | M.is_multi_task = True 36 | print M.is_multi_task 37 | 38 | print M.lr 39 | M.lr = lr 40 | print M.lr 41 | 42 | print M.lr_mod 43 | M.lr_mod = lr_mod 44 | print M.lr_mod 45 | 46 | print M.n_epoch 47 | M.n_epoch = n_epoch 48 | print M.n_epoch 49 | 50 | print M.context_branch_fc 51 | M.context_branch_fc = context_fc 52 | print M.context_branch_fc 53 | 54 | maxAccList = []; 55 | for i in range(n_experiments): 56 | accuracyVec = TM(M, keep_prob_val)#INSERT CODE TO run for n epochs 57 | maxAcc = numpy.max(accuracyVec) 58 | maxAccList.append(maxAcc) 59 | expVal = numpy.mean(maxAccList) 60 | string_result = "lr = " + str(lr) + " lr_mod = "+ "self-annealing" + " avg_acc = " + str(expVal)+'\n' 61 | f1.write("") 62 | f1.write(string_result) 63 | f1.flush() 64 | print string_result 65 | 66 | 67 | 68 | f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 69 | f1.write(experiment) 70 | f1.write("\n") 71 | f1.flush() 72 | for lr in LR: 73 | for lr_mod in LR_MOD: 74 | for n_epoch in N_EPOCHS: 75 | for n_experiments in N_EXPERIMENTS: 76 | for keep_prob_val in KEEP_PROB_VAL: 77 | for context_fc in CONTEXT_FC: 78 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 79 | f1.close() 80 | -------------------------------------------------------------------------------- /auto/src_final/util/hps.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_1_lstm import model 7 | from train_1_lstm import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | LR = [0.01] 11 | LR_MOD = [1.0] #4 12 | N_EPOCHS = [30] # 30 13 | N_EXPERIMENTS = [10] # 5 14 | KEEP_PROB_VAL = [1.0] 15 | CONTEXT_FC = [30] #1024 on AWS 16 | #3*3*3*30*5/60=67.5 hrs. 17 | experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. 
Learning rates: " 18 | for lr in LR: 19 | experiment = experiment + str(lr) + ", " 20 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 21 | for prob in KEEP_PROB_VAL: 22 | experiment = experiment + str(prob) + ", " 23 | experiment = experiment + " Context_fc: " 24 | for fc in CONTEXT_FC: 25 | experiment = experiment + str(fc) + ", " 26 | 27 | epoch_ratio_list = [(0.1, 1.0), (0.5, 0.5), (1.0, 0.0)] 28 | 29 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 30 | M= model() 31 | 32 | print M.is_multi_task 33 | if lr_mod == 0.0: 34 | M.is_multi_task = False 35 | else: 36 | M.is_multi_task = True 37 | print M.is_multi_task 38 | 39 | print M.lr 40 | M.lr = lr 41 | print M.lr 42 | 43 | print M.lr_mod 44 | M.lr_mod = lr_mod 45 | print M.lr_mod 46 | 47 | print M.n_epoch 48 | M.n_epoch = n_epoch 49 | print M.n_epoch 50 | 51 | print M.context_branch_fc 52 | M.context_branch_fc = context_fc 53 | print M.context_branch_fc 54 | 55 | maxAccList = []; 56 | for i in range(n_experiments): 57 | accuracyVec = TM(M, keep_prob_val, epoch_ratio_list)#INSERT CODE TO run for n epochs 58 | maxAcc = numpy.max(accuracyVec) 59 | maxAccList.append(maxAcc) 60 | expVal = numpy.mean(maxAccList) 61 | string_result = "lr = " + str(lr) + " lr_mod = "+ "self-annealing" + " avg_acc = " + str(expVal)+'\n' 62 | f1.write("") 63 | f1.write(string_result) 64 | f1.flush() 65 | print string_result 66 | 67 | 68 | 69 | f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 70 | f1.write(experiment) 71 | f1.write("\n") 72 | f1.flush() 73 | for lr in LR: 74 | for lr_mod in LR_MOD: 75 | for n_epoch in N_EPOCHS: 76 | for n_experiments in N_EXPERIMENTS: 77 | for keep_prob_val in KEEP_PROB_VAL: 78 | for context_fc in CONTEXT_FC: 79 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 80 | f1.close() 81 | -------------------------------------------------------------------------------- /auto/src_final/util/hps2.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_gen2 import model 7 | from train_gen2 import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | 11 | LR = [0.01] 12 | LR_MOD = [1.0,0.0] #4 13 | N_EPOCHS = [30] # 30 14 | N_EXPERIMENTS = [5] # 5 15 | KEEP_PROB_VAL = [1.0] 16 | CONTEXT_FC = [128] 17 | #3*3*3*30*5/60=67.5 hrs. 18 | #experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. Learning rates: " 19 | 20 | dataset = "ag_news" # or "rotten_tomato" 21 | experiment = "N_epochs = 50. N_exp = 10. lstm: 512 for both. hidden fc: 512 for both. dropout: none." 
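# The 'experiment' string (started above and extended just below) is only a
# human-readable header written once to the log via f1.write(experiment); the
# actual sweep is driven by the LR / LR_MOD / N_EPOCHS / KEEP_PROB_VAL /
# CONTEXT_FC lists, and lr_mod == 0.0 runs the single-task baseline
# (is_multi_task = False).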
22 | 23 | for lr in LR: 24 | experiment = experiment + str(lr) + ", " 25 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 26 | for prob in KEEP_PROB_VAL: 27 | experiment = experiment + str(prob) + ", " 28 | experiment = experiment + " Context_fc: " 29 | for fc in CONTEXT_FC: 30 | experiment = experiment + str(fc) + ", " 31 | 32 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 33 | M= model() 34 | 35 | print M.dataset 36 | M.dataset = dataset 37 | print M.dataset 38 | 39 | print M.is_multi_task 40 | if lr_mod == 0.0: 41 | M.is_multi_task = False 42 | else: 43 | M.is_multi_task = True 44 | print M.is_multi_task 45 | 46 | print M.lr 47 | M.lr = lr 48 | print M.lr 49 | 50 | print M.lr_mod 51 | M.lr_mod = lr_mod 52 | print M.lr_mod 53 | 54 | print M.n_epoch 55 | M.n_epoch = n_epoch 56 | print M.n_epoch 57 | 58 | print M.context_branch_fc 59 | M.context_branch_fc = context_fc 60 | print M.context_branch_fc 61 | 62 | maxAccList = [] 63 | testResult = [] 64 | for i in range(n_experiments): 65 | accuracyVec, testAcc = TM(M)#INSERT CODE TO run for n epochs 66 | maxAcc = numpy.max(accuracyVec) 67 | maxAccList.append(maxAcc) 68 | maxIdx = numpy.argmax(accuracyVec) 69 | testResult.append(testAcc[maxIdx]) 70 | 71 | expVal = numpy.mean(maxAccList) 72 | testVal = numpy.mean(testResult) 73 | if lr_mod == 0.0: 74 | string_result = "lr = " + str(lr) + " lr_mod = "+ "none (lstm)" + " avg_val_acc = " + str(expVal) + " avg_test_acc = " + str(testVal) + '\n' 75 | else: 76 | string_result = "lr = " + str(lr) + " lr_mod = "+ "annealing" + " avg_val_acc = " + str(expVal) + " avg_test_acc = " + str(testVal) + '\n' 77 | 78 | f1.write("") 79 | f1.write(string_result) 80 | f1.flush() 81 | print string_result 82 | 83 | #f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 84 | f1 = open(expanduser("~/tweetnet/logs/hps_mrnn_ag_news.log"), "w+") 85 | f1.write(experiment) 86 | f1.write("\n") 87 | f1.flush() 88 | for lr in LR: 89 | for lr_mod in LR_MOD: 90 | for n_epoch in N_EPOCHS: 91 | for n_experiments in N_EXPERIMENTS: 92 | for keep_prob_val in KEEP_PROB_VAL: 93 | for context_fc in CONTEXT_FC: 94 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 95 | f1.close() 96 | -------------------------------------------------------------------------------- /auto/src_final/util/hps_script.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/auto/src_final/util/hps_script.sh -------------------------------------------------------------------------------- /auto/src_final/util/load_batch.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | from reformat import reformat_data 7 | from load_util import class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = 
train_file 26 | else: 27 | identifiers = test_file 28 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 29 | 30 | batch_text = [] 31 | for idx in batch_identifiers: 32 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 33 | batch_text.append(text.read()) 34 | 35 | return batch_identifiers, batch_text 36 | 37 | def load_data(data_path): 38 | 39 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 40 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 41 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 42 | return all_classes, train_file, test_file 43 | 44 | def get_word2vec(data_path): 45 | # TO DO: download word2vec! 46 | word2vec_dic = pickle.load(open(expanduser(data_path))) 47 | return word2vec_dic 48 | 49 | # Unknown symbols are UNK 50 | # Missing word symbols are zeros 51 | # EOS are EOS 52 | 53 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 54 | sequence_by_word = sequence.split(" ") 55 | encoded_seq = np.zeros((max_len, encode_dim)) 56 | for i in range(len(sequence_by_word)): 57 | word = sequence_by_word[i] 58 | if word2vec_dic.get(word) == None: 59 | encoded_seq[i, :] = word2vec_dic["UNK"] 60 | continue 61 | else: 62 | if word != "REMOVE": 63 | encoded_seq[i, :] = word2vec_dic[word] 64 | else: 65 | encoded_seq[i, :] = word2vec_dic["_"] 66 | return encoded_seq, len(sequence_by_word) 67 | 68 | def encode_sequence_generation(word2vec_dic, sequence, encode_dim, max_len): 69 | sequence_by_word = sequence.split(" ") 70 | encoded_seq = np.zeros((max_len, encode_dim)) 71 | for i in range(1, len(sequence_by_word)): 72 | word = sequence_by_word[i] 73 | if word2vec_dic.get(word) == None: 74 | encoded_seq[i-1, :] = word2vec_dic["UNK"] 75 | else: 76 | encoded_seq[i-1, :] = word2vec_dic[word] 77 | encoded_seq[len(sequence_by_word)-1, :] = word2vec_dic["EOS"] 78 | context_target = sequence_by_word[1:len(sequence_by_word)] + ["EOS"] 79 | 80 | return encoded_seq, context_target, len(sequence_by_word) 81 | 82 | def oneHot(nclasses, idx): 83 | one_hot = np.zeros((nclasses)) 84 | one_hot[idx] = 1 85 | return one_hot 86 | 87 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size, automated_task): 88 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size) 89 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 90 | batch_classes = np.zeros((batch_size, n_classes)) 91 | batch_context_encoded = np.zeros((batch_size, encode_dim)) 92 | if automated_task == "word generation": batch_context_encoded = np.zeros((batch_size, max_len, encode_dim)) 93 | batch_context = [] 94 | batch_length = [] 95 | for i in range(batch_size): 96 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 97 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 98 | if automated_task != "word generation": 99 | batch_context_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 100 | batch_context.append(missing_word_dic[batch_identifiers[i]]) 101 | else: 102 | batch_context_encoded[i, :, :], context_target, text_length = encode_sequence_generation(word2vec_dic, batch_text[i], encode_dim, max_len) 103 | batch_context.append(context_target) 104 | batch_length.append(text_length) 105 | return encoded_batch, batch_classes, 
batch_context_encoded, batch_context, batch_identifiers, batch_text, batch_length 106 | 107 | if __name__ == "__main__": 108 | data_path = "~/automatedMTL/data/rotten_tomato" 109 | max_length = reformat_data(data_path, False) 110 | class_look_up(data_path) 111 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 112 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['trainPercent'], data_stats['testPercent'] 113 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 114 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 115 | for epoch in range(3): 116 | dic = {} 117 | all_classes, train_file, test_file = load_data(data_path) 118 | start_idx = 0 119 | for minibatch in range(73): 120 | encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, missing_word_dic, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 128, automated_task="word generation") 121 | start_idx += 128 122 | print batch_context 123 | for i in batch_identifier: 124 | if dic.get(i) != None: print "Wrong" 125 | else: dic[i] = 1 126 | -------------------------------------------------------------------------------- /auto/src_final/util/load_batch2.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | from reformat import reformat_data 7 | from load_util import class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | else: 27 | identifiers = test_file 28 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 29 | 30 | batch_text = [] 31 | for idx in batch_identifiers: 32 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 33 | batch_text.append(text.read()) 34 | 35 | return batch_identifiers, batch_text 36 | 37 | def load_data(data_path): 38 | 39 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 40 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 41 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 42 | return all_classes, train_file, test_file 43 | 44 | def get_word2vec(data_path): 45 | # TO DO: download word2vec! 
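# The pickle at data_path is assumed to be a plain dict mapping each token to a
# NumPy embedding of length encode_dim (the callers in this repo pass 300); it
# should at least contain the special entries "UNK" (out-of-vocabulary fallback)
# and "_" (placeholder for the removed word) used by encode_sequence below.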
46 | word2vec_dic = pickle.load(open(expanduser(data_path))) 47 | return word2vec_dic 48 | 49 | # Unknown symbols are UNK 50 | # Missing word symbols are zeros 51 | # EOS are EOS 52 | 53 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 54 | sequence_by_word = sequence.split(" ") 55 | encoded_seq = np.zeros((max_len, encode_dim)) 56 | for i in range(len(sequence_by_word)): 57 | word = sequence_by_word[i] 58 | if word2vec_dic.get(word) == None: 59 | encoded_seq[i, :] = word2vec_dic["UNK"] 60 | else: 61 | if word != "REMOVE": 62 | encoded_seq[i, :] = word2vec_dic[word] 63 | else: 64 | encoded_seq[i, :] = word2vec_dic["_"] 65 | return encoded_seq, len(sequence_by_word) 66 | 67 | def oneHot(nclasses, idx): 68 | one_hot = np.zeros((nclasses)) 69 | one_hot[idx] = 1 70 | return one_hot 71 | 72 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 73 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size) 74 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 75 | batch_classes = np.zeros((batch_size, 2)) 76 | batch_missing_word_encoded = np.zeros((batch_size, encode_dim)) 77 | batch_missing_word = [] 78 | batch_length = [] 79 | for i in range(batch_size): 80 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 81 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 82 | batch_missing_word_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 83 | batch_missing_word.append(missing_word_dic[batch_identifiers[i]]) 84 | batch_length.append(text_length) 85 | return encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifiers, batch_text, batch_length 86 | 87 | if __name__ == "__main__": 88 | data_path = "~/automatedMTL/data/rotten_tomato" 89 | max_length = reformat_data(data_path, False) 90 | class_look_up(data_path) 91 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 92 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['trainPercent'], data_stats['testPercent'] 93 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 94 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 95 | for epoch in range(3): 96 | dic = {} 97 | all_classes, train_file, test_file = load_data(data_path) 98 | start_idx = 0 99 | for minibatch in range(73): 100 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, missing_word_dic, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 128) 101 | start_idx += 128 102 | #print batch_text 103 | #print batch_missing_word 104 | #print batch_length 105 | for i in batch_identifier: 106 | if dic.get(i) != None: print "Wrong" 107 | else: dic[i] = 1 108 | -------------------------------------------------------------------------------- /auto/src_final/util/load_batch_val.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | #from reformat import reformat_data 7 | from load_util import 
class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, is_val, train_file, test_file, val_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | elif is_val: 27 | identifiers = val_file 28 | else: 29 | identifiers = test_file 30 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 31 | 32 | batch_text = [] 33 | for idx in batch_identifiers: 34 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 35 | batch_text.append(text.read()) 36 | 37 | return batch_identifiers, batch_text 38 | 39 | def load_data(data_path): 40 | 41 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 42 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 43 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 44 | val_file = get_file_identifiers(data_path + "/validation_classes.txt") 45 | return all_classes, train_file, test_file, val_file 46 | 47 | def get_word2vec(data_path): 48 | # TO DO: download word2vec! 49 | word2vec_dic = pickle.load(open(expanduser(data_path))) 50 | return word2vec_dic 51 | 52 | # Unknown symbols are UNK 53 | # Missing word symbols are zeros 54 | # EOS are EOS 55 | 56 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 57 | sequence_by_word = sequence.split(" ") 58 | encoded_seq = np.zeros((max_len, encode_dim)) 59 | for i in range(len(sequence_by_word)): 60 | word = sequence_by_word[i] 61 | if word2vec_dic.get(word) == None: 62 | encoded_seq[i, :] = word2vec_dic["UNK"] 63 | else: 64 | if word != "REMOVE": 65 | encoded_seq[i, :] = word2vec_dic[word] 66 | else: 67 | encoded_seq[i, :] = word2vec_dic["_"] 68 | return encoded_seq, len(sequence_by_word) 69 | 70 | def encode_sequence_generation(word2vec_dic, sequence, encode_dim, max_len): 71 | sequence_by_word = sequence.split(" ") 72 | encoded_seq = np.zeros((max_len, encode_dim)) 73 | for i in range(1, len(sequence_by_word)): 74 | word = sequence_by_word[i] 75 | if word2vec_dic.get(word) == None: 76 | encoded_seq[i-1, :] = word2vec_dic["UNK"] 77 | else: 78 | encoded_seq[i-1, :] = word2vec_dic[word] 79 | encoded_seq[len(sequence_by_word)-1, :] = word2vec_dic["EOS"] 80 | context_target = sequence_by_word[1:len(sequence_by_word)] + ["EOS"] 81 | 82 | return encoded_seq, context_target, len(sequence_by_word) 83 | 84 | def oneHot(nclasses, idx): 85 | one_hot = np.zeros((nclasses)) 86 | one_hot[idx-1] = 1 87 | return one_hot 88 | 89 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, is_val, train_file, test_file, val_file,all_classes, start_idx, batch_size, automated_task): 90 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train,is_val, train_file, test_file, val_file, all_classes, start_idx, batch_size) 91 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 92 | batch_classes = np.zeros((batch_size, n_classes)) 93 | batch_context_encoded = np.zeros((batch_size, encode_dim)) 94 | if automated_task == "word generation": batch_context_encoded = np.zeros((batch_size, max_len, encode_dim)) 95 | batch_context = [] 96 
| batch_length = [] 97 | for i in range(batch_size): 98 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 99 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 100 | if automated_task != "word generation": 101 | batch_context_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 102 | batch_context.append(missing_word_dic[batch_identifiers[i]]) 103 | else: 104 | batch_context_encoded[i, :, :], context_target, text_length = encode_sequence_generation(word2vec_dic, batch_text[i], encode_dim, max_len) 105 | batch_context.append(context_target) 106 | batch_length.append(text_length) 107 | return encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifiers, batch_text, batch_length 108 | 109 | if __name__ == "__main__": 110 | data_path = "~/tweetnet/automatedMTL/data/ag_news_csv" 111 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 112 | n_classes, n_data, n_data_per_class, n_train_data, n_test_data, max_length = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['n_train_data'], data_stats['n_test_data'], data_stats['max_length'] 113 | print n_classes, n_data, n_data_per_class 114 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 115 | for epoch in range(3): 116 | dic = {} 117 | all_classes, train_file, test_file, val_file = load_data(data_path) 118 | start_idx = 0 119 | for minibatch in range(3): 120 | encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, {}, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 1, automated_task="word generation") 121 | start_idx += 1 122 | print batch_text 123 | print batch_classes 124 | print batch_context 125 | print encoded_batch.shape 126 | print batch_context_encoded.shape 127 | for i in batch_identifier: 128 | if dic.get(i) != None: print "Wrong" 129 | else: dic[i] = 1 130 | -------------------------------------------------------------------------------- /auto/src_final/util/load_util.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import os 3 | from os.path import expanduser 4 | from os.path import basename 5 | 6 | def class_look_up(data_path): 7 | out_train = open(expanduser(data_path+"/train_classes.txt"), "w") 8 | out_test = open(expanduser(data_path+"/test_classes.txt"), "w") 9 | train_folders = os.listdir(expanduser(data_path+"/Train/")) 10 | test_folders = os.listdir(expanduser(data_path+"/Test/")) 11 | 12 | dict = {} 13 | cnt = 0 14 | file2class_dict = {} 15 | 16 | for i in train_folders: 17 | if i[0] != '.': 18 | if dict.get(i) == None: 19 | dict[i] = cnt 20 | cnt += 1 21 | files = os.listdir(expanduser(data_path+"/Train/"+i)) 22 | for f in files: 23 | if f[0] == ".": continue 24 | out_train.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 25 | out_train.write("\n") 26 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 27 | 28 | for i in test_folders: 29 | if i[0] != '.': 30 | files = os.listdir(expanduser(data_path+"/Test/"+i)) 31 | for f in files: 32 | if f[0] == ".": continue 33 | out_test.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 34 | out_test.write("\n") 35 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 36 | out_train.close() 37 | out_test.close() 38 | pickle.dump(file2class_dict, 
open(expanduser(data_path+"/classes.pkl"), "w")) 39 | #print file2class_dict 40 | #print len(file2class_dict) 41 | 42 | if __name__ == "__main__": 43 | class_look_up("~/automatedMTL/data/rotten_tomato") 44 | -------------------------------------------------------------------------------- /auto/src_final/util/reformat.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | from os.path import expanduser 5 | from path import Path 6 | import random 7 | from stop_words import get_stop_words 8 | 9 | # The dataset has 5331 positive and 5331 negative reviews 10 | # According to prev work, split inot 90% training (4998) and 10% testing (533) 11 | 12 | word2vec_dic = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"))) 13 | stop_words = get_stop_words('english') 14 | length = [] 15 | missing_word_dic = {} 16 | 17 | def get_dataset(dataset_path): 18 | data_stats = pickle.load(open(expanduser(dataset_path + "/stats.pkl"))) 19 | all_example = {} 20 | all_class_folders = os.listdir(expanduser(dataset_path+"/all_data/")) 21 | for class_folder in all_class_folders: 22 | if class_folder[0] != ".": 23 | all_example[class_folder] = open(expanduser(dataset_path+'/all_data/' + class_folder)) 24 | return all_example, data_stats 25 | 26 | 27 | def replace_missing_word(data_by_word): 28 | 29 | new_data = [] 30 | for i in range(len(data_by_word)): 31 | word = data_by_word[i] 32 | if word in stop_words and word2vec_dic.get(word) == None: 33 | continue 34 | else: 35 | new_data.append(word) 36 | 37 | idx = range(0, len(new_data)) 38 | random.shuffle(idx) 39 | removed = "" 40 | 41 | if len(new_data) == 1 and word2vec_dic.get(new_data[0]) != None: 42 | return new_data + ["-"], new_data[-1] 43 | elif len(new_data) == 1 and word2vec_dic.get(new_data[0]) == None: 44 | return [], "" 45 | 46 | valid = False 47 | for i in idx: 48 | word = new_data[i] 49 | if word not in stop_words and word2vec_dic.get(word)!= None: 50 | removed = new_data[i] 51 | data_by_word[i] = "REMOVE" 52 | valid = True 53 | break 54 | if not valid: 55 | print data_by_word 56 | return [], "" 57 | return data_by_word, removed 58 | 59 | 60 | def process_data(data, is_missing_word): 61 | 62 | d = list(data) 63 | for i in range(len(d)): 64 | if ord(d[i]) > ord('z') or ord(d[i]) < ord('a') and d[i] != "'": 65 | d[i] = " " 66 | string = "".join(d) 67 | 68 | if not is_missing_word: 69 | string = " ".join(string.split()) 70 | string = string + " " + "EOS" 71 | length.append(len(string.split())) 72 | return string, "_" 73 | 74 | string, removed = replace_missing_word(string.split()) 75 | if string == []: return [], "" 76 | string = " ".join(string) 77 | string = string + " " + "EOS" 78 | length.append(len(string.split())) 79 | return string, removed 80 | 81 | def reformat_data(dataset_path, is_missing_word): 82 | 83 | # Clean up the directory in train and test folder 84 | d_train, d_test = Path(expanduser(dataset_path+"/Train")), Path(expanduser(dataset_path+"/Test")) 85 | train_files, test_files = d_train.walk("*.txt"), d_test.walk("*.txt") 86 | for f in train_files: 87 | f.remove() 88 | for f in test_files: 89 | f.remove() 90 | 91 | all_example, data_stats = get_dataset(dataset_path) 92 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'], data_stats['trainPercent'], data_stats['testPercent'] 93 | all_idx = range(0,n_data) 94 | random.shuffle(all_idx) 95 | 
test_idx = all_idx[0:int(testPercent*n_data)] 96 | identifier = 0 97 | 98 | for one_class in all_example.keys(): 99 | for p in all_example[one_class].readlines(): 100 | if p == "\n": 101 | continue 102 | else: 103 | if identifier in test_idx: 104 | file = open(expanduser(dataset_path + "/Test/" + one_class[0:len(one_class)-4] + "/" + str(identifier) + ".txt"), "w") 105 | else: 106 | file = open(expanduser(dataset_path + "/Train/" + one_class[0:len(one_class)-4] + "/" + str(identifier) + ".txt"), "w") 107 | 108 | string, removed = process_data(p, is_missing_word) 109 | if string != []: 110 | file.write(string) 111 | missing_word_dic[identifier] = removed 112 | identifier += 1 113 | file.close() 114 | else: print p 115 | pickle.dump(missing_word_dic, open(expanduser(dataset_path + "/missing_word_dic.pkl"),"w")) 116 | return sorted(length)[-1] 117 | 118 | if __name__ == "__main__": 119 | print reformat_data("~/automatedMTL/data/rotten_tomato", False) 120 | -------------------------------------------------------------------------------- /auto/src_final/util/tf_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createGRUCell(batch_size, lstm_size): 21 | gru_cell = tf.contrib.rnn.GRUCell(num_units=lstm_size, activation=tf.tanh) 22 | state=gru_cell.zero_state(batch_size, tf.float32) 23 | 24 | return gru_cell, state 25 | 26 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 27 | 28 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 29 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 30 | state = lstm_cell.zero_state(batch_size, tf.float32) 31 | 32 | return lstm_cell, state 33 | 34 | def applyActivation(x, activation): 35 | 36 | if activation == "tanh": 37 | return tf.nn.tanh(x) 38 | elif activation == "relu": 39 | return tf.nn.relu(x) 40 | elif activation == "sigmoid": 41 | return tf.nn.sigmoid(x) 42 | elif activation == "relu6": 43 | return tf.nn.relu6(x) 44 | elif activation == "softmax": 45 | return tf.nn.softmax(x) 46 | else: return None 47 | 48 | def length(sequence): 49 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 50 | length = tf.reduce_sum(used, reduction_indices=1) 51 | length = tf.cast(length, tf.int32) 52 | return length 53 | 54 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 55 | 56 | x = tf.reshape(x, [-1, in_shape]) 57 | 58 | with tf.variable_scope(scope): 59 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) 60 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 61 | logits = tf.add(tf.matmul(x, w), b) 62 | output = applyActivation(logits, 
activation) 63 | return output, logits 64 | 65 | def compute_cost(logit, y, out_type, max_length, batch_size, embed_dim, activation): 66 | if out_type=="last_only": 67 | cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit) 68 | cost = tf.reduce_mean(cost, reduction_indices=1) 69 | else: 70 | pred_out = applyActivation(logit, activation) 71 | pred_out = tf.reshape(pred_out, [batch_size, max_length, embed_dim]) 72 | mse = tf.reduce_mean(tf.square(tf.subtract(y, pred_out)), reduction_indices=2) 73 | mask = tf.sign(tf.reduce_max(tf.abs(y), reduction_indices=2)) 74 | mse *= mask 75 | mse = tf.reduce_sum(mse, reduction_indices=1) 76 | mse /= tf.cast(length(y), tf.float32) 77 | cost = mse 78 | cost = tf.reduce_mean(cost, reduction_indices=0) 79 | print "final cost shape: ", cost.get_shape() 80 | return cost 81 | -------------------------------------------------------------------------------- /auto/src_final/util/tf_utils_old.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | else: return None 39 | 40 | 41 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 42 | 43 | x = tf.reshape(x, [-1, in_shape]) 44 | 45 | with tf.variable_scope(scope): 46 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 47 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 48 | logits = tf.add(tf.matmul(x, w), b) 49 | output = applyActivation(logits, activation) 50 | # Compute the mean-squared-error 51 | cost = tf.reduce_mean(tf.square(tf.subtract(y , output))) 52 | 53 | return cost, output 54 | -------------------------------------------------------------------------------- /auto/src_final/util/tf_utils_reg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, reg_const, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = 
[in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2), regularizer=tf.contrib.layers.l2_regularizer(reg_const)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | elif activation == "softmax": 39 | return tf.nn.softmax(x) 40 | else: return None 41 | 42 | def length(sequence): 43 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 44 | length = tf.reduce_sum(used, reduction_indices=1) 45 | length = tf.cast(length, tf.int32) 46 | return length 47 | 48 | def predictionLayer(x, y, in_shape, out_shape, activation, reg_const, scope="prediction"): 49 | 50 | x = tf.reshape(x, [-1, in_shape]) 51 | 52 | with tf.variable_scope(scope): 53 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2), regularizer=tf.contrib.layers.l2_regularizer(reg_const)) 54 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 55 | logits = tf.add(tf.matmul(x, w), b) 56 | output = applyActivation(logits, activation) 57 | return output, logits 58 | 59 | def compute_cost(logit, y, out_type, max_length, batch_size, embed_dim, activation): 60 | if out_type=="last_only": 61 | cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit) 62 | cost = tf.reduce_mean(cost, reduction_indices=1) 63 | else: 64 | pred_out = applyActivation(logit, activation) 65 | pred_out = tf.reshape(pred_out, [batch_size, max_length, embed_dim]) 66 | mse = tf.reduce_mean(tf.square(tf.subtract(y, pred_out)), reduction_indices=2) 67 | mask = tf.sign(tf.reduce_max(tf.abs(y), reduction_indices=2)) 68 | mse *= mask 69 | mse = tf.reduce_sum(mse, reduction_indices=1) 70 | mse /= tf.cast(length(y), tf.float32) 71 | cost = mse 72 | cost = tf.reduce_mean(cost, reduction_indices=0) 73 | print "final cost shape: ", cost.get_shape() 74 | return cost 75 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/data/.gitkeep -------------------------------------------------------------------------------- /data/helpX: -------------------------------------------------------------------------------- 1 | cnumpy.core.multiarray 2 | _reconstruct 3 | p1 4 | (cnumpy 5 | ndarray 6 | p2 7 | (I0 8 | tS'b' 9 | tRp3 10 | (I1 11 | (I14216263 12 | I10 13 | I64 14 | tcnumpy 15 | dtype 16 | p4 17 | (S'f8' 18 | I0 19 | I1 20 | 
tRp5 21 | (I3 22 | S'<' 23 | NNNI-1 24 | I-1 25 | I0 26 | tbI00 27 | -------------------------------------------------------------------------------- /data/test_Category.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/data/test_Category.json.gz -------------------------------------------------------------------------------- /data/test_Helpful.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/data/test_Helpful.json.gz -------------------------------------------------------------------------------- /data/train.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/data/train.json.gz -------------------------------------------------------------------------------- /logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/logs/.gitkeep -------------------------------------------------------------------------------- /scripts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/scripts/.gitkeep -------------------------------------------------------------------------------- /scripts/killZk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # alternatively, you can just exit your terminal. 
3 | ~/zookeeper-3.4.6/bin/zkServer.sh stop 4 | -------------------------------------------------------------------------------- /scripts/newTerminalMac.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo ' 4 | on run argv 5 | if length of argv is equal to 0 6 | set command to "" 7 | else 8 | set command to item 1 of argv 9 | end if 10 | 11 | if length of argv is greater than 1 12 | set profile to item 2 of argv 13 | runWithProfile(command, profile) 14 | else 15 | runSimple(command) 16 | end if 17 | end run 18 | 19 | on runSimple(command) 20 | tell application "Terminal" 21 | activate 22 | set newTab to do script(command) 23 | end tell 24 | return newTab 25 | end runSimple 26 | 27 | on runWithProfile(command, profile) 28 | set newTab to runSimple(command) 29 | tell application "Terminal" to set current settings of newTab to (first settings set whose name is profile) 30 | end runWithProfile 31 | ' | osascript - "$@" > /dev/null -------------------------------------------------------------------------------- /scripts/startKafkaServer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "starting kafka client" 3 | 4 | KAFKAHOME="$HOME/kafka-0.10.1.1-src" 5 | 6 | # Added the sudo due to file permissions being messed up 7 | sudo $KAFKAHOME/bin/kafka-server-start.sh $KAFKAHOME/config/server.properties 8 | -------------------------------------------------------------------------------- /scripts/startNimbus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo ~/apache-storm-1.0.3/bin/storm nimbus 3 | -------------------------------------------------------------------------------- /scripts/startStormUI.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo ~/apache-storm-1.0.3/bin/storm ui 3 | -------------------------------------------------------------------------------- /scripts/startSupervisor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo ~/apache-storm-1.0.3/bin/storm supervisor 3 | -------------------------------------------------------------------------------- /scripts/startZK.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Starting Zookeeper" 3 | ~/zookeeper-3.4.6/bin/zkServer.sh start 4 | -------------------------------------------------------------------------------- /scripts/startZKClient.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "starting zookeeper client" 3 | ~/zookeeper-3.4.6/bin/zkCli.sh 4 | -------------------------------------------------------------------------------- /scripts/systemStartMac.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ~/tweetnet/scripts/startZK.sh 3 | sleep 5 4 | ~/tweetnet/scripts/newTerminalMac.sh ~/tweetnet/scripts/startZKClient.sh 5 | sleep 5 6 | ~/tweetnet/scripts/newTerminalMac.sh ~/tweetnet/scripts/startNimbus.sh 7 | sleep 5 8 | ~/tweetnet/scripts/newTerminalMac.sh ~/tweetnet/scripts/startSupervisor.sh 9 | sleep 5 10 | ~/tweetnet/scripts/newTerminalMac.sh ~/tweetnet/scripts/startStormUI.sh 11 | sleep 6 12 | ~/tweetnet/scripts/newTerminalMac.sh ~/tweetnet/scripts/startKafkaServer.sh 13 | -------------------------------------------------------------------------------- 
/scripts/systemStartUbuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo ~/tweetnet/scripts/startZK.sh 3 | sleep 5 4 | gnome-terminal -e ~/tweetnet/scripts/startZKClient.sh 5 | sleep 5 6 | gnome-terminal -e ~/tweetnet/scripts/startNimbus.sh 7 | sleep 5 8 | gnome-terminal -e ~/tweetnet/scripts/startSupervisor.sh 9 | sleep 5 10 | gnome-terminal -e ~/tweetnet/scripts/startStormUI.sh 11 | sleep 5 12 | gnome-terminal -e ~/tweetnet/scripts/startKafkaServer.sh 13 | -------------------------------------------------------------------------------- /scripts/userRunAPI.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Starting up TweetFeeder." 3 | 4 | STORMPATH="$HOME/apache-storm-0.9.5/lib/*" 5 | TWITTERPATH="$HOME/twitter4j-4.0.4/lib/*" 6 | CLASSPATH="$HOME/tweetnet/src/storm/" 7 | 8 | javac -cp $STORMPATH:$TWITTERPATH ~/tweetnet/src/storm/TwitterStreamSpout.java ~/tweetnet/src/storm/TwitterCleanerBolt.java ~/tweetnet/src/storm/TwitterStorm.java 9 | 10 | java -cp $STORMPATH:$TWITTERPATH:$CLASSPATH TwitterStorm #append_twitter_credentials_here 11 | -------------------------------------------------------------------------------- /src/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/src/.gitkeep -------------------------------------------------------------------------------- /src/models/c2c_cooccurence.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | from numpy import random 4 | from random import shuffle 5 | from os.path import expanduser 6 | import time 7 | import sys 8 | import os 9 | 10 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 11 | 12 | from logger import logger 13 | 14 | def create_coocc_dict(trainHt): 15 | coocc_dict = {} 16 | for htstr in trainHt: 17 | ht = htstr.split(" ") 18 | if coocc_dict.get(ht[0]) == None: 19 | coocc_dict[ht[0]] = {ht[1]:1} 20 | else: 21 | if coocc_dict[ht[0]].get(ht[1]) == None: 22 | coocc_dict[ht[0]][ht[1]] = 1 23 | else: 24 | coocc_dict[ht[0]][ht[1]] += 1 25 | return coocc_dict 26 | 27 | def predict(testHt, coocc_dict): 28 | correct = 0 29 | name = "cooc" + time.strftime("%Y-%m-%d_%H:%M") + ".log" 30 | log = [] 31 | 32 | for htstr in testHt: 33 | ht = htstr.split(" ") 34 | if coocc_dict.get(ht[0]) == None: 35 | continue 36 | dic = coocc_dict[ht[0]] 37 | dic_key = dic.keys() 38 | dic_val = dic.values() 39 | idx = np.argsort(dic_val) 40 | prediction = [] 41 | for i in range(topN): 42 | if i < len(dic_val): 43 | prediction.append(dic_key[idx[i]]) 44 | isCorrect = False 45 | if ht[1] in prediction: 46 | correct += 1 47 | isCorrect = True 48 | 49 | log.append([ht[0],ht[1],isCorrect,prediction]) 50 | 51 | accuracy=correct*1.0/len(testHt) 52 | log.append([correct,accuracy]) 53 | 54 | logger(log,name) 55 | 56 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/englishHashtag.pkl"), "rb")) 57 | hashtagFreq = pickle.load(open(expanduser("~/tweetnet/data/hashtagFreq.pkl"), "rb")) 58 | 59 | idx_shuf = range(len(hashtags)) 60 | shuffle(idx_shuf) 61 | freqThreshold = 84 62 | hashtagFreqCnt = {} 63 | hashtags_shuf = [] 64 | 65 | for i in idx_shuf: 66 | ht = hashtags[i].split(" ") 67 | if hashtagFreq[ht[2]] >= freqThreshold: 68 | if hashtagFreqCnt.get(ht[2]) == None: 69 | 70 | 
hashtagFreqCnt[ht[2]] = 1 71 | hashtags_shuf.append(ht[1] + " " + ht[2]) 72 | 73 | elif hashtagFreqCnt[ht[2]] < freqThreshold: 74 | 75 | hashtagFreqCnt[ht[2]] += 1 76 | hashtags_shuf.append(ht[1] + " " + ht[2]) 77 | 78 | hashtags = hashtags_shuf 79 | 80 | trainPercent = 0.95 81 | nTrainData = np.round(len(hashtags)*trainPercent).astype(int) 82 | topN = 4 83 | trainHt = hashtags[0:nTrainData] 84 | testHt = hashtags[nTrainData:] 85 | coocc_dict = create_coocc_dict(trainHt) 86 | predict(testHt, coocc_dict) 87 | 88 | 89 | -------------------------------------------------------------------------------- /src/models/c2c_cooccurenceNonUniform.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | from os.path import expanduser 4 | import time 5 | import sys 6 | import os 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 9 | 10 | from logger import logger 11 | 12 | def create_coocc_dict(trainHt): 13 | coocc_dict = {} 14 | for htstr in trainHt: 15 | ht = htstr.split(" ") 16 | if coocc_dict.get(ht[1]) == None: 17 | coocc_dict[ht[1]] = {ht[2]:1} 18 | else: 19 | if coocc_dict[ht[1]].get(ht[2]) == None: 20 | coocc_dict[ht[1]][ht[2]] = 1 21 | else: 22 | coocc_dict[ht[1]][ht[2]] += 1 23 | return coocc_dict 24 | 25 | def predict(testHt, coocc_dict): 26 | correct = 0 27 | name = "cooc" + time.strftime("%Y-%m-%d_%H:%M") + ".log" 28 | log = [] 29 | 30 | for htstr in testHt: 31 | ht = htstr.split(" ") 32 | if coocc_dict.get(ht[1]) == None: 33 | continue 34 | dic = coocc_dict[ht[1]] 35 | dic_key = dic.keys() 36 | dic_val = dic.values() 37 | idx = np.argsort(dic_val) 38 | prediction = [] 39 | for i in range(topN): 40 | if i < len(dic_val): 41 | prediction.append(dic_key[idx[i]]) 42 | isCorrect = False 43 | if ht[2] in prediction: 44 | correct += 1 45 | isCorrect = True 46 | 47 | log.append([ht[1],ht[2],isCorrect,prediction]) 48 | 49 | accuracy=correct*1.0/len(testHt) 50 | log.append([correct,accuracy]) 51 | 52 | logger(log,name) 53 | 54 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/englishHashtag.pkl"), "rb")) 55 | trainPercent = 0.99 56 | nTrainData = np.round(len(hashtags)*trainPercent).astype(int) 57 | topN = 10 58 | trainHt = hashtags[0:nTrainData] 59 | testHt = hashtags[nTrainData + 1 :] 60 | coocc_dict = create_coocc_dict(trainHt) 61 | predict(testHt, coocc_dict) 62 | 63 | 64 | -------------------------------------------------------------------------------- /src/models/cascKeras.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Input, Dense, Embedding, LSTM, merge 2 | from keras.models import Model 3 | 4 | # this returns a tensor 5 | contextX, contexty, taskX, taskY = loadData(); 6 | 7 | text_input = Input(shape=(100,), dtype='float32', name='text_input') 8 | 9 | lstm_body = lstm(32)(text_input) 10 | 11 | lstm_context = lstm(32)(lstm_body) 12 | fc_context = Dense(256)(lstm_context) 13 | out_context = Dense(300)(fc_context) 14 | 15 | lstm_task = lstm(32)(lstm_body) 16 | fc_task = Dense(256)(lstm_task) 17 | fc_out = Dense(300)(fc_task) -------------------------------------------------------------------------------- /src/models/cascade.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/src/models/cascade.py 
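cascKeras.py above is a sketch rather than runnable code: lstm should be the LSTM layer class, the shared LSTM needs return_sequences=True before further LSTMs can consume its output, loadData is never defined or imported, the Input shape is 2-D where an LSTM expects (timesteps, features), and no Model is ever assembled or compiled (mtlKeras.py further below repeats the same snippet). A self-contained version of the same shared-body, two-head idea, using hypothetical shapes and random arrays in place of the real loader, could look like this (Keras 1.x functional API, in keeping with the rest of the repo):

import numpy as np
from keras.layers import Input, Dense, LSTM
from keras.models import Model

max_len, embed_dim, n_samples = 30, 300, 256              # placeholder sizes, not the repo's
text_x = np.random.rand(n_samples, max_len, embed_dim)    # stands in for loadData()
context_y = np.random.rand(n_samples, embed_dim)
task_y = np.random.rand(n_samples, embed_dim)

text_input = Input(shape=(max_len, embed_dim), name='text_input')

# shared body; return the full sequence so each head can run its own LSTM on it
lstm_body = LSTM(32, return_sequences=True)(text_input)

# secondary (context) head
lstm_context = LSTM(32)(lstm_body)
out_context = Dense(embed_dim, name='context_out')(Dense(256, activation='relu')(lstm_context))

# primary task head
lstm_task = LSTM(32)(lstm_body)
out_task = Dense(embed_dim, name='task_out')(Dense(256, activation='relu')(lstm_task))

model = Model(text_input, [out_context, out_task])
model.compile(optimizer='rmsprop', loss='mse')
model.fit(text_x, [context_y, task_y], nb_epoch=1, batch_size=128)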
-------------------------------------------------------------------------------- /src/models/contextToContext.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import os 3 | import sys 4 | import numpy 5 | from numpy import shape 6 | from numpy import random 7 | from random import shuffle 8 | import cPickle as pickle 9 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 10 | import time 11 | from keras.utils import np_utils 12 | from keras.models import Sequential 13 | from keras.layers import Dense 14 | from keras.layers import Activation 15 | from keras.optimizers import SGD 16 | from keras.optimizers import RMSprop 17 | from keras.layers import BatchNormalization 18 | import keras.callbacks 19 | from logger import logger 20 | from predContext import predContext, createHtDict 21 | from keras.layers import PReLU 22 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/englishHashtag.pkl"),"rb")) 23 | dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb")) 24 | hashtagFreq = pickle.load(open(expanduser("~/tweetnet/data/hashtagFreq.pkl"), "rb")) 25 | 26 | idx_shuf = range(len(hashtags)) 27 | #shuffle(idx_shuf) 28 | freqThreshold = 84 29 | hashtags_shuf = [] 30 | context_shuf = [] 31 | hashtagFreqCnt = {} 32 | 33 | for i in idx_shuf: 34 | ht = hashtags[i].split(" ") 35 | if hashtagFreq[ht[2]] >= freqThreshold: 36 | if hashtagFreqCnt.get(ht[2]) == None: 37 | 38 | hashtagFreqCnt[ht[2]] = 1 39 | hashtags_shuf.append(ht[2]) 40 | context_shuf.append(ht[1]) 41 | 42 | elif hashtagFreqCnt[ht[2]] < freqThreshold: 43 | 44 | hashtagFreqCnt[ht[2]] += 1 45 | hashtags_shuf.append(ht[2]) 46 | context_shuf.append(ht[1]) 47 | 48 | data = numpy.zeros([len(hashtags_shuf),300]) 49 | label = numpy.zeros([len(hashtags_shuf),300]) 50 | inputStringLabel = [] 51 | outputStringLabel = [] 52 | for i in range(len(hashtags_shuf)): 53 | data[i,:]=dictionary[context_shuf[i]] 54 | label[i,:]=dictionary[hashtags_shuf[i]] 55 | inputStringLabel.append(context_shuf[i]) 56 | outputStringLabel.append(hashtags_shuf[i]) 57 | 58 | htDic = createHtDict(dictionary, outputStringLabel) 59 | 60 | # Train and Test split 61 | trainPercent = 0.9 62 | nTrainData = numpy.round(len(data)*trainPercent).astype(int) 63 | topN = 4 64 | nEpoch = 5000 65 | logAllPredictions = True 66 | trainData = data[0 : nTrainData] 67 | testData = data[nTrainData :] 68 | testInputStringLabel = inputStringLabel[nTrainData:] 69 | print testData.shape 70 | trainLabel = label[0 : nTrainData] 71 | testOutputStringLabel = outputStringLabel[nTrainData:] 72 | 73 | 74 | model = Sequential() 75 | 76 | model.add(Dense(512, input_shape=(300,))) 77 | model.add(PReLU()) 78 | model.add(BatchNormalization()) 79 | 80 | model.add(Dense(512)) 81 | model.add(PReLU()) 82 | model.add(BatchNormalization()) 83 | 84 | model.add(Dense(300)) 85 | model.add(PReLU()) 86 | 87 | optimizer = RMSprop(lr=0.005) 88 | model.compile(loss='mse', optimizer=optimizer) 89 | 90 | model.summary() 91 | 92 | name = "c2c" + time.strftime("%Y-%m-%d_%H:%M") + ".log" 93 | for epoch in range(nEpoch): 94 | model.fit(trainData, trainLabel, nb_epoch=1, batch_size=128, validation_split=0.1) 95 | 96 | correctCnt = 0 97 | randIdx = numpy.random.randint(0, len(testData), 10) 98 | log = [] 99 | log.append([epoch]) 100 | for testIdx in range(len(testData)): 101 | modelOutput = model.predict(numpy.expand_dims(testData[testIdx, :], axis=0)) 102 | topNht, isCorrect, topNdist = 
predContext(htDic, modelOutput, topN, testOutputStringLabel[testIdx]) 103 | if isCorrect: 104 | correctCnt += 1.0 105 | if logAllPredictions: 106 | #verbose logging 107 | log.append([testInputStringLabel[testIdx],testOutputStringLabel[testIdx],isCorrect,topNht]) 108 | 109 | accuracy = correctCnt*1.0 / len(testData) 110 | #always log accuracy 111 | log.append([correctCnt, accuracy]) 112 | logger(log,name) 113 | 114 | -------------------------------------------------------------------------------- /src/models/contextToContextNonUniform.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import os 3 | import sys 4 | import numpy 5 | from numpy import shape 6 | import cPickle as pickle 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 8 | import time 9 | from keras.utils import np_utils 10 | from keras.models import Sequential 11 | from keras.layers import Dense 12 | from keras.layers import Activation 13 | from keras.optimizers import SGD 14 | from keras.optimizers import RMSprop 15 | from keras.layers import BatchNormalization 16 | import keras.callbacks 17 | from logger import logger 18 | from predContext import predContext, createHtDict 19 | from keras.layers import PReLU 20 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/englishHashtag.pkl"),"rb")) 21 | dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb")) 22 | 23 | data = numpy.zeros([len(hashtags),300]) 24 | label = numpy.zeros([len(hashtags),300]) 25 | inputStringLabel = [] 26 | outputStringLabel = [] 27 | for i in range(len(hashtags)): 28 | line = hashtags[i] 29 | listHashtag = line.split() 30 | data[i,:]=dictionary[listHashtag[1]] 31 | label[i,:]=dictionary[listHashtag[2]] 32 | inputStringLabel.append(listHashtag[1]) 33 | outputStringLabel.append(listHashtag[2]) 34 | 35 | htDic = createHtDict(dictionary, outputStringLabel) 36 | 37 | # Train and Test split 38 | trainPercent = 0.99 39 | nTrainData = numpy.round(len(data)*trainPercent).astype(int) 40 | topN = 10 41 | nEpoch = 5000 42 | logAllPredictions = True 43 | trainData = data[0 : nTrainData] 44 | testData = data[nTrainData + 1 :] 45 | testInputStringLabel = inputStringLabel[nTrainData + 1 :] 46 | print testData.shape 47 | trainLabel = label[0 : nTrainData] 48 | testOutputStringLabel = outputStringLabel[nTrainData + 1 :] 49 | 50 | 51 | model = Sequential() 52 | 53 | model.add(Dense(512, input_shape=(300,))) 54 | model.add(PReLU()) 55 | model.add(BatchNormalization()) 56 | 57 | model.add(Dense(512)) 58 | model.add(PReLU()) 59 | model.add(BatchNormalization()) 60 | 61 | model.add(Dense(300)) 62 | model.add(PReLU()) 63 | 64 | optimizer = RMSprop(lr=0.005) 65 | model.compile(loss='mse', optimizer=optimizer) 66 | 67 | name = "c2c" + time.strftime("%Y-%m-%d_%H:%M") + ".log" 68 | for epoch in range(nEpoch): 69 | model.fit(trainData, trainLabel, nb_epoch=1, batch_size=128, validation_split=0.1) 70 | 71 | correctCnt = 0 72 | randIdx = numpy.random.randint(0, len(testData), 10) 73 | log = [] 74 | log.append([epoch]) 75 | for testIdx in range(len(testData)): 76 | modelOutput = model.predict(numpy.expand_dims(testData[testIdx, :], axis=0)) 77 | topNht, isCorrect, topNdist = predContext(htDic, modelOutput, topN, testOutputStringLabel[testIdx]) 78 | if isCorrect: 79 | correctCnt += 1.0 80 | if logAllPredictions: 81 | #verbose logging 82 | log.append([testInputStringLabel[testIdx],testOutputStringLabel[testIdx],isCorrect,topNht]) 83 | 84 | 
accuracy = correctCnt*1.0 / len(testData) 85 | #always log accuracy 86 | log.append([correctCnt, accuracy]) 87 | logger(log,name) 88 | 89 | -------------------------------------------------------------------------------- /src/models/mtlKeras.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Input, Dense, Embedding, LSTM, merge 2 | from keras.models import Model 3 | 4 | # this returns a tensor 5 | contextX, contexty, taskX, taskY = loadData(); 6 | 7 | text_input = Input(shape=(100,), dtype='float32', name='text_input') 8 | 9 | lstm_body = lstm(32)(text_input) 10 | 11 | lstm_context = lstm(32)(lstm_body) 12 | fc_context = Dense(256)(lstm_context) 13 | out_context = Dense(300)(fc_context) 14 | 15 | lstm_task = lstm(32)(lstm_body) 16 | fc_task = Dense(256)(lstm_task) 17 | fc_out = Dense(300)(fc_task) 18 | 19 | 20 | -------------------------------------------------------------------------------- /src/models/tc2c.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Keras backend. LSTM Model. 3 | 4 | Using standard default range: 5 | Input: (65x1) 64 unique chars, 1 EOS char 6 | Output: (65x1) 64 unique chars, 1 EOS char 7 | 8 | ''' 9 | import cPickle as pickle 10 | import numpy as np 11 | import h5py 12 | import os 13 | import sys 14 | from os.path import expanduser 15 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 16 | from loadDataNewModel import loadData 17 | from predContext import predContext, createHtDict 18 | from keras.utils import np_utils 19 | from keras.models import Sequential 20 | from keras.layers import LSTM 21 | from keras.layers import Dense 22 | from keras.layers import PReLU 23 | from keras.layers import Activation 24 | from keras.layers.wrappers import TimeDistributed 25 | from keras.optimizers import RMSprop 26 | from keras.optimizers import Adagrad 27 | from keras.layers import Dropout 28 | from keras.layers import BatchNormalization 29 | from tweetGenerator_lstm import generateText 30 | from keras.callbacks import ModelCheckpoint 31 | from logger import logger 32 | import time 33 | 34 | #sequenceLength: sequence length (k in BPTTk) 35 | sequenceLength = 30 36 | #Number of symbols 37 | vocabLen = 66 38 | #train test split 39 | trainPercent = 0.9 40 | #threshold on hashtag frequency 41 | freqThreshold = 84 42 | 43 | logAllPredictions=True 44 | #X: [# Seuqences, 40 (sequenceLength), 65(inputSize)]. 
45 | #y: [# Sequences, 300] 46 | 47 | print("Start loading data ...") 48 | trainTweets, trainHashtags, testTweets, testHashtags, trainX, trainY, testX, testY, trainTweetSequence, trainHashtagSequence, testTweetSequence, testHashtagSequence, trainContextSequence, testContextSequence, dictionary, nUniqueHt = loadData({},np.array([]), sequenceLength, trainPercent, freqThreshold) 49 | print("Finished loading data") 50 | 51 | 52 | #initialize some hyper-parameters 53 | topN = np.ceil(0.05*nUniqueHt).astype(int) 54 | print topN 55 | 56 | 57 | #embeddingLength: size of the word embedding 58 | embeddingLength = 300 59 | 60 | #inputSize: size of each input vector (default: 365x1) 61 | inputSize = vocabLen + embeddingLength 62 | 63 | #numHiddenFirst: size of first hidden layer 64 | numHiddenFirst = 512 65 | 66 | #Number of testing/training tweets 67 | nTestData = len(testTweets) 68 | nTrainData = len(trainTweets) 69 | nTestSequences = len(testTweetSequence) 70 | nTrainSequences = len(trainTweetSequence) 71 | print "Number of testing sequences: ", nTestSequences 72 | print "Number of training sequences: ", nTrainSequences 73 | print "Number of testing tweets: ", nTestData 74 | print "Number of training tweets: ", nTrainData 75 | 76 | dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"))) 77 | 78 | # Create the hashtag dictionary 79 | htDic = createHtDict(dictionary, testHashtags) 80 | 81 | numEpochs=50 82 | 83 | #building cLSTM model 84 | #print("\n") 85 | print("Start building model ....") 86 | model = Sequential() 87 | 88 | #model.add(TimeDistributed(Dense(numHiddenFirst), input_shape=(sequenceLength, inputSize))) 89 | #model.add(BatchNormalization()) 90 | 91 | model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize))) 92 | model.add(BatchNormalization()) 93 | 94 | model.add(Dense(numHiddenFirst)) 95 | model.add(PReLU()) 96 | model.add(BatchNormalization()) 97 | 98 | model.add(Dense(embeddingLength)) 99 | model.add(PReLU()) 100 | 101 | optimizer = RMSprop(lr=0.005) 102 | 103 | model.compile(loss='mean_squared_error', optimizer=optimizer) 104 | print("Finished building model.") 105 | 106 | model.summary() 107 | 108 | name = "t2c"+time.strftime("%Y-%m-%d_%H:%M") + ".log" 109 | for epoch in range(numEpochs): 110 | 111 | model.fit(trainX, trainY, nb_epoch=1, batch_size=128) 112 | 113 | correctCnt = 0 114 | randIdx = np.random.randint(0, nTestData, 10) 115 | 116 | tweetCnt = 0 117 | tweetStartIdx = 0 118 | log = [] 119 | log.append([epoch]) 120 | for testIdx in range(nTestSequences): 121 | # Stack the windows (1 x 40 x 65) of each tweet as a 3D matrix (#windows x 40 x 65) 122 | if testTweetSequence[testIdx][-1] == chr(3): 123 | oneTweet = testX[tweetStartIdx:testIdx+1, :, :] 124 | modelOutput = model.predict(oneTweet) 125 | topNht, isCorrect, topNdist = predContext(htDic, modelOutput, topN, testHashtags[tweetCnt]) 126 | tweetStartIdx = testIdx + 1 127 | if isCorrect: 128 | correctCnt += 1 129 | isCorrect = True 130 | if tweetCnt in randIdx: 131 | print testTweets[tweetCnt][:-2] 132 | print "Given label is ", testContextSequence[testIdx] 133 | print "True label is ", testHashtags[tweetCnt] 134 | print "Top ", topN, " hashtags are ", topNht 135 | 136 | if logAllPredictions: 137 | log.append([testTweets[tweetCnt][:-2],testHashtags[tweetCnt],isCorrect,topNht]) 138 | tweetCnt += 1 139 | accuracy = correctCnt*1.0 / nTestData 140 | log.append([correctCnt, accuracy]) 141 | logger(log,name) 142 | 143 | 144 | 145 | 146 | 
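createHtDict and predContext are imported from the shared utils package and are not shown in this file. Judging from the call sites (a dictionary of candidate hashtag embeddings, the model's 300-d prediction, a top-N cutoff, and the true hashtag), they perform a nearest-neighbour lookup in word2vec space. A minimal sketch of that lookup, assuming Euclidean distance and mean-pooling over per-window predictions (both assumptions, not necessarily the repo's exact choices):

import numpy as np

def create_ht_dict(word2vec_dic, hashtags):
    # keep one embedding per unique candidate hashtag that has a word2vec entry
    ht_dic = {}
    for ht in set(hashtags):
        if ht in word2vec_dic:
            ht_dic[ht] = word2vec_dic[ht]
    return ht_dic

def pred_context(ht_dic, model_output, top_n, true_ht):
    # collapse (possibly several) per-window predictions into one 300-d query vector
    query = np.asarray(model_output).reshape(-1, 300).mean(axis=0)
    names = list(ht_dic.keys())
    vecs = np.vstack([ht_dic[n] for n in names])
    dists = np.linalg.norm(vecs - query, axis=1)   # Euclidean distance to every candidate
    order = np.argsort(dists)[:top_n]              # smallest distance first
    top_ht = [names[i] for i in order]
    return top_ht, true_ht in top_ht, dists[order]

Cosine similarity over L2-normalised vectors would be a drop-in alternative ranking.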
-------------------------------------------------------------------------------- /src/models/textToContext.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Keras backend. LSTM Model. 3 | 4 | Using standard default range: 5 | Input: (65x1) 64 unique chars, 1 EOS char 6 | Output: (65x1) 64 unique chars, 1 EOS char 7 | 8 | ''' 9 | import cPickle as pickle 10 | import numpy as np 11 | import h5py 12 | import os 13 | import sys 14 | from os.path import expanduser 15 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 16 | from loadDataT2C import loadData 17 | from predContext import predContext, createHtDict 18 | from keras.utils import np_utils 19 | from keras.models import Sequential 20 | from keras.layers import LSTM 21 | from keras.layers import Dense 22 | from keras.layers import PReLU 23 | from keras.layers import Activation 24 | from keras.layers.wrappers import Bidirectional 25 | from keras.optimizers import RMSprop 26 | from keras.optimizers import Adadelta 27 | from keras.optimizers import Adam 28 | from keras.layers import Dropout 29 | from keras.layers import BatchNormalization 30 | from tweetGenerator_lstm import generateText 31 | from keras.callbacks import ModelCheckpoint 32 | from keras.regularizers import l2, activity_l2 33 | 34 | from logger import logger 35 | import time 36 | #get the top N prediction of hashtags 37 | topN = 4 38 | #sequenceLength: sequence length (k in BPTTk) 39 | sequenceLength = 40 40 | #Number of symbols 41 | vocabLen = 66 42 | #train test split 43 | trainPercent = 0.9 44 | #freqThreshold for hashtags 45 | freqThreshold = 84 46 | logAllPredictions=True 47 | #X: [# Seuqences, 40 (sequenceLength), 65(inputSize)]. 48 | #y: [# Sequences, 300] 49 | 50 | print("Start loading data ...") 51 | trainTweets, trainHashtags, testTweets, testHashtags, trainX, trainY, testX, testY, trainTweetSequence, trainHashtagSequence, testTweetSequence, testHashtagSequence, dictionary = loadData({},np.array([]), sequenceLength, trainPercent, freqThreshold) 52 | print("Finished loading data") 53 | 54 | 55 | #initialize some hyper-parameters 56 | #inputSize: size of each input vector (default: 365x1) 57 | inputSize = vocabLen 58 | 59 | #outputSize: size of the word embedding 60 | outputSize = 300 61 | 62 | #numHiddenFirst: size of first hidden layer 63 | numHiddenFirst = 512 64 | 65 | #Number of testing/training tweets 66 | nTestData = len(testTweets) 67 | nTrainData = len(trainTweets) 68 | nTestSequences = len(testTweetSequence) 69 | nTrainSequences = len(trainTweetSequence) 70 | print "Number of testing sequences: ", nTestSequences 71 | print "Number of training sequences: ", nTrainSequences 72 | print "Number of testing tweets: ", nTestData 73 | print "Number of training tweets: ", nTrainData 74 | 75 | #for i in range(1000): 76 | # print (trainTweetSequence[i], trainHashtagSequence[i]) 77 | # Load word2vec dictionary 78 | dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"))) 79 | 80 | # Create the hashtag dictionary 81 | htDic = createHtDict(dictionary, testHashtags) 82 | 83 | numEpochs=50 84 | 85 | lamb = 0.0001 86 | #building cLSTM model 87 | #print("\n") 88 | print("Start building model ....") 89 | model = Sequential() 90 | 91 | #model.add(LSTM(numHiddenFirst, return_sequences=True, input_shape=(sequenceLength, inputSize))) 92 | 93 | model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize))) 94 | 95 | #model.add(BatchNormalization()) 96 | 97 | 
model.add(Dense(numHiddenFirst)) 98 | model.add(Activation('relu')) 99 | #model.add(PReLU()) 100 | #model.add(BatchNormalization()) 101 | 102 | model.add(Dense(outputSize)) 103 | model.add(Activation('tanh')) 104 | #model.add(PReLU()) 105 | #model.add(BatchNormalization()) 106 | 107 | #optimizer = RMSprop(lr=0.005) 108 | optimizer = Adam(lr=0.0001) 109 | model.compile(loss='mean_squared_error', optimizer=optimizer) 110 | print("Finished building model.") 111 | 112 | model.summary() 113 | 114 | name = "t2c"+time.strftime("%Y-%m-%d_%H:%M") + ".log" 115 | for epoch in range(numEpochs): 116 | 117 | model.fit(trainX, trainY, nb_epoch=1, batch_size=128) 118 | 119 | correctCnt = 0 120 | randIdx = np.random.randint(0, nTestData, 10) 121 | 122 | tweetCnt = 0 123 | tweetStartIdx = 0 124 | log = [] 125 | log.append([epoch]) 126 | for testIdx in range(nTestSequences): 127 | # Stack the windows (1 x 40 x 65) of each tweet as a 3D matrix (#windows x 40 x 65) 128 | if testTweetSequence[testIdx][-1] == chr(3): 129 | oneTweet = testX[tweetStartIdx:testIdx+1, :, :] 130 | modelOutput = model.predict(oneTweet) 131 | topNht, isCorrect, topNdist = predContext(htDic, modelOutput, topN, testHashtags[tweetCnt]) 132 | tweetStartIdx = testIdx + 1 133 | if isCorrect: 134 | correctCnt += 1 135 | isCorrect = True 136 | if tweetCnt in randIdx: 137 | print testTweets[tweetCnt][:-2] 138 | print "True label is ", testHashtags[tweetCnt] 139 | print "Top ", topN, " hashtags are ", topNht 140 | 141 | if logAllPredictions: 142 | log.append([testTweets[tweetCnt][:-2],testHashtags[tweetCnt],isCorrect,topNht]) 143 | tweetCnt += 1 144 | accuracy = correctCnt*1.0 / nTestData 145 | log.append([correctCnt, accuracy]) 146 | logger(log,name) 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /src/models/tweetnet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Keras backend. cLSTM Model. 3 | 4 | Using standard default range: 5 | Input: (365x1) 64 unique chars, 1 EOS char, 300 word2vec context 6 | Output: (65x1) 64 unique chars, 1 EOS char 7 | 8 | ''' 9 | import pickle as pkl 10 | import numpy as np 11 | from loadData import loadData 12 | from keras.utils import np_utils 13 | from keras.models import Sequential 14 | from keras.layers import LSTM 15 | from keras.layers import Dense 16 | from keras.layers import Activation 17 | from keras.optimizers import RMSprop 18 | from keras.optimizers import Adagrad 19 | from keras.layers import Dropout 20 | from keras.layers import BatchNormalization 21 | from tweetGenerator import generateText 22 | from keras.callbacks import ModelCheckpoint 23 | from os.path import expanduser 24 | print("Start loading data ...") 25 | data, dictLen, tweetLen, dictionary = loadData({},np.array([])) 26 | # data shape = #tweets x 141 x inputSize(365) 27 | print("Finished loading data") 28 | 29 | loadWeights=False 30 | 31 | #initialize some hyper-parameters 32 | #inputSize: size of each input vector (default: 365x1) 33 | inputSize = data.shape[2] 34 | #sequenceLength: sequence length (k in BPTTk) 35 | sequenceLength = 50 36 | #numHiddenFirst: size of first hidden layer 37 | numHiddenFirst = 512 38 | #numTweets: total number of tweets in dataset 39 | numTweets = data.shape[0] 40 | #seqPerSegment: sequences (of size sequenceLength) per mini-epoch. 41 | #Lowers maximum memory usage. 42 | seqPerSegment = 5000 43 | 44 | X = [] 45 | y = [] 46 | 47 | #create input and target datasets from loaded data. 
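# Sliding-window construction: each tweet i contributes one example per offset j,
# with the 50 (sequenceLength) consecutive 365-d vectors data[i, j:j+sequenceLength, :]
# as input and the one-hot character portion (first dictLen entries) of the vector at
# position j+sequenceLength as target, so X grows to [#windows, sequenceLength, 365]
# and y to [#windows, dictLen].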
48 | for i in range(numTweets): 49 | for j in range(0, int(tweetLen[i])-sequenceLength, 1): 50 | seq_in = data[i, j:j+sequenceLength, :] 51 | seq_out = data[i, j+sequenceLength, 0:dictLen] 52 | X.append(seq_in) 53 | y.append(seq_out) 54 | 55 | #X: [10000 (numTweets), 40 (sequenceLength), 365(inputSize)]. 56 | n_examples = len(X) 57 | numSegments = np.ceil(n_examples/seqPerSegment).astype(int) 58 | numEpochs=50 59 | #print('# of sequences per segments: ', seqPerSegment) 60 | #print('# of segments: ', numSegments) 61 | 62 | #building cLSTM model 63 | #print("\n") 64 | print("Start building model ....") 65 | model = Sequential() 66 | 67 | model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize), return_sequences=True)) 68 | model.add(LSTM(numHiddenFirst)) 69 | 70 | model.add(Dense(numHiddenFirst)) 71 | model.add(Activation('relu')) 72 | model.add(BatchNormalization()) 73 | 74 | model.add(Dense(numHiddenFirst)) 75 | model.add(Activation('relu')) 76 | model.add(BatchNormalization()) 77 | 78 | model.add(Dense(dictLen)) 79 | model.add(Activation('softmax')) 80 | 81 | optimizer = Adagrad() 82 | 83 | if(loadWeights==True): 84 | model.load_weights(expanduser("~/tweetnet/logs/intermediateWeights.hdf5")) 85 | 86 | 87 | model.compile(loss='categorical_crossentropy', optimizer=optimizer) 88 | print("Finished building model.") 89 | #define file checkpoint 90 | filePath = expanduser("~/tweetnet/logs/intermediateWeights.hdf5") 91 | checkPoint = ModelCheckpoint(filePath, monitor='loss', verbose=1) 92 | callbacksList = [checkPoint] 93 | 94 | #train on mini-epochs (sized seqPerSegment) to lower total RAM usage. 95 | for epoch in range(numEpochs): 96 | for seg in range(numSegments): 97 | print("\n") 98 | print "Segment: ", seg, "/", numSegments, " | Epoch: ", epoch, "/", numEpochs 99 | dataX = np.asarray(X[seg*seqPerSegment: (seg+1)*seqPerSegment]) 100 | datay = np.asarray(y[seg*seqPerSegment: (seg+1)*seqPerSegment]) 101 | #print("Input shape: ", dataX.shape) 102 | #print("Output shape: ", datay.shape) 103 | model.fit(dataX, datay, nb_epoch=1, batch_size=128, callbacks=callbacksList) 104 | 105 | generateText(dictionary, data, dictLen, tweetLen, X, y, 106 | inputSize, sequenceLength, numHiddenFirst, numTweets, seqPerSegment, 107 | n_examples, numSegments) 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /src/models/tweetnet_lstm.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Keras backend. LSTM Model. 
3 | 4 | Using standard default range: 5 | Input: (65x1) 64 unique chars, 1 EOS char 6 | Output: (65x1) 64 unique chars, 1 EOS char 7 | 8 | ''' 9 | import pickle as pkl 10 | import numpy as np 11 | import h5py 12 | from loadData_lstm import loadData 13 | from keras.utils import np_utils 14 | from keras.models import Sequential 15 | from keras.layers import LSTM 16 | from keras.layers import Dense 17 | from keras.layers import Activation 18 | from keras.optimizers import RMSprop 19 | from keras.optimizers import Adagrad 20 | from keras.layers import Dropout 21 | from keras.layers import BatchNormalization 22 | from tweetGenerator_lstm import generateText 23 | from keras.callbacks import ModelCheckpoint 24 | from os.path import expanduser 25 | 26 | 27 | #sequenceLength: sequence length (k in BPTTk) 28 | sequenceLength = 40 29 | 30 | # number of tweets to use 31 | nTweet = 120000 32 | print("Start loading data ...") 33 | X, y, vocabLen, dictionary, tweetSequence, nextChar, tweets = loadData({},np.array([]), sequenceLength, nTweet) 34 | print("Finished loading data") 35 | 36 | loadWeights=False 37 | 38 | #initialize some hyper-parameters 39 | #inputSize: size of each input vector (default: 365x1) 40 | inputSize = vocabLen 41 | print vocabLen 42 | #numHiddenFirst: size of first hidden layer 43 | numHiddenFirst = 128 44 | #seqPerSegment: sequences (of size sequenceLength) per mini-epoch. 45 | #Lowers maximum memory usage. 46 | seqPerSegment = 10000 47 | 48 | #X: [10000 (numTweets), 40 (sequenceLength), 65(inputSize)]. 49 | n_examples = len(X) 50 | numSegments = np.ceil(n_examples/seqPerSegment).astype(int) 51 | numEpochs=50 52 | print('# of sequences per segments: ', seqPerSegment) 53 | print('# of segments: ', numSegments) 54 | 55 | #building cLSTM model 56 | #print("\n") 57 | print("Start building model ....") 58 | model = Sequential() 59 | 60 | model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize))) 61 | #model.add(LSTM(numHiddenFirst)) 62 | 63 | #model.add(Dense(numHiddenFirst)) 64 | #model.add(Activation('relu')) 65 | #model.add(BatchNormalization()) 66 | 67 | #model.add(Dense(numHiddenFirst)) 68 | #model.add(Activation('relu')) 69 | #model.add(BatchNormalization()) 70 | 71 | model.add(Dense(vocabLen)) 72 | model.add(Activation('softmax')) 73 | 74 | optimizer = RMSprop(lr=0.01) 75 | 76 | if(loadWeights==True): 77 | model.load_weights(expanduser("~/tweetnet/logs/intermediateWeights.hdf5")) 78 | 79 | 80 | model.compile(loss='categorical_crossentropy', optimizer=optimizer) 81 | print("Finished building model.") 82 | #define file checkpoint 83 | #filePath = expanduser("~/tweetnet/logs/intermediateWeights.hdf5") 84 | #checkPoint = ModelCheckpoint(filePath, monitor='loss', verbose=1) 85 | #callbacksList = [checkPoint] 86 | 87 | #train on mini-epochs (sized seqPerSegment) to lower total RAM usage. 
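# Caveat: this file targets Python 2 (bare print statements), so n_examples/seqPerSegment
# above is integer division and np.ceil(...) cannot round it up; any final partial segment
# (n_examples % seqPerSegment examples) is silently skipped by the loop below.
# np.ceil(float(n_examples) / seqPerSegment).astype(int) would cover it.
# tweetnet.py computes numSegments the same way.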
88 | for epoch in range(numEpochs): 89 | # model.fit(X, y, nb_epoch=1, batch_size=128) 90 | # generateText(model, tweets, sequenceLength, vocabLen, dictionary) 91 | for seg in range(numSegments): 92 | print("\n") 93 | print "Segment: ", seg+1, "/", numSegments, " | Epoch: ", epoch, "/", numEpochs 94 | model.fit(X[seg*seqPerSegment: (seg+1)*seqPerSegment], y[seg*seqPerSegment: (seg+1)*seqPerSegment], nb_epoch=1, batch_size=128) 95 | generateText(model, tweets, sequenceLength, vocabLen, dictionary) 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /src/storm/TwitterStorm.java: -------------------------------------------------------------------------------- 1 | 2 | import java.util.*; 3 | import org.apache.storm.tuple.Fields; 4 | import org.apache.storm.tuple.Values; 5 | import org.apache.storm.Config; 6 | import org.apache.storm.LocalCluster; 7 | import org.apache.storm.topology.TopologyBuilder; 8 | import org.apache.storm.kafka.bolt.KafkaBolt; 9 | import org.apache.storm.kafka.bolt.selector.DefaultTopicSelector; 10 | import org.apache.storm.kafka.bolt.mapper.*; 11 | /** 12 | * Main class for storm topology. 13 | */ 14 | 15 | 16 | public class TwitterStorm { 17 | 18 | /** 19 | * The main method extracts user arguments (in runAPI.sh), and constructs 20 | * the topology. Optional Kill Command can be added at the end. 21 | * 22 | * @param args[] array of size 5. Last argument are 'keyword' arguments 23 | */ 24 | public static void main(String[] args) throws Exception{ 25 | 26 | //grab authentication tokens 27 | String consumerKey = args[0]; 28 | String consumerSecret = args[1]; 29 | String accessToken = args[2]; 30 | String accessTokenSecret = args[3]; 31 | 32 | //grab keyword tokens 33 | String[] arguments = args.clone(); 34 | String[] keyWords = Arrays.copyOfRange(arguments, 4, arguments.length); 35 | 36 | //create a new Storm configuration. 37 | Config config = new Config(); 38 | config.setDebug(true); 39 | 40 | //create a new topology. 41 | TopologyBuilder builder = new TopologyBuilder(); 42 | 43 | TwitterStreamSpout streamSpout = new TwitterStreamSpout( 44 | consumerKey,consumerSecret, accessToken, accessTokenSecret, keyWords); 45 | 46 | // streamSpout.scheme = new SchemeAsMultiScheme(new KafkaBoltKeyValueScheme()); 47 | builder.setSpout("streamSpout", streamSpout); 48 | 49 | TwitterCleanerBolt cleanerBolt = new TwitterCleanerBolt(); 50 | 51 | builder.setBolt("cleanerBolt", cleanerBolt).shuffleGrouping("streamSpout"); 52 | 53 | Properties props = new Properties(); 54 | props.put("bootstrap.servers", "localhost:9092"); 55 | props.put("acks", "1"); 56 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 57 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 58 | 59 | KafkaBolt kafkaBolt = new KafkaBolt() 60 | .withProducerProperties(props) 61 | .withTopicSelector(new DefaultTopicSelector("twitterstorm")) 62 | .withTupleToKafkaMapper(new FieldNameBasedTupleToKafkaMapper()); 63 | 64 | builder.setBolt("forwardToKafka", kafkaBolt).shuffleGrouping("cleanerBolt"); 65 | 66 | //submit topology to local cluster. 67 | LocalCluster cluster = new LocalCluster(); 68 | cluster.submitTopology("TwitterHashtagStorm", config, 69 | builder.createTopology()); 70 | //no kill condition. Run until manual kill command. 
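// The class comment above leaves room for an optional kill command; for a bounded run,
// something along the lines of
//     Thread.sleep(runtimeMillis);             // runtimeMillis chosen by the user
//     cluster.killTopology("TwitterHashtagStorm");
//     cluster.shutdown();
// would stop the local cluster cleanly instead of running until a manual kill.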
71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/storm/TwitterStreamSpout.java: -------------------------------------------------------------------------------- 1 | 2 | import java.util.Map; 3 | import java.util.concurrent.LinkedBlockingQueue; 4 | 5 | import twitter4j.FilterQuery; 6 | import twitter4j.StallWarning; 7 | import twitter4j.Status; 8 | import twitter4j.StatusDeletionNotice; 9 | import twitter4j.StatusListener; 10 | 11 | import twitter4j.TwitterStream; 12 | import twitter4j.TwitterStreamFactory; 13 | import twitter4j.auth.AccessToken; 14 | import twitter4j.conf.ConfigurationBuilder; 15 | 16 | import org.apache.storm.Config; 17 | import org.apache.storm.spout.SpoutOutputCollector; 18 | 19 | import org.apache.storm.task.TopologyContext; 20 | import org.apache.storm.topology.OutputFieldsDeclarer; 21 | import org.apache.storm.topology.base.BaseRichSpout; 22 | import org.apache.storm.tuple.Fields; 23 | import org.apache.storm.tuple.Values; 24 | 25 | import org.apache.storm.utils.Utils; 26 | 27 | /** 28 | * this class talks directly the the twitterAPI using the user credentials 29 | * in runAPI.sh. The data from this spout feeds into the parser and cleaner 30 | * bolts. 31 | */ 32 | @SuppressWarnings("serial") 33 | public class TwitterStreamSpout extends BaseRichSpout { 34 | SpoutOutputCollector _collector; 35 | LinkedBlockingQueue queue = null; 36 | TwitterStream _twitterStream; 37 | 38 | String consumerKey; 39 | String consumerSecret; 40 | String accessToken; 41 | String accessTokenSecret; 42 | String[] keyWords; 43 | 44 | /** 45 | * Constructor. 46 | * @param consumerKey Twitter API credential 47 | * @param consumerSecret Twitter API credential 48 | * @param accessToken Twitter API credential 49 | * @param accessTokenSecret Twitter API credential 50 | * @param keyWords array of words to filter for 51 | */ 52 | public TwitterStreamSpout(String consumerKey, String consumerSecret, 53 | String accessToken, String accessTokenSecret, String[] keyWords) { 54 | 55 | this.consumerKey = consumerKey; 56 | this.consumerSecret = consumerSecret; 57 | this.accessToken = accessToken; 58 | this.accessTokenSecret = accessTokenSecret; 59 | this.keyWords = keyWords; 60 | } 61 | 62 | /** 63 | * TO DO: default constructor is a stub. 64 | */ 65 | public TwitterStreamSpout() { 66 | // TODO Auto-generated constructor stub 67 | } 68 | 69 | /** 70 | * creates a new status blockingQueue and a statusListener. 
71 | * @param conf Storm configuration 72 | * @param context Storm context 73 | * @param collector Storm spout collector 74 | */ 75 | @Override 76 | public void open(Map conf, TopologyContext context, 77 | SpoutOutputCollector collector) { 78 | 79 | queue = new LinkedBlockingQueue(1000); 80 | _collector = collector; 81 | 82 | StatusListener listener = new StatusListener() { 83 | 84 | @Override 85 | public void onStatus(Status status) { 86 | queue.offer(status); 87 | } 88 | 89 | @Override 90 | public void onDeletionNotice(StatusDeletionNotice sdn) {} 91 | 92 | @Override 93 | public void onTrackLimitationNotice(int i) {} 94 | 95 | @Override 96 | public void onScrubGeo(long l, long l1) {} 97 | 98 | @Override 99 | public void onException(Exception ex) {} 100 | 101 | @Override 102 | public void onStallWarning(StallWarning arg0) { 103 | // TODO Auto-generated method stub 104 | } 105 | }; 106 | 107 | ConfigurationBuilder cb = new ConfigurationBuilder(); 108 | 109 | cb.setDebugEnabled(true) 110 | .setOAuthConsumerKey(consumerKey) 111 | .setOAuthConsumerSecret(consumerSecret) 112 | .setOAuthAccessToken(accessToken) 113 | .setOAuthAccessTokenSecret(accessTokenSecret); 114 | 115 | _twitterStream = new TwitterStreamFactory(cb.build()).getInstance(); 116 | _twitterStream.addListener(listener); 117 | 118 | if (keyWords.length == 0) { 119 | _twitterStream.sample(); 120 | }else { 121 | FilterQuery query = new FilterQuery().track(keyWords); 122 | _twitterStream.filter(query); 123 | } 124 | } 125 | 126 | /** 127 | * polls from the blocking queue to get next status. 128 | */ 129 | @Override 130 | public void nextTuple() { 131 | Status ret = queue.poll(); 132 | 133 | if (ret == null) { 134 | Utils.sleep(50); 135 | } else { 136 | _collector.emit(new Values(ret)); 137 | } 138 | } 139 | 140 | /** 141 | * closes twitter stream. 142 | */ 143 | @Override 144 | public void close() { 145 | _twitterStream.shutdown(); 146 | } 147 | 148 | /** 149 | * worker node configurator. Default set to 1 local machine. 150 | */ 151 | @Override 152 | public Map getComponentConfiguration() { 153 | Config ret = new Config(); 154 | ret.setMaxTaskParallelism(1); 155 | return ret; 156 | } 157 | 158 | @Override 159 | public void ack(Object id) {} 160 | 161 | @Override 162 | public void fail(Object id) {} 163 | 164 | /** 165 | * Declare output field type. 
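 * The spout emits a single field named "tweet"; TwitterStorm wires it to the
 * cleaner bolt via shuffleGrouping("streamSpout").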
166 | */ 167 | @Override 168 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 169 | declarer.declare(new Fields("tweet")); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/storm/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | tweetnet 5 | twitter_storm 6 | 1.0-SNAPSHOT 7 | maven-plugin 8 | twitter_storm 9 | https://github.com/davisliang/tweetnet.git 10 | 11 | UTF-8 12 | 13 | 14 | 15 | central 16 | Central Repository 17 | http://repo.maven.apache.org/maven2 18 | default 19 | 20 | false 21 | 22 | 23 | 24 | github-releases 25 | http://oss.sonatype.org/content/repositories/github-releases/ 26 | 27 | 28 | clojars.org 29 | http://clojars.org/repo 30 | 31 | 32 | twitter4j 33 | http://twitter4j.org/maven2 34 | 35 | 36 | 37 | 38 | org.apache.storm 39 | storm-kafka-client 40 | 1.0.3 41 | 42 | 43 | org.apache.storm 44 | storm-kafka 45 | 1.0.3 46 | 47 | 48 | org.apache.kafka 49 | kafka-clients 50 | 0.10.0.0 51 | 52 | 53 | org.slf4j 54 | slf4j-log4j12 55 | 56 | 57 | log4j 58 | log4j 59 | 60 | 61 | 62 | 63 | org.apache.kafka 64 | kafka_2.10 65 | 0.10.2.0 66 | 67 | 68 | org.slf4j 69 | slf4j-log4j12 70 | 71 | 72 | log4j 73 | log4j 74 | 75 | 76 | 77 | 78 | org.twitter4j 79 | twitter4j-core 80 | 4.0.4 81 | 82 | 83 | org.twitter4j 84 | twitter4j-stream 85 | 4.0.4 86 | 87 | 88 | org.apache.storm 89 | storm-core 90 | 1.0.3 91 | 92 | 93 | 94 | 95 | . 96 | 97 | 98 | ${basedir}/multilang 99 | 100 | 101 | 102 | 103 | org.apache.maven.plugins 104 | maven-plugin-plugin 105 | 3.2 106 | 107 | 108 | true 109 | 110 | 111 | 112 | mojo-descriptor 113 | 114 | descriptor 115 | 116 | 117 | 118 | 119 | 120 | org.codehaus.mojo 121 | exec-maven-plugin 122 | 1.2.1 123 | 124 | TwitterStorm 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 138 | 139 | 140 | 141 | 142 | maven-clean-plugin 143 | 3.0.0 144 | 145 | 146 | 147 | . 148 | 149 | *.class 150 | *.log 151 | 152 | 153 | * 154 | 155 | false 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /src/utils/ReducedAsciiDictionary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReducedAsciiDictionary: 4 | """ Creates a dictionary that maps characters to an integer 5 | 6 | This is useful because not all characters are used and this 7 | method allows for low-dimensional character vectors. 8 | 9 | """ 10 | 11 | def __init__(self, dictionary, ranges): 12 | """ Initialize the dictionary based on a set of ranges 13 | 14 | The ranges parameter is a 2D numpy array, doubly inclusive. 15 | Recall that the function ord() gets the int value of a char. 16 | Recall that the function chr() gets the char value of an int. 17 | 18 | TO-DO: Add range array validity checker. 19 | 20 | """ 21 | 22 | self.dictionary = dictionary 23 | 24 | # default constructor runs if size is 0. 
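# The default ranges [[32,63],[96,127]] cover 64 characters in total:
# ASCII 32-63 (space, digits, most punctuation) and 96-127 (backtick,
# lowercase letters, braces and tilde), matching the 64-character
# vocabulary assumed by the models in this repo.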
25 | if(ranges.size == 0): 26 | self.ranges = np.array([[32,63],[96,127]]) 27 | else: 28 | self.ranges = ranges 29 | 30 | #build dictionary 31 | counter = 0 32 | numRanges = self.ranges.shape[0] 33 | for i in range(0,numRanges): 34 | start = self.ranges[i][0] 35 | end = self.ranges[i][1] 36 | for j in range(start,end+1): 37 | self.dictionary[chr(j)] = counter 38 | counter += 1 39 | 40 | -------------------------------------------------------------------------------- /src/utils/checkTrainTestDup.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | from os.path import expanduser 5 | 6 | test_data = pickle.load(open(expanduser("~/tweetnet/data/test_data.pkl"))) 7 | train_data = pickle.load(open(expanduser("~/tweetnet/data/train_data.pkl"))) 8 | 9 | testX = test_data[0] 10 | trainX = train_data[0] 11 | 12 | idx = len(test_data)*np.random.rand(2000) 13 | 14 | n = 0 15 | cnt = 0 16 | for i in idx: 17 | print n 18 | n += 1 19 | test_x = testX[int(i), :, :] 20 | for j in range(len(trainX)): 21 | if np.array_equal(test_x, trainX[j, :, :]): 22 | cnt += 1 23 | print "Dup" 24 | break 25 | print cnt 26 | -------------------------------------------------------------------------------- /src/utils/dumpDedup.py: -------------------------------------------------------------------------------- 1 | # python dumpDedup.py > ~/tweetnet/data/dump.txt 2 | import numpy 3 | from os.path import expanduser 4 | 5 | dumpFile = open(expanduser("~/tweetnet/data/dumpBig.txt")) 6 | dumpLines = dumpFile.readlines() 7 | 8 | dumpSet = set() 9 | 10 | for i in range(len(dumpLines)): 11 | if dumpLines[i][0:4] == 'text': 12 | if dumpLines[i] in dumpSet: 13 | continue; 14 | else: 15 | dumpSet.add(dumpLines[i]) 16 | print "\n", 17 | print dumpLines[i], 18 | print dumpLines[i+1], 19 | -------------------------------------------------------------------------------- /src/utils/getEnglishHashTweets.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import numpy 3 | import cPickle as pickle 4 | 5 | def checkHashtags(hashtagStr,dictionary): 6 | hasMultiEnglishHashtag = False 7 | returnHt = "hashtags:" 8 | htStr = hashtagStr[9:] 9 | htTokens = htStr.split(" ") 10 | nEnglishHashtag = 0 11 | for token in htTokens: 12 | try: 13 | if(len(dictionary[token])>0): 14 | returnHt = returnHt + " " + token 15 | nEnglishHashtag += 1 16 | except KeyError: 17 | continue 18 | if nEnglishHashtag >= 2: 19 | hasMultiEnglishHashtag = True 20 | return returnHt, hasMultiEnglishHashtag 21 | 22 | 23 | dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "r")) 24 | 25 | textFile = open(expanduser("~/tweetnet/data/dump.txt"), "r") 26 | fileLines = textFile.readlines() 27 | keepTweets = [] 28 | keepHashtags = [] 29 | counter = 0 30 | 31 | while((counter+1)= threshold: 19 | cnt += 1 20 | print "Threshold = ", threshold, " # Hashtags= ", cnt*threshold 21 | 22 | ht = dic.keys() 23 | freq = dic.values() 24 | idx = np.argsort(np.array(freq)) 25 | 26 | sortedHt = [] 27 | sortedFq = [] 28 | for i in idx: 29 | sortedHt.append(ht[i]) 30 | sortedFq.append(freq[i]) 31 | -------------------------------------------------------------------------------- /src/utils/loadData.py: -------------------------------------------------------------------------------- 1 | ''' 2 | loads twitter dataset from storm API. 
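Each tweet is one-hot encoded character by character with the reduced ASCII
dictionary, the hashtag embedding associated with the tweet is appended to
every timestep, and an end-of-tweet marker is set at index len(dictionary).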
3 | ''' 4 | import numpy as np 5 | import cPickle as pickle 6 | from ReducedAsciiDictionary import ReducedAsciiDictionary 7 | from os.path import expanduser 8 | 9 | def loadData(dictionary,ranges): 10 | ''' Creates dataset based on dictionary, a set of ascii 11 | ranges, and pickled twitter data from Apache Storm. 12 | 13 | X: (numTweets, 141, dictionaryLength + embeddings length) 14 | vocabLen: (dictionary length) 15 | tweetLength: (numTweets) 16 | ''' 17 | 18 | #load tweets and hashtag embeddings 19 | tweets = pickle.load(open(expanduser("~/tweetnet/data/preprocessed_new_tweets.pkl"),"rb")) 20 | embeddings = pickle.load(open(expanduser("~/tweetnet/data/new_embeddings.pkl"),"rb")) 21 | 22 | #visualize data 23 | #print "tweets (ELEMENT TYPE): ", type(tweets[0]) 24 | #print "tweets (Number Of Tweets): ", len(tweets) 25 | #print "hashtag (ELEMENT TYPE): ", type(embeddings[0]) 26 | #print "hashtag (SHAPE): ", embeddings.shape 27 | 28 | #create character dictionary for tweets. 29 | dictionary = ReducedAsciiDictionary({},ranges).dictionary 30 | 31 | #total number of tweets 32 | numData = len(tweets) 33 | 34 | #number of unique characters in dataset 35 | vocabLen = len(dictionary)+1 36 | 37 | #initialize datastore arrays 38 | X = np.zeros([numData, 140+1, vocabLen + embeddings.shape[1]]) 39 | tweetLength = np.zeros(numData) 40 | 41 | # for each tweet create onehot encoding for each character 42 | for twt in range(numData): 43 | if(twt%1000==0): 44 | print "loaded: ", twt, " of ", numData 45 | tweetLength[twt] = len(tweets[twt])-6+1 46 | currTweet = tweets[twt][6:len(tweets[twt])] 47 | 48 | for ch in range(len(currTweet)): 49 | oneHotIndex = dictionary.get(currTweet[ch]) 50 | X[twt,ch,oneHotIndex] = 1 51 | 52 | for embIndex in range(embeddings.shape[1]): 53 | X[twt,ch,embIndex+vocabLen] = embeddings[twt,embIndex] 54 | #end of tweet character (EOS) 55 | X[twt,len(currTweet),len(dictionary)]=1 56 | 57 | return X, vocabLen, tweetLength, dictionary 58 | -------------------------------------------------------------------------------- /src/utils/loadDataT2C.py: -------------------------------------------------------------------------------- 1 | ''' 2 | loads twitter dataset from storm API. 3 | ''' 4 | import sys 5 | import os 6 | import numpy as np 7 | import cPickle as pickle 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 9 | from ReducedAsciiDictionary import ReducedAsciiDictionary 10 | from getEnglishHashTweets import checkHashtags 11 | from numpy import random 12 | from random import shuffle 13 | from os.path import expanduser 14 | 15 | def loadData(dictionary,ranges,sequenceLength,trainPercent, freqThreshold): 16 | ''' Creates dataset based on dictionary, a set of ascii 17 | ranges, and pickled twitter data from Apache Storm. 
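    Tweets are first rebalanced by hashtag frequency (via normalizeByFreq and
    freqThreshold), split into train/test sets by trainPercent, and then cut
    into overlapping character sequences of length sequenceLength.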
18 | 19 | 20 | X: [#sequences, 40, 65] 21 | y: [#sequences, 300] 22 | vocabLen: (dictionary length) 23 | tweetLength: (numTweets) 24 | ''' 25 | 26 | #load tweets with >=2 hashtags and corresponding english hashtags 27 | tweets = pickle.load(open(expanduser("~/tweetnet/data/multitaskTweets.pkl"), "rb")) 28 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/multitaskHashtags.pkl"), "rb")) 29 | 30 | #load hashtag frequency dictionary 31 | hashtagFreq = pickle.load(open(expanduser("~/tweetnet/data/hashtagFreq.pkl"), "rb")) 32 | 33 | #modifiedTweets = [] 34 | #for i in range(len(tweets)): 35 | # # Get rid of the "text: " and add start of text and end of text 36 | # modifiedTweets.append(chr(2) + tweets[i][6:] + chr(3)) 37 | #tweets = modifiedTweets 38 | 39 | #Normalize data by frequency 40 | tweets_shuf, hashtags_shuf = normalizeByFreq(tweets, hashtags, hashtagFreq, freqThreshold) 41 | 42 | nTweet = len(tweets_shuf) 43 | nTrainData = np.ceil(nTweet*trainPercent).astype(int) 44 | 45 | #Split the tweets and hashtags into training and testing set 46 | trainTweets = tweets_shuf[0: nTrainData] 47 | trainHashtags = hashtags_shuf[0: nTrainData] 48 | testTweets = tweets_shuf[nTrainData: nTweet] 49 | testHashtags = hashtags_shuf[nTrainData: nTweet] 50 | nTestData = len(testTweets) 51 | 52 | 53 | #load word2vec dictionary 54 | print("Loading word2vec dictionary") 55 | word2vecDict = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb")) 56 | print("Finished loading word2vec dictionary") 57 | 58 | #create character dictionary for tweets. 59 | dictionary = ReducedAsciiDictionary({},ranges).dictionary 60 | dictionary[chr(2)] = len(dictionary) 61 | dictionary[chr(3)] = len(dictionary) 62 | #number of unique characters in dataset 63 | vocabLen = len(dictionary) 64 | 65 | #initialize datastore arrays 66 | trainTweetSequence = [] 67 | trainHashtagSequence = [] 68 | testTweetSequence = [] 69 | testHashtagSequence = [] 70 | 71 | #vector in word2vec is 300 72 | embeddingLength = 300 73 | 74 | #Split data into sequences of length 40 for training 75 | for i in range(nTrainData): 76 | oneTweet = trainTweets[i] 77 | for j in range(0, len(oneTweet) - sequenceLength + 1, 1): 78 | trainTweetSequence.append(oneTweet[j : j+sequenceLength]) 79 | trainHashtagSequence.append(trainHashtags[i]) 80 | print('Number of sequences in training data: ', len(trainTweetSequence)) 81 | print('Number of hashtags in training data: ', len(trainHashtagSequence)) 82 | 83 | 84 | #Split data into sequences of length 40 for testing 85 | for i in range(nTestData): 86 | oneTweet = testTweets[i] 87 | ht = hashtags[i].split(" ") 88 | for j in range(0, len(oneTweet) - sequenceLength + 1, 1): 89 | testTweetSequence.append(oneTweet[j : j+sequenceLength]) 90 | testHashtagSequence.append(testHashtags[i]) 91 | print('Number of sequences in testing data: ', len(testTweetSequence)) 92 | print('Number of hashtags in testing data: ', len(testHashtagSequence)) 93 | 94 | 95 | # for each sequence, create onehot encoding for each character 96 | print("Vectorization...") 97 | 98 | # trainX: [#training sequences, 40, 65] 99 | # trainy: [#training sequences, 300] 100 | trainX = np.zeros((len(trainTweetSequence), sequenceLength, vocabLen), dtype=np.bool) 101 | trainY = np.zeros((len(trainTweetSequence), embeddingLength)) 102 | for i, seq in enumerate(trainTweetSequence): 103 | if i % 10000 == 0: 104 | print("Loading training tweet ", i) 105 | for j, ch in enumerate(seq): 106 | oneHotIndex = dictionary.get(ch) 107 | trainX[i,j,oneHotIndex] = 1 
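        # Regression target for this sequence: the word2vec embedding of its paired hashtag.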
108 | trainY[i] = word2vecDict[trainHashtagSequence[i]] 109 | 110 | # testX: [#testing sequences, 40, 65] 111 | # testy: [#testing sequences, 300] 112 | testX = np.zeros((len(testTweetSequence), sequenceLength, vocabLen), dtype=np.bool) 113 | testY = np.zeros((len(testTweetSequence), embeddingLength)) 114 | 115 | for i, seq in enumerate(testTweetSequence): 116 | if i % 10000 == 0: 117 | print("Loading testing tweet ", i) 118 | for j, ch in enumerate(seq): 119 | oneHotIndex = dictionary.get(ch) 120 | testX[i,j,oneHotIndex] = 1 121 | testY[i] = word2vecDict[testHashtagSequence[i]] 122 | 123 | tweet2hashtagParam = [trainTweets, trainHashtags, testTweets, testHashtags, trainX, trainY, testX, testY, trainTweetSequence, trainHashtagSequence, testTweetSequence, testHashtagSequence] 124 | 125 | 126 | return trainTweets, trainHashtags, testTweets, testHashtags, trainX, trainY, testX, testY, trainTweetSequence, trainHashtagSequence, testTweetSequence, testHashtagSequence, word2vecDict 127 | 128 | 129 | if __name__ == "__main__": 130 | trainTweets, trainHashtags, testTweets, testHashtags, trainX, trainY, testX, testY, trainTweetSequence, trainHashtagSequence, testTweetSequence, testHashtagSequence, dictionary = loadData({},np.array([]), 40, 0.99, 84) 131 | dic = {} 132 | for i in range(1000): 133 | if dic.get(trainHashtagSequence[i]) == None: 134 | dic[trainHashtagSequence[i]] = 1 135 | print dic.keys() 136 | 137 | -------------------------------------------------------------------------------- /src/utils/loadData_lstm.py: -------------------------------------------------------------------------------- 1 | ''' 2 | loads twitter dataset from storm API. 3 | ''' 4 | import numpy as np 5 | import cPickle as pickle 6 | from ReducedAsciiDictionary import ReducedAsciiDictionary 7 | from os.path import expanduser 8 | import sys 9 | 10 | def loadData(dictionary,ranges,sequenceLength, nTweet): 11 | ''' Creates dataset based on dictionary, a set of ascii 12 | ranges, and pickled twitter data from Apache Storm. 13 | 14 | 15 | X: [#sequences, 40, 65] 16 | y: [#sequences, 65] 17 | vocabLen: (dictionary length) 18 | tweetLength: (numTweets) 19 | ''' 20 | 21 | #load tweets and hashtag embeddings 22 | tweets = pickle.load(open(expanduser("~/tweetnet/data/preprocessed_new_tweets.pkl"),"rb")) 23 | np.random.shuffle(tweets) 24 | 25 | #use the first nTweet tweets 26 | tweets = tweets[0:nTweet] 27 | print "Number of tweets ", len(tweets) 28 | 29 | #create character dictionary for tweets. 
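# vocabLen below is len(dictionary)+1: the extra index is reserved for the
# end-of-tweet (EOS) symbol that stands in for the "next character" once a
# tweet runs out.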
30 | dictionary = ReducedAsciiDictionary({},ranges).dictionary 31 | 32 | #total number of tweets 33 | numData = len(tweets) 34 | 35 | #number of unique characters in dataset 36 | vocabLen = len(dictionary)+1 37 | 38 | #initialize datastore arrays 39 | tweetSequence = [] 40 | nextChar = [] 41 | tweetLength = np.zeros(numData) 42 | 43 | #Split data into sequences of length 40 and create nextChar array 44 | for i in range(numData): 45 | oneTweet = tweets[i] 46 | for j in range(0, len(oneTweet) - sequenceLength - 1, 1): 47 | tweetSequence.append(oneTweet[j : j+sequenceLength]) 48 | nextChar.append(oneTweet[j+sequenceLength]) 49 | tweetSequence.append(oneTweet[len(oneTweet)-sequenceLength - 1:len(oneTweet) - 1]) 50 | nextChar.append("") 51 | print('Number of sequences: ', len(tweetSequence)) 52 | 53 | # for each sequence, create onehot encoding for each character 54 | # X: [#sequences, 40, 65] 55 | # y: [#sequences, 65] 56 | print("Vectorization...") 57 | X = np.zeros((len(tweetSequence), sequenceLength, vocabLen), dtype=np.bool) 58 | y = np.zeros((len(tweetSequence), vocabLen), dtype=np.bool) 59 | 60 | for i, seq in enumerate(tweetSequence): 61 | if i % 10000 == 0: 62 | print "Loading tweet ", i 63 | for j, ch in enumerate(seq): 64 | oneHotIndex = dictionary.get(ch) 65 | X[i,j,oneHotIndex] = 1 66 | 67 | if nextChar[i] != "": 68 | y[i, dictionary.get(nextChar[i])] = 1 69 | else: 70 | y[i, len(dictionary)] = 1 71 | return X, y, vocabLen, dictionary, tweetSequence, nextChar, tweets 72 | 73 | if __name__ == "__main__": 74 | X, y, vocabLen, dictionary, tweetSequence, nextChar, tweets = loadData({},np.array([]),40) 75 | print "The first tweet sequence is: ", tweetSequence[0] 76 | -------------------------------------------------------------------------------- /src/utils/loadKaggleHelpful.py: -------------------------------------------------------------------------------- 1 | #import variables 2 | import cPickle as pickle 3 | import numpy 4 | import gzip 5 | from os.path import expanduser 6 | # function implementations 7 | def readGz(f): 8 | for l in gzip.open(f): 9 | yield eval(l) 10 | 11 | def loadTrain(): 12 | text=[];helpful=[];outOf=[];userID=[];itemID=[] 13 | # collecting the data 14 | for metablock in readGz(expanduser('~/tweetnet/data/train.json.gz')): 15 | text.append(metablock['reviewText']) 16 | helpful.append(metablock['helpful']['nHelpful']) 17 | outOf.append(metablock['helpful']['outOf']) 18 | userID.append(metablock['reviewerID']) 19 | itemID.append(metablock['itemID']) 20 | return text, helpful,outOf, userID, itemID 21 | 22 | def loadTest(): 23 | text=[];outOf=[];userID=[];itemID=[] 24 | #collecting the data 25 | for metablock in readGz(expanduser('~/tweetnet/data/test_Helpful.json.gz')): 26 | text.append(metablock['reviewText']) 27 | outOf.append(metablock['helpful']['outOf']) 28 | userID.append(metablock['reviewerID']) 29 | itemID.append(metablock['itemID']) 30 | return text, outOf, userID, itemID 31 | -------------------------------------------------------------------------------- /src/utils/logger.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from os.path import expanduser 3 | def logger(listVals, name): 4 | logwriter = open(expanduser('~/tweetnet/logs/'+ name), 'a') 5 | logwriter.write("\n ########################################## \n") 6 | logwriter.write("EPOCH: " + str(listVals[0][0]) + "\n") 7 | for i in xrange(1,len(listVals)-1): 8 | logwriter.write("input: " + str(listVals[i][0]) + "\n") 9 | 
logwriter.write("target: " + str(listVals[i][1]) + "\n") 10 | logwriter.write("isCorrect: " + str(listVals[i][2]) + "\n") 11 | logwriter.write("topN: "+ str(listVals[i][3]) + "\n\n") 12 | logwriter.write("numCorrect: " + str(listVals[len(listVals)-1][0])) 13 | logwriter.write(" percCorrect: " + str(listVals[len(listVals)-1][1])) 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/utils/mkMultiTaskTweet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | loads twitter dataset from storm API for multitasking model training 3 | Task 1: Hashtag prediction 4 | Task 2: missing word completion 5 | 6 | Data format: tweet -- hashtag -- missing word 7 | ''' 8 | import sys 9 | import os 10 | import numpy as np 11 | import cPickle as pickle 12 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 13 | from ReducedAsciiDictionary import ReducedAsciiDictionary 14 | from getEnglishHashTweets import checkHashtags 15 | from numpy import random 16 | from random import shuffle 17 | from os.path import expanduser 18 | import re 19 | import string 20 | from stop_words import get_stop_words 21 | 22 | def mkMissingWord(text, word2vecDict): 23 | 24 | punctuation = set(string.punctuation) 25 | stop_words = get_stop_words('english') 26 | words = text.split() 27 | cnt = 0 28 | while cnt <= 7: 29 | idx = 1 + random.randint(len(words) - 1) 30 | w = words[idx] 31 | w = ''.join([c for c in w.lower() if not c in punctuation]) 32 | if len(w)==1 or word2vecDict.get(w) == None or w in stop_words: 33 | cnt += 1 34 | else: 35 | missingWord = w 36 | words[idx] = "UNK" 37 | text = " ".join(words) 38 | return (text, missingWord) 39 | return (None,None) 40 | 41 | 42 | 43 | def tweetsForMultiTask(tweets, hashtags, word2vecDict): 44 | 45 | 46 | tweets_shuf = [] 47 | hashtags_shuf = [] 48 | missingWords = [] 49 | idx_shuf = range(len(tweets)) 50 | shuffle(idx_shuf) 51 | for i in idx_shuf: 52 | ht = hashtags[i].split(" ") 53 | 54 | text, missingWord = mkMissingWord(tweets[i], word2vecDict) 55 | if text != None and missingWord != None: 56 | tweets_shuf.append(text) 57 | hashtags_shuf.append(ht[2]) 58 | missingWords.append(missingWord) 59 | print (text, ht[2], missingWord) 60 | 61 | return tweets_shuf, hashtags_shuf, missingWords 62 | 63 | 64 | def loadData(dictionary,ranges,sequenceLength,trainPercent): 65 | ''' Creates dataset based on dictionary, a set of ascii 66 | ranges, and pickled twitter data from Apache Storm. 
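    Besides pairing each tweet with one of its English hashtags, this routine
    replaces one randomly chosen non-stopword that has a word2vec entry with
    "UNK", keeps that word as the missing-word target, and pickles the
    resulting tweets, hashtags, and missing words for the multi-task models.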
67 | 68 | 69 | X: [#sequences, 40, 65] 70 | y: [#sequences, 300] 71 | vocabLen: (dictionary length) 72 | tweetLength: (numTweets) 73 | ''' 74 | 75 | 76 | #load tweets with >=2 hashtags and corresponding english hashtags 77 | tweets = pickle.load(open(expanduser("~/tweetnet/data/englishHashtagTweet.pkl"), "rb")) 78 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/englishHashtag.pkl"), "rb")) 79 | modifiedTweets = [] 80 | 81 | #load word2vec dictionary 82 | print("Loading word2vec dictionary") 83 | word2vecDict = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb")) 84 | print("Finished loading word2vec dictionary") 85 | 86 | for i in range(len(tweets)): 87 | # Get rid of the "text: " and add start of text and end of text 88 | modifiedTweets.append(chr(2) + tweets[i][6:] + chr(3)) 89 | 90 | tweets = modifiedTweets 91 | tweets, hashtags, missingWords = tweetsForMultiTask(tweets, hashtags, word2vecDict) 92 | 93 | nTweet = len(tweets) 94 | 95 | print "Number of remaining tweets: ", nTweet 96 | 97 | print "Saving data to files ..." 98 | with open(expanduser("~/tweetnet/data/multitaskTweets.pkl"), "wb") as file1: 99 | pickle.dump(tweets, file1, pickle.HIGHEST_PROTOCOL) 100 | with open(expanduser("~/tweetnet/data/multitaskHashtags.pkl"), "wb") as file2: 101 | pickle.dump(hashtags, file2, pickle.HIGHEST_PROTOCOL) 102 | with open(expanduser("~/tweetnet/data/multitaskTweetMw.pkl"), "wb") as file3: 103 | pickle.dump(missingWords, file3, pickle.HIGHEST_PROTOCOL) 104 | 105 | if __name__ == "__main__": 106 | loadData({},np.array([]), 40, 0.9) 107 | -------------------------------------------------------------------------------- /src/utils/predContext.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy 3 | 4 | def createHtDict(dic, allHashtags): 5 | htDic = {} 6 | for ht in allHashtags: 7 | if ht not in htDic.keys(): 8 | htDic[ht] = dic[ht] 9 | return htDic 10 | 11 | def predContext(htDictionary, modelOutput, topN, label): 12 | correct = False 13 | keyResult = [] 14 | sortedKeyResult = [] 15 | dotResult = numpy.zeros([len(htDictionary)]) 16 | 17 | counter = 0 18 | for k in htDictionary.keys(): 19 | dotResult[counter] = -numpy.dot(modelOutput,htDictionary[k])[-1] 20 | keyResult.append(k) 21 | counter = counter + 1 22 | 23 | sortIndex = numpy.argsort(dotResult) 24 | topNdots = dotResult[sortIndex[0:topN]] 25 | 26 | for i in range(topN): 27 | sortedKeyResult.append(keyResult[sortIndex[i]]) 28 | if label == keyResult[sortIndex[i]]: 29 | correct = True 30 | return sortedKeyResult, correct, topNdots 31 | -------------------------------------------------------------------------------- /src/utils/prelimTest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import expanduser 3 | 4 | c2c = open(expanduser("~/tweetnet/logs/Feb/c2c2017-03-04_13:19.log"), "rb") 5 | t2c = open(expanduser("~/tweetnet/logs/t2c2017-03-04_13:42.log"), "rb") 6 | 7 | cnt = 0 8 | 9 | correctDic = {} 10 | 11 | lines = c2c.read() 12 | lines = lines.split("\n\n") 13 | 14 | for blocks in lines: 15 | if blocks[0:5] == "input": 16 | blocks = blocks.split("\n") 17 | if "True" in blocks[2]: 18 | correctDic[cnt] = 1 19 | else: 20 | correctDic[cnt] = 0 21 | cnt += 1 22 | 23 | cnt = 0 24 | lines = t2c.read() 25 | lines = lines.split("\n\n") 26 | 27 | for blocks in lines: 28 | if blocks[0:5] == "input": 29 | blocks = blocks.split("\n") 30 | if "True" in blocks[2]: 31 | if correctDic[cnt] == 0: 32 | 
correctDic[cnt] = 1 33 | cnt += 1 34 | 35 | accuracy = sum(correctDic.values()) * 1.0/ len(correctDic) 36 | print accuracy 37 | -------------------------------------------------------------------------------- /src/utils/preprocessor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Function reduces all tweets to contain only characters in 3 | the second and fourth columns of the standard ascii table. 4 | This should be used if you are using an old version 5 | of the storm topology that does not do this online. 6 | ''' 7 | 8 | import cPickle as pickle 9 | import numpy as np 10 | from os.path import expanduser 11 | 12 | tweets = pickle.load(open(expanduser("~/tweetnet/data/new_tweets_list_string.pkl"),"rb")) 13 | embeddings = pickle.load(open(expanduser("~/tweetnet/data/new_embeddings.pkl"),"rb")) 14 | 15 | print "tweet array shape: ", len(tweets) 16 | print "embeddings array shape: ", embeddings.shape 17 | print "tweet array type: ", type(tweets[0]) 18 | print "embeddings array type: ", type(embeddings[0]) 19 | 20 | 21 | for i in range(len(tweets)): 22 | s="" 23 | for j in range(len(tweets[i])): 24 | asciiVal = ord(tweets[i][j]) 25 | 26 | if(asciiVal>=32 and asciiVal<=63): 27 | s+=tweets[i][j] 28 | elif(asciiVal>=96 and asciiVal <= 127): 29 | s+=tweets[i][j] 30 | else: 31 | continue 32 | tweets[i]=s 33 | 34 | pickle.dump(tweets, open(expanduser("~/tweetnet/data/preprocessed_new_tweets.pkl"),"wb")) 35 | 36 | -------------------------------------------------------------------------------- /src/utils/tf_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | else: return None 39 | 40 | 41 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 42 | 43 | x = tf.reshape(x, [-1, in_shape]) 44 | 45 | with tf.variable_scope(scope): 46 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 47 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 48 | logits = tf.add(tf.matmul(x, w), b) 49 | output = 
applyActivation(logits, activation) 50 | # Compute the mean-squared-error 51 | cost = tf.reduce_mean(tf.square(tf.subtract(y , output))) 52 | 53 | return cost, output 54 | -------------------------------------------------------------------------------- /src/utils/tf_utils_reg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | else: return None 39 | 40 | 41 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 42 | 43 | x = tf.reshape(x, [-1, in_shape]) 44 | 45 | with tf.variable_scope(scope): 46 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 47 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 48 | logits = tf.add(tf.matmul(x, w), b) 49 | output = applyActivation(logits, activation) 50 | # Compute the mean-squared-error 51 | cost = tf.reduce_mean(tf.square(tf.subtract(y , output))) 52 | 53 | return cost, output 54 | -------------------------------------------------------------------------------- /src/utils/tweetGenerator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reads weight matrix from hdf5 file and generates text using seed. 
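The script rebuilds the stacked context-LSTM, loads the checkpointed weights
from ~/tweetnet/logs/intermediateWeights.hdf5, seeds generation with a random
tweet prefix plus its fixed context (hashtag-embedding) subvector, and samples
up to 140 characters, stopping early when the EOS index is drawn.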
3 | ''' 4 | import numpy as np 5 | import pickle as pkl 6 | import numpy as np 7 | from numpy import random 8 | from loadData import loadData 9 | from keras.utils import np_utils 10 | from keras.models import Sequential 11 | from keras.layers import LSTM 12 | from keras.layers import Dense 13 | from keras.layers import Activation 14 | from keras.optimizers import RMSprop 15 | from keras.optimizers import Adagrad 16 | from keras.layers import Dropout 17 | from keras.layers import BatchNormalization 18 | from scipy.stats import rv_discrete 19 | import sys 20 | 21 | def generateText(dictionary, data, dictLen, tweetLen, X, y, 22 | inputSize, sequenceLength, numHiddenFirst, numTweets, seqPerSegment, 23 | n_examples, numSegments): 24 | 25 | # data shape = #tweets x 141 x inputSize(365) 26 | #initialize inverse dictionary to map integers to characterse 27 | inverseDictionary = {v: k for k, v in dictionary.iteritems()} 28 | print "inverseDictionary Size", len(inverseDictionary) 29 | 30 | #building cLSTM model 31 | print("\n") 32 | print("Generating Text... ") 33 | model = Sequential() 34 | 35 | model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize), return_sequences=True)) 36 | model.add(LSTM(numHiddenFirst)) 37 | 38 | model.add(Dense(numHiddenFirst)) 39 | model.add(Activation('relu')) 40 | model.add(BatchNormalization()) 41 | 42 | model.add(Dense(numHiddenFirst)) 43 | model.add(Activation('relu')) 44 | model.add(BatchNormalization()) 45 | 46 | model.add(Dense(dictLen)) 47 | model.add(Activation('softmax')) 48 | 49 | 50 | #load the network weights 51 | fileName = "~/tweetnet/logs/intermediateWeights.hdf5" 52 | model.load_weights(fileName) 53 | model.compile(loss='categorical_crossentropy', optimizer='adam') 54 | 55 | #initializing to random seed 56 | seedTweet = np.random.randint(n_examples, size=1) 57 | contextVector=np.zeros(inputSize-(dictLen)) 58 | 59 | printSeed="SEED: " 60 | for c in range(sequenceLength): 61 | #for each character in the sequence 62 | 63 | #grab the pattern, which is the 1x365 input vector 64 | pattern = X[seedTweet][c,:] 65 | 66 | #grab the 1x300 context subvector 67 | contextVector = pattern[dictLen:] 68 | 69 | #search, in the pattern itself, for the one-hot element 70 | counter = 0 71 | for i in range(dictLen): 72 | if(pattern[i] == 1): 73 | counter = i 74 | break 75 | #if one-hot element is greater than 64, then EOS. 
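#(characters occupy one-hot indices 0-63; index 64, i.e. len(dictionary), is the EOS slot)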
76 | #technically you'll never reach this as seqLen should be < tweetLen 77 | if(counter>=64): 78 | printSeed = printSeed + "<>" 79 | continue; 80 | 81 | printSeed = printSeed + inverseDictionary[counter] 82 | print printSeed 83 | 84 | x = X[seedTweet][0:sequenceLength] 85 | inputVector = np.reshape(x,(1,len(x),len(x[0]))) 86 | #generate characters 87 | 88 | printResult = "GENERATED TEXT: " 89 | 90 | charsGenerated = 140 91 | for i in range(charsGenerated): 92 | 93 | prediction = model.predict(inputVector, verbose=0) 94 | #index = np.argsort(prediction) 95 | #rand = np.random.randint(5) 96 | #rand_index = index[0][len(index[0]) - rand - 1] 97 | 98 | rand_index = rv_discrete(values=(list(xrange(len(prediction[0]))),prediction[0])).rvs(size=1)[0] 99 | if(rand_index==(dictLen-1)): 100 | printResult = printResult + "<>" 101 | break 102 | 103 | result = inverseDictionary[rand_index] 104 | printResult = printResult+result 105 | 106 | charVector=np.zeros(dictLen) 107 | charVector[rand_index]=1 108 | currInput = np.concatenate((charVector,contextVector)) 109 | 110 | concatVector = np.reshape(currInput, (1,1,len(currInput))) 111 | 112 | inputVector=np.concatenate((inputVector,concatVector), axis=1) 113 | inputVector=inputVector[:,1:len(inputVector[0]),:] 114 | 115 | print printResult 116 | -------------------------------------------------------------------------------- /src/utils/tweetGenerator_lstm.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reads weight matrix from hdf5 file and generates text using seed. 3 | ''' 4 | import numpy as np 5 | import pickle as pkl 6 | import numpy as np 7 | import h5py 8 | from numpy import random 9 | from loadData_lstm import loadData 10 | from keras.utils import np_utils 11 | from keras.models import Sequential 12 | from keras.layers import LSTM 13 | from keras.layers import Dense 14 | from keras.layers import Activation 15 | from keras.optimizers import RMSprop 16 | from keras.optimizers import Adagrad 17 | from keras.layers import Dropout 18 | from keras.layers import BatchNormalization 19 | from scipy.stats import rv_discrete 20 | import sys 21 | from os.path import expanduser 22 | 23 | def sample(preds, temperature=1.0): 24 | preds = np.asarray(preds).astype('float64') 25 | preds = np.log(preds) / temperature 26 | exp_preds = np.exp(preds) 27 | preds = exp_preds / np.sum(exp_preds) 28 | probas = np.random.multinomial(1, preds, 1) 29 | return np.argmax(probas) 30 | 31 | def generateText(model, tweets, sequenceLength, vocabLen, dictionary): 32 | 33 | # Random select a tweet for generation 34 | start_index = random.randint(len(tweets)) 35 | inverseDictionary = {v: k for k, v in dictionary.iteritems()} 36 | 37 | # Different temperature adds randomness to character generation 38 | for diversity in [0.2, 0.5, 1.0, 1.2]: 39 | print("\n") 40 | print('----- diversity:', diversity) 41 | 42 | generated = "" 43 | 44 | seed = tweets[start_index][6:sequenceLength+6] 45 | print('----- Generating with seed: "' + seed + '"') 46 | generated += seed 47 | sys.stdout.write(generated) 48 | 49 | for i in range(140): 50 | 51 | # x: [1, sequenceLength(40), 65] 52 | x = np.zeros((1, sequenceLength, vocabLen)) 53 | 54 | # Create one hot encoding vectors for the seed 55 | for j, ch in enumerate(seed): 56 | x[0, j, dictionary.get(ch)] = 1 57 | 58 | preds = model.predict(x, verbose=0)[0] 59 | next_index = sample(preds, diversity) 60 | 61 | # If an EOS symbol is genearted, append "" to the end of generated and stop 62 | if next_index == 
vocabLen - 1: 63 | next_char = "" 64 | generated += next_char 65 | seed = seed[1:] + next_char 66 | sys.stdout.write(next_char) 67 | sys.stdout.flush() 68 | break 69 | 70 | # If not an EOS symbol, append the newly generated char 71 | else: 72 | next_char = inverseDictionary[next_index] 73 | generated += next_char 74 | # Shift the window by 1 to create the new seed 75 | seed = seed[1:] + next_char 76 | sys.stdout.write(next_char) 77 | sys.stdout.flush() 78 | print("\n") 79 | -------------------------------------------------------------------------------- /src/utils/visualizeData.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | from os.path import expanduser 4 | 5 | tweets = pickle.load(open(expanduser("~/tweetnet/data/preprocessed_new_tweets.pkl"),"rb")) 6 | embeddings = pickle.load(open(expanduser("~/tweetnet/data/new_embeddings.pkl"),"rb")) 7 | 8 | print "tweet array shape: ", len(tweets) 9 | print "embeddings array shape: ", embeddings.shape 10 | print "tweet array type: ", type(tweets[0]) 11 | print "embeddings array type: ", type(embeddings[0]) 12 | 13 | for i in range(100): 14 | print tweets[i] 15 | --------------------------------------------------------------------------------