├── .gitignore ├── 1709.05554.pdf ├── README.md ├── auto ├── data │ └── ag_news_csv │ │ ├── create_vali.py │ │ ├── load_batch.py │ │ ├── load_util.py │ │ ├── reformat.py │ │ └── stats.py ├── src │ ├── model │ │ ├── @ │ │ ├── ccrnn_model_gen.py │ │ ├── mcrnn_model.py │ │ ├── mcrnn_model_1_lstm.py │ │ ├── mcrnn_model_gen.py │ │ ├── mcrnn_model_gen2.py │ │ ├── mcrnn_model_gen_bi.py │ │ ├── mcrnn_model_gen_bi2.py │ │ ├── train.py │ │ ├── train_1_lstm.py │ │ ├── train_gen.py │ │ └── train_gen2.py │ └── util │ │ ├── 1q │ │ ├── hps.py │ │ ├── hps2.py │ │ ├── hps_script.sh │ │ ├── load_batch.py │ │ ├── load_batch2.py │ │ ├── load_batch_val.py │ │ ├── load_util.py │ │ ├── reformat.py │ │ ├── tf_utils.py │ │ ├── tf_utils_old.py │ │ └── tf_utils_reg.py └── src_final │ ├── model │ ├── ccrnn_model_gen.py │ ├── mcrnn_model.py │ ├── mcrnn_model_1_lstm.py │ ├── mcrnn_model_gen.py │ ├── mcrnn_model_gen2.py │ ├── mcrnn_model_gen_bi.py │ ├── mcrnn_model_gen_bi2.py │ ├── train.py │ ├── train_1_lstm.py │ ├── train_gen.py │ └── train_gen2.py │ └── util │ ├── 1q │ ├── hps.py │ ├── hps2.py │ ├── hps_script.sh │ ├── load_batch.py │ ├── load_batch2.py │ ├── load_batch_val.py │ ├── load_util.py │ ├── reformat.py │ ├── tf_utils.py │ ├── tf_utils_old.py │ └── tf_utils_reg.py ├── data ├── .gitkeep ├── helpX ├── test_Category.json.gz ├── test_Helpful.json.gz └── train.json.gz ├── logs └── .gitkeep ├── scripts ├── .gitkeep ├── killZk.sh ├── newTerminalMac.sh ├── startKafkaServer.sh ├── startNimbus.sh ├── startStormUI.sh ├── startSupervisor.sh ├── startZK.sh ├── startZKClient.sh ├── systemStartMac.sh ├── systemStartUbuntu.sh └── userRunAPI.sh └── src ├── .gitkeep ├── models ├── c2c_cooccurence.py ├── c2c_cooccurenceNonUniform.py ├── cascKeras.py ├── cascade.py ├── ccrnn.py ├── ccrnn_bn.py ├── ccrnn_drop.py ├── ccrnn_swap.py ├── cflstm.py ├── contextToContext.py ├── contextToContextNonUniform.py ├── mcrnn.py ├── mcrnn_bn.py ├── mtlKeras.py ├── tc2c.py ├── textToContext.py ├── tf_t2c.py ├── tweetnet.py └── tweetnet_lstm.py ├── storm ├── TwitterCleanerBolt.java ├── TwitterStorm.java ├── TwitterStreamSpout.java └── pom.xml └── utils ├── ReducedAsciiDictionary.py ├── checkTrainTestDup.py ├── dumpDedup.py ├── embeddingGeneration.py ├── getEnglishHashTweets.py ├── hashtagFrequency.py ├── loadData.py ├── loadDataNewModel.py ├── loadDataT2C.py ├── loadDataText2Hashtag.py ├── loadDataTweetMultiTask.py ├── loadData_lstm.py ├── loadKaggleHelpful.py ├── logger.py ├── mkMultiTaskTweet.py ├── predContext.py ├── prelimTest.py ├── preprocessor.py ├── tf_utils.py ├── tf_utils_reg.py ├── tweetGenerator.py ├── tweetGenerator_lstm.py └── visualizeData.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Mobile Tools for Java (J2ME) 4 | .mtj.tmp/ 5 | 6 | # Package Files # 7 | *.jar 8 | *.war 9 | *.ear 10 | *.html 11 | *.pkl 12 | *.pyc 13 | *.swp 14 | *.out 15 | *.hdf5 16 | *.txt 17 | *.csv 18 | *zookeeper* 19 | *.jpg 20 | *.png 21 | *storm-local* 22 | *.bmp 23 | runAPI.sh 24 | *.gz 25 | *.log 26 | # misc data files# 27 | *.html 28 | *.pkl 29 | *.pyc 30 | *.swp 31 | *.out 32 | *.hdf5 33 | *.txt 34 | *.csv 35 | 36 | # misc files generated by zk and storm 37 | *zookeeper* 38 | *storm-local* 39 | runAPI.sh 40 | 41 | # misc data files# 42 | *.html 43 | *.pkl 44 | *.pyc 45 | *.swp 46 | *.out 47 | *.hdf5 48 | *.txt 49 | *.csv 50 | 51 | # misc files generated by zk and storm 52 | *zookeeper* 53 | *storm-local* 54 | runAPI.sh 55 | 56 | # virtual machine crash logs, see 
http://www.java.com/en/download/help/error_hotspot.xml 57 | hs_err_pid* 58 | -------------------------------------------------------------------------------- /1709.05554.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/1709.05554.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Automated Multi-task Learning 2 | **Automated MTL** supports two generalized multi-tasking recurrent deep learning architectures. Automated MTL uses the statistical regularities within the original dataset itself to reinforce the representations learned for the primary task. Automated MTL comes in two flavors: the CRNN (Cascaded Recurrent Neural Network) and the MRNN (Multi-tasking Recurrent Neural Network). 3 | 4 | The automated MTL architectures have achieved state-of-the-art performance in sentiment analysis, topic prediction, and hashtag recommendation using a diverse set of text corpora including Twitter, Rotten Tomatoes, and IMDB. 5 | 6 | ## The Infinite Data Pipeline (∞DP): 7 | A side project of automated MTL resulted in the ***Infinite Data Pipeline***, which is built on Java, Apache Storm, Kafka, and the Twitter API. The Infinite Data Pipeline streams and preprocesses Twitter data online and directly injects the streamed data into a running TensorFlow topology. 8 | 9 | ## Requirements: 10 | 1. cuDNN (tested on cuDNN 5105) 11 | 2. CUDA drivers + an NVIDIA graphics card with compute capability 5.0+ (tested on a GTX 1080) 12 | 3. Apache Zookeeper (tested on version 3.4.6) 13 | 4. Apache Storm (tested on version 0.9.5) 14 | 5. Twitter API + Developer Credentials (tested on version 4.0.4) 15 | 6. Theano (tested on version 0.8.2) 16 | 7. Keras (tested on the latest version as of January 9, 2017) 17 | 8. Linux-based OS (tested on Ubuntu 16.04 LTS) 18 | 19 | ## Install Guide: 20 | 1. [Install CUDA and cuDNN](http://tleyden.github.io/blog/2015/11/22/cuda-7-dot-5-on-aws-gpu-instance-running-ubuntu-14-dot-04/) 21 | 2. [Apache Storm and Twitter API Setup](https://www.tutorialspoint.com/apache_storm/apache_storm_installation.htm) 22 | 3. [Install Keras and Theano](http://www.pyimagesearch.com/2016/07/18/installing-keras-for-deep-learning/) 23 | 4. [Download Kafka 2.10](https://www.apache.org/dyn/closer.cgi?path=/kafka/0.10.1.1/kafka_2.10-0.10.1.1.tgz) 24 | 25 | ## Data Miner Run Guide (MacOSX Local): 26 | 1. Run **systemStartMac.sh** to start your *Storm* instance. Make sure `KAFKAHOME` is set correctly in `scripts/startKafkaServer.sh`. 27 | 2. Edit `src/storm/pom.xml` with the appropriate Twitter credentials. Run `mvn install` inside `src/storm` to compile and `mvn exec:java` to start the data collection and streaming. 28 | 29 | ## Data Miner Run Guide (Ubuntu 16.04 Local): 30 | 1. Run **systemStartUbuntu.sh** to start your *Storm* instance. 31 | 2. Run **runAPI.sh** to open the *Twitter* stream and start collection. (This requires editing **runAPI.sh** with valid *Twitter* API credentials.) 32 | 33 | ## Tweetnet Run Guide: 34 | 1. Run **tweetnet.py**. 35 | 36 | ## Notes: 37 | 38 | **Note**: The system start script opens five new terminals: one each for *Apache Zookeeper*, the *Nimbus*, the *Supervisor*, *StormUI*, and the *Kafka* server. Each newly opened terminal requires **sudo** access and will prompt for the user's password. To view *StormUI*, navigate to *localhost:8080*.
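**Note**: For reference, a minimal MacOSX session might look like the following (a sketch only; it assumes you start from the repository root and have already set `KAFKAHOME` in `scripts/startKafkaServer.sh` and your Twitter credentials in `src/storm/pom.xml`):

    bash scripts/systemStartMac.sh
    cd src/storm
    mvn install
    mvn exec:java
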
39 | 40 | **Note**: In the CUDA setup, the section where you link cuda to cuda-7.5 is outdated. 41 | 42 | Instead of following this step: 43 | 44 | export CUDA_HOME=/usr/local/cuda-7.5 45 | 46 | Make sure you are using and linking *CUDA v8.0*: 47 | 48 | export CUDA_HOME=/usr/local/cuda-8.0 49 | 50 | **Note**: You will need to register for Twitter Developer credentials to run the data miner. 51 | -------------------------------------------------------------------------------- /auto/data/ag_news_csv/create_vali.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import expanduser 3 | import random 4 | import shutil 5 | from random import shuffle 6 | #Step 1: list all folders under the training data set: 7 | data_path = "~/tweetnet/automatedMTL/data/ag_news_csv" 8 | all_classes = os.listdir(expanduser(data_path + "/Train")) 9 | 10 | for c in all_classes: 11 | if c[0] != ".": 12 | print c 13 | files = os.listdir(expanduser(data_path+"/Train/"+c)) 14 | shuffle(files) 15 | if "Sports" in c: 16 | for f in files[0:4568]: 17 | shutil.move(expanduser(data_path+"/Train/"+c+"/"+f), expanduser(data_path+"/Validation/"+c+"/"+f)) 18 | else: 19 | for f in files[0:4569]: 20 | shutil.move(expanduser(data_path+"/Train/"+c+"/"+f), expanduser(data_path+"/Validation/"+c+"/"+f)) 21 | 22 | 23 | -------------------------------------------------------------------------------- /auto/data/ag_news_csv/load_batch.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | #from reformat import reformat_data 7 | from load_util import class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | else: 27 | identifiers = test_file 28 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 29 | 30 | batch_text = [] 31 | for idx in batch_identifiers: 32 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 33 | batch_text.append(text.read()) 34 | 35 | return batch_identifiers, batch_text 36 | 37 | def load_data(data_path): 38 | 39 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 40 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 41 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 42 | return all_classes, train_file, test_file 43 | 44 | def get_word2vec(data_path): 45 | # TO DO: download word2vec!
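# Added note: word2vec_dict.pkl is assumed here to be a plain dict mapping lowercase tokens
# to 300-dimensional numpy vectors, with extra entries for "UNK", "EOS" and "_" (the
# missing-word placeholder); the encode_* functions below rely on those keys.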
46 | word2vec_dic = pickle.load(open(expanduser(data_path))) 47 | return word2vec_dic 48 | 49 | # Unknown symbols are UNK 50 | # Missing word symbols are zeros 51 | # EOS are EOS 52 | 53 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 54 | sequence_by_word = sequence.split(" ") 55 | encoded_seq = np.zeros((max_len, encode_dim)) 56 | for i in range(len(sequence_by_word)): 57 | word = sequence_by_word[i] 58 | if word2vec_dic.get(word) == None: 59 | encoded_seq[i, :] = word2vec_dic["UNK"] 60 | else: 61 | if word != "REMOVE": 62 | encoded_seq[i, :] = word2vec_dic[word] 63 | else: 64 | encoded_seq[i, :] = word2vec_dic["_"] 65 | return encoded_seq, len(sequence_by_word) 66 | 67 | def encode_sequence_generation(word2vec_dic, sequence, encode_dim, max_len): 68 | sequence_by_word = sequence.split(" ") 69 | encoded_seq = np.zeros((max_len, encode_dim)) 70 | for i in range(1, len(sequence_by_word)): 71 | word = sequence_by_word[i] 72 | if word2vec_dic.get(word) == None: 73 | encoded_seq[i-1, :] = word2vec_dic["UNK"] 74 | else: 75 | encoded_seq[i-1, :] = word2vec_dic[word] 76 | encoded_seq[len(sequence_by_word)-1, :] = word2vec_dic["EOS"] 77 | context_target = sequence_by_word[1:len(sequence_by_word)] + ["EOS"] 78 | 79 | return encoded_seq, context_target, len(sequence_by_word) 80 | 81 | def oneHot(nclasses, idx): 82 | one_hot = np.zeros((nclasses)) 83 | one_hot[idx-1] = 1 84 | return one_hot 85 | 86 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size, automated_task): 87 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size) 88 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 89 | batch_classes = np.zeros((batch_size, n_classes)) 90 | batch_context_encoded = np.zeros((batch_size, encode_dim)) 91 | if automated_task == "word generation": batch_context_encoded = np.zeros((batch_size, max_len, encode_dim)) 92 | batch_context = [] 93 | batch_length = [] 94 | for i in range(batch_size): 95 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 96 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 97 | if automated_task != "word generation": 98 | batch_context_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 99 | batch_context.append(missing_word_dic[batch_identifiers[i]]) 100 | else: 101 | batch_context_encoded[i, :, :], context_target, text_length = encode_sequence_generation(word2vec_dic, batch_text[i], encode_dim, max_len) 102 | batch_context.append(context_target) 103 | batch_length.append(text_length) 104 | return encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifiers, batch_text, batch_length 105 | 106 | if __name__ == "__main__": 107 | data_path = "~/tweetnet/automatedMTL/data/ag_news_csv" 108 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 109 | n_classes, n_data, n_data_per_class, n_train_data, n_test_data, max_length = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['n_train_data'], data_stats['n_test_data'], data_stats['max_length'] 110 | print n_classes, n_data, n_data_per_class 111 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 112 | for epoch in range(3): 113 | dic = {} 114 | all_classes, train_file, test_file = load_data(data_path) 115 | start_idx = 0 116 | for 
minibatch in range(3): 117 | encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, {}, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 1, automated_task="word generation") 118 | start_idx += 1 119 | print batch_text 120 | print batch_classes 121 | print batch_context 122 | print encoded_batch.shape 123 | print batch_context_encoded.shape 124 | for i in batch_identifier: 125 | if dic.get(i) != None: print "Wrong" 126 | else: dic[i] = 1 127 | -------------------------------------------------------------------------------- /auto/data/ag_news_csv/load_util.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import os 3 | from os.path import expanduser 4 | from os.path import basename 5 | 6 | def class_look_up(data_path): 7 | out_train = open(expanduser(data_path+"/train_classes.txt"), "w") 8 | out_test = open(expanduser(data_path+"/test_classes.txt"), "w") 9 | train_folders = os.listdir(expanduser(data_path+"/Train/")) 10 | test_folders = os.listdir(expanduser(data_path+"/Test/")) 11 | if "rotten_tomato" not in data_path: 12 | validation_folders = os.listdir(expanduser(data_path+"/Validation/")) 13 | out_val = open(expanduser(data_path+"/validation_classes.txt"), "w") 14 | 15 | dict = {"World": 0, "Sports": 1, "Business":2, "Sci_Tech":3} 16 | cnt = 0 17 | file2class_dict = {} 18 | 19 | for i in train_folders: 20 | if i[0] != '.': 21 | #if dict.get(i) == None: 22 | # dict[i] = cnt 23 | # cnt += 1 24 | files = os.listdir(expanduser(data_path+"/Train/"+i)) 25 | for f in files: 26 | if f[0] == ".": continue 27 | out_train.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 28 | out_train.write("\n") 29 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 30 | 31 | for i in test_folders: 32 | if i[0] != '.': 33 | files = os.listdir(expanduser(data_path+"/Test/"+i)) 34 | for f in files: 35 | if f[0] == ".": continue 36 | out_test.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 37 | out_test.write("\n") 38 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 39 | 40 | 41 | if "rotten_tomato" not in data_path: 42 | for i in validation_folders: 43 | if i[0] != '.': 44 | files = os.listdir(expanduser(data_path+"/Validation/"+i)) 45 | for f in files: 46 | if f[0] == ".": continue 47 | out_val.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 48 | out_val.write("\n") 49 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 50 | out_val.close() 51 | out_train.close() 52 | out_test.close() 53 | pickle.dump(file2class_dict, open(expanduser(data_path+"/classes.pkl"), "w")) 54 | #print file2class_dict 55 | #print len(file2class_dict) 56 | 57 | if __name__ == "__main__": 58 | #class_look_up("~/automatedMTL/data/rotten_tomato") 59 | class_look_up("~/tweetnet/automatedMTL/data/ag_news_csv") 60 | -------------------------------------------------------------------------------- /auto/data/ag_news_csv/reformat.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import csv 3 | class_dict = {"1": "World", "2": "Sports", "3": "Business", "4": "Sci_Tech"} 4 | file_cnt = 0 5 | with open(expanduser("~/automatedMTL/data/ag_news_csv/train.csv")) as f: 6 | reader = csv.reader(f) 7 | for row in reader: 8 | class_ = row[0] 9 | content = " ".join(row[1:len(row)]) 10 | with 
open(expanduser("~/automatedMTL/data/ag_news_csv/Train_raw/"+class_dict[class_]+"/"+str(file_cnt)+".txt"), "w") as f: 11 | f.write(content) 12 | f.close() 13 | file_cnt += 1 14 | 15 | print file_cnt 16 | with open(expanduser("~/automatedMTL/data/ag_news_csv/test.csv")) as f: 17 | reader = csv.reader(f) 18 | for row in reader: 19 | class_ = row[0] 20 | content = " ".join(row[1:len(row)]) 21 | with open(expanduser("~/automatedMTL/data/ag_news_csv/Test_raw/"+class_dict[class_]+"/"+str(file_cnt)+".txt"), "w") as f: 22 | f.write(content) 23 | f.close() 24 | file_cnt += 1 25 | print file_cnt 26 | -------------------------------------------------------------------------------- /auto/data/ag_news_csv/stats.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cPickle as pickle 3 | from os.path import expanduser 4 | 5 | def dedup(data_path): 6 | classes = os.listdir(expanduser(data_path+"/Test_raw/")) 7 | n_data_per_class = {} 8 | n_train_data = 0 9 | n_test_data = 0 10 | file_cnt = 0 11 | for c in classes: 12 | print c 13 | if c[0] != ".": 14 | test_files = os.listdir(expanduser(data_path+"/Test_raw/"+c)) 15 | train_files = os.listdir(expanduser(data_path+"/Train_raw/"+c)) 16 | for t in test_files: 17 | if t[0] != ".": 18 | print file_cnt 19 | f = open(expanduser(data_path+"/Test_raw/"+c+"/"+t), "r") 20 | txt = f.read().lower() 21 | words = txt.split(" ") 22 | chrs = list(" ".join(words)) 23 | for i in range(len(chrs)): 24 | if ((ord(chrs[i]) < ord('a') or ord(chrs[i]) > ord('z'))) and chrs[i] != "'": 25 | chrs[i] = " " 26 | remove_long_txt = "".join(chrs) 27 | words = remove_long_txt.split() 28 | words.append("EOS") 29 | remove_long_txt = " ".join(words) 30 | with open(expanduser(data_path+"/Test/"+c+"/"+str(file_cnt)+".txt"), "w") as f: 31 | f.write(remove_long_txt) 32 | f.close() 33 | file_cnt += 1 34 | for t in train_files: 35 | if t[0] != ".": 36 | print file_cnt 37 | f = open(expanduser(data_path+"/Train_raw/"+c+"/"+t), "r") 38 | txt = f.read().lower() 39 | words = txt.split(" ") 40 | chrs = list(" ".join(words)) 41 | for i in range(len(chrs)): 42 | if ((ord(chrs[i]) < ord('a') or ord(chrs[i]) > ord('z'))) and chrs[i] != "'": 43 | chrs[i] = " " 44 | remove_long_txt = "".join(chrs) 45 | words = remove_long_txt.split() 46 | words.append("EOS") 47 | remove_long_txt = " ".join(words) 48 | with open(expanduser(data_path+"/Train/"+c+"/"+str(file_cnt)+".txt"), "w") as f: 49 | f.write(remove_long_txt) 50 | f.close() 51 | file_cnt += 1 52 | def stats(data_path): 53 | classes = os.listdir(expanduser(data_path+"/Test/")) 54 | n_data_per_class = {} 55 | n_train_data = 0 56 | n_test_data = 0 57 | length = [] 58 | for c in classes: 59 | if c[0] != ".": 60 | test_files = os.listdir(expanduser(data_path+"/Test/"+c)) 61 | train_files = os.listdir(expanduser(data_path+"/Train/"+c)) 62 | all_files = test_files + train_files 63 | for t in test_files: 64 | if t[0] != ".": 65 | if n_data_per_class.get(c) == None: 66 | n_data_per_class[c] = 1 67 | else: 68 | n_data_per_class[c] += 1 69 | n_test_data += 1 70 | with open(expanduser(data_path+"/Test/"+c+"/"+t), "r") as f: 71 | txt = f.read() 72 | words = txt.split() 73 | length.append(len(words)) 74 | f.close() 75 | for t in train_files: 76 | if t[0] != ".": 77 | if n_data_per_class.get(c) == None: 78 | n_data_per_class[c] = 1 79 | else: 80 | n_data_per_class[c] += 1 81 | n_train_data += 1 82 | with open(expanduser(data_path+"/Train/"+c+"/"+t), "r") as f: 83 | txt = f.read() 84 | words = txt.split() 85 | 
length.append(len(words)) 86 | f.close() 87 | length = sorted(length) 88 | print "Number of classes: ", len(n_data_per_class) 89 | print "Numbe of data per class: ", n_data_per_class 90 | print "Number of train data: ", n_train_data 91 | print "Number of test data: ", n_test_data 92 | print "Longest sequence: ", length[-1] 93 | print "Shortest sequence: ", length[0] 94 | print "Average sequence: ", sum(length) * 1.0 / len(length) 95 | print length[len(length)-200:len(length)] 96 | all_data = 0 97 | for i in n_data_per_class.keys(): 98 | all_data += n_data_per_class[i] 99 | print all_data 100 | data_stats={} 101 | data_stats['n_classes'] = len(n_data_per_class) 102 | data_stats['n_data'] = n_train_data + n_test_data 103 | data_stats['n_data_per_class'] = n_data_per_class 104 | data_stats['n_train_data'] = n_train_data 105 | data_stats['n_test_data'] = n_test_data 106 | data_stats['max_length'] = length[-1] 107 | print data_stats 108 | pickle.dump(data_stats, open(expanduser(data_path+"/stats.pkl"),"w")) 109 | if __name__ == "__main__": 110 | stats("~/automatedMTL/data/ag_news_csv") 111 | -------------------------------------------------------------------------------- /auto/src/model/mcrnn_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","util"))) 9 | from tf_utils import fcLayer, createLSTMCell, applyActivation, predictionLayer 10 | #from predContext import predContext, createHtDict 11 | 12 | class model(object): 13 | 14 | # Model params 15 | # 0 -- shared; 1 -- context; 2 -- task 16 | fc_activation = "tanh" 17 | output_activation = "tanh" 18 | dropout = 0.0 19 | body_lstm_size = 128 20 | context_lstm_size = 128 21 | task_lstm_size = 128 22 | body_n_layer = 1 23 | context_n_layer = 1 24 | task_n_layer = 1 25 | context_branch_fc = 512 26 | task_branch_fc = 512 27 | 28 | # Data params 29 | batch_size = 128 30 | max_length = 52 31 | feature_length = 300 32 | context_dim = 300 33 | task_dim = 2 34 | 35 | # Hyper- params 36 | lr = 0.001 37 | context_lr = lr 38 | n_epoch = 500 39 | topN = 4 40 | keep_prob_val = 1.0 41 | 42 | def buildModel(self, x, y_context, y_task, is_train, dropout, scope="multiTask"): 43 | 44 | # Assume the input shape is (batch_size, max_length, feature_length) 45 | 46 | #TASK = primary task, CONTEXT = secondary task 47 | 48 | # Create lstm cell for the shared layer 49 | body_lstm_cell, _ = createLSTMCell(self.batch_size, self.body_lstm_size, self.body_n_layer, forget_bias=0.0) 50 | # Create lstm cell for branch 1 51 | context_lstm_cell, _ = createLSTMCell(self.batch_size, self.context_lstm_size, self.context_n_layer, forget_bias=0.0) 52 | # Create lstm cells for branch 2 53 | task_lstm_cell, _ = createLSTMCell(self.batch_size, self.task_lstm_size, self.task_n_layer, forget_bias=0.0) 54 | 55 | context_cost = tf.constant(0) 56 | task_cost = tf.constant(0) 57 | 58 | with tf.variable_scope("shared_lstm"): 59 | body_cell_output, last_body_state = tf.nn.dynamic_rnn(cell = body_lstm_cell, dtype=tf.float32, sequence_length=self.length(x), inputs=x) 60 | 61 | with tf.variable_scope("context_branch"): 62 | context_cell_output, last_context_state = tf.nn.dynamic_rnn(cell = context_lstm_cell, dtype=tf.float32, sequence_length=self.length(body_cell_output), inputs=body_cell_output) 63 | 64 | # The output from LSTMs will 
be (batch_size, max_length, out_size) 65 | with tf.variable_scope("context_fc"): 66 | # Select the last output that is not generated by zero vectors 67 | last_context_output = self.last_relevant(context_cell_output, self.length(context_cell_output)) 68 | # feed the last output to the fc layer and make prediction 69 | context_fc_out = fcLayer(x=last_context_output, in_shape=self.context_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=self.dropout, is_train=is_train, scope="fc1") 70 | context_cost, context_output = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, out_shape=y_context.get_shape()[-1].value, activation=self.output_activation) 71 | 72 | with tf.variable_scope("task_branch"): 73 | task_cell_output, last_task_state = tf.nn.dynamic_rnn(cell = task_lstm_cell, dtype=tf.float32, sequence_length=self.length(body_cell_output), inputs=body_cell_output) 74 | 75 | with tf.variable_scope("task_fc"): 76 | # Select the last output that is not generated by zero vectors 77 | last_task_output = self.last_relevant(task_cell_output, self.length(task_cell_output)) 78 | # feed the last output to the fc layer and make prediction 79 | task_fc_out = fcLayer(x=last_task_output, in_shape=self.task_lstm_size, out_shape=self.task_branch_fc, activation=self.fc_activation, dropout=self.dropout, is_train=is_train, scope="fc2") 80 | task_cost, task_output = predictionLayer(x=task_fc_out, y=y_task, in_shape=self.context_branch_fc, out_shape=y_task.get_shape()[-1].value, activation=self.output_activation) 81 | 82 | return context_cost, task_cost, task_output, context_output 83 | 84 | # Flatten the output tensor to shape features in all examples x output size 85 | # construct an index into that by creating a tensor with the start indices for each example tf.range(0, batch_size) x max_length 86 | # and add the individual sequence lengths to it 87 | # tf.gather() then performs the acutal indexing. 
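# Illustration (added comment, not part of the original code): with batch_size=2,
# max_length=3 and sequence lengths [2, 3], the flat indices are 0*3 + (2-1) = 1 and
# 1*3 + (3-1) = 5, i.e. the last non-padded timestep of each example in the flattened
# (batch_size*max_length, out_size) tensor.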
88 | def last_relevant(self, output, length): 89 | index = tf.range(0, self.batch_size) * self.max_length + (length - 1) 90 | out_size = int(output.get_shape()[2]) 91 | flat = tf.reshape(output, [-1, out_size]) 92 | relevant = tf.gather(flat, index) 93 | return relevant 94 | 95 | # Assume that the sequences are padded with 0 vectors to have shape (batch_size, max_length, feature_length) 96 | 97 | def length(self, sequence): 98 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 99 | length = tf.reduce_sum(used, reduction_indices=1) 100 | length = tf.cast(length, tf.int32) 101 | print length.get_shape() 102 | return length 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /auto/src/model/mcrnn_model_1_lstm.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","util"))) 9 | from tf_utils import fcLayer, createLSTMCell, createGRUCell, applyActivation, predictionLayer, compute_cost 10 | #from predContext import predContext, createHtDict 11 | 12 | class model(object): 13 | 14 | # Task params 15 | is_multi_task = True 16 | secondary_task = "word generation" 17 | primary_task = "classification" 18 | 19 | # Model params 20 | # 0 -- shared; 1 -- context; 2 -- task 21 | fc_activation = "tanh" 22 | context_output_activation = "tanh" 23 | task_output_activation = "softmax" 24 | body_lstm_size = 1024 25 | body_n_layer = 1 26 | context_n_layer = 1 27 | task_n_layer = 1 28 | context_branch_fc = 512 29 | task_branch_fc = 30 30 | 31 | # Data params 32 | n_classes = 2 33 | batch_size = 64 34 | max_length = 52 35 | feature_length = 300 36 | context_dim = 300 37 | task_dim = n_classes 38 | 39 | # Hyper- params 40 | lr = 0.0001 #hp 41 | lr_mod = 1.0 #hp 42 | context_lr = lr_mod*lr 43 | n_epoch = 50 #hp 44 | 45 | def buildModel(self, x, y_context, y_task, is_train, dropout, scope="multiTask"): 46 | 47 | # Assume the input shape is (batch_size, max_length, feature_length) 48 | 49 | #TASK = primary task, CONTEXT = secondary task 50 | 51 | # Create lstm cell for the shared layer 52 | body_lstm_cell, _ = createLSTMCell(self.batch_size, self.body_lstm_size, self.body_n_layer, forget_bias=0.0) 53 | 54 | context_cost = tf.constant(0) 55 | task_cost = tf.constant(0.0, dtype=tf.float32) 56 | 57 | if not self.is_multi_task: context_output = tf.constant(0) 58 | 59 | with tf.variable_scope("shared_lstm"): 60 | body_cell_output, last_body_state = tf.nn.dynamic_rnn(cell = body_lstm_cell, dtype=tf.float32, sequence_length=self.length(x), inputs=x) 61 | 62 | if self.is_multi_task: 63 | with tf.variable_scope("context_branch"): 64 | # Select the last output that is not generated by zero vectors 65 | if self.secondary_task == "missing word": 66 | last_body_output = self.last_relevant(body_cell_output, self.length(body_cell_output)) 67 | # feed the last output to the fc layer and make prediction 68 | with tf.variable_scope("context_fc"): 69 | context_fc_out = fcLayer(x=last_body_output, in_shape=self.body_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, scope="fc1") 70 | with tf.variable_scope("context_pred"): 71 | context_output, context_logits = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, 
out_shape=y_context.get_shape()[-1].value, activation=self.context_output_activation) 72 | context_cost = compute_cost(logit=context_logits, y=y_context, out_type="last_only", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.feature_length, activation=self.context_output_activation) 73 | 74 | if self.secondary_task == "word generation": 75 | context_input = tf.transpose(body_cell_output, [1, 0, 2]) 76 | context_input = tf.reshape(context_input, [-1, self.body_lstm_size]) 77 | context_input_list = tf.split(context_input, self.max_length, 0) 78 | fc_output_list = [] 79 | with tf.variable_scope("context_fc"): 80 | for step in range(self.max_length): 81 | if step > 0: tf.get_variable_scope().reuse_variables() 82 | fc_out = fcLayer(x=context_input_list[step], in_shape=self.body_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, scope="fc1") 83 | fc_output_list.append(tf.expand_dims(fc_out, axis=1)) 84 | context_fc_out = tf.concat(fc_output_list, axis=1) 85 | with tf.variable_scope("context_pred"): 86 | context_output, context_logits = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, out_shape=y_context.get_shape()[-1].value, activation=self.context_output_activation) 87 | context_cost = compute_cost(logit=context_logits, y=y_context, out_type="sequential", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.feature_length,activation=self.context_output_activation) 88 | 89 | 90 | print "Context cost shape: ", context_cost.get_shape() 91 | 92 | with tf.variable_scope("task_branch"): 93 | with tf.variable_scope("task_fc"): 94 | # Select the last output that is not generated by zero vectors 95 | last_body_output = self.last_relevant(body_cell_output, self.length(body_cell_output)) 96 | # feed the last output to the fc layer and make prediction 97 | task_fc_out = fcLayer(x=last_body_output, in_shape=self.body_lstm_size, out_shape=self.task_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, scope="fc2") 98 | task_output, task_logits = predictionLayer(x=task_fc_out, y=y_task, in_shape=self.task_branch_fc, out_shape=y_task.get_shape()[-1].value, activation=self.task_output_activation) 99 | print "Task output shape: ", task_output.get_shape() 100 | task_cost = compute_cost(logit=task_logits, y=y_task, out_type="last_only", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.n_classes,activation=self.task_output_activation) 101 | 102 | return context_cost, task_cost, task_output, context_output 103 | 104 | # Flatten the output tensor to shape features in all examples x output size 105 | # construct an index into that by creating a tensor with the start indices for each example tf.range(0, batch_size) x max_length 106 | # and add the individual sequence lengths to it 107 | # tf.gather() then performs the acutal indexing. 
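# Added note: length() further down infers each example's true length from the zero
# padding; tf.sign(tf.reduce_max(tf.abs(x), 2)) is 1 for any timestep whose feature
# vector is non-zero and 0 for padded timesteps, so summing over time gives the
# unpadded sequence length.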
108 | def last_relevant(self, output, length): 109 | index = tf.range(0, self.batch_size) * self.max_length + (length - 1) 110 | out_size = int(output.get_shape()[2]) 111 | flat = tf.reshape(output, [-1, out_size]) 112 | relevant = tf.gather(flat, index) 113 | return relevant 114 | 115 | # Assume that the sequences are padded with 0 vectors to have shape (batch_size, max_length, feature_length) 116 | 117 | def length(self, sequence): 118 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 119 | length = tf.reduce_sum(used, reduction_indices=1) 120 | length = tf.cast(length, tf.int32) 121 | print length.get_shape() 122 | return length 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /auto/src/model/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | import mcrnn_model 8 | from mcrnn_model import model 9 | 10 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 11 | from tf_utils import fcLayer, createLSTMCell, applyActivation, predictionLayer 12 | from load_batch import get_file_identifiers, get_classes, load_data, get_word2vec, load_batch 13 | 14 | def get_data(data_path): 15 | data_stats = pickle.load(open(expanduser(data_path + "/rt_stats.pkl"))) 16 | max_length, nPos, nNeg, trainPercent, testPercent = data_stats["longest"], data_stats[0], data_stats[1], data_stats['trainPercent'], data_stats['testPercent'] 17 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 18 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 19 | nTest = int(testPercent*nPos) + int(testPercent*nNeg) 20 | nTrain = nPos + nNeg - nTest 21 | 22 | return max_length, nPos, nNeg, trainPercent, testPercent, word2vec_dic, missing_word_dic, nTest, nTrain 23 | 24 | 25 | def trainModel(): 26 | 27 | M = model() 28 | data_path = "~/automatedMTL/data/rotten_tomato" 29 | max_length, nPos, nNeg, trainPercent, testPercent, word2vec_dic, missing_word_dic, nTest, nTrain = get_data(data_path) 30 | 31 | x = tf.placeholder(tf.float32, shape=(None, M.max_length, M.feature_length)) 32 | y_context = tf.placeholder(tf.float32, shape=(None, M.context_dim)) 33 | y_task = tf.placeholder(tf.float32, shape=(None, M.task_dim)) 34 | 35 | optimizer1 = tf.train.AdamOptimizer(learning_rate=M.context_lr) 36 | optimizer2 = tf.train.AdamOptimizer(learning_rate=M.lr) 37 | is_train = tf.placeholder(tf.int32) 38 | n_train_batches = np.ceil(nTrain / M.batch_size).astype(int) 39 | keep_prob = tf.placeholder(tf.float32) 40 | 41 | context_cost, task_cost, task_output, context_output = M.buildModel(x, y_context, y_task, is_train, keep_prob) 42 | train_step1 = optimizer1.minimize(context_cost) 43 | train_step2 = optimizer2.minimize(task_cost) 44 | 45 | # Start running operations on the graph 46 | sess = tf.Session() 47 | sess.run(tf.initialize_all_variables()) 48 | 49 | with sess.as_default(): 50 | for epoch in range(100): 51 | taskCost = 0 52 | contextCost = 0 53 | 54 | all_classes, train_file, test_file = load_data(data_path) 55 | start_idx = 0 56 | for minibatch in range(n_train_batches): 57 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(word2vec_dic, missing_word_dic, M.feature_length, max_length, 
data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, M.batch_size) 58 | start_idx += M.batch_size 59 | 60 | feed_dict = {x: encoded_batch, y_context: batch_missing_word_encoded, y_task: batch_classes, is_train:1, keep_prob:0.5} 61 | 62 | train_step1.run(feed_dict=feed_dict) 63 | context_cost_val, _, _ = sess.run(fetches = [context_cost, task_cost, task_output], feed_dict=feed_dict) 64 | contextCost += context_cost_val 65 | 66 | train_step2.run(feed_dict=feed_dict) 67 | _, task_cost_val, _ = sess.run(fetches = [context_cost, task_cost, task_output], feed_dict=feed_dict) 68 | taskCost += task_cost_val 69 | 70 | #if minibatch !=0 and minibatch % 100 == 0: 71 | print "Minibatch ", minibatch, " Missing Word: ", contextCost , " Classification: ", taskCost 72 | contextCost = 0 73 | taskCost = 0 74 | 75 | start_idx = 0 76 | accuracy = 0 77 | 78 | for i in range(nTest): 79 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(word2vec_dic, missing_word_dic, M.feature_length, max_length, data_path+"/Test/", 0, train_file, test_file, all_classes, start_idx, 1) 80 | start_idx += 1 81 | feed_dict = {x:encoded_batch, y_context: batch_missing_word_encoded, y_task: batch_classes, is_train:0, keep_prob:0.5} 82 | task_output_val = sess.run(fetches = [task_output], feed_dict=feed_dict) 83 | accuracy += is_correct(batch_classes, task_output_val) 84 | print "The accuracy in epoch ", epoch, " is: ", accuracy * 1.0 / nTest 85 | 86 | def is_correct(target, output): 87 | prediction = np.argmax(output) 88 | target = np.argmax(target) 89 | #print prediction, target 90 | return prediction == target 91 | 92 | 93 | if __name__ == "__main__": 94 | trainModel() 95 | -------------------------------------------------------------------------------- /auto/src/util/1q: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_1_lstm import model 7 | from train_1_lstm import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | LR = [0.001] 11 | LR_MOD = [1.0] #4 12 | N_EPOCHS = [50] # 30 13 | N_EXPERIMENTS = [1] # 5 14 | KEEP_PROB_VAL = [1.0] 15 | CONTEXT_FC = [30] #1024 on AWS 16 | #3*3*3*30*5/60=67.5 hrs. 17 | experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. 
Learning rates: " 18 | for lr in LR: 19 | experiment = experiment + str(lr) + ", " 20 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 21 | for prob in KEEP_PROB_VAL: 22 | experiment = experiment + str(prob) + ", " 23 | experiment = experiment + " Context_fc: " 24 | for fc in CONTEXT_FC: 25 | experiment = experiment + str(fc) + ", " 26 | 27 | 28 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 29 | M= model() 30 | 31 | print M.is_multi_task 32 | if lr_mod == 0.0: 33 | M.is_multi_task = False 34 | else: 35 | M.is_multi_task = True 36 | print M.is_multi_task 37 | 38 | print M.lr 39 | M.lr = lr 40 | print M.lr 41 | 42 | print M.lr_mod 43 | M.lr_mod = lr_mod 44 | print M.lr_mod 45 | 46 | print M.n_epoch 47 | M.n_epoch = n_epoch 48 | print M.n_epoch 49 | 50 | print M.context_branch_fc 51 | M.context_branch_fc = context_fc 52 | print M.context_branch_fc 53 | 54 | maxAccList = []; 55 | for i in range(n_experiments): 56 | accuracyVec = TM(M, keep_prob_val)#INSERT CODE TO run for n epochs 57 | maxAcc = numpy.max(accuracyVec) 58 | maxAccList.append(maxAcc) 59 | expVal = numpy.mean(maxAccList) 60 | string_result = "lr = " + str(lr) + " lr_mod = "+ "self-annealing" + " avg_acc = " + str(expVal)+'\n' 61 | f1.write("") 62 | f1.write(string_result) 63 | f1.flush() 64 | print string_result 65 | 66 | 67 | 68 | f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 69 | f1.write(experiment) 70 | f1.write("\n") 71 | f1.flush() 72 | for lr in LR: 73 | for lr_mod in LR_MOD: 74 | for n_epoch in N_EPOCHS: 75 | for n_experiments in N_EXPERIMENTS: 76 | for keep_prob_val in KEEP_PROB_VAL: 77 | for context_fc in CONTEXT_FC: 78 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 79 | f1.close() 80 | -------------------------------------------------------------------------------- /auto/src/util/hps.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_1_lstm import model 7 | from train_1_lstm import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | LR = [0.01] 11 | LR_MOD = [1.0] #4 12 | N_EPOCHS = [30] # 30 13 | N_EXPERIMENTS = [10] # 5 14 | KEEP_PROB_VAL = [1.0] 15 | CONTEXT_FC = [30] #1024 on AWS 16 | #3*3*3*30*5/60=67.5 hrs. 17 | experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. 
Learning rates: " 18 | for lr in LR: 19 | experiment = experiment + str(lr) + ", " 20 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 21 | for prob in KEEP_PROB_VAL: 22 | experiment = experiment + str(prob) + ", " 23 | experiment = experiment + " Context_fc: " 24 | for fc in CONTEXT_FC: 25 | experiment = experiment + str(fc) + ", " 26 | 27 | epoch_ratio_list = [(0.1, 1.0), (0.5, 0.5), (1.0, 0.0)] 28 | 29 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 30 | M= model() 31 | 32 | print M.is_multi_task 33 | if lr_mod == 0.0: 34 | M.is_multi_task = False 35 | else: 36 | M.is_multi_task = True 37 | print M.is_multi_task 38 | 39 | print M.lr 40 | M.lr = lr 41 | print M.lr 42 | 43 | print M.lr_mod 44 | M.lr_mod = lr_mod 45 | print M.lr_mod 46 | 47 | print M.n_epoch 48 | M.n_epoch = n_epoch 49 | print M.n_epoch 50 | 51 | print M.context_branch_fc 52 | M.context_branch_fc = context_fc 53 | print M.context_branch_fc 54 | 55 | maxAccList = []; 56 | for i in range(n_experiments): 57 | accuracyVec = TM(M, keep_prob_val, epoch_ratio_list)#INSERT CODE TO run for n epochs 58 | maxAcc = numpy.max(accuracyVec) 59 | maxAccList.append(maxAcc) 60 | expVal = numpy.mean(maxAccList) 61 | string_result = "lr = " + str(lr) + " lr_mod = "+ "self-annealing" + " avg_acc = " + str(expVal)+'\n' 62 | f1.write("") 63 | f1.write(string_result) 64 | f1.flush() 65 | print string_result 66 | 67 | 68 | 69 | f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 70 | f1.write(experiment) 71 | f1.write("\n") 72 | f1.flush() 73 | for lr in LR: 74 | for lr_mod in LR_MOD: 75 | for n_epoch in N_EPOCHS: 76 | for n_experiments in N_EXPERIMENTS: 77 | for keep_prob_val in KEEP_PROB_VAL: 78 | for context_fc in CONTEXT_FC: 79 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 80 | f1.close() 81 | -------------------------------------------------------------------------------- /auto/src/util/hps2.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_gen2 import model 7 | from train_gen2 import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | 11 | LR = [0.001] 12 | LR_MOD = [0.0,1.0] #4 13 | N_EPOCHS = [30] # 30 14 | N_EXPERIMENTS = [5] # 5 15 | KEEP_PROB_VAL = [1.0] 16 | CONTEXT_FC = [128] 17 | #3*3*3*30*5/60=67.5 hrs. 18 | #experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. Learning rates: " 19 | 20 | dataset = "ag_news" # or "rotten_tomato" 21 | experiment = "N_epochs = 50. N_exp = 10. lstm: 512 for both. hidden fc: 512 for both. dropout: none." 
22 | 23 | for lr in LR: 24 | experiment = experiment + str(lr) + ", " 25 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 26 | for prob in KEEP_PROB_VAL: 27 | experiment = experiment + str(prob) + ", " 28 | experiment = experiment + " Context_fc: " 29 | for fc in CONTEXT_FC: 30 | experiment = experiment + str(fc) + ", " 31 | 32 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 33 | M= model() 34 | 35 | print M.dataset 36 | M.dataset = dataset 37 | print M.dataset 38 | 39 | print M.is_multi_task 40 | if lr_mod == 0.0: 41 | M.is_multi_task = False 42 | else: 43 | M.is_multi_task = True 44 | print M.is_multi_task 45 | 46 | print M.lr 47 | M.lr = lr 48 | print M.lr 49 | 50 | print M.lr_mod 51 | M.lr_mod = lr_mod 52 | print M.lr_mod 53 | 54 | print M.n_epoch 55 | M.n_epoch = n_epoch 56 | print M.n_epoch 57 | 58 | print M.context_branch_fc 59 | M.context_branch_fc = context_fc 60 | print M.context_branch_fc 61 | 62 | maxAccList = [] 63 | testResult = [] 64 | for i in range(n_experiments): 65 | accuracyVec, testAcc = TM(M)#INSERT CODE TO run for n epochs 66 | maxAcc = numpy.max(accuracyVec) 67 | maxAccList.append(maxAcc) 68 | maxIdx = numpy.argmax(accuracyVec) 69 | testResult.append(testAcc[maxIdx]) 70 | 71 | expVal = numpy.mean(maxAccList) 72 | testVal = numpy.mean(testResult) 73 | if lr_mod == 0.0: 74 | string_result = "lr = " + str(lr) + " lr_mod = "+ "none (lstm)" + " avg_val_acc = " + str(expVal) + " avg_test_acc = " + str(testVal) + '\n' 75 | else: 76 | string_result = "lr = " + str(lr) + " lr_mod = "+ "annealing" + " avg_val_acc = " + str(expVal) + " avg_test_acc = " + str(testVal) + '\n' 77 | 78 | f1.write("") 79 | f1.write(string_result) 80 | f1.flush() 81 | print string_result 82 | 83 | #f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 84 | f1 = open(expanduser("~/tweetnet/logs/hps_mrnn_ag_news.log"), "w+") 85 | f1.write(experiment) 86 | f1.write("\n") 87 | f1.flush() 88 | for lr in LR: 89 | for lr_mod in LR_MOD: 90 | for n_epoch in N_EPOCHS: 91 | for n_experiments in N_EXPERIMENTS: 92 | for keep_prob_val in KEEP_PROB_VAL: 93 | for context_fc in CONTEXT_FC: 94 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 95 | f1.close() 96 | -------------------------------------------------------------------------------- /auto/src/util/hps_script.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/auto/src/util/hps_script.sh -------------------------------------------------------------------------------- /auto/src/util/load_batch.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | from reformat import reformat_data 7 | from load_util import class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | else: 
27 | identifiers = test_file 28 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 29 | 30 | batch_text = [] 31 | for idx in batch_identifiers: 32 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 33 | batch_text.append(text.read()) 34 | 35 | return batch_identifiers, batch_text 36 | 37 | def load_data(data_path): 38 | 39 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 40 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 41 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 42 | return all_classes, train_file, test_file 43 | 44 | def get_word2vec(data_path): 45 | # TO DO: download word2vec! 46 | word2vec_dic = pickle.load(open(expanduser(data_path))) 47 | return word2vec_dic 48 | 49 | # Unknown symbols are UNK 50 | # Missing word symbols are zeros 51 | # EOS are EOS 52 | 53 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 54 | sequence_by_word = sequence.split(" ") 55 | encoded_seq = np.zeros((max_len, encode_dim)) 56 | for i in range(len(sequence_by_word)): 57 | word = sequence_by_word[i] 58 | if word2vec_dic.get(word) == None: 59 | encoded_seq[i, :] = word2vec_dic["UNK"] 60 | continue 61 | else: 62 | if word != "REMOVE": 63 | encoded_seq[i, :] = word2vec_dic[word] 64 | else: 65 | encoded_seq[i, :] = word2vec_dic["_"] 66 | return encoded_seq, len(sequence_by_word) 67 | 68 | def encode_sequence_generation(word2vec_dic, sequence, encode_dim, max_len): 69 | sequence_by_word = sequence.split(" ") 70 | encoded_seq = np.zeros((max_len, encode_dim)) 71 | for i in range(1, len(sequence_by_word)): 72 | word = sequence_by_word[i] 73 | if word2vec_dic.get(word) == None: 74 | encoded_seq[i-1, :] = word2vec_dic["UNK"] 75 | else: 76 | encoded_seq[i-1, :] = word2vec_dic[word] 77 | encoded_seq[len(sequence_by_word)-1, :] = word2vec_dic["EOS"] 78 | context_target = sequence_by_word[1:len(sequence_by_word)] + ["EOS"] 79 | 80 | return encoded_seq, context_target, len(sequence_by_word) 81 | 82 | def oneHot(nclasses, idx): 83 | one_hot = np.zeros((nclasses)) 84 | one_hot[idx] = 1 85 | return one_hot 86 | 87 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size, automated_task): 88 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size) 89 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 90 | batch_classes = np.zeros((batch_size, n_classes)) 91 | batch_context_encoded = np.zeros((batch_size, encode_dim)) 92 | if automated_task == "word generation": batch_context_encoded = np.zeros((batch_size, max_len, encode_dim)) 93 | batch_context = [] 94 | batch_length = [] 95 | for i in range(batch_size): 96 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 97 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 98 | if automated_task != "word generation": 99 | batch_context_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 100 | batch_context.append(missing_word_dic[batch_identifiers[i]]) 101 | else: 102 | batch_context_encoded[i, :, :], context_target, text_length = encode_sequence_generation(word2vec_dic, batch_text[i], encode_dim, max_len) 103 | batch_context.append(context_target) 104 | batch_length.append(text_length) 105 | return encoded_batch, batch_classes, batch_context_encoded, 
batch_context, batch_identifiers, batch_text, batch_length 106 | 107 | if __name__ == "__main__": 108 | data_path = "~/automatedMTL/data/rotten_tomato" 109 | max_length = reformat_data(data_path, False) 110 | class_look_up(data_path) 111 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 112 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['trainPercent'], data_stats['testPercent'] 113 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 114 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 115 | for epoch in range(3): 116 | dic = {} 117 | all_classes, train_file, test_file = load_data(data_path) 118 | start_idx = 0 119 | for minibatch in range(73): 120 | encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, missing_word_dic, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 128, automated_task="word generation") 121 | start_idx += 128 122 | print batch_context 123 | for i in batch_identifier: 124 | if dic.get(i) != None: print "Wrong" 125 | else: dic[i] = 1 126 | -------------------------------------------------------------------------------- /auto/src/util/load_batch2.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | from reformat import reformat_data 7 | from load_util import class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | else: 27 | identifiers = test_file 28 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 29 | 30 | batch_text = [] 31 | for idx in batch_identifiers: 32 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 33 | batch_text.append(text.read()) 34 | 35 | return batch_identifiers, batch_text 36 | 37 | def load_data(data_path): 38 | 39 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 40 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 41 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 42 | return all_classes, train_file, test_file 43 | 44 | def get_word2vec(data_path): 45 | # TO DO: download word2vec! 
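# Added note: as in load_batch.py, this pickle is assumed to map tokens to 300-d vectors
# and to contain the special "UNK" and "_" entries used by encode_sequence() below.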
46 | word2vec_dic = pickle.load(open(expanduser(data_path))) 47 | return word2vec_dic 48 | 49 | # Unknown symbols are UNK 50 | # Missing word symbols are zeros 51 | # EOS are EOS 52 | 53 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 54 | sequence_by_word = sequence.split(" ") 55 | encoded_seq = np.zeros((max_len, encode_dim)) 56 | for i in range(len(sequence_by_word)): 57 | word = sequence_by_word[i] 58 | if word2vec_dic.get(word) == None: 59 | encoded_seq[i, :] = word2vec_dic["UNK"] 60 | else: 61 | if word != "REMOVE": 62 | encoded_seq[i, :] = word2vec_dic[word] 63 | else: 64 | encoded_seq[i, :] = word2vec_dic["_"] 65 | return encoded_seq, len(sequence_by_word) 66 | 67 | def oneHot(nclasses, idx): 68 | one_hot = np.zeros((nclasses)) 69 | one_hot[idx] = 1 70 | return one_hot 71 | 72 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 73 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size) 74 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 75 | batch_classes = np.zeros((batch_size, 2)) 76 | batch_missing_word_encoded = np.zeros((batch_size, encode_dim)) 77 | batch_missing_word = [] 78 | batch_length = [] 79 | for i in range(batch_size): 80 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 81 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 82 | batch_missing_word_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 83 | batch_missing_word.append(missing_word_dic[batch_identifiers[i]]) 84 | batch_length.append(text_length) 85 | return encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifiers, batch_text, batch_length 86 | 87 | if __name__ == "__main__": 88 | data_path = "~/automatedMTL/data/rotten_tomato" 89 | max_length = reformat_data(data_path, False) 90 | class_look_up(data_path) 91 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 92 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['trainPercent'], data_stats['testPercent'] 93 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 94 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 95 | for epoch in range(3): 96 | dic = {} 97 | all_classes, train_file, test_file = load_data(data_path) 98 | start_idx = 0 99 | for minibatch in range(73): 100 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, missing_word_dic, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 128) 101 | start_idx += 128 102 | #print batch_text 103 | #print batch_missing_word 104 | #print batch_length 105 | for i in batch_identifier: 106 | if dic.get(i) != None: print "Wrong" 107 | else: dic[i] = 1 108 | -------------------------------------------------------------------------------- /auto/src/util/load_batch_val.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | #from reformat import reformat_data 7 | from load_util import class_look_up 8 | 
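# Added note: this module mirrors load_batch.py but adds a validation split;
# load_data() also reads validation_classes.txt, and get_text_by_batch()/load_batch()
# take is_val/val_file arguments so batches can be drawn from Train, Validation or Test.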
9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, is_val, train_file, test_file, val_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | elif is_val: 27 | identifiers = val_file 28 | else: 29 | identifiers = test_file 30 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 31 | 32 | batch_text = [] 33 | for idx in batch_identifiers: 34 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 35 | batch_text.append(text.read()) 36 | 37 | return batch_identifiers, batch_text 38 | 39 | def load_data(data_path): 40 | 41 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 42 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 43 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 44 | val_file = get_file_identifiers(data_path + "/validation_classes.txt") 45 | return all_classes, train_file, test_file, val_file 46 | 47 | def get_word2vec(data_path): 48 | # TO DO: download word2vec! 49 | word2vec_dic = pickle.load(open(expanduser(data_path))) 50 | return word2vec_dic 51 | 52 | # Unknown symbols are UNK 53 | # Missing word symbols are zeros 54 | # EOS are EOS 55 | 56 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 57 | sequence_by_word = sequence.split(" ") 58 | encoded_seq = np.zeros((max_len, encode_dim)) 59 | for i in range(len(sequence_by_word)): 60 | word = sequence_by_word[i] 61 | if word2vec_dic.get(word) == None: 62 | encoded_seq[i, :] = word2vec_dic["UNK"] 63 | else: 64 | if word != "REMOVE": 65 | encoded_seq[i, :] = word2vec_dic[word] 66 | else: 67 | encoded_seq[i, :] = word2vec_dic["_"] 68 | return encoded_seq, len(sequence_by_word) 69 | 70 | def encode_sequence_generation(word2vec_dic, sequence, encode_dim, max_len): 71 | sequence_by_word = sequence.split(" ") 72 | encoded_seq = np.zeros((max_len, encode_dim)) 73 | for i in range(1, len(sequence_by_word)): 74 | word = sequence_by_word[i] 75 | if word2vec_dic.get(word) == None: 76 | encoded_seq[i-1, :] = word2vec_dic["UNK"] 77 | else: 78 | encoded_seq[i-1, :] = word2vec_dic[word] 79 | encoded_seq[len(sequence_by_word)-1, :] = word2vec_dic["EOS"] 80 | context_target = sequence_by_word[1:len(sequence_by_word)] + ["EOS"] 81 | 82 | return encoded_seq, context_target, len(sequence_by_word) 83 | 84 | def oneHot(nclasses, idx): 85 | one_hot = np.zeros((nclasses)) 86 | one_hot[idx-1] = 1 87 | return one_hot 88 | 89 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, is_val, train_file, test_file, val_file,all_classes, start_idx, batch_size, automated_task): 90 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train,is_val, train_file, test_file, val_file, all_classes, start_idx, batch_size) 91 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 92 | batch_classes = np.zeros((batch_size, n_classes)) 93 | batch_context_encoded = np.zeros((batch_size, encode_dim)) 94 | if automated_task == "word generation": batch_context_encoded = np.zeros((batch_size, max_len, encode_dim)) 95 | batch_context = [] 96 | batch_length = 
[] 97 | for i in range(batch_size): 98 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 99 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 100 | if automated_task != "word generation": 101 | batch_context_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 102 | batch_context.append(missing_word_dic[batch_identifiers[i]]) 103 | else: 104 | batch_context_encoded[i, :, :], context_target, text_length = encode_sequence_generation(word2vec_dic, batch_text[i], encode_dim, max_len) 105 | batch_context.append(context_target) 106 | batch_length.append(text_length) 107 | return encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifiers, batch_text, batch_length 108 | 109 | if __name__ == "__main__": 110 | data_path = "~/tweetnet/automatedMTL/data/ag_news_csv" 111 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 112 | n_classes, n_data, n_data_per_class, n_train_data, n_test_data, max_length = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['n_train_data'], data_stats['n_test_data'], data_stats['max_length'] 113 | print n_classes, n_data, n_data_per_class 114 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 115 | for epoch in range(3): 116 | dic = {} 117 | all_classes, train_file, test_file, val_file = load_data(data_path) 118 | start_idx = 0 119 | for minibatch in range(3): 120 | encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, {}, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 1, automated_task="word generation") 121 | start_idx += 1 122 | print batch_text 123 | print batch_classes 124 | print batch_context 125 | print encoded_batch.shape 126 | print batch_context_encoded.shape 127 | for i in batch_identifier: 128 | if dic.get(i) != None: print "Wrong" 129 | else: dic[i] = 1 130 | -------------------------------------------------------------------------------- /auto/src/util/load_util.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import os 3 | from os.path import expanduser 4 | from os.path import basename 5 | 6 | def class_look_up(data_path): 7 | out_train = open(expanduser(data_path+"/train_classes.txt"), "w") 8 | out_test = open(expanduser(data_path+"/test_classes.txt"), "w") 9 | train_folders = os.listdir(expanduser(data_path+"/Train/")) 10 | test_folders = os.listdir(expanduser(data_path+"/Test/")) 11 | 12 | dict = {} 13 | cnt = 0 14 | file2class_dict = {} 15 | 16 | for i in train_folders: 17 | if i[0] != '.': 18 | if dict.get(i) == None: 19 | dict[i] = cnt 20 | cnt += 1 21 | files = os.listdir(expanduser(data_path+"/Train/"+i)) 22 | for f in files: 23 | if f[0] == ".": continue 24 | out_train.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 25 | out_train.write("\n") 26 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 27 | 28 | for i in test_folders: 29 | if i[0] != '.': 30 | files = os.listdir(expanduser(data_path+"/Test/"+i)) 31 | for f in files: 32 | if f[0] == ".": continue 33 | out_test.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 34 | out_test.write("\n") 35 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 36 | out_train.close() 37 | out_test.close() 38 | pickle.dump(file2class_dict, open(expanduser(data_path+"/classes.pkl"), "w")) 39 | #print 
file2class_dict 40 | #print len(file2class_dict) 41 | 42 | if __name__ == "__main__": 43 | class_look_up("~/automatedMTL/data/rotten_tomato") 44 | -------------------------------------------------------------------------------- /auto/src/util/reformat.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | from os.path import expanduser 5 | from path import Path 6 | import random 7 | from stop_words import get_stop_words 8 | 9 | # The dataset has 5331 positive and 5331 negative reviews 10 | # According to prev work, split inot 90% training (4998) and 10% testing (533) 11 | 12 | word2vec_dic = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"))) 13 | stop_words = get_stop_words('english') 14 | length = [] 15 | missing_word_dic = {} 16 | 17 | def get_dataset(dataset_path): 18 | data_stats = pickle.load(open(expanduser(dataset_path + "/stats.pkl"))) 19 | all_example = {} 20 | all_class_folders = os.listdir(expanduser(dataset_path+"/all_data/")) 21 | for class_folder in all_class_folders: 22 | if class_folder[0] != ".": 23 | all_example[class_folder] = open(expanduser(dataset_path+'/all_data/' + class_folder)) 24 | return all_example, data_stats 25 | 26 | 27 | def replace_missing_word(data_by_word): 28 | 29 | new_data = [] 30 | for i in range(len(data_by_word)): 31 | word = data_by_word[i] 32 | if word in stop_words and word2vec_dic.get(word) == None: 33 | continue 34 | else: 35 | new_data.append(word) 36 | 37 | idx = range(0, len(new_data)) 38 | random.shuffle(idx) 39 | removed = "" 40 | 41 | if len(new_data) == 1 and word2vec_dic.get(new_data[0]) != None: 42 | return new_data + ["-"], new_data[-1] 43 | elif len(new_data) == 1 and word2vec_dic.get(new_data[0]) == None: 44 | return [], "" 45 | 46 | valid = False 47 | for i in idx: 48 | word = new_data[i] 49 | if word not in stop_words and word2vec_dic.get(word)!= None: 50 | removed = new_data[i] 51 | data_by_word[i] = "REMOVE" 52 | valid = True 53 | break 54 | if not valid: 55 | print data_by_word 56 | return [], "" 57 | return data_by_word, removed 58 | 59 | 60 | def process_data(data, is_missing_word): 61 | 62 | d = list(data) 63 | for i in range(len(d)): 64 | if ord(d[i]) > ord('z') or ord(d[i]) < ord('a') and d[i] != "'": 65 | d[i] = " " 66 | string = "".join(d) 67 | 68 | if not is_missing_word: 69 | string = " ".join(string.split()) 70 | string = string + " " + "EOS" 71 | length.append(len(string.split())) 72 | return string, "_" 73 | 74 | string, removed = replace_missing_word(string.split()) 75 | if string == []: return [], "" 76 | string = " ".join(string) 77 | string = string + " " + "EOS" 78 | length.append(len(string.split())) 79 | return string, removed 80 | 81 | def reformat_data(dataset_path, is_missing_word): 82 | 83 | # Clean up the directory in train and test folder 84 | d_train, d_test = Path(expanduser(dataset_path+"/Train")), Path(expanduser(dataset_path+"/Test")) 85 | train_files, test_files = d_train.walk("*.txt"), d_test.walk("*.txt") 86 | for f in train_files: 87 | f.remove() 88 | for f in test_files: 89 | f.remove() 90 | 91 | all_example, data_stats = get_dataset(dataset_path) 92 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'], data_stats['trainPercent'], data_stats['testPercent'] 93 | all_idx = range(0,n_data) 94 | random.shuffle(all_idx) 95 | test_idx = all_idx[0:int(testPercent*n_data)] 96 | identifier = 0 97 
| 98 | for one_class in all_example.keys(): 99 | for p in all_example[one_class].readlines(): 100 | if p == "\n": 101 | continue 102 | else: 103 | if identifier in test_idx: 104 | file = open(expanduser(dataset_path + "/Test/" + one_class[0:len(one_class)-4] + "/" + str(identifier) + ".txt"), "w") 105 | else: 106 | file = open(expanduser(dataset_path + "/Train/" + one_class[0:len(one_class)-4] + "/" + str(identifier) + ".txt"), "w") 107 | 108 | string, removed = process_data(p, is_missing_word) 109 | if string != []: 110 | file.write(string) 111 | missing_word_dic[identifier] = removed 112 | identifier += 1 113 | file.close() 114 | else: print p 115 | pickle.dump(missing_word_dic, open(expanduser(dataset_path + "/missing_word_dic.pkl"),"w")) 116 | return sorted(length)[-1] 117 | 118 | if __name__ == "__main__": 119 | print reformat_data("~/automatedMTL/data/rotten_tomato", False) 120 | -------------------------------------------------------------------------------- /auto/src/util/tf_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createGRUCell(batch_size, lstm_size): 21 | gru_cell = tf.contrib.rnn.GRUCell(num_units=lstm_size, activation=tf.tanh) 22 | state=gru_cell.zero_state(batch_size, tf.float32) 23 | 24 | return gru_cell, state 25 | 26 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 27 | 28 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 29 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 30 | state = lstm_cell.zero_state(batch_size, tf.float32) 31 | 32 | return lstm_cell, state 33 | 34 | def applyActivation(x, activation): 35 | 36 | if activation == "tanh": 37 | return tf.nn.tanh(x) 38 | elif activation == "relu": 39 | return tf.nn.relu(x) 40 | elif activation == "sigmoid": 41 | return tf.nn.sigmoid(x) 42 | elif activation == "relu6": 43 | return tf.nn.relu6(x) 44 | elif activation == "softmax": 45 | return tf.nn.softmax(x) 46 | else: return None 47 | 48 | def length(sequence): 49 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 50 | length = tf.reduce_sum(used, reduction_indices=1) 51 | length = tf.cast(length, tf.int32) 52 | return length 53 | 54 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 55 | 56 | x = tf.reshape(x, [-1, in_shape]) 57 | 58 | with tf.variable_scope(scope): 59 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) 60 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 61 | logits = tf.add(tf.matmul(x, w), b) 62 | output = applyActivation(logits, activation) 63 | return output, logits 64 | 65 | def compute_cost(logit, y, 
out_type, max_length, batch_size, embed_dim, activation): 66 | if out_type=="last_only": 67 | cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit) 68 | cost = tf.reduce_mean(cost, reduction_indices=1) 69 | else: 70 | pred_out = applyActivation(logit, activation) 71 | pred_out = tf.reshape(pred_out, [batch_size, max_length, embed_dim]) 72 | mse = tf.reduce_mean(tf.square(tf.subtract(y, pred_out)), reduction_indices=2) 73 | mask = tf.sign(tf.reduce_max(tf.abs(y), reduction_indices=2)) 74 | mse *= mask 75 | mse = tf.reduce_sum(mse, reduction_indices=1) 76 | mse /= tf.cast(length(y), tf.float32) 77 | cost = mse 78 | cost = tf.reduce_mean(cost, reduction_indices=0) 79 | print "final cost shape: ", cost.get_shape() 80 | return cost 81 | -------------------------------------------------------------------------------- /auto/src/util/tf_utils_old.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | else: return None 39 | 40 | 41 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 42 | 43 | x = tf.reshape(x, [-1, in_shape]) 44 | 45 | with tf.variable_scope(scope): 46 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 47 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 48 | logits = tf.add(tf.matmul(x, w), b) 49 | output = applyActivation(logits, activation) 50 | # Compute the mean-squared-error 51 | cost = tf.reduce_mean(tf.square(tf.subtract(y , output))) 52 | 53 | return cost, output 54 | -------------------------------------------------------------------------------- /auto/src/util/tf_utils_reg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, reg_const, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, 
initializer=tf.truncated_normal_initializer(stddev=1e-2), regularizer=tf.contrib.layers.l2_regularizer(reg_const)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | elif activation == "softmax": 39 | return tf.nn.softmax(x) 40 | else: return None 41 | 42 | def length(sequence): 43 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 44 | length = tf.reduce_sum(used, reduction_indices=1) 45 | length = tf.cast(length, tf.int32) 46 | return length 47 | 48 | def predictionLayer(x, y, in_shape, out_shape, activation, reg_const, scope="prediction"): 49 | 50 | x = tf.reshape(x, [-1, in_shape]) 51 | 52 | with tf.variable_scope(scope): 53 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2), regularizer=tf.contrib.layers.l2_regularizer(reg_const)) 54 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 55 | logits = tf.add(tf.matmul(x, w), b) 56 | output = applyActivation(logits, activation) 57 | return output, logits 58 | 59 | def compute_cost(logit, y, out_type, max_length, batch_size, embed_dim, activation): 60 | if out_type=="last_only": 61 | cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit) 62 | cost = tf.reduce_mean(cost, reduction_indices=1) 63 | else: 64 | pred_out = applyActivation(logit, activation) 65 | pred_out = tf.reshape(pred_out, [batch_size, max_length, embed_dim]) 66 | mse = tf.reduce_mean(tf.square(tf.subtract(y, pred_out)), reduction_indices=2) 67 | mask = tf.sign(tf.reduce_max(tf.abs(y), reduction_indices=2)) 68 | mse *= mask 69 | mse = tf.reduce_sum(mse, reduction_indices=1) 70 | mse /= tf.cast(length(y), tf.float32) 71 | cost = mse 72 | cost = tf.reduce_mean(cost, reduction_indices=0) 73 | print "final cost shape: ", cost.get_shape() 74 | return cost 75 | -------------------------------------------------------------------------------- /auto/src_final/model/mcrnn_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","util"))) 9 | from tf_utils import fcLayer, createLSTMCell, applyActivation, predictionLayer 10 | #from predContext import predContext, createHtDict 11 | 12 | class model(object): 13 | 14 | # Model params 15 | # 0 -- shared; 1 -- context; 2 -- task 16 | fc_activation = "tanh" 17 | 
output_activation = "tanh" 18 | dropout = 0.0 19 | body_lstm_size = 128 20 | context_lstm_size = 128 21 | task_lstm_size = 128 22 | body_n_layer = 1 23 | context_n_layer = 1 24 | task_n_layer = 1 25 | context_branch_fc = 512 26 | task_branch_fc = 512 27 | 28 | # Data params 29 | batch_size = 128 30 | max_length = 52 31 | feature_length = 300 32 | context_dim = 300 33 | task_dim = 2 34 | 35 | # Hyper- params 36 | lr = 0.001 37 | context_lr = lr 38 | n_epoch = 500 39 | topN = 4 40 | keep_prob_val = 1.0 41 | 42 | def buildModel(self, x, y_context, y_task, is_train, dropout, scope="multiTask"): 43 | 44 | # Assume the input shape is (batch_size, max_length, feature_length) 45 | 46 | #TASK = primary task, CONTEXT = secondary task 47 | 48 | # Create lstm cell for the shared layer 49 | body_lstm_cell, _ = createLSTMCell(self.batch_size, self.body_lstm_size, self.body_n_layer, forget_bias=0.0) 50 | # Create lstm cell for branch 1 51 | context_lstm_cell, _ = createLSTMCell(self.batch_size, self.context_lstm_size, self.context_n_layer, forget_bias=0.0) 52 | # Create lstm cells for branch 2 53 | task_lstm_cell, _ = createLSTMCell(self.batch_size, self.task_lstm_size, self.task_n_layer, forget_bias=0.0) 54 | 55 | context_cost = tf.constant(0) 56 | task_cost = tf.constant(0) 57 | 58 | with tf.variable_scope("shared_lstm"): 59 | body_cell_output, last_body_state = tf.nn.dynamic_rnn(cell = body_lstm_cell, dtype=tf.float32, sequence_length=self.length(x), inputs=x) 60 | 61 | with tf.variable_scope("context_branch"): 62 | context_cell_output, last_context_state = tf.nn.dynamic_rnn(cell = context_lstm_cell, dtype=tf.float32, sequence_length=self.length(body_cell_output), inputs=body_cell_output) 63 | 64 | # The output from LSTMs will be (batch_size, max_length, out_size) 65 | with tf.variable_scope("context_fc"): 66 | # Select the last output that is not generated by zero vectors 67 | last_context_output = self.last_relevant(context_cell_output, self.length(context_cell_output)) 68 | # feed the last output to the fc layer and make prediction 69 | context_fc_out = fcLayer(x=last_context_output, in_shape=self.context_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=self.dropout, is_train=is_train, scope="fc1") 70 | context_cost, context_output = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, out_shape=y_context.get_shape()[-1].value, activation=self.output_activation) 71 | 72 | with tf.variable_scope("task_branch"): 73 | task_cell_output, last_task_state = tf.nn.dynamic_rnn(cell = task_lstm_cell, dtype=tf.float32, sequence_length=self.length(body_cell_output), inputs=body_cell_output) 74 | 75 | with tf.variable_scope("task_fc"): 76 | # Select the last output that is not generated by zero vectors 77 | last_task_output = self.last_relevant(task_cell_output, self.length(task_cell_output)) 78 | # feed the last output to the fc layer and make prediction 79 | task_fc_out = fcLayer(x=last_task_output, in_shape=self.task_lstm_size, out_shape=self.task_branch_fc, activation=self.fc_activation, dropout=self.dropout, is_train=is_train, scope="fc2") 80 | task_cost, task_output = predictionLayer(x=task_fc_out, y=y_task, in_shape=self.context_branch_fc, out_shape=y_task.get_shape()[-1].value, activation=self.output_activation) 81 | 82 | return context_cost, task_cost, task_output, context_output 83 | 84 | # Flatten the output tensor to shape features in all examples x output size 85 | # construct an index into that by creating a tensor with the start 
indices for each example tf.range(0, batch_size) x max_length 86 | # and add the individual sequence lengths to it 87 | # tf.gather() then performs the acutal indexing. 88 | def last_relevant(self, output, length): 89 | index = tf.range(0, self.batch_size) * self.max_length + (length - 1) 90 | out_size = int(output.get_shape()[2]) 91 | flat = tf.reshape(output, [-1, out_size]) 92 | relevant = tf.gather(flat, index) 93 | return relevant 94 | 95 | # Assume that the sequences are padded with 0 vectors to have shape (batch_size, max_length, feature_length) 96 | 97 | def length(self, sequence): 98 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 99 | length = tf.reduce_sum(used, reduction_indices=1) 100 | length = tf.cast(length, tf.int32) 101 | print length.get_shape() 102 | return length 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /auto/src_final/model/mcrnn_model_1_lstm.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","util"))) 9 | from tf_utils import fcLayer, createLSTMCell, createGRUCell, applyActivation, predictionLayer, compute_cost 10 | #from predContext import predContext, createHtDict 11 | 12 | class model(object): 13 | 14 | # Task params 15 | is_multi_task = True 16 | secondary_task = "word generation" 17 | primary_task = "classification" 18 | 19 | # Model params 20 | # 0 -- shared; 1 -- context; 2 -- task 21 | fc_activation = "tanh" 22 | context_output_activation = "tanh" 23 | task_output_activation = "softmax" 24 | body_lstm_size = 1024 25 | body_n_layer = 1 26 | context_n_layer = 1 27 | task_n_layer = 1 28 | context_branch_fc = 512 29 | task_branch_fc = 30 30 | 31 | # Data params 32 | n_classes = 2 33 | batch_size = 64 34 | max_length = 52 35 | feature_length = 300 36 | context_dim = 300 37 | task_dim = n_classes 38 | 39 | # Hyper- params 40 | lr = 0.0001 #hp 41 | lr_mod = 1.0 #hp 42 | context_lr = lr_mod*lr 43 | n_epoch = 50 #hp 44 | 45 | def buildModel(self, x, y_context, y_task, is_train, dropout, scope="multiTask"): 46 | 47 | # Assume the input shape is (batch_size, max_length, feature_length) 48 | 49 | #TASK = primary task, CONTEXT = secondary task 50 | 51 | # Create lstm cell for the shared layer 52 | body_lstm_cell, _ = createLSTMCell(self.batch_size, self.body_lstm_size, self.body_n_layer, forget_bias=0.0) 53 | 54 | context_cost = tf.constant(0) 55 | task_cost = tf.constant(0.0, dtype=tf.float32) 56 | 57 | if not self.is_multi_task: context_output = tf.constant(0) 58 | 59 | with tf.variable_scope("shared_lstm"): 60 | body_cell_output, last_body_state = tf.nn.dynamic_rnn(cell = body_lstm_cell, dtype=tf.float32, sequence_length=self.length(x), inputs=x) 61 | 62 | if self.is_multi_task: 63 | with tf.variable_scope("context_branch"): 64 | # Select the last output that is not generated by zero vectors 65 | if self.secondary_task == "missing word": 66 | last_body_output = self.last_relevant(body_cell_output, self.length(body_cell_output)) 67 | # feed the last output to the fc layer and make prediction 68 | with tf.variable_scope("context_fc"): 69 | context_fc_out = fcLayer(x=last_body_output, in_shape=self.body_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, 
scope="fc1") 70 | with tf.variable_scope("context_pred"): 71 | context_output, context_logits = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, out_shape=y_context.get_shape()[-1].value, activation=self.context_output_activation) 72 | context_cost = compute_cost(logit=context_logits, y=y_context, out_type="last_only", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.feature_length, activation=self.context_output_activation) 73 | 74 | if self.secondary_task == "word generation": 75 | context_input = tf.transpose(body_cell_output, [1, 0, 2]) 76 | context_input = tf.reshape(context_input, [-1, self.body_lstm_size]) 77 | context_input_list = tf.split(context_input, self.max_length, 0) 78 | fc_output_list = [] 79 | with tf.variable_scope("context_fc"): 80 | for step in range(self.max_length): 81 | if step > 0: tf.get_variable_scope().reuse_variables() 82 | fc_out = fcLayer(x=context_input_list[step], in_shape=self.body_lstm_size, out_shape=self.context_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, scope="fc1") 83 | fc_output_list.append(tf.expand_dims(fc_out, axis=1)) 84 | context_fc_out = tf.concat(fc_output_list, axis=1) 85 | with tf.variable_scope("context_pred"): 86 | context_output, context_logits = predictionLayer(x=context_fc_out, y=y_context, in_shape=self.context_branch_fc, out_shape=y_context.get_shape()[-1].value, activation=self.context_output_activation) 87 | context_cost = compute_cost(logit=context_logits, y=y_context, out_type="sequential", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.feature_length,activation=self.context_output_activation) 88 | 89 | 90 | print "Context cost shape: ", context_cost.get_shape() 91 | 92 | with tf.variable_scope("task_branch"): 93 | with tf.variable_scope("task_fc"): 94 | # Select the last output that is not generated by zero vectors 95 | last_body_output = self.last_relevant(body_cell_output, self.length(body_cell_output)) 96 | # feed the last output to the fc layer and make prediction 97 | task_fc_out = fcLayer(x=last_body_output, in_shape=self.body_lstm_size, out_shape=self.task_branch_fc, activation=self.fc_activation, dropout=dropout, is_train=is_train, scope="fc2") 98 | task_output, task_logits = predictionLayer(x=task_fc_out, y=y_task, in_shape=self.task_branch_fc, out_shape=y_task.get_shape()[-1].value, activation=self.task_output_activation) 99 | print "Task output shape: ", task_output.get_shape() 100 | task_cost = compute_cost(logit=task_logits, y=y_task, out_type="last_only", max_length=self.max_length, batch_size=self.batch_size, embed_dim=self.n_classes,activation=self.task_output_activation) 101 | 102 | return context_cost, task_cost, task_output, context_output 103 | 104 | # Flatten the output tensor to shape features in all examples x output size 105 | # construct an index into that by creating a tensor with the start indices for each example tf.range(0, batch_size) x max_length 106 | # and add the individual sequence lengths to it 107 | # tf.gather() then performs the acutal indexing. 
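# A minimal worked sketch of last_relevant below (illustrative values only):
#   with batch_size=2, max_length=3 and per-example lengths [2, 3],
#   index = tf.range(0, 2) * 3 + ([2, 3] - 1) = [1, 5]; after flattening the
#   output to shape (2*3, out_size), tf.gather(flat, index) returns rows 1 and 5,
#   i.e. the last non-padded timestep of each example.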
108 | def last_relevant(self, output, length): 109 | index = tf.range(0, self.batch_size) * self.max_length + (length - 1) 110 | out_size = int(output.get_shape()[2]) 111 | flat = tf.reshape(output, [-1, out_size]) 112 | relevant = tf.gather(flat, index) 113 | return relevant 114 | 115 | # Assume that the sequences are padded with 0 vectors to have shape (batch_size, max_length, feature_length) 116 | 117 | def length(self, sequence): 118 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 119 | length = tf.reduce_sum(used, reduction_indices=1) 120 | length = tf.cast(length, tf.int32) 121 | print length.get_shape() 122 | return length 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /auto/src_final/model/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import cPickle as pickle 5 | from os.path import expanduser 6 | import sys 7 | import mcrnn_model 8 | from mcrnn_model import model 9 | 10 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 11 | from tf_utils import fcLayer, createLSTMCell, applyActivation, predictionLayer 12 | from load_batch import get_file_identifiers, get_classes, load_data, get_word2vec, load_batch 13 | 14 | def get_data(data_path): 15 | data_stats = pickle.load(open(expanduser(data_path + "/rt_stats.pkl"))) 16 | max_length, nPos, nNeg, trainPercent, testPercent = data_stats["longest"], data_stats[0], data_stats[1], data_stats['trainPercent'], data_stats['testPercent'] 17 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 18 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 19 | nTest = int(testPercent*nPos) + int(testPercent*nNeg) 20 | nTrain = nPos + nNeg - nTest 21 | 22 | return max_length, nPos, nNeg, trainPercent, testPercent, word2vec_dic, missing_word_dic, nTest, nTrain 23 | 24 | 25 | def trainModel(): 26 | 27 | M = model() 28 | data_path = "~/automatedMTL/data/rotten_tomato" 29 | max_length, nPos, nNeg, trainPercent, testPercent, word2vec_dic, missing_word_dic, nTest, nTrain = get_data(data_path) 30 | 31 | x = tf.placeholder(tf.float32, shape=(None, M.max_length, M.feature_length)) 32 | y_context = tf.placeholder(tf.float32, shape=(None, M.context_dim)) 33 | y_task = tf.placeholder(tf.float32, shape=(None, M.task_dim)) 34 | 35 | optimizer1 = tf.train.AdamOptimizer(learning_rate=M.context_lr) 36 | optimizer2 = tf.train.AdamOptimizer(learning_rate=M.lr) 37 | is_train = tf.placeholder(tf.int32) 38 | n_train_batches = np.ceil(nTrain / M.batch_size).astype(int) 39 | keep_prob = tf.placeholder(tf.float32) 40 | 41 | context_cost, task_cost, task_output, context_output = M.buildModel(x, y_context, y_task, is_train, keep_prob) 42 | train_step1 = optimizer1.minimize(context_cost) 43 | train_step2 = optimizer2.minimize(task_cost) 44 | 45 | # Start running operations on the graph 46 | sess = tf.Session() 47 | sess.run(tf.initialize_all_variables()) 48 | 49 | with sess.as_default(): 50 | for epoch in range(100): 51 | taskCost = 0 52 | contextCost = 0 53 | 54 | all_classes, train_file, test_file = load_data(data_path) 55 | start_idx = 0 56 | for minibatch in range(n_train_batches): 57 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(word2vec_dic, missing_word_dic, M.feature_length, max_length, 
data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, M.batch_size) 58 | start_idx += M.batch_size 59 | 60 | feed_dict = {x: encoded_batch, y_context: batch_missing_word_encoded, y_task: batch_classes, is_train:1, keep_prob:0.5} 61 | 62 | train_step1.run(feed_dict=feed_dict) 63 | context_cost_val, _, _ = sess.run(fetches = [context_cost, task_cost, task_output], feed_dict=feed_dict) 64 | contextCost += context_cost_val 65 | 66 | train_step2.run(feed_dict=feed_dict) 67 | _, task_cost_val, _ = sess.run(fetches = [context_cost, task_cost, task_output], feed_dict=feed_dict) 68 | taskCost += task_cost_val 69 | 70 | #if minibatch !=0 and minibatch % 100 == 0: 71 | print "Minibatch ", minibatch, " Missing Word: ", contextCost , " Classification: ", taskCost 72 | contextCost = 0 73 | taskCost = 0 74 | 75 | start_idx = 0 76 | accuracy = 0 77 | 78 | for i in range(nTest): 79 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(word2vec_dic, missing_word_dic, M.feature_length, max_length, data_path+"/Test/", 0, train_file, test_file, all_classes, start_idx, 1) 80 | start_idx += 1 81 | feed_dict = {x:encoded_batch, y_context: batch_missing_word_encoded, y_task: batch_classes, is_train:0, keep_prob:0.5} 82 | task_output_val = sess.run(fetches = [task_output], feed_dict=feed_dict) 83 | accuracy += is_correct(batch_classes, task_output_val) 84 | print "The accuracy in epoch ", epoch, " is: ", accuracy * 1.0 / nTest 85 | 86 | def is_correct(target, output): 87 | prediction = np.argmax(output) 88 | target = np.argmax(target) 89 | #print prediction, target 90 | return prediction == target 91 | 92 | 93 | if __name__ == "__main__": 94 | trainModel() 95 | -------------------------------------------------------------------------------- /auto/src_final/util/1q: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_1_lstm import model 7 | from train_1_lstm import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | LR = [0.001] 11 | LR_MOD = [1.0] #4 12 | N_EPOCHS = [50] # 30 13 | N_EXPERIMENTS = [1] # 5 14 | KEEP_PROB_VAL = [1.0] 15 | CONTEXT_FC = [30] #1024 on AWS 16 | #3*3*3*30*5/60=67.5 hrs. 17 | experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. 
Learning rates: " 18 | for lr in LR: 19 | experiment = experiment + str(lr) + ", " 20 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 21 | for prob in KEEP_PROB_VAL: 22 | experiment = experiment + str(prob) + ", " 23 | experiment = experiment + " Context_fc: " 24 | for fc in CONTEXT_FC: 25 | experiment = experiment + str(fc) + ", " 26 | 27 | 28 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 29 | M= model() 30 | 31 | print M.is_multi_task 32 | if lr_mod == 0.0: 33 | M.is_multi_task = False 34 | else: 35 | M.is_multi_task = True 36 | print M.is_multi_task 37 | 38 | print M.lr 39 | M.lr = lr 40 | print M.lr 41 | 42 | print M.lr_mod 43 | M.lr_mod = lr_mod 44 | print M.lr_mod 45 | 46 | print M.n_epoch 47 | M.n_epoch = n_epoch 48 | print M.n_epoch 49 | 50 | print M.context_branch_fc 51 | M.context_branch_fc = context_fc 52 | print M.context_branch_fc 53 | 54 | maxAccList = []; 55 | for i in range(n_experiments): 56 | accuracyVec = TM(M, keep_prob_val)#INSERT CODE TO run for n epochs 57 | maxAcc = numpy.max(accuracyVec) 58 | maxAccList.append(maxAcc) 59 | expVal = numpy.mean(maxAccList) 60 | string_result = "lr = " + str(lr) + " lr_mod = "+ "self-annealing" + " avg_acc = " + str(expVal)+'\n' 61 | f1.write("") 62 | f1.write(string_result) 63 | f1.flush() 64 | print string_result 65 | 66 | 67 | 68 | f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 69 | f1.write(experiment) 70 | f1.write("\n") 71 | f1.flush() 72 | for lr in LR: 73 | for lr_mod in LR_MOD: 74 | for n_epoch in N_EPOCHS: 75 | for n_experiments in N_EXPERIMENTS: 76 | for keep_prob_val in KEEP_PROB_VAL: 77 | for context_fc in CONTEXT_FC: 78 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 79 | f1.close() 80 | -------------------------------------------------------------------------------- /auto/src_final/util/hps.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_1_lstm import model 7 | from train_1_lstm import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | LR = [0.01] 11 | LR_MOD = [1.0] #4 12 | N_EPOCHS = [30] # 30 13 | N_EXPERIMENTS = [10] # 5 14 | KEEP_PROB_VAL = [1.0] 15 | CONTEXT_FC = [30] #1024 on AWS 16 | #3*3*3*30*5/60=67.5 hrs. 17 | experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. 
Learning rates: " 18 | for lr in LR: 19 | experiment = experiment + str(lr) + ", " 20 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 21 | for prob in KEEP_PROB_VAL: 22 | experiment = experiment + str(prob) + ", " 23 | experiment = experiment + " Context_fc: " 24 | for fc in CONTEXT_FC: 25 | experiment = experiment + str(fc) + ", " 26 | 27 | epoch_ratio_list = [(0.1, 1.0), (0.5, 0.5), (1.0, 0.0)] 28 | 29 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 30 | M= model() 31 | 32 | print M.is_multi_task 33 | if lr_mod == 0.0: 34 | M.is_multi_task = False 35 | else: 36 | M.is_multi_task = True 37 | print M.is_multi_task 38 | 39 | print M.lr 40 | M.lr = lr 41 | print M.lr 42 | 43 | print M.lr_mod 44 | M.lr_mod = lr_mod 45 | print M.lr_mod 46 | 47 | print M.n_epoch 48 | M.n_epoch = n_epoch 49 | print M.n_epoch 50 | 51 | print M.context_branch_fc 52 | M.context_branch_fc = context_fc 53 | print M.context_branch_fc 54 | 55 | maxAccList = []; 56 | for i in range(n_experiments): 57 | accuracyVec = TM(M, keep_prob_val, epoch_ratio_list)#INSERT CODE TO run for n epochs 58 | maxAcc = numpy.max(accuracyVec) 59 | maxAccList.append(maxAcc) 60 | expVal = numpy.mean(maxAccList) 61 | string_result = "lr = " + str(lr) + " lr_mod = "+ "self-annealing" + " avg_acc = " + str(expVal)+'\n' 62 | f1.write("") 63 | f1.write(string_result) 64 | f1.flush() 65 | print string_result 66 | 67 | 68 | 69 | f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 70 | f1.write(experiment) 71 | f1.write("\n") 72 | f1.flush() 73 | for lr in LR: 74 | for lr_mod in LR_MOD: 75 | for n_epoch in N_EPOCHS: 76 | for n_experiments in N_EXPERIMENTS: 77 | for keep_prob_val in KEEP_PROB_VAL: 78 | for context_fc in CONTEXT_FC: 79 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 80 | f1.close() 81 | -------------------------------------------------------------------------------- /auto/src_final/util/hps2.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import sys 3 | import numpy 4 | import os 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","model"))) 6 | from mcrnn_model_gen2 import model 7 | from train_gen2 import trainModel as TM 8 | #does hyperparameter search over some set of hyperparams. 9 | 10 | 11 | LR = [0.01] 12 | LR_MOD = [1.0,0.0] #4 13 | N_EPOCHS = [30] # 30 14 | N_EXPERIMENTS = [5] # 5 15 | KEEP_PROB_VAL = [1.0] 16 | CONTEXT_FC = [128] 17 | #3*3*3*30*5/60=67.5 hrs. 18 | #experiment = "context_lr=0.5*lr, task_lr=0.5*lr, no learning rate anealing. Learning rates: " 19 | 20 | dataset = "ag_news" # or "rotten_tomato" 21 | experiment = "N_epochs = 50. N_exp = 10. lstm: 512 for both. hidden fc: 512 for both. dropout: none." 
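# The 'experiment' string (started above and extended just below) is only a
# human-readable header written once to the log via f1.write(experiment); the
# actual sweep is driven by the LR / LR_MOD / N_EPOCHS / KEEP_PROB_VAL /
# CONTEXT_FC lists, and lr_mod == 0.0 runs the single-task baseline
# (is_multi_task = False).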
22 | 23 | for lr in LR: 24 | experiment = experiment + str(lr) + ", " 25 | experiment = experiment + " N epoch: "+str(N_EPOCHS[0]) + " Keep prob: " 26 | for prob in KEEP_PROB_VAL: 27 | experiment = experiment + str(prob) + ", " 28 | experiment = experiment + " Context_fc: " 29 | for fc in CONTEXT_FC: 30 | experiment = experiment + str(fc) + ", " 31 | 32 | def runExperiment(lr, lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc): 33 | M= model() 34 | 35 | print M.dataset 36 | M.dataset = dataset 37 | print M.dataset 38 | 39 | print M.is_multi_task 40 | if lr_mod == 0.0: 41 | M.is_multi_task = False 42 | else: 43 | M.is_multi_task = True 44 | print M.is_multi_task 45 | 46 | print M.lr 47 | M.lr = lr 48 | print M.lr 49 | 50 | print M.lr_mod 51 | M.lr_mod = lr_mod 52 | print M.lr_mod 53 | 54 | print M.n_epoch 55 | M.n_epoch = n_epoch 56 | print M.n_epoch 57 | 58 | print M.context_branch_fc 59 | M.context_branch_fc = context_fc 60 | print M.context_branch_fc 61 | 62 | maxAccList = [] 63 | testResult = [] 64 | for i in range(n_experiments): 65 | accuracyVec, testAcc = TM(M)#INSERT CODE TO run for n epochs 66 | maxAcc = numpy.max(accuracyVec) 67 | maxAccList.append(maxAcc) 68 | maxIdx = numpy.argmax(accuracyVec) 69 | testResult.append(testAcc[maxIdx]) 70 | 71 | expVal = numpy.mean(maxAccList) 72 | testVal = numpy.mean(testResult) 73 | if lr_mod == 0.0: 74 | string_result = "lr = " + str(lr) + " lr_mod = "+ "none (lstm)" + " avg_val_acc = " + str(expVal) + " avg_test_acc = " + str(testVal) + '\n' 75 | else: 76 | string_result = "lr = " + str(lr) + " lr_mod = "+ "annealing" + " avg_val_acc = " + str(expVal) + " avg_test_acc = " + str(testVal) + '\n' 77 | 78 | f1.write("") 79 | f1.write(string_result) 80 | f1.flush() 81 | print string_result 82 | 83 | #f1 = open(expanduser('~/tweetnet/logs/hps_log_mrnn_bidir.log'),'w+') 84 | f1 = open(expanduser("~/tweetnet/logs/hps_mrnn_ag_news.log"), "w+") 85 | f1.write(experiment) 86 | f1.write("\n") 87 | f1.flush() 88 | for lr in LR: 89 | for lr_mod in LR_MOD: 90 | for n_epoch in N_EPOCHS: 91 | for n_experiments in N_EXPERIMENTS: 92 | for keep_prob_val in KEEP_PROB_VAL: 93 | for context_fc in CONTEXT_FC: 94 | runExperiment(lr,lr_mod,n_epoch,n_experiments,f1, keep_prob_val, context_fc) 95 | f1.close() 96 | -------------------------------------------------------------------------------- /auto/src_final/util/hps_script.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/auto/src_final/util/hps_script.sh -------------------------------------------------------------------------------- /auto/src_final/util/load_batch.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | from reformat import reformat_data 7 | from load_util import class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = 
train_file 26 | else: 27 | identifiers = test_file 28 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 29 | 30 | batch_text = [] 31 | for idx in batch_identifiers: 32 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 33 | batch_text.append(text.read()) 34 | 35 | return batch_identifiers, batch_text 36 | 37 | def load_data(data_path): 38 | 39 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 40 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 41 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 42 | return all_classes, train_file, test_file 43 | 44 | def get_word2vec(data_path): 45 | # TO DO: download word2vec! 46 | word2vec_dic = pickle.load(open(expanduser(data_path))) 47 | return word2vec_dic 48 | 49 | # Unknown symbols are UNK 50 | # Missing word symbols are zeros 51 | # EOS are EOS 52 | 53 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 54 | sequence_by_word = sequence.split(" ") 55 | encoded_seq = np.zeros((max_len, encode_dim)) 56 | for i in range(len(sequence_by_word)): 57 | word = sequence_by_word[i] 58 | if word2vec_dic.get(word) == None: 59 | encoded_seq[i, :] = word2vec_dic["UNK"] 60 | continue 61 | else: 62 | if word != "REMOVE": 63 | encoded_seq[i, :] = word2vec_dic[word] 64 | else: 65 | encoded_seq[i, :] = word2vec_dic["_"] 66 | return encoded_seq, len(sequence_by_word) 67 | 68 | def encode_sequence_generation(word2vec_dic, sequence, encode_dim, max_len): 69 | sequence_by_word = sequence.split(" ") 70 | encoded_seq = np.zeros((max_len, encode_dim)) 71 | for i in range(1, len(sequence_by_word)): 72 | word = sequence_by_word[i] 73 | if word2vec_dic.get(word) == None: 74 | encoded_seq[i-1, :] = word2vec_dic["UNK"] 75 | else: 76 | encoded_seq[i-1, :] = word2vec_dic[word] 77 | encoded_seq[len(sequence_by_word)-1, :] = word2vec_dic["EOS"] 78 | context_target = sequence_by_word[1:len(sequence_by_word)] + ["EOS"] 79 | 80 | return encoded_seq, context_target, len(sequence_by_word) 81 | 82 | def oneHot(nclasses, idx): 83 | one_hot = np.zeros((nclasses)) 84 | one_hot[idx] = 1 85 | return one_hot 86 | 87 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size, automated_task): 88 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size) 89 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 90 | batch_classes = np.zeros((batch_size, n_classes)) 91 | batch_context_encoded = np.zeros((batch_size, encode_dim)) 92 | if automated_task == "word generation": batch_context_encoded = np.zeros((batch_size, max_len, encode_dim)) 93 | batch_context = [] 94 | batch_length = [] 95 | for i in range(batch_size): 96 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 97 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 98 | if automated_task != "word generation": 99 | batch_context_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 100 | batch_context.append(missing_word_dic[batch_identifiers[i]]) 101 | else: 102 | batch_context_encoded[i, :, :], context_target, text_length = encode_sequence_generation(word2vec_dic, batch_text[i], encode_dim, max_len) 103 | batch_context.append(context_target) 104 | batch_length.append(text_length) 105 | return encoded_batch, batch_classes, 
batch_context_encoded, batch_context, batch_identifiers, batch_text, batch_length 106 | 107 | if __name__ == "__main__": 108 | data_path = "~/automatedMTL/data/rotten_tomato" 109 | max_length = reformat_data(data_path, False) 110 | class_look_up(data_path) 111 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 112 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['trainPercent'], data_stats['testPercent'] 113 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 114 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 115 | for epoch in range(3): 116 | dic = {} 117 | all_classes, train_file, test_file = load_data(data_path) 118 | start_idx = 0 119 | for minibatch in range(73): 120 | encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, missing_word_dic, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 128, automated_task="word generation") 121 | start_idx += 128 122 | print batch_context 123 | for i in batch_identifier: 124 | if dic.get(i) != None: print "Wrong" 125 | else: dic[i] = 1 126 | -------------------------------------------------------------------------------- /auto/src_final/util/load_batch2.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | from reformat import reformat_data 7 | from load_util import class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | else: 27 | identifiers = test_file 28 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 29 | 30 | batch_text = [] 31 | for idx in batch_identifiers: 32 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 33 | batch_text.append(text.read()) 34 | 35 | return batch_identifiers, batch_text 36 | 37 | def load_data(data_path): 38 | 39 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 40 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 41 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 42 | return all_classes, train_file, test_file 43 | 44 | def get_word2vec(data_path): 45 | # TO DO: download word2vec! 
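# The pickle at data_path is assumed to be a plain dict mapping each token to a
# NumPy embedding of length encode_dim (the callers in this repo pass 300); it
# should at least contain the special entries "UNK" (out-of-vocabulary fallback)
# and "_" (placeholder for the removed word) used by encode_sequence below.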
46 | word2vec_dic = pickle.load(open(expanduser(data_path))) 47 | return word2vec_dic 48 | 49 | # Unknown symbols are UNK 50 | # Missing word symbols are zeros 51 | # EOS are EOS 52 | 53 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 54 | sequence_by_word = sequence.split(" ") 55 | encoded_seq = np.zeros((max_len, encode_dim)) 56 | for i in range(len(sequence_by_word)): 57 | word = sequence_by_word[i] 58 | if word2vec_dic.get(word) == None: 59 | encoded_seq[i, :] = word2vec_dic["UNK"] 60 | else: 61 | if word != "REMOVE": 62 | encoded_seq[i, :] = word2vec_dic[word] 63 | else: 64 | encoded_seq[i, :] = word2vec_dic["_"] 65 | return encoded_seq, len(sequence_by_word) 66 | 67 | def oneHot(nclasses, idx): 68 | one_hot = np.zeros((nclasses)) 69 | one_hot[idx] = 1 70 | return one_hot 71 | 72 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size): 73 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train, train_file, test_file, all_classes, start_idx, batch_size) 74 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 75 | batch_classes = np.zeros((batch_size, 2)) 76 | batch_missing_word_encoded = np.zeros((batch_size, encode_dim)) 77 | batch_missing_word = [] 78 | batch_length = [] 79 | for i in range(batch_size): 80 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 81 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 82 | batch_missing_word_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 83 | batch_missing_word.append(missing_word_dic[batch_identifiers[i]]) 84 | batch_length.append(text_length) 85 | return encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifiers, batch_text, batch_length 86 | 87 | if __name__ == "__main__": 88 | data_path = "~/automatedMTL/data/rotten_tomato" 89 | max_length = reformat_data(data_path, False) 90 | class_look_up(data_path) 91 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 92 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['trainPercent'], data_stats['testPercent'] 93 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 94 | missing_word_dic = pickle.load(open(expanduser(data_path + "/missing_word_dic.pkl"))) 95 | for epoch in range(3): 96 | dic = {} 97 | all_classes, train_file, test_file = load_data(data_path) 98 | start_idx = 0 99 | for minibatch in range(73): 100 | encoded_batch, batch_classes, batch_missing_word_encoded, batch_missing_word, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, missing_word_dic, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 128) 101 | start_idx += 128 102 | #print batch_text 103 | #print batch_missing_word 104 | #print batch_length 105 | for i in batch_identifier: 106 | if dic.get(i) != None: print "Wrong" 107 | else: dic[i] = 1 108 | -------------------------------------------------------------------------------- /auto/src_final/util/load_batch_val.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | import random 5 | from os.path import expanduser 6 | #from reformat import reformat_data 7 | from load_util import 
class_look_up 8 | 9 | # The files are named from 0.txt to n.txt 10 | # This function returns a list of all shuffled file names 11 | 12 | def get_file_identifiers(data_path): 13 | ids = [] 14 | f = open(expanduser(data_path)) 15 | for l in f.readlines(): 16 | ids.append(int(l.split(" ")[0])) 17 | random.shuffle(ids) 18 | return ids 19 | 20 | def get_classes(all_classes, id): 21 | return all_classes[id][0] 22 | 23 | def get_text_by_batch(data_path, is_train, is_val, train_file, test_file, val_file, all_classes, start_idx, batch_size): 24 | if is_train: 25 | identifiers = train_file 26 | elif is_val: 27 | identifiers = val_file 28 | else: 29 | identifiers = test_file 30 | batch_identifiers = identifiers[start_idx: start_idx + batch_size] 31 | 32 | batch_text = [] 33 | for idx in batch_identifiers: 34 | text = open(expanduser(data_path + get_classes(all_classes, idx)+ "/" + str(idx)+".txt")) 35 | batch_text.append(text.read()) 36 | 37 | return batch_identifiers, batch_text 38 | 39 | def load_data(data_path): 40 | 41 | all_classes = pickle.load(open(expanduser(data_path + '/classes.pkl'))) 42 | test_file = get_file_identifiers(data_path + "/test_classes.txt") 43 | train_file = get_file_identifiers(data_path + "/train_classes.txt") 44 | val_file = get_file_identifiers(data_path + "/validation_classes.txt") 45 | return all_classes, train_file, test_file, val_file 46 | 47 | def get_word2vec(data_path): 48 | # TO DO: download word2vec! 49 | word2vec_dic = pickle.load(open(expanduser(data_path))) 50 | return word2vec_dic 51 | 52 | # Unknown symbols are UNK 53 | # Missing word symbols are zeros 54 | # EOS are EOS 55 | 56 | def encode_sequence(word2vec_dic, sequence, encode_dim, max_len): 57 | sequence_by_word = sequence.split(" ") 58 | encoded_seq = np.zeros((max_len, encode_dim)) 59 | for i in range(len(sequence_by_word)): 60 | word = sequence_by_word[i] 61 | if word2vec_dic.get(word) == None: 62 | encoded_seq[i, :] = word2vec_dic["UNK"] 63 | else: 64 | if word != "REMOVE": 65 | encoded_seq[i, :] = word2vec_dic[word] 66 | else: 67 | encoded_seq[i, :] = word2vec_dic["_"] 68 | return encoded_seq, len(sequence_by_word) 69 | 70 | def encode_sequence_generation(word2vec_dic, sequence, encode_dim, max_len): 71 | sequence_by_word = sequence.split(" ") 72 | encoded_seq = np.zeros((max_len, encode_dim)) 73 | for i in range(1, len(sequence_by_word)): 74 | word = sequence_by_word[i] 75 | if word2vec_dic.get(word) == None: 76 | encoded_seq[i-1, :] = word2vec_dic["UNK"] 77 | else: 78 | encoded_seq[i-1, :] = word2vec_dic[word] 79 | encoded_seq[len(sequence_by_word)-1, :] = word2vec_dic["EOS"] 80 | context_target = sequence_by_word[1:len(sequence_by_word)] + ["EOS"] 81 | 82 | return encoded_seq, context_target, len(sequence_by_word) 83 | 84 | def oneHot(nclasses, idx): 85 | one_hot = np.zeros((nclasses)) 86 | one_hot[idx-1] = 1 87 | return one_hot 88 | 89 | def load_batch(n_classes, word2vec_dic, missing_word_dic, encode_dim, max_len, data_path, is_train, is_val, train_file, test_file, val_file,all_classes, start_idx, batch_size, automated_task): 90 | batch_identifiers, batch_text = get_text_by_batch(data_path, is_train,is_val, train_file, test_file, val_file, all_classes, start_idx, batch_size) 91 | encoded_batch = np.zeros((batch_size, max_len, encode_dim)) 92 | batch_classes = np.zeros((batch_size, n_classes)) 93 | batch_context_encoded = np.zeros((batch_size, encode_dim)) 94 | if automated_task == "word generation": batch_context_encoded = np.zeros((batch_size, max_len, encode_dim)) 95 | batch_context = [] 96 
| batch_length = [] 97 | for i in range(batch_size): 98 | encoded_batch[i,:, :], text_length = encode_sequence(word2vec_dic, batch_text[i], encode_dim, max_len) 99 | batch_classes[i,:] = oneHot(n_classes, all_classes[batch_identifiers[i]][-1]) 100 | if automated_task != "word generation": 101 | batch_context_encoded[i,:] = word2vec_dic[missing_word_dic[batch_identifiers[i]]] 102 | batch_context.append(missing_word_dic[batch_identifiers[i]]) 103 | else: 104 | batch_context_encoded[i, :, :], context_target, text_length = encode_sequence_generation(word2vec_dic, batch_text[i], encode_dim, max_len) 105 | batch_context.append(context_target) 106 | batch_length.append(text_length) 107 | return encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifiers, batch_text, batch_length 108 | 109 | if __name__ == "__main__": 110 | data_path = "~/tweetnet/automatedMTL/data/ag_news_csv" 111 | data_stats = pickle.load(open(expanduser(data_path + "/stats.pkl"))) 112 | n_classes, n_data, n_data_per_class, n_train_data, n_test_data, max_length = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'],data_stats['n_train_data'], data_stats['n_test_data'], data_stats['max_length'] 113 | print n_classes, n_data, n_data_per_class 114 | word2vec_dic = get_word2vec("~/tweetnet/data/word2vec_dict.pkl") 115 | for epoch in range(3): 116 | dic = {} 117 | all_classes, train_file, test_file, val_file = load_data(data_path) 118 | start_idx = 0 119 | for minibatch in range(3): 120 | encoded_batch, batch_classes, batch_context_encoded, batch_context, batch_identifier, batch_text, batch_length = load_batch(n_classes, word2vec_dic, {}, 300, max_length, data_path+"/Train/", 1, train_file, test_file, all_classes, start_idx, 1, automated_task="word generation") 121 | start_idx += 1 122 | print batch_text 123 | print batch_classes 124 | print batch_context 125 | print encoded_batch.shape 126 | print batch_context_encoded.shape 127 | for i in batch_identifier: 128 | if dic.get(i) != None: print "Wrong" 129 | else: dic[i] = 1 130 | -------------------------------------------------------------------------------- /auto/src_final/util/load_util.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import os 3 | from os.path import expanduser 4 | from os.path import basename 5 | 6 | def class_look_up(data_path): 7 | out_train = open(expanduser(data_path+"/train_classes.txt"), "w") 8 | out_test = open(expanduser(data_path+"/test_classes.txt"), "w") 9 | train_folders = os.listdir(expanduser(data_path+"/Train/")) 10 | test_folders = os.listdir(expanduser(data_path+"/Test/")) 11 | 12 | dict = {} 13 | cnt = 0 14 | file2class_dict = {} 15 | 16 | for i in train_folders: 17 | if i[0] != '.': 18 | if dict.get(i) == None: 19 | dict[i] = cnt 20 | cnt += 1 21 | files = os.listdir(expanduser(data_path+"/Train/"+i)) 22 | for f in files: 23 | if f[0] == ".": continue 24 | out_train.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 25 | out_train.write("\n") 26 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 27 | 28 | for i in test_folders: 29 | if i[0] != '.': 30 | files = os.listdir(expanduser(data_path+"/Test/"+i)) 31 | for f in files: 32 | if f[0] == ".": continue 33 | out_test.write(f[0:len(f) - 4] + " " + i + " " + str(dict[i])) 34 | out_test.write("\n") 35 | file2class_dict[int(f[0:len(f) - 4])] = (i, dict[i]) 36 | out_train.close() 37 | out_test.close() 38 | pickle.dump(file2class_dict, 
open(expanduser(data_path+"/classes.pkl"), "w")) 39 | #print file2class_dict 40 | #print len(file2class_dict) 41 | 42 | if __name__ == "__main__": 43 | class_look_up("~/automatedMTL/data/rotten_tomato") 44 | -------------------------------------------------------------------------------- /auto/src_final/util/reformat.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | from os.path import expanduser 5 | from path import Path 6 | import random 7 | from stop_words import get_stop_words 8 | 9 | # The dataset has 5331 positive and 5331 negative reviews 10 | # According to prev work, split inot 90% training (4998) and 10% testing (533) 11 | 12 | word2vec_dic = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"))) 13 | stop_words = get_stop_words('english') 14 | length = [] 15 | missing_word_dic = {} 16 | 17 | def get_dataset(dataset_path): 18 | data_stats = pickle.load(open(expanduser(dataset_path + "/stats.pkl"))) 19 | all_example = {} 20 | all_class_folders = os.listdir(expanduser(dataset_path+"/all_data/")) 21 | for class_folder in all_class_folders: 22 | if class_folder[0] != ".": 23 | all_example[class_folder] = open(expanduser(dataset_path+'/all_data/' + class_folder)) 24 | return all_example, data_stats 25 | 26 | 27 | def replace_missing_word(data_by_word): 28 | 29 | new_data = [] 30 | for i in range(len(data_by_word)): 31 | word = data_by_word[i] 32 | if word in stop_words and word2vec_dic.get(word) == None: 33 | continue 34 | else: 35 | new_data.append(word) 36 | 37 | idx = range(0, len(new_data)) 38 | random.shuffle(idx) 39 | removed = "" 40 | 41 | if len(new_data) == 1 and word2vec_dic.get(new_data[0]) != None: 42 | return new_data + ["-"], new_data[-1] 43 | elif len(new_data) == 1 and word2vec_dic.get(new_data[0]) == None: 44 | return [], "" 45 | 46 | valid = False 47 | for i in idx: 48 | word = new_data[i] 49 | if word not in stop_words and word2vec_dic.get(word)!= None: 50 | removed = new_data[i] 51 | data_by_word[i] = "REMOVE" 52 | valid = True 53 | break 54 | if not valid: 55 | print data_by_word 56 | return [], "" 57 | return data_by_word, removed 58 | 59 | 60 | def process_data(data, is_missing_word): 61 | 62 | d = list(data) 63 | for i in range(len(d)): 64 | if ord(d[i]) > ord('z') or ord(d[i]) < ord('a') and d[i] != "'": 65 | d[i] = " " 66 | string = "".join(d) 67 | 68 | if not is_missing_word: 69 | string = " ".join(string.split()) 70 | string = string + " " + "EOS" 71 | length.append(len(string.split())) 72 | return string, "_" 73 | 74 | string, removed = replace_missing_word(string.split()) 75 | if string == []: return [], "" 76 | string = " ".join(string) 77 | string = string + " " + "EOS" 78 | length.append(len(string.split())) 79 | return string, removed 80 | 81 | def reformat_data(dataset_path, is_missing_word): 82 | 83 | # Clean up the directory in train and test folder 84 | d_train, d_test = Path(expanduser(dataset_path+"/Train")), Path(expanduser(dataset_path+"/Test")) 85 | train_files, test_files = d_train.walk("*.txt"), d_test.walk("*.txt") 86 | for f in train_files: 87 | f.remove() 88 | for f in test_files: 89 | f.remove() 90 | 91 | all_example, data_stats = get_dataset(dataset_path) 92 | n_classes, n_data, n_data_per_class, trainPercent, testPercent = data_stats['n_classes'], data_stats['n_data'], data_stats['n_data_per_class'], data_stats['trainPercent'], data_stats['testPercent'] 93 | all_idx = range(0,n_data) 94 | random.shuffle(all_idx) 95 | 
test_idx = all_idx[0:int(testPercent*n_data)] 96 | identifier = 0 97 | 98 | for one_class in all_example.keys(): 99 | for p in all_example[one_class].readlines(): 100 | if p == "\n": 101 | continue 102 | else: 103 | if identifier in test_idx: 104 | file = open(expanduser(dataset_path + "/Test/" + one_class[0:len(one_class)-4] + "/" + str(identifier) + ".txt"), "w") 105 | else: 106 | file = open(expanduser(dataset_path + "/Train/" + one_class[0:len(one_class)-4] + "/" + str(identifier) + ".txt"), "w") 107 | 108 | string, removed = process_data(p, is_missing_word) 109 | if string != []: 110 | file.write(string) 111 | missing_word_dic[identifier] = removed 112 | identifier += 1 113 | file.close() 114 | else: print p 115 | pickle.dump(missing_word_dic, open(expanduser(dataset_path + "/missing_word_dic.pkl"),"w")) 116 | return sorted(length)[-1] 117 | 118 | if __name__ == "__main__": 119 | print reformat_data("~/automatedMTL/data/rotten_tomato", False) 120 | -------------------------------------------------------------------------------- /auto/src_final/util/tf_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createGRUCell(batch_size, lstm_size): 21 | gru_cell = tf.contrib.rnn.GRUCell(num_units=lstm_size, activation=tf.tanh) 22 | state=gru_cell.zero_state(batch_size, tf.float32) 23 | 24 | return gru_cell, state 25 | 26 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 27 | 28 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 29 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 30 | state = lstm_cell.zero_state(batch_size, tf.float32) 31 | 32 | return lstm_cell, state 33 | 34 | def applyActivation(x, activation): 35 | 36 | if activation == "tanh": 37 | return tf.nn.tanh(x) 38 | elif activation == "relu": 39 | return tf.nn.relu(x) 40 | elif activation == "sigmoid": 41 | return tf.nn.sigmoid(x) 42 | elif activation == "relu6": 43 | return tf.nn.relu6(x) 44 | elif activation == "softmax": 45 | return tf.nn.softmax(x) 46 | else: return None 47 | 48 | def length(sequence): 49 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 50 | length = tf.reduce_sum(used, reduction_indices=1) 51 | length = tf.cast(length, tf.int32) 52 | return length 53 | 54 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 55 | 56 | x = tf.reshape(x, [-1, in_shape]) 57 | 58 | with tf.variable_scope(scope): 59 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) 60 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 61 | logits = tf.add(tf.matmul(x, w), b) 62 | output = applyActivation(logits, 
activation) 63 | return output, logits 64 | 65 | def compute_cost(logit, y, out_type, max_length, batch_size, embed_dim, activation): 66 | if out_type=="last_only": 67 | cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit) 68 | cost = tf.reduce_mean(cost, reduction_indices=1) 69 | else: 70 | pred_out = applyActivation(logit, activation) 71 | pred_out = tf.reshape(pred_out, [batch_size, max_length, embed_dim]) 72 | mse = tf.reduce_mean(tf.square(tf.subtract(y, pred_out)), reduction_indices=2) 73 | mask = tf.sign(tf.reduce_max(tf.abs(y), reduction_indices=2)) 74 | mse *= mask 75 | mse = tf.reduce_sum(mse, reduction_indices=1) 76 | mse /= tf.cast(length(y), tf.float32) 77 | cost = mse 78 | cost = tf.reduce_mean(cost, reduction_indices=0) 79 | print "final cost shape: ", cost.get_shape() 80 | return cost 81 | -------------------------------------------------------------------------------- /auto/src_final/util/tf_utils_old.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | else: return None 39 | 40 | 41 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 42 | 43 | x = tf.reshape(x, [-1, in_shape]) 44 | 45 | with tf.variable_scope(scope): 46 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 47 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 48 | logits = tf.add(tf.matmul(x, w), b) 49 | output = applyActivation(logits, activation) 50 | # Compute the mean-squared-error 51 | cost = tf.reduce_mean(tf.square(tf.subtract(y , output))) 52 | 53 | return cost, output 54 | -------------------------------------------------------------------------------- /auto/src_final/util/tf_utils_reg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, reg_const, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = 
[in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2), regularizer=tf.contrib.layers.l2_regularizer(reg_const)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | elif activation == "softmax": 39 | return tf.nn.softmax(x) 40 | else: return None 41 | 42 | def length(sequence): 43 | used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2)) 44 | length = tf.reduce_sum(used, reduction_indices=1) 45 | length = tf.cast(length, tf.int32) 46 | return length 47 | 48 | def predictionLayer(x, y, in_shape, out_shape, activation, reg_const, scope="prediction"): 49 | 50 | x = tf.reshape(x, [-1, in_shape]) 51 | 52 | with tf.variable_scope(scope): 53 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2), regularizer=tf.contrib.layers.l2_regularizer(reg_const)) 54 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 55 | logits = tf.add(tf.matmul(x, w), b) 56 | output = applyActivation(logits, activation) 57 | return output, logits 58 | 59 | def compute_cost(logit, y, out_type, max_length, batch_size, embed_dim, activation): 60 | if out_type=="last_only": 61 | cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logit) 62 | cost = tf.reduce_mean(cost, reduction_indices=1) 63 | else: 64 | pred_out = applyActivation(logit, activation) 65 | pred_out = tf.reshape(pred_out, [batch_size, max_length, embed_dim]) 66 | mse = tf.reduce_mean(tf.square(tf.subtract(y, pred_out)), reduction_indices=2) 67 | mask = tf.sign(tf.reduce_max(tf.abs(y), reduction_indices=2)) 68 | mse *= mask 69 | mse = tf.reduce_sum(mse, reduction_indices=1) 70 | mse /= tf.cast(length(y), tf.float32) 71 | cost = mse 72 | cost = tf.reduce_mean(cost, reduction_indices=0) 73 | print "final cost shape: ", cost.get_shape() 74 | return cost 75 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/data/.gitkeep -------------------------------------------------------------------------------- /data/helpX: -------------------------------------------------------------------------------- 1 | cnumpy.core.multiarray 2 | _reconstruct 3 | p1 4 | (cnumpy 5 | ndarray 6 | p2 7 | (I0 8 | tS'b' 9 | tRp3 10 | (I1 11 | (I14216263 12 | I10 13 | I64 14 | tcnumpy 15 | dtype 16 | p4 17 | (S'f8' 18 | I0 19 | I1 20 | 
tRp5 21 | (I3 22 | S'<' 23 | NNNI-1 24 | I-1 25 | I0 26 | tbI00 27 | -------------------------------------------------------------------------------- /data/test_Category.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/data/test_Category.json.gz -------------------------------------------------------------------------------- /data/test_Helpful.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/data/test_Helpful.json.gz -------------------------------------------------------------------------------- /data/train.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/data/train.json.gz -------------------------------------------------------------------------------- /logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/logs/.gitkeep -------------------------------------------------------------------------------- /scripts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/scripts/.gitkeep -------------------------------------------------------------------------------- /scripts/killZk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # alternatively, you can just exit your terminal. 
3 | ~/zookeeper-3.4.6/bin/zkServer.sh stop 4 | -------------------------------------------------------------------------------- /scripts/newTerminalMac.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo ' 4 | on run argv 5 | if length of argv is equal to 0 6 | set command to "" 7 | else 8 | set command to item 1 of argv 9 | end if 10 | 11 | if length of argv is greater than 1 12 | set profile to item 2 of argv 13 | runWithProfile(command, profile) 14 | else 15 | runSimple(command) 16 | end if 17 | end run 18 | 19 | on runSimple(command) 20 | tell application "Terminal" 21 | activate 22 | set newTab to do script(command) 23 | end tell 24 | return newTab 25 | end runSimple 26 | 27 | on runWithProfile(command, profile) 28 | set newTab to runSimple(command) 29 | tell application "Terminal" to set current settings of newTab to (first settings set whose name is profile) 30 | end runWithProfile 31 | ' | osascript - "$@" > /dev/null -------------------------------------------------------------------------------- /scripts/startKafkaServer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "starting kafka client" 3 | 4 | KAFKAHOME="$HOME/kafka-0.10.1.1-src" 5 | 6 | # Added the sudo due to file permissions being messed up 7 | sudo $KAFKAHOME/bin/kafka-server-start.sh $KAFKAHOME/config/server.properties 8 | -------------------------------------------------------------------------------- /scripts/startNimbus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo ~/apache-storm-1.0.3/bin/storm nimbus 3 | -------------------------------------------------------------------------------- /scripts/startStormUI.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo ~/apache-storm-1.0.3/bin/storm ui 3 | -------------------------------------------------------------------------------- /scripts/startSupervisor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo ~/apache-storm-1.0.3/bin/storm supervisor 3 | -------------------------------------------------------------------------------- /scripts/startZK.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Starting Zookeeper" 3 | ~/zookeeper-3.4.6/bin/zkServer.sh start 4 | -------------------------------------------------------------------------------- /scripts/startZKClient.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "starting zookeeper client" 3 | ~/zookeeper-3.4.6/bin/zkCli.sh 4 | -------------------------------------------------------------------------------- /scripts/systemStartMac.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ~/tweetnet/scripts/startZK.sh 3 | sleep 5 4 | ~/tweetnet/scripts/newTerminalMac.sh ~/tweetnet/scripts/startZKClient.sh 5 | sleep 5 6 | ~/tweetnet/scripts/newTerminalMac.sh ~/tweetnet/scripts/startNimbus.sh 7 | sleep 5 8 | ~/tweetnet/scripts/newTerminalMac.sh ~/tweetnet/scripts/startSupervisor.sh 9 | sleep 5 10 | ~/tweetnet/scripts/newTerminalMac.sh ~/tweetnet/scripts/startStormUI.sh 11 | sleep 6 12 | ~/tweetnet/scripts/newTerminalMac.sh ~/tweetnet/scripts/startKafkaServer.sh 13 | -------------------------------------------------------------------------------- 
/scripts/systemStartUbuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo ~/tweetnet/scripts/startZK.sh 3 | sleep 5 4 | gnome-terminal -e ~/tweetnet/scripts/startZKClient.sh 5 | sleep 5 6 | gnome-terminal -e ~/tweetnet/scripts/startNimbus.sh 7 | sleep 5 8 | gnome-terminal -e ~/tweetnet/scripts/startSupervisor.sh 9 | sleep 5 10 | gnome-terminal -e ~/tweetnet/scripts/startStormUI.sh 11 | sleep 5 12 | gnome-terminal -e ~/tweetnet/scripts/startKafkaServer.sh 13 | -------------------------------------------------------------------------------- /scripts/userRunAPI.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Starting up TweetFeeder." 3 | 4 | STORMPATH="$HOME/apache-storm-0.9.5/lib/*" 5 | TWITTERPATH="$HOME/twitter4j-4.0.4/lib/*" 6 | CLASSPATH="$HOME/tweetnet/src/storm/" 7 | 8 | javac -cp $STORMPATH:$TWITTERPATH ~/tweetnet/src/storm/TwitterStreamSpout.java ~/tweetnet/src/storm/TwitterCleanerBolt.java ~/tweetnet/src/storm/TwitterStorm.java 9 | 10 | java -cp $STORMPATH:$TWITTERPATH:$CLASSPATH TwitterStorm #append_twitter_credentials_here 11 | -------------------------------------------------------------------------------- /src/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/src/.gitkeep -------------------------------------------------------------------------------- /src/models/c2c_cooccurence.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | from numpy import random 4 | from random import shuffle 5 | from os.path import expanduser 6 | import time 7 | import sys 8 | import os 9 | 10 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 11 | 12 | from logger import logger 13 | 14 | def create_coocc_dict(trainHt): 15 | coocc_dict = {} 16 | for htstr in trainHt: 17 | ht = htstr.split(" ") 18 | if coocc_dict.get(ht[0]) == None: 19 | coocc_dict[ht[0]] = {ht[1]:1} 20 | else: 21 | if coocc_dict[ht[0]].get(ht[1]) == None: 22 | coocc_dict[ht[0]][ht[1]] = 1 23 | else: 24 | coocc_dict[ht[0]][ht[1]] += 1 25 | return coocc_dict 26 | 27 | def predict(testHt, coocc_dict): 28 | correct = 0 29 | name = "cooc" + time.strftime("%Y-%m-%d_%H:%M") + ".log" 30 | log = [] 31 | 32 | for htstr in testHt: 33 | ht = htstr.split(" ") 34 | if coocc_dict.get(ht[0]) == None: 35 | continue 36 | dic = coocc_dict[ht[0]] 37 | dic_key = dic.keys() 38 | dic_val = dic.values() 39 | idx = np.argsort(dic_val) 40 | prediction = [] 41 | for i in range(topN): 42 | if i < len(dic_val): 43 | prediction.append(dic_key[idx[i]]) 44 | isCorrect = False 45 | if ht[1] in prediction: 46 | correct += 1 47 | isCorrect = True 48 | 49 | log.append([ht[0],ht[1],isCorrect,prediction]) 50 | 51 | accuracy=correct*1.0/len(testHt) 52 | log.append([correct,accuracy]) 53 | 54 | logger(log,name) 55 | 56 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/englishHashtag.pkl"), "rb")) 57 | hashtagFreq = pickle.load(open(expanduser("~/tweetnet/data/hashtagFreq.pkl"), "rb")) 58 | 59 | idx_shuf = range(len(hashtags)) 60 | shuffle(idx_shuf) 61 | freqThreshold = 84 62 | hashtagFreqCnt = {} 63 | hashtags_shuf = [] 64 | 65 | for i in idx_shuf: 66 | ht = hashtags[i].split(" ") 67 | if hashtagFreq[ht[2]] >= freqThreshold: 68 | if hashtagFreqCnt.get(ht[2]) == None: 69 | 70 | 
hashtagFreqCnt[ht[2]] = 1 71 | hashtags_shuf.append(ht[1] + " " + ht[2]) 72 | 73 | elif hashtagFreqCnt[ht[2]] < freqThreshold: 74 | 75 | hashtagFreqCnt[ht[2]] += 1 76 | hashtags_shuf.append(ht[1] + " " + ht[2]) 77 | 78 | hashtags = hashtags_shuf 79 | 80 | trainPercent = 0.95 81 | nTrainData = np.round(len(hashtags)*trainPercent).astype(int) 82 | topN = 4 83 | trainHt = hashtags[0:nTrainData] 84 | testHt = hashtags[nTrainData:] 85 | coocc_dict = create_coocc_dict(trainHt) 86 | predict(testHt, coocc_dict) 87 | 88 | 89 | -------------------------------------------------------------------------------- /src/models/c2c_cooccurenceNonUniform.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | from os.path import expanduser 4 | import time 5 | import sys 6 | import os 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 9 | 10 | from logger import logger 11 | 12 | def create_coocc_dict(trainHt): 13 | coocc_dict = {} 14 | for htstr in trainHt: 15 | ht = htstr.split(" ") 16 | if coocc_dict.get(ht[1]) == None: 17 | coocc_dict[ht[1]] = {ht[2]:1} 18 | else: 19 | if coocc_dict[ht[1]].get(ht[2]) == None: 20 | coocc_dict[ht[1]][ht[2]] = 1 21 | else: 22 | coocc_dict[ht[1]][ht[2]] += 1 23 | return coocc_dict 24 | 25 | def predict(testHt, coocc_dict): 26 | correct = 0 27 | name = "cooc" + time.strftime("%Y-%m-%d_%H:%M") + ".log" 28 | log = [] 29 | 30 | for htstr in testHt: 31 | ht = htstr.split(" ") 32 | if coocc_dict.get(ht[1]) == None: 33 | continue 34 | dic = coocc_dict[ht[1]] 35 | dic_key = dic.keys() 36 | dic_val = dic.values() 37 | idx = np.argsort(dic_val) 38 | prediction = [] 39 | for i in range(topN): 40 | if i < len(dic_val): 41 | prediction.append(dic_key[idx[i]]) 42 | isCorrect = False 43 | if ht[2] in prediction: 44 | correct += 1 45 | isCorrect = True 46 | 47 | log.append([ht[1],ht[2],isCorrect,prediction]) 48 | 49 | accuracy=correct*1.0/len(testHt) 50 | log.append([correct,accuracy]) 51 | 52 | logger(log,name) 53 | 54 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/englishHashtag.pkl"), "rb")) 55 | trainPercent = 0.99 56 | nTrainData = np.round(len(hashtags)*trainPercent).astype(int) 57 | topN = 10 58 | trainHt = hashtags[0:nTrainData] 59 | testHt = hashtags[nTrainData + 1 :] 60 | coocc_dict = create_coocc_dict(trainHt) 61 | predict(testHt, coocc_dict) 62 | 63 | 64 | -------------------------------------------------------------------------------- /src/models/cascKeras.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Input, Dense, Embedding, LSTM, merge 2 | from keras.models import Model 3 | 4 | # this returns a tensor 5 | contextX, contexty, taskX, taskY = loadData(); 6 | 7 | text_input = Input(shape=(100,), dtype='float32', name='text_input') 8 | 9 | lstm_body = lstm(32)(text_input) 10 | 11 | lstm_context = lstm(32)(lstm_body) 12 | fc_context = Dense(256)(lstm_context) 13 | out_context = Dense(300)(fc_context) 14 | 15 | lstm_task = lstm(32)(lstm_body) 16 | fc_task = Dense(256)(lstm_task) 17 | fc_out = Dense(300)(fc_task) -------------------------------------------------------------------------------- /src/models/cascade.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davisliang/automated/5e07e5802a604036fc6295ed3538d596936fca0e/src/models/cascade.py 
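cascKeras.py above is a sketch rather than runnable code: lstm should be the LSTM layer class, the shared LSTM needs return_sequences=True before further LSTMs can consume its output, loadData is never defined or imported, the Input shape is 2-D where an LSTM expects (timesteps, features), and no Model is ever assembled or compiled (mtlKeras.py further below repeats the same snippet). A self-contained version of the same shared-body, two-head idea, using hypothetical shapes and random arrays in place of the real loader, could look like this (Keras 1.x functional API, in keeping with the rest of the repo):

import numpy as np
from keras.layers import Input, Dense, LSTM
from keras.models import Model

max_len, embed_dim, n_samples = 30, 300, 256              # placeholder sizes, not the repo's
text_x = np.random.rand(n_samples, max_len, embed_dim)    # stands in for loadData()
context_y = np.random.rand(n_samples, embed_dim)
task_y = np.random.rand(n_samples, embed_dim)

text_input = Input(shape=(max_len, embed_dim), name='text_input')

# shared body; return the full sequence so each head can run its own LSTM on it
lstm_body = LSTM(32, return_sequences=True)(text_input)

# secondary (context) head
lstm_context = LSTM(32)(lstm_body)
out_context = Dense(embed_dim, name='context_out')(Dense(256, activation='relu')(lstm_context))

# primary task head
lstm_task = LSTM(32)(lstm_body)
out_task = Dense(embed_dim, name='task_out')(Dense(256, activation='relu')(lstm_task))

model = Model(text_input, [out_context, out_task])
model.compile(optimizer='rmsprop', loss='mse')
model.fit(text_x, [context_y, task_y], nb_epoch=1, batch_size=128)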
-------------------------------------------------------------------------------- /src/models/contextToContext.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import os 3 | import sys 4 | import numpy 5 | from numpy import shape 6 | from numpy import random 7 | from random import shuffle 8 | import cPickle as pickle 9 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 10 | import time 11 | from keras.utils import np_utils 12 | from keras.models import Sequential 13 | from keras.layers import Dense 14 | from keras.layers import Activation 15 | from keras.optimizers import SGD 16 | from keras.optimizers import RMSprop 17 | from keras.layers import BatchNormalization 18 | import keras.callbacks 19 | from logger import logger 20 | from predContext import predContext, createHtDict 21 | from keras.layers import PReLU 22 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/englishHashtag.pkl"),"rb")) 23 | dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb")) 24 | hashtagFreq = pickle.load(open(expanduser("~/tweetnet/data/hashtagFreq.pkl"), "rb")) 25 | 26 | idx_shuf = range(len(hashtags)) 27 | #shuffle(idx_shuf) 28 | freqThreshold = 84 29 | hashtags_shuf = [] 30 | context_shuf = [] 31 | hashtagFreqCnt = {} 32 | 33 | for i in idx_shuf: 34 | ht = hashtags[i].split(" ") 35 | if hashtagFreq[ht[2]] >= freqThreshold: 36 | if hashtagFreqCnt.get(ht[2]) == None: 37 | 38 | hashtagFreqCnt[ht[2]] = 1 39 | hashtags_shuf.append(ht[2]) 40 | context_shuf.append(ht[1]) 41 | 42 | elif hashtagFreqCnt[ht[2]] < freqThreshold: 43 | 44 | hashtagFreqCnt[ht[2]] += 1 45 | hashtags_shuf.append(ht[2]) 46 | context_shuf.append(ht[1]) 47 | 48 | data = numpy.zeros([len(hashtags_shuf),300]) 49 | label = numpy.zeros([len(hashtags_shuf),300]) 50 | inputStringLabel = [] 51 | outputStringLabel = [] 52 | for i in range(len(hashtags_shuf)): 53 | data[i,:]=dictionary[context_shuf[i]] 54 | label[i,:]=dictionary[hashtags_shuf[i]] 55 | inputStringLabel.append(context_shuf[i]) 56 | outputStringLabel.append(hashtags_shuf[i]) 57 | 58 | htDic = createHtDict(dictionary, outputStringLabel) 59 | 60 | # Train and Test split 61 | trainPercent = 0.9 62 | nTrainData = numpy.round(len(data)*trainPercent).astype(int) 63 | topN = 4 64 | nEpoch = 5000 65 | logAllPredictions = True 66 | trainData = data[0 : nTrainData] 67 | testData = data[nTrainData :] 68 | testInputStringLabel = inputStringLabel[nTrainData:] 69 | print testData.shape 70 | trainLabel = label[0 : nTrainData] 71 | testOutputStringLabel = outputStringLabel[nTrainData:] 72 | 73 | 74 | model = Sequential() 75 | 76 | model.add(Dense(512, input_shape=(300,))) 77 | model.add(PReLU()) 78 | model.add(BatchNormalization()) 79 | 80 | model.add(Dense(512)) 81 | model.add(PReLU()) 82 | model.add(BatchNormalization()) 83 | 84 | model.add(Dense(300)) 85 | model.add(PReLU()) 86 | 87 | optimizer = RMSprop(lr=0.005) 88 | model.compile(loss='mse', optimizer=optimizer) 89 | 90 | model.summary() 91 | 92 | name = "c2c" + time.strftime("%Y-%m-%d_%H:%M") + ".log" 93 | for epoch in range(nEpoch): 94 | model.fit(trainData, trainLabel, nb_epoch=1, batch_size=128, validation_split=0.1) 95 | 96 | correctCnt = 0 97 | randIdx = numpy.random.randint(0, len(testData), 10) 98 | log = [] 99 | log.append([epoch]) 100 | for testIdx in range(len(testData)): 101 | modelOutput = model.predict(numpy.expand_dims(testData[testIdx, :], axis=0)) 102 | topNht, isCorrect, topNdist = 
predContext(htDic, modelOutput, topN, testOutputStringLabel[testIdx]) 103 | if isCorrect: 104 | correctCnt += 1.0 105 | if logAllPredictions: 106 | #verbose logging 107 | log.append([testInputStringLabel[testIdx],testOutputStringLabel[testIdx],isCorrect,topNht]) 108 | 109 | accuracy = correctCnt*1.0 / len(testData) 110 | #always log accuracy 111 | log.append([correctCnt, accuracy]) 112 | logger(log,name) 113 | 114 | -------------------------------------------------------------------------------- /src/models/contextToContextNonUniform.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import os 3 | import sys 4 | import numpy 5 | from numpy import shape 6 | import cPickle as pickle 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 8 | import time 9 | from keras.utils import np_utils 10 | from keras.models import Sequential 11 | from keras.layers import Dense 12 | from keras.layers import Activation 13 | from keras.optimizers import SGD 14 | from keras.optimizers import RMSprop 15 | from keras.layers import BatchNormalization 16 | import keras.callbacks 17 | from logger import logger 18 | from predContext import predContext, createHtDict 19 | from keras.layers import PReLU 20 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/englishHashtag.pkl"),"rb")) 21 | dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb")) 22 | 23 | data = numpy.zeros([len(hashtags),300]) 24 | label = numpy.zeros([len(hashtags),300]) 25 | inputStringLabel = [] 26 | outputStringLabel = [] 27 | for i in range(len(hashtags)): 28 | line = hashtags[i] 29 | listHashtag = line.split() 30 | data[i,:]=dictionary[listHashtag[1]] 31 | label[i,:]=dictionary[listHashtag[2]] 32 | inputStringLabel.append(listHashtag[1]) 33 | outputStringLabel.append(listHashtag[2]) 34 | 35 | htDic = createHtDict(dictionary, outputStringLabel) 36 | 37 | # Train and Test split 38 | trainPercent = 0.99 39 | nTrainData = numpy.round(len(data)*trainPercent).astype(int) 40 | topN = 10 41 | nEpoch = 5000 42 | logAllPredictions = True 43 | trainData = data[0 : nTrainData] 44 | testData = data[nTrainData + 1 :] 45 | testInputStringLabel = inputStringLabel[nTrainData + 1 :] 46 | print testData.shape 47 | trainLabel = label[0 : nTrainData] 48 | testOutputStringLabel = outputStringLabel[nTrainData + 1 :] 49 | 50 | 51 | model = Sequential() 52 | 53 | model.add(Dense(512, input_shape=(300,))) 54 | model.add(PReLU()) 55 | model.add(BatchNormalization()) 56 | 57 | model.add(Dense(512)) 58 | model.add(PReLU()) 59 | model.add(BatchNormalization()) 60 | 61 | model.add(Dense(300)) 62 | model.add(PReLU()) 63 | 64 | optimizer = RMSprop(lr=0.005) 65 | model.compile(loss='mse', optimizer=optimizer) 66 | 67 | name = "c2c" + time.strftime("%Y-%m-%d_%H:%M") + ".log" 68 | for epoch in range(nEpoch): 69 | model.fit(trainData, trainLabel, nb_epoch=1, batch_size=128, validation_split=0.1) 70 | 71 | correctCnt = 0 72 | randIdx = numpy.random.randint(0, len(testData), 10) 73 | log = [] 74 | log.append([epoch]) 75 | for testIdx in range(len(testData)): 76 | modelOutput = model.predict(numpy.expand_dims(testData[testIdx, :], axis=0)) 77 | topNht, isCorrect, topNdist = predContext(htDic, modelOutput, topN, testOutputStringLabel[testIdx]) 78 | if isCorrect: 79 | correctCnt += 1.0 80 | if logAllPredictions: 81 | #verbose logging 82 | log.append([testInputStringLabel[testIdx],testOutputStringLabel[testIdx],isCorrect,topNht]) 83 | 84 | 
accuracy = correctCnt*1.0 / len(testData) 85 | #always log accuracy 86 | log.append([correctCnt, accuracy]) 87 | logger(log,name) 88 | 89 | -------------------------------------------------------------------------------- /src/models/mtlKeras.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Input, Dense, Embedding, LSTM, merge 2 | from keras.models import Model 3 | 4 | # this returns a tensor 5 | contextX, contexty, taskX, taskY = loadData(); 6 | 7 | text_input = Input(shape=(100,), dtype='float32', name='text_input') 8 | 9 | lstm_body = lstm(32)(text_input) 10 | 11 | lstm_context = lstm(32)(lstm_body) 12 | fc_context = Dense(256)(lstm_context) 13 | out_context = Dense(300)(fc_context) 14 | 15 | lstm_task = lstm(32)(lstm_body) 16 | fc_task = Dense(256)(lstm_task) 17 | fc_out = Dense(300)(fc_task) 18 | 19 | 20 | -------------------------------------------------------------------------------- /src/models/tc2c.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Keras backend. LSTM Model. 3 | 4 | Using standard default range: 5 | Input: (65x1) 64 unique chars, 1 EOS char 6 | Output: (65x1) 64 unique chars, 1 EOS char 7 | 8 | ''' 9 | import cPickle as pickle 10 | import numpy as np 11 | import h5py 12 | import os 13 | import sys 14 | from os.path import expanduser 15 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 16 | from loadDataNewModel import loadData 17 | from predContext import predContext, createHtDict 18 | from keras.utils import np_utils 19 | from keras.models import Sequential 20 | from keras.layers import LSTM 21 | from keras.layers import Dense 22 | from keras.layers import PReLU 23 | from keras.layers import Activation 24 | from keras.layers.wrappers import TimeDistributed 25 | from keras.optimizers import RMSprop 26 | from keras.optimizers import Adagrad 27 | from keras.layers import Dropout 28 | from keras.layers import BatchNormalization 29 | from tweetGenerator_lstm import generateText 30 | from keras.callbacks import ModelCheckpoint 31 | from logger import logger 32 | import time 33 | 34 | #sequenceLength: sequence length (k in BPTTk) 35 | sequenceLength = 30 36 | #Number of symbols 37 | vocabLen = 66 38 | #train test split 39 | trainPercent = 0.9 40 | #threshold on hashtag frequency 41 | freqThreshold = 84 42 | 43 | logAllPredictions=True 44 | #X: [# Seuqences, 40 (sequenceLength), 65(inputSize)]. 
45 | #y: [# Sequences, 300] 46 | 47 | print("Start loading data ...") 48 | trainTweets, trainHashtags, testTweets, testHashtags, trainX, trainY, testX, testY, trainTweetSequence, trainHashtagSequence, testTweetSequence, testHashtagSequence, trainContextSequence, testContextSequence, dictionary, nUniqueHt = loadData({},np.array([]), sequenceLength, trainPercent, freqThreshold) 49 | print("Finished loading data") 50 | 51 | 52 | #initialize some hyper-parameters 53 | topN = np.ceil(0.05*nUniqueHt).astype(int) 54 | print topN 55 | 56 | 57 | #embeddingLength: size of the word embedding 58 | embeddingLength = 300 59 | 60 | #inputSize: size of each input vector (default: 365x1) 61 | inputSize = vocabLen + embeddingLength 62 | 63 | #numHiddenFirst: size of first hidden layer 64 | numHiddenFirst = 512 65 | 66 | #Number of testing/training tweets 67 | nTestData = len(testTweets) 68 | nTrainData = len(trainTweets) 69 | nTestSequences = len(testTweetSequence) 70 | nTrainSequences = len(trainTweetSequence) 71 | print "Number of testing sequences: ", nTestSequences 72 | print "Number of training sequences: ", nTrainSequences 73 | print "Number of testing tweets: ", nTestData 74 | print "Number of training tweets: ", nTrainData 75 | 76 | dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"))) 77 | 78 | # Create the hashtag dictionary 79 | htDic = createHtDict(dictionary, testHashtags) 80 | 81 | numEpochs=50 82 | 83 | #building cLSTM model 84 | #print("\n") 85 | print("Start building model ....") 86 | model = Sequential() 87 | 88 | #model.add(TimeDistributed(Dense(numHiddenFirst), input_shape=(sequenceLength, inputSize))) 89 | #model.add(BatchNormalization()) 90 | 91 | model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize))) 92 | model.add(BatchNormalization()) 93 | 94 | model.add(Dense(numHiddenFirst)) 95 | model.add(PReLU()) 96 | model.add(BatchNormalization()) 97 | 98 | model.add(Dense(embeddingLength)) 99 | model.add(PReLU()) 100 | 101 | optimizer = RMSprop(lr=0.005) 102 | 103 | model.compile(loss='mean_squared_error', optimizer=optimizer) 104 | print("Finished building model.") 105 | 106 | model.summary() 107 | 108 | name = "t2c"+time.strftime("%Y-%m-%d_%H:%M") + ".log" 109 | for epoch in range(numEpochs): 110 | 111 | model.fit(trainX, trainY, nb_epoch=1, batch_size=128) 112 | 113 | correctCnt = 0 114 | randIdx = np.random.randint(0, nTestData, 10) 115 | 116 | tweetCnt = 0 117 | tweetStartIdx = 0 118 | log = [] 119 | log.append([epoch]) 120 | for testIdx in range(nTestSequences): 121 | # Stack the windows (1 x 40 x 65) of each tweet as a 3D matrix (#windows x 40 x 65) 122 | if testTweetSequence[testIdx][-1] == chr(3): 123 | oneTweet = testX[tweetStartIdx:testIdx+1, :, :] 124 | modelOutput = model.predict(oneTweet) 125 | topNht, isCorrect, topNdist = predContext(htDic, modelOutput, topN, testHashtags[tweetCnt]) 126 | tweetStartIdx = testIdx + 1 127 | if isCorrect: 128 | correctCnt += 1 129 | isCorrect = True 130 | if tweetCnt in randIdx: 131 | print testTweets[tweetCnt][:-2] 132 | print "Given label is ", testContextSequence[testIdx] 133 | print "True label is ", testHashtags[tweetCnt] 134 | print "Top ", topN, " hashtags are ", topNht 135 | 136 | if logAllPredictions: 137 | log.append([testTweets[tweetCnt][:-2],testHashtags[tweetCnt],isCorrect,topNht]) 138 | tweetCnt += 1 139 | accuracy = correctCnt*1.0 / nTestData 140 | log.append([correctCnt, accuracy]) 141 | logger(log,name) 142 | 143 | 144 | 145 | 146 | 
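createHtDict and predContext are imported from the shared utils package and are not shown in this file. Judging from the call sites (a dictionary of candidate hashtag embeddings, the model's 300-d prediction, a top-N cutoff, and the true hashtag), they perform a nearest-neighbour lookup in word2vec space. A minimal sketch of that lookup, assuming Euclidean distance and mean-pooling over per-window predictions (both assumptions, not necessarily the repo's exact choices):

import numpy as np

def create_ht_dict(word2vec_dic, hashtags):
    # keep one embedding per unique candidate hashtag that has a word2vec entry
    ht_dic = {}
    for ht in set(hashtags):
        if ht in word2vec_dic:
            ht_dic[ht] = word2vec_dic[ht]
    return ht_dic

def pred_context(ht_dic, model_output, top_n, true_ht):
    # collapse (possibly several) per-window predictions into one 300-d query vector
    query = np.asarray(model_output).reshape(-1, 300).mean(axis=0)
    names = list(ht_dic.keys())
    vecs = np.vstack([ht_dic[n] for n in names])
    dists = np.linalg.norm(vecs - query, axis=1)   # Euclidean distance to every candidate
    order = np.argsort(dists)[:top_n]              # smallest distance first
    top_ht = [names[i] for i in order]
    return top_ht, true_ht in top_ht, dists[order]

Cosine similarity over L2-normalised vectors would be a drop-in alternative ranking.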
-------------------------------------------------------------------------------- /src/models/textToContext.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Keras backend. LSTM Model. 3 | 4 | Using standard default range: 5 | Input: (65x1) 64 unique chars, 1 EOS char 6 | Output: (65x1) 64 unique chars, 1 EOS char 7 | 8 | ''' 9 | import cPickle as pickle 10 | import numpy as np 11 | import h5py 12 | import os 13 | import sys 14 | from os.path import expanduser 15 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 16 | from loadDataT2C import loadData 17 | from predContext import predContext, createHtDict 18 | from keras.utils import np_utils 19 | from keras.models import Sequential 20 | from keras.layers import LSTM 21 | from keras.layers import Dense 22 | from keras.layers import PReLU 23 | from keras.layers import Activation 24 | from keras.layers.wrappers import Bidirectional 25 | from keras.optimizers import RMSprop 26 | from keras.optimizers import Adadelta 27 | from keras.optimizers import Adam 28 | from keras.layers import Dropout 29 | from keras.layers import BatchNormalization 30 | from tweetGenerator_lstm import generateText 31 | from keras.callbacks import ModelCheckpoint 32 | from keras.regularizers import l2, activity_l2 33 | 34 | from logger import logger 35 | import time 36 | #get the top N prediction of hashtags 37 | topN = 4 38 | #sequenceLength: sequence length (k in BPTTk) 39 | sequenceLength = 40 40 | #Number of symbols 41 | vocabLen = 66 42 | #train test split 43 | trainPercent = 0.9 44 | #freqThreshold for hashtags 45 | freqThreshold = 84 46 | logAllPredictions=True 47 | #X: [# Seuqences, 40 (sequenceLength), 65(inputSize)]. 48 | #y: [# Sequences, 300] 49 | 50 | print("Start loading data ...") 51 | trainTweets, trainHashtags, testTweets, testHashtags, trainX, trainY, testX, testY, trainTweetSequence, trainHashtagSequence, testTweetSequence, testHashtagSequence, dictionary = loadData({},np.array([]), sequenceLength, trainPercent, freqThreshold) 52 | print("Finished loading data") 53 | 54 | 55 | #initialize some hyper-parameters 56 | #inputSize: size of each input vector (default: 365x1) 57 | inputSize = vocabLen 58 | 59 | #outputSize: size of the word embedding 60 | outputSize = 300 61 | 62 | #numHiddenFirst: size of first hidden layer 63 | numHiddenFirst = 512 64 | 65 | #Number of testing/training tweets 66 | nTestData = len(testTweets) 67 | nTrainData = len(trainTweets) 68 | nTestSequences = len(testTweetSequence) 69 | nTrainSequences = len(trainTweetSequence) 70 | print "Number of testing sequences: ", nTestSequences 71 | print "Number of training sequences: ", nTrainSequences 72 | print "Number of testing tweets: ", nTestData 73 | print "Number of training tweets: ", nTrainData 74 | 75 | #for i in range(1000): 76 | # print (trainTweetSequence[i], trainHashtagSequence[i]) 77 | # Load word2vec dictionary 78 | dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"))) 79 | 80 | # Create the hashtag dictionary 81 | htDic = createHtDict(dictionary, testHashtags) 82 | 83 | numEpochs=50 84 | 85 | lamb = 0.0001 86 | #building cLSTM model 87 | #print("\n") 88 | print("Start building model ....") 89 | model = Sequential() 90 | 91 | #model.add(LSTM(numHiddenFirst, return_sequences=True, input_shape=(sequenceLength, inputSize))) 92 | 93 | model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize))) 94 | 95 | #model.add(BatchNormalization()) 96 | 97 | 
model.add(Dense(numHiddenFirst)) 98 | model.add(Activation('relu')) 99 | #model.add(PReLU()) 100 | #model.add(BatchNormalization()) 101 | 102 | model.add(Dense(outputSize)) 103 | model.add(Activation('tanh')) 104 | #model.add(PReLU()) 105 | #model.add(BatchNormalization()) 106 | 107 | #optimizer = RMSprop(lr=0.005) 108 | optimizer = Adam(lr=0.0001) 109 | model.compile(loss='mean_squared_error', optimizer=optimizer) 110 | print("Finished building model.") 111 | 112 | model.summary() 113 | 114 | name = "t2c"+time.strftime("%Y-%m-%d_%H:%M") + ".log" 115 | for epoch in range(numEpochs): 116 | 117 | model.fit(trainX, trainY, nb_epoch=1, batch_size=128) 118 | 119 | correctCnt = 0 120 | randIdx = np.random.randint(0, nTestData, 10) 121 | 122 | tweetCnt = 0 123 | tweetStartIdx = 0 124 | log = [] 125 | log.append([epoch]) 126 | for testIdx in range(nTestSequences): 127 | # Stack the windows (1 x 40 x 65) of each tweet as a 3D matrix (#windows x 40 x 65) 128 | if testTweetSequence[testIdx][-1] == chr(3): 129 | oneTweet = testX[tweetStartIdx:testIdx+1, :, :] 130 | modelOutput = model.predict(oneTweet) 131 | topNht, isCorrect, topNdist = predContext(htDic, modelOutput, topN, testHashtags[tweetCnt]) 132 | tweetStartIdx = testIdx + 1 133 | if isCorrect: 134 | correctCnt += 1 135 | isCorrect = True 136 | if tweetCnt in randIdx: 137 | print testTweets[tweetCnt][:-2] 138 | print "True label is ", testHashtags[tweetCnt] 139 | print "Top ", topN, " hashtags are ", topNht 140 | 141 | if logAllPredictions: 142 | log.append([testTweets[tweetCnt][:-2],testHashtags[tweetCnt],isCorrect,topNht]) 143 | tweetCnt += 1 144 | accuracy = correctCnt*1.0 / nTestData 145 | log.append([correctCnt, accuracy]) 146 | logger(log,name) 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /src/models/tweetnet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Keras backend. cLSTM Model. 3 | 4 | Using standard default range: 5 | Input: (365x1) 64 unique chars, 1 EOS char, 300 word2vec context 6 | Output: (65x1) 64 unique chars, 1 EOS char 7 | 8 | ''' 9 | import pickle as pkl 10 | import numpy as np 11 | from loadData import loadData 12 | from keras.utils import np_utils 13 | from keras.models import Sequential 14 | from keras.layers import LSTM 15 | from keras.layers import Dense 16 | from keras.layers import Activation 17 | from keras.optimizers import RMSprop 18 | from keras.optimizers import Adagrad 19 | from keras.layers import Dropout 20 | from keras.layers import BatchNormalization 21 | from tweetGenerator import generateText 22 | from keras.callbacks import ModelCheckpoint 23 | from os.path import expanduser 24 | print("Start loading data ...") 25 | data, dictLen, tweetLen, dictionary = loadData({},np.array([])) 26 | # data shape = #tweets x 141 x inputSize(365) 27 | print("Finished loading data") 28 | 29 | loadWeights=False 30 | 31 | #initialize some hyper-parameters 32 | #inputSize: size of each input vector (default: 365x1) 33 | inputSize = data.shape[2] 34 | #sequenceLength: sequence length (k in BPTTk) 35 | sequenceLength = 50 36 | #numHiddenFirst: size of first hidden layer 37 | numHiddenFirst = 512 38 | #numTweets: total number of tweets in dataset 39 | numTweets = data.shape[0] 40 | #seqPerSegment: sequences (of size sequenceLength) per mini-epoch. 41 | #Lowers maximum memory usage. 42 | seqPerSegment = 5000 43 | 44 | X = [] 45 | y = [] 46 | 47 | #create input and target datasets from loaded data. 
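# Sliding-window construction: each tweet i contributes one example per offset j,
# with the 50 (sequenceLength) consecutive 365-d vectors data[i, j:j+sequenceLength, :]
# as input and the one-hot character portion (first dictLen entries) of the vector at
# position j+sequenceLength as target, so X grows to [#windows, sequenceLength, 365]
# and y to [#windows, dictLen].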
48 | for i in range(numTweets): 49 | for j in range(0, int(tweetLen[i])-sequenceLength, 1): 50 | seq_in = data[i, j:j+sequenceLength, :] 51 | seq_out = data[i, j+sequenceLength, 0:dictLen] 52 | X.append(seq_in) 53 | y.append(seq_out) 54 | 55 | #X: [10000 (numTweets), 40 (sequenceLength), 365(inputSize)]. 56 | n_examples = len(X) 57 | numSegments = np.ceil(n_examples/seqPerSegment).astype(int) 58 | numEpochs=50 59 | #print('# of sequences per segments: ', seqPerSegment) 60 | #print('# of segments: ', numSegments) 61 | 62 | #building cLSTM model 63 | #print("\n") 64 | print("Start building model ....") 65 | model = Sequential() 66 | 67 | model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize), return_sequences=True)) 68 | model.add(LSTM(numHiddenFirst)) 69 | 70 | model.add(Dense(numHiddenFirst)) 71 | model.add(Activation('relu')) 72 | model.add(BatchNormalization()) 73 | 74 | model.add(Dense(numHiddenFirst)) 75 | model.add(Activation('relu')) 76 | model.add(BatchNormalization()) 77 | 78 | model.add(Dense(dictLen)) 79 | model.add(Activation('softmax')) 80 | 81 | optimizer = Adagrad() 82 | 83 | if(loadWeights==True): 84 | model.load_weights(expanduser("~/tweetnet/logs/intermediateWeights.hdf5")) 85 | 86 | 87 | model.compile(loss='categorical_crossentropy', optimizer=optimizer) 88 | print("Finished building model.") 89 | #define file checkpoint 90 | filePath = expanduser("~/tweetnet/logs/intermediateWeights.hdf5") 91 | checkPoint = ModelCheckpoint(filePath, monitor='loss', verbose=1) 92 | callbacksList = [checkPoint] 93 | 94 | #train on mini-epochs (sized seqPerSegment) to lower total RAM usage. 95 | for epoch in range(numEpochs): 96 | for seg in range(numSegments): 97 | print("\n") 98 | print "Segment: ", seg, "/", numSegments, " | Epoch: ", epoch, "/", numEpochs 99 | dataX = np.asarray(X[seg*seqPerSegment: (seg+1)*seqPerSegment]) 100 | datay = np.asarray(y[seg*seqPerSegment: (seg+1)*seqPerSegment]) 101 | #print("Input shape: ", dataX.shape) 102 | #print("Output shape: ", datay.shape) 103 | model.fit(dataX, datay, nb_epoch=1, batch_size=128, callbacks=callbacksList) 104 | 105 | generateText(dictionary, data, dictLen, tweetLen, X, y, 106 | inputSize, sequenceLength, numHiddenFirst, numTweets, seqPerSegment, 107 | n_examples, numSegments) 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /src/models/tweetnet_lstm.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Keras backend. LSTM Model. 
3 | 4 | Using standard default range: 5 | Input: (65x1) 64 unique chars, 1 EOS char 6 | Output: (65x1) 64 unique chars, 1 EOS char 7 | 8 | ''' 9 | import pickle as pkl 10 | import numpy as np 11 | import h5py 12 | from loadData_lstm import loadData 13 | from keras.utils import np_utils 14 | from keras.models import Sequential 15 | from keras.layers import LSTM 16 | from keras.layers import Dense 17 | from keras.layers import Activation 18 | from keras.optimizers import RMSprop 19 | from keras.optimizers import Adagrad 20 | from keras.layers import Dropout 21 | from keras.layers import BatchNormalization 22 | from tweetGenerator_lstm import generateText 23 | from keras.callbacks import ModelCheckpoint 24 | from os.path import expanduser 25 | 26 | 27 | #sequenceLength: sequence length (k in BPTTk) 28 | sequenceLength = 40 29 | 30 | # number of tweets to use 31 | nTweet = 120000 32 | print("Start loading data ...") 33 | X, y, vocabLen, dictionary, tweetSequence, nextChar, tweets = loadData({},np.array([]), sequenceLength, nTweet) 34 | print("Finished loading data") 35 | 36 | loadWeights=False 37 | 38 | #initialize some hyper-parameters 39 | #inputSize: size of each input vector (default: 365x1) 40 | inputSize = vocabLen 41 | print vocabLen 42 | #numHiddenFirst: size of first hidden layer 43 | numHiddenFirst = 128 44 | #seqPerSegment: sequences (of size sequenceLength) per mini-epoch. 45 | #Lowers maximum memory usage. 46 | seqPerSegment = 10000 47 | 48 | #X: [10000 (numTweets), 40 (sequenceLength), 65(inputSize)]. 49 | n_examples = len(X) 50 | numSegments = np.ceil(n_examples/seqPerSegment).astype(int) 51 | numEpochs=50 52 | print('# of sequences per segments: ', seqPerSegment) 53 | print('# of segments: ', numSegments) 54 | 55 | #building cLSTM model 56 | #print("\n") 57 | print("Start building model ....") 58 | model = Sequential() 59 | 60 | model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize))) 61 | #model.add(LSTM(numHiddenFirst)) 62 | 63 | #model.add(Dense(numHiddenFirst)) 64 | #model.add(Activation('relu')) 65 | #model.add(BatchNormalization()) 66 | 67 | #model.add(Dense(numHiddenFirst)) 68 | #model.add(Activation('relu')) 69 | #model.add(BatchNormalization()) 70 | 71 | model.add(Dense(vocabLen)) 72 | model.add(Activation('softmax')) 73 | 74 | optimizer = RMSprop(lr=0.01) 75 | 76 | if(loadWeights==True): 77 | model.load_weights(expanduser("~/tweetnet/logs/intermediateWeights.hdf5")) 78 | 79 | 80 | model.compile(loss='categorical_crossentropy', optimizer=optimizer) 81 | print("Finished building model.") 82 | #define file checkpoint 83 | #filePath = expanduser("~/tweetnet/logs/intermediateWeights.hdf5") 84 | #checkPoint = ModelCheckpoint(filePath, monitor='loss', verbose=1) 85 | #callbacksList = [checkPoint] 86 | 87 | #train on mini-epochs (sized seqPerSegment) to lower total RAM usage. 
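# Caveat: this file targets Python 2 (bare print statements), so n_examples/seqPerSegment
# above is integer division and np.ceil(...) cannot round it up; any final partial segment
# (n_examples % seqPerSegment examples) is silently skipped by the loop below.
# np.ceil(float(n_examples) / seqPerSegment).astype(int) would cover it.
# tweetnet.py computes numSegments the same way.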
88 | for epoch in range(numEpochs): 89 | # model.fit(X, y, nb_epoch=1, batch_size=128) 90 | # generateText(model, tweets, sequenceLength, vocabLen, dictionary) 91 | for seg in range(numSegments): 92 | print("\n") 93 | print "Segment: ", seg+1, "/", numSegments, " | Epoch: ", epoch, "/", numEpochs 94 | model.fit(X[seg*seqPerSegment: (seg+1)*seqPerSegment], y[seg*seqPerSegment: (seg+1)*seqPerSegment], nb_epoch=1, batch_size=128) 95 | generateText(model, tweets, sequenceLength, vocabLen, dictionary) 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /src/storm/TwitterStorm.java: -------------------------------------------------------------------------------- 1 | 2 | import java.util.*; 3 | import org.apache.storm.tuple.Fields; 4 | import org.apache.storm.tuple.Values; 5 | import org.apache.storm.Config; 6 | import org.apache.storm.LocalCluster; 7 | import org.apache.storm.topology.TopologyBuilder; 8 | import org.apache.storm.kafka.bolt.KafkaBolt; 9 | import org.apache.storm.kafka.bolt.selector.DefaultTopicSelector; 10 | import org.apache.storm.kafka.bolt.mapper.*; 11 | /** 12 | * Main class for storm topology. 13 | */ 14 | 15 | 16 | public class TwitterStorm { 17 | 18 | /** 19 | * The main method extracts user arguments (in runAPI.sh), and constructs 20 | * the topology. Optional Kill Command can be added at the end. 21 | * 22 | * @param args[] array of size 5. Last argument are 'keyword' arguments 23 | */ 24 | public static void main(String[] args) throws Exception{ 25 | 26 | //grab authentication tokens 27 | String consumerKey = args[0]; 28 | String consumerSecret = args[1]; 29 | String accessToken = args[2]; 30 | String accessTokenSecret = args[3]; 31 | 32 | //grab keyword tokens 33 | String[] arguments = args.clone(); 34 | String[] keyWords = Arrays.copyOfRange(arguments, 4, arguments.length); 35 | 36 | //create a new Storm configuration. 37 | Config config = new Config(); 38 | config.setDebug(true); 39 | 40 | //create a new topology. 41 | TopologyBuilder builder = new TopologyBuilder(); 42 | 43 | TwitterStreamSpout streamSpout = new TwitterStreamSpout( 44 | consumerKey,consumerSecret, accessToken, accessTokenSecret, keyWords); 45 | 46 | // streamSpout.scheme = new SchemeAsMultiScheme(new KafkaBoltKeyValueScheme()); 47 | builder.setSpout("streamSpout", streamSpout); 48 | 49 | TwitterCleanerBolt cleanerBolt = new TwitterCleanerBolt(); 50 | 51 | builder.setBolt("cleanerBolt", cleanerBolt).shuffleGrouping("streamSpout"); 52 | 53 | Properties props = new Properties(); 54 | props.put("bootstrap.servers", "localhost:9092"); 55 | props.put("acks", "1"); 56 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 57 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 58 | 59 | KafkaBolt kafkaBolt = new KafkaBolt() 60 | .withProducerProperties(props) 61 | .withTopicSelector(new DefaultTopicSelector("twitterstorm")) 62 | .withTupleToKafkaMapper(new FieldNameBasedTupleToKafkaMapper()); 63 | 64 | builder.setBolt("forwardToKafka", kafkaBolt).shuffleGrouping("cleanerBolt"); 65 | 66 | //submit topology to local cluster. 67 | LocalCluster cluster = new LocalCluster(); 68 | cluster.submitTopology("TwitterHashtagStorm", config, 69 | builder.createTopology()); 70 | //no kill condition. Run until manual kill command. 
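// The class comment above leaves room for an optional kill command; for a bounded run,
// something along the lines of
//     Thread.sleep(runtimeMillis);             // runtimeMillis chosen by the user
//     cluster.killTopology("TwitterHashtagStorm");
//     cluster.shutdown();
// would stop the local cluster cleanly instead of running until a manual kill.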
71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/storm/TwitterStreamSpout.java: -------------------------------------------------------------------------------- 1 | 2 | import java.util.Map; 3 | import java.util.concurrent.LinkedBlockingQueue; 4 | 5 | import twitter4j.FilterQuery; 6 | import twitter4j.StallWarning; 7 | import twitter4j.Status; 8 | import twitter4j.StatusDeletionNotice; 9 | import twitter4j.StatusListener; 10 | 11 | import twitter4j.TwitterStream; 12 | import twitter4j.TwitterStreamFactory; 13 | import twitter4j.auth.AccessToken; 14 | import twitter4j.conf.ConfigurationBuilder; 15 | 16 | import org.apache.storm.Config; 17 | import org.apache.storm.spout.SpoutOutputCollector; 18 | 19 | import org.apache.storm.task.TopologyContext; 20 | import org.apache.storm.topology.OutputFieldsDeclarer; 21 | import org.apache.storm.topology.base.BaseRichSpout; 22 | import org.apache.storm.tuple.Fields; 23 | import org.apache.storm.tuple.Values; 24 | 25 | import org.apache.storm.utils.Utils; 26 | 27 | /** 28 | * this class talks directly the the twitterAPI using the user credentials 29 | * in runAPI.sh. The data from this spout feeds into the parser and cleaner 30 | * bolts. 31 | */ 32 | @SuppressWarnings("serial") 33 | public class TwitterStreamSpout extends BaseRichSpout { 34 | SpoutOutputCollector _collector; 35 | LinkedBlockingQueue queue = null; 36 | TwitterStream _twitterStream; 37 | 38 | String consumerKey; 39 | String consumerSecret; 40 | String accessToken; 41 | String accessTokenSecret; 42 | String[] keyWords; 43 | 44 | /** 45 | * Constructor. 46 | * @param consumerKey Twitter API credential 47 | * @param consumerSecret Twitter API credential 48 | * @param accessToken Twitter API credential 49 | * @param accessTokenSecret Twitter API credential 50 | * @param keyWords array of words to filter for 51 | */ 52 | public TwitterStreamSpout(String consumerKey, String consumerSecret, 53 | String accessToken, String accessTokenSecret, String[] keyWords) { 54 | 55 | this.consumerKey = consumerKey; 56 | this.consumerSecret = consumerSecret; 57 | this.accessToken = accessToken; 58 | this.accessTokenSecret = accessTokenSecret; 59 | this.keyWords = keyWords; 60 | } 61 | 62 | /** 63 | * TO DO: default constructor is a stub. 64 | */ 65 | public TwitterStreamSpout() { 66 | // TODO Auto-generated constructor stub 67 | } 68 | 69 | /** 70 | * creates a new status blockingQueue and a statusListener. 
71 | * @param conf Storm configuration 72 | * @param context Storm context 73 | * @param collector Storm spout collector 74 | */ 75 | @Override 76 | public void open(Map conf, TopologyContext context, 77 | SpoutOutputCollector collector) { 78 | 79 | queue = new LinkedBlockingQueue(1000); 80 | _collector = collector; 81 | 82 | StatusListener listener = new StatusListener() { 83 | 84 | @Override 85 | public void onStatus(Status status) { 86 | queue.offer(status); 87 | } 88 | 89 | @Override 90 | public void onDeletionNotice(StatusDeletionNotice sdn) {} 91 | 92 | @Override 93 | public void onTrackLimitationNotice(int i) {} 94 | 95 | @Override 96 | public void onScrubGeo(long l, long l1) {} 97 | 98 | @Override 99 | public void onException(Exception ex) {} 100 | 101 | @Override 102 | public void onStallWarning(StallWarning arg0) { 103 | // TODO Auto-generated method stub 104 | } 105 | }; 106 | 107 | ConfigurationBuilder cb = new ConfigurationBuilder(); 108 | 109 | cb.setDebugEnabled(true) 110 | .setOAuthConsumerKey(consumerKey) 111 | .setOAuthConsumerSecret(consumerSecret) 112 | .setOAuthAccessToken(accessToken) 113 | .setOAuthAccessTokenSecret(accessTokenSecret); 114 | 115 | _twitterStream = new TwitterStreamFactory(cb.build()).getInstance(); 116 | _twitterStream.addListener(listener); 117 | 118 | if (keyWords.length == 0) { 119 | _twitterStream.sample(); 120 | }else { 121 | FilterQuery query = new FilterQuery().track(keyWords); 122 | _twitterStream.filter(query); 123 | } 124 | } 125 | 126 | /** 127 | * polls from the blocking queue to get next status. 128 | */ 129 | @Override 130 | public void nextTuple() { 131 | Status ret = queue.poll(); 132 | 133 | if (ret == null) { 134 | Utils.sleep(50); 135 | } else { 136 | _collector.emit(new Values(ret)); 137 | } 138 | } 139 | 140 | /** 141 | * closes twitter stream. 142 | */ 143 | @Override 144 | public void close() { 145 | _twitterStream.shutdown(); 146 | } 147 | 148 | /** 149 | * worker node configurator. Default set to 1 local machine. 150 | */ 151 | @Override 152 | public Map getComponentConfiguration() { 153 | Config ret = new Config(); 154 | ret.setMaxTaskParallelism(1); 155 | return ret; 156 | } 157 | 158 | @Override 159 | public void ack(Object id) {} 160 | 161 | @Override 162 | public void fail(Object id) {} 163 | 164 | /** 165 | * Declare output field type. 
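 * The spout emits a single field named "tweet"; TwitterStorm wires it to the
 * cleaner bolt via shuffleGrouping("streamSpout").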
166 | */ 167 | @Override 168 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 169 | declarer.declare(new Fields("tweet")); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/storm/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | tweetnet 5 | twitter_storm 6 | 1.0-SNAPSHOT 7 | maven-plugin 8 | twitter_storm 9 | https://github.com/davisliang/tweetnet.git 10 | 11 | UTF-8 12 | 13 | 14 | 15 | central 16 | Central Repository 17 | http://repo.maven.apache.org/maven2 18 | default 19 | 20 | false 21 | 22 | 23 | 24 | github-releases 25 | http://oss.sonatype.org/content/repositories/github-releases/ 26 | 27 | 28 | clojars.org 29 | http://clojars.org/repo 30 | 31 | 32 | twitter4j 33 | http://twitter4j.org/maven2 34 | 35 | 36 | 37 | 38 | org.apache.storm 39 | storm-kafka-client 40 | 1.0.3 41 | 42 | 43 | org.apache.storm 44 | storm-kafka 45 | 1.0.3 46 | 47 | 48 | org.apache.kafka 49 | kafka-clients 50 | 0.10.0.0 51 | 52 | 53 | org.slf4j 54 | slf4j-log4j12 55 | 56 | 57 | log4j 58 | log4j 59 | 60 | 61 | 62 | 63 | org.apache.kafka 64 | kafka_2.10 65 | 0.10.2.0 66 | 67 | 68 | org.slf4j 69 | slf4j-log4j12 70 | 71 | 72 | log4j 73 | log4j 74 | 75 | 76 | 77 | 78 | org.twitter4j 79 | twitter4j-core 80 | 4.0.4 81 | 82 | 83 | org.twitter4j 84 | twitter4j-stream 85 | 4.0.4 86 | 87 | 88 | org.apache.storm 89 | storm-core 90 | 1.0.3 91 | 92 | 93 | 94 | 95 | . 96 | 97 | 98 | ${basedir}/multilang 99 | 100 | 101 | 102 | 103 | org.apache.maven.plugins 104 | maven-plugin-plugin 105 | 3.2 106 | 107 | 108 | true 109 | 110 | 111 | 112 | mojo-descriptor 113 | 114 | descriptor 115 | 116 | 117 | 118 | 119 | 120 | org.codehaus.mojo 121 | exec-maven-plugin 122 | 1.2.1 123 | 124 | TwitterStorm 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 138 | 139 | 140 | 141 | 142 | maven-clean-plugin 143 | 3.0.0 144 | 145 | 146 | 147 | . 148 | 149 | *.class 150 | *.log 151 | 152 | 153 | * 154 | 155 | false 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /src/utils/ReducedAsciiDictionary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReducedAsciiDictionary: 4 | """ Creates a dictionary that maps characters to an integer 5 | 6 | This is useful because not all characters are used and this 7 | method allows for low-dimensional character vectors. 8 | 9 | """ 10 | 11 | def __init__(self, dictionary, ranges): 12 | """ Initialize the dictionary based on a set of ranges 13 | 14 | The ranges parameter is a 2D numpy array, doubly inclusive. 15 | Recall that the function ord() gets the int value of a char. 16 | Recall that the function chr() gets the char value of an int. 17 | 18 | TO-DO: Add range array validity checker. 19 | 20 | """ 21 | 22 | self.dictionary = dictionary 23 | 24 | # default constructor runs if size is 0. 
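# The default ranges [[32,63],[96,127]] cover 64 characters in total:
# ASCII 32-63 (space, digits, most punctuation) and 96-127 (backtick,
# lowercase letters, braces and tilde), matching the 64-character
# vocabulary assumed by the models in this repo.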
25 | if(ranges.size == 0): 26 | self.ranges = np.array([[32,63],[96,127]]) 27 | else: 28 | self.ranges = ranges 29 | 30 | #build dictionary 31 | counter = 0 32 | numRanges = self.ranges.shape[0] 33 | for i in range(0,numRanges): 34 | start = self.ranges[i][0] 35 | end = self.ranges[i][1] 36 | for j in range(start,end+1): 37 | self.dictionary[chr(j)] = counter 38 | counter += 1 39 | 40 | -------------------------------------------------------------------------------- /src/utils/checkTrainTestDup.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import os 4 | from os.path import expanduser 5 | 6 | test_data = pickle.load(open(expanduser("~/tweetnet/data/test_data.pkl"))) 7 | train_data = pickle.load(open(expanduser("~/tweetnet/data/train_data.pkl"))) 8 | 9 | testX = test_data[0] 10 | trainX = train_data[0] 11 | 12 | idx = len(test_data)*np.random.rand(2000) 13 | 14 | n = 0 15 | cnt = 0 16 | for i in idx: 17 | print n 18 | n += 1 19 | test_x = testX[int(i), :, :] 20 | for j in range(len(trainX)): 21 | if np.array_equal(test_x, trainX[j, :, :]): 22 | cnt += 1 23 | print "Dup" 24 | break 25 | print cnt 26 | -------------------------------------------------------------------------------- /src/utils/dumpDedup.py: -------------------------------------------------------------------------------- 1 | # python dumpDedup.py > ~/tweetnet/data/dump.txt 2 | import numpy 3 | from os.path import expanduser 4 | 5 | dumpFile = open(expanduser("~/tweetnet/data/dumpBig.txt")) 6 | dumpLines = dumpFile.readlines() 7 | 8 | dumpSet = set() 9 | 10 | for i in range(len(dumpLines)): 11 | if dumpLines[i][0:4] == 'text': 12 | if dumpLines[i] in dumpSet: 13 | continue; 14 | else: 15 | dumpSet.add(dumpLines[i]) 16 | print "\n", 17 | print dumpLines[i], 18 | print dumpLines[i+1], 19 | -------------------------------------------------------------------------------- /src/utils/getEnglishHashTweets.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser 2 | import numpy 3 | import cPickle as pickle 4 | 5 | def checkHashtags(hashtagStr,dictionary): 6 | hasMultiEnglishHashtag = False 7 | returnHt = "hashtags:" 8 | htStr = hashtagStr[9:] 9 | htTokens = htStr.split(" ") 10 | nEnglishHashtag = 0 11 | for token in htTokens: 12 | try: 13 | if(len(dictionary[token])>0): 14 | returnHt = returnHt + " " + token 15 | nEnglishHashtag += 1 16 | except KeyError: 17 | continue 18 | if nEnglishHashtag >= 2: 19 | hasMultiEnglishHashtag = True 20 | return returnHt, hasMultiEnglishHashtag 21 | 22 | 23 | dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "r")) 24 | 25 | textFile = open(expanduser("~/tweetnet/data/dump.txt"), "r") 26 | fileLines = textFile.readlines() 27 | keepTweets = [] 28 | keepHashtags = [] 29 | counter = 0 30 | 31 | while((counter+1)= threshold: 19 | cnt += 1 20 | print "Threshold = ", threshold, " # Hashtags= ", cnt*threshold 21 | 22 | ht = dic.keys() 23 | freq = dic.values() 24 | idx = np.argsort(np.array(freq)) 25 | 26 | sortedHt = [] 27 | sortedFq = [] 28 | for i in idx: 29 | sortedHt.append(ht[i]) 30 | sortedFq.append(freq[i]) 31 | -------------------------------------------------------------------------------- /src/utils/loadData.py: -------------------------------------------------------------------------------- 1 | ''' 2 | loads twitter dataset from storm API. 
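Each tweet is one-hot encoded character by character with the reduced ASCII
dictionary, the hashtag embedding associated with the tweet is appended to
every timestep, and an end-of-tweet marker is set at index len(dictionary).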
3 | ''' 4 | import numpy as np 5 | import cPickle as pickle 6 | from ReducedAsciiDictionary import ReducedAsciiDictionary 7 | from os.path import expanduser 8 | 9 | def loadData(dictionary,ranges): 10 | ''' Creates dataset based on dictionary, a set of ascii 11 | ranges, and pickled twitter data from Apache Storm. 12 | 13 | X: (numTweets, 141, dictionaryLength + embeddings length) 14 | vocabLen: (dictionary length) 15 | tweetLength: (numTweets) 16 | ''' 17 | 18 | #load tweets and hashtag embeddings 19 | tweets = pickle.load(open(expanduser("~/tweetnet/data/preprocessed_new_tweets.pkl"),"rb")) 20 | embeddings = pickle.load(open(expanduser("~/tweetnet/data/new_embeddings.pkl"),"rb")) 21 | 22 | #visualize data 23 | #print "tweets (ELEMENT TYPE): ", type(tweets[0]) 24 | #print "tweets (Number Of Tweets): ", len(tweets) 25 | #print "hashtag (ELEMENT TYPE): ", type(embeddings[0]) 26 | #print "hashtag (SHAPE): ", embeddings.shape 27 | 28 | #create character dictionary for tweets. 29 | dictionary = ReducedAsciiDictionary({},ranges).dictionary 30 | 31 | #total number of tweets 32 | numData = len(tweets) 33 | 34 | #number of unique characters in dataset 35 | vocabLen = len(dictionary)+1 36 | 37 | #initialize datastore arrays 38 | X = np.zeros([numData, 140+1, vocabLen + embeddings.shape[1]]) 39 | tweetLength = np.zeros(numData) 40 | 41 | # for each tweet create onehot encoding for each character 42 | for twt in range(numData): 43 | if(twt%1000==0): 44 | print "loaded: ", twt, " of ", numData 45 | tweetLength[twt] = len(tweets[twt])-6+1 46 | currTweet = tweets[twt][6:len(tweets[twt])] 47 | 48 | for ch in range(len(currTweet)): 49 | oneHotIndex = dictionary.get(currTweet[ch]) 50 | X[twt,ch,oneHotIndex] = 1 51 | 52 | for embIndex in range(embeddings.shape[1]): 53 | X[twt,ch,embIndex+vocabLen] = embeddings[twt,embIndex] 54 | #end of tweet character (EOS) 55 | X[twt,len(currTweet),len(dictionary)]=1 56 | 57 | return X, vocabLen, tweetLength, dictionary 58 | -------------------------------------------------------------------------------- /src/utils/loadDataT2C.py: -------------------------------------------------------------------------------- 1 | ''' 2 | loads twitter dataset from storm API. 3 | ''' 4 | import sys 5 | import os 6 | import numpy as np 7 | import cPickle as pickle 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 9 | from ReducedAsciiDictionary import ReducedAsciiDictionary 10 | from getEnglishHashTweets import checkHashtags 11 | from numpy import random 12 | from random import shuffle 13 | from os.path import expanduser 14 | 15 | def loadData(dictionary,ranges,sequenceLength,trainPercent, freqThreshold): 16 | ''' Creates dataset based on dictionary, a set of ascii 17 | ranges, and pickled twitter data from Apache Storm. 
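    Tweets are first rebalanced by hashtag frequency (via normalizeByFreq and
    freqThreshold), split into train/test sets by trainPercent, and then cut
    into overlapping character sequences of length sequenceLength.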
18 | 19 | 20 | X: [#sequences, 40, 65] 21 | y: [#sequences, 300] 22 | vocabLen: (dictionary length) 23 | tweetLength: (numTweets) 24 | ''' 25 | 26 | #load tweets with >=2 hashtags and corresponding english hashtags 27 | tweets = pickle.load(open(expanduser("~/tweetnet/data/multitaskTweets.pkl"), "rb")) 28 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/multitaskHashtags.pkl"), "rb")) 29 | 30 | #load hashtag frequency dictionary 31 | hashtagFreq = pickle.load(open(expanduser("~/tweetnet/data/hashtagFreq.pkl"), "rb")) 32 | 33 | #modifiedTweets = [] 34 | #for i in range(len(tweets)): 35 | # # Get rid of the "text: " and add start of text and end of text 36 | # modifiedTweets.append(chr(2) + tweets[i][6:] + chr(3)) 37 | #tweets = modifiedTweets 38 | 39 | #Normalize data by frequency 40 | tweets_shuf, hashtags_shuf = normalizeByFreq(tweets, hashtags, hashtagFreq, freqThreshold) 41 | 42 | nTweet = len(tweets_shuf) 43 | nTrainData = np.ceil(nTweet*trainPercent).astype(int) 44 | 45 | #Split the tweets and hashtags into training and testing set 46 | trainTweets = tweets_shuf[0: nTrainData] 47 | trainHashtags = hashtags_shuf[0: nTrainData] 48 | testTweets = tweets_shuf[nTrainData: nTweet] 49 | testHashtags = hashtags_shuf[nTrainData: nTweet] 50 | nTestData = len(testTweets) 51 | 52 | 53 | #load word2vec dictionary 54 | print("Loading word2vec dictionary") 55 | word2vecDict = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb")) 56 | print("Finished loading word2vec dictionary") 57 | 58 | #create character dictionary for tweets. 59 | dictionary = ReducedAsciiDictionary({},ranges).dictionary 60 | dictionary[chr(2)] = len(dictionary) 61 | dictionary[chr(3)] = len(dictionary) 62 | #number of unique characters in dataset 63 | vocabLen = len(dictionary) 64 | 65 | #initialize datastore arrays 66 | trainTweetSequence = [] 67 | trainHashtagSequence = [] 68 | testTweetSequence = [] 69 | testHashtagSequence = [] 70 | 71 | #vector in word2vec is 300 72 | embeddingLength = 300 73 | 74 | #Split data into sequences of length 40 for training 75 | for i in range(nTrainData): 76 | oneTweet = trainTweets[i] 77 | for j in range(0, len(oneTweet) - sequenceLength + 1, 1): 78 | trainTweetSequence.append(oneTweet[j : j+sequenceLength]) 79 | trainHashtagSequence.append(trainHashtags[i]) 80 | print('Number of sequences in training data: ', len(trainTweetSequence)) 81 | print('Number of hashtags in training data: ', len(trainHashtagSequence)) 82 | 83 | 84 | #Split data into sequences of length 40 for testing 85 | for i in range(nTestData): 86 | oneTweet = testTweets[i] 87 | ht = hashtags[i].split(" ") 88 | for j in range(0, len(oneTweet) - sequenceLength + 1, 1): 89 | testTweetSequence.append(oneTweet[j : j+sequenceLength]) 90 | testHashtagSequence.append(testHashtags[i]) 91 | print('Number of sequences in testing data: ', len(testTweetSequence)) 92 | print('Number of hashtags in testing data: ', len(testHashtagSequence)) 93 | 94 | 95 | # for each sequence, create onehot encoding for each character 96 | print("Vectorization...") 97 | 98 | # trainX: [#training sequences, 40, 65] 99 | # trainy: [#training sequences, 300] 100 | trainX = np.zeros((len(trainTweetSequence), sequenceLength, vocabLen), dtype=np.bool) 101 | trainY = np.zeros((len(trainTweetSequence), embeddingLength)) 102 | for i, seq in enumerate(trainTweetSequence): 103 | if i % 10000 == 0: 104 | print("Loading training tweet ", i) 105 | for j, ch in enumerate(seq): 106 | oneHotIndex = dictionary.get(ch) 107 | trainX[i,j,oneHotIndex] = 1 
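        # Regression target for this sequence: the word2vec embedding of its paired hashtag.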
108 | trainY[i] = word2vecDict[trainHashtagSequence[i]] 109 | 110 | # testX: [#testing sequences, 40, 65] 111 | # testy: [#testing sequences, 300] 112 | testX = np.zeros((len(testTweetSequence), sequenceLength, vocabLen), dtype=np.bool) 113 | testY = np.zeros((len(testTweetSequence), embeddingLength)) 114 | 115 | for i, seq in enumerate(testTweetSequence): 116 | if i % 10000 == 0: 117 | print("Loading testing tweet ", i) 118 | for j, ch in enumerate(seq): 119 | oneHotIndex = dictionary.get(ch) 120 | testX[i,j,oneHotIndex] = 1 121 | testY[i] = word2vecDict[testHashtagSequence[i]] 122 | 123 | tweet2hashtagParam = [trainTweets, trainHashtags, testTweets, testHashtags, trainX, trainY, testX, testY, trainTweetSequence, trainHashtagSequence, testTweetSequence, testHashtagSequence] 124 | 125 | 126 | return trainTweets, trainHashtags, testTweets, testHashtags, trainX, trainY, testX, testY, trainTweetSequence, trainHashtagSequence, testTweetSequence, testHashtagSequence, word2vecDict 127 | 128 | 129 | if __name__ == "__main__": 130 | trainTweets, trainHashtags, testTweets, testHashtags, trainX, trainY, testX, testY, trainTweetSequence, trainHashtagSequence, testTweetSequence, testHashtagSequence, dictionary = loadData({},np.array([]), 40, 0.99, 84) 131 | dic = {} 132 | for i in range(1000): 133 | if dic.get(trainHashtagSequence[i]) == None: 134 | dic[trainHashtagSequence[i]] = 1 135 | print dic.keys() 136 | 137 | -------------------------------------------------------------------------------- /src/utils/loadData_lstm.py: -------------------------------------------------------------------------------- 1 | ''' 2 | loads twitter dataset from storm API. 3 | ''' 4 | import numpy as np 5 | import cPickle as pickle 6 | from ReducedAsciiDictionary import ReducedAsciiDictionary 7 | from os.path import expanduser 8 | import sys 9 | 10 | def loadData(dictionary,ranges,sequenceLength, nTweet): 11 | ''' Creates dataset based on dictionary, a set of ascii 12 | ranges, and pickled twitter data from Apache Storm. 13 | 14 | 15 | X: [#sequences, 40, 65] 16 | y: [#sequences, 65] 17 | vocabLen: (dictionary length) 18 | tweetLength: (numTweets) 19 | ''' 20 | 21 | #load tweets and hashtag embeddings 22 | tweets = pickle.load(open(expanduser("~/tweetnet/data/preprocessed_new_tweets.pkl"),"rb")) 23 | np.random.shuffle(tweets) 24 | 25 | #use the first nTweet tweets 26 | tweets = tweets[0:nTweet] 27 | print "Number of tweets ", len(tweets) 28 | 29 | #create character dictionary for tweets. 
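# vocabLen below is len(dictionary)+1: the extra index is reserved for the
# end-of-tweet (EOS) symbol that stands in for the "next character" once a
# tweet runs out.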
30 | dictionary = ReducedAsciiDictionary({},ranges).dictionary 31 | 32 | #total number of tweets 33 | numData = len(tweets) 34 | 35 | #number of unique characters in dataset 36 | vocabLen = len(dictionary)+1 37 | 38 | #initialize datastore arrays 39 | tweetSequence = [] 40 | nextChar = [] 41 | tweetLength = np.zeros(numData) 42 | 43 | #Split data into sequences of length 40 and create nextChar array 44 | for i in range(numData): 45 | oneTweet = tweets[i] 46 | for j in range(0, len(oneTweet) - sequenceLength - 1, 1): 47 | tweetSequence.append(oneTweet[j : j+sequenceLength]) 48 | nextChar.append(oneTweet[j+sequenceLength]) 49 | tweetSequence.append(oneTweet[len(oneTweet)-sequenceLength - 1:len(oneTweet) - 1]) 50 | nextChar.append("") 51 | print('Number of sequences: ', len(tweetSequence)) 52 | 53 | # for each sequence, create onehot encoding for each character 54 | # X: [#sequences, 40, 65] 55 | # y: [#sequences, 65] 56 | print("Vectorization...") 57 | X = np.zeros((len(tweetSequence), sequenceLength, vocabLen), dtype=np.bool) 58 | y = np.zeros((len(tweetSequence), vocabLen), dtype=np.bool) 59 | 60 | for i, seq in enumerate(tweetSequence): 61 | if i % 10000 == 0: 62 | print "Loading tweet ", i 63 | for j, ch in enumerate(seq): 64 | oneHotIndex = dictionary.get(ch) 65 | X[i,j,oneHotIndex] = 1 66 | 67 | if nextChar[i] != "": 68 | y[i, dictionary.get(nextChar[i])] = 1 69 | else: 70 | y[i, len(dictionary)] = 1 71 | return X, y, vocabLen, dictionary, tweetSequence, nextChar, tweets 72 | 73 | if __name__ == "__main__": 74 | X, y, vocabLen, dictionary, tweetSequence, nextChar, tweets = loadData({},np.array([]),40) 75 | print "The first tweet sequence is: ", tweetSequence[0] 76 | -------------------------------------------------------------------------------- /src/utils/loadKaggleHelpful.py: -------------------------------------------------------------------------------- 1 | #import variables 2 | import cPickle as pickle 3 | import numpy 4 | import gzip 5 | from os.path import expanduser 6 | # function implementations 7 | def readGz(f): 8 | for l in gzip.open(f): 9 | yield eval(l) 10 | 11 | def loadTrain(): 12 | text=[];helpful=[];outOf=[];userID=[];itemID=[] 13 | # collecting the data 14 | for metablock in readGz(expanduser('~/tweetnet/data/train.json.gz')): 15 | text.append(metablock['reviewText']) 16 | helpful.append(metablock['helpful']['nHelpful']) 17 | outOf.append(metablock['helpful']['outOf']) 18 | userID.append(metablock['reviewerID']) 19 | itemID.append(metablock['itemID']) 20 | return text, helpful,outOf, userID, itemID 21 | 22 | def loadTest(): 23 | text=[];outOf=[];userID=[];itemID=[] 24 | #collecting the data 25 | for metablock in readGz(expanduser('~/tweetnet/data/test_Helpful.json.gz')): 26 | text.append(metablock['reviewText']) 27 | outOf.append(metablock['helpful']['outOf']) 28 | userID.append(metablock['reviewerID']) 29 | itemID.append(metablock['itemID']) 30 | return text, outOf, userID, itemID 31 | -------------------------------------------------------------------------------- /src/utils/logger.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from os.path import expanduser 3 | def logger(listVals, name): 4 | logwriter = open(expanduser('~/tweetnet/logs/'+ name), 'a') 5 | logwriter.write("\n ########################################## \n") 6 | logwriter.write("EPOCH: " + str(listVals[0][0]) + "\n") 7 | for i in xrange(1,len(listVals)-1): 8 | logwriter.write("input: " + str(listVals[i][0]) + "\n") 9 | 
logwriter.write("target: " + str(listVals[i][1]) + "\n") 10 | logwriter.write("isCorrect: " + str(listVals[i][2]) + "\n") 11 | logwriter.write("topN: "+ str(listVals[i][3]) + "\n\n") 12 | logwriter.write("numCorrect: " + str(listVals[len(listVals)-1][0])) 13 | logwriter.write(" percCorrect: " + str(listVals[len(listVals)-1][1])) 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/utils/mkMultiTaskTweet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | loads twitter dataset from storm API for multitasking model training 3 | Task 1: Hashtag prediction 4 | Task 2: missing word completion 5 | 6 | Data format: tweet -- hashtag -- missing word 7 | ''' 8 | import sys 9 | import os 10 | import numpy as np 11 | import cPickle as pickle 12 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils"))) 13 | from ReducedAsciiDictionary import ReducedAsciiDictionary 14 | from getEnglishHashTweets import checkHashtags 15 | from numpy import random 16 | from random import shuffle 17 | from os.path import expanduser 18 | import re 19 | import string 20 | from stop_words import get_stop_words 21 | 22 | def mkMissingWord(text, word2vecDict): 23 | 24 | punctuation = set(string.punctuation) 25 | stop_words = get_stop_words('english') 26 | words = text.split() 27 | cnt = 0 28 | while cnt <= 7: 29 | idx = 1 + random.randint(len(words) - 1) 30 | w = words[idx] 31 | w = ''.join([c for c in w.lower() if not c in punctuation]) 32 | if len(w)==1 or word2vecDict.get(w) == None or w in stop_words: 33 | cnt += 1 34 | else: 35 | missingWord = w 36 | words[idx] = "UNK" 37 | text = " ".join(words) 38 | return (text, missingWord) 39 | return (None,None) 40 | 41 | 42 | 43 | def tweetsForMultiTask(tweets, hashtags, word2vecDict): 44 | 45 | 46 | tweets_shuf = [] 47 | hashtags_shuf = [] 48 | missingWords = [] 49 | idx_shuf = range(len(tweets)) 50 | shuffle(idx_shuf) 51 | for i in idx_shuf: 52 | ht = hashtags[i].split(" ") 53 | 54 | text, missingWord = mkMissingWord(tweets[i], word2vecDict) 55 | if text != None and missingWord != None: 56 | tweets_shuf.append(text) 57 | hashtags_shuf.append(ht[2]) 58 | missingWords.append(missingWord) 59 | print (text, ht[2], missingWord) 60 | 61 | return tweets_shuf, hashtags_shuf, missingWords 62 | 63 | 64 | def loadData(dictionary,ranges,sequenceLength,trainPercent): 65 | ''' Creates dataset based on dictionary, a set of ascii 66 | ranges, and pickled twitter data from Apache Storm. 
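    Besides pairing each tweet with one of its English hashtags, this routine
    replaces one randomly chosen non-stopword that has a word2vec entry with
    "UNK", keeps that word as the missing-word target, and pickles the
    resulting tweets, hashtags, and missing words for the multi-task models.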
67 | 68 | 69 | X: [#sequences, 40, 65] 70 | y: [#sequences, 300] 71 | vocabLen: (dictionary length) 72 | tweetLength: (numTweets) 73 | ''' 74 | 75 | 76 | #load tweets with >=2 hashtags and corresponding english hashtags 77 | tweets = pickle.load(open(expanduser("~/tweetnet/data/englishHashtagTweet.pkl"), "rb")) 78 | hashtags = pickle.load(open(expanduser("~/tweetnet/data/englishHashtag.pkl"), "rb")) 79 | modifiedTweets = [] 80 | 81 | #load word2vec dictionary 82 | print("Loading word2vec dictionary") 83 | word2vecDict = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb")) 84 | print("Finished loading word2vec dictionary") 85 | 86 | for i in range(len(tweets)): 87 | # Get rid of the "text: " and add start of text and end of text 88 | modifiedTweets.append(chr(2) + tweets[i][6:] + chr(3)) 89 | 90 | tweets = modifiedTweets 91 | tweets, hashtags, missingWords = tweetsForMultiTask(tweets, hashtags, word2vecDict) 92 | 93 | nTweet = len(tweets) 94 | 95 | print "Number of remaining tweets: ", nTweet 96 | 97 | print "Saving data to files ..." 98 | with open(expanduser("~/tweetnet/data/multitaskTweets.pkl"), "wb") as file1: 99 | pickle.dump(tweets, file1, pickle.HIGHEST_PROTOCOL) 100 | with open(expanduser("~/tweetnet/data/multitaskHashtags.pkl"), "wb") as file2: 101 | pickle.dump(hashtags, file2, pickle.HIGHEST_PROTOCOL) 102 | with open(expanduser("~/tweetnet/data/multitaskTweetMw.pkl"), "wb") as file3: 103 | pickle.dump(missingWords, file3, pickle.HIGHEST_PROTOCOL) 104 | 105 | if __name__ == "__main__": 106 | loadData({},np.array([]), 40, 0.9) 107 | -------------------------------------------------------------------------------- /src/utils/predContext.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy 3 | 4 | def createHtDict(dic, allHashtags): 5 | htDic = {} 6 | for ht in allHashtags: 7 | if ht not in htDic.keys(): 8 | htDic[ht] = dic[ht] 9 | return htDic 10 | 11 | def predContext(htDictionary, modelOutput, topN, label): 12 | correct = False 13 | keyResult = [] 14 | sortedKeyResult = [] 15 | dotResult = numpy.zeros([len(htDictionary)]) 16 | 17 | counter = 0 18 | for k in htDictionary.keys(): 19 | dotResult[counter] = -numpy.dot(modelOutput,htDictionary[k])[-1] 20 | keyResult.append(k) 21 | counter = counter + 1 22 | 23 | sortIndex = numpy.argsort(dotResult) 24 | topNdots = dotResult[sortIndex[0:topN]] 25 | 26 | for i in range(topN): 27 | sortedKeyResult.append(keyResult[sortIndex[i]]) 28 | if label == keyResult[sortIndex[i]]: 29 | correct = True 30 | return sortedKeyResult, correct, topNdots 31 | -------------------------------------------------------------------------------- /src/utils/prelimTest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import expanduser 3 | 4 | c2c = open(expanduser("~/tweetnet/logs/Feb/c2c2017-03-04_13:19.log"), "rb") 5 | t2c = open(expanduser("~/tweetnet/logs/t2c2017-03-04_13:42.log"), "rb") 6 | 7 | cnt = 0 8 | 9 | correctDic = {} 10 | 11 | lines = c2c.read() 12 | lines = lines.split("\n\n") 13 | 14 | for blocks in lines: 15 | if blocks[0:5] == "input": 16 | blocks = blocks.split("\n") 17 | if "True" in blocks[2]: 18 | correctDic[cnt] = 1 19 | else: 20 | correctDic[cnt] = 0 21 | cnt += 1 22 | 23 | cnt = 0 24 | lines = t2c.read() 25 | lines = lines.split("\n\n") 26 | 27 | for blocks in lines: 28 | if blocks[0:5] == "input": 29 | blocks = blocks.split("\n") 30 | if "True" in blocks[2]: 31 | if correctDic[cnt] == 0: 32 | 
correctDic[cnt] = 1 33 | cnt += 1 34 | 35 | accuracy = sum(correctDic.values()) * 1.0/ len(correctDic) 36 | print accuracy 37 | -------------------------------------------------------------------------------- /src/utils/preprocessor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Function reduces all tweets to contain only characters in 3 | the second and fourth columns of the standard ascii table. 4 | This should be used if you are using an old version 5 | of the storm topology that does not do this online. 6 | ''' 7 | 8 | import cPickle as pickle 9 | import numpy as np 10 | from os.path import expanduser 11 | 12 | tweets = pickle.load(open(expanduser("~/tweetnet/data/new_tweets_list_string.pkl"),"rb")) 13 | embeddings = pickle.load(open(expanduser("~/tweetnet/data/new_embeddings.pkl"),"rb")) 14 | 15 | print "tweet array shape: ", len(tweets) 16 | print "embeddings array shape: ", embeddings.shape 17 | print "tweet array type: ", type(tweets[0]) 18 | print "embeddings array type: ", type(embeddings[0]) 19 | 20 | 21 | for i in range(len(tweets)): 22 | s="" 23 | for j in range(len(tweets[i])): 24 | asciiVal = ord(tweets[i][j]) 25 | 26 | if(asciiVal>=32 and asciiVal<=63): 27 | s+=tweets[i][j] 28 | elif(asciiVal>=96 and asciiVal <= 127): 29 | s+=tweets[i][j] 30 | else: 31 | continue 32 | tweets[i]=s 33 | 34 | pickle.dump(tweets, open(expanduser("~/tweetnet/data/preprocessed_new_tweets.pkl"),"wb")) 35 | 36 | -------------------------------------------------------------------------------- /src/utils/tf_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | else: return None 39 | 40 | 41 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 42 | 43 | x = tf.reshape(x, [-1, in_shape]) 44 | 45 | with tf.variable_scope(scope): 46 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 47 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 48 | logits = tf.add(tf.matmul(x, w), b) 49 | output = 
applyActivation(logits, activation) 50 | # Compute the mean-squared-error 51 | cost = tf.reduce_mean(tf.square(tf.subtract(y , output))) 52 | 53 | return cost, output 54 | -------------------------------------------------------------------------------- /src/utils/tf_utils_reg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def fcLayer(x, in_shape, out_shape, activation, dropout, is_train, scope="fc"): 5 | 6 | x = tf.reshape(x, [-1, in_shape]) 7 | 8 | with tf.variable_scope(scope): 9 | w = tf.get_variable(name="w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 10 | b = tf.get_variable(name="b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 11 | fc = tf.add(tf.matmul(x, w), b) 12 | 13 | with tf.variable_scope("activation"): 14 | output = applyActivation(fc, activation) 15 | #out_op = tf.nn.dropout(output, dropout) 16 | out_op = output 17 | 18 | return out_op 19 | 20 | def createLSTMCell(batch_size, lstm_size, n_layers, forget_bias): 21 | 22 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=forget_bias) 23 | lstm_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for i in range(n_layers)], state_is_tuple=True) 24 | state = lstm_cell.zero_state(batch_size, tf.float32) 25 | 26 | return lstm_cell, state 27 | 28 | def applyActivation(x, activation): 29 | 30 | if activation == "tanh": 31 | return tf.nn.tanh(x) 32 | elif activation == "relu": 33 | return tf.nn.relu(x) 34 | elif activation == "sigmoid": 35 | return tf.nn.sigmoid(x) 36 | elif activation == "relu6": 37 | return tf.nn.relu6(x) 38 | else: return None 39 | 40 | 41 | def predictionLayer(x, y, in_shape, out_shape, activation, scope="prediction"): 42 | 43 | x = tf.reshape(x, [-1, in_shape]) 44 | 45 | with tf.variable_scope(scope): 46 | w = tf.get_variable(name=scope+"w", shape = [in_shape, out_shape], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=1e-2)) 47 | b = tf.get_variable(name=scope+"b", shape= [out_shape], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 48 | logits = tf.add(tf.matmul(x, w), b) 49 | output = applyActivation(logits, activation) 50 | # Compute the mean-squared-error 51 | cost = tf.reduce_mean(tf.square(tf.subtract(y , output))) 52 | 53 | return cost, output 54 | -------------------------------------------------------------------------------- /src/utils/tweetGenerator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reads weight matrix from hdf5 file and generates text using seed. 
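The script rebuilds the stacked context-LSTM, loads the checkpointed weights
from ~/tweetnet/logs/intermediateWeights.hdf5, seeds generation with a random
tweet prefix plus its fixed context (hashtag-embedding) subvector, and samples
up to 140 characters, stopping early when the EOS index is drawn.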
3 | ''' 4 | import numpy as np 5 | import pickle as pkl 6 | import numpy as np 7 | from numpy import random 8 | from loadData import loadData 9 | from keras.utils import np_utils 10 | from keras.models import Sequential 11 | from keras.layers import LSTM 12 | from keras.layers import Dense 13 | from keras.layers import Activation 14 | from keras.optimizers import RMSprop 15 | from keras.optimizers import Adagrad 16 | from keras.layers import Dropout 17 | from keras.layers import BatchNormalization 18 | from scipy.stats import rv_discrete 19 | import sys 20 | 21 | def generateText(dictionary, data, dictLen, tweetLen, X, y, 22 | inputSize, sequenceLength, numHiddenFirst, numTweets, seqPerSegment, 23 | n_examples, numSegments): 24 | 25 | # data shape = #tweets x 141 x inputSize(365) 26 | #initialize inverse dictionary to map integers to characterse 27 | inverseDictionary = {v: k for k, v in dictionary.iteritems()} 28 | print "inverseDictionary Size", len(inverseDictionary) 29 | 30 | #building cLSTM model 31 | print("\n") 32 | print("Generating Text... ") 33 | model = Sequential() 34 | 35 | model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize), return_sequences=True)) 36 | model.add(LSTM(numHiddenFirst)) 37 | 38 | model.add(Dense(numHiddenFirst)) 39 | model.add(Activation('relu')) 40 | model.add(BatchNormalization()) 41 | 42 | model.add(Dense(numHiddenFirst)) 43 | model.add(Activation('relu')) 44 | model.add(BatchNormalization()) 45 | 46 | model.add(Dense(dictLen)) 47 | model.add(Activation('softmax')) 48 | 49 | 50 | #load the network weights 51 | fileName = "~/tweetnet/logs/intermediateWeights.hdf5" 52 | model.load_weights(fileName) 53 | model.compile(loss='categorical_crossentropy', optimizer='adam') 54 | 55 | #initializing to random seed 56 | seedTweet = np.random.randint(n_examples, size=1) 57 | contextVector=np.zeros(inputSize-(dictLen)) 58 | 59 | printSeed="SEED: " 60 | for c in range(sequenceLength): 61 | #for each character in the sequence 62 | 63 | #grab the pattern, which is the 1x365 input vector 64 | pattern = X[seedTweet][c,:] 65 | 66 | #grab the 1x300 context subvector 67 | contextVector = pattern[dictLen:] 68 | 69 | #search, in the pattern itself, for the one-hot element 70 | counter = 0 71 | for i in range(dictLen): 72 | if(pattern[i] == 1): 73 | counter = i 74 | break 75 | #if one-hot element is greater than 64, then EOS. 
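#(characters occupy one-hot indices 0-63; index 64, i.e. len(dictionary), is the EOS slot)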
76 | #technically you'll never reach this as seqLen should be < tweetLen 77 | if(counter>=64): 78 | printSeed = printSeed + "<>" 79 | continue; 80 | 81 | printSeed = printSeed + inverseDictionary[counter] 82 | print printSeed 83 | 84 | x = X[seedTweet][0:sequenceLength] 85 | inputVector = np.reshape(x,(1,len(x),len(x[0]))) 86 | #generate characters 87 | 88 | printResult = "GENERATED TEXT: " 89 | 90 | charsGenerated = 140 91 | for i in range(charsGenerated): 92 | 93 | prediction = model.predict(inputVector, verbose=0) 94 | #index = np.argsort(prediction) 95 | #rand = np.random.randint(5) 96 | #rand_index = index[0][len(index[0]) - rand - 1] 97 | 98 | rand_index = rv_discrete(values=(list(xrange(len(prediction[0]))),prediction[0])).rvs(size=1)[0] 99 | if(rand_index==(dictLen-1)): 100 | printResult = printResult + "<>" 101 | break 102 | 103 | result = inverseDictionary[rand_index] 104 | printResult = printResult+result 105 | 106 | charVector=np.zeros(dictLen) 107 | charVector[rand_index]=1 108 | currInput = np.concatenate((charVector,contextVector)) 109 | 110 | concatVector = np.reshape(currInput, (1,1,len(currInput))) 111 | 112 | inputVector=np.concatenate((inputVector,concatVector), axis=1) 113 | inputVector=inputVector[:,1:len(inputVector[0]),:] 114 | 115 | print printResult 116 | -------------------------------------------------------------------------------- /src/utils/tweetGenerator_lstm.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reads weight matrix from hdf5 file and generates text using seed. 3 | ''' 4 | import numpy as np 5 | import pickle as pkl 6 | import numpy as np 7 | import h5py 8 | from numpy import random 9 | from loadData_lstm import loadData 10 | from keras.utils import np_utils 11 | from keras.models import Sequential 12 | from keras.layers import LSTM 13 | from keras.layers import Dense 14 | from keras.layers import Activation 15 | from keras.optimizers import RMSprop 16 | from keras.optimizers import Adagrad 17 | from keras.layers import Dropout 18 | from keras.layers import BatchNormalization 19 | from scipy.stats import rv_discrete 20 | import sys 21 | from os.path import expanduser 22 | 23 | def sample(preds, temperature=1.0): 24 | preds = np.asarray(preds).astype('float64') 25 | preds = np.log(preds) / temperature 26 | exp_preds = np.exp(preds) 27 | preds = exp_preds / np.sum(exp_preds) 28 | probas = np.random.multinomial(1, preds, 1) 29 | return np.argmax(probas) 30 | 31 | def generateText(model, tweets, sequenceLength, vocabLen, dictionary): 32 | 33 | # Random select a tweet for generation 34 | start_index = random.randint(len(tweets)) 35 | inverseDictionary = {v: k for k, v in dictionary.iteritems()} 36 | 37 | # Different temperature adds randomness to character generation 38 | for diversity in [0.2, 0.5, 1.0, 1.2]: 39 | print("\n") 40 | print('----- diversity:', diversity) 41 | 42 | generated = "" 43 | 44 | seed = tweets[start_index][6:sequenceLength+6] 45 | print('----- Generating with seed: "' + seed + '"') 46 | generated += seed 47 | sys.stdout.write(generated) 48 | 49 | for i in range(140): 50 | 51 | # x: [1, sequenceLength(40), 65] 52 | x = np.zeros((1, sequenceLength, vocabLen)) 53 | 54 | # Create one hot encoding vectors for the seed 55 | for j, ch in enumerate(seed): 56 | x[0, j, dictionary.get(ch)] = 1 57 | 58 | preds = model.predict(x, verbose=0)[0] 59 | next_index = sample(preds, diversity) 60 | 61 | # If an EOS symbol is genearted, append "" to the end of generated and stop 62 | if next_index == 
vocabLen - 1: 63 | next_char = "" 64 | generated += next_char 65 | seed = seed[1:] + next_char 66 | sys.stdout.write(next_char) 67 | sys.stdout.flush() 68 | break 69 | 70 | # If not an EOS symbol, append the newly generated char 71 | else: 72 | next_char = inverseDictionary[next_index] 73 | generated += next_char 74 | # Shift the window by 1 to create the new seed 75 | seed = seed[1:] + next_char 76 | sys.stdout.write(next_char) 77 | sys.stdout.flush() 78 | print("\n") 79 | -------------------------------------------------------------------------------- /src/utils/visualizeData.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | from os.path import expanduser 4 | 5 | tweets = pickle.load(open(expanduser("~/tweetnet/data/preprocessed_new_tweets.pkl"),"rb")) 6 | embeddings = pickle.load(open(expanduser("~/tweetnet/data/new_embeddings.pkl"),"rb")) 7 | 8 | print "tweet array shape: ", len(tweets) 9 | print "embeddings array shape: ", embeddings.shape 10 | print "tweet array type: ", type(tweets[0]) 11 | print "embeddings array type: ", type(embeddings[0]) 12 | 13 | for i in range(100): 14 | print tweets[i] 15 | --------------------------------------------------------------------------------