├── Items2Vec.py
├── README.md
└── view_sessions.zip

/Items2Vec.py:
--------------------------------------------------------------------------------
import os
import operator

import numpy as np
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, Bidirectional


class ItemsConfig(object):
    max_description = 10       # tokens kept per item description
    max_items_in_session = 3   # items per browsing session


def clean_text(token):
    # Strip common punctuation characters from a token.
    for char in ',.?:;"()[]{}':
        token = token.replace(char, "")
    return token


def extract_vocabulary(dir_name, target_text_filename=None):
    # Walk the sessions folder and count how often each cleaned token occurs.
    # target_text_filename is currently unused (see write_vocab_2_file).
    count = 0
    vocabulary = dict()
    for root, dirs, files in os.walk(dir_name):
        path = root.split('/')
        print((len(path) - 1) * '---', os.path.basename(root))
        for file in files:
            if file.endswith("txt"):
                count += 1
                print("count: ", count)
                with open(os.path.join(root, file)) as f:
                    for line in f:
                        tokens = line.lower().split()
                        for token in tokens:
                            clean_token = clean_text(token)
                            if clean_token in vocabulary:
                                vocabulary[clean_token] += 1
                            else:
                                vocabulary[clean_token] = 1
    return vocabulary


def write_vocab_2_file(filename, vocab):
    # Expects vocab as an iterable of (token, count) pairs.
    with open(filename, "w") as myfile:
        for pair in vocab:
            myfile.write(pair[0] + " " + str(pair[1]) + "\n")


def read_vocab_to_list(filename):
    return [word for line in open(filename, 'r') for word in line.split()]


def get_num_sorted(sorted_dict, token):
    # Rank of a token in the frequency-sorted vocabulary.
    count = 0
    for pair in sorted_dict:
        if pair[0] == token:
            return count
        count += 1
    return count


def get_ys(filename, vocab):
    # Encode the first max_description alphabetic tokens of an item file
    # as a comma-separated string of token ids.
    count = 0
    ys_tokens = []
    config = ItemsConfig()
    ys = ""

    sorted_vocabulary = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)

    with open(filename) as f:
        for line in f:
            tokens = line.lower().split()
            for token in tokens:
                token = clean_text(token)
                if token.isalpha() and count < config.max_description:
                    # Use the frequency rank as the token id so every id
                    # stays below len(vocab), as the Embedding layer requires.
                    num_token = get_num_sorted(sorted_vocabulary, token)
                    ys_tokens.append(num_token)
                    count += 1
    for y in ys_tokens:
        ys += str(y) + ","

    return ys[:-1]


def get_xs(folder, exclude_file, vocab):
    # Encode every item in the session except the target file; with 3-item
    # sessions this yields two context items, i.e. 20 input columns.
    xs = ""
    count = 0
    config = ItemsConfig()

    for root, dirs, files in os.walk(folder):
        path = root.split('/')
        print((len(path) - 1) * '---', os.path.basename(root))
        for file in files:
            print(file, exclude_file)
            if file.endswith("txt") and not exclude_file.endswith(file):
                if count < config.max_items_in_session:
                    count += 1
                    xs += get_ys(os.path.join(folder, file), vocab) + ","

    return xs


def create_train_file(folders_list, train_filename, vocab):
    # One CSV row per item: the other items' token ids (xs) followed by the
    # item's own token ids (ys).
    for session_folder in folders_list:
        for root, dirs, files in os.walk(session_folder):
            path = root.split('/')
            print((len(path) - 1) * '---', os.path.basename(root))
            for file in files:
                if file.endswith("txt"):
                    print("starting ys")
                    ys = get_ys(os.path.join(session_folder, file), vocab)
                    print("starting xs")
                    xs = get_xs(session_folder, os.path.join(session_folder, file), vocab)
                    csv_line = xs + ys + "\n"
                    with open(train_filename, "a") as train_file:
                        train_file.write(csv_line)


def trainModel(train_file, vocab_size):
    # The CSV has no header row: 20 context columns, then 10 target columns.
    train = pd.read_csv(train_file, header=None)
    Xs = train.iloc[:, :20]
    Ys = train.iloc[:, 20:]

    xs = np.array(Xs)
    ys = np.array(Ys)

    model = Sequential()
    model.add(Embedding(vocab_size, 10, input_length=20))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dense(10, activation='linear'))

    print("compile..")
    model.compile('adam', 'mse')
    print("train..")
    model.fit(xs, ys)
    print("trained...")

    # Get rid of the linear output head and read the Bi-LSTM activations.
    encoder = Model(inputs=model.input, outputs=model.layers[-2].output)

    predictions = encoder.predict(xs)  # here you actually have the item_doc-2-vec
    print(predictions)


def main():
    complete_sessions_folder = "/Users/macbook/Desktop/corpora/view_sessions"
    session_folder1 = "/Users/macbook/Desktop/corpora/view_sessions/session1"
    session_folder2 = "/Users/macbook/Desktop/corpora/view_sessions/session2"
    vocabulary_filename = "/Users/macbook/Desktop/corpora/aux_files/sessions_vocab.txt"
    train_set_filename = "/Users/macbook/Desktop/corpora/aux_files/sessions_train.txt"

    vocab = extract_vocabulary(complete_sessions_folder, vocabulary_filename)
    create_train_file([session_folder1, session_folder2], train_set_filename, vocab)

    trainModel(train_set_filename, len(vocab))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# item2vec
Items browsed in a session serve as context and are modeled into vectors with a bidirectional LSTM.
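
A minimal sketch of the end-to-end run, following `main()` in `Items2Vec.py` (the folder paths below are placeholders; each session folder is expected to hold one `.txt` description file per viewed item):

```python
from Items2Vec import extract_vocabulary, create_train_file, trainModel

sessions = "corpora/view_sessions"                  # placeholder paths
train_csv = "corpora/aux_files/sessions_train.txt"

# Count token frequencies over all item descriptions.
vocab = extract_vocabulary(sessions)

# One CSV row per item: 20 token ids from the other items in the
# session (context), followed by the item's own 10 token ids (target).
create_train_file([sessions + "/session1", sessions + "/session2"], train_csv, vocab)

# Trains Embedding -> Bidirectional LSTM -> Dense to predict an item from
# its session context, then prints the Bi-LSTM activations: the item vectors.
trainModel(train_csv, len(vocab))
```

The `Dense` head is only a training objective; `trainModel` discards it and takes the Bi-LSTM output as the per-item vector, in the spirit of doc2vec.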
--------------------------------------------------------------------------------

/view_sessions.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vicmak/item2vec/95f67ea00e66861cba5801c065937647026439fc/view_sessions.zip
--------------------------------------------------------------------------------