├── Items2Vec.py
├── README.md
└── view_sessions.zip

/Items2Vec.py:
--------------------------------------------------------------------------------
import os
import operator

import numpy as np
import pandas as pd
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, Bidirectional


class ItemsConfig(object):
    max_description = 10       # tokens kept per item description
    max_items_in_session = 3   # items per browsing session


def clean_text(token):
    # Strip common punctuation characters from a token.
    for char in ',.?:;"()[]{}':
        token = token.replace(char, "")
    return token


def extract_vocabulary(dir_name, target_text_filename=None):
    # Walk the sessions folder and count how often each cleaned token occurs.
    # target_text_filename is currently unused (see write_vocab_2_file).
    count = 0
    vocabulary = dict()
    for root, dirs, files in os.walk(dir_name):
        path = root.split('/')
        print((len(path) - 1) * '---', os.path.basename(root))
        for file in files:
            if file.endswith("txt"):
                count += 1
                print("count: ", count)
                with open(os.path.join(root, file)) as f:
                    for line in f:
                        tokens = line.lower().split()
                        for token in tokens:
                            clean_token = clean_text(token)
                            if clean_token in vocabulary:
                                vocabulary[clean_token] += 1
                            else:
                                vocabulary[clean_token] = 1
    return vocabulary


def write_vocab_2_file(filename, vocab):
    # Expects vocab as an iterable of (token, count) pairs.
    with open(filename, "w") as myfile:
        for pair in vocab:
            myfile.write(pair[0] + " " + str(pair[1]) + "\n")


def read_vocab_to_list(filename):
    return [word for line in open(filename, 'r') for word in line.split()]


def get_num_sorted(sorted_dict, token):
    # Rank of a token in the frequency-sorted vocabulary.
    count = 0
    for pair in sorted_dict:
        if pair[0] == token:
            return count
        count += 1
    return count


def get_ys(filename, vocab):
    # Encode the first max_description alphabetic tokens of an item file
    # as a comma-separated string of token ids.
    count = 0
    ys_tokens = []
    config = ItemsConfig()
    ys = ""

    sorted_vocabulary = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)

    with open(filename) as f:
        for line in f:
            tokens = line.lower().split()
            for token in tokens:
                token = clean_text(token)
                if token.isalpha() and count < config.max_description:
                    # Use the frequency rank as the token id so every id
                    # stays below len(vocab), as the Embedding layer requires.
                    num_token = get_num_sorted(sorted_vocabulary, token)
                    ys_tokens.append(num_token)
                    count += 1
    for y in ys_tokens:
        ys += str(y) + ","

    return ys[:-1]


def get_xs(folder, exclude_file, vocab):
    # Encode every item in the session except the target file; with 3-item
    # sessions this yields two context items, i.e. 20 input columns.
    xs = ""
    count = 0
    config = ItemsConfig()

    for root, dirs, files in os.walk(folder):
        path = root.split('/')
        print((len(path) - 1) * '---', os.path.basename(root))
        for file in files:
            print(file, exclude_file)
            if file.endswith("txt") and not exclude_file.endswith(file):
                if count < config.max_items_in_session:
                    count += 1
                    xs += get_ys(os.path.join(folder, file), vocab) + ","

    return xs


def create_train_file(folders_list, train_filename, vocab):
    # One CSV row per item: the other items' token ids (xs) followed by the
    # item's own token ids (ys).
    for session_folder in folders_list:
        for root, dirs, files in os.walk(session_folder):
            path = root.split('/')
            print((len(path) - 1) * '---', os.path.basename(root))
            for file in files:
                if file.endswith("txt"):
                    print("starting ys")
                    ys = get_ys(os.path.join(session_folder, file), vocab)
                    print("starting xs")
                    xs = get_xs(session_folder, os.path.join(session_folder, file), vocab)
                    csv_line = xs + ys + "\n"
                    with open(train_filename, "a") as train_file:
                        train_file.write(csv_line)


def trainModel(train_file, vocab_size):
    # The CSV has no header row: 20 context columns, then 10 target columns.
    train = pd.read_csv(train_file, header=None)
    Xs = train.iloc[:, :20]
    Ys = train.iloc[:, 20:]

    xs = np.array(Xs)
    ys = np.array(Ys)

    model = Sequential()
    model.add(Embedding(vocab_size, 10, input_length=20))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dense(10, activation='linear'))

    print("compile..")
    model.compile('adam', 'mse')
    print("train..")
    model.fit(xs, ys)
    print("trained...")

    # Get rid of the linear output head and read the Bi-LSTM activations.
    encoder = Model(inputs=model.input, outputs=model.layers[-2].output)

    predictions = encoder.predict(xs)  # here you actually have the item_doc-2-vec
    print(predictions)


def main():
    complete_sessions_folder = "/Users/macbook/Desktop/corpora/view_sessions"
    session_folder1 = "/Users/macbook/Desktop/corpora/view_sessions/session1"
    session_folder2 = "/Users/macbook/Desktop/corpora/view_sessions/session2"
    vocabulary_filename = "/Users/macbook/Desktop/corpora/aux_files/sessions_vocab.txt"
    train_set_filename = "/Users/macbook/Desktop/corpora/aux_files/sessions_train.txt"

    vocab = extract_vocabulary(complete_sessions_folder, vocabulary_filename)
    create_train_file([session_folder1, session_folder2], train_set_filename, vocab)

    trainModel(train_set_filename, len(vocab))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# item2vec
Items browsed in a session serve as context and are modeled into vectors with a bidirectional LSTM.
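
A minimal sketch of the end-to-end run, following `main()` in `Items2Vec.py` (the folder paths below are placeholders; each session folder is expected to hold one `.txt` description file per viewed item):

```python
from Items2Vec import extract_vocabulary, create_train_file, trainModel

sessions = "corpora/view_sessions"                  # placeholder paths
train_csv = "corpora/aux_files/sessions_train.txt"

# Count token frequencies over all item descriptions.
vocab = extract_vocabulary(sessions)

# One CSV row per item: 20 token ids from the other items in the
# session (context), followed by the item's own 10 token ids (target).
create_train_file([sessions + "/session1", sessions + "/session2"], train_csv, vocab)

# Trains Embedding -> Bidirectional LSTM -> Dense to predict an item from
# its session context, then prints the Bi-LSTM activations: the item vectors.
trainModel(train_csv, len(vocab))
```

The `Dense` head is only a training objective; `trainModel` discards it and takes the Bi-LSTM output as the per-item vector, in the spirit of doc2vec.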
--------------------------------------------------------------------------------

/view_sessions.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vicmak/item2vec/95f67ea00e66861cba5801c065937647026439fc/view_sessions.zip
--------------------------------------------------------------------------------