├── Hybrid ├── README.md ├── feature_extraction │ ├── Genome │ │ ├── Readme.md │ │ ├── get_genomes.py │ │ └── unique_movies │ ├── Genre │ │ ├── Readme.md │ │ ├── get_genres.py │ │ └── unique_movies │ ├── IMDb │ │ ├── Readme.md │ │ ├── combine_all_features.py │ │ ├── get_imdb_data.py │ │ ├── liwc.py │ │ ├── process_imdb_data.py │ │ ├── process_plot.py │ │ └── vad.py │ └── Movie-VAE │ │ ├── Readme.md │ │ ├── get_embeddings.py │ │ └── vae_movie.py └── vae_imdb │ ├── Readme.md │ ├── h-vae_imdb.py │ ├── test_h-vae_eval1.py │ └── test_h-vae_eval2.py ├── README.md ├── Spotify ├── README.md ├── generate_song_predictions.py ├── read_challenge.py ├── read_mpd.py ├── run_vae_on_test_playlists.py └── vae_cf_spotify.py ├── Standard ├── evaluate_model_approach_1.py ├── evaluate_model_approach_2.py ├── plot_loss_graphs.ipynb ├── project_user_clusters.ipynb ├── read_data.py ├── readme.md └── vae_cf_keras.py ├── standard_vae_model_results.txt └── update_ratings.py /Hybrid/README.md: -------------------------------------------------------------------------------- 1 | 2 | This directory contains the code for a variational autoencoder applied to 3 | collaborative filtering, augmented with movie embeddings / hand-crafted movie features. 4 | 5 | update_ratings.py : creates a new ratings.csv file containing only those movies whose IMDb data has been downloaded 6 | 7 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Genome/Readme.md: -------------------------------------------------------------------------------- 1 | Run get_genomes.py to 2 | obtain a (26621, 1128) matrix containing a one-hot encoding of the genome tags. 3 | 4 | This is the input to the Movie-VAE 5 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Genome/get_genomes.py: -------------------------------------------------------------------------------- 1 | import json 2 | from operator import itemgetter 3 | import numpy as np 4 | NUM_GENOMES = 1128 5 | 6 | 7 | # converts a list of (tag_id, relevance) tuples to a 1128-dim one-hot vector 8 | def get_genome_vec(genome_tup): 9 | vec = np.zeros(NUM_GENOMES) 10 | print len(genome_tup) 11 | for i in genome_tup: 12 | 13 | tag = int(i[0])-1 14 | vec[tag] = 1 15 | return vec 16 | 17 | 18 | with open('./ml-20m/genome-scores.csv', 'r') as f: 19 | genome_data = f.read().splitlines() 20 | 21 | genome_dict = {} 22 | 23 | # Collect all the genome tags in genome-scores.csv 24 | for i in genome_data[1:]: 25 | i = i.split(",") 26 | mid = i[0] 27 | tagid = i[1] 28 | relevance = float(i[2]) 29 | try : 30 | genome_dict[mid].append((tagid, relevance)) 31 | except: 32 | genome_dict[mid] = [(tagid, relevance)] 33 | 34 | # sort by relevance and keep the 20 most relevant genome tags per movie 35 | for mid in genome_dict.keys(): 36 | scores = genome_dict[mid] 37 | scores = sorted(scores , key=itemgetter(1), reverse = True) 38 | scores = scores[:20] 39 | genome_dict[mid] = scores 40 | print len(genome_dict[mid]) 41 | 42 | with open('genome_scores.json', 'w') as f: 43 | json.dump(genome_dict, f) 44 | 45 | unk = np.array([0]*NUM_GENOMES) 46 | movie_embeddings_array = [] 47 | 48 | # convert each movie's tag list to a one-hot vector 49 | with open('unique_movies', 'r') as f: 50 | movie_id = f.read().splitlines() 51 | for i in movie_id: 52 | try: 53 | i = i.split(',') 54 | mid = i[0] 55 | movie_embeddings_array.append(get_genome_vec(genome_dict[mid])) 56 | except KeyError: 57 | movie_embeddings_array.append(unk) 58 | 59 | movie_embeddings_array = np.array(movie_embeddings_array) 60 | with open('movie_genomes.npy', 'wb') as f: 61 | np.save(f, 
movie_embeddings_array) 62 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Genre/Readme.md: -------------------------------------------------------------------------------- 1 | Run get_genres.py to get a .npy file containing a (26621, ) array, with each element representing the genre of that movie. 2 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Genre/get_genres.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | with open('./ml-20m/movies.csv', 'r') as f: 5 | data = f.read().splitlines() 6 | 7 | genre_data = {} 8 | for i in data[1:]: 9 | i = i.split(',') 10 | m_id = i[0] 11 | genres = i[-1] 12 | print genres 13 | genre_data[m_id] = genres.split('|') 14 | 15 | with open('genres.json', 'w') as f: 16 | json.dump(genre_data, f) 17 | 18 | 19 | genres = set() 20 | for m in genre_data.keys(): 21 | for g in genre_data[m]: 22 | genres.add(g) 23 | genre2id = dict((genre, i) for (i, genre) in enumerate(genres)) 24 | 25 | with open('unique_movies', 'r') as f: 26 | movie_ids = f.read().splitlines() 27 | 28 | with open('movie_genres.npy', 'wb') as f: 29 | genre_array = [] 30 | for i in movie_ids: 31 | i = i.split(',') 32 | mid = i[0] 33 | genre_array.append(genre2id[genre_data[mid][0]]) 34 | genre_array = np.array(genre_array) 35 | np.save(f, genre_array) 36 | 37 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/Readme.md: -------------------------------------------------------------------------------- 1 | Get IMDb features for each movie to be given as input to the Movie-VAE 2 | 3 | Order of running the scripts: 4 | 5 | get_imdb_data.py 6 | 7 | process_imdb_data.py 8 | 9 | process_plot.py 10 | 11 | combine_all_features.py 12 | 13 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/combine_all_features.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | with open('language.json', 'r') as f: 5 | language = json.load(f) 6 | 7 | with open('imdb_rat.json', 'r') as f: 8 | imdb_rat = json.load(f) 9 | 10 | with open('plot_feature.json', 'r') as f: 11 | plot = json.load(f) 12 | 13 | with open('rated.json', 'r') as f: 14 | rated = json.load(f) 15 | 16 | mid2idx = {} 17 | with open('unique_movies', 'r') as f: 18 | data = f.read().splitlines() 19 | for i in data: 20 | i = i.split(',') 21 | mid2idx[i[0]] = int(i[1]) 22 | 23 | movie_feat_list = [] 24 | 25 | idx = language['unique'] 26 | for i in language['list'].keys(): 27 | vec = np.zeros(len(idx)) 28 | lang = language['list'][i] 29 | for j in lang: 30 | item = idx[j] 31 | vec[item] =1.0 32 | 33 | language['list'][i] = list(vec) 34 | 35 | 36 | idx = rated['unique'] 37 | for i in rated['list'].keys(): 38 | vec = np.zeros(len(idx)) 39 | item = idx[rated['list'][i]] 40 | vec[item] =1.0 41 | rated['list'][i] = list(vec) 42 | 43 | 44 | for i in mid2idx.keys(): 45 | mid = str(mid2idx[i]) 46 | # mid = str(mid) 47 | vec = language['list'][mid] 48 | vec+= plot[mid] 49 | vec+= rated['list'][mid] 50 | vec.append(imdb_rat[mid]) 51 | vec = np.array(vec) 52 | movie_feat_list.append(vec) 53 | 54 | movie_feat_list = np.array(movie_feat_list) 55 | print movie_feat_list.shape 56 | 57 | with open('movie_features.npy', 'wb') as f: 58 | np.save(f, movie_feat_list) 59 | 60 | 
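# Added sanity-check sketch (not part of the original script): each row saved above is the
# concatenation of the one-hot language vector, the plot features (LIWC + VAD + word2vec),
# the one-hot "Rated" vector and the scalar IMDb rating, so the saved matrix should be 2-D
# with one row per entry in unique_movies.
check = np.load('movie_features.npy')
assert check.ndim == 2 and check.shape[0] == len(mid2idx)
print check.shape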
-------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/get_imdb_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import time 4 | 5 | def get_movie_data(imdb_id): 6 | url = 'http://www.omdbapi.com/?i={}&apikey=96725c4f&plot=full'.format(imdb_id) 7 | response = requests.get(url) 8 | return response.json() 9 | 10 | with open('links.csv', 'r') as f: 11 | ids = f.read().splitlines() 12 | 13 | 14 | movie_dict = [] 15 | count = 0 16 | for i in ids[1:]: 17 | i = i.split(",") 18 | ml_id = i[0] 19 | imdb_id = 'tt' + i[1] 20 | 21 | print imdb_id 22 | 23 | movie = get_movie_data(imdb_id) 24 | if movie['Response'] == 'True': 25 | movie_dict.append(movie) 26 | 27 | if count % 500 == 0: 28 | json_data = open("imdb_data.json", "w") 29 | movie_dict_json = json.dumps(movie_dict) 30 | json_data.write(movie_dict_json) 31 | json_data.close() 32 | 33 | count += 1 34 | time.sleep(1) 35 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/liwc.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.stem.lancaster import LancasterStemmer 3 | from nltk import PorterStemmer 4 | from nltk.tokenize import RegexpTokenizer 5 | from nltk.corpus import stopwords 6 | from nltk import FreqDist as freq 7 | import numpy as np 8 | from nltk.stem import WordNetLemmatizer 9 | 10 | """with open('liwc.cat', 'r') as f: 11 | liwc_cat = f.read().splitlines() 12 | 13 | liwc_json = {} 14 | for i in range(1, 65): 15 | print i 16 | with open('dictionary/{}.csv'.format(i), 'r') as f: 17 | words = f.read().splitlines() 18 | for word in words: 19 | #print word 20 | #word = word.encode('utf-8') 21 | try : 22 | liwc_json[word][i-1] += 1 23 | except KeyError: 24 | liwc_json[word] = [0]*64 25 | liwc_json[word][i-1] = 1 26 | 27 | print liwc_json 28 | with open('liwc.json', 'w') as f: 29 | json.dump(liwc_json, f)""" 30 | 31 | class liwc_score(): 32 | def __init__(self): 33 | with open('liwc_dict.json', 'r') as f: 34 | self.liwc = json.load(f) 35 | #print self.liwc 36 | self.stemmer = LancasterStemmer() 37 | self.lemmatizer = WordNetLemmatizer() 38 | self.tokenizer = RegexpTokenizer(r'\w+') 39 | self.stop_words = set(stopwords.words('english')) 40 | 41 | #Remove stopwords ? 
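    # Descriptive note on how the score is computed below: the input text is tokenized and
    # stemmed, each stem found in the LIWC dictionary contributes its 64-dimensional category
    # indicator vector weighted by the stem's frequency, and the weighted sum is divided by
    # the count of matched tokens, giving an average LIWC category profile for the text.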
42 | def token_stemmer(self, inp_sentence, remove_stop): 43 | token_list = self.tokenizer.tokenize(inp_sentence) 44 | if remove_stop : 45 | token_list = [word for word in token_list if word not in self.stop_words] 46 | stem_list = [self.stemmer.stem(word) for word in token_list] 47 | #stem_list = [self.lemmatizer.lemmatize(word) for word in token_list] 48 | 49 | return stem_list 50 | 51 | def get_liwc_score(self, inp_sentence, remove_stop = False): 52 | stem_list = self.token_stemmer(inp_sentence, remove_stop) 53 | #print stem_list 54 | fdist = dict(freq(stem_list)) 55 | 56 | liwc_vec = np.array([0.0]*64) 57 | count = 1.0 58 | #print fdist 59 | for word in fdist.keys(): 60 | try : 61 | #print self.liwc[word] 62 | liwc_vec += np.array( self.liwc[word] )*fdist[word] 63 | #print 'hello' 64 | count += fdist[word] 65 | #print liwc_vec 66 | except KeyError: 67 | 68 | pass 69 | 70 | return liwc_vec/count 71 | 72 | if __name__ == "__main__": 73 | with open('test', 'r') as f: 74 | inp_data = f.read() 75 | 76 | l = liwc_score() 77 | print l.get_liwc_score(inp_data) 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/process_imdb_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | def process_lang(string): 5 | string = string.split(",") 6 | string = [i.lstrip(' ').rstrip(' ') for i in string] 7 | return string 8 | 9 | with open('imdb_data.json', 'r') as f: 10 | raw_data = json.load(f) 11 | 12 | language = {'unique' : set(), 'list' : {} } 13 | imdb_rat = {} 14 | plot = {} 15 | rated = {'unique' : set(), 'list' : {} } 16 | 17 | mid2idx = {} 18 | with open('unique_movies', 'r') as f: 19 | data = f.read().splitlines() 20 | for i in data: 21 | i = i.split(',') 22 | mid2idx[i[0]] = int(i[1]) 23 | 24 | for mid in mid2idx.keys(): 25 | midx = mid2idx[mid] 26 | 27 | lang = process_lang(raw_data[mid]['Language']) 28 | language['list'][midx] = lang 29 | for i in lang: 30 | language['unique'].add(i) 31 | 32 | if raw_data[mid]['imdbRating'] != 'N/A': 33 | imdb_rat[midx] = float(raw_data[mid]['imdbRating']) 34 | else : 35 | imdb_rat[midx] = float(0.0) 36 | 37 | plot[midx] = raw_data[mid]['Plot'] 38 | 39 | rated['list'][midx] = raw_data[mid]['Rated'] 40 | rated['unique'].add(raw_data[mid]['Rated']) 41 | 42 | 43 | language['unique'] = dict( (item, i) for (i, item) in enumerate(list(language['unique']))) 44 | rated['unique'] = dict( (item, i) for (i, item) in enumerate(list(rated['unique']))) 45 | 46 | with open('language.json', 'w') as f: 47 | json.dump( language,f) 48 | 49 | with open('imdb_rat.json', 'w') as f: 50 | json.dump( imdb_rat ,f) 51 | 52 | with open('plot.json', 'w') as f: 53 | json.dump( plot,f) 54 | 55 | with open('rated.json', 'w') as f: 56 | json.dump( rated ,f) 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/process_plot.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import sys 3 | from liwc import liwc_score 4 | from vad import vad_score 5 | 6 | import json 7 | from nltk.stem.lancaster import LancasterStemmer 8 | from nltk import PorterStemmer 9 | from nltk.tokenize import RegexpTokenizer 10 | from nltk.corpus import stopwords 11 | from nltk import FreqDist as freq 12 | import numpy as np 13 | from nltk.stem import WordNetLemmatizer 14 | 15 | model = 
gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) 16 | model.save("word2vec.model") 17 | 18 | tokenizer = RegexpTokenizer(r'\w+') 19 | stop_words = set(stopwords.words('english')) 20 | lemmatizer = WordNetLemmatizer() 21 | ls = liwc_score() 22 | vs = vad_score() 23 | 24 | def token_stemmer(inp_sentence): 25 | token_list = tokenizer.tokenize(inp_sentence) 26 | token_list = [word for word in token_list if word not in stop_words] 27 | stem_list = [lemmatizer.lemmatize(word) for word in token_list] 28 | return stem_list 29 | 30 | def normalize(x): 31 | return (x - np.min(x))/(np.max(x) - np.min(x)) 32 | 33 | def get_word2vec(text): 34 | word_list = token_stemmer(text) 35 | vec = np.zeros(300) 36 | count = 1.0 37 | for word in word_list: 38 | try: 39 | vec+= (model[word]) 40 | count += 1.0 41 | #vec+= model[word] 42 | except: 43 | pass 44 | return vec/count 45 | 46 | 47 | with open('plot.json', 'r') as f: 48 | plot_data = json.load(f) 49 | 50 | w2vec_list = [] 51 | plot_feature = {} 52 | for mid in plot_data.keys(): 53 | text = plot_data[mid] 54 | 55 | w2vec = get_word2vec(text) 56 | w2vec_list.append(w2vec) 57 | 58 | vec = list(ls.get_liwc_score(text)) 59 | vec += list(vs.get_vad_score(text)) 60 | 61 | plot_feature[mid] = vec 62 | 63 | w2vec_list = np.array(w2vec_list) 64 | w2vec_list = (w2vec_list - np.min(w2vec_list))/(np.max(w2vec_list) - np.min(w2vec_list)) 65 | 66 | mids = plot_data.keys() 67 | 68 | for i in range(0, len(mids)): 69 | mid = mids[i] 70 | vec = list(w2vec_list[i]) 71 | plot_feature[mid] = plot_feature[mid] + vec 72 | 73 | with open('plot_feature.json', 'w') as f: 74 | json.dump(plot_feature, f) 75 | 76 | 77 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/vad.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.stem import WordNetLemmatizer 3 | from nltk.tokenize import RegexpTokenizer 4 | from nltk.corpus import stopwords 5 | from nltk import FreqDist as freq 6 | 7 | """with open('warriner.csv', 'r') as f: 8 | vad_raw = f.read().splitlines() 9 | 10 | vad_json = {} 11 | for i in vad_raw[1:]: 12 | i = i.split(",") 13 | item = {} 14 | item['v'] = float(i[2]) 15 | item['a'] = float(i[5]) 16 | item['d'] = float(i[8]) 17 | vad_json[i[1]] = item 18 | 19 | with open('warriner.json', 'w') as f: 20 | json.dump(vad_json, f)""" 21 | 22 | 23 | class vad_score(): 24 | def __init__(self): 25 | with open('warriner.json', 'r') as f: 26 | self.vad = json.load(f) 27 | self.tokenizer = RegexpTokenizer(r'\w+') 28 | self.stop_words = set(stopwords.words('english')) 29 | self.lemmatizer = WordNetLemmatizer() 30 | 31 | def token_stemmer(self, inp_sentence, remove_stop ): 32 | token_list = self.tokenizer.tokenize(inp_sentence) 33 | if remove_stop : 34 | token_list = [word for word in token_list if word not in self.stop_words] 35 | lemma_list = [self.lemmatizer.lemmatize(word) for word in token_list] 36 | 37 | return lemma_list 38 | 39 | def get_vad_score(self, inp_sentence, remove_stop= False): 40 | token_list = self.token_stemmer(inp_sentence, remove_stop) 41 | fdist = dict(freq(token_list)) 42 | 43 | v_sum = 0.0 44 | a_sum = 0.0 45 | d_sum = 0.0 46 | count = 1.0 47 | 48 | for word in fdist.keys(): 49 | try: 50 | v_sum += self.vad[word]['v']*fdist[word] 51 | a_sum += self.vad[word]['a']*fdist[word] 52 | d_sum += self.vad[word]['d']*fdist[word] 53 | count += fdist[word] 54 | except KeyError: 55 | pass 56 | 57 | return 
(v_sum/count, a_sum/count, d_sum/count) 58 | 59 | 60 | if __name__ == "__main__": 61 | with open('test', 'r') as f: 62 | inp_data = f.read() 63 | 64 | l = vad_score() 65 | print l.get_vad_score(inp_data) 66 | 67 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Movie-VAE/Readme.md: -------------------------------------------------------------------------------- 1 | get movie embeddings to be given as input to hybrid models. 2 | 3 | run : 4 | 5 | vae_movie.py 6 | 7 | get_embeddings.py 8 | 9 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Movie-VAE/get_embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import math 5 | from keras.layers import Input, Dense, Lambda 6 | from keras.models import Model, load_model 7 | from keras import objectives 8 | from keras import backend as K 9 | 10 | batch_size=1 11 | original_dim=1128 12 | intermediate_dim=50 13 | latent_dim=3 14 | epsilon_std=1.0 15 | 16 | x=Input(batch_shape=(batch_size,original_dim)) 17 | h=Dense(intermediate_dim, activation='relu')(x) 18 | z_mean=Dense(latent_dim)(h) 19 | z_log_var=Dense(latent_dim)(h) 20 | 21 | def sampling(args): 22 | _mean,_log_var=args 23 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 24 | return _mean+K.exp(_log_var/2)*epsilon 25 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 26 | 27 | h_decoder=Dense(intermediate_dim, activation='relu') 28 | x_bar=Dense(original_dim,activation='sigmoid') 29 | h_decoded = h_decoder(z) 30 | x_decoded = x_bar(h_decoded) 31 | 32 | vae = Model(x, [x_decoded,z]) 33 | weightsPath = "mov_genomes.hdf5" 34 | vae.load_weights(weightsPath) 35 | 36 | x_test_matrix = np.load( open( "movie_genomes.npy", "rb" ) ) 37 | 38 | x_test_reconstructed = vae.predict(x_test_matrix, batch_size=batch_size) # float values per user 39 | 40 | with open('genome_embed.npy', 'wb') as f: 41 | np.save(f, np.array(x_test_reconstructed[1].tolist())) 42 | 43 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Movie-VAE/vae_movie.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from keras.layers import Input, Dense, Lambda 5 | from keras.models import Model 6 | from keras import objectives 7 | from keras import backend as K 8 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback 9 | 10 | 11 | batch_size=20 12 | original_dim=1128 13 | intermediate_dim=100 14 | latent_dim=2 15 | nb_epochs=20 16 | epsilon_std=1.0 17 | 18 | x=Input(batch_shape=(batch_size,original_dim)) 19 | h=Dense(intermediate_dim, activation='relu')(x) 20 | z_mean=Dense(latent_dim)(h) 21 | z_log_var=Dense(latent_dim)(h) 22 | 23 | def sampling(args): 24 | _mean,_log_var=args 25 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 26 | return _mean+K.exp(_log_var/2)*epsilon 27 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 28 | 29 | h_decoder=Dense(intermediate_dim, activation='relu') 30 | x_bar=Dense(original_dim,activation='sigmoid') 31 | h_decoded = h_decoder(z) 32 | x_decoded = x_bar(h_decoded) 33 | 34 | vae = Model(x, x_decoded) 35 | def vae_loss(x,x_bar): 36 | reconst_loss=original_dim*objectives.binary_crossentropy(x, x_bar) 
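    # Descriptive note: the total VAE objective is the (scaled) binary cross-entropy
    # reconstruction term above plus the KL divergence below, which pulls the approximate
    # posterior N(z_mean, exp(z_log_var)) towards the standard normal prior.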
37 | kl_loss=-0.5*K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) 38 | return reconst_loss + kl_loss 39 | 40 | vae.compile(optimizer='adam', loss=vae_loss) 41 | 42 | x_train = np.load( open( "movie_genomes.npy", "rb" ) ) 43 | 44 | def nn_batch_generator(x, y, batch_size, samples_per_epoch): 45 | number_of_batches = samples_per_epoch/batch_size 46 | counter=0 47 | shuffle_index = np.arange(np.shape(y)[0]) 48 | np.random.shuffle(shuffle_index) 49 | x = x[shuffle_index, :] 50 | y = y[shuffle_index, :] 51 | while 1: 52 | index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)] 53 | x_batch = x[index_batch,:] 54 | y_batch = y[index_batch,:] 55 | counter += 1 56 | yield (np.array(x_batch),np.array(y_batch)) 57 | if (counter >= number_of_batches): 58 | counter=0 59 | 60 | 61 | weightsPath = "./mov_genomes.hdf5" 62 | checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1) 63 | 64 | vae.fit_generator(nn_batch_generator(x_train, x_train, batch_size, 26620), samples_per_epoch=26620, nb_epoch=nb_epochs, callbacks=[checkpointer]) 65 | -------------------------------------------------------------------------------- /Hybrid/vae_imdb/Readme.md: -------------------------------------------------------------------------------- 1 | train hybrid-VAE : h-vae_imdb.py 2 | 3 | eval 1 : test_h-vae_eval1.py 4 | 5 | eval 2 : test_h-vae_eval2.py 6 | -------------------------------------------------------------------------------- /Hybrid/vae_imdb/h-vae_imdb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from keras.layers import Input, Dense, Lambda, merge, Embedding, Flatten, LSTM 5 | from keras.models import Model, Sequential 6 | from keras import objectives 7 | from keras import backend as K 8 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback 9 | import keras 10 | import tensorflow as tf 11 | import pdb 12 | 13 | 14 | batch_size=500 15 | original_dim=26621 16 | intermediate_dim=600 17 | latent_dim=200 18 | nb_epochs=15 19 | epsilon_std=1.0 20 | 21 | vocab_size = 26621 22 | embed_dim = 3 23 | seq_length = 26621 24 | 25 | 26 | with open('feature_embed.npy', 'rb') as f: 27 | embedding_matrix = np.load(f) 28 | embedding_matrix = np.append(np.array([[0.0, 0.0, 0.0]]) ,embedding_matrix, axis =0) 29 | 30 | x=Input(batch_shape=(batch_size,original_dim)) 31 | embedding_layer = Embedding(vocab_size+1, 3, weights=[embedding_matrix], input_length=seq_length, trainable=True) 32 | embed = embedding_layer(x) 33 | flat_embed = Flatten() 34 | embed = flat_embed(embed) 35 | h=Dense(intermediate_dim, activation='tanh')(embed) 36 | 37 | z_mean=Dense(latent_dim)(h) 38 | z_log_var=Dense(latent_dim)(h) 39 | def sampling(args): 40 | _mean,_log_var=args 41 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 42 | return _mean+K.exp(_log_var/2)*epsilon 43 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 44 | 45 | # decoder network 46 | h_decoder=Dense(intermediate_dim, activation='tanh') 47 | x_bar=Dense(original_dim, activation='softmax') 48 | h_decoded = h_decoder(z) 49 | x_decoded = x_bar(h_decoded) 50 | 51 | mul_inp = Input(batch_shape=(batch_size,original_dim)) 52 | x_decoded = merge([x_decoded, mul_inp], mode = 'mul') 53 | vae = Model([x, mul_inp], x_decoded) 54 | 55 | 56 | def vae_loss(x,x_bar): 57 | reconst_loss=original_dim*objectives.binary_crossentropy(x, x_bar) 58 | kl_loss=-0.5*K.sum(1 + z_log_var - 
K.square(z_mean) - K.exp(z_log_var), axis=-1) 59 | return reconst_loss + kl_loss 60 | 61 | vae.compile(optimizer='adam', loss=vae_loss) 62 | print(vae.summary()) 63 | 64 | x_train = pickle.load( open( "train_data.file", "rb" ) ) 65 | movie_indices = np.array([range(1,26622)]) 66 | movie_indices = np.repeat(movie_indices, batch_size, axis = 0) 67 | 68 | 69 | def nn_batch_generator(x, y, batch_size, samples_per_epoch): 70 | number_of_batches = samples_per_epoch/batch_size 71 | counter=0 72 | shuffle_index = np.arange(np.shape(y)[0]) 73 | np.random.shuffle(shuffle_index) 74 | x = x[shuffle_index, :] 75 | y = y[shuffle_index, :] 76 | while 1: 77 | index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)] 78 | x_batch = np.array(x[index_batch,:].todense()).astype(float) 79 | x_new_batch = x_batch*movie_indices 80 | 81 | counter += 1 82 | yield ([x_new_batch, x_batch], x_batch) 83 | if (counter >= number_of_batches): 84 | counter=0 85 | 86 | 87 | weightsPath = "weights_h-vae_imdb.hdf5" 88 | checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1) 89 | 90 | vae.fit_generator(nn_batch_generator(x_train, x_train, batch_size, 118000) , samples_per_epoch=118000, nb_epoch=nb_epochs, callbacks = [checkpointer]) 91 | -------------------------------------------------------------------------------- /Hybrid/vae_imdb/test_h-vae_eval1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import math 5 | from keras.layers import Input, Dense, Lambda, Embedding, Flatten, merge 6 | from keras.models import Model, load_model 7 | from keras import objectives 8 | from keras import backend as K 9 | 10 | batch_size=500 11 | original_dim=26621 12 | intermediate_dim=600 13 | latent_dim=200 14 | nb_epochs=15 15 | epsilon_std=1.0 16 | vocab_size = 26621 17 | embed_dim = 3 18 | seq_length = 26621 19 | 20 | 21 | with open('feature_embed_3dim.npy', 'rb') as f: 22 | embedding_matrix = np.load(f) 23 | embedding_matrix = np.append(np.array([[0.0, 0.0, 0.0]]) ,embedding_matrix, axis =0) 24 | 25 | x=Input(batch_shape=(batch_size,original_dim)) 26 | embedding_layer = Embedding(vocab_size+1, 3, weights=[embedding_matrix], input_length=seq_length, trainable=True) 27 | embed = embedding_layer(x) 28 | flat_embed = Flatten() 29 | embed = flat_embed(embed) 30 | h=Dense(intermediate_dim, activation='tanh')(embed) 31 | 32 | z_mean=Dense(latent_dim)(h) 33 | z_log_var=Dense(latent_dim)(h) 34 | 35 | def sampling(args): 36 | _mean,_log_var=args 37 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 38 | return _mean+K.exp(_log_var/2)*epsilon 39 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 40 | 41 | h_decoder=Dense(intermediate_dim, activation='tanh') 42 | x_bar=Dense(original_dim2, activation='softmax') 43 | h_decoded = h_decoder(z) 44 | x_decoded = x_bar(h_decoded) 45 | 46 | mul_inp = Input(batch_shape=(batch_size,original_dim)) 47 | x_decoded2 = merge([x_decoded, mul_inp], mode = 'mul') 48 | vae = Model([x, mul_inp], x_decoded) 49 | 50 | weightsPath = "weights_h-vae_imdb.hdf5" 51 | vae.load_weights(weightsPath) 52 | 53 | movie_indices = np.array([range(1,26622)]) 54 | movie_indices = np.repeat(movie_indices, batch_size, axis = 0) 55 | 56 | x_test_matrix = pickle.load( open( "test_data.file", "rb" ) ) 57 | x_test_matrix = x_test_matrix.todense() 58 | x_test = np.squeeze(np.asarray(x_test_matrix)) 59 | 60 | x_test_new = x_test*movie_indices 61 | 62 | 
x_test_reconstructed = vae.predict([x_test_new, x_test], batch_size=batch_size) # float values per user 63 | 64 | def recallatk(x_test, x_test_reconstructed, k): 65 | recall_values = [] 66 | total_recall = 0.0 67 | for i in range(len(x_test)): 68 | top_rated_movies_idx = [i for i, x in enumerate(x_test[i].tolist()) if x == 1.0] 69 | if len(top_rated_movies_idx) == 0: 70 | continue 71 | 72 | sorted_ratings = x_test_reconstructed[i].tolist() 73 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 74 | 75 | sum = 0.0 76 | for i in range(0, k): 77 | if top_predicted_movies_idx[i] in top_rated_movies_idx: 78 | sum+=1.0 79 | recall = sum/float(min(k, len(top_rated_movies_idx))) 80 | total_recall += recall 81 | recall_values.append(recall) 82 | return total_recall/float(len(recall_values)) 83 | 84 | def ndcgatk(x_test, x_test_reconstructed, k): 85 | ndcg_values = [] 86 | total_ndcg = 0.0 87 | best = 0.0 88 | for i in range(len(x_test)): 89 | top_rated_movies_idx = [i for i, x in enumerate(x_test[i].tolist()) if x == 1.0] 90 | 91 | if len(top_rated_movies_idx) == 0: 92 | continue 93 | sorted_ratings = x_test_reconstructed[i].tolist() 94 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 95 | sum_ndcg = 0 96 | for i in range(0, k): 97 | if top_predicted_movies_idx[i] in top_rated_movies_idx: 98 | ndcg = 1/(math.log(i+2)) 99 | else: 100 | ndcg = 0 101 | sum_ndcg += ndcg 102 | 103 | total_ndcg += sum_ndcg 104 | ndcg_values.append(sum_ndcg) 105 | 106 | ndcg_values = np.array(ndcg_values) 107 | max_ndcg = ndcg_values.max() 108 | ndcg_values = ndcg_values / max_ndcg 109 | total_ndcg = np.sum(ndcg_values) 110 | 111 | return total_ndcg/float(len(ndcg_values)) 112 | 113 | print("NDCG at 100: ", ndcgatk(x_test, x_test_reconstructed, 100)) 114 | print("recall at 20: ", recallatk(x_test, x_test_reconstructed, 20)) 115 | print("recall at 50: ", recallatk(x_test, x_test_reconstructed, 50)) 116 | -------------------------------------------------------------------------------- /Hybrid/vae_imdb/test_h-vae_eval2.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import pickle 4 | import os 5 | import math 6 | from keras.layers import Input, Dense, Lambda, Embedding, Flatten 7 | from keras.models import Model, load_model 8 | from keras import objectives 9 | from keras import backend as K 10 | 11 | batch_size=500 12 | original_dim=26621 13 | intermediate_dim=600 14 | latent_dim=200 15 | 16 | epsilon_std=1.0 17 | 18 | vocab_size = 26621 19 | embed_dim = 3 20 | seq_length = 26621 21 | x_test_size = 10000 22 | 23 | with open('feature_embed_3dim.npy', 'rb') as f: 24 | embedding_matrix = np.load(f) 25 | embedding_matrix = np.append(np.array([[0.0, 0.0, 0.0]]) ,embedding_matrix, axis =0) 26 | 27 | 28 | x=Input(batch_shape=(batch_size,original_dim)) 29 | embedding_layer = Embedding(vocab_size+1, 3, weights=[embedding_matrix], input_length=seq_length, trainable=True) 30 | embed = embedding_layer(x) 31 | flat_embed = Flatten() 32 | embed = flat_embed(embed) 33 | h=Dense(intermediate_dim, activation='tanh')(embed) 34 | 35 | z_mean=Dense(latent_dim)(h) 36 | z_log_var=Dense(latent_dim)(h) 37 | 38 | 39 | def sampling(args): 40 | _mean,_log_var=args 41 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 42 | return _mean+K.exp(_log_var/2)*epsilon 43 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, 
z_log_var]) 44 | 45 | h_decoder=Dense(intermediate_dim, activation='tanh') 46 | x_bar=Dense(original_dim2, activation='softmax') 47 | h_decoded = h_decoder(z) 48 | x_decoded = x_bar(h_decoded) 49 | 50 | mul_inp = Input(batch_shape=(batch_size,original_dim)) 51 | x_decoded2 = merge([x_decoded, mul_inp], mode = 'mul') 52 | 53 | vae = Model([x, mul_inp], x_decoded) 54 | 55 | weightsPath = "weights_h-vae_imdb.hdf5" 56 | vae.load_weights(weightsPath) 57 | 58 | 59 | 60 | movie_indices = np.array([range(1,26622)]) 61 | movie_indices = np.repeat(movie_indices, batch_size, axis = 0) 62 | 63 | x_test_matrix = pickle.load( open( "test_data.file", "rb" ) ) 64 | x_test_matrix = x_test_matrix.todense() 65 | x_test = np.squeeze(np.asarray(x_test_matrix)) 66 | 67 | x_test_new_list = [] 68 | x_test_fold_out_indices = [] 69 | 70 | for i in range(x_test_size): 71 | user_i_features = x_test[i] 72 | one_indices = np.argwhere(user_i_features > 0.0) 73 | number_of_one_indices = one_indices.shape[0] 74 | fold_out_number = int(0.2*number_of_one_indices) 75 | 76 | fold_out_indices = random.sample(one_indices.tolist(), fold_out_number) 77 | x_test_fold_out_indices.append(fold_out_indices) 78 | 79 | np.put(user_i_features, fold_out_indices, np.zeros(fold_out_number)) 80 | x_test_new_list.append(user_i_features) 81 | 82 | x_test_new_list = np.array(x_test_new_list) 83 | x_test_new_list = x_test_new_list * movie_indices 84 | x_test_reconstructed = vae.predict([x_test_new_list, x_test], batch_size=batch_size) 85 | 86 | 87 | def recallatk(x_test, x_test_fold_out_indices, x_test_reconstructed, k): 88 | recall_values = [] 89 | total_recall = 0.0 90 | for i in range(len(x_test)): 91 | if len(x_test_fold_out_indices[i]) == 0: # if this user hadn't rated any movie as 1 92 | continue 93 | 94 | item_list = [item for sublist in x_test_fold_out_indices[i] for item in sublist] 95 | 96 | sorted_ratings = x_test_reconstructed[i].tolist() 97 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 98 | 99 | sum = 0.0 100 | for j in range(0, k): 101 | if top_predicted_movies_idx[j] in item_list: 102 | sum+=1.0 103 | recall = sum/float(min(k, len(x_test_fold_out_indices[i]))) 104 | total_recall += recall 105 | recall_values.append(recall) 106 | return total_recall/float(len(recall_values)) 107 | 108 | def ndcgatk(x_test, x_test_fold_out_indices, x_test_reconstructed, k): 109 | ndcg_values = [] 110 | total_ndcg = 0.0 111 | best = 0.0 112 | for i in range(len(x_test)): 113 | if len(x_test_fold_out_indices[i]) == 0: 114 | continue 115 | 116 | sorted_ratings = x_test_reconstructed[i].tolist() 117 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 118 | sum_ndcg = 0 119 | item_list = [item for sublist in x_test_fold_out_indices[i] for item in sublist] 120 | for j in range(0, k): 121 | if top_predicted_movies_idx[j] in item_list: 122 | ndcg = 1/(math.log(j+2)) 123 | else: 124 | ndcg = 0 125 | sum_ndcg += ndcg 126 | total_ndcg += sum_ndcg 127 | ndcg_values.append(sum_ndcg) 128 | 129 | ndcg_values = np.array(ndcg_values) 130 | max_ndcg = ndcg_values.max() 131 | ndcg_values = ndcg_values / max_ndcg 132 | total_ndcg = np.sum(ndcg_values) 133 | 134 | return total_ndcg/float(len(ndcg_values)) 135 | 136 | print("NDCG at 100: ", ndcgatk(x_test, x_test_fold_out_indices, x_test_reconstructed, 100)) 137 | 138 | print("recall at 20: ", recallatk(x_test, x_test_fold_out_indices, x_test_reconstructed, 20)) 139 | 140 | print("recall at 50: ", recallatk(x_test, 
x_test_fold_out_indices, x_test_reconstructed, 50)) 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Variational-Autoencoders-Collaborative-Filtering 2 | Please cite our paper, [A Hybrid Variational Autoencoder for Collaborative Filtering](https://arxiv.org/abs/1808.01006), if you find this repository helpful. 3 | 4 | This repository contains the code implementing variational autoencoders (VAE) for collaborative filtering (CF) on MovieLens data and Spotify's Million Playlist Dataset (MPD). 5 | 6 | **Link to MovieLens data**: http://files.grouplens.org/datasets/movielens/ml-20m.zip 7 | For the MovieLens dataset, we could not use the ratings.csv file directly because it contains some movies that IMDb does not recognize, so we created new_ratings.csv. The code for this filtering is in update_ratings.py. 8 | 9 | **Million Playlist Dataset** - official website hosted at https://recsys-challenge.spotify.com/ 10 | One needs to register on the website and download the training data and the test data (challenge set) as part of the RecSys 2018 playlist completion challenge. 11 | 12 | **The folder ./Hybrid** contains the code for the implementation of our proposed hybrid VAE model. 13 | 14 | **The folder ./Standard** contains the code for the implementation of the standard VAE model. 15 | 16 | **The folder ./Spotify** contains the code used for the playlist completion challenge, covering data preprocessing, training, and generating predictions. 17 | 18 | Please look into each folder to read more about the files used for that specific implementation. 19 | -------------------------------------------------------------------------------- /Spotify/README.md: -------------------------------------------------------------------------------- 1 | 2 | Variational autoencoder code adapted for the task of playlist completion/song recommendation on the Spotify Million Playlist Dataset (MPD). 3 | 4 | - **read_mpd.py** is used to create the training and validation files from the Spotify RecSys MPD dataset. It takes trackcount.file as input, which stores the 125,000 most popular tracks from the MPD; this reduces the size of the input data. It creates train_data.file and val_data.file using a sparse matrix representation, while also creating files that store the raw rows and columns for both datasets. In addition, it creates track_dict.file, the dictionary that maintains the track_uri-to-matrix-index mapping. 5 | 6 | - **read_challenge.py** also uses trackcount.file and track_dict.file to build a sparse matrix representation of the challenge dataset provided by Spotify, producing test_data.file. 7 | 8 | - **vae_cf_spotify.py** is used to create the VAE network, compile it and train it on the training data. This code saves the model's weights in the specified location, and also logs the training and validation losses for analysis purposes. 9 | 10 | - **run_vae_on_test_playlists.py** reads in test_data.file (which has the 10,000 playlists given in the challenge set), loads the saved model weights, and predicts on these playlists. Since ours is a VAE architecture, the reconstructed playlist representation (the x_test_reconstructed variable) is then saved using pickle. This saved file is used to finally generate song predictions for each of these 10,000 playlists. 
11 | 12 | - **generate_song_predictions.py** file uses the output from the **run_vae_on_test_playlists.py** along with track_dict.file and the test_data.file to create a list of 500 predicted tracks for each playlist. It sorts the input and filters tracks with highest probabilities for each playlist, eliminates tracks that already belong to the playlist, and stores them in predictions.file. 13 | 14 | - **generate_csv.py** file takes the output from **generate_song_predictions.py** and creates a CSV in the format acceptable by the Spotify RecSys Challenge, suitable for submission. 15 | -------------------------------------------------------------------------------- /Spotify/generate_song_predictions.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import json 3 | 4 | x_test_size = 10000 5 | first_half = 5000 6 | 7 | print("started") 8 | x_test_reconstructed = pickle.load(open("x_test_reconstructed.file", "rb")) 9 | print(x_test_reconstructed.shape) 10 | 11 | tracks = pickle.load(open("track_dict.file", "rb")) 12 | test_data = pickle.load(open("test_data.file", "rb")) 13 | 14 | challenge = json.load(open("challenge_set.json")) 15 | playlists = challenge['playlists'] 16 | predictions = {} 17 | 18 | # TODO: change the range to (first_half, x_test_size) 19 | for i in range(0, first_half): 20 | print("Reading playlist: " + str(i)) 21 | sorted_probabilities = x_test_reconstructed[i].tolist() 22 | 23 | # we can pick top 1000 per say, disregard the ones (get track id from array index using track_dict) which were already there using SongCheck.py 24 | #and get the track ids of the remaining top ones using array index and track_dicts 25 | 26 | pred_size = 0 27 | current_prediction = [] 28 | # get top 700 playlists - enough to find top 500 tracks 29 | top_predicted_movies_idx = (sorted(range(len(sorted_probabilities)), key=lambda i: sorted_probabilities[i])[-700:]) 30 | # reverse since it stores index in ascending order within top 700 31 | top_predicted_movies_idx.reverse() 32 | 33 | for j in top_predicted_movies_idx: 34 | if (test_data[i, j] == 1.0): 35 | track_uri = False 36 | else: 37 | track_uri = (list(tracks.keys())[list(tracks.values()).index(j)]) 38 | 39 | if (track_uri): 40 | current_prediction.append(track_uri) 41 | pred_size += 1 42 | if (pred_size == 500): 43 | break 44 | # print(current_prediction) 45 | pid = playlists[i]['pid'] 46 | predictions[pid] = current_prediction 47 | # print(predictions) 48 | 49 | # TODO: change this to predictions2.file 50 | pickle.dump(predictions, open("predictions1.file", "wb")) -------------------------------------------------------------------------------- /Spotify/read_challenge.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import numpy as np 4 | import pickle 5 | from scipy import sparse 6 | 7 | number_of_songs = 125000 8 | 9 | rows = [] 10 | cols = [] 11 | 12 | json_data = json.load(open("challenge/challenge_set.json")) 13 | playlists = json_data['playlists'] 14 | 15 | tracks = pickle.load(open("track_dict.file", "rb")) 16 | trackfile = pickle.load(open("trackcount.file", "rb")) 17 | 18 | user_count = 0 19 | 20 | for playlist in playlists: 21 | track_list = [] 22 | for track in playlist['tracks']: 23 | if track['track_uri'] in trackfile: 24 | track_list.append(tracks[track["track_uri"]]) 25 | rows.extend(user_count for i in range(len(track_list))) 26 | cols.extend(track_list) 27 | user_count += 1 28 | 29 | test_data 
= sparse.csr_matrix((np.ones_like(rows),(np.array(rows), np.array(cols))), dtype='float64', shape=(10000, number_of_songs)) 30 | pickle.dump(test_data, open("test_data.file", "wb")) 31 | 32 | -------------------------------------------------------------------------------- /Spotify/read_mpd.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import numpy as np 4 | import random 5 | import pickle 6 | from scipy import sparse 7 | 8 | # number_of_songs = 2262292 9 | number_of_songs = 125000 10 | 11 | track_count = 0 12 | tracks = {} 13 | 14 | file_count = 0 15 | validation = 0 16 | playlist_count = 0 17 | 18 | rows = [] 19 | cols = [] 20 | valid_rows = [] 21 | valid_cols = [] 22 | 23 | trackfile = pickle.load(open("trackcount.file", "rb")) 24 | 25 | for i in range(0, 1000): 26 | print("----FILE------ " + str(i)) 27 | filename = 'mpd/data/mpd.slice.' + str(file_count) + '-' + str(file_count + 999) + '.json' 28 | json_data = json.load(open(filename)) 29 | 30 | playlists = json_data["playlists"] 31 | 32 | choice = False 33 | if validation < 10000: 34 | choice = random.choice([True, False]) 35 | # print(choice) 36 | 37 | for playlist in playlists: 38 | track_list = [] 39 | for track in playlist['tracks']: 40 | if track['track_uri'] in trackfile: 41 | if track['track_uri'] not in tracks: 42 | tracks[track['track_uri']] = track_count 43 | track_count += 1 44 | track_list.append(tracks[track["track_uri"]]) 45 | 46 | if (choice): 47 | valid_rows.extend(validation for i in range(len(track_list))) 48 | valid_cols.extend(track_list) 49 | validation += 1 50 | else: 51 | rows.extend(playlist_count for i in range(len(track_list))) 52 | # print(rows) 53 | cols.extend(track_list) 54 | playlist_count += 1 55 | 56 | # if (i == 2): 57 | # break 58 | 59 | file_count += 1000 60 | 61 | pickle.dump(rows, open("rows.file", "wb")) 62 | pickle.dump(cols, open("cols.file", "wb")) 63 | 64 | pickle.dump(tracks, open("track_dict.file", "wb")) 65 | 66 | pickle.dump(valid_rows, open("valid_rows.file", "wb")) 67 | pickle.dump(valid_cols, open("valid_cols.file", "wb")) 68 | 69 | train_data = sparse.csr_matrix((np.ones_like(rows),(np.array(rows), np.array(cols))), dtype='float64', shape=(990000, number_of_songs)) 70 | pickle.dump(train_data, open("train_data.file", "wb")) 71 | 72 | val_data = sparse.csr_matrix((np.ones_like(valid_rows),(np.array(valid_rows), np.array(valid_cols))), dtype='float64', shape=(10000, number_of_songs)) 73 | pickle.dump(val_data, open("val_data.file", "wb")) 74 | 75 | -------------------------------------------------------------------------------- /Spotify/run_vae_on_test_playlists.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import math 5 | from keras.layers import Input, Dense, Lambda 6 | from keras.models import Model, load_model 7 | from keras import objectives 8 | from keras import backend as K 9 | 10 | 11 | # encoder/decoder network size 12 | batch_size=500 13 | original_dim=125000 # number of movies 14 | intermediate_dim=600 15 | latent_dim=200 16 | epsilon_std=1.0 17 | 18 | # encoder network 19 | x=Input(batch_shape=(batch_size,original_dim)) 20 | h=Dense(intermediate_dim, activation='tanh')(x) 21 | z_mean=Dense(latent_dim)(h) 22 | z_log_var=Dense(latent_dim)(h) 23 | 24 | 25 | # sampling from latent dimension for decoder/generative part of network 26 | def sampling(args): 27 | _mean,_log_var=args 28 | # does this mean we 
are modelling this is as a gaussian and not multinomial? 29 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 30 | return _mean+K.exp(_log_var/2)*epsilon 31 | 32 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 33 | 34 | # decoder network 35 | h_decoder=Dense(intermediate_dim, activation='tanh') 36 | x_bar=Dense(original_dim,activation='softmax') # this should be softmax right? 37 | h_decoded = h_decoder(z) 38 | x_decoded = x_bar(h_decoded) 39 | 40 | vae = Model(x, x_decoded) 41 | weightsPath = "./tmp/weights.hdf5" 42 | vae.load_weights(weightsPath) 43 | 44 | x_test_matrix = pickle.load( open( "test_data.file", "rb" ) ) 45 | print("number of playlists in test data", x_test_matrix.shape[0]) 46 | print("number of songs in test playlists", x_test_matrix.shape[1]) 47 | 48 | 49 | def nn_batch_generator(x, batch_size, samples_per_epoch): 50 | number_of_batches = samples_per_epoch/batch_size 51 | shuffle_index = np.arange(np.shape(x)[0]) 52 | counter=0 53 | while 1: 54 | index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)] 55 | x_batch = x[index_batch,:].todense() 56 | counter += 1 57 | yield (np.array(x_batch)) 58 | if (counter >= number_of_batches): 59 | counter=0 60 | 61 | 62 | x_test_reconstructed = vae.predict_generator(generator=nn_batch_generator(x_test_matrix, batch_size, 10000), val_samples=x_test_matrix.shape[0]) 63 | print(type(x_test_reconstructed)) 64 | print(len(x_test_reconstructed)) 65 | print(x_test_reconstructed[0]) 66 | pickle.dump(x_test_reconstructed, open("x_test_reconstructed.file", "wb"), protocol=4) 67 | -------------------------------------------------------------------------------- /Spotify/vae_cf_spotify.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from keras.layers import Input, Dense, Lambda 5 | from keras.models import Model 6 | from keras import objectives 7 | from keras import backend as K 8 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback, EarlyStopping 9 | 10 | # encoder/decoder network size 11 | batch_size=500 12 | original_dim = 125000 # number of filtered songs (songs appearing in less than 46 playlists) 13 | intermediate_dim=600 14 | latent_dim=200 15 | nb_epochs=50 16 | epsilon_std=1.0 17 | 18 | class LossHistory(Callback): 19 | def on_train_begin(self, logs={}): 20 | self.losses = [] 21 | self.val_losses = [] 22 | 23 | def on_epoch_end(self, epoch, logs={}): 24 | self.losses.append(logs.get('loss')) 25 | self.val_losses.append(logs.get('val_loss')) 26 | 27 | history = LossHistory() 28 | 29 | # encoder network 30 | x=Input(batch_shape=(batch_size,original_dim)) 31 | h=Dense(intermediate_dim, activation='tanh')(x) 32 | z_mean=Dense(latent_dim)(h) 33 | z_log_var=Dense(latent_dim)(h) 34 | 35 | 36 | # sampling from latent dimension for decoder/generative part of network 37 | def sampling(args): 38 | _mean,_log_var=args 39 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 40 | return _mean+K.exp(_log_var/2)*epsilon 41 | 42 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 43 | 44 | # decoder network 45 | h_decoder=Dense(intermediate_dim, activation='tanh') 46 | x_bar=Dense(original_dim,activation='softmax') # this should be softmax right? 
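# Descriptive note: the decoder maps the sampled latent code z back to a vector over all
# original_dim tracks; with the softmax activation above, each reconstructed playlist is a
# probability distribution across the 125,000 filtered tracks, from which top tracks are picked.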
47 | h_decoded = h_decoder(z) 48 | x_decoded = x_bar(h_decoded) 49 | 50 | # build and compile model 51 | vae = Model(x, x_decoded) 52 | def vae_loss(x,x_bar): 53 | reconst_loss=original_dim*objectives.binary_crossentropy(x, x_bar) 54 | kl_loss=-0.5*K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) 55 | return reconst_loss + kl_loss 56 | 57 | vae.compile(optimizer='adam', loss=vae_loss) 58 | 59 | x_train = pickle.load( open( "train_data.file", "rb" ) ) 60 | print("number of training playlists: ", x_train.shape[0]) 61 | print("number of songs after filtering: ", x_train.shape[1]) 62 | 63 | x_val = pickle.load( open( "val_data.file", "rb" ) ) 64 | print("number of validation playlists: ", x_val.shape[0]) 65 | print("number of songs in validation playlists: ", x_val.shape[1]) 66 | 67 | 68 | def nn_batch_generator(x, y, batch_size, samples_per_epoch): 69 | number_of_batches = samples_per_epoch/batch_size 70 | counter=0 71 | shuffle_index = np.arange(np.shape(y)[0]) 72 | np.random.shuffle(shuffle_index) 73 | x = x[shuffle_index, :] 74 | y = y[shuffle_index, :] 75 | while 1: 76 | index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)] 77 | x_batch = x[index_batch,:].todense() 78 | y_batch = y[index_batch,:].todense() 79 | counter += 1 80 | yield (np.array(x_batch),np.array(y_batch)) 81 | if (counter >= number_of_batches): 82 | counter=0 83 | 84 | 85 | weightsPath = "./tmp/weights.hdf5" 86 | checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1, save_best_only=True) 87 | earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min') 88 | 89 | # original size of training data = 9,90,000 90 | # sending only 1,00,000 playlists in each epoch and shuffling before every epoch so that each playlist is seen in the training 91 | 92 | vae.fit_generator(nn_batch_generator(x_train, x_train, batch_size, 100000), samples_per_epoch=100000, nb_epoch=nb_epochs, 93 | validation_data=nn_batch_generator(x_val, x_val, batch_size, 10000), nb_val_samples=10000, callbacks=[checkpointer, earlyStopping, history]) 94 | 95 | 96 | pickle.dump(history.losses, open('train_losses.file', 'wb')) 97 | pickle.dump(history.val_losses, open('val_losses.file', 'wb')) 98 | -------------------------------------------------------------------------------- /Standard/evaluate_model_approach_1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import math 5 | from keras.layers import Input, Dense, Lambda 6 | from keras.models import Model, load_model 7 | from keras import objectives 8 | from keras import backend as K 9 | 10 | 11 | # def measure_performance(x, x_bar): 12 | # return 13 | 14 | 15 | # encoder/decoder network size 16 | batch_size=500 17 | original_dim=26621 # number of movies 18 | intermediate_dim=600 19 | latent_dim=200 20 | epsilon_std=1.0 21 | 22 | # activation used is tanh 23 | # softmax activation is used at the final dense layer which produces x_reconstructed 24 | 25 | # encoder network 26 | x=Input(batch_shape=(batch_size,original_dim)) 27 | h=Dense(intermediate_dim, activation='tanh')(x) 28 | z_mean=Dense(latent_dim)(h) 29 | z_log_var=Dense(latent_dim)(h) 30 | 31 | 32 | # sampling from latent dimension for decoder/generative part of network 33 | def sampling(args): 34 | _mean,_log_var=args 35 | # does this mean we are modelling this is as a gaussian and not multinomial? 
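    # Descriptive note: this is the usual reparameterization trick -- epsilon is drawn from
    # N(0, I) and transformed as z = z_mean + exp(z_log_var / 2) * epsilon, so the sample
    # stays differentiable with respect to z_mean and z_log_var.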
36 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 37 | return _mean+K.exp(_log_var/2)*epsilon 38 | 39 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 40 | 41 | # decoder network 42 | h_decoder=Dense(intermediate_dim, activation='tanh') 43 | x_bar=Dense(original_dim,activation='softmax') # this should be softmax right? 44 | h_decoded = h_decoder(z) 45 | x_decoded = x_bar(h_decoded) 46 | 47 | vae = Model(x, x_decoded) 48 | weightsPath = "./hybrid/weights_org.hdf5" 49 | vae.load_weights(weightsPath) 50 | 51 | x_test_matrix = pickle.load( open( "test_data.file", "rb" ) ) 52 | x_test_matrix = x_test_matrix.todense() # 1s and 0s per user 53 | x_test = np.squeeze(np.asarray(x_test_matrix)) 54 | 55 | x_test_reconstructed = vae.predict(x_test, batch_size=batch_size) # float values per user 56 | 57 | 58 | # no concept of held out items in the test set, calculating overall 59 | def recallatk(x_test, x_test_reconstructed, k): 60 | recall_values = [] 61 | total_recall = 0.0 62 | for i in range(len(x_test)): 63 | top_rated_movies_idx = [i for i, x in enumerate(x_test[i].tolist()) if x == 1.0] 64 | 65 | if len(top_rated_movies_idx) == 0: 66 | #print("test user has no 1 rated movies: ", i) 67 | continue 68 | 69 | sorted_ratings = x_test_reconstructed[i].tolist() 70 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 71 | 72 | sum = 0.0 73 | for i in range(0, k): 74 | if top_predicted_movies_idx[i] in top_rated_movies_idx: 75 | sum+=1.0 76 | recall = sum/float(min(k, len(top_rated_movies_idx))) 77 | total_recall += recall 78 | recall_values.append(recall) 79 | return total_recall/float(len(recall_values)) 80 | 81 | def ndcgatk(x_test, x_test_reconstructed, k): 82 | ndcg_values = [] 83 | total_ndcg = 0.0 84 | best = 0.0 85 | for i in range(len(x_test)): 86 | top_rated_movies_idx = [i for i, x in enumerate(x_test[i].tolist()) if x == 1.0] 87 | 88 | if len(top_rated_movies_idx) == 0: 89 | #print("test user has no 1 rated movies: ", i) 90 | continue 91 | sorted_ratings = x_test_reconstructed[i].tolist() 92 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 93 | sum_ndcg = 0 94 | for i in range(0, k): 95 | if top_predicted_movies_idx[i] in top_rated_movies_idx: 96 | ndcg = 1/(math.log(i+2)) 97 | else: 98 | ndcg = 0 99 | sum_ndcg += ndcg 100 | 101 | total_ndcg += sum_ndcg 102 | ndcg_values.append(sum_ndcg) 103 | 104 | ndcg_values = np.array(ndcg_values) 105 | max_ndcg = ndcg_values.max() 106 | ndcg_values = ndcg_values / max_ndcg 107 | total_ndcg = np.sum(ndcg_values) 108 | 109 | return total_ndcg/float(len(ndcg_values)) 110 | 111 | print("NDCG at 100: ", ndcgatk(x_test, x_test_reconstructed, 100)) 112 | 113 | #recall at 20: 0.542023083825468 114 | print("recall at 20: ", recallatk(x_test, x_test_reconstructed, 20)) 115 | 116 | #recall at 50: 0.5759154842447732 117 | print("recall at 50: ", recallatk(x_test, x_test_reconstructed, 50)) 118 | -------------------------------------------------------------------------------- /Standard/evaluate_model_approach_2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import math 5 | import random 6 | from keras.layers import Input, Dense, Lambda 7 | from keras.models import Model, load_model 8 | from keras import objectives 9 | from keras import backend as K 10 | 11 | 12 | batch_size=500 13 | original_dim=26621 14 | 
intermediate_dim=600 15 | latent_dim=200 16 | epsilon_std=1.0 17 | x_test_size = 10000 18 | 19 | x=Input(batch_shape=(batch_size,original_dim)) 20 | h=Dense(intermediate_dim, activation='tanh')(x) 21 | z_mean=Dense(latent_dim)(h) 22 | z_log_var=Dense(latent_dim)(h) 23 | 24 | def sampling(args): 25 | _mean,_log_var=args 26 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 27 | return _mean+K.exp(_log_var/2)*epsilon 28 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 29 | 30 | h_decoder=Dense(intermediate_dim, activation='tanh') 31 | x_bar=Dense(original_dim,activation='softmax') 32 | h_decoded = h_decoder(z) 33 | x_decoded = x_bar(h_decoded) 34 | 35 | vae = Model(x, x_decoded) 36 | weightsPath = "./weights.hdf5" 37 | vae.load_weights(weightsPath) 38 | 39 | x_test_matrix = pickle.load( open( "test_data.file", "rb" ) ) 40 | x_test_matrix = x_test_matrix.todense() 41 | x_test = np.squeeze(np.asarray(x_test_matrix)) 42 | 43 | 44 | x_test_new_list = [] 45 | x_test_fold_out_indices = [] 46 | 47 | 48 | for i in range(x_test_size): 49 | user_i_features = x_test[i] 50 | one_indices = np.argwhere(user_i_features > 0.0) 51 | number_of_one_indices = one_indices.shape[0] 52 | fold_out_number = int(0.2*number_of_one_indices) 53 | 54 | fold_out_indices = random.sample(one_indices.tolist(), fold_out_number) 55 | x_test_fold_out_indices.append(fold_out_indices) 56 | 57 | np.put(user_i_features, fold_out_indices, np.zeros(fold_out_number)) 58 | x_test_new_list.append(user_i_features) 59 | #print(i) 60 | 61 | x_test_reconstructed = vae.predict(np.asarray(x_test_new_list), batch_size=batch_size) 62 | 63 | 64 | def recallatk(x_test, x_test_fold_out_indices, x_test_reconstructed, k): 65 | recall_values = [] 66 | total_recall = 0.0 67 | for i in range(len(x_test)): 68 | if len(x_test_fold_out_indices[i]) == 0: # if this user hadn't rated any movie as 1 69 | continue 70 | 71 | item_list = [item for sublist in x_test_fold_out_indices[i] for item in sublist] 72 | 73 | sorted_ratings = x_test_reconstructed[i].tolist() 74 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 75 | 76 | sum = 0.0 77 | for j in range(0, k): 78 | if top_predicted_movies_idx[j] in item_list: 79 | sum+=1.0 80 | recall = sum/float(min(k, len(x_test_fold_out_indices[i]))) 81 | total_recall += recall 82 | recall_values.append(recall) 83 | return total_recall/float(len(recall_values)) 84 | 85 | def ndcgatk(x_test, x_test_fold_out_indices, x_test_reconstructed, k): 86 | ndcg_values = [] 87 | total_ndcg = 0.0 88 | best = 0.0 89 | for i in range(len(x_test)): 90 | if len(x_test_fold_out_indices[i]) == 0: 91 | continue 92 | 93 | sorted_ratings = x_test_reconstructed[i].tolist() 94 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 95 | sum_ndcg = 0 96 | item_list = [item for sublist in x_test_fold_out_indices[i] for item in sublist] 97 | for j in range(0, k): 98 | if top_predicted_movies_idx[j] in item_list: 99 | ndcg = 1/(math.log(j+2)) 100 | else: 101 | ndcg = 0 102 | sum_ndcg += ndcg 103 | total_ndcg += sum_ndcg 104 | ndcg_values.append(sum_ndcg) 105 | 106 | ndcg_values = np.array(ndcg_values) 107 | max_ndcg = ndcg_values.max() 108 | ndcg_values = ndcg_values / max_ndcg 109 | total_ndcg = np.sum(ndcg_values) 110 | 111 | return total_ndcg/float(len(ndcg_values)) 112 | 113 | print("NDCG at 100: ", ndcgatk(x_test, x_test_fold_out_indices, x_test_reconstructed, 100)) 114 | 115 | 
print("recall at 20: ", recallatk(x_test, x_test_fold_out_indices, x_test_reconstructed, 20)) 116 | 117 | print("recall at 50: ", recallatk(x_test, x_test_fold_out_indices, x_test_reconstructed, 50)) -------------------------------------------------------------------------------- /Standard/plot_loss_graphs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "Using TensorFlow backend.\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | " #%matplotlib inline\n", 20 | "import numpy as np\n", 21 | "import pickle\n", 22 | "import os\n", 23 | "#from matplotlib import pyplot as plt\n", 24 | "from keras.layers import Input, Dense, Lambda\n", 25 | "from keras.models import Model\n", 26 | "from keras import objectives\n", 27 | "from keras import backend as K\n", 28 | "from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback\n", 29 | "#from IPython.display import clear_output" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "class PlotLosses(Callback):\n", 41 | " def on_train_begin(self, logs={}):\n", 42 | " self.i = 0\n", 43 | " self.x = []\n", 44 | " self.losses = []\n", 45 | " self.val_losses = []\n", 46 | " \n", 47 | " self.fig = plt.figure()\n", 48 | " \n", 49 | " self.logs = []\n", 50 | "\n", 51 | " def on_epoch_end(self, epoch, logs={}):\n", 52 | " \n", 53 | " self.logs.append(logs)\n", 54 | " self.x.append(self.i)\n", 55 | " self.losses.append(logs.get('loss'))\n", 56 | " self.val_losses.append(logs.get('val_loss'))\n", 57 | " self.i += 1\n", 58 | " \n", 59 | " clear_output(wait=True)\n", 60 | " plt.plot(self.x, self.losses, label=\"loss\")\n", 61 | " plt.plot(self.x, self.val_losses, label=\"val_loss\")\n", 62 | " plt.legend()\n", 63 | " plt.show();\n", 64 | " \n", 65 | "plot_losses = PlotLosses()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# encoder/decoder network size\n", 77 | "batch_size=500\n", 78 | "original_dim=26621 # number of movies\n", 79 | "intermediate_dim=600\n", 80 | "latent_dim=200\n", 81 | "nb_epochs=20\n", 82 | "epsilon_std=1.0" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "# encoder network\n", 94 | "x=Input(batch_shape=(batch_size,original_dim))\n", 95 | "h=Dense(intermediate_dim, activation='tanh')(x)\n", 96 | "z_mean=Dense(latent_dim)(h)\n", 97 | "z_log_var=Dense(latent_dim)(h)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "def sampling(args):\n", 109 | " _mean,_log_var=args\n", 110 | " epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std)\n", 111 | " return _mean+K.exp(_log_var/2)*epsilon\n", 112 | "\n", 113 | "z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "# decoder network\n", 125 
| "h_decoder=Dense(intermediate_dim, activation='tanh')\n", 126 | "x_bar=Dense(original_dim,activation='softmax') # this should be softmax right?\n", 127 | "h_decoded = h_decoder(z)\n", 128 | "x_decoded = x_bar(h_decoded)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 6, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "# build and compile model\n", 140 | "vae = Model(x, x_decoded)\n", 141 | "def vae_loss(x,x_bar):\n", 142 | " reconst_loss=original_dim*objectives.binary_crossentropy(x, x_bar)\n", 143 | " kl_loss=-0.5*K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)\n", 144 | " return reconst_loss + kl_loss\n", 145 | "\n", 146 | "vae.compile(optimizer='adam', loss=vae_loss)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "number of training users: 118493\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "x_train = pickle.load( open( \"train_data_0.file\", \"rb\" ) )\n", 166 | "#x_train = x_train[0:118000, :]\n", 167 | "print(\"number of training users: \", x_train.shape[0])\n", 168 | "\n", 169 | "x_val = pickle.load( open( \"val_data_0.file\", \"rb\" ) )\n", 170 | "x_val = x_val.todense()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 9, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "def nn_batch_generator(x, y, batch_size, samples_per_epoch):\n", 182 | " number_of_batches = samples_per_epoch/batch_size\n", 183 | " counter=0\n", 184 | " shuffle_index = np.arange(np.shape(y)[0])\n", 185 | " np.random.shuffle(shuffle_index)\n", 186 | " x = x[shuffle_index, :]\n", 187 | " y = y[shuffle_index, :]\n", 188 | " while 1:\n", 189 | " index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)]\n", 190 | " x_batch = x[index_batch,:].todense()\n", 191 | " y_batch = y[index_batch,:].todense()\n", 192 | " counter += 1\n", 193 | " yield (np.array(x_batch),np.array(y_batch))\n", 194 | " if (counter >= number_of_batches):\n", 195 | " counter=0" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 10, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "weightsPath = \"./tmp/weights.hdf5\"\n", 207 | "checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1, save_best_only=True)\n", 208 | "reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 11, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xl8XOV97/HPbzaN9s1abMsrNjbY\nxsYIwmKcsMSAQ3CTNJgUGuCm8W1KszQpzdbe0KTc5IY2dIfSNgmkpMFZaElMbCg4GJKwyMYrXrEt\nW7KtXbL2ZeZ3/3iOZMmLdnmso9/79ZrXOXM0y3Nk+TvP/M5zniOqijHGGP8KJLoBxhhjxpYFvTHG\n+JwFvTHG+JwFvTHG+JwFvTHG+JwFvTHG+JwFvTHG+JwFvTHG+JwFvTHG+Fwo0Q0AmDRpks6cOTPR\nzTDGmHFl8+bN1aqaN9DjLoignzlzJiUlJYluhjHGjCsiUjqYx1npxhhjfM6C3hhjfM6C3hhjfO6C\nqNEbYyamzs5OysrKaGtrS3RTLmjRaJSioiLC4fCwnm9Bb4xJmLKyMtLT05k5cyYikujmXJBUlZqa\nGsrKypg1a9awXsNKN8aYhGlrayM3N9dCvh8iQm5u7oi+9Qwq6EXksIjsEJGtIlLibXtIRMq9bVtF\nZGWvx39ZRA6IyF4RuWXYrTPG+J6F/MBG+jsaSunmBlWtPm3bo6r616c16FLgLmABMAX4HxG5WFVj\nI2rpWew90cjPtx3jE8tmkZ0aGe2XN8YYXxiL0s0q4Eeq2q6qh4ADwFVj8D4cqm7iHzce4HiDHcgx\nxgxPWlpaopsw5gYb9Aq8ICKbRWRNr+1/LCLbReS7IpLtbZsKHO31mDJvWx8iskZESkSkpKqqaliN\nz0x2vfj61o5hPd8YYyaCwQb9MlVdCtwGPCAiy4HHgIuAJcBx4G+G8saq+oSqFqtqcV7egFM1nFVW\nihtq1NDSOaznG2NMN1XlwQcfZOHChSxatIhnnnkGgOPHj7N8+XKWLFnCwoULefXVV4nFYtx33309\nj3300UcT3Pr+DapGr6rl3rJSRJ4FrlLVTd0/F5F/BX7h3S0HpvV6epG3bdR1B319qwW9MePdX/58\nF+8cOzmqr3nplAy+9sEFg3rsz372M7Zu3cq2bduorq7myiuvZPny5fzwhz/klltu4atf/SqxWIyW\nlha2bt1KeXk5O3fuBKC+vn5U2z3aBuzRi0iqiKR3rwMrgJ0iMrnXwz4E7PTWnwPuEpEkEZkFzAXe\nHN1mO1ndpRvr0RtjRui1117jYx/7GMFgkIKCAt773vfy1ltvceWVV/K9732Phx56iB07dpCens7s\n2bM5ePAgn/70p1m/fj0ZGRmJbn6/BtOjLwCe9Yb3hIAfqup6EfmBiCzB1e8PA/8bQFV3icha4B2g\nC3hgLEbcAETDASKhgNXojfGBwfa8z7fly5ezadMm1q1bx3333cfnP/95Pv7xj7Nt2zY2bNjA448/\nztq1a/nud7+b6Kae04BBr6oHgcVn2f77/TznYeDhkTVtYCJCVnLYavTGmBG7/vrr+Zd/+Rfuvfde\namtr2bRpE4888gilpaUUFRXxyU9+kvb2drZs2cLKlSuJRCJ85CMfYd68edxzzz2Jbn6/xv0UCFkp\nYSvdGGNG7EMf+hC//e1vWbx4MSLCt7/9bQoLC3nyySd55JFHCIfDpKWl8dRTT1FeXs79999PPB4H\n4Jvf/GaCW98/UdVEt4Hi4mId7oVHPvr4bwgFAvznmqtHuVXGmLG2e/duLrnkkkQ3Y1w42+9KRDar\navFAzx33c91kJkds1I0xxvRj3Ad9VkqYhhY7GGuMMecy/oM+OWw9emOM6cf4D/qUMC0dMdq7xmQE\npzHGjHvjPugzU9xJUw3WqzfGmLMa90GflWzz3RhjTH/Gf9DbfDfGGNOv8R/03nw31qM3xoy1/uau\nP3z4MAsXLjyPrRm8cR/0mcnWozfGmP6M+ykQMrtLNzaW3pjx7ZdfghM7Rvc1CxfBbd8654+/9KUv\nMW3aNB544AEAHnroIUKhEBs3bqSuro7Ozk7+6q/+ilWrVg3pbdva2vjUpz5FSUkJoVCI73znO9xw\nww3s2rWL+++/n46ODuLxOD/96U+ZMmUKd955J2VlZcRiMf7iL/6C1atXj2i3Tzfugz49KURAbNSN\nMWboVq9ezec+97meoF+7di0bNmzgM5/5DBkZGVRXV3P11Vdzxx13DOkC3f/0T/+EiLBjxw727NnD\nihUr2LdvH48//jif/exnufvuu+no6CAWi/H8888zZcoU1q1bB0BDQ8Oo7+e4D/pAQMhMtonNjBn3\n+ul5j5XLL7+cyspKjh07RlVVFdnZ2RQWFvInf/InbNq0iUAgQHl5ORUVFRQWFg76dV977TU+/elP\nAzB//nxmzJjBvn37uOaaa3j44YcpKyvjwx/+MHPnzmXRokV84Qtf4Itf/CK33347119//ajv57iv\n0QNkpdh8N8aY4fnoRz/KT37yE5555hlWr17N008/TVVVFZs3b2br1q0UFBTQ1tY2Ku/1e7/3ezz3\n3HMkJyezcuVKXn75ZS6++GK2bNnCokWL+PM//3O+/vWvj8p79Tbue/SA16O3Gr0xZuhWr17NJz/5\nSaqrq3nllVdYu3Yt+fn5hMNhNm7cSGlp6ZBf8/rrr+fpp5/mxhtvZN++fRw5coR58+Zx8OBBZs+e\nzWc+8xmOHDnC9u3bmT9/Pjk5Odxzzz1kZWXxb//2b6O+j74I+qyUMLXNFvTGmKFbsGABjY2NTJ06\nlcmTJ3P33XfzwQ9+kEWLFlFcXMz8+fOH/Jp/9Ed/xKc+9SkWLVpEKBTi+9//PklJSaxdu5Yf/OAH\nhMNhCgsL+cpXvsJbb73Fgw8+SCAQIBwO89hjj436Po77+egBPvejt3n7aD2vPHjDKLbKGDPWbD76\nwZvQ89EDdjDWGGP64YvSTWZKhJNtncTiSjAw+CFQxhgzVDt27OD3f7/vJbOTkpJ44403EtSigfki\n6LOSw6hCY1snWd5slsaY8UFVhzRGPdEWLVrE1q1bz+t7jrTE7ovSTc/EZla+MWZciUaj1NTUjDjI\n/ExVqampIRqNDvs1BtWjF5HDQCMQA7p6F/9F5AvAXwN5qlot7qP574CVQAtwn6puGXYLB8FmsDRm\nfCoqKqKsrIyqqqpEN+WCFo1GKSoqGvbzh1K6uUFVq3tvEJFpwArgSK/NtwFzvdt7gMe85ZjJ9Gaw\ntLH0xowv4XCYWbNmJboZvjfS0s2jwJ8Bvb93rQKeUud1IEtEJo/wffrV3aO3+W6MMeZMgw16BV4Q\nkc0isgZARFYB5aq67bTHTgWO9rpf5m3rQ0TWiEiJiJSM9Gtbz1WmLOiNMeYMgy3dLFPVchHJB14U\nkT3AV3Blm2FR1SeAJ8CdMDXc1wHISLaDscYYcy6D6tGr
arm3rASeBd4LzAK2eQdqi4AtIlIIlAPT\nej29yNs2ZsLBAGlJIQt6Y4w5iwGDXkRSRSS9ex3Xi39LVfNVdaaqzsSVZ5aq6gngOeDj4lwNNKjq\n8bHbBSczOUx9qx2MNcaY0w2mdFMAPOud0BACfqiq6/t5/PO4oZUHcMMr7x9pIwcjKyVs1401xpiz\nGDDoVfUgsHiAx8zsta7AAyNu2RBlpYRtHL0xxpyFL86MBchKjtg4emOMOQvfBH1mSpiG1q5EN8MY\nYy44vgn6rOQwDa0dNmeGMcacxjdBn5kcpjOmtHTEEt0UY4y5oPgm6G1iM2OMOTvfBL1NbGaMMWfn\nm6DvmdjMxtIbY0wfvgt6K90YY0xf/gn6ntKNBb0xxvTmn6C3OemNMeasfBP00XCQpFDAJjYzxpjT\n+CbowY2lt4OxxhjTl6+CPislbDV6Y4w5jb+CPjlipRtjjDmNr4I+03r0xhhzBl8FvZvYzILeGGN6\n81fQW4/eGGPO4LOgj9DaGaO9y2awNMaYbr4K+sxkO2nKGGNO58+gt/KNMcb08FXQ28RmxhhzJn8F\nvU1sZowxZxhU0IvIYRHZISJbRaTE2/YNEdnubXtBRKZ420VE/l5EDng/XzqWO9BbT4/eLj5ijDE9\nhtKjv0FVl6hqsXf/EVW9TFWXAL8A/o+3/TZgrndbAzw2aq0dQKbNYGmMMWcYdulGVU/2upsKqLe+\nCnhKndeBLBGZPII2Dlp6UohgQKx0Y4wxvQw26BV4QUQ2i8ia7o0i8rCIHAXu5lSPfipwtNdzy7xt\nfYjIGhEpEZGSqqqq4bX+zNd0M1haj94YY3oMNuiXqepSXFnmARFZDqCqX1XVacDTwB8P5Y1V9QlV\nLVbV4ry8vCE1uj9ZyWEbdWOMMb0MKuhVtdxbVgLPAled9pCngY946+XAtF4/K/K2nRduYjM7GGuM\nMd0GDHoRSRWR9O51YAWwU0Tm9nrYKmCPt/4c8HFv9M3VQIOqHh/ldp+TlW6MMaav0CAeUwA8KyLd\nj/+hqq4XkZ+KyDwgDpQCf+g9/nlgJXAAaAHuH/VW9yMrOczBqubz+ZbGGHNBGzDoVfUgsPgs2z9y\nloejqgo8MPKmDU9WSsRKN8YY04uvzowFV7o52dZFLK4DP9gYYyYA3wV999mxjW1WpzfGGPBx0NtJ\nU8YY4/gv6LsnNrORN8YYA/gw6DNtYjNjjOnDf0FvV5kyxpg+fBf0WclWozfGmN58F/SZFvTGGNOH\n74I+FAyQnhSivtVq9MYYAz4MenAHZK1Gb4wxji+DPislTIOVbowxBhjvQb9vAzy6CJoq+2zOSo7Y\nOHpjjPGM76BPyYWGI1D66z6bbU56Y4w5ZXwH/eTFEE6Fw6cFvc1Jb4wxPcZ30AfDMP09UPqbPpuz\nksPUt3TiZkw2xpiJbXwHPcCMa6FyF7TU9mzKSgnTFVeaO2IJbJgxxlwYfBD0y9yyV6++Z2Izq9Mb\nY4wPgn7qUghF+wR998RmVqc3xhg/BH0oCYquhNLXejZ1z3djY+mNMcYPQQ8w4zo4sQPaGgB33Viw\nOemNMQb8EvQzrwONw5E3ACjMiCICe080JrhhxhiTeIMKehE5LCI7RGSriJR42x4RkT0isl1EnhWR\nrF6P/7KIHBCRvSJyy1g1vsfUYgiEe8o3mSlhLp+Wxca9lQM80Rhj/G8oPfobVHWJqhZ7918EFqrq\nZcA+4MsAInIpcBewALgV+GcRCY5im88USYGpV/Q5ceqmSwrYXtZAxcm2MX1rY4y50A27dKOqL6hq\nl3f3daDIW18F/EhV21X1EHAAuGpkzRyEmdfB8a3Q3gTATZfkA7Bxj/XqjTET22CDXoEXRGSziKw5\ny8//F/BLb30qcLTXz8q8bWNrxrUQ74KyNwGYV5DO1KxkXrKgN8ZMcIMN+mWquhS4DXhARJZ3/0BE\nvgp0AU8P5Y1FZI2IlIhISVVV1VCeenbT3gMS7CnfiAg3XZLPa/uraeu0M2SNMRPXoIJeVcu9ZSXw\nLF4pRkTuA24H7tZTE8uUA9N6Pb3I23b6az6hqsWqWpyXlzfsHeiRlA5TlvQ5cerG+fm0dsb47cGa\nkb++McaMUwMGvYikikh69zqwAtgpIrcCfwbcoaotvZ7yHHCXiCSJyCxgLvDm6Df9LGZcC+Ul0NkK\nwNWzc0mJBHlpd8V5eXtjjLkQDaZHXwC8JiLbcIG9TlXXA/8IpAMvesMuHwdQ1V3AWuAdYD3wgKqe\nn9rJjGUQ64CyEgCi4SDL5kzi5d2VNpOlMWbCCg30AFU9CCw+y/Y5/TznYeDhkTVtGKZfDYgr38y6\nHnCjb154p4I9Jxq5ZHLGeW+SMcYkmj/OjO2WnAWFi/rMe3PDfDfM0so3xpiJyl9BD27em6NvQZeb\nojg/PcriokwbZmmMmbD8F/Qzr4OuVjj2ds+mmy4pYOvReqqb2hPYMGOMSQz/Bf30a92yV/nmxvn5\nqNpZssaYicl/QZ+aC3mX9Jn3ZsGUDAozory024LeGDPx+C/owZVvjr4BMTcVj4hw4yX5vLq/ivYu\nO0vWGDOx+DPoZ1wHHU1wYlvPppvm59PcEePNQ7X9PNEYY/zHn0E/cxkgsG9Dz6br5kwiGg5Y+cYY\nM+H4M+jT8mH2e2H7WvDOiI2Gg1x30SRe2lNhZ8kaYyYUfwY9wGWroe5Qz3QI4IZZHq1tZX9lUwIb\nZowx55d/g37+7RBKhu0/6tl0o3eW7P/YWbLGmAnEv0EfzYD5K2Hnz3rOki3MjLJoaiYvvmNBb4yZ\nOPwb9ODKN6218O5LPZtuWVDA20fq7VqyxpgJw99Bf9GNkJIL25/p2XTLgkIAXth1IlGtMsaY88rf\nQR8Mw8KPwN5fQlsDAHPy05g9KZUNu6x8Y4yZGPwd9ODKN11tsPvngDtL9paFhbx+sIb6lo4EN84Y\nY8ae/4N+6hWQM/uM8k1XXO3kKWPMhOD/oBdxvfpDr0KDu0b5ZVMzKcyIssHq9MaYCcD/QQ+w6KOA\nwo4fAxAICCsWFLBpfxWtHTbJmTHG3yZG0OdeBEVXuikRPLcuKKStM84r+6oS2DBjjBl7EyPowZVv\nKnfBiZ0AXDUrh6yUsJVvjDG+N3GCfsGHIBDqOSgbCga4aX4BL+2uoDMWT3DjjDFm7Awq6EXksIjs\nEJGtIlLibfuoiOwSkbiIFJ/2+C+LyAER2Ssit4xFw4csdRLMuRl2/ATiri5/y4ICTrZ18frBmgQ3\nzhhjxs5QevQ3qOoSVe0O9Z3Ah4FNvR8kIpcCdwELgFuBfxaR4Gg0dsQuuxMaj8Fhdz3Z5RfnkRwO\nsn6nlW+MMf417NKNqu5W1b1n+dEq4Eeq2q6qh4ADwFXDfZ9RNW8lRNJ7Rt9Ew0HeNy+PF9+pIB63\nOeqNMf402KBX4AUR2SwiawZ47FTgaK/7Zd62xAsnw9z3w771vco3hVQ2tvP20foEN84YY8bGYIN+\nmaouBW4DHhC
R5SN9YxFZIyIlIlJSVXUehzjO/wA0V/VckOSG+fmEAmKTnBljfGtQQa+q5d6yEniW\n/ksx5cC0XveLvG2nv+YTqlqsqsV5eXmDb/FIzbnZjb7Zuw6AzOQw186ZxPpdJ+wSg8YYXxow6EUk\nVUTSu9eBFbgDsefyHHCXiCSJyCxgLvDmaDR2VCRnuYuH73m+Z9MtCwoorWlhb0VjAhtmjDFjYzA9\n+gLgNRHZhgvsdaq6XkQ+JCJlwDXAOhHZAKCqu4C1wDvAeuABVb2w5hmY9wGo2Q/V+wF4/6UFiMCG\nnTZ1sTHGfwYMelU9qKqLvdsCVX3Y2/6sqhapapKqFqjqLb2e87CqXqSq81T1l2O5A8My7za33OPK\nN/npUa6Yns26HcesfGOM8Z2Jc2Zsb1nToPAy2HuqfPOhpVPZV9HEtrKGBDbMGGNG38QMenCjb46+\nCU1uTvoPLp5CNBzgmbeODvBEY4wZXyZu0M9bCagbUw9kRMOsXDSZn287RktHV2LbZowxo2jiBn3h\nIsic3mf0zeriaTS1d/H8DhtTb4zxj4kb9CLuoOzBjdDRDLipi2dNSmWtlW+MMT4ycYMeYP5Kd+Hw\ndzcC7sLhdxZP483Dtbxb1ZTgxhljzOiY2EE/4zqIZvYZffORK6YSDAhrS6xXb4zxh4kd9MEwzF3R\nZ5Kz/PQoN8zL56eby+2CJMYYX5jYQQ9u9E1LDRx9o2fT6iunUd3UzsY9lQlsmDHGjA4L+jk3QyDc\nc5YswA3z8shPT7LyjTHGFyzooxkwa7mr03vTH4SCAT5yRREv76mk4mRbghtojDEjY0EPbvRN7UGo\nOnXBrDuLpxFX+MnmsgQ2zBhjRs6CHryzZOmZox5g1qRUrpqVw49LjtpEZ8aYcc2CHiBjCkxZCtt/\n3DP6BuCuK6dxuKaFNw7VJrBxxhgzMhb03a79Y6jaDZu/37PptoWTSU8K2ZmyxphxzYK+24IPw8zr\n4eVvQIvrwSdHgtyxZAq/2HGc/Xb1KWPMOGVB300Ebvs2tJ10Ye/5zE1zyYiG+MP/2ExTu81qaYwZ\nfyzoeyu4FK76JJR8D45vc5syovzDx5ZyqLqZL/50ux2YNcaMOxb0p3vflyElF55/sGdc/TUX5fKn\nt8xj3fbjfP83hxPbPmOMGSIL+tMlZ8HND7kpEbY/07P5D5dfxM2XFPDwut1sLq1LWPOMMWaoLOjP\nZsndMPUKePH/uJo9EAgIf3PnYqZkJfPA01uobmpPcCONMWZwLOjPJhCA2x6BpgrY9O2ezZnJYf75\n7qXUtnTw2R+9TSxu9XpjzIVvUEEvIodFZIeIbBWREm9bjoi8KCL7vWW2t11E5O9F5ICIbBeRpWO5\nA2Om6Aq4/B54/bE+UyMsnJrJN1Yt4NcHavjb/9mXwAYaY8zghIbw2BtUtbrX/S8BL6nqt0TkS979\nLwK3AXO923uAx7zl+HPTQ/DOz+Gnn4BZ7wUJgARYLQFyp1Xz21fi/GvwU/zBjQsQkUS31hhjzmoo\nQX+6VcD7vPUngV/hgn4V8JS6cYivi0iWiExW1eMjaWhCpOXBykdg/RfdkEuNe7cYN2mcm8Nx1v1q\nH58+8U2+/dElpERG8us0xpixMdhkUuAFEVHgX1T1CaCgV3ifAAq89alA7zkDyrxt4y/oARavdrfT\nCKC//ns+8OJfcHj33/Phf/4Dnvj9Yqbnppz/NhpjTD8GG/TLVLVcRPKBF0VkT+8fqqp6HwKDJiJr\ngDUA06dPH8pTLxhy7aehZj8PbHmK4/VFfPAf2/iHj13O8ovzEt00Y4zpMaiDsapa7i0rgWeBq4AK\nEZkM4C27r7tXDkzr9fQib9vpr/mEqharanFe3jgNRhH4wHdg1nK+EXiC96ce4L7vvcnjr7xrZ9Aa\nYy4YAwa9iKSKSHr3OrAC2Ak8B9zrPexe4L+99eeAj3ujb64GGsZlfX6wgmG48ykkeyaPxL7Nx+fF\n+dYv9/CJJ0uotKtTGWMuAIPp0RcAr4nINuBNYJ2qrge+BbxfRPYDN3v3AZ4HDgIHgH8F/mjUW32h\nSc6G33sGAb7W+BD/99Yifn2gmhV/u4lfbD+W6NYZYyY4uRBKDMXFxVpSUpLoZozc4V/DU6tgxjUc\nWPEkX/jZbrYdreeDi6fwjVULyEqJJLqFxhgfEZHNqlo80OPszNjRNPM6uOMf4NAm5vzwOp6d/yu+\ndn0Gv9xxnBWPbmLj3sqBX8MYY0aZBf1oW/IxuPsnULCAwKuPcH/JHWy56F+5Lfw2f/C91/n8M1s5\nVt+a6FYaYyYQK92MpbpSePsHsOUH0HSCxkgez7UtYbfO5OLF1/I7t76fjLT0RLfSGDNODbZ0Y0F/\nPsQ6Yd8G2PIU8dLfEOhwlyXsIkBj6iwyZ19BYOYyWPRRiNgJV8aYwbGgv1DF41BfSumu19n8xiYy\nG3ZzWaiUPK1Fk7ORK+6Hq9ZAxuREt9QYMxKdbVBzAKr2QH0p5M2H6ddASs6ovYUF/TigqmzcW8k3\n1+0ms3ozfxhZz43yFgRCsOBDBK55AKYsSXQzjZk4Oluh8TicPO6WXe3uXJlAEAJhbz3sHtvV6sK8\nswW62txz2+qhap8L97pDbm6sPgQKFsCM69zgjRnXQeqkYTfXgn4cicWVNw7V8F9vl7Njx1bujK3j\nztArpNJGa86lRNMyEbzZMbtnyQyEoHCR6yFMv3pEfyzGjIr2Rqg/4m6tdRBOhnAqRFJdSTLsLYNJ\nEIq4ZTDirv8wWlQhHoPgALO7xDrh2FYofQ2OvO7afPKYC+qRCIQgdw7kzXM9+O5lZhGc2Amlv4bD\nr8HRN90HBcCyz8PNXxvW21nQj1NtnTE27qlkw+a9FL77Y5bxNslhISc1Qm5KhPRoyE2J3NkCJ3ZA\nrMM9MXeOC/xpV0PuRZBWAGn5EEk79eFgzEi1NUD1fneNhup9UHuwV7jXDu81A2EIRSE1F9InQ3oh\npE/xlpPdB0F7E3Q0Q0eTu7U3QftJaK13Hypt3rK13vWiM6ZA9sy+t5RcOLbFne9y9E3obHbvnzsX\nJl3syqXp3q17PZzsPjhinRDv9JZd7nmhKIRTIByFUPKp5WA+uLo64PhWF/pTLoeLbhjWr86C3gca\nWjrZ8M4JXth1gk37q+noipObGmHFggJuWVDItTPSiVRuhyO/db2So6+7P/bewimQmueCP/cid4nE\nqUuhYJHrVRlzLvE4lL0Fe34Bx7e5YG/sNZtJIOyF6AzImu7dZrhbSo4rZXS2eOHc4q03u85JVzvE\n2l3gxTpc6aO5ChpPuJ5143H3+LMJJrlvCUnp7qz05Gx3refudQlCw1GoO+xujafNwJK/wCubXOtK\nJ2n5Y/QLHHsW9D7T1N7Fr/ZWsmFXBS/vrqC5I0ZGNMT7Ly1k5aJCls2dRFJAoPZd90feVOkuhdhU\n6d1OQOUeaPZO2gpGoPAyKCp2NcPUPHdLyXXLSGrivwl0dbgeZFu9W7bW
u9DIvxQmzU18+/woHnMd\nh3f+G3b/3IVkMOLKhJPmud973jy3nj1z4BLJcKm6UlDjcbceSYWkNPcNNRge2mt1trpvHE0VULBw\nVA+GJpoFvY+1dcb49YFqnt9xghfeOUFjWxfpSSFuvrSA2xYWcv3cPJIjwTOfqAoNZVBeAuWboXwL\nHHv77D2nUNT7JpDvlYEKTpWDUvPc19jevbSOZvcfKi0P8i6B/PmQOX1wX2PjMVcOOPa2d9sCFe+c\n+mp9NulTYNZymP1ed/WvzKln7mtHk/uGE4y4tg/mgyHW6T4o2xu9/erukTa7fY3Hut/gtN9XEiRl\nQDQLohkQzXT3w1H3nHjMfeVXbykBSM5xvdKx+MCKexfJ6S+I4zEXgDXvQs1+qNgF+9a7nnUoCnNu\nhkt/By5e4fbHXHAs6CeIjq44v363mue3H+eFdypoaO0kEgxw5axsls/N4/q5ecwvTCcQOEeYxLrg\nZBk010BLNTRXe8sqt979raDxhNt+LhJw9cne4RxOhbyLXQ88JdeFaKzd+6re4dabquDEdhem3c+Z\nvNj1IFPz3FfyaKYXoJkuOMu3wKFX4NAmaKlxz8ud44Kzte5Uzba7lgruuT0Hxy5xy9RJ3vC3vW6U\nRNU+dz/eObJ/lKEIhN3vJiXX9TRTct0HRVKvD4tohvtA6Gr3atH1p77ptNa7D6XTb965GoRTTr1W\n9wdQIORKGrUHTx3j6f4dXXRI640mAAANz0lEQVQjXLoK5rzf9aDNBc2CfgLqjMV5/WANr+yt4tX9\n1eytcP/ZJ6Ulcf3cSVxzUS5XzcxhRm7K8K5xG+s89QEQjLgRFJE0FyahJNczba13wVn5jgvPyt1u\n2dbgjbIIu8cGw+41olluCOmUpe6g1KS5bijbYMTjULkLDr4Ch191dd7kbPeaPbXbbNcTr9rjtWv3\nmQcNJeDKEN0fBLlz3GtEUt3+RVLceji1b9u6f4eqXgg3uAOEbSddCLefdMPvAiFveF7QrUvQ9bZb\na90HVUsNtPRab2twr9HVz1QZoeipD7/uD4akdBfO3esSdG1oP3nqNdsaXLhnz3THbHLnuv3NneM+\n+KwcNq5Y0BsqTrbx6v5qXt1fxWv7q6lpdr23/PQkrpyZw1WzcrhyZg7zCtMJnqvH70fN1S7wW2pO\nhVw4muhWnSnW6cK5vcH10kPRU99uLsT2mvPOgt70EY8rB6qaePNQLW8druWtQ7Uca3AXRkmPhrhi\nRjZXzsyheEY2i6dlEQ0PsldtjEmYwQb9GB0yNxeaQEC4uCCdiwvSuefqGQCU1bV4wV9HyeFafrV3\nLwDhoLBwaiZXTM9mTn4asyalMisvlby0pOGVfIwxCWVBP4EVZadQlJ3Ch5cWAVDX3MHm0jpKSl3w\nP/V6KR1dp07hTksKudCflMolkzNYODWDBVMyyUm18fjGXMisdGPOKRZXjtW3cqi6ued2sLqZdyub\nKO81p/6UzCgLpmaycEomi6dlsnRGNhnRIY51NsYMmZVuzIgFA8K0nBSm5aSw/OK8Pj+rb+ngnWMn\n2XmsgZ3lbvk/uytQdQM35hWkUzzT1f2vmJHN1KxkK/sYkyDWozejpqm9i61H6ikprWVzaR1bSuto\n7nAnGE1Ki/QcI3C3NOYWpJOZbD1/Y4bLevTmvEtLCrFs7iSWzXUzaXbF4uw50cjm0jp2HWtgb0UT\nPy452hP+AAUZSV7dP43ZXv1/Vl4q07JTiITsSpfGjAYLejNmQsEAC6dmsnDqqdPn43GlvL6V/ZWN\n7KtoYn9FE4drmtmw6wS1zafO0gwITMlKZkZuCtNzUpmRm8KMnBSm57pSkh0DMGbwBh30IhIESoBy\nVb1dRG4E/hqIAJuBT6hql7hC7N8BK4EW4D5V3TL6TTfjUaBX3f/G+QV9flbf0tHnwG9pTQultS2s\n33mcupa+0xJkREPeqKHknuXkzCh56UnkpScxKS2J1CTrxxgDQ+vRfxbYDWSISAB4ErhJVfeJyNeB\ne4F/B24D5nq39wCPeUtj+pWVEuHy6REun559xs9OtnVypKaFI7UtlNW1UFbXSlldK4drmnntQDUt\nvcpB3VIiwZ7Qz02NkJsWITc1idy0CDmpEQoyoszJT2NSWtL52D1jEmZQQS8iRcAHgIeBzwO5QIeq\n7vMe8iLwZVzQrwKeUneU93URyRKRyap6/CwvbcygZETDZ5SBuqkqdS2dVJxso6qx3d2a2k+tN7ZT\nWtPCliP11Da3Ez9t/EFOaoQ5+WlcXJDGxQXpzMlLoyg7hYLMJJJCdoawGf8G26P/W+DPgHTvfjUQ\nEpFiVS0BfheY5v1sKnC013PLvG0W9GZMiLgrcOWkRrhkgGuqx+NKQ2snNc3tHKtvY39lE/srGtlf\n2cR/bz1GY1tXn8dPSktiSlaUKZnJTM6K9nxDmJQWcd8UvG8LNmWEuZANGPQicjtQqaqbReR9AKqq\nInIX8KiIJAEvAGd+d+7/ddcAawCmT58+1HYbMyyBgJCdGiE7NcKc/PQ+5weoKhUn23m3yp0Qdry+\njWP1rRxraOVAVROv7q/qM2Kot4xoiMLMKAUZUQozohRmutuUzGSm5bhjCPZhYBJlMD3664A7RGQl\nEMXV6P9DVe8BrgcQkRXAxd7jyznVuwco8rb1oapPAE+AG0c/7D0wZpSISE9An0tLRxc1TR1UN7VT\n3dRBTVM71U3tVJxs58TJNipOtrH3RCNVTe2cfopKQUYS03NSmOYdPM7xPnCyUyI96zkpkbNfNMaY\nERgw6FX1y7j6O16P/k9V9R4RyVfVSq9H/0Vc/R7gOeCPReRHuIOwDVafN36REgmRkhNiWk5Kv4/r\nisWpamqnvK6Vo3UtHK1t5UitO5j8+sEajp9sO+ODoFt+uju3YHZeKrMnpfWsT89JIRS0cwvM0I1k\n/NmDXlknADymqi9725/HDa08gBteef/ImmjM+BMKBpicmczkzGSKZ555jdKYd6ygtrmDupYOaps7\nqG/poLqpg8PenEIbdlVQ23zqcFc4KMyelMbcgjTm5nefXZzGtJwUO2hs+mVTIBhzAatv6eBgdTMH\nq5o50OvA8dG6lj7fCDKTw+SnJ5GfkUReWhL5GVEmpUXISnHloOzUcM96RnJ4VC4009EVRwTC9i0j\nYWwKBGN8ICslwtLpEZaedm5Ba0eMd6ua2FfRSHldK5WN7VQ2uuGlJaV1VDa295liujcRSE8KkZEc\nJiMaJjM5TEZyiPRomLgqnTGlsytOZyxORyxOR1ects4YzR0xWtq73LKji86Y+6RJSwqRmRwmK8W7\nJUfITAmTlRwmO8WtZ6dEyPKWhZlR0uxktvPKftvGjEPJkeA5zysAN4Koqb2L+pZT5aHu9fqWDk62\ndXGytZOTbZ2cbO3icHULjW2dBAJCJBggHAwQDolbBgNkpUSYmh0kJRIiNRIkJcktY3FoaO2kvrWD\nhpZO6ls72dNwknpvPXb6SQu
ejGiIKVnubObJWclMyYxSmOnuF2a6kUt2ZvPosd+kMT4kIqRHw6RH\nwwMeOB4rvT9sXPC7YxHHG9o4Xt/KsQY3fHVbWUOfeY66pUdDPdNa5KR6ZzenRshJi3hnOp86pyEt\nKWTTYPfDgt4YMyb6ftj0/9i2zhgVJ9s43tDGiYY2Tpx0y2P1rVQ3tbOjrp6a5o4zTmjrlhQKuNBP\nTyIv7cwhq24Ya5hoOEg0HCApFCQp5C3DAZJCAV9/UFjQG2MSLhoOMiM3lRm5qf0+rr0rRl1zJ9VN\n7dQ0d1Dd2O6d0+DOa6huaqe8vo2d5Sepbe6gI3b24xSnSw67eZHyvUnx8tLdQe3kSJBYXImpEo8r\ncXUjpmJx7TmG0RmL09nl7nfFlVBACAVPlb1CASESCpCXnuSNxIoyOSvKpNQkAqNwUHwwLOiNMeNG\nUihIYWaw35PauqkqLR2xnmMUdS2dtHXGaO9yB5jbu2K0d8Zp7YxR19zRMz/SgcomfvNuDQ2tned8\n7YA32igSDBAOBQh7wR4MCF0xpSsepyvmwr8zpnTE4mccrwgHhYKMKPdeM5NPLp894t9NfyzojTG+\nJCKkJoVITRr4BLezae+K0dEVJyBCMCC9lgy5zKOq1DR39JSjTnhlquP1reRnjP3sqRb0xhhzFq6O\nPzonoomId+A46ZwjpcaSnelgjDE+Z0FvjDE+Z0FvjDE+Z0FvjDE+Z0FvjDE+Z0FvjDE+Z0FvjDE+\nZ0FvjDE+d0FceEREqoDSYT59ElA9is0ZTybqvtt+Tyy23+c2Q1XzBnjMhRH0IyEiJYO5woofTdR9\nt/2eWGy/R85KN8YY43MW9MYY43N+CPonEt2ABJqo+277PbHYfo/QuK/RG2OM6Z8fevTGGGP6Ma6D\nXkRuFZG9InJARL6U6PaMFRH5rohUisjOXttyRORFEdnvLbMT2caxICLTRGSjiLwjIrtE5LPedl/v\nu4hEReRNEdnm7fdfettnicgb3t/7MyISSXRbx4KIBEXkbRH5hXff9/stIodFZIeIbBWREm/bqP2d\nj9ugF5Eg8E/AbcClwMdE5NLEtmrMfB+49bRtXwJeUtW5wEvefb/pAr6gqpcCVwMPeP/Gft/3duBG\nVV0MLAFuFZGrgf8HPKqqc4A64BMJbONY+iywu9f9ibLfN6jqkl5DKkft73zcBj1wFXBAVQ+qagfw\nI2BVgts0JlR1E1B72uZVwJPe+pPA75zXRp0HqnpcVbd46424//xT8fm+q9Pk3Q17NwVuBH7ibffd\nfgOISBHwAeDfvPvCBNjvcxi1v/PxHPRTgaO97pd52yaKAlU97q2fAAoS2ZixJiIzgcuBN5gA++6V\nL7YClcCLwLtAvap2eQ/x69/73wJ/BsS9+7lMjP1W4AUR2Swia7xto/Z3bteM9QFVVRHx7fApEUkD\nfgp8TlVP9r4ws1/3XVVjwBIRyQKeBeYnuEljTkRuBypVdbOIvC/R7TnPlqlquYjkAy+KyJ7ePxzp\n3/l47tGXA9N63S/ytk0UFSIyGcBbVia4PWNCRMK4kH9aVX/mbZ4Q+w6gqvXARuAaIEtEujtnfvx7\nvw64Q0QO40qxNwJ/h//3G1Ut95aVuA/2qxjFv/PxHPRvAXO9I/IR4C7guQS36Xx6DrjXW78X+O8E\ntmVMePXZfwd2q+p3ev3I1/suInleTx4RSQbejzs+sRH4Xe9hvttvVf2yqhap6kzc/+eXVfVufL7f\nIpIqIund68AKYCej+Hc+rk+YEpGVuJpeEPiuqj6c4CaNCRH5T+B9uNnsKoCvAf8FrAWm42b+vFNV\nTz9gO66JyDLgVWAHp2q2X8HV6X277yJyGe7gWxDXGVurql8Xkdm4nm4O8DZwj6q2J66lY8cr3fyp\nqt7u9/329u9Z724I+KGqPiwiuYzS3/m4DnpjjDEDG8+lG2OMMYNgQW+MMT5nQW+MMT5nQW+MMT5n\nQW+MMT5nQW+MMT5nQW+MMT5nQW+MMT73/wGB9R4eTS3JLgAAAABJRU5ErkJggg==\n", 221 | "text/plain": [ 222 | "
" 223 | ] 224 | }, 225 | "metadata": {}, 226 | "output_type": "display_data" 227 | }, 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "118000/118000 [==============================] - 88s - loss: 486.4370 - val_loss: 496.1855\n" 233 | ] 234 | }, 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "" 239 | ] 240 | }, 241 | "execution_count": 11, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "vae.fit_generator(nn_batch_generator(x_train, x_train, batch_size, 118000), samples_per_epoch=118000, nb_epoch=nb_epochs, validation_data=(x_val, x_val), callbacks=[checkpointer, reduce_lr, plot_losses])" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 3", 263 | "language": "python", 264 | "name": "python3" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 3 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython3", 276 | "version": "3.5.1" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 0 281 | } 282 | -------------------------------------------------------------------------------- /Standard/read_data.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import pandas as pd 3 | import numpy as np 4 | import random 5 | import pickle 6 | from scipy import sparse 7 | DATA_DIR = '/home/kg2719/project/data-random-fold-1' 8 | #DATA_DIR = '/home/kg2719/project/kilol' 9 | raw_data = pd.read_csv(os.path.join(DATA_DIR, 'new_ratings.csv'), header=0) 10 | 11 | unique_user_ids = raw_data.userId.unique() 12 | 13 | np.random.seed(1) 14 | np.random.shuffle(unique_user_ids) # to ensure random splitting between train, val and test 15 | unique_movie_ids = raw_data.movieId.unique() 16 | 17 | number_of_users = len(unique_user_ids) 18 | print("total number of users: ", number_of_users) 19 | 20 | number_of_movies = len(unique_movie_ids) 21 | print("total number of movies: ", number_of_movies) 22 | 23 | # split users into training, validation and test 24 | val_user_ids = [] 25 | test_user_ids = [] 26 | train_user_ids = [] 27 | 28 | for i in range(10000): 29 | val_user_ids.append(unique_user_ids[i]) # first 10k after shuffling keys 30 | 31 | for i in range(10000, 20000): 32 | test_user_ids.append(unique_user_ids[i]) # next 10k after shuffling keys 33 | 34 | for i in range(20000, number_of_users): # all the remaining form training data 35 | train_user_ids.append(unique_user_ids[i]) 36 | 37 | 38 | # creating a movieId and userId to index dictionary for creating train_data ndarray 39 | movie2id = {} 40 | movie2id = dict((mid, i) for (i, mid) in enumerate(unique_movie_ids)) 41 | 42 | 43 | #------------------------------------------------------------------------------------------------------------------------------------ 44 | 45 | print("creating training data....") 46 | 47 | user2id = {} 48 | user2id = dict((uid, i) for (i, uid) in enumerate(train_user_ids)) 49 | 50 | 51 | rows = [] 52 | cols = [] 53 | for u_id in train_user_ids: 54 | print("train-",user2id[u_id]) 55 | m_ids = raw_data[(raw_data.userId == u_id) & (raw_data.rating > 3.5)]['movieId'].tolist() 56 | movie_indexes = [movie2id[m] for m 
in m_ids] 57 | rows.extend([user2id[u_id] for i in range(len(m_ids))]) 58 | cols.extend(movie_indexes) 59 | 60 | # pickle.dump(rows, open("rows.file", "wb")) 61 | # pickle.dump(cols, open("cols.file", "wb")) 62 | 63 | 64 | # creating a sparse matrix with no_of_train_users X movies for training, binarized feedback 65 | # rows and cols should be of same length 66 | train_data = sparse.csr_matrix((np.ones_like(rows),(np.array(rows), np.array(cols))), dtype='float64', shape=(len(train_user_ids), number_of_movies)) 67 | 68 | # dumping variable to load later for use in VAE code 69 | pickle.dump(train_data, open("train_data.file", "wb")) 70 | print("number of training users: ", len(train_user_ids)) 71 | 72 | 73 | #----------------------------------------------------------------------------------------------------------------------------------- 74 | 75 | print("creating test data....") 76 | test_user2id = {} 77 | test_user2id = dict((uid, i) for (i, uid) in enumerate(test_user_ids)) 78 | 79 | test_rows = [] 80 | test_cols = [] 81 | for u_id in test_user_ids: 82 | print("test-", test_user2id[u_id]) 83 | m_ids = raw_data[(raw_data.userId == u_id) & (raw_data.rating > 3.5)]['movieId'].tolist() 84 | movie_indexes = [movie2id[m] for m in m_ids] 85 | test_rows.extend([test_user2id[u_id] for i in range(len(m_ids))]) 86 | test_cols.extend(movie_indexes) 87 | 88 | test_data = sparse.csr_matrix((np.ones_like(test_rows),(np.array(test_rows), np.array(test_cols))), dtype='float64', shape=(len(test_user_ids), number_of_movies)) 89 | pickle.dump(test_data, open("test_data.file", "wb")) 90 | print("number of test users: ", len(test_user_ids)) 91 | 92 | #---------------------------------------------------------------------------------------------------------------------------------------------------------------------- 93 | 94 | 95 | print("creating validation data") 96 | val_user2id = {} 97 | val_user2id = dict((uid, i) for (i, uid) in enumerate(val_user_ids)) 98 | 99 | val_rows = [] 100 | val_cols = [] 101 | for u_id in val_user_ids: 102 | print("val-", val_user2id[u_id]) 103 | m_ids = raw_data[(raw_data.userId == u_id) & (raw_data.rating > 3.5)]['movieId'].tolist() 104 | movie_indexes = [movie2id[m] for m in m_ids] 105 | val_rows.extend([val_user2id[u_id] for i in range(len(m_ids))]) 106 | val_cols.extend(movie_indexes) 107 | 108 | val_data = sparse.csr_matrix((np.ones_like(val_rows),(np.array(val_rows), np.array(val_cols))), dtype='float64', shape=(len(val_user_ids), number_of_movies)) 109 | pickle.dump(val_data, open("val_data.file", "wb")) 110 | print("number of validation users: ", len(val_user_ids)) -------------------------------------------------------------------------------- /Standard/readme.md: -------------------------------------------------------------------------------- 1 | 2 | - **read_data.py** file used to create training, validation and testing data files. It reads in the new_ratings.csv and creates train_data.file, val_data.file and test_data.file which store the dumped sparse_matrix representation of these data-sets. To create multiple random folds, change the random seed's value in line 13. 3 | 4 | - **vae_cf_keras.py** file is used to create the vae network, compile it and train it on the train_data. This code saves the model's weights in the specified location, and also logs the train and validation losses for analysis purposes. 
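  For quick reference, the objective this script optimises is the usual VAE loss: a reconstruction term (binary cross-entropy over all movies) plus the KL divergence of the approximate posterior from the standard-normal prior. A minimal sketch mirroring the `vae_loss` defined in vae_cf_keras.py below (`original_dim`, `z_mean` and `z_log_var` come from the encoder definition):

      def vae_loss(x, x_bar):
          reconst_loss = original_dim * objectives.binary_crossentropy(x, x_bar)
          kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
          return reconst_loss + kl_loss

  Multiplying by `original_dim` turns Keras's per-movie mean cross-entropy into a sum over movies, which keeps the reconstruction term on the same scale as the summed KL term.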
5 | 6 | - **evaluate_model_approach_1.py** and **evaluate_model_approach_2.py** load the saved weights, run the model on the test_data.file and calculates recall@20, recall@50 and ndcg@100 using the below mentioned testing approaches. Approach 2 is consistent with Liang et al (https://arxiv.org/pdf/1802.05814.pdf) 7 | 8 | - **plot_loss_graphs.ipynb** has the code which can be used to plot the loss vs epochs graph while running the model on test data 9 | 10 | - **project_user_clusters.ipynb** has the code which generates user clusters from the user-embeddings using k-means clustering and t-SNE dimensionality reduction. 11 | 12 | Testing approach 1 is where we obtained the metrics over all the movies which test users had marked 1 whereas approach 2 is the one where the 20% of the movies which were marked 1 were set off to 0 and then metrics were calculated how well our model recommended on these 20% of the movies. 13 | -------------------------------------------------------------------------------- /Standard/vae_cf_keras.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from keras.layers import Input, Dense, Lambda 5 | from keras.models import Model 6 | from keras import objectives 7 | from keras import backend as K 8 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback 9 | 10 | class LossHistory(Callback): 11 | def on_train_begin(self, logs={}): 12 | self.losses = [] 13 | self.val_losses = [] 14 | 15 | def on_epoch_end(self, epoch, logs={}): 16 | self.losses.append(logs.get('loss')) 17 | self.val_losses.append(logs.get('val_loss')) 18 | 19 | history = LossHistory() 20 | 21 | 22 | 23 | # encoder/decoder network size 24 | batch_size=500 25 | original_dim=26621 # number of movies 26 | intermediate_dim=600 27 | latent_dim=200 28 | nb_epochs=20 29 | epsilon_std=1.0 30 | 31 | 32 | # encoder network 33 | x=Input(batch_shape=(batch_size,original_dim)) 34 | h=Dense(intermediate_dim, activation='tanh')(x) 35 | z_mean=Dense(latent_dim)(h) 36 | z_log_var=Dense(latent_dim)(h) 37 | 38 | 39 | # sampling from latent dimension for decoder/generative part of network 40 | def sampling(args): 41 | _mean,_log_var=args 42 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 43 | return _mean+K.exp(_log_var/2)*epsilon 44 | 45 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 46 | 47 | # decoder network 48 | h_decoder=Dense(intermediate_dim, activation='tanh') 49 | x_bar=Dense(original_dim,activation='softmax') # this should be softmax right? 
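# Note on the question above: in the multinomial-likelihood setup of Liang et al.
# (the paper cited in the readme), the decoder does end in a softmax over all
# movies. Because vae_loss below applies a per-movie binary cross-entropy, a
# 'sigmoid' output (a Bernoulli likelihood per movie) would arguably be the more
# conventional pairing; softmax instead forces the outputs to form a probability
# distribution over the whole catalogue.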
50 | h_decoded = h_decoder(z) 51 | x_decoded = x_bar(h_decoded) 52 | 53 | # build and compile model 54 | vae = Model(x, x_decoded) 55 | def vae_loss(x,x_bar): 56 | reconst_loss=original_dim*objectives.binary_crossentropy(x, x_bar) 57 | kl_loss= -0.5*K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) 58 | return kl_loss + reconst_loss 59 | 60 | vae.compile(optimizer='adam', loss=vae_loss) 61 | 62 | 63 | x_train = pickle.load( open( "train_data.file", "rb" ) ) 64 | print("number of training users: ", x_train.shape[0]) 65 | 66 | x_val = pickle.load( open( "val_data.file", "rb" ) ) 67 | x_val = x_val.todense() 68 | 69 | def nn_batch_generator(x, y, batch_size, samples_per_epoch): 70 | number_of_batches = samples_per_epoch/batch_size 71 | counter=0 72 | shuffle_index = np.arange(np.shape(y)[0]) 73 | np.random.shuffle(shuffle_index) 74 | x = x[shuffle_index, :] 75 | y = y[shuffle_index, :] 76 | while 1: 77 | index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)] 78 | x_batch = x[index_batch,:].todense() 79 | y_batch = y[index_batch,:].todense() 80 | counter += 1 81 | yield (np.array(x_batch),np.array(y_batch)) 82 | if (counter >= number_of_batches): 83 | counter=0 84 | 85 | 86 | weightsPath = "./tmp/weights.hdf5" 87 | checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1, save_best_only=True) 88 | 89 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001) 90 | 91 | # sending complete training data and shuffle flag will shuffle so that each user comes atleast once in training because of multiple epochs 92 | vae.fit_generator(nn_batch_generator(x_train, x_train, batch_size, 118000), samples_per_epoch=118000, nb_epoch=nb_epochs, 93 | validation_data=(x_val, x_val), callbacks=[checkpointer, reduce_lr, history]) 94 | 95 | print("training losses over epochs") 96 | print(history.losses) 97 | 98 | print("validation losses over epochs") 99 | print(history.val_losses) 100 | -------------------------------------------------------------------------------- /standard_vae_model_results.txt: -------------------------------------------------------------------------------- 1 | Approach 1 2 | 3 | fold-1 4 | recall@20: 0.5361947618247523 5 | recall@50: 0.5631774940667692 6 | ndcg@100: 0.2724007829543391 7 | 8 | fold-2 9 | recall@20: 0.5399664609063171 10 | recall@50 0.56542322335807 11 | ndcg@100: 0.27541021400623394 12 | 13 | fold-3 14 | recall@20: 0.5375343986736952 15 | recall@50: 0.5698594621594564 16 | ndcg@100: 0.27567425682388574 17 | 18 | recall@20- mean: 0.53686458, stddev: 0.00067 19 | recal@50- mean: 0.566518478, stddev: 0.0033 20 | ndcg@100- mean: 0.27403752, stddev: 0.0016 21 | 22 | ------------------------------------------------------------------------------------------------------ 23 | 24 | Approach 2 25 | 26 | fold-1 27 | recall@20: 0.20603333875751115 28 | recall@50: 0.364723130654524 29 | ndcg@100: 0.15851297473345 30 | 31 | 32 | fold-2 33 | recall@20: 0.20715624463122 34 | recall@50 0.36350998713467 35 | ndcg@100: 0.15670241651531513 36 | 37 | 38 | fold-3 39 | recall@20: 0.2026753411889 40 | recall@50: 0.3620999812169 41 | ndcg@100: 0.1522099753781 42 | 43 | 44 | 45 | recall@20- mean: 0.205288308193, stddev: 0.001903667832 46 | recal@50- mean: 0.363444366335, stddev: 0.00107190105576 47 | ndcg@100- mean: 0.155808455542, stddev: 0.00264969493645 48 | 49 | 50 | ------------------------------------------------------------------------------------------------------ 51 | 52 | mean is rounded off till 3 decimal places 53 | 
testing approach 1 is where we obtained the metrics over all the movies which test users had marked 1, approach 2 is when the 20% of the movies which were marked 1 were set off to 0 and then metrics were calculated using only those 54 | 55 | standard vae results using testing approach 1 on movielens, mean stddev over 3 CVs 56 | recall@20- mean: 0.537, stddev: 0.00067 57 | recal@50- mean: 0.567, stddev: 0.0033 58 | ndcg@100- mean: 0.274, stddev: 0.0016 59 | 60 | standard vae results using testing approach 2 on movielens, mean stddev over 3 CVs 61 | recall@20- mean: 0.205, stddev: 0.0019 62 | recal@50- mean: 0.363, stddev: 0.00107 63 | ndcg@100- mean: 0.156, stddev: 0.0026 -------------------------------------------------------------------------------- /update_ratings.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open('movie_data.json', 'r') as f: 4 | data = json.load(f) 5 | 6 | with open('links.csv', 'r') as f: 7 | ids = f.read().splitlines() 8 | 9 | movie_json = {} 10 | for i in data: 11 | movie_json[i['imdbID']] = i 12 | 13 | movie_ids_list = [] 14 | movie_json_ml20 = {} 15 | for i in ids: 16 | i = i.split(",") 17 | imdb = i[1] 18 | ml20 = i[0] 19 | try: 20 | movie_json_ml20[ml20] = movie_json['tt'+imdb] 21 | movie_ids_list.append(ml20) 22 | except KeyError: 23 | pass 24 | 25 | with open('movie_ml20.json', 'w') as f: 26 | json.dump(movie_json_ml20, f) 27 | 28 | with open('ratings.csv', 'r') as f: 29 | ratings = f.read().splitlines() 30 | 31 | with open('new_ratings.csv', 'w') as f: 32 | for i in ratings: 33 | j = i 34 | i = i.split(",") 35 | movie_id = i[1] 36 | try : 37 | a = movie_json_ml20[movie_id] 38 | f.write(j+"\n") 39 | 40 | except KeyError: 41 | pass 42 | 43 | --------------------------------------------------------------------------------