├── Hybrid ├── README.md ├── feature_extraction │ ├── Genome │ │ ├── Readme.md │ │ ├── get_genomes.py │ │ └── unique_movies │ ├── Genre │ │ ├── Readme.md │ │ ├── get_genres.py │ │ └── unique_movies │ ├── IMDb │ │ ├── Readme.md │ │ ├── combine_all_features.py │ │ ├── get_imdb_data.py │ │ ├── liwc.py │ │ ├── process_imdb_data.py │ │ ├── process_plot.py │ │ └── vad.py │ └── Movie-VAE │ │ ├── Readme.md │ │ ├── get_embeddings.py │ │ └── vae_movie.py └── vae_imdb │ ├── Readme.md │ ├── h-vae_imdb.py │ ├── test_h-vae_eval1.py │ └── test_h-vae_eval2.py ├── README.md ├── Spotify ├── README.md ├── generate_song_predictions.py ├── read_challenge.py ├── read_mpd.py ├── run_vae_on_test_playlists.py └── vae_cf_spotify.py ├── Standard ├── evaluate_model_approach_1.py ├── evaluate_model_approach_2.py ├── plot_loss_graphs.ipynb ├── project_user_clusters.ipynb ├── read_data.py ├── readme.md └── vae_cf_keras.py ├── standard_vae_model_results.txt └── update_ratings.py /Hybrid/README.md: -------------------------------------------------------------------------------- 1 | 2 | This directory contains the code for a variational autoencoder applied to 3 | collaborative filtering, augmented with movie embeddings / hand-crafted movie features. 4 | 5 | update_ratings.py : creates a new ratings.csv file containing only those movies whose IMDb data has been downloaded 6 | 7 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Genome/Readme.md: -------------------------------------------------------------------------------- 1 | Run get_genomes.py to 2 | obtain a (26621, 1128) matrix containing a one-hot encoding of the genome tags. 3 | 4 | This is the input to the Movie-VAE 5 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Genome/get_genomes.py: -------------------------------------------------------------------------------- 1 | import json 2 | from operator import itemgetter 3 | import numpy as np 4 | NUM_GENOMES = 1128 5 | 6 | 7 | # converts a list of (tag_id, relevance) tuples to a 1128-dim one-hot vector 8 | def get_genome_vec(genome_tup): 9 | vec = np.zeros(NUM_GENOMES) 10 | print len(genome_tup) 11 | for i in genome_tup: 12 | 13 | tag = int(i[0])-1 14 | vec[tag] = 1 15 | return vec 16 | 17 | 18 | with open('./ml-20m/genome-scores.csv', 'r') as f: 19 | genome_data = f.read().splitlines() 20 | 21 | genome_dict = {} 22 | 23 | # Collect all the genome tags in genome-scores.csv 24 | for i in genome_data[1:]: 25 | i = i.split(",") 26 | mid = i[0] 27 | tagid = i[1] 28 | relevance = float(i[2]) 29 | try : 30 | genome_dict[mid].append((tagid, relevance)) 31 | except: 32 | genome_dict[mid] = [(tagid, relevance)] 33 | 34 | # sort by relevance and keep the 20 most relevant genome tags per movie 35 | for mid in genome_dict.keys(): 36 | scores = genome_dict[mid] 37 | scores = sorted(scores , key=itemgetter(1), reverse = True) 38 | scores = scores[:20] 39 | genome_dict[mid] = scores 40 | print len(genome_dict[mid]) 41 | 42 | with open('genome_scores.json', 'w') as f: 43 | json.dump(genome_dict, f) 44 | 45 | unk = np.array([0]*NUM_GENOMES) 46 | movie_embeddings_array = [] 47 | 48 | # convert each movie's tag list to a one-hot vector 49 | with open('unique_movies', 'r') as f: 50 | movie_id = f.read().splitlines() 51 | for i in movie_id: 52 | try: 53 | i = i.split(',') 54 | mid = i[0] 55 | movie_embeddings_array.append(get_genome_vec(genome_dict[mid])) 56 | except KeyError: 57 | movie_embeddings_array.append(unk) 58 | 59 | movie_embeddings_array = np.array(movie_embeddings_array) 60 | with open('movie_genomes.npy', 'wb') as f: 61 | np.save(f, 
movie_embeddings_array) 62 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Genre/Readme.md: -------------------------------------------------------------------------------- 1 | Run get_genres.py to get a .npy file containing a (26621, ) array, with each element representing the genre of that movie. 2 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Genre/get_genres.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | with open('./ml-20m/movies.csv', 'r') as f: 5 | data = f.read().splitlines() 6 | 7 | genre_data = {} 8 | for i in data[1:]: 9 | i = i.split(',') 10 | m_id = i[0] 11 | genres = i[-1] 12 | print genres 13 | genre_data[m_id] = genres.split('|') 14 | 15 | with open('genres.json', 'w') as f: 16 | json.dump(genre_data, f) 17 | 18 | 19 | genres = set() 20 | for m in genre_data.keys(): 21 | for g in genre_data[m]: 22 | genres.add(g) 23 | genre2id = dict((genre, i) for (i, genre) in enumerate(genres)) 24 | 25 | with open('unique_movies', 'r') as f: 26 | movie_ids = f.read().splitlines() 27 | 28 | with open('movie_genres.npy', 'wb') as f: 29 | genre_array = [] 30 | for i in movie_ids: 31 | i = i.split(',') 32 | mid = i[0] 33 | genre_array.append(genre2id[genre_data[mid][0]]) 34 | genre_array = np.array(genre_array) 35 | np.save(f, genre_array) 36 | 37 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/Readme.md: -------------------------------------------------------------------------------- 1 | Get IMDb features for each movie to be given as input to the Movie-VAE 2 | 3 | Order of running the scripts: 4 | 5 | get_imdb_data.py 6 | 7 | process_imdb_data.py 8 | 9 | process_plot.py 10 | 11 | combine_all_features.py 12 | 13 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/combine_all_features.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | with open('language.json', 'r') as f: 5 | language = json.load(f) 6 | 7 | with open('imdb_rat.json', 'r') as f: 8 | imdb_rat = json.load(f) 9 | 10 | with open('plot_feature.json', 'r') as f: 11 | plot = json.load(f) 12 | 13 | with open('rated.json', 'r') as f: 14 | rated = json.load(f) 15 | 16 | mid2idx = {} 17 | with open('unique_movies', 'r') as f: 18 | data = f.read().splitlines() 19 | for i in data: 20 | i = i.split(',') 21 | mid2idx[i[0]] = int(i[1]) 22 | 23 | movie_feat_list = [] 24 | 25 | idx = language['unique'] 26 | for i in language['list'].keys(): 27 | vec = np.zeros(len(idx)) 28 | lang = language['list'][i] 29 | for j in lang: 30 | item = idx[j] 31 | vec[item] =1.0 32 | 33 | language['list'][i] = list(vec) 34 | 35 | 36 | idx = rated['unique'] 37 | for i in rated['list'].keys(): 38 | vec = np.zeros(len(idx)) 39 | item = idx[rated['list'][i]] 40 | vec[item] =1.0 41 | rated['list'][i] = list(vec) 42 | 43 | 44 | for i in mid2idx.keys(): 45 | mid = str(mid2idx[i]) 46 | # mid = str(mid) 47 | vec = language['list'][mid] 48 | vec+= plot[mid] 49 | vec+= rated['list'][mid] 50 | vec.append(imdb_rat[mid]) 51 | vec = np.array(vec) 52 | movie_feat_list.append(vec) 53 | 54 | movie_feat_list = np.array(movie_feat_list) 55 | print movie_feat_list.shape 56 | 57 | with open('movie_features.npy', 'wb') as f: 58 | np.save(f, movie_feat_list) 59 | 60 | 
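# Added sanity-check sketch (not part of the original script): each row saved above is the
# concatenation of the one-hot language vector, the plot features (LIWC + VAD + word2vec),
# the one-hot "Rated" vector and the scalar IMDb rating, so the saved matrix should be 2-D
# with one row per entry in unique_movies.
check = np.load('movie_features.npy')
assert check.ndim == 2 and check.shape[0] == len(mid2idx)
print check.shape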
-------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/get_imdb_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import time 4 | 5 | def get_movie_data(imdb_id): 6 | url = 'http://www.omdbapi.com/?i={}&apikey=96725c4f&plot=full'.format(imdb_id) 7 | response = requests.get(url) 8 | return response.json() 9 | 10 | with open('links.csv', 'r') as f: 11 | ids = f.read().splitlines() 12 | 13 | 14 | movie_dict = [] 15 | count = 0 16 | for i in ids[1:]: 17 | i = i.split(",") 18 | ml_id = i[0] 19 | imdb_id = 'tt' + i[1] 20 | 21 | print imdb_id 22 | 23 | movie = get_movie_data(imdb_id) 24 | if movie['Response'] == 'True': 25 | movie_dict.append(movie) 26 | 27 | if count % 500 == 0: 28 | json_data = open("imdb_data.json", "w") 29 | movie_dict_json = json.dumps(movie_dict) 30 | json_data.write(movie_dict_json) 31 | json_data.close() 32 | 33 | count += 1 34 | time.sleep(1) 35 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/liwc.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.stem.lancaster import LancasterStemmer 3 | from nltk import PorterStemmer 4 | from nltk.tokenize import RegexpTokenizer 5 | from nltk.corpus import stopwords 6 | from nltk import FreqDist as freq 7 | import numpy as np 8 | from nltk.stem import WordNetLemmatizer 9 | 10 | """with open('liwc.cat', 'r') as f: 11 | liwc_cat = f.read().splitlines() 12 | 13 | liwc_json = {} 14 | for i in range(1, 65): 15 | print i 16 | with open('dictionary/{}.csv'.format(i), 'r') as f: 17 | words = f.read().splitlines() 18 | for word in words: 19 | #print word 20 | #word = word.encode('utf-8') 21 | try : 22 | liwc_json[word][i-1] += 1 23 | except KeyError: 24 | liwc_json[word] = [0]*64 25 | liwc_json[word][i-1] = 1 26 | 27 | print liwc_json 28 | with open('liwc.json', 'w') as f: 29 | json.dump(liwc_json, f)""" 30 | 31 | class liwc_score(): 32 | def __init__(self): 33 | with open('liwc_dict.json', 'r') as f: 34 | self.liwc = json.load(f) 35 | #print self.liwc 36 | self.stemmer = LancasterStemmer() 37 | self.lemmatizer = WordNetLemmatizer() 38 | self.tokenizer = RegexpTokenizer(r'\w+') 39 | self.stop_words = set(stopwords.words('english')) 40 | 41 | #Remove stopwords ? 
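    # Descriptive note on how the score is computed below: the input text is tokenized and
    # stemmed, each stem found in the LIWC dictionary contributes its 64-dimensional category
    # indicator vector weighted by the stem's frequency, and the weighted sum is divided by
    # the count of matched tokens, giving an average LIWC category profile for the text.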
42 | def token_stemmer(self, inp_sentence, remove_stop): 43 | token_list = self.tokenizer.tokenize(inp_sentence) 44 | if remove_stop : 45 | token_list = [word for word in token_list if word not in self.stop_words] 46 | stem_list = [self.stemmer.stem(word) for word in token_list] 47 | #stem_list = [self.lemmatizer.lemmatize(word) for word in token_list] 48 | 49 | return stem_list 50 | 51 | def get_liwc_score(self, inp_sentence, remove_stop = False): 52 | stem_list = self.token_stemmer(inp_sentence, remove_stop) 53 | #print stem_list 54 | fdist = dict(freq(stem_list)) 55 | 56 | liwc_vec = np.array([0.0]*64) 57 | count = 1.0 58 | #print fdist 59 | for word in fdist.keys(): 60 | try : 61 | #print self.liwc[word] 62 | liwc_vec += np.array( self.liwc[word] )*fdist[word] 63 | #print 'hello' 64 | count += fdist[word] 65 | #print liwc_vec 66 | except KeyError: 67 | 68 | pass 69 | 70 | return liwc_vec/count 71 | 72 | if __name__ == "__main__": 73 | with open('test', 'r') as f: 74 | inp_data = f.read() 75 | 76 | l = liwc_score() 77 | print l.get_liwc_score(inp_data) 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/process_imdb_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | def process_lang(string): 5 | string = string.split(",") 6 | string = [i.lstrip(' ').rstrip(' ') for i in string] 7 | return string 8 | 9 | with open('imdb_data.json', 'r') as f: 10 | raw_data = json.load(f) 11 | 12 | language = {'unique' : set(), 'list' : {} } 13 | imdb_rat = {} 14 | plot = {} 15 | rated = {'unique' : set(), 'list' : {} } 16 | 17 | mid2idx = {} 18 | with open('unique_movies', 'r') as f: 19 | data = f.read().splitlines() 20 | for i in data: 21 | i = i.split(',') 22 | mid2idx[i[0]] = int(i[1]) 23 | 24 | for mid in mid2idx.keys(): 25 | midx = mid2idx[mid] 26 | 27 | lang = process_lang(raw_data[mid]['Language']) 28 | language['list'][midx] = lang 29 | for i in lang: 30 | language['unique'].add(i) 31 | 32 | if raw_data[mid]['imdbRating'] != 'N/A': 33 | imdb_rat[midx] = float(raw_data[mid]['imdbRating']) 34 | else : 35 | imdb_rat[midx] = float(0.0) 36 | 37 | plot[midx] = raw_data[mid]['Plot'] 38 | 39 | rated['list'][midx] = raw_data[mid]['Rated'] 40 | rated['unique'].add(raw_data[mid]['Rated']) 41 | 42 | 43 | language['unique'] = dict( (item, i) for (i, item) in enumerate(list(language['unique']))) 44 | rated['unique'] = dict( (item, i) for (i, item) in enumerate(list(rated['unique']))) 45 | 46 | with open('language.json', 'w') as f: 47 | json.dump( language,f) 48 | 49 | with open('imdb_rat.json', 'w') as f: 50 | json.dump( imdb_rat ,f) 51 | 52 | with open('plot.json', 'w') as f: 53 | json.dump( plot,f) 54 | 55 | with open('rated.json', 'w') as f: 56 | json.dump( rated ,f) 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/process_plot.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import sys 3 | from liwc import liwc_score 4 | from vad import vad_score 5 | 6 | import json 7 | from nltk.stem.lancaster import LancasterStemmer 8 | from nltk import PorterStemmer 9 | from nltk.tokenize import RegexpTokenizer 10 | from nltk.corpus import stopwords 11 | from nltk import FreqDist as freq 12 | import numpy as np 13 | from nltk.stem import WordNetLemmatizer 14 | 15 | model = 
gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) 16 | model.save("word2vec.model") 17 | 18 | tokenizer = RegexpTokenizer(r'\w+') 19 | stop_words = set(stopwords.words('english')) 20 | lemmatizer = WordNetLemmatizer() 21 | ls = liwc_score() 22 | vs = vad_score() 23 | 24 | def token_stemmer(inp_sentence): 25 | token_list = tokenizer.tokenize(inp_sentence) 26 | token_list = [word for word in token_list if word not in stop_words] 27 | stem_list = [lemmatizer.lemmatize(word) for word in token_list] 28 | return stem_list 29 | 30 | def normalize(x): 31 | return (x - np.min(x))/(np.max(x) - np.min(x)) 32 | 33 | def get_word2vec(text): 34 | word_list = token_stemmer(text) 35 | vec = np.zeros(300) 36 | count = 1.0 37 | for word in word_list: 38 | try: 39 | vec+= (model[word]) 40 | count += 1.0 41 | #vec+= model[word] 42 | except: 43 | pass 44 | return vec/count 45 | 46 | 47 | with open('plot.json', 'r') as f: 48 | plot_data = json.load(f) 49 | 50 | w2vec_list = [] 51 | plot_feature = {} 52 | for mid in plot_data.keys(): 53 | text = plot_data[mid] 54 | 55 | w2vec = get_word2vec(text) 56 | w2vec_list.append(w2vec) 57 | 58 | vec = list(ls.get_liwc_score(text)) 59 | vec += list(vs.get_vad_score(text)) 60 | 61 | plot_feature[mid] = vec 62 | 63 | w2vec_list = np.array(w2vec_list) 64 | w2vec_list = (w2vec_list - np.min(w2vec_list))/(np.max(w2vec_list) - np.min(w2vec_list)) 65 | 66 | mids = plot_data.keys() 67 | 68 | for i in range(0, len(mids)): 69 | mid = mids[i] 70 | vec = list(w2vec_list[i]) 71 | plot_feature[mid] = plot_feature[mid] + vec 72 | 73 | with open('plot_feature.json', 'w') as f: 74 | json.dump(plot_feature, f) 75 | 76 | 77 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/IMDb/vad.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.stem import WordNetLemmatizer 3 | from nltk.tokenize import RegexpTokenizer 4 | from nltk.corpus import stopwords 5 | from nltk import FreqDist as freq 6 | 7 | """with open('warriner.csv', 'r') as f: 8 | vad_raw = f.read().splitlines() 9 | 10 | vad_json = {} 11 | for i in vad_raw[1:]: 12 | i = i.split(",") 13 | item = {} 14 | item['v'] = float(i[2]) 15 | item['a'] = float(i[5]) 16 | item['d'] = float(i[8]) 17 | vad_json[i[1]] = item 18 | 19 | with open('warriner.json', 'w') as f: 20 | json.dump(vad_json, f)""" 21 | 22 | 23 | class vad_score(): 24 | def __init__(self): 25 | with open('warriner.json', 'r') as f: 26 | self.vad = json.load(f) 27 | self.tokenizer = RegexpTokenizer(r'\w+') 28 | self.stop_words = set(stopwords.words('english')) 29 | self.lemmatizer = WordNetLemmatizer() 30 | 31 | def token_stemmer(self, inp_sentence, remove_stop ): 32 | token_list = self.tokenizer.tokenize(inp_sentence) 33 | if remove_stop : 34 | token_list = [word for word in token_list if word not in self.stop_words] 35 | lemma_list = [self.lemmatizer.lemmatize(word) for word in token_list] 36 | 37 | return lemma_list 38 | 39 | def get_vad_score(self, inp_sentence, remove_stop= False): 40 | token_list = self.token_stemmer(inp_sentence, remove_stop) 41 | fdist = dict(freq(token_list)) 42 | 43 | v_sum = 0.0 44 | a_sum = 0.0 45 | d_sum = 0.0 46 | count = 1.0 47 | 48 | for word in fdist.keys(): 49 | try: 50 | v_sum += self.vad[word]['v']*fdist[word] 51 | a_sum += self.vad[word]['a']*fdist[word] 52 | d_sum += self.vad[word]['d']*fdist[word] 53 | count += fdist[word] 54 | except KeyError: 55 | pass 56 | 57 | return 
(v_sum/count, a_sum/count, d_sum/count) 58 | 59 | 60 | if __name__ == "__main__": 61 | with open('test', 'r') as f: 62 | inp_data = f.read() 63 | 64 | l = vad_score() 65 | print l.get_vad_score(inp_data) 66 | 67 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Movie-VAE/Readme.md: -------------------------------------------------------------------------------- 1 | get movie embeddings to be given as input to hybrid models. 2 | 3 | run : 4 | 5 | vae_movie.py 6 | 7 | get_embeddings.py 8 | 9 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Movie-VAE/get_embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import math 5 | from keras.layers import Input, Dense, Lambda 6 | from keras.models import Model, load_model 7 | from keras import objectives 8 | from keras import backend as K 9 | 10 | batch_size=1 11 | original_dim=1128 12 | intermediate_dim=50 13 | latent_dim=3 14 | epsilon_std=1.0 15 | 16 | x=Input(batch_shape=(batch_size,original_dim)) 17 | h=Dense(intermediate_dim, activation='relu')(x) 18 | z_mean=Dense(latent_dim)(h) 19 | z_log_var=Dense(latent_dim)(h) 20 | 21 | def sampling(args): 22 | _mean,_log_var=args 23 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 24 | return _mean+K.exp(_log_var/2)*epsilon 25 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 26 | 27 | h_decoder=Dense(intermediate_dim, activation='relu') 28 | x_bar=Dense(original_dim,activation='sigmoid') 29 | h_decoded = h_decoder(z) 30 | x_decoded = x_bar(h_decoded) 31 | 32 | vae = Model(x, [x_decoded,z]) 33 | weightsPath = "mov_genomes.hdf5" 34 | vae.load_weights(weightsPath) 35 | 36 | x_test_matrix = np.load( open( "movie_genomes.npy", "rb" ) ) 37 | 38 | x_test_reconstructed = vae.predict(x_test_matrix, batch_size=batch_size) # float values per user 39 | 40 | with open('genome_embed.npy', 'wb') as f: 41 | np.save(f, np.array(x_test_reconstructed[1].tolist())) 42 | 43 | -------------------------------------------------------------------------------- /Hybrid/feature_extraction/Movie-VAE/vae_movie.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from keras.layers import Input, Dense, Lambda 5 | from keras.models import Model 6 | from keras import objectives 7 | from keras import backend as K 8 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback 9 | 10 | 11 | batch_size=20 12 | original_dim=1128 13 | intermediate_dim=100 14 | latent_dim=2 15 | nb_epochs=20 16 | epsilon_std=1.0 17 | 18 | x=Input(batch_shape=(batch_size,original_dim)) 19 | h=Dense(intermediate_dim, activation='relu')(x) 20 | z_mean=Dense(latent_dim)(h) 21 | z_log_var=Dense(latent_dim)(h) 22 | 23 | def sampling(args): 24 | _mean,_log_var=args 25 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 26 | return _mean+K.exp(_log_var/2)*epsilon 27 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 28 | 29 | h_decoder=Dense(intermediate_dim, activation='relu') 30 | x_bar=Dense(original_dim,activation='sigmoid') 31 | h_decoded = h_decoder(z) 32 | x_decoded = x_bar(h_decoded) 33 | 34 | vae = Model(x, x_decoded) 35 | def vae_loss(x,x_bar): 36 | reconst_loss=original_dim*objectives.binary_crossentropy(x, x_bar) 
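    # Descriptive note: the total VAE objective is the (scaled) binary cross-entropy
    # reconstruction term above plus the KL divergence below, which pulls the approximate
    # posterior N(z_mean, exp(z_log_var)) towards the standard normal prior.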
37 | kl_loss=-0.5*K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) 38 | return reconst_loss + kl_loss 39 | 40 | vae.compile(optimizer='adam', loss=vae_loss) 41 | 42 | x_train = np.load( open( "movie_genomes.npy", "rb" ) ) 43 | 44 | def nn_batch_generator(x, y, batch_size, samples_per_epoch): 45 | number_of_batches = samples_per_epoch/batch_size 46 | counter=0 47 | shuffle_index = np.arange(np.shape(y)[0]) 48 | np.random.shuffle(shuffle_index) 49 | x = x[shuffle_index, :] 50 | y = y[shuffle_index, :] 51 | while 1: 52 | index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)] 53 | x_batch = x[index_batch,:] 54 | y_batch = y[index_batch,:] 55 | counter += 1 56 | yield (np.array(x_batch),np.array(y_batch)) 57 | if (counter >= number_of_batches): 58 | counter=0 59 | 60 | 61 | weightsPath = "./mov_genomes.hdf5" 62 | checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1) 63 | 64 | vae.fit_generator(nn_batch_generator(x_train, x_train, batch_size, 26620), samples_per_epoch=26620, nb_epoch=nb_epochs, callbacks=[checkpointer]) 65 | -------------------------------------------------------------------------------- /Hybrid/vae_imdb/Readme.md: -------------------------------------------------------------------------------- 1 | train hybrid-VAE : h-vae_imdb.py 2 | 3 | eval 1 : test_h-vae_eval1.py 4 | 5 | eval 2 : test_h-vae_eval2.py 6 | -------------------------------------------------------------------------------- /Hybrid/vae_imdb/h-vae_imdb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from keras.layers import Input, Dense, Lambda, merge, Embedding, Flatten, LSTM 5 | from keras.models import Model, Sequential 6 | from keras import objectives 7 | from keras import backend as K 8 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback 9 | import keras 10 | import tensorflow as tf 11 | import pdb 12 | 13 | 14 | batch_size=500 15 | original_dim=26621 16 | intermediate_dim=600 17 | latent_dim=200 18 | nb_epochs=15 19 | epsilon_std=1.0 20 | 21 | vocab_size = 26621 22 | embed_dim = 3 23 | seq_length = 26621 24 | 25 | 26 | with open('feature_embed.npy', 'rb') as f: 27 | embedding_matrix = np.load(f) 28 | embedding_matrix = np.append(np.array([[0.0, 0.0, 0.0]]) ,embedding_matrix, axis =0) 29 | 30 | x=Input(batch_shape=(batch_size,original_dim)) 31 | embedding_layer = Embedding(vocab_size+1, 3, weights=[embedding_matrix], input_length=seq_length, trainable=True) 32 | embed = embedding_layer(x) 33 | flat_embed = Flatten() 34 | embed = flat_embed(embed) 35 | h=Dense(intermediate_dim, activation='tanh')(embed) 36 | 37 | z_mean=Dense(latent_dim)(h) 38 | z_log_var=Dense(latent_dim)(h) 39 | def sampling(args): 40 | _mean,_log_var=args 41 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 42 | return _mean+K.exp(_log_var/2)*epsilon 43 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 44 | 45 | # decoder network 46 | h_decoder=Dense(intermediate_dim, activation='tanh') 47 | x_bar=Dense(original_dim, activation='softmax') 48 | h_decoded = h_decoder(z) 49 | x_decoded = x_bar(h_decoded) 50 | 51 | mul_inp = Input(batch_shape=(batch_size,original_dim)) 52 | x_decoded = merge([x_decoded, mul_inp], mode = 'mul') 53 | vae = Model([x, mul_inp], x_decoded) 54 | 55 | 56 | def vae_loss(x,x_bar): 57 | reconst_loss=original_dim*objectives.binary_crossentropy(x, x_bar) 58 | kl_loss=-0.5*K.sum(1 + z_log_var - 
K.square(z_mean) - K.exp(z_log_var), axis=-1) 59 | return reconst_loss + kl_loss 60 | 61 | vae.compile(optimizer='adam', loss=vae_loss) 62 | print(vae.summary()) 63 | 64 | x_train = pickle.load( open( "train_data.file", "rb" ) ) 65 | movie_indices = np.array([range(1,26622)]) 66 | movie_indices = np.repeat(movie_indices, batch_size, axis = 0) 67 | 68 | 69 | def nn_batch_generator(x, y, batch_size, samples_per_epoch): 70 | number_of_batches = samples_per_epoch/batch_size 71 | counter=0 72 | shuffle_index = np.arange(np.shape(y)[0]) 73 | np.random.shuffle(shuffle_index) 74 | x = x[shuffle_index, :] 75 | y = y[shuffle_index, :] 76 | while 1: 77 | index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)] 78 | x_batch = np.array(x[index_batch,:].todense()).astype(float) 79 | x_new_batch = x_batch*movie_indices 80 | 81 | counter += 1 82 | yield ([x_new_batch, x_batch], x_batch) 83 | if (counter >= number_of_batches): 84 | counter=0 85 | 86 | 87 | weightsPath = "weights_h-vae_imdb.hdf5" 88 | checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1) 89 | 90 | vae.fit_generator(nn_batch_generator(x_train, x_train, batch_size, 118000) , samples_per_epoch=118000, nb_epoch=nb_epochs, callbacks = [checkpointer]) 91 | -------------------------------------------------------------------------------- /Hybrid/vae_imdb/test_h-vae_eval1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import math 5 | from keras.layers import Input, Dense, Lambda, Embedding, Flatten, merge 6 | from keras.models import Model, load_model 7 | from keras import objectives 8 | from keras import backend as K 9 | 10 | batch_size=500 11 | original_dim=26621 12 | intermediate_dim=600 13 | latent_dim=200 14 | nb_epochs=15 15 | epsilon_std=1.0 16 | vocab_size = 26621 17 | embed_dim = 3 18 | seq_length = 26621 19 | 20 | 21 | with open('feature_embed_3dim.npy', 'rb') as f: 22 | embedding_matrix = np.load(f) 23 | embedding_matrix = np.append(np.array([[0.0, 0.0, 0.0]]) ,embedding_matrix, axis =0) 24 | 25 | x=Input(batch_shape=(batch_size,original_dim)) 26 | embedding_layer = Embedding(vocab_size+1, 3, weights=[embedding_matrix], input_length=seq_length, trainable=True) 27 | embed = embedding_layer(x) 28 | flat_embed = Flatten() 29 | embed = flat_embed(embed) 30 | h=Dense(intermediate_dim, activation='tanh')(embed) 31 | 32 | z_mean=Dense(latent_dim)(h) 33 | z_log_var=Dense(latent_dim)(h) 34 | 35 | def sampling(args): 36 | _mean,_log_var=args 37 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 38 | return _mean+K.exp(_log_var/2)*epsilon 39 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 40 | 41 | h_decoder=Dense(intermediate_dim, activation='tanh') 42 | x_bar=Dense(original_dim2, activation='softmax') 43 | h_decoded = h_decoder(z) 44 | x_decoded = x_bar(h_decoded) 45 | 46 | mul_inp = Input(batch_shape=(batch_size,original_dim)) 47 | x_decoded2 = merge([x_decoded, mul_inp], mode = 'mul') 48 | vae = Model([x, mul_inp], x_decoded) 49 | 50 | weightsPath = "weights_h-vae_imdb.hdf5" 51 | vae.load_weights(weightsPath) 52 | 53 | movie_indices = np.array([range(1,26622)]) 54 | movie_indices = np.repeat(movie_indices, batch_size, axis = 0) 55 | 56 | x_test_matrix = pickle.load( open( "test_data.file", "rb" ) ) 57 | x_test_matrix = x_test_matrix.todense() 58 | x_test = np.squeeze(np.asarray(x_test_matrix)) 59 | 60 | x_test_new = x_test*movie_indices 61 | 62 | 
x_test_reconstructed = vae.predict([x_test_new, x_test], batch_size=batch_size) # float values per user 63 | 64 | def recallatk(x_test, x_test_reconstructed, k): 65 | recall_values = [] 66 | total_recall = 0.0 67 | for i in range(len(x_test)): 68 | top_rated_movies_idx = [i for i, x in enumerate(x_test[i].tolist()) if x == 1.0] 69 | if len(top_rated_movies_idx) == 0: 70 | continue 71 | 72 | sorted_ratings = x_test_reconstructed[i].tolist() 73 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 74 | 75 | sum = 0.0 76 | for i in range(0, k): 77 | if top_predicted_movies_idx[i] in top_rated_movies_idx: 78 | sum+=1.0 79 | recall = sum/float(min(k, len(top_rated_movies_idx))) 80 | total_recall += recall 81 | recall_values.append(recall) 82 | return total_recall/float(len(recall_values)) 83 | 84 | def ndcgatk(x_test, x_test_reconstructed, k): 85 | ndcg_values = [] 86 | total_ndcg = 0.0 87 | best = 0.0 88 | for i in range(len(x_test)): 89 | top_rated_movies_idx = [i for i, x in enumerate(x_test[i].tolist()) if x == 1.0] 90 | 91 | if len(top_rated_movies_idx) == 0: 92 | continue 93 | sorted_ratings = x_test_reconstructed[i].tolist() 94 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 95 | sum_ndcg = 0 96 | for i in range(0, k): 97 | if top_predicted_movies_idx[i] in top_rated_movies_idx: 98 | ndcg = 1/(math.log(i+2)) 99 | else: 100 | ndcg = 0 101 | sum_ndcg += ndcg 102 | 103 | total_ndcg += sum_ndcg 104 | ndcg_values.append(sum_ndcg) 105 | 106 | ndcg_values = np.array(ndcg_values) 107 | max_ndcg = ndcg_values.max() 108 | ndcg_values = ndcg_values / max_ndcg 109 | total_ndcg = np.sum(ndcg_values) 110 | 111 | return total_ndcg/float(len(ndcg_values)) 112 | 113 | print("NDCG at 100: ", ndcgatk(x_test, x_test_reconstructed, 100)) 114 | print("recall at 20: ", recallatk(x_test, x_test_reconstructed, 20)) 115 | print("recall at 50: ", recallatk(x_test, x_test_reconstructed, 50)) 116 | -------------------------------------------------------------------------------- /Hybrid/vae_imdb/test_h-vae_eval2.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import pickle 4 | import os 5 | import math 6 | from keras.layers import Input, Dense, Lambda, Embedding, Flatten 7 | from keras.models import Model, load_model 8 | from keras import objectives 9 | from keras import backend as K 10 | 11 | batch_size=500 12 | original_dim=26621 13 | intermediate_dim=600 14 | latent_dim=200 15 | 16 | epsilon_std=1.0 17 | 18 | vocab_size = 26621 19 | embed_dim = 3 20 | seq_length = 26621 21 | x_test_size = 10000 22 | 23 | with open('feature_embed_3dim.npy', 'rb') as f: 24 | embedding_matrix = np.load(f) 25 | embedding_matrix = np.append(np.array([[0.0, 0.0, 0.0]]) ,embedding_matrix, axis =0) 26 | 27 | 28 | x=Input(batch_shape=(batch_size,original_dim)) 29 | embedding_layer = Embedding(vocab_size+1, 3, weights=[embedding_matrix], input_length=seq_length, trainable=True) 30 | embed = embedding_layer(x) 31 | flat_embed = Flatten() 32 | embed = flat_embed(embed) 33 | h=Dense(intermediate_dim, activation='tanh')(embed) 34 | 35 | z_mean=Dense(latent_dim)(h) 36 | z_log_var=Dense(latent_dim)(h) 37 | 38 | 39 | def sampling(args): 40 | _mean,_log_var=args 41 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 42 | return _mean+K.exp(_log_var/2)*epsilon 43 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, 
z_log_var]) 44 | 45 | h_decoder=Dense(intermediate_dim, activation='tanh') 46 | x_bar=Dense(original_dim2, activation='softmax') 47 | h_decoded = h_decoder(z) 48 | x_decoded = x_bar(h_decoded) 49 | 50 | mul_inp = Input(batch_shape=(batch_size,original_dim)) 51 | x_decoded2 = merge([x_decoded, mul_inp], mode = 'mul') 52 | 53 | vae = Model([x, mul_inp], x_decoded) 54 | 55 | weightsPath = "weights_h-vae_imdb.hdf5" 56 | vae.load_weights(weightsPath) 57 | 58 | 59 | 60 | movie_indices = np.array([range(1,26622)]) 61 | movie_indices = np.repeat(movie_indices, batch_size, axis = 0) 62 | 63 | x_test_matrix = pickle.load( open( "test_data.file", "rb" ) ) 64 | x_test_matrix = x_test_matrix.todense() 65 | x_test = np.squeeze(np.asarray(x_test_matrix)) 66 | 67 | x_test_new_list = [] 68 | x_test_fold_out_indices = [] 69 | 70 | for i in range(x_test_size): 71 | user_i_features = x_test[i] 72 | one_indices = np.argwhere(user_i_features > 0.0) 73 | number_of_one_indices = one_indices.shape[0] 74 | fold_out_number = int(0.2*number_of_one_indices) 75 | 76 | fold_out_indices = random.sample(one_indices.tolist(), fold_out_number) 77 | x_test_fold_out_indices.append(fold_out_indices) 78 | 79 | np.put(user_i_features, fold_out_indices, np.zeros(fold_out_number)) 80 | x_test_new_list.append(user_i_features) 81 | 82 | x_test_new_list = np.array(x_test_new_list) 83 | x_test_new_list = x_test_new_list * movie_indices 84 | x_test_reconstructed = vae.predict([x_test_new_list, x_test], batch_size=batch_size) 85 | 86 | 87 | def recallatk(x_test, x_test_fold_out_indices, x_test_reconstructed, k): 88 | recall_values = [] 89 | total_recall = 0.0 90 | for i in range(len(x_test)): 91 | if len(x_test_fold_out_indices[i]) == 0: # if this user hadn't rated any movie as 1 92 | continue 93 | 94 | item_list = [item for sublist in x_test_fold_out_indices[i] for item in sublist] 95 | 96 | sorted_ratings = x_test_reconstructed[i].tolist() 97 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 98 | 99 | sum = 0.0 100 | for j in range(0, k): 101 | if top_predicted_movies_idx[j] in item_list: 102 | sum+=1.0 103 | recall = sum/float(min(k, len(x_test_fold_out_indices[i]))) 104 | total_recall += recall 105 | recall_values.append(recall) 106 | return total_recall/float(len(recall_values)) 107 | 108 | def ndcgatk(x_test, x_test_fold_out_indices, x_test_reconstructed, k): 109 | ndcg_values = [] 110 | total_ndcg = 0.0 111 | best = 0.0 112 | for i in range(len(x_test)): 113 | if len(x_test_fold_out_indices[i]) == 0: 114 | continue 115 | 116 | sorted_ratings = x_test_reconstructed[i].tolist() 117 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 118 | sum_ndcg = 0 119 | item_list = [item for sublist in x_test_fold_out_indices[i] for item in sublist] 120 | for j in range(0, k): 121 | if top_predicted_movies_idx[j] in item_list: 122 | ndcg = 1/(math.log(j+2)) 123 | else: 124 | ndcg = 0 125 | sum_ndcg += ndcg 126 | total_ndcg += sum_ndcg 127 | ndcg_values.append(sum_ndcg) 128 | 129 | ndcg_values = np.array(ndcg_values) 130 | max_ndcg = ndcg_values.max() 131 | ndcg_values = ndcg_values / max_ndcg 132 | total_ndcg = np.sum(ndcg_values) 133 | 134 | return total_ndcg/float(len(ndcg_values)) 135 | 136 | print("NDCG at 100: ", ndcgatk(x_test, x_test_fold_out_indices, x_test_reconstructed, 100)) 137 | 138 | print("recall at 20: ", recallatk(x_test, x_test_fold_out_indices, x_test_reconstructed, 20)) 139 | 140 | print("recall at 50: ", recallatk(x_test, 
x_test_fold_out_indices, x_test_reconstructed, 50)) 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Variational-Autoencoders-Collaborative-Filtering 2 | Please cite our paper, [A Hybrid Variational Autoencoder for Collaborative Filtering](https://arxiv.org/abs/1808.01006), if you find this repository helpful. 3 | 4 | This repository contains the code implementing variational autoencoders (VAE) for collaborative filtering (CF) on MovieLens data and Spotify's Million Playlist Dataset (MPD). 5 | 6 | **Link to MovieLens data**: http://files.grouplens.org/datasets/movielens/ml-20m.zip 7 | For the MovieLens dataset, we could not use the ratings.csv file directly because it contains some movies that IMDb does not recognize, so we created new_ratings.csv. The code for this filtering is in update_ratings.py. 8 | 9 | **Million Playlist Dataset** - official website hosted at https://recsys-challenge.spotify.com/ 10 | One needs to register on the website and download the training data and the test data (challenge set) as part of the RecSys 2018 playlist completion challenge. 11 | 12 | **The folder ./Hybrid** contains the code for the implementation of our proposed hybrid VAE model. 13 | 14 | **The folder ./Standard** contains the code for the implementation of the standard VAE model. 15 | 16 | **The folder ./Spotify** contains the code used for the playlist completion challenge, covering data preprocessing, training, and generating predictions. 17 | 18 | Please look into each folder to read more about the files used for that specific implementation. 19 | -------------------------------------------------------------------------------- /Spotify/README.md: -------------------------------------------------------------------------------- 1 | 2 | Variational autoencoder code adapted for the task of playlist completion/song recommendation on the Spotify Million Playlist Dataset (MPD). 3 | 4 | - **read_mpd.py** is used to create the training and validation files from the Spotify RecSys MPD dataset. It takes trackcount.file as input, which stores the 125,000 most popular tracks from the MPD; this reduces the size of the input data. It creates train_data.file and val_data.file using a sparse matrix representation, while also creating files that store the raw rows and columns for both datasets. In addition, it creates track_dict.file, the dictionary that maintains the track_uri-to-matrix-index mapping. 5 | 6 | - **read_challenge.py** also uses trackcount.file and track_dict.file to build a sparse matrix representation of the challenge dataset provided by Spotify, producing test_data.file. 7 | 8 | - **vae_cf_spotify.py** is used to create the VAE network, compile it and train it on the training data. This code saves the model's weights in the specified location, and also logs the training and validation losses for analysis purposes. 9 | 10 | - **run_vae_on_test_playlists.py** reads in test_data.file (which has the 10,000 playlists given in the challenge set), loads the saved model weights, and predicts on these playlists. Since ours is a VAE architecture, the reconstructed playlist representation (the x_test_reconstructed variable) is then saved using pickle. This saved file is used to finally generate song predictions for each of these 10,000 playlists. 
11 | 12 | - **generate_song_predictions.py** file uses the output from the **run_vae_on_test_playlists.py** along with track_dict.file and the test_data.file to create a list of 500 predicted tracks for each playlist. It sorts the input and filters tracks with highest probabilities for each playlist, eliminates tracks that already belong to the playlist, and stores them in predictions.file. 13 | 14 | - **generate_csv.py** file takes the output from **generate_song_predictions.py** and creates a CSV in the format acceptable by the Spotify RecSys Challenge, suitable for submission. 15 | -------------------------------------------------------------------------------- /Spotify/generate_song_predictions.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import json 3 | 4 | x_test_size = 10000 5 | first_half = 5000 6 | 7 | print("started") 8 | x_test_reconstructed = pickle.load(open("x_test_reconstructed.file", "rb")) 9 | print(x_test_reconstructed.shape) 10 | 11 | tracks = pickle.load(open("track_dict.file", "rb")) 12 | test_data = pickle.load(open("test_data.file", "rb")) 13 | 14 | challenge = json.load(open("challenge_set.json")) 15 | playlists = challenge['playlists'] 16 | predictions = {} 17 | 18 | # TODO: change the range to (first_half, x_test_size) 19 | for i in range(0, first_half): 20 | print("Reading playlist: " + str(i)) 21 | sorted_probabilities = x_test_reconstructed[i].tolist() 22 | 23 | # we can pick top 1000 per say, disregard the ones (get track id from array index using track_dict) which were already there using SongCheck.py 24 | #and get the track ids of the remaining top ones using array index and track_dicts 25 | 26 | pred_size = 0 27 | current_prediction = [] 28 | # get top 700 playlists - enough to find top 500 tracks 29 | top_predicted_movies_idx = (sorted(range(len(sorted_probabilities)), key=lambda i: sorted_probabilities[i])[-700:]) 30 | # reverse since it stores index in ascending order within top 700 31 | top_predicted_movies_idx.reverse() 32 | 33 | for j in top_predicted_movies_idx: 34 | if (test_data[i, j] == 1.0): 35 | track_uri = False 36 | else: 37 | track_uri = (list(tracks.keys())[list(tracks.values()).index(j)]) 38 | 39 | if (track_uri): 40 | current_prediction.append(track_uri) 41 | pred_size += 1 42 | if (pred_size == 500): 43 | break 44 | # print(current_prediction) 45 | pid = playlists[i]['pid'] 46 | predictions[pid] = current_prediction 47 | # print(predictions) 48 | 49 | # TODO: change this to predictions2.file 50 | pickle.dump(predictions, open("predictions1.file", "wb")) -------------------------------------------------------------------------------- /Spotify/read_challenge.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import numpy as np 4 | import pickle 5 | from scipy import sparse 6 | 7 | number_of_songs = 125000 8 | 9 | rows = [] 10 | cols = [] 11 | 12 | json_data = json.load(open("challenge/challenge_set.json")) 13 | playlists = json_data['playlists'] 14 | 15 | tracks = pickle.load(open("track_dict.file", "rb")) 16 | trackfile = pickle.load(open("trackcount.file", "rb")) 17 | 18 | user_count = 0 19 | 20 | for playlist in playlists: 21 | track_list = [] 22 | for track in playlist['tracks']: 23 | if track['track_uri'] in trackfile: 24 | track_list.append(tracks[track["track_uri"]]) 25 | rows.extend(user_count for i in range(len(track_list))) 26 | cols.extend(track_list) 27 | user_count += 1 28 | 29 | test_data 
= sparse.csr_matrix((np.ones_like(rows),(np.array(rows), np.array(cols))), dtype='float64', shape=(10000, number_of_songs)) 30 | pickle.dump(test_data, open("test_data.file", "wb")) 31 | 32 | -------------------------------------------------------------------------------- /Spotify/read_mpd.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import numpy as np 4 | import random 5 | import pickle 6 | from scipy import sparse 7 | 8 | # number_of_songs = 2262292 9 | number_of_songs = 125000 10 | 11 | track_count = 0 12 | tracks = {} 13 | 14 | file_count = 0 15 | validation = 0 16 | playlist_count = 0 17 | 18 | rows = [] 19 | cols = [] 20 | valid_rows = [] 21 | valid_cols = [] 22 | 23 | trackfile = pickle.load(open("trackcount.file", "rb")) 24 | 25 | for i in range(0, 1000): 26 | print("----FILE------ " + str(i)) 27 | filename = 'mpd/data/mpd.slice.' + str(file_count) + '-' + str(file_count + 999) + '.json' 28 | json_data = json.load(open(filename)) 29 | 30 | playlists = json_data["playlists"] 31 | 32 | choice = False 33 | if validation < 10000: 34 | choice = random.choice([True, False]) 35 | # print(choice) 36 | 37 | for playlist in playlists: 38 | track_list = [] 39 | for track in playlist['tracks']: 40 | if track['track_uri'] in trackfile: 41 | if track['track_uri'] not in tracks: 42 | tracks[track['track_uri']] = track_count 43 | track_count += 1 44 | track_list.append(tracks[track["track_uri"]]) 45 | 46 | if (choice): 47 | valid_rows.extend(validation for i in range(len(track_list))) 48 | valid_cols.extend(track_list) 49 | validation += 1 50 | else: 51 | rows.extend(playlist_count for i in range(len(track_list))) 52 | # print(rows) 53 | cols.extend(track_list) 54 | playlist_count += 1 55 | 56 | # if (i == 2): 57 | # break 58 | 59 | file_count += 1000 60 | 61 | pickle.dump(rows, open("rows.file", "wb")) 62 | pickle.dump(cols, open("cols.file", "wb")) 63 | 64 | pickle.dump(tracks, open("track_dict.file", "wb")) 65 | 66 | pickle.dump(valid_rows, open("valid_rows.file", "wb")) 67 | pickle.dump(valid_cols, open("valid_cols.file", "wb")) 68 | 69 | train_data = sparse.csr_matrix((np.ones_like(rows),(np.array(rows), np.array(cols))), dtype='float64', shape=(990000, number_of_songs)) 70 | pickle.dump(train_data, open("train_data.file", "wb")) 71 | 72 | val_data = sparse.csr_matrix((np.ones_like(valid_rows),(np.array(valid_rows), np.array(valid_cols))), dtype='float64', shape=(10000, number_of_songs)) 73 | pickle.dump(val_data, open("val_data.file", "wb")) 74 | 75 | -------------------------------------------------------------------------------- /Spotify/run_vae_on_test_playlists.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import math 5 | from keras.layers import Input, Dense, Lambda 6 | from keras.models import Model, load_model 7 | from keras import objectives 8 | from keras import backend as K 9 | 10 | 11 | # encoder/decoder network size 12 | batch_size=500 13 | original_dim=125000 # number of movies 14 | intermediate_dim=600 15 | latent_dim=200 16 | epsilon_std=1.0 17 | 18 | # encoder network 19 | x=Input(batch_shape=(batch_size,original_dim)) 20 | h=Dense(intermediate_dim, activation='tanh')(x) 21 | z_mean=Dense(latent_dim)(h) 22 | z_log_var=Dense(latent_dim)(h) 23 | 24 | 25 | # sampling from latent dimension for decoder/generative part of network 26 | def sampling(args): 27 | _mean,_log_var=args 28 | # does this mean we 
are modelling this is as a gaussian and not multinomial? 29 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 30 | return _mean+K.exp(_log_var/2)*epsilon 31 | 32 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 33 | 34 | # decoder network 35 | h_decoder=Dense(intermediate_dim, activation='tanh') 36 | x_bar=Dense(original_dim,activation='softmax') # this should be softmax right? 37 | h_decoded = h_decoder(z) 38 | x_decoded = x_bar(h_decoded) 39 | 40 | vae = Model(x, x_decoded) 41 | weightsPath = "./tmp/weights.hdf5" 42 | vae.load_weights(weightsPath) 43 | 44 | x_test_matrix = pickle.load( open( "test_data.file", "rb" ) ) 45 | print("number of playlists in test data", x_test_matrix.shape[0]) 46 | print("number of songs in test playlists", x_test_matrix.shape[1]) 47 | 48 | 49 | def nn_batch_generator(x, batch_size, samples_per_epoch): 50 | number_of_batches = samples_per_epoch/batch_size 51 | shuffle_index = np.arange(np.shape(x)[0]) 52 | counter=0 53 | while 1: 54 | index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)] 55 | x_batch = x[index_batch,:].todense() 56 | counter += 1 57 | yield (np.array(x_batch)) 58 | if (counter >= number_of_batches): 59 | counter=0 60 | 61 | 62 | x_test_reconstructed = vae.predict_generator(generator=nn_batch_generator(x_test_matrix, batch_size, 10000), val_samples=x_test_matrix.shape[0]) 63 | print(type(x_test_reconstructed)) 64 | print(len(x_test_reconstructed)) 65 | print(x_test_reconstructed[0]) 66 | pickle.dump(x_test_reconstructed, open("x_test_reconstructed.file", "wb"), protocol=4) 67 | -------------------------------------------------------------------------------- /Spotify/vae_cf_spotify.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from keras.layers import Input, Dense, Lambda 5 | from keras.models import Model 6 | from keras import objectives 7 | from keras import backend as K 8 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback, EarlyStopping 9 | 10 | # encoder/decoder network size 11 | batch_size=500 12 | original_dim = 125000 # number of filtered songs (songs appearing in less than 46 playlists) 13 | intermediate_dim=600 14 | latent_dim=200 15 | nb_epochs=50 16 | epsilon_std=1.0 17 | 18 | class LossHistory(Callback): 19 | def on_train_begin(self, logs={}): 20 | self.losses = [] 21 | self.val_losses = [] 22 | 23 | def on_epoch_end(self, epoch, logs={}): 24 | self.losses.append(logs.get('loss')) 25 | self.val_losses.append(logs.get('val_loss')) 26 | 27 | history = LossHistory() 28 | 29 | # encoder network 30 | x=Input(batch_shape=(batch_size,original_dim)) 31 | h=Dense(intermediate_dim, activation='tanh')(x) 32 | z_mean=Dense(latent_dim)(h) 33 | z_log_var=Dense(latent_dim)(h) 34 | 35 | 36 | # sampling from latent dimension for decoder/generative part of network 37 | def sampling(args): 38 | _mean,_log_var=args 39 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 40 | return _mean+K.exp(_log_var/2)*epsilon 41 | 42 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 43 | 44 | # decoder network 45 | h_decoder=Dense(intermediate_dim, activation='tanh') 46 | x_bar=Dense(original_dim,activation='softmax') # this should be softmax right? 
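# Descriptive note: the decoder maps the sampled latent code z back to a vector over all
# original_dim tracks; with the softmax activation above, each reconstructed playlist is a
# probability distribution across the 125,000 filtered tracks, from which top tracks are picked.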
47 | h_decoded = h_decoder(z) 48 | x_decoded = x_bar(h_decoded) 49 | 50 | # build and compile model 51 | vae = Model(x, x_decoded) 52 | def vae_loss(x,x_bar): 53 | reconst_loss=original_dim*objectives.binary_crossentropy(x, x_bar) 54 | kl_loss=-0.5*K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) 55 | return reconst_loss + kl_loss 56 | 57 | vae.compile(optimizer='adam', loss=vae_loss) 58 | 59 | x_train = pickle.load( open( "train_data.file", "rb" ) ) 60 | print("number of training playlists: ", x_train.shape[0]) 61 | print("number of songs after filtering: ", x_train.shape[1]) 62 | 63 | x_val = pickle.load( open( "val_data.file", "rb" ) ) 64 | print("number of validation playlists: ", x_val.shape[0]) 65 | print("number of songs in validation playlists: ", x_val.shape[1]) 66 | 67 | 68 | def nn_batch_generator(x, y, batch_size, samples_per_epoch): 69 | number_of_batches = samples_per_epoch/batch_size 70 | counter=0 71 | shuffle_index = np.arange(np.shape(y)[0]) 72 | np.random.shuffle(shuffle_index) 73 | x = x[shuffle_index, :] 74 | y = y[shuffle_index, :] 75 | while 1: 76 | index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)] 77 | x_batch = x[index_batch,:].todense() 78 | y_batch = y[index_batch,:].todense() 79 | counter += 1 80 | yield (np.array(x_batch),np.array(y_batch)) 81 | if (counter >= number_of_batches): 82 | counter=0 83 | 84 | 85 | weightsPath = "./tmp/weights.hdf5" 86 | checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1, save_best_only=True) 87 | earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min') 88 | 89 | # original size of training data = 9,90,000 90 | # sending only 1,00,000 playlists in each epoch and shuffling before every epoch so that each playlist is seen in the training 91 | 92 | vae.fit_generator(nn_batch_generator(x_train, x_train, batch_size, 100000), samples_per_epoch=100000, nb_epoch=nb_epochs, 93 | validation_data=nn_batch_generator(x_val, x_val, batch_size, 10000), nb_val_samples=10000, callbacks=[checkpointer, earlyStopping, history]) 94 | 95 | 96 | pickle.dump(history.losses, open('train_losses.file', 'wb')) 97 | pickle.dump(history.val_losses, open('val_losses.file', 'wb')) 98 | -------------------------------------------------------------------------------- /Standard/evaluate_model_approach_1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import math 5 | from keras.layers import Input, Dense, Lambda 6 | from keras.models import Model, load_model 7 | from keras import objectives 8 | from keras import backend as K 9 | 10 | 11 | # def measure_performance(x, x_bar): 12 | # return 13 | 14 | 15 | # encoder/decoder network size 16 | batch_size=500 17 | original_dim=26621 # number of movies 18 | intermediate_dim=600 19 | latent_dim=200 20 | epsilon_std=1.0 21 | 22 | # activation used is tanh 23 | # softmax activation is used at the final dense layer which produces x_reconstructed 24 | 25 | # encoder network 26 | x=Input(batch_shape=(batch_size,original_dim)) 27 | h=Dense(intermediate_dim, activation='tanh')(x) 28 | z_mean=Dense(latent_dim)(h) 29 | z_log_var=Dense(latent_dim)(h) 30 | 31 | 32 | # sampling from latent dimension for decoder/generative part of network 33 | def sampling(args): 34 | _mean,_log_var=args 35 | # does this mean we are modelling this is as a gaussian and not multinomial? 
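    # Descriptive note: this is the usual reparameterization trick -- epsilon is drawn from
    # N(0, I) and transformed as z = z_mean + exp(z_log_var / 2) * epsilon, so the sample
    # stays differentiable with respect to z_mean and z_log_var.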
36 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 37 | return _mean+K.exp(_log_var/2)*epsilon 38 | 39 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 40 | 41 | # decoder network 42 | h_decoder=Dense(intermediate_dim, activation='tanh') 43 | x_bar=Dense(original_dim,activation='softmax') # this should be softmax right? 44 | h_decoded = h_decoder(z) 45 | x_decoded = x_bar(h_decoded) 46 | 47 | vae = Model(x, x_decoded) 48 | weightsPath = "./hybrid/weights_org.hdf5" 49 | vae.load_weights(weightsPath) 50 | 51 | x_test_matrix = pickle.load( open( "test_data.file", "rb" ) ) 52 | x_test_matrix = x_test_matrix.todense() # 1s and 0s per user 53 | x_test = np.squeeze(np.asarray(x_test_matrix)) 54 | 55 | x_test_reconstructed = vae.predict(x_test, batch_size=batch_size) # float values per user 56 | 57 | 58 | # no concept of held out items in the test set, calculating overall 59 | def recallatk(x_test, x_test_reconstructed, k): 60 | recall_values = [] 61 | total_recall = 0.0 62 | for i in range(len(x_test)): 63 | top_rated_movies_idx = [i for i, x in enumerate(x_test[i].tolist()) if x == 1.0] 64 | 65 | if len(top_rated_movies_idx) == 0: 66 | #print("test user has no 1 rated movies: ", i) 67 | continue 68 | 69 | sorted_ratings = x_test_reconstructed[i].tolist() 70 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 71 | 72 | sum = 0.0 73 | for i in range(0, k): 74 | if top_predicted_movies_idx[i] in top_rated_movies_idx: 75 | sum+=1.0 76 | recall = sum/float(min(k, len(top_rated_movies_idx))) 77 | total_recall += recall 78 | recall_values.append(recall) 79 | return total_recall/float(len(recall_values)) 80 | 81 | def ndcgatk(x_test, x_test_reconstructed, k): 82 | ndcg_values = [] 83 | total_ndcg = 0.0 84 | best = 0.0 85 | for i in range(len(x_test)): 86 | top_rated_movies_idx = [i for i, x in enumerate(x_test[i].tolist()) if x == 1.0] 87 | 88 | if len(top_rated_movies_idx) == 0: 89 | #print("test user has no 1 rated movies: ", i) 90 | continue 91 | sorted_ratings = x_test_reconstructed[i].tolist() 92 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 93 | sum_ndcg = 0 94 | for i in range(0, k): 95 | if top_predicted_movies_idx[i] in top_rated_movies_idx: 96 | ndcg = 1/(math.log(i+2)) 97 | else: 98 | ndcg = 0 99 | sum_ndcg += ndcg 100 | 101 | total_ndcg += sum_ndcg 102 | ndcg_values.append(sum_ndcg) 103 | 104 | ndcg_values = np.array(ndcg_values) 105 | max_ndcg = ndcg_values.max() 106 | ndcg_values = ndcg_values / max_ndcg 107 | total_ndcg = np.sum(ndcg_values) 108 | 109 | return total_ndcg/float(len(ndcg_values)) 110 | 111 | print("NDCG at 100: ", ndcgatk(x_test, x_test_reconstructed, 100)) 112 | 113 | #recall at 20: 0.542023083825468 114 | print("recall at 20: ", recallatk(x_test, x_test_reconstructed, 20)) 115 | 116 | #recall at 50: 0.5759154842447732 117 | print("recall at 50: ", recallatk(x_test, x_test_reconstructed, 50)) 118 | -------------------------------------------------------------------------------- /Standard/evaluate_model_approach_2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | import math 5 | import random 6 | from keras.layers import Input, Dense, Lambda 7 | from keras.models import Model, load_model 8 | from keras import objectives 9 | from keras import backend as K 10 | 11 | 12 | batch_size=500 13 | original_dim=26621 14 | 
intermediate_dim=600 15 | latent_dim=200 16 | epsilon_std=1.0 17 | x_test_size = 10000 18 | 19 | x=Input(batch_shape=(batch_size,original_dim)) 20 | h=Dense(intermediate_dim, activation='tanh')(x) 21 | z_mean=Dense(latent_dim)(h) 22 | z_log_var=Dense(latent_dim)(h) 23 | 24 | def sampling(args): 25 | _mean,_log_var=args 26 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 27 | return _mean+K.exp(_log_var/2)*epsilon 28 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 29 | 30 | h_decoder=Dense(intermediate_dim, activation='tanh') 31 | x_bar=Dense(original_dim,activation='softmax') 32 | h_decoded = h_decoder(z) 33 | x_decoded = x_bar(h_decoded) 34 | 35 | vae = Model(x, x_decoded) 36 | weightsPath = "./weights.hdf5" 37 | vae.load_weights(weightsPath) 38 | 39 | x_test_matrix = pickle.load( open( "test_data.file", "rb" ) ) 40 | x_test_matrix = x_test_matrix.todense() 41 | x_test = np.squeeze(np.asarray(x_test_matrix)) 42 | 43 | 44 | x_test_new_list = [] 45 | x_test_fold_out_indices = [] 46 | 47 | 48 | for i in range(x_test_size): 49 | user_i_features = x_test[i] 50 | one_indices = np.argwhere(user_i_features > 0.0) 51 | number_of_one_indices = one_indices.shape[0] 52 | fold_out_number = int(0.2*number_of_one_indices) 53 | 54 | fold_out_indices = random.sample(one_indices.tolist(), fold_out_number) 55 | x_test_fold_out_indices.append(fold_out_indices) 56 | 57 | np.put(user_i_features, fold_out_indices, np.zeros(fold_out_number)) 58 | x_test_new_list.append(user_i_features) 59 | #print(i) 60 | 61 | x_test_reconstructed = vae.predict(np.asarray(x_test_new_list), batch_size=batch_size) 62 | 63 | 64 | def recallatk(x_test, x_test_fold_out_indices, x_test_reconstructed, k): 65 | recall_values = [] 66 | total_recall = 0.0 67 | for i in range(len(x_test)): 68 | if len(x_test_fold_out_indices[i]) == 0: # if this user hadn't rated any movie as 1 69 | continue 70 | 71 | item_list = [item for sublist in x_test_fold_out_indices[i] for item in sublist] 72 | 73 | sorted_ratings = x_test_reconstructed[i].tolist() 74 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 75 | 76 | sum = 0.0 77 | for j in range(0, k): 78 | if top_predicted_movies_idx[j] in item_list: 79 | sum+=1.0 80 | recall = sum/float(min(k, len(x_test_fold_out_indices[i]))) 81 | total_recall += recall 82 | recall_values.append(recall) 83 | return total_recall/float(len(recall_values)) 84 | 85 | def ndcgatk(x_test, x_test_fold_out_indices, x_test_reconstructed, k): 86 | ndcg_values = [] 87 | total_ndcg = 0.0 88 | best = 0.0 89 | for i in range(len(x_test)): 90 | if len(x_test_fold_out_indices[i]) == 0: 91 | continue 92 | 93 | sorted_ratings = x_test_reconstructed[i].tolist() 94 | top_predicted_movies_idx = sorted(range(len(sorted_ratings)), key=lambda i: sorted_ratings[i])[-k:] 95 | sum_ndcg = 0 96 | item_list = [item for sublist in x_test_fold_out_indices[i] for item in sublist] 97 | for j in range(0, k): 98 | if top_predicted_movies_idx[j] in item_list: 99 | ndcg = 1/(math.log(j+2)) 100 | else: 101 | ndcg = 0 102 | sum_ndcg += ndcg 103 | total_ndcg += sum_ndcg 104 | ndcg_values.append(sum_ndcg) 105 | 106 | ndcg_values = np.array(ndcg_values) 107 | max_ndcg = ndcg_values.max() 108 | ndcg_values = ndcg_values / max_ndcg 109 | total_ndcg = np.sum(ndcg_values) 110 | 111 | return total_ndcg/float(len(ndcg_values)) 112 | 113 | print("NDCG at 100: ", ndcgatk(x_test, x_test_fold_out_indices, x_test_reconstructed, 100)) 114 | 115 | 
print("recall at 20: ", recallatk(x_test, x_test_fold_out_indices, x_test_reconstructed, 20)) 116 | 117 | print("recall at 50: ", recallatk(x_test, x_test_fold_out_indices, x_test_reconstructed, 50)) -------------------------------------------------------------------------------- /Standard/plot_loss_graphs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "Using TensorFlow backend.\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | " #%matplotlib inline\n", 20 | "import numpy as np\n", 21 | "import pickle\n", 22 | "import os\n", 23 | "#from matplotlib import pyplot as plt\n", 24 | "from keras.layers import Input, Dense, Lambda\n", 25 | "from keras.models import Model\n", 26 | "from keras import objectives\n", 27 | "from keras import backend as K\n", 28 | "from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback\n", 29 | "#from IPython.display import clear_output" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "class PlotLosses(Callback):\n", 41 | " def on_train_begin(self, logs={}):\n", 42 | " self.i = 0\n", 43 | " self.x = []\n", 44 | " self.losses = []\n", 45 | " self.val_losses = []\n", 46 | " \n", 47 | " self.fig = plt.figure()\n", 48 | " \n", 49 | " self.logs = []\n", 50 | "\n", 51 | " def on_epoch_end(self, epoch, logs={}):\n", 52 | " \n", 53 | " self.logs.append(logs)\n", 54 | " self.x.append(self.i)\n", 55 | " self.losses.append(logs.get('loss'))\n", 56 | " self.val_losses.append(logs.get('val_loss'))\n", 57 | " self.i += 1\n", 58 | " \n", 59 | " clear_output(wait=True)\n", 60 | " plt.plot(self.x, self.losses, label=\"loss\")\n", 61 | " plt.plot(self.x, self.val_losses, label=\"val_loss\")\n", 62 | " plt.legend()\n", 63 | " plt.show();\n", 64 | " \n", 65 | "plot_losses = PlotLosses()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# encoder/decoder network size\n", 77 | "batch_size=500\n", 78 | "original_dim=26621 # number of movies\n", 79 | "intermediate_dim=600\n", 80 | "latent_dim=200\n", 81 | "nb_epochs=20\n", 82 | "epsilon_std=1.0" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "# encoder network\n", 94 | "x=Input(batch_shape=(batch_size,original_dim))\n", 95 | "h=Dense(intermediate_dim, activation='tanh')(x)\n", 96 | "z_mean=Dense(latent_dim)(h)\n", 97 | "z_log_var=Dense(latent_dim)(h)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "def sampling(args):\n", 109 | " _mean,_log_var=args\n", 110 | " epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std)\n", 111 | " return _mean+K.exp(_log_var/2)*epsilon\n", 112 | "\n", 113 | "z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "# decoder network\n", 125 
| "h_decoder=Dense(intermediate_dim, activation='tanh')\n", 126 | "x_bar=Dense(original_dim,activation='softmax') # this should be softmax right?\n", 127 | "h_decoded = h_decoder(z)\n", 128 | "x_decoded = x_bar(h_decoded)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 6, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "# build and compile model\n", 140 | "vae = Model(x, x_decoded)\n", 141 | "def vae_loss(x,x_bar):\n", 142 | " reconst_loss=original_dim*objectives.binary_crossentropy(x, x_bar)\n", 143 | " kl_loss=-0.5*K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)\n", 144 | " return reconst_loss + kl_loss\n", 145 | "\n", 146 | "vae.compile(optimizer='adam', loss=vae_loss)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "number of training users: 118493\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "x_train = pickle.load( open( \"train_data_0.file\", \"rb\" ) )\n", 166 | "#x_train = x_train[0:118000, :]\n", 167 | "print(\"number of training users: \", x_train.shape[0])\n", 168 | "\n", 169 | "x_val = pickle.load( open( \"val_data_0.file\", \"rb\" ) )\n", 170 | "x_val = x_val.todense()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 9, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "def nn_batch_generator(x, y, batch_size, samples_per_epoch):\n", 182 | " number_of_batches = samples_per_epoch/batch_size\n", 183 | " counter=0\n", 184 | " shuffle_index = np.arange(np.shape(y)[0])\n", 185 | " np.random.shuffle(shuffle_index)\n", 186 | " x = x[shuffle_index, :]\n", 187 | " y = y[shuffle_index, :]\n", 188 | " while 1:\n", 189 | " index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)]\n", 190 | " x_batch = x[index_batch,:].todense()\n", 191 | " y_batch = y[index_batch,:].todense()\n", 192 | " counter += 1\n", 193 | " yield (np.array(x_batch),np.array(y_batch))\n", 194 | " if (counter >= number_of_batches):\n", 195 | " counter=0" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 10, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "weightsPath = \"./tmp/weights.hdf5\"\n", 207 | "checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1, save_best_only=True)\n", 208 | "reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 11, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xl8XOV97/HPbzaN9s1abMsrNjbY\nxsYIwmKcsMSAQ3CTNJgUGuCm8W1KszQpzdbe0KTc5IY2dIfSNgmkpMFZaElMbCg4GJKwyMYrXrEt\nW7KtXbL2ZeZ3/3iOZMmLdnmso9/79ZrXOXM0y3Nk+TvP/M5zniOqijHGGP8KJLoBxhhjxpYFvTHG\n+JwFvTHG+JwFvTHG+JwFvTHG+JwFvTHG+JwFvTHG+JwFvTHG+JwFvTHG+Fwo0Q0AmDRpks6cOTPR\nzTDGmHFl8+bN1aqaN9DjLoignzlzJiUlJYluhjHGjCsiUjqYx1npxhhjfM6C3hhjfM6C3hhjfO6C\nqNEbYyamzs5OysrKaGtrS3RTLmjRaJSioiLC4fCwnm9Bb4xJmLKyMtLT05k5cyYikujmXJBUlZqa\nGsrKypg1a9awXsNKN8aYhGlrayM3N9dCvh8iQm5u7oi+9Qwq6EXksIjsEJGtIlLibXtIRMq9bVtF\nZGWvx39ZRA6IyF4RuWXYrTPG+J6F/MBG+jsaSunmBlWtPm3bo6r616c16FLgLmABMAX4HxG5WFVj\nI2rpWew90cjPtx3jE8tmkZ0aGe2XN8YYXxiL0s0q4Eeq2q6qh4ADwFVj8D4cqm7iHzce4HiDHcgx\nxgxPWlpaopsw5gYb9Aq8ICKbRWRNr+1/LCLbReS7IpLtbZsKHO31mDJvWx8iskZESkSkpKqqaliN\nz0x2vfj61o5hPd8YYyaCwQb9MlVdCtwGPCAiy4HHgIuAJcBx4G+G8saq+oSqFqtqcV7egFM1nFVW\nihtq1NDSOaznG2NMN1XlwQcfZOHChSxatIhnnnkGgOPHj7N8+XKWLFnCwoULefXVV4nFYtx33309\nj3300UcT3Pr+DapGr6rl3rJSRJ4FrlLVTd0/F5F/BX7h3S0HpvV6epG3bdR1B319qwW9MePdX/58\nF+8cOzmqr3nplAy+9sEFg3rsz372M7Zu3cq2bduorq7myiuvZPny5fzwhz/klltu4atf/SqxWIyW\nlha2bt1KeXk5O3fuBKC+vn5U2z3aBuzRi0iqiKR3rwMrgJ0iMrnXwz4E7PTWnwPuEpEkEZkFzAXe\nHN1mO1ndpRvr0RtjRui1117jYx/7GMFgkIKCAt773vfy1ltvceWVV/K9732Phx56iB07dpCens7s\n2bM5ePAgn/70p1m/fj0ZGRmJbn6/BtOjLwCe9Yb3hIAfqup6EfmBiCzB1e8PA/8bQFV3icha4B2g\nC3hgLEbcAETDASKhgNXojfGBwfa8z7fly5ezadMm1q1bx3333cfnP/95Pv7xj7Nt2zY2bNjA448/\nztq1a/nud7+b6Kae04BBr6oHgcVn2f77/TznYeDhkTVtYCJCVnLYavTGmBG7/vrr+Zd/+Rfuvfde\namtr2bRpE4888gilpaUUFRXxyU9+kvb2drZs2cLKlSuJRCJ85CMfYd68edxzzz2Jbn6/xv0UCFkp\nYSvdGGNG7EMf+hC//e1vWbx4MSLCt7/9bQoLC3nyySd55JFHCIfDpKWl8dRTT1FeXs79999PPB4H\n4Jvf/GaCW98/UdVEt4Hi4mId7oVHPvr4bwgFAvznmqtHuVXGmLG2e/duLrnkkkQ3Y1w42+9KRDar\navFAzx33c91kJkds1I0xxvRj3Ad9VkqYhhY7GGuMMecy/oM+OWw9emOM6cf4D/qUMC0dMdq7xmQE\npzHGjHvjPugzU9xJUw3WqzfGmLMa90GflWzz3RhjTH/Gf9DbfDfGGNOv8R/03nw31qM3xoy1/uau\nP3z4MAsXLjyPrRm8cR/0mcnWozfGmP6M+ykQMrtLNzaW3pjx7ZdfghM7Rvc1CxfBbd8654+/9KUv\nMW3aNB544AEAHnroIUKhEBs3bqSuro7Ozk7+6q/+ilWrVg3pbdva2vjUpz5FSUkJoVCI73znO9xw\nww3s2rWL+++/n46ODuLxOD/96U+ZMmUKd955J2VlZcRiMf7iL/6C1atXj2i3Tzfugz49KURAbNSN\nMWboVq9ezec+97meoF+7di0bNmzgM5/5DBkZGVRXV3P11Vdzxx13DOkC3f/0T/+EiLBjxw727NnD\nihUr2LdvH48//jif/exnufvuu+no6CAWi/H8888zZcoU1q1bB0BDQ8Oo7+e4D/pAQMhMtonNjBn3\n+ul5j5XLL7+cyspKjh07RlVVFdnZ2RQWFvInf/InbNq0iUAgQHl5ORUVFRQWFg76dV977TU+/elP\nAzB//nxmzJjBvn37uOaaa3j44YcpKyvjwx/+MHPnzmXRokV84Qtf4Itf/CK33347119//ajv57iv\n0QNkpdh8N8aY4fnoRz/KT37yE5555hlWr17N008/TVVVFZs3b2br1q0UFBTQ1tY2Ku/1e7/3ezz3\n3HMkJyezcuVKXn75ZS6++GK2bNnCokWL+PM//3O+/vWvj8p79Tbue/SA16O3Gr0xZuhWr17NJz/5\nSaqrq3nllVdYu3Yt+fn5hMNhNm7cSGlp6ZBf8/rrr+fpp5/mxhtvZN++fRw5coR58+Zx8OBBZs+e\nzWc+8xmOHDnC9u3bmT9/Pjk5Odxzzz1kZWXxb//2b6O+j74I+qyUMLXNFvTGmKFbsGABjY2NTJ06\nlcmTJ3P33XfzwQ9+kEWLFlFcXMz8+fOH/Jp/9Ed/xKc+9SkWLVpEKBTi+9//PklJSaxdu5Yf/OAH\nhMNhCgsL+cpXvsJbb73Fgw8+SCAQIBwO89hjj436Po77+egBPvejt3n7aD2vPHjDKLbKGDPWbD76\nwZvQ89EDdjDWGGP64YvSTWZKhJNtncTiSjAw+CFQxhgzVDt27OD3f7/vJbOTkpJ44403EtSigfki\n6LOSw6hCY1snWd5slsaY8UFVhzRGPdEWLVrE1q1bz+t7jrTE7ovSTc/EZla+MWZciUaj1NTUjDjI\n/ExVqampIRqNDvs1BtWjF5HDQCMQA7p6F/9F5AvAXwN5qlot7qP574CVQAtwn6puGXYLB8FmsDRm\nfCoqKqKsrIyqqqpEN+WCFo1GKSoqGvbzh1K6uUFVq3tvEJFpwArgSK/NtwFzvdt7gMe85ZjJ9Gaw\ntLH0xowv4XCYWbNmJboZvjfS0s2jwJ8Bvb93rQKeUud1IEtEJo/wffrV3aO3+W6MMeZMgw16BV4Q\nkc0isgZARFYB5aq67bTHTgWO9rpf5m3rQ0TWiEiJiJSM9Gtbz1WmLOiNMeYMgy3dLFPVchHJB14U\nkT3AV3Blm2FR1SeAJ8CdMDXc1wHISLaDscYYcy6D6tGr
arm3rASeBd4LzAK2eQdqi4AtIlIIlAPT\nej29yNs2ZsLBAGlJIQt6Y4w5iwGDXkRSRSS9ex3Xi39LVfNVdaaqzsSVZ5aq6gngOeDj4lwNNKjq\n8bHbBSczOUx9qx2MNcaY0w2mdFMAPOud0BACfqiq6/t5/PO4oZUHcMMr7x9pIwcjKyVs1401xpiz\nGDDoVfUgsHiAx8zsta7AAyNu2RBlpYRtHL0xxpyFL86MBchKjtg4emOMOQvfBH1mSpiG1q5EN8MY\nYy44vgn6rOQwDa0dNmeGMcacxjdBn5kcpjOmtHTEEt0UY4y5oPgm6G1iM2OMOTvfBL1NbGaMMWfn\nm6DvmdjMxtIbY0wfvgt6K90YY0xf/gn6ntKNBb0xxvTmn6C3OemNMeasfBP00XCQpFDAJjYzxpjT\n+CbowY2lt4OxxhjTl6+CPislbDV6Y4w5jb+CPjlipRtjjDmNr4I+03r0xhhzBl8FvZvYzILeGGN6\n81fQW4/eGGPO4LOgj9DaGaO9y2awNMaYbr4K+sxkO2nKGGNO58+gt/KNMcb08FXQ28RmxhhzJn8F\nvU1sZowxZxhU0IvIYRHZISJbRaTE2/YNEdnubXtBRKZ420VE/l5EDng/XzqWO9BbT4/eLj5ijDE9\nhtKjv0FVl6hqsXf/EVW9TFWXAL8A/o+3/TZgrndbAzw2aq0dQKbNYGmMMWcYdulGVU/2upsKqLe+\nCnhKndeBLBGZPII2Dlp6UohgQKx0Y4wxvQw26BV4QUQ2i8ia7o0i8rCIHAXu5lSPfipwtNdzy7xt\nfYjIGhEpEZGSqqqq4bX+zNd0M1haj94YY3oMNuiXqepSXFnmARFZDqCqX1XVacDTwB8P5Y1V9QlV\nLVbV4ry8vCE1uj9ZyWEbdWOMMb0MKuhVtdxbVgLPAled9pCngY946+XAtF4/K/K2nRduYjM7GGuM\nMd0GDHoRSRWR9O51YAWwU0Tm9nrYKmCPt/4c8HFv9M3VQIOqHh/ldp+TlW6MMaav0CAeUwA8KyLd\nj/+hqq4XkZ+KyDwgDpQCf+g9/nlgJXAAaAHuH/VW9yMrOczBqubz+ZbGGHNBGzDoVfUgsPgs2z9y\nloejqgo8MPKmDU9WSsRKN8YY04uvzowFV7o52dZFLK4DP9gYYyYA3wV999mxjW1WpzfGGPBx0NtJ\nU8YY4/gv6LsnNrORN8YYA/gw6DNtYjNjjOnDf0FvV5kyxpg+fBf0WclWozfGmN58F/SZFvTGGNOH\n74I+FAyQnhSivtVq9MYYAz4MenAHZK1Gb4wxji+DPislTIOVbowxBhjvQb9vAzy6CJoq+2zOSo7Y\nOHpjjPGM76BPyYWGI1D66z6bbU56Y4w5ZXwH/eTFEE6Fw6cFvc1Jb4wxPcZ30AfDMP09UPqbPpuz\nksPUt3TiZkw2xpiJbXwHPcCMa6FyF7TU9mzKSgnTFVeaO2IJbJgxxlwYfBD0y9yyV6++Z2Izq9Mb\nY4wPgn7qUghF+wR998RmVqc3xhg/BH0oCYquhNLXejZ1z3djY+mNMcYPQQ8w4zo4sQPaGgB33Viw\nOemNMQb8EvQzrwONw5E3ACjMiCICe080JrhhxhiTeIMKehE5LCI7RGSriJR42x4RkT0isl1EnhWR\nrF6P/7KIHBCRvSJyy1g1vsfUYgiEe8o3mSlhLp+Wxca9lQM80Rhj/G8oPfobVHWJqhZ7918EFqrq\nZcA+4MsAInIpcBewALgV+GcRCY5im88USYGpV/Q5ceqmSwrYXtZAxcm2MX1rY4y50A27dKOqL6hq\nl3f3daDIW18F/EhV21X1EHAAuGpkzRyEmdfB8a3Q3gTATZfkA7Bxj/XqjTET22CDXoEXRGSziKw5\ny8//F/BLb30qcLTXz8q8bWNrxrUQ74KyNwGYV5DO1KxkXrKgN8ZMcIMN+mWquhS4DXhARJZ3/0BE\nvgp0AU8P5Y1FZI2IlIhISVVV1VCeenbT3gMS7CnfiAg3XZLPa/uraeu0M2SNMRPXoIJeVcu9ZSXw\nLF4pRkTuA24H7tZTE8uUA9N6Pb3I23b6az6hqsWqWpyXlzfsHeiRlA5TlvQ5cerG+fm0dsb47cGa\nkb++McaMUwMGvYikikh69zqwAtgpIrcCfwbcoaotvZ7yHHCXiCSJyCxgLvDm6Df9LGZcC+Ul0NkK\nwNWzc0mJBHlpd8V5eXtjjLkQDaZHXwC8JiLbcIG9TlXXA/8IpAMvesMuHwdQ1V3AWuAdYD3wgKqe\nn9rJjGUQ64CyEgCi4SDL5kzi5d2VNpOlMWbCCg30AFU9CCw+y/Y5/TznYeDhkTVtGKZfDYgr38y6\nHnCjb154p4I9Jxq5ZHLGeW+SMcYkmj/OjO2WnAWFi/rMe3PDfDfM0so3xpiJyl9BD27em6NvQZeb\nojg/PcriokwbZmmMmbD8F/Qzr4OuVjj2ds+mmy4pYOvReqqb2hPYMGOMSQz/Bf30a92yV/nmxvn5\nqNpZssaYicl/QZ+aC3mX9Jn3ZsGUDAozory024LeGDPx+C/owZVvjr4BMTcVj4hw4yX5vLq/ivYu\nO0vWGDOx+DPoZ1wHHU1wYlvPppvm59PcEePNQ7X9PNEYY/zHn0E/cxkgsG9Dz6br5kwiGg5Y+cYY\nM+H4M+jT8mH2e2H7WvDOiI2Gg1x30SRe2lNhZ8kaYyYUfwY9wGWroe5Qz3QI4IZZHq1tZX9lUwIb\nZowx55d/g37+7RBKhu0/6tl0o3eW7P/YWbLGmAnEv0EfzYD5K2Hnz3rOki3MjLJoaiYvvmNBb4yZ\nOPwb9ODKN6218O5LPZtuWVDA20fq7VqyxpgJw99Bf9GNkJIL25/p2XTLgkIAXth1IlGtMsaY88rf\nQR8Mw8KPwN5fQlsDAHPy05g9KZUNu6x8Y4yZGPwd9ODKN11tsPvngDtL9paFhbx+sIb6lo4EN84Y\nY8ae/4N+6hWQM/uM8k1XXO3kKWPMhOD/oBdxvfpDr0KDu0b5ZVMzKcyIssHq9MaYCcD/QQ+w6KOA\nwo4fAxAICCsWFLBpfxWtHTbJmTHG3yZG0OdeBEVXuikRPLcuKKStM84r+6oS2DBjjBl7EyPowZVv\nKnfBiZ0AXDUrh6yUsJVvjDG+N3GCfsGHIBDqOSgbCga4aX4BL+2uoDMWT3DjjDFm7Awq6EXksIjs\nEJGtIlLibfuoiOwSkbiIFJ/2+C+LyAER2Ssit4xFw4csdRLMuRl2/ATiri5/y4ICTrZ18frBmgQ3\nzhhjxs5QevQ3qOoSVe0O9Z3Ah4FNvR8kIpcCdwELgFuBfxaR4Gg0dsQuuxMaj8Fhdz3Z5RfnkRwO\nsn6nlW+MMf417NKNqu5W1b1n+dEq4Eeq2q6qh4ADwFXDfZ9RNW8lRNJ7Rt9Ew0HeNy+PF9+pIB63\nOeqNMf402KBX4AUR2SwiawZ47FTgaK/7Zd62xAsnw9z3w771vco3hVQ2tvP20foEN84YY8bGYIN+\nmaouBW4DHhC
R5SN9YxFZIyIlIlJSVXUehzjO/wA0V/VckOSG+fmEAmKTnBljfGtQQa+q5d6yEniW\n/ksx5cC0XveLvG2nv+YTqlqsqsV5eXmDb/FIzbnZjb7Zuw6AzOQw186ZxPpdJ+wSg8YYXxow6EUk\nVUTSu9eBFbgDsefyHHCXiCSJyCxgLvDmaDR2VCRnuYuH73m+Z9MtCwoorWlhb0VjAhtmjDFjYzA9\n+gLgNRHZhgvsdaq6XkQ+JCJlwDXAOhHZAKCqu4C1wDvAeuABVb2w5hmY9wGo2Q/V+wF4/6UFiMCG\nnTZ1sTHGfwYMelU9qKqLvdsCVX3Y2/6sqhapapKqFqjqLb2e87CqXqSq81T1l2O5A8My7za33OPK\nN/npUa6Yns26HcesfGOM8Z2Jc2Zsb1nToPAy2HuqfPOhpVPZV9HEtrKGBDbMGGNG38QMenCjb46+\nCU1uTvoPLp5CNBzgmbeODvBEY4wZXyZu0M9bCagbUw9kRMOsXDSZn287RktHV2LbZowxo2jiBn3h\nIsic3mf0zeriaTS1d/H8DhtTb4zxj4kb9CLuoOzBjdDRDLipi2dNSmWtlW+MMT4ycYMeYP5Kd+Hw\ndzcC7sLhdxZP483Dtbxb1ZTgxhljzOiY2EE/4zqIZvYZffORK6YSDAhrS6xXb4zxh4kd9MEwzF3R\nZ5Kz/PQoN8zL56eby+2CJMYYX5jYQQ9u9E1LDRx9o2fT6iunUd3UzsY9lQlsmDHGjA4L+jk3QyDc\nc5YswA3z8shPT7LyjTHGFyzooxkwa7mr03vTH4SCAT5yRREv76mk4mRbghtojDEjY0EPbvRN7UGo\nOnXBrDuLpxFX+MnmsgQ2zBhjRs6CHryzZOmZox5g1qRUrpqVw49LjtpEZ8aYcc2CHiBjCkxZCtt/\n3DP6BuCuK6dxuKaFNw7VJrBxxhgzMhb03a79Y6jaDZu/37PptoWTSU8K2ZmyxphxzYK+24IPw8zr\n4eVvQIvrwSdHgtyxZAq/2HGc/Xb1KWPMOGVB300Ebvs2tJ10Ye/5zE1zyYiG+MP/2ExTu81qaYwZ\nfyzoeyu4FK76JJR8D45vc5syovzDx5ZyqLqZL/50ux2YNcaMOxb0p3vflyElF55/sGdc/TUX5fKn\nt8xj3fbjfP83hxPbPmOMGSIL+tMlZ8HND7kpEbY/07P5D5dfxM2XFPDwut1sLq1LWPOMMWaoLOjP\nZsndMPUKePH/uJo9EAgIf3PnYqZkJfPA01uobmpPcCONMWZwLOjPJhCA2x6BpgrY9O2ezZnJYf75\n7qXUtnTw2R+9TSxu9XpjzIVvUEEvIodFZIeIbBWREm9bjoi8KCL7vWW2t11E5O9F5ICIbBeRpWO5\nA2Om6Aq4/B54/bE+UyMsnJrJN1Yt4NcHavjb/9mXwAYaY8zghIbw2BtUtbrX/S8BL6nqt0TkS979\nLwK3AXO923uAx7zl+HPTQ/DOz+Gnn4BZ7wUJgARYLQFyp1Xz21fi/GvwU/zBjQsQkUS31hhjzmoo\nQX+6VcD7vPUngV/hgn4V8JS6cYivi0iWiExW1eMjaWhCpOXBykdg/RfdkEuNe7cYN2mcm8Nx1v1q\nH58+8U2+/dElpERG8us0xpixMdhkUuAFEVHgX1T1CaCgV3ifAAq89alA7zkDyrxt4y/oARavdrfT\nCKC//ns+8OJfcHj33/Phf/4Dnvj9Yqbnppz/NhpjTD8GG/TLVLVcRPKBF0VkT+8fqqp6HwKDJiJr\ngDUA06dPH8pTLxhy7aehZj8PbHmK4/VFfPAf2/iHj13O8ovzEt00Y4zpMaiDsapa7i0rgWeBq4AK\nEZkM4C27r7tXDkzr9fQib9vpr/mEqharanFe3jgNRhH4wHdg1nK+EXiC96ce4L7vvcnjr7xrZ9Aa\nYy4YAwa9iKSKSHr3OrAC2Ak8B9zrPexe4L+99eeAj3ujb64GGsZlfX6wgmG48ykkeyaPxL7Nx+fF\n+dYv9/CJJ0uotKtTGWMuAIPp0RcAr4nINuBNYJ2qrge+BbxfRPYDN3v3AZ4HDgIHgH8F/mjUW32h\nSc6G33sGAb7W+BD/99Yifn2gmhV/u4lfbD+W6NYZYyY4uRBKDMXFxVpSUpLoZozc4V/DU6tgxjUc\nWPEkX/jZbrYdreeDi6fwjVULyEqJJLqFxhgfEZHNqlo80OPszNjRNPM6uOMf4NAm5vzwOp6d/yu+\ndn0Gv9xxnBWPbmLj3sqBX8MYY0aZBf1oW/IxuPsnULCAwKuPcH/JHWy56F+5Lfw2f/C91/n8M1s5\nVt+a6FYaYyYQK92MpbpSePsHsOUH0HSCxkgez7UtYbfO5OLF1/I7t76fjLT0RLfSGDNODbZ0Y0F/\nPsQ6Yd8G2PIU8dLfEOhwlyXsIkBj6iwyZ19BYOYyWPRRiNgJV8aYwbGgv1DF41BfSumu19n8xiYy\nG3ZzWaiUPK1Fk7ORK+6Hq9ZAxuREt9QYMxKdbVBzAKr2QH0p5M2H6ddASs6ovYUF/TigqmzcW8k3\n1+0ms3ozfxhZz43yFgRCsOBDBK55AKYsSXQzjZk4Oluh8TicPO6WXe3uXJlAEAJhbz3sHtvV6sK8\nswW62txz2+qhap8L97pDbm6sPgQKFsCM69zgjRnXQeqkYTfXgn4cicWVNw7V8F9vl7Njx1bujK3j\nztArpNJGa86lRNMyEbzZMbtnyQyEoHCR6yFMv3pEfyzGjIr2Rqg/4m6tdRBOhnAqRFJdSTLsLYNJ\nEIq4ZTDirv8wWlQhHoPgALO7xDrh2FYofQ2OvO7afPKYC+qRCIQgdw7kzXM9+O5lZhGc2Amlv4bD\nr8HRN90HBcCyz8PNXxvW21nQj1NtnTE27qlkw+a9FL77Y5bxNslhISc1Qm5KhPRoyE2J3NkCJ3ZA\nrMM9MXeOC/xpV0PuRZBWAGn5EEk79eFgzEi1NUD1fneNhup9UHuwV7jXDu81A2EIRSE1F9InQ3oh\npE/xlpPdB0F7E3Q0Q0eTu7U3QftJaK13Hypt3rK13vWiM6ZA9sy+t5RcOLbFne9y9E3obHbvnzsX\nJl3syqXp3q17PZzsPjhinRDv9JZd7nmhKIRTIByFUPKp5WA+uLo64PhWF/pTLoeLbhjWr86C3gca\nWjrZ8M4JXth1gk37q+noipObGmHFggJuWVDItTPSiVRuhyO/db2So6+7P/bewimQmueCP/cid4nE\nqUuhYJHrVRlzLvE4lL0Fe34Bx7e5YG/sNZtJIOyF6AzImu7dZrhbSo4rZXS2eOHc4q03u85JVzvE\n2l3gxTpc6aO5ChpPuJ5143H3+LMJJrlvCUnp7qz05Gx3refudQlCw1GoO+xujafNwJK/wCubXOtK\nJ2n5Y/QLHHsW9D7T1N7Fr/ZWsmFXBS/vrqC5I0ZGNMT7Ly1k5aJCls2dRFJAoPZd90feVOkuhdhU\n6d1OQOUeaPZO2gpGoPAyKCp2NcPUPHdLyXXLSGrivwl0dbgeZFu9W7bW
u9DIvxQmzU18+/woHnMd\nh3f+G3b/3IVkMOLKhJPmud973jy3nj1z4BLJcKm6UlDjcbceSYWkNPcNNRge2mt1trpvHE0VULBw\nVA+GJpoFvY+1dcb49YFqnt9xghfeOUFjWxfpSSFuvrSA2xYWcv3cPJIjwTOfqAoNZVBeAuWboXwL\nHHv77D2nUNT7JpDvlYEKTpWDUvPc19jevbSOZvcfKi0P8i6B/PmQOX1wX2PjMVcOOPa2d9sCFe+c\n+mp9NulTYNZymP1ed/WvzKln7mtHk/uGE4y4tg/mgyHW6T4o2xu9/erukTa7fY3Hut/gtN9XEiRl\nQDQLohkQzXT3w1H3nHjMfeVXbykBSM5xvdKx+MCKexfJ6S+I4zEXgDXvQs1+qNgF+9a7nnUoCnNu\nhkt/By5e4fbHXHAs6CeIjq44v363mue3H+eFdypoaO0kEgxw5axsls/N4/q5ecwvTCcQOEeYxLrg\nZBk010BLNTRXe8sqt979raDxhNt+LhJw9cne4RxOhbyLXQ88JdeFaKzd+6re4dabquDEdhem3c+Z\nvNj1IFPz3FfyaKYXoJkuOMu3wKFX4NAmaKlxz8ud44Kzte5Uzba7lgruuT0Hxy5xy9RJ3vC3vW6U\nRNU+dz/eObJ/lKEIhN3vJiXX9TRTct0HRVKvD4tohvtA6Gr3atH1p77ptNa7D6XTb965GoRTTr1W\n9wdQIORKGrUHTx3j6f4dXXRI640mAAANz0lEQVQjXLoK5rzf9aDNBc2CfgLqjMV5/WANr+yt4tX9\n1eytcP/ZJ6Ulcf3cSVxzUS5XzcxhRm7K8K5xG+s89QEQjLgRFJE0FyahJNczba13wVn5jgvPyt1u\n2dbgjbIIu8cGw+41olluCOmUpe6g1KS5bijbYMTjULkLDr4Ch191dd7kbPeaPbXbbNcTr9rjtWv3\nmQcNJeDKEN0fBLlz3GtEUt3+RVLceji1b9u6f4eqXgg3uAOEbSddCLefdMPvAiFveF7QrUvQ9bZb\na90HVUsNtPRab2twr9HVz1QZoeipD7/uD4akdBfO3esSdG1oP3nqNdsaXLhnz3THbHLnuv3NneM+\n+KwcNq5Y0BsqTrbx6v5qXt1fxWv7q6lpdr23/PQkrpyZw1WzcrhyZg7zCtMJnqvH70fN1S7wW2pO\nhVw4muhWnSnW6cK5vcH10kPRU99uLsT2mvPOgt70EY8rB6qaePNQLW8druWtQ7Uca3AXRkmPhrhi\nRjZXzsyheEY2i6dlEQ0PsldtjEmYwQb9GB0yNxeaQEC4uCCdiwvSuefqGQCU1bV4wV9HyeFafrV3\nLwDhoLBwaiZXTM9mTn4asyalMisvlby0pOGVfIwxCWVBP4EVZadQlJ3Ch5cWAVDX3MHm0jpKSl3w\nP/V6KR1dp07hTksKudCflMolkzNYODWDBVMyyUm18fjGXMisdGPOKRZXjtW3cqi6ued2sLqZdyub\nKO81p/6UzCgLpmaycEomi6dlsnRGNhnRIY51NsYMmZVuzIgFA8K0nBSm5aSw/OK8Pj+rb+ngnWMn\n2XmsgZ3lbvk/uytQdQM35hWkUzzT1f2vmJHN1KxkK/sYkyDWozejpqm9i61H6ikprWVzaR1bSuto\n7nAnGE1Ki/QcI3C3NOYWpJOZbD1/Y4bLevTmvEtLCrFs7iSWzXUzaXbF4uw50cjm0jp2HWtgb0UT\nPy452hP+AAUZSV7dP43ZXv1/Vl4q07JTiITsSpfGjAYLejNmQsEAC6dmsnDqqdPn43GlvL6V/ZWN\n7KtoYn9FE4drmtmw6wS1zafO0gwITMlKZkZuCtNzUpmRm8KMnBSm57pSkh0DMGbwBh30IhIESoBy\nVb1dRG4E/hqIAJuBT6hql7hC7N8BK4EW4D5V3TL6TTfjUaBX3f/G+QV9flbf0tHnwG9pTQultS2s\n33mcupa+0xJkREPeqKHknuXkzCh56UnkpScxKS2J1CTrxxgDQ+vRfxbYDWSISAB4ErhJVfeJyNeB\ne4F/B24D5nq39wCPeUtj+pWVEuHy6REun559xs9OtnVypKaFI7UtlNW1UFbXSlldK4drmnntQDUt\nvcpB3VIiwZ7Qz02NkJsWITc1idy0CDmpEQoyoszJT2NSWtL52D1jEmZQQS8iRcAHgIeBzwO5QIeq\n7vMe8iLwZVzQrwKeUneU93URyRKRyap6/CwvbcygZETDZ5SBuqkqdS2dVJxso6qx3d2a2k+tN7ZT\nWtPCliP11Da3Ez9t/EFOaoQ5+WlcXJDGxQXpzMlLoyg7hYLMJJJCdoawGf8G26P/W+DPgHTvfjUQ\nEpFiVS0BfheY5v1sKnC013PLvG0W9GZMiLgrcOWkRrhkgGuqx+NKQ2snNc3tHKtvY39lE/srGtlf\n2cR/bz1GY1tXn8dPSktiSlaUKZnJTM6K9nxDmJQWcd8UvG8LNmWEuZANGPQicjtQqaqbReR9AKqq\nInIX8KiIJAEvAGd+d+7/ddcAawCmT58+1HYbMyyBgJCdGiE7NcKc/PQ+5weoKhUn23m3yp0Qdry+\njWP1rRxraOVAVROv7q/qM2Kot4xoiMLMKAUZUQozohRmutuUzGSm5bhjCPZhYBJlMD3664A7RGQl\nEMXV6P9DVe8BrgcQkRXAxd7jyznVuwco8rb1oapPAE+AG0c/7D0wZpSISE9An0tLRxc1TR1UN7VT\n3dRBTVM71U3tVJxs58TJNipOtrH3RCNVTe2cfopKQUYS03NSmOYdPM7xPnCyUyI96zkpkbNfNMaY\nERgw6FX1y7j6O16P/k9V9R4RyVfVSq9H/0Vc/R7gOeCPReRHuIOwDVafN36REgmRkhNiWk5Kv4/r\nisWpamqnvK6Vo3UtHK1t5UitO5j8+sEajp9sO+ODoFt+uju3YHZeKrMnpfWsT89JIRS0cwvM0I1k\n/NmDXlknADymqi9725/HDa08gBteef/ImmjM+BMKBpicmczkzGSKZ555jdKYd6ygtrmDupYOaps7\nqG/poLqpg8PenEIbdlVQ23zqcFc4KMyelMbcgjTm5nefXZzGtJwUO2hs+mVTIBhzAatv6eBgdTMH\nq5o50OvA8dG6lj7fCDKTw+SnJ5GfkUReWhL5GVEmpUXISnHloOzUcM96RnJ4VC4009EVRwTC9i0j\nYWwKBGN8ICslwtLpEZaedm5Ba0eMd6ua2FfRSHldK5WN7VQ2uuGlJaV1VDa295liujcRSE8KkZEc\nJiMaJjM5TEZyiPRomLgqnTGlsytOZyxORyxOR1ects4YzR0xWtq73LKji86Y+6RJSwqRmRwmK8W7\nJUfITAmTlRwmO8WtZ6dEyPKWhZlR0uxktvPKftvGjEPJkeA5zysAN4Koqb2L+pZT5aHu9fqWDk62\ndXGytZOTbZ2cbO3icHULjW2dBAJCJBggHAwQDolbBgNkpUSYmh0kJRIiNRIkJcktY3FoaO2kvrWD\nhpZO6ls72dNwknpvPXb6SQu
ejGiIKVnubObJWclMyYxSmOnuF2a6kUt2ZvPosd+kMT4kIqRHw6RH\nwwMeOB4rvT9sXPC7YxHHG9o4Xt/KsQY3fHVbWUOfeY66pUdDPdNa5KR6ZzenRshJi3hnOp86pyEt\nKWTTYPfDgt4YMyb6ftj0/9i2zhgVJ9s43tDGiYY2Tpx0y2P1rVQ3tbOjrp6a5o4zTmjrlhQKuNBP\nTyIv7cwhq24Ya5hoOEg0HCApFCQp5C3DAZJCAV9/UFjQG2MSLhoOMiM3lRm5qf0+rr0rRl1zJ9VN\n7dQ0d1Dd2O6d0+DOa6huaqe8vo2d5Sepbe6gI3b24xSnSw67eZHyvUnx8tLdQe3kSJBYXImpEo8r\ncXUjpmJx7TmG0RmL09nl7nfFlVBACAVPlb1CASESCpCXnuSNxIoyOSvKpNQkAqNwUHwwLOiNMeNG\nUihIYWaw35PauqkqLR2xnmMUdS2dtHXGaO9yB5jbu2K0d8Zp7YxR19zRMz/SgcomfvNuDQ2tned8\n7YA32igSDBAOBQh7wR4MCF0xpSsepyvmwr8zpnTE4mccrwgHhYKMKPdeM5NPLp894t9NfyzojTG+\nJCKkJoVITRr4BLezae+K0dEVJyBCMCC9lgy5zKOq1DR39JSjTnhlquP1reRnjP3sqRb0xhhzFq6O\nPzonoomId+A46ZwjpcaSnelgjDE+Z0FvjDE+Z0FvjDE+Z0FvjDE+Z0FvjDE+Z0FvjDE+Z0FvjDE+\nZ0FvjDE+d0FceEREqoDSYT59ElA9is0ZTybqvtt+Tyy23+c2Q1XzBnjMhRH0IyEiJYO5woofTdR9\nt/2eWGy/R85KN8YY43MW9MYY43N+CPonEt2ABJqo+277PbHYfo/QuK/RG2OM6Z8fevTGGGP6Ma6D\nXkRuFZG9InJARL6U6PaMFRH5rohUisjOXttyRORFEdnvLbMT2caxICLTRGSjiLwjIrtE5LPedl/v\nu4hEReRNEdnm7fdfettnicgb3t/7MyISSXRbx4KIBEXkbRH5hXff9/stIodFZIeIbBWREm/bqP2d\nj9ugF5Eg8E/AbcClwMdE5NLEtmrMfB+49bRtXwJeUtW5wEvefb/pAr6gqpcCVwMPeP/Gft/3duBG\nVV0MLAFuFZGrgf8HPKqqc4A64BMJbONY+iywu9f9ibLfN6jqkl5DKkft73zcBj1wFXBAVQ+qagfw\nI2BVgts0JlR1E1B72uZVwJPe+pPA75zXRp0HqnpcVbd46424//xT8fm+q9Pk3Q17NwVuBH7ibffd\nfgOISBHwAeDfvPvCBNjvcxi1v/PxHPRTgaO97pd52yaKAlU97q2fAAoS2ZixJiIzgcuBN5gA++6V\nL7YClcCLwLtAvap2eQ/x69/73wJ/BsS9+7lMjP1W4AUR2Swia7xto/Z3bteM9QFVVRHx7fApEUkD\nfgp8TlVP9r4ws1/3XVVjwBIRyQKeBeYnuEljTkRuBypVdbOIvC/R7TnPlqlquYjkAy+KyJ7ePxzp\n3/l47tGXA9N63S/ytk0UFSIyGcBbVia4PWNCRMK4kH9aVX/mbZ4Q+w6gqvXARuAaIEtEujtnfvx7\nvw64Q0QO40qxNwJ/h//3G1Ut95aVuA/2qxjFv/PxHPRvAXO9I/IR4C7guQS36Xx6DrjXW78X+O8E\ntmVMePXZfwd2q+p3ev3I1/suInleTx4RSQbejzs+sRH4Xe9hvttvVf2yqhap6kzc/+eXVfVufL7f\nIpIqIund68AKYCej+Hc+rk+YEpGVuJpeEPiuqj6c4CaNCRH5T+B9uNnsKoCvAf8FrAWm42b+vFNV\nTz9gO66JyDLgVWAHp2q2X8HV6X277yJyGe7gWxDXGVurql8Xkdm4nm4O8DZwj6q2J66lY8cr3fyp\nqt7u9/329u9Z724I+KGqPiwiuYzS3/m4DnpjjDEDG8+lG2OMMYNgQW+MMT5nQW+MMT5nQW+MMT5n\nQW+MMT5nQW+MMT5nQW+MMT5nQW+MMT73/wGB9R4eTS3JLgAAAABJRU5ErkJggg==\n", 221 | "text/plain": [ 222 | "
" 223 | ] 224 | }, 225 | "metadata": {}, 226 | "output_type": "display_data" 227 | }, 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "118000/118000 [==============================] - 88s - loss: 486.4370 - val_loss: 496.1855\n" 233 | ] 234 | }, 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "" 239 | ] 240 | }, 241 | "execution_count": 11, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "vae.fit_generator(nn_batch_generator(x_train, x_train, batch_size, 118000), samples_per_epoch=118000, nb_epoch=nb_epochs, validation_data=(x_val, x_val), callbacks=[checkpointer, reduce_lr, plot_losses])" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 3", 263 | "language": "python", 264 | "name": "python3" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 3 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython3", 276 | "version": "3.5.1" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 0 281 | } 282 | -------------------------------------------------------------------------------- /Standard/read_data.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import pandas as pd 3 | import numpy as np 4 | import random 5 | import pickle 6 | from scipy import sparse 7 | DATA_DIR = '/home/kg2719/project/data-random-fold-1' 8 | #DATA_DIR = '/home/kg2719/project/kilol' 9 | raw_data = pd.read_csv(os.path.join(DATA_DIR, 'new_ratings.csv'), header=0) 10 | 11 | unique_user_ids = raw_data.userId.unique() 12 | 13 | np.random.seed(1) 14 | np.random.shuffle(unique_user_ids) # to ensure random splitting between train, val and test 15 | unique_movie_ids = raw_data.movieId.unique() 16 | 17 | number_of_users = len(unique_user_ids) 18 | print("total number of users: ", number_of_users) 19 | 20 | number_of_movies = len(unique_movie_ids) 21 | print("total number of movies: ", number_of_movies) 22 | 23 | # split users into training, validation and test 24 | val_user_ids = [] 25 | test_user_ids = [] 26 | train_user_ids = [] 27 | 28 | for i in range(10000): 29 | val_user_ids.append(unique_user_ids[i]) # first 10k after shuffling keys 30 | 31 | for i in range(10000, 20000): 32 | test_user_ids.append(unique_user_ids[i]) # next 10k after shuffling keys 33 | 34 | for i in range(20000, number_of_users): # all the remaining form training data 35 | train_user_ids.append(unique_user_ids[i]) 36 | 37 | 38 | # creating a movieId and userId to index dictionary for creating train_data ndarray 39 | movie2id = {} 40 | movie2id = dict((mid, i) for (i, mid) in enumerate(unique_movie_ids)) 41 | 42 | 43 | #------------------------------------------------------------------------------------------------------------------------------------ 44 | 45 | print("creating training data....") 46 | 47 | user2id = {} 48 | user2id = dict((uid, i) for (i, uid) in enumerate(train_user_ids)) 49 | 50 | 51 | rows = [] 52 | cols = [] 53 | for u_id in train_user_ids: 54 | print("train-",user2id[u_id]) 55 | m_ids = raw_data[(raw_data.userId == u_id) & (raw_data.rating > 3.5)]['movieId'].tolist() 56 | movie_indexes = [movie2id[m] for m 
in m_ids] 57 | rows.extend([user2id[u_id] for i in range(len(m_ids))]) 58 | cols.extend(movie_indexes) 59 | 60 | # pickle.dump(rows, open("rows.file", "wb")) 61 | # pickle.dump(cols, open("cols.file", "wb")) 62 | 63 | 64 | # creating a sparse matrix with no_of_train_users X movies for training, binarized feedback 65 | # rows and cols should be of same length 66 | train_data = sparse.csr_matrix((np.ones_like(rows),(np.array(rows), np.array(cols))), dtype='float64', shape=(len(train_user_ids), number_of_movies)) 67 | 68 | # dumping variable to load later for use in VAE code 69 | pickle.dump(train_data, open("train_data.file", "wb")) 70 | print("number of training users: ", len(train_user_ids)) 71 | 72 | 73 | #----------------------------------------------------------------------------------------------------------------------------------- 74 | 75 | print("creating test data....") 76 | test_user2id = {} 77 | test_user2id = dict((uid, i) for (i, uid) in enumerate(test_user_ids)) 78 | 79 | test_rows = [] 80 | test_cols = [] 81 | for u_id in test_user_ids: 82 | print("test-", test_user2id[u_id]) 83 | m_ids = raw_data[(raw_data.userId == u_id) & (raw_data.rating > 3.5)]['movieId'].tolist() 84 | movie_indexes = [movie2id[m] for m in m_ids] 85 | test_rows.extend([test_user2id[u_id] for i in range(len(m_ids))]) 86 | test_cols.extend(movie_indexes) 87 | 88 | test_data = sparse.csr_matrix((np.ones_like(test_rows),(np.array(test_rows), np.array(test_cols))), dtype='float64', shape=(len(test_user_ids), number_of_movies)) 89 | pickle.dump(test_data, open("test_data.file", "wb")) 90 | print("number of test users: ", len(test_user_ids)) 91 | 92 | #---------------------------------------------------------------------------------------------------------------------------------------------------------------------- 93 | 94 | 95 | print("creating validation data") 96 | val_user2id = {} 97 | val_user2id = dict((uid, i) for (i, uid) in enumerate(val_user_ids)) 98 | 99 | val_rows = [] 100 | val_cols = [] 101 | for u_id in val_user_ids: 102 | print("val-", val_user2id[u_id]) 103 | m_ids = raw_data[(raw_data.userId == u_id) & (raw_data.rating > 3.5)]['movieId'].tolist() 104 | movie_indexes = [movie2id[m] for m in m_ids] 105 | val_rows.extend([val_user2id[u_id] for i in range(len(m_ids))]) 106 | val_cols.extend(movie_indexes) 107 | 108 | val_data = sparse.csr_matrix((np.ones_like(val_rows),(np.array(val_rows), np.array(val_cols))), dtype='float64', shape=(len(val_user_ids), number_of_movies)) 109 | pickle.dump(val_data, open("val_data.file", "wb")) 110 | print("number of validation users: ", len(val_user_ids)) -------------------------------------------------------------------------------- /Standard/readme.md: -------------------------------------------------------------------------------- 1 | 2 | - **read_data.py** file used to create training, validation and testing data files. It reads in the new_ratings.csv and creates train_data.file, val_data.file and test_data.file which store the dumped sparse_matrix representation of these data-sets. To create multiple random folds, change the random seed's value in line 13. 3 | 4 | - **vae_cf_keras.py** file is used to create the vae network, compile it and train it on the train_data. This code saves the model's weights in the specified location, and also logs the train and validation losses for analysis purposes. 
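  For quick reference, the objective this script optimises is the usual VAE loss: a reconstruction term (binary cross-entropy over all movies) plus the KL divergence of the approximate posterior from the standard-normal prior. A minimal sketch mirroring the `vae_loss` defined in vae_cf_keras.py below (`original_dim`, `z_mean` and `z_log_var` come from the encoder definition):

      def vae_loss(x, x_bar):
          reconst_loss = original_dim * objectives.binary_crossentropy(x, x_bar)
          kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
          return reconst_loss + kl_loss

  Multiplying by `original_dim` turns Keras's per-movie mean cross-entropy into a sum over movies, which keeps the reconstruction term on the same scale as the summed KL term.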
5 | 6 | - **evaluate_model_approach_1.py** and **evaluate_model_approach_2.py** load the saved weights, run the model on the test_data.file and calculates recall@20, recall@50 and ndcg@100 using the below mentioned testing approaches. Approach 2 is consistent with Liang et al (https://arxiv.org/pdf/1802.05814.pdf) 7 | 8 | - **plot_loss_graphs.ipynb** has the code which can be used to plot the loss vs epochs graph while running the model on test data 9 | 10 | - **project_user_clusters.ipynb** has the code which generates user clusters from the user-embeddings using k-means clustering and t-SNE dimensionality reduction. 11 | 12 | Testing approach 1 is where we obtained the metrics over all the movies which test users had marked 1 whereas approach 2 is the one where the 20% of the movies which were marked 1 were set off to 0 and then metrics were calculated how well our model recommended on these 20% of the movies. 13 | -------------------------------------------------------------------------------- /Standard/vae_cf_keras.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import os 4 | from keras.layers import Input, Dense, Lambda 5 | from keras.models import Model 6 | from keras import objectives 7 | from keras import backend as K 8 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback 9 | 10 | class LossHistory(Callback): 11 | def on_train_begin(self, logs={}): 12 | self.losses = [] 13 | self.val_losses = [] 14 | 15 | def on_epoch_end(self, epoch, logs={}): 16 | self.losses.append(logs.get('loss')) 17 | self.val_losses.append(logs.get('val_loss')) 18 | 19 | history = LossHistory() 20 | 21 | 22 | 23 | # encoder/decoder network size 24 | batch_size=500 25 | original_dim=26621 # number of movies 26 | intermediate_dim=600 27 | latent_dim=200 28 | nb_epochs=20 29 | epsilon_std=1.0 30 | 31 | 32 | # encoder network 33 | x=Input(batch_shape=(batch_size,original_dim)) 34 | h=Dense(intermediate_dim, activation='tanh')(x) 35 | z_mean=Dense(latent_dim)(h) 36 | z_log_var=Dense(latent_dim)(h) 37 | 38 | 39 | # sampling from latent dimension for decoder/generative part of network 40 | def sampling(args): 41 | _mean,_log_var=args 42 | epsilon=K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0., std=epsilon_std) 43 | return _mean+K.exp(_log_var/2)*epsilon 44 | 45 | z= Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var]) 46 | 47 | # decoder network 48 | h_decoder=Dense(intermediate_dim, activation='tanh') 49 | x_bar=Dense(original_dim,activation='softmax') # this should be softmax right? 
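# Note on the question above: in the multinomial-likelihood setup of Liang et al.
# (the paper cited in the readme), the decoder does end in a softmax over all
# movies. Because vae_loss below applies a per-movie binary cross-entropy, a
# 'sigmoid' output (a Bernoulli likelihood per movie) would arguably be the more
# conventional pairing; softmax instead forces the outputs to form a probability
# distribution over the whole catalogue.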
50 | h_decoded = h_decoder(z) 51 | x_decoded = x_bar(h_decoded) 52 | 53 | # build and compile model 54 | vae = Model(x, x_decoded) 55 | def vae_loss(x,x_bar): 56 | reconst_loss=original_dim*objectives.binary_crossentropy(x, x_bar) 57 | kl_loss= -0.5*K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) 58 | return kl_loss + reconst_loss 59 | 60 | vae.compile(optimizer='adam', loss=vae_loss) 61 | 62 | 63 | x_train = pickle.load( open( "train_data.file", "rb" ) ) 64 | print("number of training users: ", x_train.shape[0]) 65 | 66 | x_val = pickle.load( open( "val_data.file", "rb" ) ) 67 | x_val = x_val.todense() 68 | 69 | def nn_batch_generator(x, y, batch_size, samples_per_epoch): 70 | number_of_batches = samples_per_epoch/batch_size 71 | counter=0 72 | shuffle_index = np.arange(np.shape(y)[0]) 73 | np.random.shuffle(shuffle_index) 74 | x = x[shuffle_index, :] 75 | y = y[shuffle_index, :] 76 | while 1: 77 | index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)] 78 | x_batch = x[index_batch,:].todense() 79 | y_batch = y[index_batch,:].todense() 80 | counter += 1 81 | yield (np.array(x_batch),np.array(y_batch)) 82 | if (counter >= number_of_batches): 83 | counter=0 84 | 85 | 86 | weightsPath = "./tmp/weights.hdf5" 87 | checkpointer = ModelCheckpoint(filepath=weightsPath, verbose=1, save_best_only=True) 88 | 89 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001) 90 | 91 | # sending complete training data and shuffle flag will shuffle so that each user comes atleast once in training because of multiple epochs 92 | vae.fit_generator(nn_batch_generator(x_train, x_train, batch_size, 118000), samples_per_epoch=118000, nb_epoch=nb_epochs, 93 | validation_data=(x_val, x_val), callbacks=[checkpointer, reduce_lr, history]) 94 | 95 | print("training losses over epochs") 96 | print(history.losses) 97 | 98 | print("validation losses over epochs") 99 | print(history.val_losses) 100 | -------------------------------------------------------------------------------- /standard_vae_model_results.txt: -------------------------------------------------------------------------------- 1 | Approach 1 2 | 3 | fold-1 4 | recall@20: 0.5361947618247523 5 | recall@50: 0.5631774940667692 6 | ndcg@100: 0.2724007829543391 7 | 8 | fold-2 9 | recall@20: 0.5399664609063171 10 | recall@50 0.56542322335807 11 | ndcg@100: 0.27541021400623394 12 | 13 | fold-3 14 | recall@20: 0.5375343986736952 15 | recall@50: 0.5698594621594564 16 | ndcg@100: 0.27567425682388574 17 | 18 | recall@20- mean: 0.53686458, stddev: 0.00067 19 | recal@50- mean: 0.566518478, stddev: 0.0033 20 | ndcg@100- mean: 0.27403752, stddev: 0.0016 21 | 22 | ------------------------------------------------------------------------------------------------------ 23 | 24 | Approach 2 25 | 26 | fold-1 27 | recall@20: 0.20603333875751115 28 | recall@50: 0.364723130654524 29 | ndcg@100: 0.15851297473345 30 | 31 | 32 | fold-2 33 | recall@20: 0.20715624463122 34 | recall@50 0.36350998713467 35 | ndcg@100: 0.15670241651531513 36 | 37 | 38 | fold-3 39 | recall@20: 0.2026753411889 40 | recall@50: 0.3620999812169 41 | ndcg@100: 0.1522099753781 42 | 43 | 44 | 45 | recall@20- mean: 0.205288308193, stddev: 0.001903667832 46 | recal@50- mean: 0.363444366335, stddev: 0.00107190105576 47 | ndcg@100- mean: 0.155808455542, stddev: 0.00264969493645 48 | 49 | 50 | ------------------------------------------------------------------------------------------------------ 51 | 52 | mean is rounded off till 3 decimal places 53 | 
testing approach 1 is where we obtained the metrics over all the movies which test users had marked 1, approach 2 is when the 20% of the movies which were marked 1 were set off to 0 and then metrics were calculated using only those 54 | 55 | standard vae results using testing approach 1 on movielens, mean stddev over 3 CVs 56 | recall@20- mean: 0.537, stddev: 0.00067 57 | recal@50- mean: 0.567, stddev: 0.0033 58 | ndcg@100- mean: 0.274, stddev: 0.0016 59 | 60 | standard vae results using testing approach 2 on movielens, mean stddev over 3 CVs 61 | recall@20- mean: 0.205, stddev: 0.0019 62 | recal@50- mean: 0.363, stddev: 0.00107 63 | ndcg@100- mean: 0.156, stddev: 0.0026 -------------------------------------------------------------------------------- /update_ratings.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open('movie_data.json', 'r') as f: 4 | data = json.load(f) 5 | 6 | with open('links.csv', 'r') as f: 7 | ids = f.read().splitlines() 8 | 9 | movie_json = {} 10 | for i in data: 11 | movie_json[i['imdbID']] = i 12 | 13 | movie_ids_list = [] 14 | movie_json_ml20 = {} 15 | for i in ids: 16 | i = i.split(",") 17 | imdb = i[1] 18 | ml20 = i[0] 19 | try: 20 | movie_json_ml20[ml20] = movie_json['tt'+imdb] 21 | movie_ids_list.append(ml20) 22 | except KeyError: 23 | pass 24 | 25 | with open('movie_ml20.json', 'w') as f: 26 | json.dump(movie_json_ml20, f) 27 | 28 | with open('ratings.csv', 'r') as f: 29 | ratings = f.read().splitlines() 30 | 31 | with open('new_ratings.csv', 'w') as f: 32 | for i in ratings: 33 | j = i 34 | i = i.split(",") 35 | movie_id = i[1] 36 | try : 37 | a = movie_json_ml20[movie_id] 38 | f.write(j+"\n") 39 | 40 | except KeyError: 41 | pass 42 | 43 | --------------------------------------------------------------------------------