├── LICENSE ├── README.md └── src ├── baselines ├── cvae │ └── vae.py └── vlm │ ├── vlm_pytorch.py │ └── vlm.py ├── PreprocessMSD.py ├── PreprocessYahooMovies.py ├── PreprocessAmazonVideoGames.py ├── PreprocessAmazonSportsOutdoors.py ├── PreprocessNetflix.py ├── PreprocessML20M.py ├── util.py ├── TrainModel.py └── models.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Olivier Jeunen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Closed-Form Models for Collaborative Filtering with Side-Information 2 | Source code for our LBR paper "Closed-Form Models for Collaborative Filtering with Side-Information" published at RecSys 2020. 
3 | 4 | ## Reproducibility 5 | To generate a virtual Python environment that holds all the packages our work relies on, run: 6 | 7 | virtualenv -p python3 ease_side_info 8 | source ease_side_info/bin/activate 9 | pip3 install -r requirements.txt 10 | 11 | 12 | To preprocess the datasets to the format we use, run: 13 | 14 | python3 Preprocess[...].py 15 | 16 | We do not hold the rights to any of the datasets used in the paper, and are not at liberty to host and share them. 17 | However, upon request, we will gladly share pointers on where to find them. 18 | 19 | Now, you can run the `TrainModel.py` script to train and evaluate all models on the dataset of your choice. 20 | 21 | ## Acknowledgements 22 | The source code we use for our baselines (SLIM, cVAE, VLM) was slightly adapted from their original sources, and we are grateful to the original authors for providing publicly available implementations: 23 | 24 | - SLIM - https://github.com/KarypisLab/SLIM 25 | - cVAE - https://github.com/yifanclifford/cVAE 26 | - VLM - https://github.com/ehtsham/recsys19vlm 27 | 28 | ## Paper 29 | If you use our code in your research, please remember to cite our paper: 30 | 31 | ```BibTeX 32 | @inproceedings{JeunenRecSys2020, 33 | author = {Jeunen, Olivier and Van Balen, Jan and Goethals, Bart}, 34 | title = {Closed-Form Models for Collaborative Filtering with Side-Information}, 35 | booktitle = {Proceedings of the 14th ACM Conference on Recommender Systems}, 36 | series = {RecSys '20}, 37 | year = {2020}, 38 | publisher = {ACM}, 39 | } 40 | ``` -------------------------------------------------------------------------------- /src/baselines/cvae/vae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional 3 | from torch import nn 4 | 5 | # Code from Yifan Chen 6 | # https://github.com/yifanclifford/cVAE 7 | 8 | def trace(A=None, B=None): 9 | if A is None: 10 | print('Expecting PyTorch tensor') 11 | val = None 
def trace(A=None, B=None):
    """Return the sum of the element-wise product of two tensors.

    Args:
        A: PyTorch tensor. When ``None`` a message is printed and ``None``
            is returned.
        B: Optional second tensor. When omitted, ``A`` is multiplied with
            itself, giving the sum of squared entries of ``A``.

    Returns:
        ``torch.sum(A * B)`` (or ``torch.sum(A * A)``), or ``None`` when
        ``A`` is ``None``.
    """
    if A is None:
        print('Expecting PyTorch tensor')
        return None
    return torch.sum(A * (A if B is None else B))


class VAE(nn.Module):
    """Fully-connected variational auto-encoder with a mirrored decoder.

    ``args`` is a dict with the keys:
        ``'n_items'``: input (and reconstruction) dimensionality.
        ``'layers'``: non-empty list of layer widths; the last entry is the
            size of the latent code, earlier entries are hidden widths.
        ``'device'``: stored on the instance as ``self.device``.
    """

    def __init__(self, args):
        super().__init__()
        layers = list(args['layers'])
        if not layers:
            # Without this check construction succeeds but ``decode`` later
            # indexes an empty ``gnet`` and fails with an unrelated error.
            raise ValueError("args['layers'] must contain at least one layer size")
        self.l = len(layers)
        self.device = args['device']
        dims = [args['n_items']] + layers

        # Inference network: every layer up to (not including) the latent one.
        self.inet = nn.ModuleList(
            [nn.Linear(dims[i], dims[i + 1]) for i in range(self.l - 1)]
        )
        # Two heads on the last hidden layer; ``sigma`` is consumed as a
        # log-variance by ``reparameterize``.
        self.mu = nn.Linear(dims[self.l - 1], dims[self.l])
        self.sigma = nn.Linear(dims[self.l - 1], dims[self.l])
        # Generative network: the same widths walked in reverse.
        self.gnet = nn.ModuleList(
            [nn.Linear(dims[self.l - i], dims[self.l - i - 1]) for i in range(self.l)]
        )

    def encode(self, x):
        """Map ``x`` to the pair ``(mu, logvar)`` of the approximate posterior."""
        h = x
        for layer in self.inet:
            h = functional.relu(layer(h))
        return self.mu(h), self.sigma(h)

    def decode(self, z):
        """Map a latent code ``z`` to reconstruction scores (no final activation)."""
        h = z
        for i in range(self.l - 1):
            h = functional.relu(self.gnet[i](h))
        return self.gnet[self.l - 1](h)

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * exp(0.5 * logvar)`` in training mode, else return ``mu``."""
        if not self.training:
            return mu
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

    @staticmethod
    def _sum_of_squares(layers):
        """Sum ``trace(p)`` over every parameter of every layer in ``layers``."""
        reg = 0
        for layer in layers:
            for param in layer.parameters():
                reg += trace(param)
        return reg

    def infer_reg(self):
        """Squared-weight penalty over the ``inet`` layers (``mu``/``sigma`` heads excluded)."""
        return self._sum_of_squares(self.inet)

    def gen_reg(self):
        """Squared-weight penalty over the ``gnet`` layers."""
        return self._sum_of_squares(self.gnet)
class VLM_PyTorch(nn.Module):
    """PyTorch variational latent model with optional tag side-information.

    Loosely adapted from Ehtsham Elahi's original Tensorflow implementation
    https://github.com/ehtsham/recsys19vlm/blob/master/RecSys2019-VLMPaper.ipynb

    ``args`` is a dict with the keys ``'num_users'``, ``'num_items'``,
    ``'num_tags'``, ``'num_factors'``, ``'var_prior'``, ``'reg'`` and
    ``'side_info'``. When ``'side_info'`` is truthy, ``'item_tag_mat'`` must
    hold a SciPy sparse matrix (anything with ``.tocoo()``) of shape
    ``(num_items, num_tags)``.
    """

    def __init__(self, args):
        super().__init__()

        self.num_users = args['num_users']
        self.num_items = args['num_items']
        self.num_tags = args['num_tags']
        self.num_factors = args['num_factors']
        self.var_prior = args['var_prior']
        self.reg = args['reg']
        self.side_info = args['side_info']
        if self.side_info:
            # The item-tag matrix is kept sparse: the original authors note
            # that a dense matrix made the multiplication too slow.
            print('Setting up embeddings for tags...')
            item_tag_mat = args['item_tag_mat'].tocoo()
            # Explicit dtypes instead of the legacy LongTensor/FloatTensor
            # constructors (SciPy typically stores int32 indices).
            indices = torch.as_tensor(
                np.vstack((item_tag_mat.row, item_tag_mat.col)), dtype=torch.long
            )
            values = torch.as_tensor(item_tag_mat.data, dtype=torch.float32)
            sparse_mat = torch.sparse_coo_tensor(
                indices, values, torch.Size(item_tag_mat.shape)
            )
            # Registered as a frozen parameter so it follows the module
            # across devices but is never updated.
            self.item_tag_mat = torch.nn.Parameter(sparse_mat, requires_grad=False)
            self.Mu_Zt = nn.Embedding(self.num_tags, self.num_factors)  # Mean latent factors for tags

        self.Mu_Zu = nn.Embedding(self.num_users, self.num_factors)  # Mean latent factors for users
        self.lsdev_Zu = nn.Embedding(self.num_users, 1)  # Log(std-deviation) for user latent factors
        self.Mu_Zv = nn.Embedding(self.num_items, self.num_factors)  # Mean latent factors for items

    def forward(self, user_ids, add_noise=True):
        """Score all items for a batch of users.

        Args:
            user_ids: integer tensor of user indices.
            add_noise: when true, user factors are sampled with the
                reparameterisation trick (mean + eps * std-dev); otherwise
                the mean factors are used directly.

        Returns:
            ``(log_softmax, Mu_Zu_batch, lsdev_Zu_batch)``: per-user
            log-probabilities over items, and the mean and log(std-dev) of
            the batch's user factors.
        """
        # Mean and log(std-dev) for users in this batch
        Mu_Zu_batch = self.Mu_Zu(user_ids)
        lsdev_Zu_batch = self.lsdev_Zu(user_ids)

        # Item factors, optionally shifted by the tag embeddings of each item
        # TODO - validate whether sparse matrix multiplication is faster than
        # gathering and summing embeddings
        item_factors = self.Mu_Zv.weight
        if self.side_info:
            item_factors = item_factors + torch.mm(self.item_tag_mat, self.Mu_Zt.weight)

        user_factors = Mu_Zu_batch
        if add_noise:
            # 'Reparameterisation trick' - sample Gaussian noise over factors
            eps = torch.randn_like(Mu_Zu_batch)
            user_factors = Mu_Zu_batch + eps * torch.exp(lsdev_Zu_batch)

        # Scores are dot products between user and item factors
        batch_logits = torch.mm(user_factors, item_factors.T)
        log_softmax = torch.nn.functional.log_softmax(batch_logits, dim=1)
        return log_softmax, Mu_Zu_batch, lsdev_Zu_batch
class VLM(object):
    """TensorFlow (v1 graph-mode) variational latent model with tag side-information.

    Code from Ehtsham Elahi
    https://github.com/ehtsham/recsys19vlm/blob/master/RecSys2019-VLMPaper.ipynb

    Args:
        num_users, num_items, num_tags: sizes of the three entity sets.
        num_factors: dimensionality of the latent factors.
        var_prior: variance of the zero-mean Gaussian prior on user factors.
        reg: weight of the L2 penalty on item and tag factors.
        video_metadata_array: item-by-tag array; it is multiplied with the
            ``[num_tags, num_factors]`` tag factors and added to the item
            factors, so it must have shape ``[num_items, num_tags]``.
    """

    def __init__(self, num_users, num_items, num_tags, num_factors, var_prior, reg, video_metadata_array):
        self.num_users = num_users
        self.num_items = num_items
        self.num_tags = num_tags
        self.num_factors = num_factors
        self.var_prior = var_prior
        self.reg = reg
        self.video_metadata_array_const = tf.constant(video_metadata_array, dtype=tf.float32)
        self.construct_placeholders()

    def construct_placeholders(self):
        """Create the placeholders that feed one training batch."""
        # User indices of the batch
        self.users_ph = tf.placeholder(dtype=tf.int32, shape=[None])
        # One row per user in the batch, one column per item
        self.played_videos_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.num_items])

    def construct_model_variables(self):
        """Create the trainable variables, all initialised from a standard normal."""
        # Mean for user latent factors
        self.Mu_Zu = tf.Variable(dtype=tf.float32,
                                 initial_value=tf.random_normal(shape=[self.num_users, self.num_factors]),
                                 name='mean_latent_factors_zu')
        # Log(std-deviation) for user latent factors
        self.lsdev_Zu = tf.Variable(dtype=tf.float32,
                                    initial_value=tf.random_normal(shape=[self.num_users, 1]),
                                    name='lsdev_Zu')
        # Mean for item latent factors
        self.Mu_Zv = tf.Variable(dtype=tf.float32,
                                 initial_value=tf.random_normal(shape=[self.num_items, self.num_factors]),
                                 name='mean_latent_factors_zv')
        # Mean for tag latent factors
        self.Mu_Zt = tf.Variable(dtype=tf.float32,
                                 initial_value=tf.random_normal(shape=[self.num_tags, self.num_factors]),
                                 name='mean_latent_factors_zt')

    def compute_kl_div(self, lsdev_Zu_batch, Mu_Zu_batch):
        """KL divergence needed for the ELBO, one value per user in the batch.

        The four terms are those of the KL divergence between an isotropic
        Gaussian with mean ``Mu_Zu_batch`` and std-dev ``exp(lsdev_Zu_batch)``
        and a zero-mean isotropic Gaussian with variance ``var_prior``.
        """
        sdev_Zu_batch = tf.exp(lsdev_Zu_batch)
        comp1 = self.num_factors * (0.5 * tf.math.log(self.var_prior) - lsdev_Zu_batch)
        comp2 = (self.num_factors / (2 * self.var_prior)) * (tf.pow(sdev_Zu_batch, 2))
        # `keepdims` replaces the deprecated `keep_dims` spelling.
        comp3 = (1.0 / (2 * self.var_prior)) * tf.reduce_sum(tf.pow(Mu_Zu_batch, 2), axis=1, keepdims=True)
        comp4 = (self.num_factors / 2.0)

        return comp1 + comp2 + comp3 - comp4

    def construct_graph(self):
        """Build the model graph.

        Returns:
            ``(batch_logits, batch_logits_validation, log_softmax, avg_loss,
            batch_conditional_log_likelihood, batch_kl_div,
            num_items_per_document)``. ``batch_logits`` uses sampled user
            factors; ``batch_logits_validation`` uses their means.
        """
        # Boilerplate Tensorflow
        self.construct_model_variables()

        # Mean, log(std-deviation) and Gaussian noise for user latent factors
        Mu_Zu_batch = tf.gather(self.Mu_Zu, self.users_ph)
        lsdev_Zu_batch = tf.gather(self.lsdev_Zu, self.users_ph)
        Eps_u_ph = tf.random_normal(shape=[tf.size(self.users_ph), self.num_factors],
                                    mean=0.0, stddev=1.0, dtype=tf.float32, seed=None, name="eps")
        Zu_batch = Mu_Zu_batch + Eps_u_ph * tf.exp(lsdev_Zu_batch)

        # Tag factors mapped to items, combined once with the item factors
        Mu_Zv_hat = tf.matmul(self.video_metadata_array_const, self.Mu_Zt)
        item_factors = self.Mu_Zv + Mu_Zv_hat
        batch_logits = tf.matmul(Zu_batch, item_factors, transpose_b=True)
        batch_logits_validation = tf.matmul(Mu_Zu_batch, item_factors, transpose_b=True)

        log_softmax = tf.nn.log_softmax(batch_logits)

        # NOTE(review): a user row with no played items makes this sum zero
        # and the ELBO scaling below divide by zero - confirm the caller
        # never feeds empty rows.
        num_items_per_document = tf.reduce_sum(self.played_videos_ph, axis=1, keepdims=True)

        batch_conditional_log_likelihood = tf.reduce_sum(self.played_videos_ph * log_softmax,
                                                         axis=1, keepdims=True)
        batch_kl_div = self.compute_kl_div(lsdev_Zu_batch, Mu_Zu_batch)

        batch_elbo = (1.0 / num_items_per_document) * (batch_conditional_log_likelihood - batch_kl_div)

        avg_loss = -1 * tf.reduce_mean(batch_elbo) + self.reg * (tf.nn.l2_loss(self.Mu_Zv) +
                                                                  tf.nn.l2_loss(self.Mu_Zt))

        return (batch_logits, batch_logits_validation, log_softmax, avg_loss,
                batch_conditional_log_likelihood, batch_kl_div, num_items_per_document)
if __name__ == '__main__':
    # Preprocess the Million Song Dataset dump: build the metadata-item
    # matrix, the train/validation/test split and the user subsamples.

    # Commandline arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', type=str, help='Directory containing the data')
    parser.add_argument('--test_users', type=int, default=10000)
    args = parser.parse_args()

    # os.path.join keeps the paths valid whether or not `dir` ends with a
    # path separator (plain string concatenation required one).
    out_dir = os.path.join(args.dir, 'preprocessed')

    # Fix seed for reproducibility
    np.random.seed(42)

    # Load rating data
    print(datetime.now(), 'Loading in ratings...')
    ratings = pd.read_csv(os.path.join(args.dir, 'preprocessed_pref.csv'))
    ratings.columns = ['user', 'item']
    print('\t{0:8} ratings'.format(ratings.shape[0]))
    print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))

    # Load side info
    print(datetime.now(), 'Loading in side-info...')
    ###########################
    # ARTISTS - GENRES - TAGS #
    ###########################
    # Load in data
    artists = pd.read_csv(os.path.join(args.dir, 'preprocessed_artists.csv'))
    artists.columns = ['item', 'artist']
    genres = pd.read_csv(os.path.join(args.dir, 'preprocessed_genres.csv'))
    genres.columns = ['item', 'genre']
    tags = pd.read_csv(os.path.join(args.dir, 'preprocessed_tags.csv'))
    tags.columns = ['item', 'tag']

    # Drop those not appearing in preference data
    rated_items = ratings[['item']].drop_duplicates()
    artists = artists.merge(rated_items, how='right').dropna()
    genres = genres.merge(rated_items, how='right').dropna()
    tags = tags.merge(rated_items, how='right').dropna()

    # Ensure proper integer identifiers
    user_enc = LabelEncoder()
    item_enc = LabelEncoder()
    ratings['user'] = user_enc.fit_transform(ratings['user'])
    ratings['item'] = item_enc.fit_transform(ratings['item'])
    artists['item'] = item_enc.transform(artists['item'])
    genres['item'] = item_enc.transform(genres['item'])
    tags['item'] = item_enc.transform(tags['item'])

    # Generate Metadata-to-item mapping
    n_items = ratings['item'].max() + 1
    X_artists = util.generate_csr_matrix(artists, 'artist', n_items)
    X_genres = util.generate_csr_matrix(genres, 'genre', n_items)
    X_tags = util.generate_csr_matrix(tags, 'tag', n_items)
    X_meta = vstack((X_artists, X_genres, X_tags))

    # Make the output directory if necessary
    os.makedirs(out_dir, exist_ok=True)

    # Write out metadata-item matrix
    print(datetime.now(), 'Writing out metadata-item matrix...')
    save_npz(os.path.join(out_dir, 'X_meta.npz'), X_meta)

    # Train - validation - test split
    print(datetime.now(), 'Train-validation-test split...')
    X_train, X_val, val_dict, X_test, test_dict = util.train_val_test_split_Jebara(ratings, n_test_users=args.test_users)

    # Write out validation and test data
    print(datetime.now(), 'Writing out validation and test data...')
    save_npz(os.path.join(out_dir, 'X_val.npz'), X_val)
    with open(os.path.join(out_dir, 'val_dict.pkl'), 'wb') as handle:
        pickle.dump(val_dict, handle)
    save_npz(os.path.join(out_dir, 'X_test.npz'), X_test)
    with open(os.path.join(out_dir, 'test_dict.pkl'), 'wb') as handle:
        pickle.dump(test_dict, handle)

    # Write out full user-item training matrix
    print(datetime.now(), 'Writing out train data...')
    save_npz(os.path.join(out_dir, 'X_train.npz'), X_train)

    # Subsample training data on a user-level: each fraction is a prefix of
    # one shared shuffle, so the smaller samples are nested in the larger.
    print(datetime.now(), 'Subsampling training users...')
    train_users = np.unique(X_train.nonzero()[0])
    np.random.shuffle(train_users)
    for frac_train_users in [0.01, .05, .1, .25, .5]:
        n_keep = int(frac_train_users * len(train_users))
        subsample = pd.DataFrame(train_users[:n_keep], columns=['user'])
        subsample.to_csv(os.path.join(out_dir, 'train_users_{}.csv'.format(frac_train_users)), index=False)
    print(datetime.now(), 'Finished!')
def _split_pipe_column(side, column, name):
    """Return one (item, value) row per '|'-separated entry of ``side[column]``.

    Entries equal to the literal ``\\N`` placeholder are removed.
    """
    # NOTE(review): reset_index([0, 'item']) names the 'item' level twice;
    # the result is assumed to be the two columns (item, value) - confirm
    # on the pandas version in use.
    long = pd.DataFrame(side[column].str.split('|').tolist(), index=side.item).stack().reset_index([0, 'item'])
    long.columns = ['item', name]
    return long.loc[long[name] != '\\N']


def _keep_shared_values(frame, column, min_items=2):
    """Keep only rows whose ``column`` value occurs for at least ``min_items`` distinct items."""
    counts = frame.groupby(column)['item'].apply(lambda x: len(set(x))).reset_index().rename(columns={'item': 'count'})
    counts = counts[counts['count'] >= min_items]
    return frame.merge(counts[[column]], on=column, how='right')


if __name__ == '__main__':
    # Preprocess the Yahoo! Movies dump: build the metadata-item matrix and
    # the train/validation/test split.

    # Commandline arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', type=str, help='Directory containing the data')
    args = parser.parse_args()

    # os.path.join keeps the paths valid whether or not `dir` ends with a
    # path separator (plain string concatenation required one).
    out_dir = os.path.join(args.dir, 'preprocessed')

    # Fix seed for reproducibility
    np.random.seed(42)

    # Load rating data
    print(datetime.now(), 'Loading in ratings...')
    ratings = pd.read_csv(os.path.join(args.dir, 'ydata-ymovies-user-movie-ratings-train-v1_0.txt'),
                          sep='\t',
                          header=None)
    ratings.columns = ['user', 'item', 'weird_rating', 'rating']
    ratings = ratings.loc[ratings.rating > 3.0]

    # Only keep users who have rated at least 5 movies.
    # rename_axis/reset_index(name=...) yields the columns ('user', 'count')
    # on every pandas version; renaming 'index'/'user' by hand produces two
    # 'count' columns on pandas >= 2.0.
    user_counts = ratings['user'].value_counts().rename_axis('user').reset_index(name='count')
    user_counts = user_counts.loc[user_counts['count'] >= 5]
    ratings = ratings.merge(user_counts, on='user', how='right').drop('count', axis=1)
    print('\t{0:8} ratings'.format(ratings.shape[0]))
    print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))

    # Load side info
    print(datetime.now(), 'Loading in side-info...')
    side_columns = [
        'item',
        'title',
        'synposis',
        'runtime',
        'MPAA',
        'MPAA_reason',
        'release_date',
        'distributor',
        'dummy_1',  # HUH?
        'poster',
        'genre',
        'directors',
        'director_ids',
        'crew_members',
        'crew_ids',
        'crew_types',
        'actors',
        'actor_ids',
        'avg_rating',
        'n_rating',
        'n_awards',
        'n_nominated',
        'list_won',
        'list_nominated',
        'rating_moviemom',
        'review_moviemom',
        'list_review_summaries',
        'list_reviewers',
        'list_captions',
        'preview',
        'DVD_review',
        'GNPP',
        'avg_train',
        'num_train',
    ]
    side = pd.read_csv(os.path.join(args.dir, 'ydata-ymovies-movie-content-descr-v1_0.txt'),
                       sep='\t',
                       encoding='latin',
                       names=side_columns)

    rated_items = ratings[['item']].drop_duplicates()

    # Extract genres properly
    genres = _split_pipe_column(side, 'genre', 'genre')
    genres = genres.merge(rated_items, on='item', how='right').dropna()

    # Extract directors properly
    directors = _split_pipe_column(side, 'director_ids', 'director')
    directors = directors.merge(rated_items, on='item', how='inner').dropna()

    # Extract actors properly
    actors = _split_pipe_column(side, 'actor_ids', 'actor')
    actors = actors.merge(rated_items, on='item', how='inner').dropna()

    # Drop those that appear less than twice (wouldn't affect Gram-matrix)
    directors = _keep_shared_values(directors, 'director')
    actors = _keep_shared_values(actors, 'actor')

    # Ensure proper integer identifiers
    user_enc = LabelEncoder()
    item_enc = LabelEncoder()
    genre_enc = LabelEncoder()
    direc_enc = LabelEncoder()
    actor_enc = LabelEncoder()
    ratings['user'] = user_enc.fit_transform(ratings['user'])
    ratings['item'] = item_enc.fit_transform(ratings['item'])
    genres['item'] = item_enc.transform(genres['item'])
    genres['genre'] = genre_enc.fit_transform(genres['genre'].astype(str))
    directors['item'] = item_enc.transform(directors['item'])
    directors['director'] = direc_enc.fit_transform(directors['director'])
    actors['item'] = item_enc.transform(actors['item'])
    actors['actor'] = actor_enc.fit_transform(actors['actor'])

    # Generate Metadata-to-item mapping
    n_items = ratings['item'].max() + 1
    X_genres = util.generate_csr_matrix(genres, 'genre', n_items)
    X_directors = util.generate_csr_matrix(directors, 'director', n_items)
    X_actors = util.generate_csr_matrix(actors, 'actor', n_items)
    X_meta = vstack((X_genres, X_directors, X_actors))

    # Make the output directory if necessary
    os.makedirs(out_dir, exist_ok=True)

    # Write out metadata-item matrix
    print(datetime.now(), 'Writing out metadata-item matrix...')
    save_npz(os.path.join(out_dir, 'X_meta.npz'), X_meta)

    print(datetime.now(), 'Train-validation-test split...')
    X_train, _, val_dict, _, test_dict = util.train_val_test_split_Karypis(ratings)

    # Write out user-item matrix and held-out dictionaries
    print(datetime.now(), 'Writing out training, validation and test data...')
    save_npz(os.path.join(out_dir, 'X_train.npz'), X_train)
    with open(os.path.join(out_dir, 'val_dict.pkl'), 'wb') as handle:
        pickle.dump(val_dict, handle)
    with open(os.path.join(out_dir, 'test_dict.pkl'), 'wb') as handle:
        pickle.dump(test_dict, handle)
    print(datetime.now(), 'Finished!')
def _count_items_per_value(frame, column):
    """Return a frame with ``column`` and ``count`` = number of distinct items per value."""
    return frame.groupby(column)['item'].apply(lambda x: len(set(x))).reset_index().rename(columns={'item': 'count'})


def _tokenise_text_column(meta, column, minsup, maxsup):
    """Turn the free-text ``meta[column]`` into (item, token) rows.

    The text is lower-cased, punctuation is replaced by spaces and the
    result is split on single spaces. Tokens occurring for fewer than
    ``minsup`` or more than ``maxsup`` distinct items are dropped.
    ``meta[column]`` is overwritten with the cleaned text.
    """
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    # NOTE(review): str(s) turns a missing value into the literal token
    # 'nan', which then links every item lacking this field - confirm this
    # is intended.
    meta[column] = meta[column].apply(lambda s: str(s).lower().translate(punctuation_to_space))
    # NOTE(review): reset_index([0, 'item']) names the 'item' level twice;
    # the result is assumed to be the two columns (item, token) - confirm
    # on the pandas version in use.
    tokens = pd.DataFrame(meta[column].str.split(' ').tolist(), index=meta.item).stack().reset_index([0, 'item'])
    tokens.columns = ['item', column]
    tokens.drop_duplicates(inplace=True)
    tokens.dropna(inplace=True)
    # NOTE(review): splitting on ' ' never yields ' ' (runs of spaces yield
    # ''), so this filter removes nothing; kept so the output is unchanged.
    tokens = tokens.loc[tokens[column] != ' ']
    word2count = _count_items_per_value(tokens, column)
    word2count = word2count[word2count['count'] >= minsup]
    word2count = word2count[word2count['count'] <= maxsup]
    tokens = tokens.merge(word2count[[column]], on=column, how='right')
    print(tokens[column].value_counts())
    return tokens


if __name__ == '__main__':
    # Preprocess the Amazon Video Games dump: build the metadata-item
    # matrix and the train/validation/test split.

    # Commandline arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', type=str, help='Directory containing the data')
    args = parser.parse_args()

    # os.path.join keeps the paths valid whether or not `dir` ends with a
    # path separator (plain string concatenation required one).
    out_dir = os.path.join(args.dir, 'preprocessed')

    # Fix seed for reproducibility
    np.random.seed(42)

    # Load rating data
    print(datetime.now(), 'Loading in ratings...')
    ratings = pd.read_csv(os.path.join(args.dir, 'reviews_Video_Games_5.csv'))[['reviewerID', 'asin', 'overall']]
    ratings.columns = ['user', 'item', 'rating']
    ratings = ratings.loc[ratings.rating > 3.0]

    # Only keep users who have rated at least 5 items.
    # rename_axis/reset_index(name=...) yields the columns ('user', 'count')
    # on every pandas version; renaming 'index'/'user' by hand produces two
    # 'count' columns on pandas >= 2.0.
    user_counts = ratings['user'].value_counts().rename_axis('user').reset_index(name='count')
    user_counts = user_counts.loc[user_counts['count'] >= 5]
    ratings = ratings.merge(user_counts, on='user', how='right').drop('count', axis=1)
    print('\t{0:8} ratings'.format(ratings.shape[0]))
    print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))

    # Load in metadata
    meta = pd.read_csv(os.path.join(args.dir, 'meta_Video_Games.csv'))[['asin', 'description', 'categories', 'title', 'brand']]
    meta.columns = ['item', 'desc', 'cat', 'title', 'brand']

    # We only want metadata for items we have ratings for
    meta = meta.merge(ratings[['item']].drop_duplicates(), how='right', on='item')
    minsup = 3
    maxsup = ratings['item'].nunique() // 4

    # Clean up categorical strings
    meta['cat'] = meta['cat'].apply(lambda s: s.replace('[', '').replace(']', '').replace('\'', '').strip())
    cat = pd.DataFrame(meta.cat.str.split(',').tolist(), index=meta.item).stack().reset_index([0, 'item'])
    cat.columns = ['item', 'cat']
    cat['cat'] = cat['cat'].apply(lambda s: s.strip())
    cat.drop_duplicates(inplace=True)
    cat = cat.loc[cat.cat != 'Video Games']  # Appears too often
    # NOTE(review): values were stripped above, so none can equal ' ';
    # kept so the output is unchanged.
    cat = cat.loc[cat.cat != ' ']
    cat2count = _count_items_per_value(cat, 'cat')
    cat2count = cat2count[cat2count['count'] >= 2]
    cat = cat.merge(cat2count[['cat']], on='cat', how='right')
    print(cat['cat'].value_counts())

    # Clean up description, title and brand strings
    desc = _tokenise_text_column(meta, 'desc', minsup, maxsup)
    title = _tokenise_text_column(meta, 'title', minsup, maxsup)
    brand = _tokenise_text_column(meta, 'brand', minsup, maxsup)

    # Ensure proper integer identifiers
    user_enc = LabelEncoder()
    item_enc = LabelEncoder()
    cat_enc = LabelEncoder()
    desc_enc = LabelEncoder()
    title_enc = LabelEncoder()
    brand_enc = LabelEncoder()
    ratings['user'] = user_enc.fit_transform(ratings['user'])
    ratings['item'] = item_enc.fit_transform(ratings['item'])
    cat['item'] = item_enc.transform(cat['item'])
    cat['cat'] = cat_enc.fit_transform(cat['cat'].astype(str))
    desc['item'] = item_enc.transform(desc['item'])
    desc['desc'] = desc_enc.fit_transform(desc['desc'])
    title['item'] = item_enc.transform(title['item'])
    title['title'] = title_enc.fit_transform(title['title'])
    brand['item'] = item_enc.transform(brand['item'])
    brand['brand'] = brand_enc.fit_transform(brand['brand'])

    # Generate Metadata-to-item mapping
    n_items = ratings['item'].max() + 1
    X_cat = util.generate_csr_matrix(cat, 'cat', n_items)
    X_desc = util.generate_csr_matrix(desc, 'desc', n_items)
    X_title = util.generate_csr_matrix(title, 'title', n_items)
    X_brand = util.generate_csr_matrix(brand, 'brand', n_items)
    X_meta = vstack((X_cat, X_desc, X_title, X_brand))

    # Make the output directory if necessary
    os.makedirs(out_dir, exist_ok=True)

    # Write out metadata-item matrix
    print(datetime.now(), 'Writing out metadata-item matrix...')
    save_npz(os.path.join(out_dir, 'X_meta.npz'), X_meta)

    print(datetime.now(), 'Train-validation-test split...')
    X_train, _, val_dict, _, test_dict = util.train_val_test_split_Karypis(ratings)

    # Write out user-item matrix and held-out dictionaries
    print(datetime.now(), 'Writing out training, validation and test data...')
    save_npz(os.path.join(out_dir, 'X_train.npz'), X_train)
    with open(os.path.join(out_dir, 'val_dict.pkl'), 'wb') as handle:
        pickle.dump(val_dict, handle)
    with open(os.path.join(out_dir, 'test_dict.pkl'), 'wb') as handle:
        pickle.dump(test_dict, handle)
    print(datetime.now(), 'Finished!')
def _extract_tokens(meta, column, minsup, maxsup):
    ''' Explode the free-text `column` of `meta` into unique (item, token) pairs.

    The text is lowercased, punctuation is replaced by spaces and the result is
    split on single spaces. Only tokens that occur for at least `minsup` and at
    most `maxsup` distinct items are kept. `meta[column]` is overwritten with
    the cleaned text (as the original script did).
    '''
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    # Missing metadata becomes an empty string instead of the literal token 'nan'
    meta[column] = meta[column].fillna('').apply(lambda s: str(s).lower().translate(table))
    tokens = pd.DataFrame(meta[column].str.split(' ').tolist(), index = meta.item).stack().reset_index([0, 'item'])
    tokens.columns = ['item', column]
    tokens.drop_duplicates(inplace = True)
    tokens.dropna(inplace = True)
    # split(' ') yields '' (never ' ') for consecutive spaces - drop those blanks
    tokens = tokens.loc[tokens[column] != '']
    # Number of distinct items every token occurs for
    word2count = tokens.groupby(column)['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
    word2count = word2count[word2count['count'] >= minsup]
    word2count = word2count[word2count['count'] <= maxsup]
    tokens = tokens.merge(word2count[[column]], on = column, how = 'right')
    print(tokens[column].value_counts())
    return tokens


if __name__ == '__main__':
    # Commandline arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', type = str, help = 'Directory containing the data')
    args = parser.parse_args()

    # Fix seed for reproducibility
    np.random.seed(42)

    # Load rating data
    print(datetime.now(), 'Loading in ratings...')
    ratings = pd.read_csv(os.path.join(args.dir, 'reviews_Sports_and_Outdoors_5.csv'))[['reviewerID','asin','overall']]
    ratings.columns = ['user', 'item', 'rating']
    ratings = ratings.loc[ratings.rating > 3.0]

    # Only keep users who have rated at least 5 items
    # (rename_axis/reset_index(name=) yields ['user', 'count'] on every pandas version)
    user_counts = ratings['user'].value_counts().rename_axis('user').reset_index(name = 'count')
    user_counts = user_counts.loc[user_counts['count'] >= 5]
    ratings = ratings.merge(user_counts, on = 'user', how = 'right').drop('count', axis = 1)
    print('\t{0:8} ratings'.format(ratings.shape[0]))
    print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))

    # Load in metadata
    meta = pd.read_csv(os.path.join(args.dir, 'meta_Sports_and_Outdoors.csv'))[['asin','description','categories','title','brand']]
    meta.columns = ['item','desc','cat','title','brand']

    # We only want metadata for items we have ratings for
    meta = meta.merge(ratings[['item']].drop_duplicates(), how = 'right', on = 'item')
    minsup = 3
    maxsup = ratings['item'].nunique() // 4

    # Clean up categorical strings (items without metadata get an empty string)
    meta['cat'] = meta['cat'].fillna('').apply(lambda s: s.replace('[','').replace(']','').replace('\'','').strip())
    cat = pd.DataFrame(meta.cat.str.split(',').tolist(), index = meta.item).stack().reset_index([0, 'item'])
    cat.columns = ['item', 'cat']
    cat['cat'] = cat['cat'].apply(lambda s: s.strip())
    cat.drop_duplicates(inplace=True)
    cat = cat.loc[cat.cat != 'Sports & Outdoors'] # Appears too often
    # After strip() a blank category is '' (never ' ')
    cat = cat.loc[cat.cat != '']
    # Categories linked to a single item would not affect the Gram-matrix
    cat2count = cat.groupby('cat')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
    cat2count = cat2count[cat2count['count'] >= 2]
    cat = cat.merge(cat2count[['cat']], on = 'cat', how = 'right')
    print(cat['cat'].value_counts())

    # Clean up description, title and brand strings
    desc = _extract_tokens(meta, 'desc', minsup, maxsup)
    title = _extract_tokens(meta, 'title', minsup, maxsup)
    brand = _extract_tokens(meta, 'brand', minsup, maxsup)

    # Ensure proper integer identifiers
    user_enc = LabelEncoder()
    item_enc = LabelEncoder()
    cat_enc = LabelEncoder()
    desc_enc = LabelEncoder()
    title_enc = LabelEncoder()
    brand_enc = LabelEncoder()
    ratings['user'] = user_enc.fit_transform(ratings['user'])
    ratings['item'] = item_enc.fit_transform(ratings['item'])
    cat['item'] = item_enc.transform(cat['item'])
    cat['cat'] = cat_enc.fit_transform(cat['cat'].astype(str))
    desc['item'] = item_enc.transform(desc['item'])
    desc['desc'] = desc_enc.fit_transform(desc['desc'])
    title['item'] = item_enc.transform(title['item'])
    title['title'] = title_enc.fit_transform(title['title'])
    brand['item'] = item_enc.transform(brand['item'])
    brand['brand'] = brand_enc.fit_transform(brand['brand'])

    # Generate Metadata-to-item mapping
    n_items = ratings['item'].max() + 1
    X_cat = util.generate_csr_matrix(cat, 'cat', n_items)
    X_desc = util.generate_csr_matrix(desc, 'desc', n_items)
    X_title = util.generate_csr_matrix(title, 'title', n_items)
    X_brand = util.generate_csr_matrix(brand, 'brand', n_items)
    X_meta = vstack((X_cat,X_desc,X_title,X_brand))

    # Make the output directory if necessary
    out_dir = os.path.join(args.dir, 'preprocessed')
    os.makedirs(out_dir, exist_ok = True)

    # Write out metadata-item matrix
    print(datetime.now(), 'Writing out metadata-item matrix...')
    save_npz(os.path.join(out_dir, 'X_meta.npz'), X_meta)

    # Leave-one-out split (util.py names this splitter train_val_test_split_loocb)
    print(datetime.now(), 'Train-validation-test split...')
    X_train, _, val_dict, _, test_dict = util.train_val_test_split_loocb(ratings)

    # Write out user-item matrix and held-out dictionaries
    print(datetime.now(), 'Writing out training, validation and test data...')
    save_npz(os.path.join(out_dir, 'X_train.npz'), X_train)
    with open(os.path.join(out_dir, 'val_dict.pkl'), 'wb') as handle:
        pickle.dump(val_dict, handle)
    with open(os.path.join(out_dir, 'test_dict.pkl'), 'wb') as handle:
        pickle.dump(test_dict, handle)
    print(datetime.now(), 'Finished!')
def _explode_crew(crew, source, target):
    ''' Turn the comma-separated ids in crew[source] into (item, target) pairs.

    The '\\N' placeholder for "unknown" is dropped, as are ids linked to a
    single item (they would not affect the Gram-matrix).
    '''
    pairs = pd.DataFrame(crew[source].str.split(',').tolist(), index = crew.item).stack().reset_index([0, 'item'])
    pairs.columns = ['item', target]
    pairs = pairs.loc[pairs[target] != '\\N']
    counts = pairs.groupby(target)['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
    counts = counts[counts['count'] >= 2]
    return pairs.merge(counts[[target]], on = target, how = 'right')


if __name__ == '__main__':
    # Commandline arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', type = str, help = 'Directory containing the data')
    parser.add_argument('--test_users', type = int, default = 40000)
    args = parser.parse_args()

    # Fix seed for reproducibility
    np.random.seed(42)

    # Load rating data
    print(datetime.now(), 'Loading in ratings...')
    ratings = pd.read_csv(os.path.join(args.dir, 'netflix_ratings.csv'))
    ratings.columns = ['user', 'item', 'rating', 'time']

    # Preprocessing as in Liang et al. @ WWW 2018
    # Only keep ratings of 4 or higher
    ratings = ratings.loc[ratings.rating >= 4]
    # Only keep users who have rated at least 5 movies
    # (rename_axis/reset_index(name=) yields ['user', 'count'] on every pandas version)
    user_counts = ratings['user'].value_counts().rename_axis('user').reset_index(name = 'count')
    user_counts = user_counts.loc[user_counts['count'] >= 5]
    ratings = ratings.merge(user_counts, on = 'user', how = 'right').drop('count', axis = 1)
    print('\t{0:8} ratings'.format(ratings.shape[0]))
    print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))

    # Load side info
    print(datetime.now(), 'Loading in side-info...')
    ####################
    # SERIES AND YEARS #
    ####################
    # Load in data
    series = pd.read_csv(os.path.join(args.dir, 'netflixid2series.csv'))
    # NOTE(review): series is not restricted to rated movies here, yet
    # item_enc.transform below raises on unseen items - presumably the file
    # only lists rated movies; confirm before re-enabling the filter:
    #series = series.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right')

    # Load in data
    years = pd.read_csv(os.path.join(args.dir, 'netflixid2year.csv'))
    # Drop movies that don't appear in preference data
    years = years.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right')

    ########
    # CREW #
    ########
    # Load IMDB data links with Netflix ids
    links = pd.read_csv(os.path.join(args.dir, 'netflixid2imdbid.csv'))

    # Side info - genres
    side = pd.read_csv(os.path.join(args.dir, 'ml-imdb_sideinfo.csv'))[['imdb_title_id','genre']]
    side.columns = ['imdb_id', 'genre']
    side = side.merge(links, on = 'imdb_id', how = 'right')

    # Extract genres
    genres = pd.DataFrame(side.genre.str.split(',').tolist(), index = side.item).stack().reset_index([0, 'item'])
    genres.columns = ['item', 'genre']
    genres = genres.loc[genres.genre != '\\N']

    # Load IMDB crew data and link it properly
    crew = pd.read_csv(os.path.join(args.dir, 'imdb_crew_info.csv'))
    crew.columns = ['imdb_id', 'directors', 'writers']
    crew = crew.merge(links, on = 'imdb_id', how = 'right')

    # We don't care about movies without ratings
    crew = crew.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right')[['item','directors','writers']]
    # Movies without crew info get the '\N' placeholder (filtered out below)
    # rather than the literal string 'nan', which would act as a shared tag
    crew['directors'] = crew['directors'].fillna('\\N').apply(str)
    crew['writers'] = crew['writers'].fillna('\\N').apply(str)

    # Extract directors and writers, dropping those linked to fewer than two movies
    directors = _explode_crew(crew, 'directors', 'director')
    writers = _explode_crew(crew, 'writers', 'writer')

    # Ensure proper integer identifiers
    user_enc = LabelEncoder()
    item_enc = LabelEncoder()
    year_enc = LabelEncoder()
    genre_enc = LabelEncoder()
    direc_enc = LabelEncoder()
    write_enc = LabelEncoder()
    ratings['user'] = user_enc.fit_transform(ratings['user'])
    ratings['item'] = item_enc.fit_transform(ratings['item'])
    years['item'] = item_enc.transform(years['item'])
    years['year'] = year_enc.fit_transform(years['year'])
    series['item'] = item_enc.transform(series['item'])
    genres['item'] = item_enc.transform(genres['item'])
    genres['genre'] = genre_enc.fit_transform(genres['genre'])
    directors['item'] = item_enc.transform(directors['item'])
    directors['director'] = direc_enc.fit_transform(directors['director'])
    writers['item'] = item_enc.transform(writers['item'])
    writers['writer'] = write_enc.fit_transform(writers['writer'])

    # Generate Metadata-to-item mapping
    n_items = ratings['item'].max() + 1
    X_years = util.generate_csr_matrix(years, 'year', n_items)
    X_series = util.generate_csr_matrix(series, 'title_id', n_items)
    X_genres = util.generate_csr_matrix(genres, 'genre', n_items)
    X_directors = util.generate_csr_matrix(directors, 'director', n_items)
    X_writers = util.generate_csr_matrix(writers, 'writer', n_items)
    X_meta = vstack((X_years, X_series, X_genres, X_directors, X_writers))

    # Make the output directory if necessary
    out_dir = os.path.join(args.dir, 'preprocessed')
    os.makedirs(out_dir, exist_ok = True)

    # Write out metadata-item matrix
    print(datetime.now(), 'Writing out metadata-item matrix...')
    save_npz(os.path.join(out_dir, 'X_meta.npz'), X_meta)

    # Train - validation - test split (strong generalisation; util.py names
    # this splitter train_val_test_split_strong)
    print(datetime.now(), 'Train-validation-test split...')
    X_train, X_val, val_dict, X_test, test_dict = util.train_val_test_split_strong(ratings, n_test_users = args.test_users)

    # Write out validation and test data
    print(datetime.now(), 'Writing out validation and test data...')
    save_npz(os.path.join(out_dir, 'X_val.npz'), X_val)
    with open(os.path.join(out_dir, 'val_dict.pkl'), 'wb') as handle:
        pickle.dump(val_dict, handle)
    save_npz(os.path.join(out_dir, 'X_test.npz'), X_test)
    with open(os.path.join(out_dir, 'test_dict.pkl'), 'wb') as handle:
        pickle.dump(test_dict, handle)

    # Write out full user-item training matrix
    print(datetime.now(), 'Writing out train data...')
    save_npz(os.path.join(out_dir, 'X_train.npz'), X_train)

    # Subsample training data on a user-level: every fraction is a prefix of
    # one shuffled user list, so smaller subsets are nested in larger ones
    print(datetime.now(), 'Subsampling training users...')
    train_users = np.unique(X_train.nonzero()[0])
    np.random.shuffle(train_users)
    for frac_train_users in [0.01, .05, .1, .25, .5]:
        subset = train_users[:int(frac_train_users * len(train_users))]
        pd.DataFrame(subset, columns = ['user']).to_csv(os.path.join(out_dir, 'train_users_{}.csv'.format(frac_train_users)), index = False)
    print(datetime.now(), 'Finished!')
def _explode_crew(crew, source, target):
    ''' Turn the comma-separated ids in crew[source] into (item, target) pairs.

    The '\\N' placeholder for "unknown" is dropped, as are ids linked to a
    single item (they would not affect the Gram-matrix).
    '''
    pairs = pd.DataFrame(crew[source].str.split(',').tolist(), index = crew.item).stack().reset_index([0, 'item'])
    pairs.columns = ['item', target]
    pairs = pairs.loc[pairs[target] != '\\N']
    counts = pairs.groupby(target)['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
    counts = counts[counts['count'] >= 2]
    return pairs.merge(counts[[target]], on = target, how = 'right')


if __name__ == '__main__':
    # Commandline arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', type = str, help = 'Directory containing the data')
    parser.add_argument('--test_users', type = int, default = 10000)
    args = parser.parse_args()

    # Fix seed for reproducibility
    np.random.seed(42)

    # Load rating data
    print(datetime.now(), 'Loading in ratings...')
    ratings = pd.read_csv(os.path.join(args.dir, 'ml-20m_ratings.csv'))
    ratings.columns = ['user', 'item', 'rating', 'time']

    # Preprocessing as in Liang et al. @ WWW 2018
    # Only keep ratings of 4 or higher
    ratings = ratings.loc[ratings.rating >= 4]
    # Only keep users who have rated at least 5 movies
    # (rename_axis/reset_index(name=) yields ['user', 'count'] on every pandas version)
    user_counts = ratings['user'].value_counts().rename_axis('user').reset_index(name = 'count')
    user_counts = user_counts.loc[user_counts['count'] >= 5]
    ratings = ratings.merge(user_counts, on = 'user', how = 'right').drop('count', axis = 1)
    print('\t{0:8} ratings'.format(ratings.shape[0]))
    print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))

    # Load side info
    print(datetime.now(), 'Loading in side-info...')
    ##########
    # GENRES #
    ##########
    # Load in data
    movies = pd.read_csv(os.path.join(args.dir, 'ml-20m_movies.csv'))
    movies.columns = ['item', 'title', 'genres']
    # Drop movies that don't appear in preference data
    movies = movies.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right')
    # Properly format
    genres = pd.DataFrame(movies.genres.str.split('|').tolist(), index = movies.item)\
                .stack()\
                .reset_index([0, 'item'])\
                .rename(columns = {0: 'genre'})
    # Drop nonsensical genres
    genres = genres.loc[genres.genre != '(no genres listed)']
    genres = genres.loc[genres.genre != 'IMAX']

    #########
    # YEARS #
    #########
    # Extract year: a four-digit number in parentheses, optionally followed by
    # a dash and an end year (raw string - same pattern, no invalid escapes)
    movies['year'] = movies['title'].str.extract(pat = r'\((\d\d\d\d)(?:[-–]\s*(?:\d\d\d\d)?)?\)')
    years = movies[['item','year']]
    # Drop years linked to fewer than two movies (wouldn't affect Gram-matrix)
    y2c = years.groupby('year')['item']\
                .apply(lambda x: len(set(x)))\
                .reset_index()\
                .rename(columns = {'item': 'count'})
    y2c = y2c[y2c['count'] >= 2]
    years = years.merge(y2c[['year']], on = 'year', how = 'right')

    ########
    # CREW #
    ########
    # Load IMDB data links with movielens
    links = pd.read_csv(os.path.join(args.dir, 'ml-imdb_links.csv'))[['movieId','imdbId']]
    links.columns = ['item', 'imdb_id']

    # Load IMDB crew data and link it properly
    crew = pd.read_csv(os.path.join(args.dir, 'imdb_crew_info.csv'))
    crew.columns = ['imdb_id', 'directors', 'writers']
    # Strip the two-character prefix of the IMDB id to match the integer ids in links
    crew['imdb_id'] = crew['imdb_id'].apply(lambda s: int(s[2:]))
    crew = crew.merge(links, on = 'imdb_id', how = 'right')

    # We don't care about movies without ratings
    crew = crew.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right')[['item','directors','writers']]
    # Movies without crew info get the '\N' placeholder (filtered out below)
    # rather than the literal string 'nan', which would act as a shared tag
    crew['directors'] = crew['directors'].fillna('\\N').apply(str)
    crew['writers'] = crew['writers'].fillna('\\N').apply(str)

    # Extract directors and writers, dropping those linked to fewer than two movies
    directors = _explode_crew(crew, 'directors', 'director')
    writers = _explode_crew(crew, 'writers', 'writer')

    # Ensure proper integer identifiers
    user_enc = LabelEncoder()
    item_enc = LabelEncoder()
    genre_enc = LabelEncoder()
    year_enc = LabelEncoder()
    direc_enc = LabelEncoder()
    write_enc = LabelEncoder()
    ratings['user'] = user_enc.fit_transform(ratings['user'])
    ratings['item'] = item_enc.fit_transform(ratings['item'])
    genres['item'] = item_enc.transform(genres['item'])
    genres['genre'] = genre_enc.fit_transform(genres['genre'])
    years['item'] = item_enc.transform(years['item'])
    years['year'] = year_enc.fit_transform(years['year'])
    directors['item'] = item_enc.transform(directors['item'])
    directors['director'] = direc_enc.fit_transform(directors['director'])
    writers['item'] = item_enc.transform(writers['item'])
    writers['writer'] = write_enc.fit_transform(writers['writer'])

    # Generate Metadata-to-item mapping
    n_items = ratings['item'].max() + 1
    X_genres = util.generate_csr_matrix(genres, 'genre', n_items)
    X_years = util.generate_csr_matrix(years, 'year', n_items)
    X_directors = util.generate_csr_matrix(directors, 'director', n_items)
    X_writers = util.generate_csr_matrix(writers, 'writer', n_items)
    X_meta = vstack((X_genres,X_years,X_directors,X_writers))

    # Make the output directory if necessary
    out_dir = os.path.join(args.dir, 'preprocessed')
    os.makedirs(out_dir, exist_ok = True)

    # Write out metadata-item matrix
    print(datetime.now(), 'Writing out metadata-item matrix...')
    save_npz(os.path.join(out_dir, 'X_meta.npz'), X_meta)

    # Train - validation - test split (strong generalisation; util.py names
    # this splitter train_val_test_split_strong)
    print(datetime.now(), 'Train-validation-test split...')
    X_train, X_val, val_dict, X_test, test_dict = util.train_val_test_split_strong(ratings, n_test_users = args.test_users)

    # Write out validation and test data
    print(datetime.now(), 'Writing out validation and test data...')
    save_npz(os.path.join(out_dir, 'X_val.npz'), X_val)
    with open(os.path.join(out_dir, 'val_dict.pkl'), 'wb') as handle:
        pickle.dump(val_dict, handle)
    save_npz(os.path.join(out_dir, 'X_test.npz'), X_test)
    with open(os.path.join(out_dir, 'test_dict.pkl'), 'wb') as handle:
        pickle.dump(test_dict, handle)

    # Write out full user-item training matrix
    print(datetime.now(), 'Writing out train data...')
    save_npz(os.path.join(out_dir, 'X_train.npz'), X_train)

    # Subsample training data on a user-level: every fraction is a prefix of
    # one shuffled user list, so smaller subsets are nested in larger ones
    print(datetime.now(), 'Subsampling training users...')
    train_users = np.unique(X_train.nonzero()[0])
    np.random.shuffle(train_users)
    for frac_train_users in [0.01, .05, .1, .25, .5]:
        subset = train_users[:int(frac_train_users * len(train_users))]
        pd.DataFrame(subset, columns = ['user']).to_csv(os.path.join(out_dir, 'train_users_{}.csv'.format(frac_train_users)), index = False)
    print(datetime.now(), 'Finished!')
# From https://stackoverflow.com/questions/24455615/python-how-to-display-size-of-all-variables
def sizeof_fmt(num, suffix='B'):
    ''' Format a byte count with binary prefixes (Ki, Mi, ...).
        by Fred Cirera, https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    # Anything that is still >= 1024 Zi is reported in Yi
    return "%.1f %s%s" % (num, 'Yi', suffix)

def pretty_print_results(results):
    ''' Pretty print results: results[0] maps K to Recall@K, results[1] maps K to NDCG@K. '''
    for header, metric in (('\tRecall@K', results[0]), ('\tNDCG@K', results[1])):
        print(header)
        for K, value in metric.items():
            print('\t', K,'\t', value)

def generate_csr_matrix(meta_df, colname, ncols, alpha = 1.):
    ''' Generate Metadata-to-item mapping in the form of a CSR matrix.

    Rows are the integer ids in meta_df[colname], columns the integer ids in
    meta_df['item']; every (row, col) pair gets weight alpha (pairs that occur
    more than once are summed by the CSR constructor). The result has
    max(meta_df[colname]) + 1 rows and ncols columns; an empty meta_df yields
    an empty (0, ncols) matrix instead of failing on int(NaN).
    '''
    if meta_df.shape[0] == 0:
        return csr_matrix((0, int(ncols)))
    data = np.ones(meta_df.shape[0]) * alpha
    rows, cols = meta_df[colname].values, meta_df['item'].values
    nrows = meta_df[colname].max() + 1
    return csr_matrix((data, (rows, cols)), shape = (int(nrows), int(ncols)))

def normalize_idf(X):
    ''' Scale every row of X by a log-normalised, smoothed inverse document frequency.

    Row r is multiplied by 1 / log(c_r - c_min + 2), with c_r the sum of row r
    and c_min the smallest row sum, so the rarest row is scaled by 1 / log(2).
    Works for integer as well as floating point matrices.
    '''
    if X.shape[0] == 0:
        return csr_matrix(X)
    # Cast to float first: the in-place shift below is not allowed on integer sums
    row_counts = np.asarray(X.sum(axis = 1), dtype = np.float64).ravel()
    # Shift so that the smallest count becomes 2 (log(2) > 0 - no division by zero)
    row_counts -= (row_counts.min() - 2.0)
    idf = 1.0 / np.log(row_counts)
    # Sparse diagonal scaling matrix - a dense np.diag would need O(n^2) memory
    diagonal = np.arange(X.shape[0])
    scaling = csr_matrix((idf, (diagonal, diagonal)), shape = (X.shape[0], X.shape[0]))
    return scaling @ X

def compute_sparsity(A):
    ''' Compute the sparsity level (fraction of zero entries, between 0 and 1) of matrix A.
        A can be a dense array or a scipy sparse matrix. '''
    # scipy sparse matrices expose count_nonzero() as a method, numpy arrays do not
    nonzeros = A.count_nonzero() if hasattr(A, 'count_nonzero') else np.count_nonzero(A)
    return 1.0 - nonzeros / (A.shape[0] * A.shape[1])

def sparsify(B, rho = .95):
    ''' Get B to the required sparsity level by zeroing the fraction rho of entries with the lowest absolute value.
        B is modified in place and returned. '''
    min_val = np.quantile(np.abs(B), rho)
    B[np.abs(B) < min_val] = .0
    return B
def compute_EASE(X, l2 = 5e2):
    ''' Compute a closed-form OLS SLIM-like item-based model. (H. Steck @ WWW 2019) '''
    n_items = X.shape[1]
    # Regularised Gram-matrix and its inverse
    gram = X.T @ X + l2 * np.identity(n_items)
    weights = np.linalg.inv(gram)
    # Divide every column by minus its diagonal entry
    weights /= -np.diag(weights)
    # An item is never used to predict itself
    np.fill_diagonal(weights, .0)
    return weights

def compute_cosine(X):
    ''' Compute a cosine similarity item-based model. '''
    # All item-item dot products
    gram = X.T.dot(X).toarray()
    # The diagonal holds the squared magnitude of every item vector
    inv_square_norm = 1 / np.diag(gram)
    # Items that never occur get an inverse magnitude of zero (instead of inf)
    inv_square_norm[np.isinf(inv_square_norm)] = 0
    inv_norm = np.sqrt(inv_square_norm)
    # Scale by the inverse magnitude along both axes
    cosine = (gram * inv_norm).T * inv_norm
    cosine[np.diag_indices(X.shape[1])] = .0
    return cosine

def generate_eval_format(ratings, nrows, ncols, hist_frac = .8):
    ''' Split 'ratings' into a historical user-item matrix and a held-out user-to-items dictionary. '''
    # Per user, hold out a random fraction (1 - hist_frac) of the ratings
    held_out_frac = 1. - hist_frac
    sampled = ratings.groupby('user').apply(lambda df: df.sample(frac = held_out_frac))
    test_ratings = sampled.reset_index(drop = True)
    # Rows that occur in both frames are the held-out ones - what is left is the history
    hist_ratings = pd.concat([test_ratings, ratings]).drop_duplicates(keep = False)

    # User-item matrix for the history
    X_hist = csr_matrix(
        (np.ones(hist_ratings.shape[0]), (hist_ratings['user'], hist_ratings['item'])),
        shape = (nrows, ncols)
    )

    # Dictionary for the hold-out (fast lookup)
    test_dict = defaultdict(set)
    for held_out in test_ratings.itertuples():
        test_dict[held_out.user].add(held_out.item)

    return X_hist, test_dict
def train_val_test_split_strong(ratings, n_test_users = 10000, hist_frac = .8, n_train_users = 0):
    ''' Split into train/validation/test ratings for strong generalisation.
        i.e. unseen users during training time

    n_test_users validation users and n_test_users test users are drawn without
    replacement from the ids 0 .. max(user); all remaining users are training
    users. For validation and test users a fraction hist_frac of the ratings is
    kept as history, the rest is held out. If n_train_users is non-zero, only
    that many randomly drawn training users are kept.

    Returns X_train, X_val, val_dict, X_test, test_dict.
    '''
    # Sample validation and testing users without replacement
    val_test_users = np.random.choice(ratings['user'].max() + 1, size = n_test_users * 2, replace = False)
    val_users = val_test_users[:n_test_users]
    test_users = val_test_users[n_test_users:]

    # Extract ratings for these users from the full set
    val_ratings = ratings.merge(pd.DataFrame(val_users, columns = ['user']), how = 'right')
    test_ratings = ratings.merge(pd.DataFrame(test_users, columns = ['user']), how = 'right')
    # Everything that is neither a validation nor a test rating
    train_ratings = pd.concat([test_ratings, val_ratings, ratings]).drop_duplicates(keep = False)

    # Split into historical and held-out sets
    nrows, ncols = ratings['user'].max() + 1, ratings['item'].max() + 1
    X_val, val_dict = generate_eval_format(val_ratings, nrows, ncols, hist_frac = hist_frac)
    X_test, test_dict = generate_eval_format(test_ratings, nrows, ncols, hist_frac = hist_frac)

    # Subsample training data if specified
    if n_train_users:
        # Randomly sample training users - only keep their ratings.
        # Sample from the *unique* users: sampling rating rows would return the
        # same user several times, and the merge would then duplicate ratings.
        train_users = train_ratings[['user']].drop_duplicates().sample(n = n_train_users)
        train_ratings = train_ratings.merge(train_users, on = 'user', how = 'right')

    # Generate historical matrix for training ratings
    X_train, _ = generate_eval_format(train_ratings, nrows, ncols, hist_frac = 1.)

    return X_train, X_val, val_dict, X_test, test_dict
def train_val_test_split_loocb(ratings, n_train_users = 0):
    ''' Split into train/validation/test ratings via leave-one-out.

    For every user one random rating is held out for validation and, from what
    is left, one for testing; the remaining ratings form the history. If
    n_train_users is non-zero, the training matrix only holds the ratings of
    that many randomly drawn users.

    Returns X_train, X_hist, val_dict, X_hist, test_dict (the same history
    matrix serves validation and test).
    '''
    # For every user - randomly sample a single item for test and validation
    val_ratings = ratings.groupby('user').apply(lambda df: df.sample(1)).reset_index(drop = True)
    rest_ratings = pd.concat([val_ratings, ratings]).drop_duplicates(keep = False)

    test_ratings = rest_ratings.groupby('user').apply(lambda df: df.sample(1)).reset_index(drop = True)
    train_ratings = pd.concat([test_ratings, rest_ratings]).drop_duplicates(keep = False)

    # Generate historical matrix and held-out dictionaries
    nrows, ncols = ratings['user'].max() + 1, ratings['item'].max() + 1
    X_hist, _ = generate_eval_format(train_ratings, nrows, ncols, hist_frac = 1.)
    _, val_dict = generate_eval_format(val_ratings, nrows, ncols, hist_frac = 0.)
    _, test_dict = generate_eval_format(test_ratings, nrows, ncols, hist_frac = 0.)

    # Subsample training data if specified
    if n_train_users:
        # Randomly sample training users - only keep their ratings.
        # Sample from the *unique* users: sampling rating rows would return the
        # same user several times, and the merge would then duplicate ratings.
        train_users = train_ratings[['user']].drop_duplicates().sample(n = n_train_users)
        train_ratings = train_ratings.merge(train_users, on = 'user', how = 'right')

    # Generate historical matrix for training ratings
    X_train, _ = generate_eval_format(train_ratings, nrows, ncols, hist_frac = 1.)

    return X_train, X_hist, val_dict, X_hist, test_dict
def evaluate(X, scores, test, k_values = (1, 5, 10, 20, 50, 100), compute_item_counts = True):
    ''' Evaluate predicted 'scores' against the user to held-out item dictionary 'test'.

    X is the historical user-item matrix; it is not used by this function.
    Row i of 'scores' is matched with the i-th entry of 'test' (in iteration
    order) - assumes 'scores' is a 2-D (users x items) array or np.matrix.
    Returns (recall, NDCG, item2count): two dictionaries mapping K to the mean
    Recall@K / NDCG@K over all users, and the number of times every item
    appears in a top-K list (K = 100 when it is in k_values, otherwise the
    largest K). When compute_item_counts is False, item2count is an empty
    1 x n_items sparse matrix.
    '''
    k_values = list(k_values)
    # Placeholder for results
    recall = defaultdict(float)
    NDCG = defaultdict(float)
    item2count = csr_matrix((1, scores.shape[1]))

    # Nothing to evaluate
    if not test or not k_values:
        return recall, NDCG, item2count

    # Cut-off for which recommended items are counted (long-tail analysis)
    count_K = 100 if 100 in k_values else max(k_values)

    # Function per user to parallellise
    def evaluate_user(scores, items, k_values = k_values):
        # Placeholder for results per user
        item2count = None
        recall = []
        NDCG = []
        # Top-K for multiple K's
        for K in k_values:
            ##########
            # RECALL #
            ##########
            # Extract top-K highest scores into a set
            topK_list = np.argpartition(scores, -K)[-K:]
            topK_set = set(topK_list)
            # Compute recall (normalised by the best achievable number of hits)
            recall.append(len(topK_set.intersection(items)) / min(K, len(items)))
            ########
            # NDCG #
            ########
            # Extract top-K highest scores into a sorted list
            topK_list = topK_list[np.argsort(scores[topK_list])][::-1]
            # Compute NDCG discount template
            discount_template = 1. / np.log2(np.arange(2, K + 2))
            # Compute ideal DCG
            IDCG = discount_template[:min(K, len(items))].sum()
            # Compute DCG
            DCG = sum((discount_template[rank] * (item in items)) for rank, item in enumerate(topK_list))
            # Normalise and store
            NDCG.append(DCG / IDCG)
            #############
            # LONG TAIL #
            #############
            if K == count_K:
                item2count = coo_matrix(([1] * K,([0] * K,topK_list)), shape = (1, scores.shape[0]))
        # Stack batches
        return recall + NDCG, item2count

    # Parallellise over users; every row is flattened to a 1-D array
    # (works for np.matrix as well as plain ndarray scores)
    val = Parallel(n_jobs=-1)(delayed(evaluate_user)(np.asarray(scores[new_row,:]).ravel(), items, k_values) for new_row, items in tqdm(enumerate(test.values()), total = len(test)))
    if compute_item_counts:
        # Properly extract evaluation metrics and item counts for analysis
        item2counts = [v[1] for v in val]
        item2count = vstack(item2counts).sum(axis=0).A1
    # Merge evaluation-metrics per user
    val = [v[0] for v in val]
    val = np.vstack(val)
    for idx, K in enumerate(k_values):
        recall[K] = np.mean(val[:,idx])
        NDCG[K] = np.mean(val[:,idx+len(k_values)])
    return recall, NDCG, item2count
| torch.manual_seed(42) 41 | tf.set_random_seed(42) 42 | 43 | # Check whether output directory already exists - make it if necessary 44 | if not os.path.exists(args.dir + 'preprocessed/'): 45 | print('Directory {0} not found.\nPlease run the accompanying preprocessing script first.'.format(args.dir + 'preprocessed/')) 46 | exit(1) 47 | 48 | print('Directory with data:', args.dir) 49 | print('Models to evaluate:', args.algorithms) 50 | print('Evaluation style:', args.eval_style ) 51 | 52 | # Load everything you need 53 | print(datetime.now(), 'Loading in data...') 54 | X_meta = load_npz(args.dir + 'preprocessed/X_meta.npz').astype(np.int32) 55 | X_train = load_npz(args.dir + 'preprocessed/X_train.npz').astype(np.int32) 56 | if args.eval_style == 'strong': 57 | X_val = load_npz(args.dir + 'preprocessed/X_val.npz').astype(np.int32) 58 | X_test = load_npz(args.dir + 'preprocessed/X_test.npz').astype(np.int32) 59 | elif args.eval_style == 'LOOCV': 60 | X_val = X_train 61 | X_test = X_train 62 | else: 63 | print('Unknown evaluation style, aborting...') 64 | exit(1) 65 | 66 | with open(args.dir + 'preprocessed/val_dict.pkl', 'rb') as handle: 67 | val_dict = pickle.load(handle) 68 | with open(args.dir + 'preprocessed/test_dict.pkl', 'rb') as handle: 69 | test_dict = pickle.load(handle) 70 | 71 | # Check whether output directory already exists - make it if necessary 72 | if not os.path.exists(args.dir + 'results/'): 73 | os.makedirs(args.dir + 'results/') 74 | 75 | # For every sampled subset of training users 76 | subsampling_fractions = args.frac_train_users if args.eval_style == 'strong' else [1.] 
77 | for frac_train_users in subsampling_fractions: 78 | frac_train_users = float(frac_train_users) 79 | print(datetime.now(), '---- Frac of train users:\t{} ----'.format(frac_train_users)) 80 | 81 | # Placeholder for results 82 | results = [] 83 | 84 | # Normally, we train on everything 85 | train_users = np.unique(X_train.nonzero()[0]) 86 | # But subsample when necessary 87 | if frac_train_users < 1.: 88 | # Read training users 89 | train_users = pd.read_csv(args.dir + 'preprocessed/train_users_{}.csv'.format(frac_train_users))['user'].values.astype(np.int32) 90 | # Only keep these rows in X_train 91 | X_train_subset = deepcopy(X_train[train_users,:]) 92 | 93 | print('\tTraining data # users:', X_train_subset.shape[0]) 94 | print('\tTraining data # prefs:', X_train_subset.count_nonzero()) 95 | print('\tSide-information # tags:', X_meta.shape[0]) 96 | print('\tSide-information # pairs:', X_meta.count_nonzero()) 97 | print('\tTraining and evaluating models...') 98 | 99 | for algo in args.algorithms: 100 | print('\t\t---- {0} ----'.format(algo)) 101 | if algo == 'cosine': 102 | ########################################### 103 | # 1. Item-kNN (cosine) (Sarwar, WWW 2001) # 104 | ########################################### 105 | # Get dictionary with results - 0 is recall, 1 is NDCG, 2 is item counts for analysis 106 | recall, ndcg, item2count = models.run_itemknn(X_train_subset, X_test, test_dict) 107 | util.pretty_print_results((recall, ndcg)) 108 | results.append({ 109 | 'Recall@20': recall[20], 110 | 'Recall@50': recall[50], 111 | 'NDCG@100': ndcg[100], 112 | 'frac_U': frac_train_users, 113 | 'Alg': 'Item-kNN' 114 | }) 115 | np.savez(args.dir + 'results/item_counts_{0}_{1}.npz'.format(algo, frac_train_users), item2count) 116 | 117 | elif algo == 'cvae': 118 | #################################### 119 | # 2. 
cVAE (Chen, DLRS@RecSys 2018) # 120 | #################################### 121 | # Get dictionary with results - 0 is recall, 1 is NDCG, 2 is item counts for analysis 122 | recall, ndcg, _ = models.run_cVAE(X_train_subset, X_meta, X_val, X_test, val_dict, test_dict) 123 | util.pretty_print_results((recall, ndcg)) 124 | results.append({ 125 | 'Recall@20': recall[20], 126 | 'Recall@50': recall[50], 127 | 'NDCG@100': ndcg[100], 128 | 'frac_U': frac_train_users, 129 | 'Alg': 'cVAE' 130 | }) 131 | 132 | elif algo == 'vlm': 133 | ################################################################### 134 | # 3.a VARIATONAL LOW-RANK MULTINOMIALS (Elahi, RecSys 2019) # 135 | ################################################################### 136 | # Original implementation in Tensorflow 137 | # Get dictionary with results - 0 is recall, 1 is NDCG, 2 is item counts for analysis 138 | print('------------------ WITH SIDE INFO ---------------') 139 | recall, ndcg, _ = models.run_VLM(X_train_subset, train_users, normalize(X_meta, norm = 'l1', axis = 0), X_val, X_test, val_dict, test_dict, side_info = True) 140 | util.pretty_print_results((recall, ndcg)) 141 | results.append({ 142 | 'Recall@20': recall[20], 143 | 'Recall@50': recall[50], 144 | 'NDCG@100': ndcg[100], 145 | 'frac_U': frac_train_users, 146 | 'Alg': 'VLM-Side' 147 | }) 148 | print('------------------ WITHOUT SIDE INFO ---------------') 149 | recall, ndcg, _ = models.run_VLM(X_train_subset, train_users, normalize(X_meta, norm = 'l1', axis = 0), X_val, X_test, val_dict, test_dict, side_info = False) 150 | util.pretty_print_results((recall, ndcg)) 151 | results.append({ 152 | 'Recall@20': recall[20], 153 | 'Recall@50': recall[50], 154 | 'NDCG@100': ndcg[100], 155 | 'frac_U': frac_train_users, 156 | 'Alg': 'VLM-NoSide' 157 | }) 158 | 159 | elif algo == 'vlm_pytorch': 160 | ################################################################### 161 | # 3.b VARIATONAL LOW-RANK MULTINOMIALS (Elahi, RecSys 2019) # 162 | 
################################################################### 163 | # Our implementation in PyTorch, generally quite a bit faster 164 | # Get dictionary with results - 0 is recall, 1 is NDCG, 2 is item counts for analysis 165 | print('------------------ WITH SIDE INFO ---------------') 166 | recall, ndcg, _ = models.run_VLM_PyTorch(X_train_subset, train_users, normalize(X_meta, norm = 'l1', axis = 0), X_val, X_test, val_dict, test_dict, side_info = True, eval_style = args.eval_style) 167 | util.pretty_print_results((recall, ndcg)) 168 | results.append({ 169 | 'Recall@20': recall[20], 170 | 'Recall@50': recall[50], 171 | 'NDCG@100': ndcg[100], 172 | 'frac_U': frac_train_users, 173 | 'Alg': 'VLM-PyTorch-Side' 174 | }) 175 | print('------------------ WITHOUT SIDE INFO ---------------') 176 | recall, ndcg, _ = models.run_VLM_PyTorch(X_train_subset, train_users, normalize(X_meta, norm = 'l1', axis = 0), X_val, X_test, val_dict, test_dict, side_info = False, eval_style = args.eval_style) 177 | util.pretty_print_results((recall, ndcg)) 178 | results.append({ 179 | 'Recall@20': recall[20], 180 | 'Recall@50': recall[50], 181 | 'NDCG@100': ndcg[100], 182 | 'frac_U': frac_train_users, 183 | 'Alg': 'VLM-PyTorch-NoSide' 184 | }) 185 | 186 | elif algo == 'slim': 187 | ######################################### 188 | # 4. 
(c)SLIM (Ning and Karypis, ICDM 2011) # 189 | ######################################### 190 | # Get dictionary with results - 0 is recall, 1 is NDCG, 2 is item counts for analysis 191 | print('------------------ WITHOUT SIDE INFO ---------------') 192 | recall, ndcg, _ = models.run_SLIM(X_train_subset, train_users, X_meta, X_val, X_test, val_dict, test_dict, side_info = False, eval_style = args.eval_style) 193 | util.pretty_print_results((recall, ndcg)) 194 | results.append({ 195 | 'Recall@20': recall[20], 196 | 'Recall@50': recall[50], 197 | 'NDCG@100': ndcg[100], 198 | 'frac_U': frac_train_users, 199 | 'Alg': 'SLIM' 200 | }) 201 | print('------------------ WITH SIDE INFO ---------------') 202 | recall, ndcg, _ = models.run_SLIM(X_train_subset, train_users, X_meta, X_val, X_test, val_dict, test_dict, side_info = True, eval_style = args.eval_style) 203 | util.pretty_print_results((recall, ndcg)) 204 | results.append({ 205 | 'Recall@20': recall[20], 206 | 'Recall@50': recall[50], 207 | 'NDCG@100': ndcg[100], 208 | 'frac_U': frac_train_users, 209 | 'Alg': 'cSLIM' 210 | }) 211 | 212 | elif algo == 'ease': 213 | ############################# 214 | ## 4. EASE (Steck, WWW 2019) # 215 | ############################# 216 | print(datetime.now(), '---- EASE ----') 217 | # Find optimal l2 on validation set via grid-search for optimal NDCG@100 218 | NDCG_values = [] 219 | optimal_model_EASE = None 220 | optimal_l2_value = None 221 | val_users = list(val_dict.keys()) 222 | # For every parameter combination 223 | for l2 in args.l2_values: 224 | # Compute the model 225 | start = time.perf_counter() 226 | model = util.compute_EASE(X_train_subset, l2 = int(l2)) 227 | end = time.perf_counter() 228 | print('\t... 
took {0} seconds!'.format(end - start)) 229 | # Evaluate the model 230 | val_scores = X_val[val_users,:] @ model - 987654321 * X_val[val_users,:] 231 | NDCG = util.evaluate(X_val, val_scores, val_dict)[1][100] 232 | NDCG_values.append(NDCG) 233 | print('\tL2:', l2, 'NDCG@100:', NDCG) 234 | if np.max(NDCG_values) == NDCG: 235 | optimal_model_EASE = model 236 | optimal_l2_value = int(l2) 237 | # Compute prediction scores for all test users - subtract already seen items 238 | test_users = list(test_dict.keys()) 239 | test_scores = X_test[test_users,:] @ optimal_model_EASE - 987654321 * X_test[test_users,:] 240 | recall, ndcg, item2count = util.evaluate(X_test, test_scores, test_dict) 241 | util.pretty_print_results((recall, ndcg)) 242 | results.append({ 243 | 'Recall@20': recall[20], 244 | 'Recall@50': recall[50], 245 | 'NDCG@100': ndcg[100], 246 | 'frac_U': frac_train_users, 247 | 'Alg': 'EASE' 248 | }) 249 | #np.savez(args.dir + 'results/item_counts_{0}_{1}.npz'.format('ease', frac_train_users), item2count) 250 | ############################ 251 | # 5. Add-EASE (contrib. 1) # 252 | ############################ 253 | print(datetime.now(), '---- Add-EASE ----') 254 | NDCG_values = [] 255 | optimal_model_ADDEASE = None 256 | # Compute EASE model on tag-item matrix 257 | side_model = util.compute_EASE(X_meta, l2 = optimal_l2_value) 258 | # For every parameter combination 259 | for alpha in args.alpha_values: 260 | # Blend 261 | model = (1. 
- float(alpha)) * optimal_model_EASE + float(alpha) * side_model 262 | # Evaluate the model 263 | val_scores = X_val[val_users,:] @ model - 987654321 * X_val[val_users,:] 264 | NDCG = util.evaluate(X_val, val_scores, val_dict)[1][100] 265 | NDCG_values.append(NDCG) 266 | print('\tAlpha:', alpha, 'L2:', optimal_l2_value, 'NDCG@100:', NDCG) 267 | if np.max(NDCG_values) == NDCG: 268 | optimal_model_ADDEASE = model 269 | # Compute prediction scores for all test users - subtract already seen items 270 | test_scores = X_test[test_users,:] @ optimal_model_ADDEASE - 987654321 * X_test[test_users,:] 271 | recall, ndcg, item2count = util.evaluate(X_test, test_scores, test_dict) 272 | util.pretty_print_results((recall, ndcg)) 273 | results.append({ 274 | 'Recall@20': recall[20], 275 | 'Recall@50': recall[50], 276 | 'NDCG@100': ndcg[100], 277 | 'frac_U': frac_train_users, 278 | 'Alg': 'Add-EASE' 279 | }) 280 | #np.savez(args.dir + 'results/item_counts_{0}_{1}.npz'.format('addease', frac_train_users), item2count) 281 | del optimal_model_EASE, side_model, optimal_model_ADDEASE 282 | 283 | ######################### 284 | # 6. cEASE (contrib. 
2) # 285 | ######################### 286 | print(datetime.now(), '---- cEASE ----') 287 | NDCG_values = [] 288 | optimal_model_CEASE = None 289 | # For every parameter combination 290 | for alpha in args.alpha_values: 291 | # Stack matrix 292 | X_full = vstack((X_train_subset, X_meta * float(alpha))) 293 | # Compute the model 294 | model = util.compute_EASE(X_full, l2 = optimal_l2_value) 295 | # Evaluate the model 296 | val_scores = X_val[val_users,:] @ model - 987654321 * X_val[val_users,:] 297 | NDCG = util.evaluate(X_val, val_scores, val_dict)[1][100] 298 | NDCG_values.append(NDCG) 299 | print('\tAlpha:', alpha, 'L2:', optimal_l2_value, 'NDCG@100:', NDCG) 300 | if np.max(NDCG_values) == NDCG: 301 | optimal_model_CEASE = model 302 | # Compute prediction scores for all test users - subtract already seen items 303 | test_scores = X_test[test_users,:] @ optimal_model_CEASE - 987654321 * X_test[test_users,:] 304 | recall, ndcg, item2count = util.evaluate(X_test, test_scores, test_dict) 305 | util.pretty_print_results((recall, ndcg)) 306 | results.append({ 307 | 'Recall@20': recall[20], 308 | 'Recall@50': recall[50], 309 | 'NDCG@100': ndcg[100], 310 | 'frac_U': frac_train_users, 311 | 'Alg': 'cEASE' 312 | }) 313 | #np.savez(args.dir + 'results/item_counts_{0}_{1}.npz'.format('cease', frac_train_users), item2count) 314 | del optimal_model_CEASE 315 | gc.collect() 316 | 317 | else: 318 | print('\t\t\tUnknown algorithm, skipping...') 319 | 320 | # Write out results 321 | pd.DataFrame(results).to_csv(args.dir + 'results/{0}_{1}.csv'.format('_'.join(args.algorithms),str(frac_train_users)), index = False) 322 | -------------------------------------------------------------------------------- /src/models.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import itertools 3 | import numpy as np 4 | import pandas as pd 5 | import util 6 | from scipy.sparse import lil_matrix, coo_matrix, csr_matrix, vstack 7 | 
# /src/models.py
from copy import deepcopy
import itertools
import numpy as np
import pandas as pd
import util
from scipy.sparse import lil_matrix, coo_matrix, csr_matrix, vstack
from sklearn.preprocessing import normalize
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import torch
import torch.utils.data
from tqdm import trange

from baselines.cvae.vae import VAE
from baselines.vlm.vlm import VLM
from baselines.vlm.vlm_pytorch import VLM_PyTorch

from SLIM import SLIM, SLIMatrix

# Constant subtracted from the score of every item a user already interacted
# with, so that those items end up far below any unseen item.
_SEEN_ITEM_PENALTY = 987654321


def _train_until_converged(model, optimizer, loader, batch_loss, desc, num_epochs, tol, patience):
    """Run the epoch loop shared by the PyTorch baselines, with early stopping.

    `batch_loss` maps one batch drawn from `loader` to a scalar loss tensor.
    Training stops once the epoch loss has not improved by more than `tol`
    for more than `patience` epochs, or after `num_epochs` epochs.
    """
    progress = trange(num_epochs, desc = desc)
    best_loss, best_epoch = np.inf, -1
    for epoch in progress:
        # Put the model into training mode
        model.train()
        loss_value = 0
        for batch in loader:
            # Clear gradients
            optimizer.zero_grad()
            # Forward pass and loss
            loss = batch_loss(batch)
            # Back-propagate
            loss.backward()
            loss_value += loss.item()
            optimizer.step()
        epoch_loss = loss_value / len(loader.dataset)
        progress.set_postfix(loss = epoch_loss)
        # Early stopping - are we improving by at least 'tol'?
        if (best_loss - epoch_loss) > tol:
            best_loss = epoch_loss
            best_epoch = epoch
        # If we're not improving, have we improved at all in the past 'patience' epochs?
        if (epoch - best_epoch) > patience:
            print('Converged after {0} epochs, stopping...'.format(epoch))
            break


def _read_slim_csr(filename, n_items):
    """Parse a model file written by SLIM's save_model into an (n_items x n_items) COO matrix.

    Every text line is one matrix row. After the first token, the remaining
    tokens are read as alternating column index / value pairs.
    """
    all_rows = []
    all_cols = []
    all_vals = []
    # The context manager closes the file even if parsing fails
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            strs = line.split(' ')
            cols = [int(s) for s in strs[1::2]]
            vals = [float(s) for s in strs[2::2]]
            all_cols.extend(cols)
            all_vals.extend(vals)
            all_rows.extend([i] * len(cols))
    all_rows = np.array(all_rows, dtype=np.int64)
    all_cols = np.array(all_cols, dtype=np.int64)
    all_vals = np.array(all_vals, dtype=np.float32)
    return coo_matrix((all_vals, (all_rows, all_cols)), shape = (n_items, n_items))


def run_itemknn(X_train, X_test, test_dict):
    """Fit an item-kNN (cosine) model on X_train and evaluate it on the test users.

    Returns whatever util.evaluate returns for the test users.
    """
    # Compute item-item matrix with cosine similarities
    S_cosine = util.compute_cosine(X_train)

    # Compute prediction scores for all test users - subtract already seen items
    test_users = list(test_dict.keys())
    test_scores = X_test[test_users,:] @ S_cosine - _SEEN_ITEM_PENALTY * X_test[test_users,:]

    # Evaluate
    results_cosine = util.evaluate(X_test, test_scores, test_dict)
    return results_cosine


def run_cVAE(X_train, X_meta, X_val, X_test, val_dict, test_dict):
    """Pre-train a VAE on the tag-item matrix, refine it on preferences, and evaluate.

    X_val and val_dict are accepted for a uniform signature but are not used:
    early stopping is driven by the training loss.
    Returns whatever util.evaluate returns for the test users.
    """
    # Parameters for cVAE
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    params = {
        'layers': [400, 100],
        'n_items': X_meta.shape[1],
        'device': device
    }

    batch_size = 1024
    num_epochs = 2000
    tol = 1e-7
    patience = 200
    alpha = 1.
    beta = 1.

    # Instantiate model
    model = VAE(params).to(device)

    # Multi-GPU
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
        batch_size = int(batch_size * 2)

    # Set up optimizer
    optimizer = torch.optim.Adam(model.parameters(),lr=5e-4)

    # Pre-training loss is squared error and annealed KL term
    def meta_loss(recon_x, x, mu, logvar, anneal=1.0):
        MSE = torch.sum((x - torch.sigmoid(recon_x)) ** 2)
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return MSE + anneal * KLD

    # Refinement loss is cross-entropy and annealed KL term
    def pref_loss(recon_x, x, mu, logvar, anneal=1., alpha = 1.):
        BCE = -torch.sum(alpha * torch.log(torch.sigmoid(recon_x) + 1e-8) * x + torch.log(1 - torch.sigmoid(recon_x) + 1e-8) * (1 - x))
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return BCE + anneal * KLD

    def meta_batch_loss(data):
        recon_batch, mu, logvar = model(data)
        return meta_loss(recon_batch, data, mu, logvar, anneal = 0.2)

    def pref_batch_loss(data):
        recon_batch, mu, logvar = model(data)
        return pref_loss(recon_batch, data, mu, logvar, anneal = beta, alpha = alpha)

    # Pre-train on meta-data
    # (.toarray() instead of the deprecated sparse '.A' attribute)
    train_tensor = torch.from_numpy(X_meta.toarray().astype('float32')).to(device)
    train_loader = torch.utils.data.DataLoader(train_tensor, batch_size, shuffle=True)
    _train_until_converged(model, optimizer, train_loader, meta_batch_loss, 'Meta', num_epochs, tol, patience)

    del train_tensor, train_loader
    torch.cuda.empty_cache()

    # Refine on ratings
    train_tensor = torch.from_numpy(X_train.toarray().astype('float32')).to(device)
    train_loader = torch.utils.data.DataLoader(train_tensor, batch_size, shuffle=True)
    _train_until_converged(model, optimizer, train_loader, pref_batch_loss, 'Pref', num_epochs, tol, patience)

    del train_tensor, train_loader
    torch.cuda.empty_cache()

    # Scores for test set
    # NOTE(review): the model is still in training mode here, as in the
    # original code - confirm against the VAE implementation whether
    # model.eval() should be called before scoring.
    test_users = list(test_dict.keys())
    with torch.no_grad():
        test_tensor = torch.from_numpy(X_test[test_users,:].toarray().astype('float32')).to(device)
        test_loader = torch.utils.data.DataLoader(test_tensor, batch_size, shuffle=False)
        pred_test = []
        for data in test_loader:
            scores, _, _ = model(data)
            pred_test.append(scores.detach().cpu().numpy())
        pred_test = np.vstack(pred_test)
    # Subtract previously seen items from predicted scores
    test_scores = pred_test - _SEEN_ITEM_PENALTY * X_test[test_users,:]
    results_cVAE = util.evaluate(X_test, test_scores, test_dict)
    return results_cVAE


def run_VLM(X_train_subset, train_users, X_meta, X_val, X_test, val_dict, test_dict, side_info = True):
    """Train the TensorFlow VLM baseline and evaluate it on the test users.

    Row i of X_train_subset is taken to be the history of user train_users[i].
    When side_info is False the tag-item matrix is replaced by a single empty
    row. The caller's train_users array is left untouched.
    Returns whatever util.evaluate returns for the test users.
    """
    # Parameters for VLM
    var_prior = 1.0
    lr = 5e-3
    reg = 1e-9
    num_factors = 100
    batch_size = 512
    num_epochs = 1500

    if not side_info:
        X_meta = csr_matrix((1,X_train_subset.shape[1]))

    test_users = list(test_dict.keys())

    # X_train_subset only has relevant non-zero rows at the moment
    # We want a matrix with zeroes everywhere but these training vectors in the right spot
    X_train_full = lil_matrix(X_val.shape).astype(np.int32)
    X_train_full[train_users,:] = X_train_subset
    X_train_full = X_train_full.tocsr()
    X_all = (X_train_full+X_val+X_test).toarray().astype(np.float32)
    video_metadata_array = X_meta.T.todense().astype(np.float32)

    # Shuffle a private copy every epoch: shuffling the argument in place would
    # break the row alignment between train_users and X_train_subset for any
    # later call made by the caller with the same array.
    train_order = np.array(train_users, copy = True)

    ##############################
    # TRAINING PROCEDURE FOR VLM #
    ##############################
    tf.reset_default_graph()

    # Instantiate TensorFlow Execution DAG
    with tf.Graph().as_default():
        # Generate model
        model = VLM(X_test.shape[0], # Num users
                    X_test.shape[1], # Num items
                    X_meta.shape[0], # Num tags
                    num_factors,
                    var_prior,
                    reg,
                    video_metadata_array)

        # Initialise model - only the validation logits and the loss are needed here
        _, batch_logits_validation, _, avg_loss, _, _, _ = model.construct_graph()

        # Optimisation procedure for training users
        train_op = tf.train.AdamOptimizer(learning_rate=lr)\
                           .minimize(avg_loss, global_step=tf.Variable(0, name='global_step_1', trainable=False))

        # Optimisation procedure for validation and test users (keep items/tags fixed)
        train_op_validation = tf.train.AdamOptimizer(learning_rate=lr)\
                                      .minimize(avg_loss,
                                                var_list = [model.Mu_Zu, model.lsdev_Zu],
                                                global_step=tf.Variable(0, name='global_step_1_validation', trainable=False))

        init = tf.global_variables_initializer()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # Initialise session
        with tf.Session(config=config) as sess:
            sess.run(init)
            # For every epoch
            progress_bar = trange(num_epochs)
            for epoch_ind in progress_bar:
                ##############################
                # COMPUTATIONS FOR TEST SET  #
                ##############################
                # Optimise only the user factors of the test users
                for st_index in range(0, len(test_users), batch_size):
                    end_index = min(st_index + batch_size, len(test_users))
                    user_indices = test_users[st_index:end_index]
                    sess.run([train_op_validation, avg_loss], feed_dict = {model.users_ph : user_indices, model.played_videos_ph : X_all[user_indices]})

                #################################
                # COMPUTATIONS FOR TRAINING SET #
                #################################
                avg_loss_dataset = 0
                num_batches = 0
                np.random.shuffle(train_order)
                # For every training batch
                for st_index in range(0, len(train_order), batch_size):
                    end_index = min(st_index + batch_size, len(train_order))
                    user_indices = train_order[st_index:end_index]
                    # Optimise all parameters on the training data
                    _, loss_val = sess.run([train_op, avg_loss], feed_dict = {model.users_ph : user_indices, model.played_videos_ph : X_all[user_indices,:]})
                    avg_loss_dataset += loss_val
                    num_batches += 1
                # Average out loss (guard against an empty training set)
                avg_loss_dataset = avg_loss_dataset / max(num_batches, 1)
                progress_bar.set_postfix(loss = avg_loss_dataset)

            # Compute scores for the test users
            predictions_test = sess.run(batch_logits_validation, feed_dict = {model.users_ph: test_users})
    # Clear model, memory and such
    tf.reset_default_graph()

    # Subtract previously seen items from predicted scores
    test_scores = predictions_test - _SEEN_ITEM_PENALTY * X_test[test_users,:]
    VLM_results = util.evaluate(X_test, test_scores, test_dict)
    return VLM_results


def run_VLM_PyTorch(X_train, train_users, X_meta, X_val, X_test, val_dict, test_dict, side_info, eval_style = 'strong'):
    """Train the PyTorch VLM baseline and evaluate it on the test users.

    Row i of X_train is taken to be the history of user train_users[i].
    X_val only provides the user/item dimensions and val_dict is unused:
    early stopping is driven by the training loss. With eval_style 'strong',
    the user factors of the test users are fitted after training.
    Returns whatever util.evaluate returns for the test users.
    """
    # Parameters for VLM
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    params = {
        'num_users': X_val.shape[0],
        'num_items': X_val.shape[1],
        'num_tags': X_meta.shape[0],
        'num_factors': 100,
        'var_prior': 1.0,
        'reg': 1e-9,
        'device': device,
        'item_tag_mat': X_meta.T.astype(np.float32),
        'side_info': side_info
    }
    lr = 3e-3
    num_epochs = 5000
    batch_size = 1024
    tol = 1e-8
    patience = 50

    # Instantiate model
    model = VLM_PyTorch(params).to(device)

    # Multi-GPU if possible
    multi_gpu = False
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
        multi_gpu = True
        batch_size = int(batch_size * 2)
    # DataParallel hides the real module (and its embeddings) behind '.module'
    core = model.module if multi_gpu else model

    def compute_kl_div(lsdev_Zu_batch, Mu_Zu_batch, num_factors, var_prior):
        sdev_Zu_batch = torch.exp(lsdev_Zu_batch)
        comp1 = num_factors * (0.5 * np.log(var_prior) - lsdev_Zu_batch)
        comp2 = (num_factors / (2 * var_prior)) * sdev_Zu_batch.pow(2)
        comp3 = (1.0 / (2 * var_prior)) * torch.sum(Mu_Zu_batch.pow(2), dim = 1)
        comp4 = (num_factors / 2.0)
        return comp1 + comp2 + comp3 - comp4

    def loss_function(x, scores, Mu_Zu, lsdev_Zu, num_factors = params['num_factors'], var_prior = params['var_prior'], reg = params['reg']):
        # Only the scores of consumed items enter the sum
        scores = scores.masked_fill(~x,.0)
        batch_conditional_log_likelihood = torch.sum(scores, dim = 1)
        batch_kl_div = compute_kl_div(lsdev_Zu, Mu_Zu, num_factors, var_prior)
        items_per_user = torch.sum(x, dim = 1, dtype = torch.float)
        batch_elbo = (1.0 / items_per_user) * (batch_conditional_log_likelihood - batch_kl_div)
        # Regularise item embeddings, and tag embeddings when side-info is used
        penalty = torch.norm(core.Mu_Zv.weight, 2)
        if side_info:
            penalty = penalty + torch.norm(core.Mu_Zt.weight, 2)
        return -1 * torch.mean(batch_elbo) + reg * penalty

    def batch_loss(batch):
        data, users = batch
        scores, Mu_Zu, lsdev_Zu = model(users)
        return loss_function(data, scores, Mu_Zu, lsdev_Zu)

    # Set up data for training
    # (builtin bool: the np.bool alias no longer exists in current NumPy)
    train_tensor = torch.from_numpy(X_train.toarray().astype(bool)).to(device)
    train_user_tensor = torch.from_numpy(train_users.astype(np.int64)).to(device)
    train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(train_tensor, train_user_tensor), batch_size, shuffle=True)

    # Optimise everything for training data
    optimizer = torch.optim.Adam(model.parameters(),lr=lr)
    _train_until_converged(model, optimizer, train_loader, batch_loss, 'Train', num_epochs, tol, patience)

    del train_tensor, train_user_tensor, train_loader
    torch.cuda.empty_cache()

    test_users = np.asarray(list(test_dict.keys()))
    test_user_tensor = torch.from_numpy(test_users.astype(np.int64)).to(device)

    # If we have distinct train/test users - we should learn vectors for test users
    if eval_style == 'strong':
        # Only optimise user vectors for test data
        optimizer = torch.optim.Adam([core.Mu_Zu.weight, core.lsdev_Zu.weight],lr=lr)
        test_tensor = torch.from_numpy(X_test[test_users,:].toarray().astype(bool)).to(device)
        test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(test_tensor, test_user_tensor), batch_size, shuffle=True)
        _train_until_converged(model, optimizer, test_loader, batch_loss, 'Test ', num_epochs, tol, patience)

    # Scores for test set
    torch.cuda.empty_cache()
    with torch.no_grad():
        test_loader = torch.utils.data.DataLoader(test_user_tensor, batch_size, shuffle=False)
        pred_test = []
        for users in test_loader:
            scores, _, _ = model(users, add_noise = False)
            pred_test.append(scores.detach().cpu().numpy())
        pred_test = np.vstack(pred_test)

    # Subtract previously seen items from predicted scores
    test_scores = pred_test - _SEEN_ITEM_PENALTY * X_test[test_users,:]
    results_VLM = util.evaluate(X_test, test_scores, test_dict)
    return results_VLM


def run_SLIM(X_train, train_users, X_meta, X_val, X_test, val_dict, test_dict, side_info, eval_style = 'strong'):
    """Grid-search (c)SLIM on validation NDCG@100 and evaluate the best model on the test users.

    With side_info the alpha-weighted tag-item matrix is stacked under X_train
    before training. train_users and eval_style are accepted for a uniform
    signature but are not used. Every candidate model is written to
    'slim_model.csr' / 'slim_map.csr' in the working directory.
    Returns whatever util.evaluate returns for the test users.
    """
    # Values for grid-search
    NDCG_values = []
    optimal_model_SLIM = None
    best_values = None
    l1_values = [0, 2.5, 5.0, 10.0, 20]
    l2_values = [0, 5.0, 10.0, 20, 50, 100]
    al_values = [.5, 1.0, 2.5, 5.0, 10.0] if side_info else [1.0]
    val_users = list(val_dict.keys())
    n_items = X_train.shape[1]
    for l1r, l2r, alpha in itertools.product(l1_values, l2_values, al_values):
        print('L1: {0}\tL2: {1}\tAlpha: {2}'.format(l1r,l2r, alpha))
        # Set up parameters
        params = {'algo':'cd', 'nthreads':16, 'l1r':l1r, 'l2r':l2r}

        # Build training matrix
        trainmat = X_train
        if side_info:
            trainmat = vstack((trainmat, alpha * X_meta))
        trainmat = SLIMatrix(trainmat)

        # Train model
        model = SLIM()
        model.train(params, trainmat)
        print('Converting out of SLIM format...')
        # To CSR works, but densifying it crashes sometimes? Very strange
        # Work-around by writing to disk and reading in
        model.save_model(modelfname='slim_model.csr', mapfname='slim_map.csr')
        S_SLIM = _read_slim_csr('slim_model.csr', n_items)
        print('... done!')
        S_SLIM = S_SLIM.todense()

        # Evaluate on validation data
        print('Evaluating...')
        val_scores = X_val[val_users,:] @ S_SLIM - _SEEN_ITEM_PENALTY * X_val[val_users,:]
        NDCG = util.evaluate(X_val, val_scores, val_dict)[1][100]
        NDCG_values.append(NDCG)

        print('\tNDCG@100:\t{0}'.format(NDCG))
        # Keep the model when it matches the best score seen so far
        if np.max(NDCG_values) == NDCG:
            optimal_model_SLIM = S_SLIM
            best_values = (l1r, l2r, alpha)

    print('Best grid-search values:', best_values)

    # Compute prediction scores for all test users - subtract already seen items
    test_users = list(test_dict.keys())
    test_scores = X_test[test_users,:] @ optimal_model_SLIM - _SEEN_ITEM_PENALTY * X_test[test_users,:]

    # Evaluate
    results_SLIM = util.evaluate(X_test, test_scores, test_dict)
    return results_SLIM