├── LICENSE
├── README.md
└── src
    ├── baselines
    │   ├── cvae
    │   │   └── vae.py
    │   └── vlm
    │       ├── vlm_pytorch.py
    │       └── vlm.py
    ├── PreprocessMSD.py
    ├── PreprocessYahooMovies.py
    ├── PreprocessAmazonVideoGames.py
    ├── PreprocessAmazonSportsOutdoors.py
    ├── PreprocessNetflix.py
    ├── PreprocessML20M.py
    ├── util.py
    ├── TrainModel.py
    └── models.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Olivier Jeunen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Closed-Form Models for Collaborative Filtering with Side-Information
 2 | Source code for our LBR paper "Closed-Form Models for Collaborative Filtering with Side-Information" published at RecSys 2020.
 3 | 
 4 | ## Reproducibility
 5 | To generate a virtual Python environment that holds all the packages our work relies on, run:
 6 | 
 7 |     virtualenv -p python3 ease_side_info
 8 |     source ease_side_info/bin/activate
 9 |     pip3 install -r requirements.txt
10 |     
11 | 
12 | To preprocess the datasets to the format we use, run:
13 | 
14 |     python3 Preprocess[...].py <dataset_location>
15 | 
16 | We do not hold the rights to any of the datasets used in the paper, and are not at liberty to host and share them.
17 | However, upon request, I will gladly share pointers on where to find them.
18 | 
 19 | Now, you can run the `TrainModel` script to train and evaluate all models on the dataset of your choice.
20 | 
21 | ## Acknowledgements
22 | The source code we use for our baselines (SLIM, cVAE, VLM) was slightly adapted from their original sources, and we are grateful to the original authors for providing publicly available implementations:
23 | 
24 | - SLIM - https://github.com/KarypisLab/SLIM
25 | - cVAE - https://github.com/yifanclifford/cVAE
26 | - VLM  - https://github.com/ehtsham/recsys19vlm
27 | 
28 | ## Paper
29 | If you use our code in your research, please remember to cite our paper:
30 | 
31 | ```BibTeX
32 |     @inproceedings{JeunenRecSys2020,
33 |       author = {Jeunen, Olivier and Van Balen, Jan and Goethals, Bart},
34 |       title = {Closed-Form Models for Collaborative Filtering with Side-Information},
35 |       booktitle = {Proceedings of the 14th ACM Conference on Recommender Systems},
36 |       series = {RecSys '20},
37 |       year = {2020},
38 |       publisher = {ACM},
 39 |     }
 40 | ```


--------------------------------------------------------------------------------
/src/baselines/cvae/vae.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.nn import functional
 3 | from torch import nn
 4 | 
 5 | # Code from Yifan Chen
 6 | # https://github.com/yifanclifford/cVAE
 7 | 
 8 | def trace(A=None, B=None):
 9 |     if A is None:
10 |         print('Expecting PyTorch tensor')
11 |         val = None
12 |     elif B is None:
13 |         val = torch.sum(A * A)
14 |     else:
15 |         val = torch.sum(A * B)
16 |     return val
17 | 
class VAE(nn.Module):
    """Variational autoencoder made of fully connected layers.

    ``args`` is a dict read for three keys:
      * 'n_items' - width of the input (and of the reconstruction),
      * 'layers'  - list of layer widths; the last entry is the latent width,
      * 'device'  - stored on the instance, not otherwise used in this class.
    """

    def __init__(self, args):
        super().__init__()
        widths = args['layers']
        self.l = len(widths)
        self.device = args['device']
        dims = [args['n_items']] + widths

        # Encoder trunk: all transitions except the final one, which is
        # realised by the two heads below.
        self.inet = nn.ModuleList(
            [nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-2], dims[1:-1])]
        )
        # Two heads on top of the trunk; forward() treats the second one as
        # a log-variance.
        self.mu = nn.Linear(dims[self.l - 1], dims[self.l])
        self.sigma = nn.Linear(dims[self.l - 1], dims[self.l])

        # Decoder: the same widths walked in reverse order.
        backwards = dims[::-1]
        self.gnet = nn.ModuleList(
            [nn.Linear(d_in, d_out) for d_in, d_out in zip(backwards, backwards[1:])]
        )

    def encode(self, x):
        """Run ``x`` through the ReLU trunk and return both head outputs."""
        hidden = x
        for layer in self.inet:
            hidden = functional.relu(layer(hidden))
        return self.mu(hidden), self.sigma(hidden)

    def decode(self, z):
        """Map a latent ``z`` back to input width; the last layer has no ReLU."""
        hidden = z
        for layer in self.gnet[:-1]:
            hidden = functional.relu(layer(hidden))
        return self.gnet[-1](hidden)

    def reparameterize(self, mu, logvar):
        """Sample around ``mu`` in training mode; return ``mu`` itself otherwise."""
        if not self.training:
            return mu
        std = torch.exp(0.5 * logvar)
        noise = torch.randn_like(std)
        return noise * std + mu

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        return self.decode(latent), mu, logvar

    def infer_reg(self):
        """Sum of squared entries over all parameters of the encoder trunk."""
        return sum(trace(p) for layer in self.inet for p in layer.parameters())

    def gen_reg(self):
        """Sum of squared entries over all parameters of the decoder."""
        return sum(trace(p) for layer in self.gnet for p in layer.parameters())
71 | 


--------------------------------------------------------------------------------
/src/baselines/vlm/vlm_pytorch.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | from torch.autograd import Variable
 4 | from torch.nn import functional
 5 | from torch import nn
 6 | 
 7 | # Loosely adapted from Ehtsham Elahi's original Tensorflow implementation
 8 | # https://github.com/ehtsham/recsys19vlm/blob/master/RecSys2019-VLMPaper.ipynb
 9 | 
class VLM_PyTorch(nn.Module):
    """PyTorch port of the VLM baseline with optional tag side-information.

    ``args`` is a dict read for the keys 'num_users', 'num_items', 'num_tags',
    'num_factors', 'var_prior', 'reg' and 'side_info'. When 'side_info' is
    truthy, 'item_tag_mat' is read as well and must offer ``.tocoo()`` (e.g. a
    SciPy sparse matrix); forward() multiplies it with the tag embeddings and
    adds the product to the item embeddings, so it has to be laid out as
    (num_items, num_tags).
    """

    def __init__(self, args):
        super(VLM_PyTorch, self).__init__()

        self.num_users = args['num_users']
        self.num_items = args['num_items']
        self.num_tags = args['num_tags']
        self.num_factors = args['num_factors']
        self.var_prior = args['var_prior']
        self.reg = args['reg']
        self.side_info = args['side_info']
        if self.side_info:
            # Sparse matrix to speed up multiplication - a dense item-tag
            # matrix made the product too slow to handle.
            print('Setting up embeddings for tags...')
            item_tag_mat = args['item_tag_mat'].tocoo()
            indices = torch.as_tensor(np.vstack((item_tag_mat.row, item_tag_mat.col)), dtype = torch.long)
            values = torch.as_tensor(item_tag_mat.data, dtype = torch.float32)
            # torch.sparse_coo_tensor replaces the deprecated
            # torch.sparse.FloatTensor constructor.
            sparse_mat = torch.sparse_coo_tensor(indices, values, size = tuple(item_tag_mat.shape))
            # Registered as a frozen Parameter so it follows the module across devices.
            self.item_tag_mat = torch.nn.Parameter(sparse_mat, requires_grad = False)
            self.Mu_Zt = nn.Embedding(self.num_tags, self.num_factors)  # Mean latent factors for tags

        self.Mu_Zu = nn.Embedding(self.num_users, self.num_factors)  # Mean latent factors for users
        self.lsdev_Zu = nn.Embedding(self.num_users, 1)              # Log(std-deviation) for user latent factors
        self.Mu_Zv = nn.Embedding(self.num_items, self.num_factors)  # Mean latent factors for items

    def forward(self, user_ids, add_noise = True):
        """Score all items for the users in ``user_ids``.

        Args:
            user_ids: index tensor fed to the user embedding tables.
            add_noise: when True, user factors are sampled as
                mean + eps * exp(log-std) with standard-normal eps
                (reparameterisation trick); when False the mean is used as is.

        Returns:
            Tuple ``(log_softmax, Mu_Zu_batch, lsdev_Zu_batch)``: row-wise
            log-softmax over the item scores, plus the mean and log-std
            embeddings looked up for this batch.
        """
        # Get mean and log(std-dev) for users in this batch
        Mu_Zu_batch = self.Mu_Zu(user_ids)
        lsdev_Zu_batch = self.lsdev_Zu(user_ids)

        # Item factors, optionally shifted by the tag embeddings mapped
        # through the item-tag matrix.
        item_factors = self.Mu_Zv.weight
        if self.side_info:
            # TODO - validate whether sparse matrix multiplication is faster than gathering and summing embeddings
            item_factors = item_factors + torch.mm(self.item_tag_mat, self.Mu_Zt.weight)

        if add_noise:
            eps = torch.randn_like(Mu_Zu_batch)
            Zu_batch = Mu_Zu_batch + eps * torch.exp(lsdev_Zu_batch)
        else:
            Zu_batch = Mu_Zu_batch

        # Scores are the dot products between user and item factors
        batch_logits = torch.mm(Zu_batch, item_factors.T)
        log_softmax = functional.log_softmax(batch_logits, dim = 1)
        return log_softmax, Mu_Zu_batch, lsdev_Zu_batch
69 | 


--------------------------------------------------------------------------------
/src/baselines/vlm/vlm.py:
--------------------------------------------------------------------------------
 1 | import tensorflow.compat.v1 as tf
 2 | tf.disable_v2_behavior() 
 3 | 
 4 | # Code from Ehtsham Elahi
 5 | # https://github.com/ehtsham/recsys19vlm/blob/master/RecSys2019-VLMPaper.ipynb
 6 | 
class VLM(object):
    """TensorFlow (v1 graph mode) definition of the VLM baseline.

    The constructor only stores hyper-parameters and creates the input
    placeholders; variables and the loss are created by construct_graph().
    """

    def __init__(self, num_users, num_items, num_tags, num_factors, var_prior, reg, video_metadata_array):
        """Store hyper-parameters and freeze the item metadata as a constant.

        ``video_metadata_array`` is matrix-multiplied with the
        (num_tags, num_factors) tag factors and the product is added to the
        (num_items, num_factors) item factors in construct_graph(), so it is
        expected to be laid out as (num_items, num_tags).
        """
        self.num_users = num_users
        self.num_items = num_items
        self.num_tags = num_tags
        self.num_factors = num_factors
        self.var_prior = var_prior
        self.reg = reg
        self.video_metadata_array_const = tf.constant(video_metadata_array, dtype = tf.float32)
        self.construct_placeholders()

    def construct_placeholders(self):
        """Create the placeholders for a batch of user ids and their item vectors."""
        # Placeholders for training samples
        self.users_ph = tf.placeholder(dtype=tf.int32, shape=[None])
        self.played_videos_ph = tf.placeholder(dtype=tf.float32, shape=[None, self.num_items])

    def construct_model_variables(self):
        """Create the trainable variables, all initialised from a standard normal."""
        # Mean for user latent factors
        self.Mu_Zu = tf.Variable(dtype=tf.float32,
                            initial_value=tf.random_normal(shape=[self.num_users, self.num_factors]), 
                            name = 'mean_latent_factors_zu')
        # Log(std-deviation) for user latent factors
        self.lsdev_Zu = tf.Variable(dtype=tf.float32,
                               initial_value=tf.random_normal(shape=[self.num_users, 1]), name='lsdev_Zu')
        # Mean for item latent factors
        self.Mu_Zv = tf.Variable(dtype=tf.float32,
                                 initial_value=tf.random_normal(shape=[self.num_items, self.num_factors]),
                                 name = 'mean_latent_factors_zv')
        # Mean for tag latent factors
        self.Mu_Zt = tf.Variable(dtype=tf.float32,
                            initial_value=tf.random_normal(shape=[self.num_tags, self.num_factors]),
                            name = 'mean_latent_factors_zt')

    def compute_kl_div(self, lsdev_Zu_batch, Mu_Zu_batch):
        """Return the per-user KL term of the ELBO.

        The four components add up to the closed-form KL divergence between
        an isotropic Gaussian with mean ``Mu_Zu_batch`` and standard deviation
        exp(``lsdev_Zu_batch``) and a zero-mean isotropic Gaussian with
        variance ``self.var_prior``, both over ``self.num_factors`` dimensions.
        """
        # KL Divergence needed for ELBO
        sdev_Zu_batch = tf.exp(lsdev_Zu_batch)
        # num_factors * (log prior std-dev - log posterior std-dev)
        comp1 = self.num_factors * (0.5 * tf.math.log(self.var_prior) - lsdev_Zu_batch)
        # Posterior variance relative to the prior variance
        comp2 = (self.num_factors / (2 * self.var_prior)) * (tf.pow(sdev_Zu_batch, 2))
        # Squared norm of the posterior mean relative to the prior variance
        # NOTE(review): `keep_dims` is the deprecated spelling of `keepdims` in TensorFlow
        comp3 = (1.0 / (2 * self.var_prior)) * tf.reduce_sum(tf.pow(Mu_Zu_batch, 2), axis=1, keep_dims = True)
        comp4 = (self.num_factors / 2.0)

        return comp1 + comp2 + comp3 - comp4

    def construct_graph(self):
        """Create the variables and build the forward pass and loss.

        Returns a 7-tuple: logits from sampled user factors, logits from the
        mean user factors (no noise), log-softmax of the sampled logits, the
        scalar loss, per-user conditional log-likelihood, per-user KL term and
        per-user item counts.
        """
        # Boilerplate Tensorflow
        self.construct_model_variables()

        # Mean, log(std-deviation) and Gaussian noise for user latent factors
        Mu_Zu_batch = tf.gather(self.Mu_Zu, self.users_ph)
        lsdev_Zu_batch = tf.gather(self.lsdev_Zu, self.users_ph)
        Eps_u_ph = tf.random_normal(shape = [tf.size(self.users_ph), self.num_factors],
                                    mean=0.0, stddev=1.0, dtype=tf.float32, seed=None, name="eps")
        # Reparameterised sample: mean + eps * std-dev
        Zu_batch = Mu_Zu_batch + Eps_u_ph * tf.exp(lsdev_Zu_batch)

        # Tag factors mapped to items
        Mu_Zv_hat = tf.matmul(self.video_metadata_array_const, self.Mu_Zt)
        batch_logits = tf.matmul(Zu_batch, self.Mu_Zv + Mu_Zv_hat, transpose_b=True)
        # Same scores computed from the mean user factors, i.e. without sampling noise
        batch_logits_validation = tf.matmul(Mu_Zu_batch, self.Mu_Zv + Mu_Zv_hat, transpose_b=True)

        log_softmax = tf.nn.log_softmax(batch_logits)

        # NOTE(review): a user row that sums to zero makes the 1/num_items factor below infinite - confirm callers never feed one
        num_items_per_document = tf.reduce_sum(self.played_videos_ph, axis=1, keep_dims=True)

        batch_conditional_log_likelihood = tf.reduce_sum(self.played_videos_ph * log_softmax, axis = 1, keep_dims=True)
        batch_kl_div = self.compute_kl_div(lsdev_Zu_batch, Mu_Zu_batch)

        # Per-user ELBO, scaled by the inverse of that user's item count
        batch_elbo = (1.0 / num_items_per_document) * (batch_conditional_log_likelihood - batch_kl_div)

        # Negative mean ELBO plus L2 regularisation on the item and tag factors
        avg_loss = -1 * tf.reduce_mean(batch_elbo) + self.reg * (tf.nn.l2_loss(self.Mu_Zv) +
                                                                 tf.nn.l2_loss(self.Mu_Zt))

        return batch_logits, batch_logits_validation, log_softmax, avg_loss, batch_conditional_log_likelihood, batch_kl_div, num_items_per_document
79 | 


--------------------------------------------------------------------------------
/src/PreprocessMSD.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import numpy as np
 3 | import os
 4 | import pandas as pd
 5 | import pickle
 6 | import util
 7 | from datetime import datetime
 8 | from scipy.sparse import save_npz, vstack
 9 | from sklearn.preprocessing import LabelEncoder
10 | 
if __name__ == '__main__':
    # Preprocess the MSD dump found in `dir`: build the metadata-item matrix,
    # split the preferences into train/validation/test and write everything
    # to `<dir>/preprocessed/`.

    # Commandline arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', type = str, help = 'Directory containing the data')
    parser.add_argument('--test_users', type = int, default = 10000)
    args = parser.parse_args()

    # Paths are built with os.path.join so that `dir` works with or without
    # a trailing path separator.
    out_dir = os.path.join(args.dir, 'preprocessed')

    # Fix seed for reproducibility
    np.random.seed(42)

    # Load rating data
    print(datetime.now(), 'Loading in ratings...')
    ratings = pd.read_csv(os.path.join(args.dir, 'preprocessed_pref.csv'))
    ratings.columns = ['user', 'item']
    print('\t{0:8} ratings'.format(ratings.shape[0]))
    print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))

    # Load side info
    print(datetime.now(), 'Loading in side-info...')
    ###########################
    # ARTISTS - GENRES - TAGS #
    ###########################
    # Load in data
    artists = pd.read_csv(os.path.join(args.dir, 'preprocessed_artists.csv'))
    artists.columns = ['item', 'artist']
    genres = pd.read_csv(os.path.join(args.dir, 'preprocessed_genres.csv'))
    genres.columns = ['item', 'genre']
    tags = pd.read_csv(os.path.join(args.dir, 'preprocessed_tags.csv'))
    tags.columns = ['item', 'tag']

    # Drop those not appearing in preference data
    # (right join on the rated items, then drop the rows left without metadata)
    rated_items = ratings[['item']].drop_duplicates()
    artists = artists.merge(rated_items, how = 'right').dropna()
    genres = genres.merge(rated_items, how = 'right').dropna()
    tags = tags.merge(rated_items, how = 'right').dropna()

    # Ensure proper integer identifiers
    user_enc = LabelEncoder()
    item_enc = LabelEncoder()
    ratings['user'] = user_enc.fit_transform(ratings['user'])
    ratings['item'] = item_enc.fit_transform(ratings['item'])
    artists['item'] = item_enc.transform(artists['item'])
    genres['item'] = item_enc.transform(genres['item'])
    tags['item'] = item_enc.transform(tags['item'])

    # Generate Metadata-to-item mapping
    n_items = ratings['item'].max() + 1
    X_artists = util.generate_csr_matrix(artists, 'artist', n_items)
    X_genres = util.generate_csr_matrix(genres, 'genre', n_items)
    X_tags = util.generate_csr_matrix(tags, 'tag', n_items)
    X_meta = vstack((X_artists, X_genres, X_tags))

    # Make the output directory if necessary (exist_ok avoids the check-then-create race)
    os.makedirs(out_dir, exist_ok = True)

    # Write out metadata-item matrix
    print(datetime.now(), 'Writing out metadata-item matrix...')
    save_npz(os.path.join(out_dir, 'X_meta.npz'), X_meta)

    # Train - validation - test split
    print(datetime.now(), 'Train-validation-test split...')
    X_train, X_val, val_dict, X_test, test_dict = util.train_val_test_split_Jebara(ratings, n_test_users = args.test_users)

    # Write out validation and test data
    print(datetime.now(), 'Writing out validation and test data...')
    save_npz(os.path.join(out_dir, 'X_val.npz'), X_val)
    with open(os.path.join(out_dir, 'val_dict.pkl'), 'wb') as handle:
        pickle.dump(val_dict, handle)
    save_npz(os.path.join(out_dir, 'X_test.npz'), X_test)
    with open(os.path.join(out_dir, 'test_dict.pkl'), 'wb') as handle:
        pickle.dump(test_dict, handle)

    # Write out full user-item training matrix
    print(datetime.now(), 'Writing out train data...')
    save_npz(os.path.join(out_dir, 'X_train.npz'), X_train)

    # Subsample training data on a user-level
    # The users are shuffled once, so every smaller sample is a prefix of the larger ones.
    print(datetime.now(), 'Subsampling training users...')
    train_users = np.unique(X_train.nonzero()[0])
    np.random.shuffle(train_users)
    for frac_train_users in [0.01, .05, .1, .25, .5]:
        n_sampled = int(frac_train_users * len(train_users))
        sampled_users = pd.DataFrame(train_users[:n_sampled], columns = ['user'])
        sampled_users.to_csv(os.path.join(out_dir, 'train_users_{}.csv'.format(frac_train_users)), index = False)
    print(datetime.now(), 'Finished!')
94 | 


--------------------------------------------------------------------------------
/src/PreprocessYahooMovies.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import numpy as np
  3 | import os
  4 | import pandas as pd
  5 | import pickle
  6 | import util
  7 | from datetime import datetime
  8 | from scipy.sparse import save_npz, vstack
  9 | from sklearn.preprocessing import LabelEncoder
 10 | 
 11 | if __name__ == '__main__':
 12 |     # Commandline arguments
 13 |     parser = argparse.ArgumentParser()
 14 |     parser.add_argument('dir', type = str, help = 'Directory containing the data')
 15 |     args = parser.parse_args()
 16 | 
 17 |     # Fix seed for reproducibility
 18 |     np.random.seed(42)
 19 | 
 20 |     # Load rating data
 21 |     print(datetime.now(), 'Loading in ratings...')
 22 |     ratings = pd.read_csv(args.dir + 'ydata-ymovies-user-movie-ratings-train-v1_0.txt',
 23 |                           sep = '\t',
 24 |                           header = None)
 25 |     ratings.columns = ['user', 'item', 'weird_rating','rating']
 26 |     ratings = ratings.loc[ratings.rating > 3.0]
 27 | 
 28 |     # Only keep users who have rated at least 5 movies
 29 |     user_counts = ratings['user'].value_counts().reset_index().rename(columns = {'index': 'user', 'user': 'count'})
 30 |     user_counts = user_counts.loc[user_counts['count'] >= 5]
 31 |     ratings = ratings.merge(user_counts, on = 'user', how = 'right').drop('count', axis = 1)
 32 |     print('\t{0:8} ratings'.format(ratings.shape[0]))
 33 |     print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))
 34 | 
 35 |     # Load side info
 36 |     print(datetime.now(), 'Loading in side-info...')
 37 |     side_columns = [
 38 |         'item',
 39 |         'title',
 40 |         'synposis',
 41 |         'runtime',
 42 |         'MPAA',
 43 |         'MPAA_reason',
 44 |         'release_date',
 45 |         'distributor',
 46 |         'dummy_1', # HUH?
 47 |         'poster',
 48 |         'genre',
 49 |         'directors',
 50 |         'director_ids',
 51 |         'crew_members',
 52 |         'crew_ids',
 53 |         'crew_types',
 54 |         'actors',
 55 |         'actor_ids',
 56 |         'avg_rating',
 57 |         'n_rating',
 58 |         'n_awards',
 59 |         'n_nominated',
 60 |         'list_won',
 61 |         'list_nominated',
 62 |         'rating_moviemom',
 63 |         'review_moviemom',
 64 |         'list_review_summaries',
 65 |         'list_reviewers',
 66 |         'list_captions',
 67 |         'preview',
 68 |         'DVD_review',
 69 |         'GNPP',
 70 |         'avg_train',
 71 |         'num_train'
 72 |     ]
 73 |     side = pd.read_csv(args.dir + 'ydata-ymovies-movie-content-descr-v1_0.txt',
 74 |                        sep = '\t',
 75 |                        encoding = 'latin',
 76 |                        names = side_columns)#[['item','genre']]
 77 | 
 78 |     # Extract genres properly
 79 |     genres = pd.DataFrame(side.genre.str.split('|').tolist(), index = side.item).stack().reset_index([0, 'item'])
 80 |     genres.columns = ['item', 'genre']
 81 |     genres = genres.loc[genres.genre != '\\N']
 82 |     genres = genres.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right').dropna()
 83 | 
 84 |     # Extract directors properly
 85 |     directors = pd.DataFrame(side.director_ids.str.split('|').tolist(), index = side.item).stack().reset_index([0, 'item'])
 86 |     directors.columns = ['item', 'director']
 87 |     directors = directors.loc[directors.director != '\\N']
 88 |     directors = directors.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'inner').dropna()
 89 | 
 90 |     # Extract actors properly
 91 |     actors = pd.DataFrame(side.actor_ids.str.split('|').tolist(), index = side.item).stack().reset_index([0, 'item'])
 92 |     actors.columns = ['item', 'actor']
 93 |     actors = actors.loc[actors.actor != '\\N']
 94 |     actors = actors.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'inner').dropna()
 95 | 
 96 |     # Drop those that appear less than twice (wouldn't affect Gram-matrix)
 97 |     dir2count = directors.groupby('director')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 98 |     dir2count = dir2count[dir2count['count'] >= 2]
 99 |     directors = directors.merge(dir2count[['director']], on = 'director', how = 'right')
100 |     act2count = actors.groupby('actor')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
101 |     act2count = act2count[act2count['count'] >= 2]
102 |     actors = actors.merge(act2count[['actor']], on = 'actor', how = 'right')
103 | 
104 |     # Ensure proper integer identifiers
105 |     user_enc = LabelEncoder()
106 |     item_enc = LabelEncoder()
107 |     genre_enc = LabelEncoder()
108 |     direc_enc = LabelEncoder()
109 |     actor_enc = LabelEncoder()
110 |     ratings['user'] = user_enc.fit_transform(ratings['user'])
111 |     ratings['item'] = item_enc.fit_transform(ratings['item'])
112 |     genres['item'] = item_enc.transform(genres['item'])
113 |     genres['genre'] = genre_enc.fit_transform(genres['genre'].astype(str))
114 |     directors['item'] = item_enc.transform(directors['item'])
115 |     directors['director'] = direc_enc.fit_transform(directors['director'])
116 |     actors['item'] = item_enc.transform(actors['item'])
117 |     actors['actor'] = actor_enc.fit_transform(actors['actor'])
118 | 
119 |     # Generate Metadata-to-item mapping
120 |     X_genres = util.generate_csr_matrix(genres, 'genre', ratings['item'].max() + 1)
121 |     X_directors = util.generate_csr_matrix(directors, 'director', ratings['item'].max() + 1)
122 |     X_actors = util.generate_csr_matrix(actors, 'actor', ratings['item'].max() + 1)
123 |     X_meta = vstack((X_genres,X_directors,X_actors))
124 |     
125 |     # Check whether output directory already exists - make it if necessary
126 |     if not os.path.exists(args.dir + 'preprocessed/'):
127 |         os.makedirs(args.dir + 'preprocessed/')
128 | 
129 |     # Write out metadata-item matrix
130 |     print(datetime.now(), 'Writing out metadata-item matrix...')
131 |     save_npz(args.dir + 'preprocessed/X_meta.npz', X_meta)
132 | 
133 |     print(datetime.now(), 'Train-validation-test split...')
134 |     X_train, _, val_dict, _, test_dict = util.train_val_test_split_Karypis(ratings) 
135 | 
136 |     # Write out user-item matrix and held-out dictionaries
137 |     print(datetime.now(), 'Writing out training, validation and test data...')
138 |     save_npz(args.dir + 'preprocessed/X_train.npz', X_train)
139 |     with open(args.dir + 'preprocessed/val_dict.pkl', 'wb') as handle:
140 |         pickle.dump(val_dict, handle)
141 |     with open(args.dir + 'preprocessed/test_dict.pkl', 'wb') as handle:
142 |         pickle.dump(test_dict, handle)
143 |     print(datetime.now(), 'Finished!')
144 | 


--------------------------------------------------------------------------------
/src/PreprocessAmazonVideoGames.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import numpy as np
  3 | import os
  4 | import pandas as pd
  5 | import pickle
  6 | import string
  7 | import util
  8 | from datetime import datetime
  9 | from scipy.sparse import save_npz, vstack
 10 | from sklearn.preprocessing import LabelEncoder
 11 | 
 12 | if __name__ == '__main__':
 13 |     # Commandline arguments
 14 |     parser = argparse.ArgumentParser()
 15 |     parser.add_argument('dir', type = str, help = 'Directory containing the data')
 16 |     args = parser.parse_args()
 17 | 
 18 |     # Fix seed for reproducibility
 19 |     np.random.seed(42)
 20 | 
 21 |     # Load rating data
 22 |     print(datetime.now(), 'Loading in ratings...')
 23 |     ratings = pd.read_csv(args.dir + 'reviews_Video_Games_5.csv')[['reviewerID','asin','overall']]
 24 |     ratings.columns = ['user', 'item', 'rating']
 25 |     ratings = ratings.loc[ratings.rating > 3.0]
 26 | 
 27 |     # Only keep users who have rated at least 5 movies
 28 |     user_counts = ratings['user'].value_counts().reset_index().rename(columns = {'index': 'user', 'user': 'count'})
 29 |     user_counts = user_counts.loc[user_counts['count'] >= 5]
 30 |     ratings = ratings.merge(user_counts, on = 'user', how = 'right').drop('count', axis = 1)
 31 |     print('\t{0:8} ratings'.format(ratings.shape[0]))
 32 |     print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))
 33 | 
 34 |     # Load in metadata
 35 |     meta = pd.read_csv(args.dir + 'meta_Video_Games.csv')[['asin','description','categories','title','brand']]
 36 |     meta.columns = ['item','desc','cat','title','brand']
 37 |     
 38 |     # We only want metadata for items we have ratings for
 39 |     meta = meta.merge(ratings[['item']].drop_duplicates(), how = 'right', on = 'item')
 40 |     minsup = 3
 41 |     maxsup = ratings['item'].nunique() // 4
 42 |    
 43 |     # Clean up categorical strings
 44 |     meta['cat'] = meta['cat'].apply(lambda s: s.replace('[','').replace(']','').replace('\'','').strip())
 45 |     cat = pd.DataFrame(meta.cat.str.split(',').tolist(), index = meta.item).stack().reset_index([0, 'item'])
 46 |     cat.columns = ['item', 'cat']
 47 |     cat['cat'] = cat['cat'].apply(lambda s: s.strip())
 48 |     cat.drop_duplicates(inplace=True)
 49 |     cat = cat.loc[cat.cat != 'Video Games'] # Appears too often
 50 |     cat = cat.loc[cat.cat != ' ']
 51 |     cat2count = cat.groupby('cat')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 52 |     cat2count = cat2count[cat2count['count'] >= 2]
 53 |     cat = cat.merge(cat2count[['cat']], on = 'cat', how = 'right')
 54 |     print(cat['cat'].value_counts())
 55 | 
 56 |     # Clean up description strings
 57 |     meta['desc'] = meta['desc'].apply(lambda s: str(s).lower().translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))))
 58 |     desc = pd.DataFrame(meta.desc.str.split(' ').tolist(), index = meta.item).stack().reset_index([0, 'item'])
 59 |     desc.columns = ['item', 'desc']
 60 |     desc.drop_duplicates(inplace=True)
 61 |     desc.dropna(inplace = True)
 62 |     desc = desc.loc[desc.desc != ' ']
 63 |     word2count = desc.groupby('desc')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 64 |     word2count = word2count[word2count['count'] >= minsup]
 65 |     word2count = word2count[word2count['count'] <= maxsup]
 66 |     desc = desc.merge(word2count[['desc']], on = 'desc', how = 'right')
 67 |     print(desc['desc'].value_counts())
 68 | 
 69 |     # Clean up Title strings
 70 |     meta['title'] = meta['title'].apply(lambda s: str(s).lower().translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))))
 71 |     title = pd.DataFrame(meta.title.str.split(' ').tolist(), index = meta.item).stack().reset_index([0, 'item'])
 72 |     title.columns = ['item', 'title']
 73 |     title.drop_duplicates(inplace=True)
 74 |     title.dropna(inplace = True)
 75 |     title = title.loc[title.title != ' ']
 76 |     word2count = title.groupby('title')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 77 |     word2count = word2count[word2count['count'] >= minsup]
 78 |     word2count = word2count[word2count['count'] <= maxsup]
 79 |     title = title.merge(word2count[['title']], on = 'title', how = 'right')
 80 |     print(title['title'].value_counts())
 81 | 
 82 |     # Clean up description strings
 83 |     meta['brand'] = meta['brand'].apply(lambda s: str(s).lower().translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))))
 84 |     brand = pd.DataFrame(meta.brand.str.split(' ').tolist(), index = meta.item).stack().reset_index([0, 'item'])
 85 |     brand.columns = ['item', 'brand']
 86 |     brand.drop_duplicates(inplace=True)
 87 |     brand.dropna(inplace = True)
 88 |     brand = brand.loc[brand.brand != ' ']
 89 |     word2count = brand.groupby('brand')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 90 |     word2count = word2count[word2count['count'] >= minsup]
 91 |     word2count = word2count[word2count['count'] <= maxsup]
 92 |     brand = brand.merge(word2count[['brand']], on = 'brand', how = 'right')
 93 |     print(brand['brand'].value_counts())
 94 |     
 95 |     # Ensure proper integer identifiers
 96 |     user_enc = LabelEncoder()
 97 |     item_enc = LabelEncoder()
 98 |     cat_enc = LabelEncoder()
 99 |     desc_enc = LabelEncoder()
100 |     title_enc = LabelEncoder()
101 |     brand_enc = LabelEncoder()
102 |     ratings['user'] = user_enc.fit_transform(ratings['user'])
103 |     ratings['item'] = item_enc.fit_transform(ratings['item'])
104 |     cat['item'] = item_enc.transform(cat['item'])
105 |     cat['cat'] = cat_enc.fit_transform(cat['cat'].astype(str))
106 |     desc['item'] = item_enc.transform(desc['item'])
107 |     desc['desc'] = desc_enc.fit_transform(desc['desc'])
108 |     title['item'] = item_enc.transform(title['item'])
109 |     title['title'] = title_enc.fit_transform(title['title'])
110 |     brand['item'] = item_enc.transform(brand['item'])
111 |     brand['brand'] = brand_enc.fit_transform(brand['brand'])
112 | 
113 |     # Generate Metadata-to-item mapping
114 |     X_cat = util.generate_csr_matrix(cat, 'cat', ratings['item'].max() + 1)
115 |     X_desc = util.generate_csr_matrix(desc, 'desc', ratings['item'].max() + 1)
116 |     X_title = util.generate_csr_matrix(title, 'title', ratings['item'].max() + 1)
117 |     X_brand = util.generate_csr_matrix(brand, 'brand', ratings['item'].max() + 1)
118 |     X_meta = vstack((X_cat,X_desc,X_title,X_brand))
119 |     
120 |     # Check whether output directory already exists - make it if necessary
121 |     if not os.path.exists(args.dir + 'preprocessed/'):
122 |         os.makedirs(args.dir + 'preprocessed/')
123 | 
124 |     # Write out metadata-item matrix
125 |     print(datetime.now(), 'Writing out metadata-item matrix...')
126 |     save_npz(args.dir + 'preprocessed/X_meta.npz', X_meta)
127 | 
128 |     print(datetime.now(), 'Train-validation-test split...')
129 |     X_train, _, val_dict, _, test_dict = util.train_val_test_split_Karypis(ratings) 
130 | 
131 |     # Write out user-item matrix and held-out dictionaries
132 |     print(datetime.now(), 'Writing out training, validation and test data...')
133 |     save_npz(args.dir + 'preprocessed/X_train.npz', X_train)
134 |     with open(args.dir + 'preprocessed/val_dict.pkl', 'wb') as handle:
135 |         pickle.dump(val_dict, handle)
136 |     with open(args.dir + 'preprocessed/test_dict.pkl', 'wb') as handle:
137 |         pickle.dump(test_dict, handle)
138 |     print(datetime.now(), 'Finished!')
139 | 


--------------------------------------------------------------------------------
/src/PreprocessAmazonSportsOutdoors.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import numpy as np
  3 | import os
  4 | import pandas as pd
  5 | import pickle
  6 | import string
  7 | import util
  8 | from datetime import datetime
  9 | from scipy.sparse import save_npz, vstack
 10 | from sklearn.preprocessing import LabelEncoder
 11 | 
 12 | if __name__ == '__main__':
 13 |     # Commandline arguments
 14 |     parser = argparse.ArgumentParser()
 15 |     parser.add_argument('dir', type = str, help = 'Directory containing the data')
 16 |     args = parser.parse_args()
 17 | 
 18 |     # Fix seed for reproducibility
 19 |     np.random.seed(42)
 20 | 
 21 |     # Load rating data
 22 |     print(datetime.now(), 'Loading in ratings...')
 23 |     ratings = pd.read_csv(args.dir + 'reviews_Sports_and_Outdoors_5.csv')[['reviewerID','asin','overall']]
 24 |     ratings.columns = ['user', 'item', 'rating']
 25 |     ratings = ratings.loc[ratings.rating > 3.0]
 26 | 
 27 |     # Only keep users who have rated at least 5 movies
 28 |     user_counts = ratings['user'].value_counts().reset_index().rename(columns = {'index': 'user', 'user': 'count'})
 29 |     user_counts = user_counts.loc[user_counts['count'] >= 5]
 30 |     ratings = ratings.merge(user_counts, on = 'user', how = 'right').drop('count', axis = 1)
 31 |     print('\t{0:8} ratings'.format(ratings.shape[0]))
 32 |     print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))
 33 | 
 34 |     # Load in metadata
 35 |     meta = pd.read_csv(args.dir + 'meta_Sports_and_Outdoors.csv')[['asin','description','categories','title','brand']]
 36 |     meta.columns = ['item','desc','cat','title','brand']
 37 |     
 38 |     # We only want metadata for items we have ratings for
 39 |     meta = meta.merge(ratings[['item']].drop_duplicates(), how = 'right', on = 'item')
 40 |     minsup = 3
 41 |     maxsup = ratings['item'].nunique() // 4
 42 |    
 43 |     # Clean up categorical strings
 44 |     meta['cat'] = meta['cat'].apply(lambda s: s.replace('[','').replace(']','').replace('\'','').strip())
 45 |     cat = pd.DataFrame(meta.cat.str.split(',').tolist(), index = meta.item).stack().reset_index([0, 'item'])
 46 |     cat.columns = ['item', 'cat']
 47 |     cat['cat'] = cat['cat'].apply(lambda s: s.strip())
 48 |     cat.drop_duplicates(inplace=True)
 49 |     cat = cat.loc[cat.cat != 'Sports & Outdoors'] # Appears too often
 50 |     cat = cat.loc[cat.cat != ' ']
 51 |     cat2count = cat.groupby('cat')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 52 |     cat2count = cat2count[cat2count['count'] >= 2]
 53 |     cat = cat.merge(cat2count[['cat']], on = 'cat', how = 'right')
 54 |     print(cat['cat'].value_counts())
 55 | 
 56 |     # Clean up description strings
 57 |     meta['desc'] = meta['desc'].apply(lambda s: str(s).lower().translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))))
 58 |     desc = pd.DataFrame(meta.desc.str.split(' ').tolist(), index = meta.item).stack().reset_index([0, 'item'])
 59 |     desc.columns = ['item', 'desc']
 60 |     desc.drop_duplicates(inplace=True)
 61 |     desc.dropna(inplace = True)
 62 |     desc = desc.loc[desc.desc != ' ']
 63 |     word2count = desc.groupby('desc')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 64 |     word2count = word2count[word2count['count'] >= minsup]
 65 |     word2count = word2count[word2count['count'] <= maxsup]
 66 |     desc = desc.merge(word2count[['desc']], on = 'desc', how = 'right')
 67 |     print(desc['desc'].value_counts())
 68 | 
 69 |     # Clean up Title strings
 70 |     meta['title'] = meta['title'].apply(lambda s: str(s).lower().translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))))
 71 |     title = pd.DataFrame(meta.title.str.split(' ').tolist(), index = meta.item).stack().reset_index([0, 'item'])
 72 |     title.columns = ['item', 'title']
 73 |     title.drop_duplicates(inplace=True)
 74 |     title.dropna(inplace = True)
 75 |     title = title.loc[title.title != ' ']
 76 |     word2count = title.groupby('title')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 77 |     word2count = word2count[word2count['count'] >= minsup]
 78 |     word2count = word2count[word2count['count'] <= maxsup]
 79 |     title = title.merge(word2count[['title']], on = 'title', how = 'right')
 80 |     print(title['title'].value_counts())
 81 | 
 82 |     # Clean up description strings
 83 |     meta['brand'] = meta['brand'].apply(lambda s: str(s).lower().translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))))
 84 |     brand = pd.DataFrame(meta.brand.str.split(' ').tolist(), index = meta.item).stack().reset_index([0, 'item'])
 85 |     brand.columns = ['item', 'brand']
 86 |     brand.drop_duplicates(inplace=True)
 87 |     brand.dropna(inplace = True)
 88 |     brand = brand.loc[brand.brand != ' ']
 89 |     word2count = brand.groupby('brand')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 90 |     word2count = word2count[word2count['count'] >= minsup]
 91 |     word2count = word2count[word2count['count'] <= maxsup]
 92 |     brand = brand.merge(word2count[['brand']], on = 'brand', how = 'right')
 93 |     print(brand['brand'].value_counts())
 94 |     
 95 |     # Ensure proper integer identifiers
 96 |     user_enc = LabelEncoder()
 97 |     item_enc = LabelEncoder()
 98 |     cat_enc = LabelEncoder()
 99 |     desc_enc = LabelEncoder()
100 |     title_enc = LabelEncoder()
101 |     brand_enc = LabelEncoder()
102 |     ratings['user'] = user_enc.fit_transform(ratings['user'])
103 |     ratings['item'] = item_enc.fit_transform(ratings['item'])
104 |     cat['item'] = item_enc.transform(cat['item'])
105 |     cat['cat'] = cat_enc.fit_transform(cat['cat'].astype(str))
106 |     desc['item'] = item_enc.transform(desc['item'])
107 |     desc['desc'] = desc_enc.fit_transform(desc['desc'])
108 |     title['item'] = item_enc.transform(title['item'])
109 |     title['title'] = title_enc.fit_transform(title['title'])
110 |     brand['item'] = item_enc.transform(brand['item'])
111 |     brand['brand'] = brand_enc.fit_transform(brand['brand'])
112 | 
113 |     # Generate Metadata-to-item mapping
114 |     X_cat = util.generate_csr_matrix(cat, 'cat', ratings['item'].max() + 1)
115 |     X_desc = util.generate_csr_matrix(desc, 'desc', ratings['item'].max() + 1)
116 |     X_title = util.generate_csr_matrix(title, 'title', ratings['item'].max() + 1)
117 |     X_brand = util.generate_csr_matrix(brand, 'brand', ratings['item'].max() + 1)
118 |     X_meta = vstack((X_cat,X_desc,X_title,X_brand))
119 |     
120 |     # Check whether output directory already exists - make it if necessary
121 |     if not os.path.exists(args.dir + 'preprocessed/'):
122 |         os.makedirs(args.dir + 'preprocessed/')
123 | 
124 |     # Write out metadata-item matrix
125 |     print(datetime.now(), 'Writing out metadata-item matrix...')
126 |     save_npz(args.dir + 'preprocessed/X_meta.npz', X_meta)
127 | 
128 |     print(datetime.now(), 'Train-validation-test split...')
129 |     X_train, _, val_dict, _, test_dict = util.train_val_test_split_Karypis(ratings) 
130 | 
131 |     # Write out user-item matrix and held-out dictionaries
132 |     print(datetime.now(), 'Writing out training, validation and test data...')
133 |     save_npz(args.dir + 'preprocessed/X_train.npz', X_train)
134 |     with open(args.dir + 'preprocessed/val_dict.pkl', 'wb') as handle:
135 |         pickle.dump(val_dict, handle)
136 |     with open(args.dir + 'preprocessed/test_dict.pkl', 'wb') as handle:
137 |         pickle.dump(test_dict, handle)
138 |     print(datetime.now(), 'Finished!')
139 | 


--------------------------------------------------------------------------------
/src/PreprocessNetflix.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import numpy as np
  3 | import os
  4 | import pandas as pd
  5 | import pickle
  6 | import util
  7 | from datetime import datetime
  8 | from scipy.sparse import save_npz, vstack
  9 | from sklearn.preprocessing import LabelEncoder
 10 | 
 11 | if __name__ == '__main__':
 12 |     # Commandline arguments
 13 |     parser = argparse.ArgumentParser()
 14 |     parser.add_argument('dir', type = str, help = 'Directory containing the data')
 15 |     parser.add_argument('--test_users', type = int, default = 40000)
 16 |     args = parser.parse_args()
 17 | 
 18 |     # Fix seed for reproducibility
 19 |     np.random.seed(42)
 20 |     
 21 |     # Load rating data
 22 |     print(datetime.now(), 'Loading in ratings...')
 23 |     ratings = pd.read_csv(args.dir + 'netflix_ratings.csv')
 24 |     ratings.columns = ['user', 'item', 'rating', 'time']
 25 |     
 26 |     # Preprocessing as in Liang et al. @ WWW 2018
 27 |     # Only keep ratings of 4 or higher
 28 |     ratings = ratings.loc[ratings.rating >= 4]
 29 |     # Only keep users who have rated at least 5 movies
 30 |     user_counts = ratings['user'].value_counts().reset_index().rename(columns = {'index': 'user', 'user': 'count'})
 31 |     user_counts = user_counts.loc[user_counts['count'] >= 5]
 32 |     ratings = ratings.merge(user_counts, on = 'user', how = 'right').drop('count', axis = 1)
 33 |     print('\t{0:8} ratings'.format(ratings.shape[0]))
 34 |     print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))
 35 | 
 36 |     # Load side info
 37 |     print(datetime.now(), 'Loading in side-info...')
 38 |     ####################
 39 |     # SERIES AND YEARS #
 40 |     ####################
 41 |     # Load in data
 42 |     series = pd.read_csv(args.dir + 'netflixid2series.csv')
 43 |     # Drop movies that don't appear in preference data
 44 |     #series = series.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right')
 45 | 
 46 |     # Load in data
 47 |     years = pd.read_csv(args.dir + 'netflixid2year.csv')
 48 |     # Drop movies that don't appear in preference data
 49 |     years = years.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right')
 50 | 
 51 |     ########
 52 |     # CREW #
 53 |     ########
 54 |     # Load IMDB data links with movielens
 55 |     links = pd.read_csv(args.dir + 'netflixid2imdbid.csv')
 56 | 
 57 |     # Side info - genres
 58 |     side = pd.read_csv(args.dir + 'ml-imdb_sideinfo.csv')[['imdb_title_id','genre']]
 59 |     side.columns = ['imdb_id', 'genre']
 60 |     side = side.merge(links, on = 'imdb_id', how = 'right')
 61 | 
 62 |     # Extract genres
 63 |     genres = pd.DataFrame(side.genre.str.split(',').tolist(), index = side.item).stack().reset_index([0, 'item'])
 64 |     genres.columns = ['item', 'genre']
 65 |     genres = genres.loc[genres.genre != '\\N']
 66 |     
 67 |     # Load IMDB crew data and link it properly
 68 |     crew = pd.read_csv(args.dir + 'imdb_crew_info.csv')
 69 |     crew.columns = ['imdb_id', 'directors', 'writers']
 70 |     crew = crew.merge(links, on = 'imdb_id', how = 'right')
 71 |     
 72 |     # We don't care about movies without ratings
 73 |     crew = crew.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right')[['item','directors','writers']]
 74 |     crew['directors'] = crew['directors'].apply(lambda s: str(s))
 75 |     crew['writers'] = crew['writers'].apply(lambda s: str(s))
 76 |  
 77 |     # Extract directors
 78 |     directors = pd.DataFrame(crew.directors.str.split(',').tolist(), index = crew.item).stack().reset_index([0, 'item'])
 79 |     directors.columns = ['item', 'director']
 80 |     directors = directors.loc[directors.director != '\\N']
 81 |     
 82 |     # Drop directors that appear less than once (wouldn't affect Gram-matrix)
 83 |     dir2count = directors.groupby('director')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 84 |     dir2count = dir2count[dir2count['count'] >= 2]
 85 |     directors = directors.merge(dir2count[['director']], on = 'director', how = 'right')
 86 | 
 87 |     # Extract writers
 88 |     writers = pd.DataFrame(crew.writers.str.split(',').tolist(), index = crew.item).stack().reset_index([0, 'item'])
 89 |     writers.columns = ['item', 'writer']
 90 |     writers = writers.loc[writers.writer != '\\N']
 91 |     
 92 |     # Drop writers that appear less than once (wouldn't affect Gram-matrix)
 93 |     writer2count = writers.groupby('writer')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 94 |     writer2count = writer2count[writer2count['count'] >= 2]
 95 |     writers = writers.merge(writer2count[['writer']], on = 'writer', how = 'right')
 96 | 
 97 |     # Ensure proper integer identifiers
 98 |     user_enc = LabelEncoder()
 99 |     item_enc = LabelEncoder()
100 |     year_enc = LabelEncoder()
101 |     genre_enc = LabelEncoder()
102 |     direc_enc = LabelEncoder()
103 |     write_enc = LabelEncoder()
104 |     ratings['user'] = user_enc.fit_transform(ratings['user'])
105 |     ratings['item'] = item_enc.fit_transform(ratings['item'])
106 |     years['item'] = item_enc.transform(years['item'])
107 |     years['year'] = year_enc.fit_transform(years['year'])
108 |     series['item'] = item_enc.transform(series['item'])
109 |     genres['item'] = item_enc.transform(genres['item'])
110 |     genres['genre'] = genre_enc.fit_transform(genres['genre'])
111 |     directors['item'] = item_enc.transform(directors['item'])
112 |     directors['director'] = direc_enc.fit_transform(directors['director'])
113 |     writers['item'] = item_enc.transform(writers['item'])
114 |     writers['writer'] = write_enc.fit_transform(writers['writer'])
115 | 
116 |     # Generate Metadata-to-item mapping
117 |     X_years = util.generate_csr_matrix(years, 'year', ratings['item'].max() + 1)
118 |     X_series = util.generate_csr_matrix(series, 'title_id', ratings['item'].max() + 1)
119 |     X_genres = util.generate_csr_matrix(genres, 'genre', ratings['item'].max() + 1)
120 |     X_directors = util.generate_csr_matrix(directors, 'director', ratings['item'].max() + 1)
121 |     X_writers = util.generate_csr_matrix(writers, 'writer', ratings['item'].max() + 1)
122 |     X_meta = vstack((X_years, X_series, X_genres, X_directors, X_writers))
123 |     
124 |     # Check whether output directory already exists - make it if necessary
125 |     if not os.path.exists(args.dir + 'preprocessed/'):
126 |         os.makedirs(args.dir + 'preprocessed/')
127 | 
128 |     # Write out metadata-item matrix
129 |     print(datetime.now(), 'Writing out metadata-item matrix...')
130 |     save_npz(args.dir + 'preprocessed/X_meta.npz', X_meta)
131 | 
132 |     # Train - validation - test split
133 |     print(datetime.now(), 'Train-validation-test split...')
134 |     X_train, X_val, val_dict, X_test, test_dict = util.train_val_test_split_Jebara(ratings, n_test_users = args.test_users) 
135 | 
136 |     # Write out validation and test data
137 |     print(datetime.now(), 'Writing out validation and test data...')
138 |     save_npz(args.dir + 'preprocessed/X_val.npz', X_val)
139 |     with open(args.dir + 'preprocessed/val_dict.pkl', 'wb') as handle:
140 |         pickle.dump(val_dict, handle)
141 |     save_npz(args.dir + 'preprocessed/X_test.npz', X_test)
142 |     with open(args.dir + 'preprocessed/test_dict.pkl', 'wb') as handle:
143 |         pickle.dump(test_dict, handle)
144 | 
145 |     # Write out full user-item training matrix
146 |     print(datetime.now(), 'Writing out train data...')
147 |     save_npz(args.dir + 'preprocessed/X_train.npz', X_train)
148 | 
149 |     # Subsample training data on a user-level
150 |     print(datetime.now(), 'Subsampling training users...')
151 |     train_users = np.unique(X_train.nonzero()[0])
152 |     np.random.shuffle(train_users)
153 |     for frac_train_users in [0.01, .05, .1, .25, .5]:
154 |         train_users[:int(frac_train_users * len(train_users))]
155 |         pd.DataFrame(train_users[:int(frac_train_users * len(train_users))], columns = ['user']).to_csv(args.dir + 'preprocessed/train_users_{}.csv'.format(frac_train_users), index = False)
156 |     print(datetime.now(), 'Finished!')
157 | 


--------------------------------------------------------------------------------
/src/PreprocessML20M.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import numpy as np
  3 | import os
  4 | import pandas as pd
  5 | import pickle
  6 | import util
  7 | from datetime import datetime
  8 | from scipy.sparse import save_npz, vstack
  9 | from sklearn.preprocessing import LabelEncoder
 10 | 
 11 | if __name__ == '__main__':
 12 |     # Commandline arguments
 13 |     parser = argparse.ArgumentParser()
 14 |     parser.add_argument('dir', type = str, help = 'Directory containing the data')
 15 |     parser.add_argument('--test_users', type = int, default = 10000)
 16 |     args = parser.parse_args()
 17 | 
 18 |     # Fix seed for reproducibility
 19 |     np.random.seed(42)
 20 | 
 21 |     # Load rating data
 22 |     print(datetime.now(), 'Loading in ratings...')
 23 |     ratings = pd.read_csv(args.dir + 'ml-20m_ratings.csv')
 24 |     ratings.columns = ['user', 'item', 'rating', 'time']
 25 |     
 26 |     # Preprocessing as in Liang et al. @ WWW 2018
 27 |     # Only keep ratings of 4 or higher
 28 |     ratings = ratings.loc[ratings.rating >= 4]
 29 |     # Only keep users who have rated at least 5 movies
 30 |     user_counts = ratings['user'].value_counts().reset_index().rename(columns = {'index': 'user', 'user': 'count'})
 31 |     user_counts = user_counts.loc[user_counts['count'] >= 5]
 32 |     ratings = ratings.merge(user_counts, on = 'user', how = 'right').drop('count', axis = 1)
 33 |     print('\t{0:8} ratings'.format(ratings.shape[0]))
 34 |     print('\t{0:8} unique users, {1:8} unique items'.format(ratings['user'].nunique(), ratings['item'].nunique()))
 35 | 
 36 |     # Load side info
 37 |     print(datetime.now(), 'Loading in side-info...')
 38 |     ##########
 39 |     # GENRES #
 40 |     ##########
 41 |     # Load in data
 42 |     movies = pd.read_csv(args.dir + 'ml-20m_movies.csv')
 43 |     movies.columns = ['item', 'title', 'genres']
 44 |     # Drop movies that don't appear in preference data
 45 |     movies = movies.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right')
 46 |     # Properly format 
 47 |     genres = pd.DataFrame(movies.genres.str.split('|').tolist(), index = movies.item)\
 48 |                .stack()\
 49 |                .reset_index([0, 'item'])\
 50 |                .rename(columns = {0: 'genre'})
 51 |     # Drop nonsensical genres
 52 |     genres = genres.loc[genres.genre != '(no genres listed)']
 53 |     genres = genres.loc[genres.genre != 'IMAX']
 54 | 
 55 |     #########
 56 |     # YEARS #
 57 |     #########
 58 |     # Extract year
 59 |     movies['year'] = movies['title'].str.extract(pat = '\((\d\d\d\d)(?:[-–]\s*(?:\d\d\d\d)?)?\)')
 60 |     years = movies[['item','year']]
 61 |     # Drop years that appear less than once (wouldn't affect Gram-matrix)
 62 |     y2c = years.groupby('year')['item']\
 63 |                .apply(lambda x: len(set(x)))\
 64 |                .reset_index()\
 65 |                .rename(columns = {'item': 'count'})
 66 |     y2c = y2c[y2c['count'] >= 2]
 67 |     years = years.merge(y2c[['year']], on = 'year', how = 'right')
 68 |     
 69 |     ########
 70 |     # CREW #
 71 |     ########
 72 |     # Load IMDB data links with movielens
 73 |     links = pd.read_csv(args.dir + 'ml-imdb_links.csv')[['movieId','imdbId']]
 74 |     links.columns = ['item', 'imdb_id']
 75 |     
 76 |     # Load IMDB crew data and link it properly
 77 |     crew = pd.read_csv(args.dir + 'imdb_crew_info.csv')
 78 |     crew.columns = ['imdb_id', 'directors', 'writers']
 79 |     crew['imdb_id'] = crew['imdb_id'].apply(lambda s: int(s[2:]))
 80 |     crew = crew.merge(links, on = 'imdb_id', how = 'right')
 81 |     
 82 |     # We don't care about movies without ratings
 83 |     crew = crew.merge(ratings[['item']].drop_duplicates(), on = 'item', how = 'right')[['item','directors','writers']]
 84 |     crew['directors'] = crew['directors'].apply(lambda s: str(s))
 85 |     crew['writers'] = crew['writers'].apply(lambda s: str(s))
 86 |     
 87 |     # Extract directors
 88 |     directors = pd.DataFrame(crew.directors.str.split(',').tolist(), index = crew.item).stack().reset_index([0, 'item'])
 89 |     directors.columns = ['item', 'director']
 90 |     directors = directors.loc[directors.director != '\\N']
 91 |     
 92 |     # Drop directors that appear less than once (wouldn't affect Gram-matrix)
 93 |     dir2count = directors.groupby('director')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
 94 |     dir2count = dir2count[dir2count['count'] >= 2]
 95 |     directors = directors.merge(dir2count[['director']], on = 'director', how = 'right')
 96 | 
 97 |     # Extract writers
 98 |     writers = pd.DataFrame(crew.writers.str.split(',').tolist(), index = crew.item).stack().reset_index([0, 'item'])
 99 |     writers.columns = ['item', 'writer']
100 |     writers = writers.loc[writers.writer != '\\N']
101 |     
102 |     # Drop writers that appear less than once (wouldn't affect Gram-matrix)
103 |     writer2count = writers.groupby('writer')['item'].apply(lambda x: len(set(x))).reset_index().rename(columns = {'item': 'count'})
104 |     writer2count = writer2count[writer2count['count'] >= 2]
105 |     writers = writers.merge(writer2count[['writer']], on = 'writer', how = 'right')
106 | 
107 |     # Ensure proper integer identifiers
108 |     user_enc = LabelEncoder()
109 |     item_enc = LabelEncoder()
110 |     genre_enc = LabelEncoder()
111 |     year_enc = LabelEncoder()
112 |     direc_enc = LabelEncoder()
113 |     write_enc = LabelEncoder()
114 |     ratings['user'] = user_enc.fit_transform(ratings['user'])
115 |     ratings['item'] = item_enc.fit_transform(ratings['item'])
116 |     genres['item'] = item_enc.transform(genres['item'])
117 |     genres['genre'] = genre_enc.fit_transform(genres['genre'])
118 |     years['item'] = item_enc.transform(years['item'])
119 |     years['year'] = year_enc.fit_transform(years['year'])
120 |     directors['item'] = item_enc.transform(directors['item'])
121 |     directors['director'] = direc_enc.fit_transform(directors['director'])
122 |     writers['item'] = item_enc.transform(writers['item'])
123 |     writers['writer'] = write_enc.fit_transform(writers['writer'])
124 | 
125 |     # Generate Metadata-to-item mapping
126 |     X_genres = util.generate_csr_matrix(genres, 'genre', ratings['item'].max() + 1)
127 |     X_years = util.generate_csr_matrix(years, 'year', ratings['item'].max() + 1)
128 |     X_directors = util.generate_csr_matrix(directors, 'director', ratings['item'].max() + 1)
129 |     X_writers = util.generate_csr_matrix(writers, 'writer', ratings['item'].max() + 1)
130 |     X_meta = vstack((X_genres,X_years,X_directors,X_writers))
131 |     
132 |     # Check whether output directory already exists - make it if necessary
133 |     if not os.path.exists(args.dir + 'preprocessed/'):
134 |         os.makedirs(args.dir + 'preprocessed/')
135 | 
136 |     # Write out metadata-item matrix
137 |     print(datetime.now(), 'Writing out metadata-item matrix...')
138 |     save_npz(args.dir + 'preprocessed/X_meta.npz', X_meta)
139 | 
140 |     # Train - validation - test split
141 |     print(datetime.now(), 'Train-validation-test split...')
142 |     X_train, X_val, val_dict, X_test, test_dict = util.train_val_test_split_Jebara(ratings, n_test_users = args.test_users) 
143 | 
144 |     # Write out validation and test data
145 |     print(datetime.now(), 'Writing out validation and test data...')
146 |     save_npz(args.dir + 'preprocessed/X_val.npz', X_val)
147 |     with open(args.dir + 'preprocessed/val_dict.pkl', 'wb') as handle:
148 |         pickle.dump(val_dict, handle)
149 |     save_npz(args.dir + 'preprocessed/X_test.npz', X_test)
150 |     with open(args.dir + 'preprocessed/test_dict.pkl', 'wb') as handle:
151 |         pickle.dump(test_dict, handle)
152 | 
153 |     # Write out full user-item training matrix
154 |     print(datetime.now(), 'Writing out train data...')
155 |     save_npz(args.dir + 'preprocessed/X_train.npz', X_train)
156 | 
157 |     # Subsample training data on a user-level
158 |     print(datetime.now(), 'Subsampling training users...')
159 |     train_users = np.unique(X_train.nonzero()[0])
160 |     np.random.shuffle(train_users)
161 |     for frac_train_users in [0.01, .05, .1, .25, .5]:
162 |         train_users[:int(frac_train_users * len(train_users))]
163 |         pd.DataFrame(train_users[:int(frac_train_users * len(train_users))], columns = ['user']).to_csv(args.dir + 'preprocessed/train_users_{}.csv'.format(frac_train_users), index = False)
164 |     print(datetime.now(), 'Finished!')
165 | 


--------------------------------------------------------------------------------
/src/util.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd 
  2 | import numpy as np
  3 | import sys
  4 | from collections import defaultdict
  5 | from datetime import datetime
  6 | from joblib import Parallel, delayed
  7 | from scipy.sparse import csr_matrix, coo_matrix, vstack
  8 | from tqdm import tqdm
  9 | 
 10 | # From https://stackoverflow.com/questions/24455615/python-how-to-display-size-of-all-variables
 11 | def sizeof_fmt(num, suffix='B'):
 12 |     ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
 13 |     for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
 14 |         if abs(num) < 1024.0:
 15 |             return "%3.1f %s%s" % (num, unit, suffix)
 16 |         num /= 1024.0
 17 |     return "%.1f %s%s" % (num, 'Yi', suffix)
 18 | 
 19 | def pretty_print_results(results):
 20 |     ''' Pretty print results in a defaultdict. '''
 21 |     print('\tRecall@K')
 22 |     for K in results[0].keys():
 23 |         print('\t', K,'\t',results[0][K])
 24 |     print('\tNDCG@K')
 25 |     for K in results[1].keys():
 26 |         print('\t', K,'\t',results[1][K])
 27 | 
 28 | def generate_csr_matrix(meta_df, colname, ncols, alpha = 1.):
 29 |     ''' Generate Metadata-to-item mapping in the form of a CSR matrix. '''
 30 |     data = np.ones(meta_df.shape[0]) * alpha
 31 |     rows, cols = meta_df[colname].values, meta_df['item'].values
 32 |     nrows = meta_df[colname].max() + 1
 33 |     return csr_matrix((data, (rows, cols)), shape = (int(nrows), int(ncols)))
 34 | 
 35 | def normalize_idf(X):
 36 |     ''' Normalize matrix X according to column-wise IDF. '''
 37 |     # Log-normalised Smoothed Inverse Document Frequency
 38 |     row_counts = X.sum(axis = 1)
 39 |     row_counts -= (row_counts.min() - 2.0) # Start from 0 for more expressive log-scale
 40 |     idf = (1.0 / np.log(row_counts)).A1.ravel()
 41 |     return csr_matrix(np.diag(idf)) @ X
 42 | 
 43 | def compute_sparsity(A):
 44 |     ''' Compute the sparsity level (% of non-zeros) of matrix A. '''
 45 |     return 1.0 - np.count_nonzero(A) / (A.shape[0] * A.shape[1])
 46 | 
 47 | def sparsify(B, rho = .95):
 48 |     ''' Get B to the required sparsity level by dropping out the rho % lower absolute values. '''
 49 |     min_val = np.quantile(np.abs(B), rho)
 50 |     B[np.abs(B) < min_val] = .0
 51 |     return B
 52 | 
 53 | def compute_EASE(X, l2 = 5e2):
 54 |     ''' Compute a closed-form OLS SLIM-like item-based model. (H. Steck @ WWW 2019) '''
 55 |     G = X.T @ X + l2 * np.identity((X.shape[1]))
 56 |     B = np.linalg.inv(G)
 57 |     B /= -np.diag(B)
 58 |     B[np.diag_indices(B.shape[0])] = .0
 59 |     return B
 60 | 
 61 | def compute_cosine(X):
 62 |     ''' Compute a cosine similarity item-based model. '''
 63 |     # Base similarity matrix (all dot products)
 64 |     similarity = X.T.dot(X).toarray()
 65 |     # Squared magnitude of preference vectors (number of occurrences)
 66 |     square_mag = np.diag(similarity)
 67 |     # Inverse squared magnitude
 68 |     inv_square_mag = 1 / square_mag
 69 |     # If it doesn't occur, set it's inverse magnitude to zero (instead of inf)
 70 |     inv_square_mag[np.isinf(inv_square_mag)] = 0
 71 |     # inverse of the magnitude
 72 |     inv_mag = np.sqrt(inv_square_mag)
 73 |     # cosine similarity (elementwise multiply by inverse magnitudes)
 74 |     cosine = similarity * inv_mag
 75 |     cosine = cosine.T * inv_mag
 76 |     cosine[np.diag_indices(X.shape[1])] = .0
 77 |     return cosine
 78 | 
def generate_eval_format(ratings, nrows, ncols, hist_frac = .8):
    ''' Split 'ratings' into a historical and held-out fraction

    ratings:   DataFrame with (at least) the columns 'user' and 'item'.
    nrows:     number of rows (users) of the returned history matrix.
    ncols:     number of columns (items) of the returned history matrix.
    hist_frac: fraction of every user's ratings that is kept as history.

    Returns (X_hist, test_dict): the history as a (nrows x ncols) CSR matrix and
    a dictionary mapping every user to the set of their held-out items.
    '''
    # Split ratings into 'history' and 'held-out' set
    # Per user, a random (1 - hist_frac) fraction of the rows is held out. No random_state
    # is passed - presumably the callers rely on NumPy's global seed for reproducibility.
    # NOTE(review): this relies on groupby.apply handing the 'user' column to the lambda;
    # confirm this still holds for the pandas version in use.
    test_ratings = ratings.groupby('user').apply(lambda df: df.sample(frac = 1. - hist_frac)).reset_index(drop = True)
    # Anti-join: after the concatenation every held-out row occurs twice, and keep = False
    # drops all copies of duplicated rows, leaving the history.
    # NOTE(review): rows that are already duplicated within 'ratings' are dropped as well.
    hist_ratings = pd.concat([test_ratings, ratings]).drop_duplicates(keep = False)

    # Generate user-item matrix for history and dictionary for hold-out
    data = np.ones(hist_ratings.shape[0])
    rows, cols = hist_ratings['user'], hist_ratings['item']
    X_hist = csr_matrix((data, (rows, cols)), shape = (nrows, ncols))
    
    # Generate dictionary for hold-out (fast lookup)
    test_dict = defaultdict(set)
    for row in test_ratings.itertuples():
        test_dict[row.user].add(row.item)
    
    return X_hist, test_dict
 96 | 
 97 | def train_val_test_split_strong(ratings, n_test_users = 10000, hist_frac = .8, n_train_users = 0):
 98 |     ''' Split into train/validation/test ratings for strong generalisation.
 99 |         i.e. unseen users during training time '''
100 |     # Sample validation and testing users without replacement
101 |     val_test_users = np.random.choice(ratings['user'].max() + 1, size = n_test_users * 2, replace = False)
102 |     val_users = val_test_users[:n_test_users]
103 |     test_users = val_test_users[n_test_users:]
104 |     
105 |     # Extract ratings for these users from the full set
106 |     val_ratings = ratings.merge(pd.DataFrame(val_users, columns = ['user']), how = 'right')
107 |     test_ratings = ratings.merge(pd.DataFrame(test_users, columns = ['user']), how = 'right')
108 |     train_ratings = pd.concat([test_ratings, val_ratings, ratings]).drop_duplicates(keep = False)
109 | 
110 |     # Split into historical and held-out sets
111 |     nrows, ncols = ratings['user'].max() + 1, ratings['item'].max() + 1
112 |     X_val, val_dict = generate_eval_format(val_ratings, nrows, ncols, hist_frac = hist_frac)
113 |     X_test, test_dict = generate_eval_format(test_ratings, nrows, ncols, hist_frac = hist_frac)
114 | 
115 |     # Subsample training data if specified
116 |     if n_train_users:
117 |         # Randomly sample training users - only keep their ratings
118 |         train_users = train_ratings[['user']].sample(n = n_train_users)
119 |         train_ratings = train_ratings.merge(train_users, on = 'user', how = 'right')
120 | 
121 |     # Generate historical matrix for training ratings
122 |     X_train, _ = generate_eval_format(train_ratings, nrows, ncols, hist_frac = 1.)
123 | 
124 |     return X_train, X_val, val_dict, X_test, test_dict
125 | 
def train_val_test_split_loocb(ratings, n_train_users = 0):
    ''' Split into train/validation/test ratings via leave-one-out.

        For every user one random rating is held out for validation and, from the
        remainder, one more for testing; all other ratings form the history.

        ratings:       DataFrame with 'user' and 'item' columns.
        n_train_users: when non-zero, the training matrix only contains the ratings of
                       (at most) this many randomly drawn distinct users.

        Returns X_train, X_hist, val_dict, X_hist, test_dict - the same (non-subsampled)
        history matrix X_hist is returned for both validation and test. '''
    # For every user - randomly sample a single item for test and validation
    val_ratings = ratings.groupby('user').apply(lambda df: df.sample(1)).reset_index(drop = True)
    rest_ratings = pd.concat([val_ratings, ratings]).drop_duplicates(keep = False)

    test_ratings = rest_ratings.groupby('user').apply(lambda df: df.sample(1)).reset_index(drop = True)
    train_ratings = pd.concat([test_ratings, rest_ratings]).drop_duplicates(keep = False)

    # Generate historical matrix and hold-out dictionaries
    nrows, ncols = ratings['user'].max() + 1, ratings['item'].max() + 1
    X_hist, _ = generate_eval_format(train_ratings, nrows, ncols, hist_frac = 1.)
    _, val_dict = generate_eval_format(val_ratings, nrows, ncols, hist_frac = 0.)
    _, test_dict = generate_eval_format(test_ratings, nrows, ncols, hist_frac = 0.)

    # Subsample training data if specified
    if n_train_users:
        # Sample from the *distinct* users: drawing from rating rows could pick a user more than
        # once, and the duplicated ratings that follow would all be removed again by the
        # drop_duplicates(keep = False) in generate_eval_format.
        candidate_users = train_ratings['user'].drop_duplicates()
        train_users = candidate_users.sample(n = min(n_train_users, len(candidate_users)))
        train_ratings = train_ratings[train_ratings['user'].isin(train_users)]

    # Generate historical matrix for (possibly subsampled) training ratings
    X_train, _ = generate_eval_format(train_ratings, nrows, ncols, hist_frac = 1.)

    return X_train, X_hist, val_dict, X_hist, test_dict
151 | 
def evaluate(X, scores, test, k_values = [1, 5, 10, 20, 50, 100], compute_item_counts = True):
    ''' Evaluate prediction 'scores' against the user to held-out item dictionary 'test'.

        X:        historical user-item matrix (not used by the computation itself).
        scores:   2-D (users x items) dense score matrix; row i is evaluated against the
                  i-th entry of 'test' in its iteration order.
        test:     dictionary mapping user -> set of held-out items.
        k_values: cut-offs K for Recall@K and NDCG@K.
        compute_item_counts: whether to count how often every item shows up in a top-100.

        Returns (recall, NDCG, item2count). recall and NDCG map K to the mean metric over
        all users; recall is normalised by min(K, number of held-out items).
        item2count is a 1-D array with per-item top-100 counts when 'compute_item_counts'
        is set and 100 is one of the cut-offs, otherwise an empty (1 x items) sparse placeholder. '''
    # Placeholder for results
    recall = defaultdict(float)
    NDCG = defaultdict(float)
    # One column per item (the second axis of 'scores')
    item2count = csr_matrix((1, scores.shape[1]))

    # Function per user to parallellise
    def evaluate_user(scores, items, k_values = k_values):
        # Placeholder for results per user
        item2count = None
        recall = []
        NDCG = []
        # NDCG discounts are identical for every cut-off - compute them once for the largest K
        discount_template = 1. / np.log2(np.arange(2, max(k_values, default = 0) + 2))
        # Top-K for multiple K's
        for K in k_values:
            ##########
            # RECALL #
            ##########
            # Extract top-K highest scores (unordered)
            topK_list = np.argpartition(scores, -K)[-K:]
            recall.append(len(set(topK_list).intersection(items)) / min(K, len(items)))
            ########
            # NDCG #
            ########
            # Order the top-K by descending score
            topK_list = topK_list[np.argsort(scores[topK_list])][::-1]
            # Ideal DCG: all held-out items (at most K) ranked first
            IDCG = discount_template[:min(K, len(items))].sum()
            DCG = sum((discount_template[rank] * (item in items)) for rank, item in enumerate(topK_list))
            NDCG.append(DCG / IDCG)
            #############
            # LONG TAIL #
            #############
            if K == 100:
                item2count = coo_matrix(([1] * K,([0] * K,topK_list)), shape = (1, scores.shape[0]))
        # Recall values first, NDCG values second
        return recall + NDCG, item2count

    # Parallellise over users; np.asarray(...).ravel() flattens a row of either np.matrix or ndarray
    val = Parallel(n_jobs=-1)(delayed(evaluate_user)(np.asarray(scores[new_row,:]).ravel(), items, k_values) for new_row, items in tqdm(enumerate(test.values()), total = len(test)))
    # Item counts are only collected at K == 100 - without that cut-off there is nothing to stack
    if compute_item_counts and 100 in k_values:
        item2count = vstack([v[1] for v in val]).sum(axis=0).A1
    # Merge evaluation-metrics per user
    val = np.vstack([v[0] for v in val])
    for idx, K in enumerate(k_values):
        recall[K] = np.mean(val[:,idx])
        NDCG[K] = np.mean(val[:,idx+len(k_values)])
    return recall, NDCG, item2count
209 | 


--------------------------------------------------------------------------------
/src/TrainModel.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import gc
  3 | import itertools
  4 | import numpy as np
  5 | import os
  6 | import pandas as pd
  7 | import pickle
  8 | import time
  9 | from copy import deepcopy
 10 | from datetime import datetime
 11 | from scipy.sparse import load_npz, vstack
 12 | from sklearn.preprocessing import LabelEncoder, normalize
 13 | import sys
 14 | import tensorflow.compat.v1 as tf
 15 | tf.disable_v2_behavior()
 16 | import torch
 17 | from tqdm import trange
 18 | 
 19 | import models
 20 | import util
 21 | 
 22 | # Only needed for tensorflow when 1st GPU is already in use
 23 | #import os
 24 | #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 25 | #os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 26 | 
 27 | if __name__ == '__main__':
 28 |     # Commandline arguments
 29 |     parser = argparse.ArgumentParser()
 30 |     parser.add_argument('dir', type = str, help = 'Directory containing the data')
 31 |     parser.add_argument('-a', '--algorithms', nargs='+', help='Algorithms to train and evaluate')
 32 |     parser.add_argument('-frac', '--frac_train_users', nargs='+', help='Fractions of training users to go over', default = [0.01, .05, .1, .25, .5, 1.])
 33 |     parser.add_argument('-l2', '--l2_values', nargs='+', help='Values to test for EASE\'s l2 regularisation strength', default = [50, 100, 200, 500, 1000])
 34 |     parser.add_argument('-alpha', '--alpha_values', nargs='+', help='Values to test for EASE\'s alpha side-info weight', default = np.linspace(.0,1.,21))
 35 |     parser.add_argument('-eval_style', help='Evaluation style - either strong generalisation or LOOCV', default = 'strong')
 36 |     args = parser.parse_args()
 37 | 
 38 |     # Fix seed for reproducibility
 39 |     np.random.seed(42)
 40 |     torch.manual_seed(42)
 41 |     tf.set_random_seed(42)
 42 | 
 43 |     # Check whether output directory already exists - make it if necessary
 44 |     if not os.path.exists(args.dir + 'preprocessed/'):
 45 |         print('Directory {0} not found.\nPlease run the accompanying preprocessing script first.'.format(args.dir + 'preprocessed/'))
 46 |         exit(1)
 47 |     
 48 |     print('Directory with data:', args.dir)
 49 |     print('Models to evaluate:', args.algorithms)
 50 |     print('Evaluation style:', args.eval_style )
 51 | 
 52 |     # Load everything you need
 53 |     print(datetime.now(), 'Loading in data...')
 54 |     X_meta = load_npz(args.dir + 'preprocessed/X_meta.npz').astype(np.int32)
 55 |     X_train = load_npz(args.dir + 'preprocessed/X_train.npz').astype(np.int32)
 56 |     if args.eval_style == 'strong':
 57 |         X_val = load_npz(args.dir + 'preprocessed/X_val.npz').astype(np.int32)
 58 |         X_test = load_npz(args.dir + 'preprocessed/X_test.npz').astype(np.int32)
 59 |     elif args.eval_style == 'LOOCV':
 60 |         X_val = X_train
 61 |         X_test = X_train
 62 |     else:
 63 |         print('Unknown evaluation style, aborting...')
 64 |         exit(1)
 65 | 
 66 |     with open(args.dir + 'preprocessed/val_dict.pkl', 'rb') as handle:
 67 |         val_dict = pickle.load(handle)
 68 |     with open(args.dir + 'preprocessed/test_dict.pkl', 'rb') as handle:
 69 |         test_dict = pickle.load(handle)
 70 | 
 71 |     # Check whether output directory already exists - make it if necessary
 72 |     if not os.path.exists(args.dir + 'results/'):
 73 |         os.makedirs(args.dir + 'results/')
 74 |         
 75 |     # For every sampled subset of training users
 76 |     subsampling_fractions = args.frac_train_users if args.eval_style == 'strong' else [1.]
 77 |     for frac_train_users in subsampling_fractions:
 78 |         frac_train_users = float(frac_train_users)
 79 |         print(datetime.now(), '---- Frac of train users:\t{} ----'.format(frac_train_users))
 80 |     
 81 |         # Placeholder for results
 82 |         results = []
 83 |     
 84 |         # Normally, we train on everything
 85 |         train_users = np.unique(X_train.nonzero()[0])
 86 |         # But subsample when necessary
 87 |         if frac_train_users < 1.:
 88 |             # Read training users
 89 |             train_users = pd.read_csv(args.dir + 'preprocessed/train_users_{}.csv'.format(frac_train_users))['user'].values.astype(np.int32)
 90 |         # Only keep these rows in X_train
 91 |         X_train_subset = deepcopy(X_train[train_users,:])
 92 | 
 93 |         print('\tTraining data # users:', X_train_subset.shape[0])
 94 |         print('\tTraining data # prefs:', X_train_subset.count_nonzero())
 95 |         print('\tSide-information # tags:', X_meta.shape[0])
 96 |         print('\tSide-information # pairs:', X_meta.count_nonzero())
 97 |         print('\tTraining and evaluating models...')  
 98 |         
 99 |         for algo in args.algorithms:
100 |             print('\t\t---- {0} ----'.format(algo))
101 |             if algo == 'cosine':
102 |                 ###########################################
103 |                 # 1. Item-kNN (cosine) (Sarwar, WWW 2001) #
104 |                 ###########################################
105 |                 # Get dictionary with results - 0 is recall, 1 is NDCG, 2 is item counts for analysis
106 |                 recall, ndcg, item2count = models.run_itemknn(X_train_subset, X_test, test_dict)
107 |                 util.pretty_print_results((recall, ndcg))
108 |                 results.append({
109 |                     'Recall@20': recall[20],
110 |                     'Recall@50': recall[50],
111 |                     'NDCG@100': ndcg[100],
112 |                     'frac_U': frac_train_users,
113 |                     'Alg': 'Item-kNN'
114 |                 })
115 |                 np.savez(args.dir + 'results/item_counts_{0}_{1}.npz'.format(algo, frac_train_users), item2count)
116 | 
117 |             elif algo == 'cvae':
118 |                 ####################################
119 |                 # 2. cVAE (Chen, DLRS@RecSys 2018) #
120 |                 ####################################
121 |                 # Get dictionary with results - 0 is recall, 1 is NDCG, 2 is item counts for analysis
122 |                 recall, ndcg, _ = models.run_cVAE(X_train_subset, X_meta, X_val, X_test, val_dict, test_dict)
123 |                 util.pretty_print_results((recall, ndcg))
124 |                 results.append({
125 |                     'Recall@20': recall[20],
126 |                     'Recall@50': recall[50],
127 |                     'NDCG@100': ndcg[100],
128 |                     'frac_U': frac_train_users,
129 |                     'Alg': 'cVAE'
130 |                 })
131 | 
132 |             elif algo == 'vlm':
133 |                 ###################################################################
134 |                 # 3.a VARIATONAL LOW-RANK MULTINOMIALS (Elahi, RecSys 2019) #
135 |                 ###################################################################
136 |                 # Original implementation in Tensorflow
137 |                 # Get dictionary with results - 0 is recall, 1 is NDCG, 2 is item counts for analysis
138 |                 print('------------------ WITH SIDE INFO ---------------')
139 |                 recall, ndcg, _ = models.run_VLM(X_train_subset, train_users, normalize(X_meta, norm = 'l1', axis = 0), X_val, X_test, val_dict, test_dict, side_info = True)
140 |                 util.pretty_print_results((recall, ndcg))
141 |                 results.append({
142 |                     'Recall@20': recall[20],
143 |                     'Recall@50': recall[50],
144 |                     'NDCG@100': ndcg[100],
145 |                     'frac_U': frac_train_users,
146 |                     'Alg': 'VLM-Side'
147 |                 })
148 |                 print('------------------ WITHOUT SIDE INFO ---------------')
149 |                 recall, ndcg, _ = models.run_VLM(X_train_subset, train_users, normalize(X_meta, norm = 'l1', axis = 0), X_val, X_test, val_dict, test_dict, side_info = False)
150 |                 util.pretty_print_results((recall, ndcg))
151 |                 results.append({
152 |                     'Recall@20': recall[20],
153 |                     'Recall@50': recall[50],
154 |                     'NDCG@100': ndcg[100],
155 |                     'frac_U': frac_train_users,
156 |                     'Alg': 'VLM-NoSide'
157 |                 })
158 | 
159 |             elif algo == 'vlm_pytorch':
160 |                 ###################################################################
161 |                 # 3.b VARIATONAL LOW-RANK MULTINOMIALS (Elahi, RecSys 2019) #
162 |                 ###################################################################
163 |                 # Our implementation in PyTorch, generally quite a bit faster
164 |                 # Get dictionary with results - 0 is recall, 1 is NDCG, 2 is item counts for analysis
165 |                 print('------------------ WITH SIDE INFO ---------------')
166 |                 recall, ndcg, _ = models.run_VLM_PyTorch(X_train_subset, train_users, normalize(X_meta, norm = 'l1', axis = 0), X_val, X_test, val_dict, test_dict, side_info = True, eval_style = args.eval_style)
167 |                 util.pretty_print_results((recall, ndcg))
168 |                 results.append({
169 |                     'Recall@20': recall[20],
170 |                     'Recall@50': recall[50],
171 |                     'NDCG@100': ndcg[100],
172 |                     'frac_U': frac_train_users,
173 |                     'Alg': 'VLM-PyTorch-Side'
174 |                 })
175 |                 print('------------------ WITHOUT SIDE INFO ---------------')
176 |                 recall, ndcg, _ = models.run_VLM_PyTorch(X_train_subset, train_users, normalize(X_meta, norm = 'l1', axis = 0), X_val, X_test, val_dict, test_dict, side_info = False, eval_style = args.eval_style)
177 |                 util.pretty_print_results((recall, ndcg))
178 |                 results.append({
179 |                     'Recall@20': recall[20],
180 |                     'Recall@50': recall[50],
181 |                     'NDCG@100': ndcg[100],
182 |                     'frac_U': frac_train_users,
183 |                     'Alg': 'VLM-PyTorch-NoSide'
184 |                 })
185 |             
186 |             elif algo == 'slim':
187 |                 #########################################
188 |                 # 4. (c)SLIM (Ning and Karypis, ICDM 2011) #
189 |                 #########################################
190 |                 # Get dictionary with results - 0 is recall, 1 is NDCG, 2 is item counts for analysis
191 |                 print('------------------ WITHOUT SIDE INFO ---------------')
192 |                 recall, ndcg, _ = models.run_SLIM(X_train_subset, train_users, X_meta, X_val, X_test, val_dict, test_dict, side_info = False, eval_style = args.eval_style)
193 |                 util.pretty_print_results((recall, ndcg))
194 |                 results.append({
195 |                     'Recall@20': recall[20],
196 |                     'Recall@50': recall[50],
197 |                     'NDCG@100': ndcg[100],
198 |                     'frac_U': frac_train_users,
199 |                     'Alg': 'SLIM'
200 |                 })
201 |                 print('------------------ WITH SIDE INFO ---------------')
202 |                 recall, ndcg, _ = models.run_SLIM(X_train_subset, train_users, X_meta, X_val, X_test, val_dict, test_dict, side_info = True, eval_style = args.eval_style)
203 |                 util.pretty_print_results((recall, ndcg))
204 |                 results.append({
205 |                     'Recall@20': recall[20],
206 |                     'Recall@50': recall[50],
207 |                     'NDCG@100': ndcg[100],
208 |                     'frac_U': frac_train_users,
209 |                     'Alg': 'cSLIM'
210 |                 })
211 | 
212 |             elif algo == 'ease':
213 |                 #############################
214 |                 ## 4. EASE (Steck, WWW 2019) #
215 |                 #############################
216 |                 print(datetime.now(), '---- EASE ----')
217 |                 # Find optimal l2 on validation set via grid-search for optimal NDCG@100
218 |                 NDCG_values = []
219 |                 optimal_model_EASE = None
220 |                 optimal_l2_value = None
221 |                 val_users = list(val_dict.keys())
222 |                 # For every parameter combination
223 |                 for l2 in args.l2_values:
224 |                     # Compute the model
225 |                     start = time.perf_counter()
226 |                     model = util.compute_EASE(X_train_subset, l2 = int(l2))
227 |                     end = time.perf_counter()
228 |                     print('\t... took {0} seconds!'.format(end - start))
229 |                     # Evaluate the model
230 |                     val_scores = X_val[val_users,:] @ model - 987654321 * X_val[val_users,:]
231 |                     NDCG = util.evaluate(X_val, val_scores, val_dict)[1][100]
232 |                     NDCG_values.append(NDCG)
233 |                     print('\tL2:', l2, 'NDCG@100:', NDCG)
234 |                     if np.max(NDCG_values) == NDCG:
235 |                         optimal_model_EASE = model
236 |                         optimal_l2_value = int(l2)
237 |                 # Compute prediction scores for all test users - subtract already seen items
238 |                 test_users = list(test_dict.keys())
239 |                 test_scores = X_test[test_users,:] @ optimal_model_EASE - 987654321 * X_test[test_users,:]
240 |                 recall, ndcg, item2count = util.evaluate(X_test, test_scores, test_dict)
241 |                 util.pretty_print_results((recall, ndcg))
242 |                 results.append({
243 |                     'Recall@20': recall[20],
244 |                     'Recall@50': recall[50],
245 |                     'NDCG@100': ndcg[100],
246 |                     'frac_U': frac_train_users,
247 |                     'Alg': 'EASE'
248 |                 })
249 |                 #np.savez(args.dir + 'results/item_counts_{0}_{1}.npz'.format('ease', frac_train_users), item2count)
250 |                 ############################
251 |                 # 5. Add-EASE (contrib. 1) #
252 |                 ############################
253 |                 print(datetime.now(), '---- Add-EASE ----')
254 |                 NDCG_values = []
255 |                 optimal_model_ADDEASE = None
256 |                 # Compute EASE model on tag-item matrix
257 |                 side_model = util.compute_EASE(X_meta, l2 = optimal_l2_value)
258 |                 # For every parameter combination
259 |                 for alpha in args.alpha_values:
260 |                     # Blend
261 |                     model = (1. - float(alpha)) * optimal_model_EASE + float(alpha) * side_model
262 |                     # Evaluate the model
263 |                     val_scores = X_val[val_users,:] @ model - 987654321 * X_val[val_users,:]
264 |                     NDCG = util.evaluate(X_val, val_scores, val_dict)[1][100]
265 |                     NDCG_values.append(NDCG)
266 |                     print('\tAlpha:', alpha, 'L2:', optimal_l2_value, 'NDCG@100:', NDCG)
267 |                     if np.max(NDCG_values) == NDCG:
268 |                         optimal_model_ADDEASE = model
269 |                 # Compute prediction scores for all test users - subtract already seen items
270 |                 test_scores = X_test[test_users,:] @ optimal_model_ADDEASE - 987654321 * X_test[test_users,:]
271 |                 recall, ndcg, item2count = util.evaluate(X_test, test_scores, test_dict)
272 |                 util.pretty_print_results((recall, ndcg))
273 |                 results.append({
274 |                     'Recall@20': recall[20],
275 |                     'Recall@50': recall[50],
276 |                     'NDCG@100': ndcg[100],
277 |                     'frac_U': frac_train_users,
278 |                     'Alg': 'Add-EASE'
279 |                 })
280 |                 #np.savez(args.dir + 'results/item_counts_{0}_{1}.npz'.format('addease', frac_train_users), item2count)
281 |                 del optimal_model_EASE, side_model, optimal_model_ADDEASE
282 | 
283 |                 #########################
284 |                 # 6. cEASE (contrib. 2) #
285 |                 #########################
286 |                 print(datetime.now(), '---- cEASE ----')
287 |                 NDCG_values = []
288 |                 optimal_model_CEASE = None
289 |                 # For every parameter combination
290 |                 for alpha in args.alpha_values:
291 |                     # Stack matrix 
292 |                     X_full = vstack((X_train_subset, X_meta * float(alpha)))
293 |                     # Compute the model
294 |                     model = util.compute_EASE(X_full, l2 = optimal_l2_value)
295 |                     # Evaluate the model
296 |                     val_scores = X_val[val_users,:] @ model - 987654321 * X_val[val_users,:]
297 |                     NDCG = util.evaluate(X_val, val_scores, val_dict)[1][100]
298 |                     NDCG_values.append(NDCG)
299 |                     print('\tAlpha:', alpha, 'L2:', optimal_l2_value, 'NDCG@100:', NDCG)
300 |                     if np.max(NDCG_values) == NDCG:
301 |                         optimal_model_CEASE = model
302 |                 # Compute prediction scores for all test users - subtract already seen items
303 |                 test_scores = X_test[test_users,:] @ optimal_model_CEASE - 987654321 * X_test[test_users,:]
304 |                 recall, ndcg, item2count = util.evaluate(X_test, test_scores, test_dict)
305 |                 util.pretty_print_results((recall, ndcg))
306 |                 results.append({
307 |                     'Recall@20': recall[20],
308 |                     'Recall@50': recall[50],
309 |                     'NDCG@100': ndcg[100],
310 |                     'frac_U': frac_train_users,
311 |                     'Alg': 'cEASE'
312 |                 })
313 |                 #np.savez(args.dir + 'results/item_counts_{0}_{1}.npz'.format('cease', frac_train_users), item2count)
314 |                 del optimal_model_CEASE
315 |                 gc.collect()
316 | 
317 |             else:
318 |                 print('\t\t\tUnknown algorithm, skipping...')
319 | 
320 |         # Write out results
321 |         pd.DataFrame(results).to_csv(args.dir + 'results/{0}_{1}.csv'.format('_'.join(args.algorithms),str(frac_train_users)), index = False)
322 | 


--------------------------------------------------------------------------------
/src/models.py:
--------------------------------------------------------------------------------
  1 | from copy import deepcopy
  2 | import itertools
  3 | import numpy as np
  4 | import pandas as pd
  5 | import util
  6 | from scipy.sparse import lil_matrix, coo_matrix, csr_matrix, vstack
  7 | from sklearn.preprocessing import normalize
  8 | import tensorflow.compat.v1 as tf
  9 | tf.disable_v2_behavior()
 10 | import torch
 11 | import torch.utils.data
 12 | from tqdm import trange
 13 | 
 14 | from baselines.cvae.vae import VAE
 15 | from baselines.vlm.vlm import VLM
 16 | from baselines.vlm.vlm_pytorch import VLM_PyTorch
 17 | 
 18 | from SLIM import SLIM, SLIMatrix
 19 | 
 20 | def run_itemknn(X_train, X_test, test_dict):
 21 |     # Compute item-item matrix with cosine similarities
 22 |     S_cosine = util.compute_cosine(X_train)
 23 | 
 24 |     # Compute prediction scores for all test users - subtract already seen items
 25 |     test_users = list(test_dict.keys())
 26 |     test_scores = X_test[test_users,:] @ S_cosine - 987654321 * X_test[test_users,:]
 27 |     
 28 |     # Evaluate and pretty print
 29 |     results_cosine = util.evaluate(X_test, test_scores, test_dict)
 30 |     return results_cosine
 31 | 
 32 | def run_cVAE(X_train, X_meta, X_val, X_test, val_dict, test_dict):
 33 |     # Parameters for cVAE
 34 |     device = 'cuda' if torch.cuda.is_available() else 'cpu'
 35 |     params = {
 36 |         'layers': [400, 100],
 37 |         'n_items': X_meta.shape[1],
 38 |         'device': device 
 39 |     }
 40 |    
 41 |     batch_size = 1024
 42 |     tol = 1e-7
 43 |     patience = 200
 44 |     alpha = 1.
 45 |     beta = 1.
 46 |     # Instantiate model
 47 |     model = VAE(params).to(device)
 48 |     
 49 |     # Multi-GPU 
 50 |     if torch.cuda.device_count() > 1:
 51 |         model = torch.nn.DataParallel(model)
 52 |         batch_size = int(batch_size * 2)
 53 | 
 54 |     # Set up optimizer
 55 |     optimizer = torch.optim.Adam(model.parameters(),lr=5e-4)
 56 | 
 57 |     # Loss function is MSE and annealed ELBO
 58 |     def loss_function(recon_x, x, mu, logvar, anneal=1.0):
 59 |         MSE = torch.sum((x - torch.sigmoid(recon_x)) ** 2)
 60 |         KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
 61 |         return MSE + anneal * KLD
 62 | 
 63 |     # Pre-train on meta-data
 64 |     train_tensor = torch.from_numpy(X_meta.A.astype('float32')).to(device)
 65 |     train_loader = torch.utils.data.DataLoader(train_tensor, batch_size, shuffle=True)
 66 |     t = trange(2000, desc = 'Meta')
 67 |     best_loss, best_epoch = np.inf, -1
 68 |     for epoch in t:
 69 |         # Put the model into training mode
 70 |         model.train()
 71 |         loss_value = 0
 72 |         # Every batch
 73 |         for batch_idx, data in enumerate(train_loader):
 74 |             # Clear gradients
 75 |             optimizer.zero_grad()
 76 |             # Get predictions
 77 |             recon_batch, mu, logvar = model(data)
 78 |             # Compute loss
 79 |             loss = loss_function(recon_batch, data, mu, logvar, anneal = 0.2)
 80 |             # Back-propagate 
 81 |             loss.backward()
 82 |             loss_value += loss.item()
 83 |             optimizer.step()
 84 |         loss = loss_value / len(train_loader.dataset)
 85 |         t.set_postfix(loss = loss)
 86 |         # Early stopping - are we improving by at least 'tol'?
 87 |         if (best_loss - loss) > tol:
 88 |             # If yes - keep going
 89 |             best_loss = loss
 90 |             best_epoch = epoch
 91 |         # If we're not improving, have we improved at all in the past 'patience' epochs?
 92 |         if (epoch - best_epoch) > patience:
 93 |             print('Converged after {0} epochs, stopping...'.format(epoch))
 94 |             break
 95 | 
 96 |     del train_tensor, train_loader
 97 |     torch.cuda.empty_cache()
 98 |     
 99 |     # Loss function is cross-entropy and annealed ELBO
100 |     def loss_function(recon_x, x, mu, logvar, anneal=1., alpha = 1.):
101 |         BCE = -torch.sum(alpha * torch.log(torch.sigmoid(recon_x) + 1e-8) * x + torch.log(1 - torch.sigmoid(recon_x) + 1e-8) * (1 - x))
102 |         KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
103 |         return BCE + anneal * KLD
104 |     
105 |     # Refine on ratings
106 |     # Get rating data into torch format
107 |     train_tensor = torch.from_numpy(X_train.A.astype('float32')).to(device)
108 |     train_loader = torch.utils.data.DataLoader(train_tensor, batch_size, shuffle=True)
109 |     t = trange(2000, desc = 'Pref')
110 |     best_loss, best_epoch = np.inf, -1
111 |     for epoch in t:
112 |         # Put the model into training mode
113 |         model.train()
114 |         loss_value = 0
115 |         # Every batch
116 |         for batch_idx, data in enumerate(train_loader):
117 |             # Clear gradients
118 |             optimizer.zero_grad()
119 |             # Get predictions
120 |             recon_batch, mu, logvar = model(data)
121 |             # Compute loss
122 |             loss = loss_function(recon_batch, data, mu, logvar, anneal = beta, alpha = alpha)
123 |             # Back-propagate 
124 |             loss.backward()
125 |             loss_value += loss.item()
126 |             optimizer.step()
127 |         loss = loss_value / len(train_loader.dataset)
128 |         t.set_postfix(loss = loss)
129 |         # Early stopping - are we improving by at least 'tol'?
130 |         if (best_loss - loss) > tol:
131 |             # If yes - keep going
132 |             best_loss = loss
133 |             best_epoch = epoch
134 |         # If we're not improving, have we improved at all in the past 'patience' epochs?
135 |         if (epoch - best_epoch) > patience:
136 |             print('Converged after {0} epochs, stopping...'.format(epoch))
137 |             break
138 |         
139 |     del train_tensor, train_loader
140 |     torch.cuda.empty_cache()
141 |     
142 |     # Scores for test set
143 |     test_users = list(test_dict.keys())
144 |     with torch.no_grad():
145 |         test_tensor = torch.from_numpy(X_test[test_users,:].A.astype('float32')).to(device)
146 |         test_loader = torch.utils.data.DataLoader(test_tensor, batch_size, shuffle=False)
147 |         pred_test = []
148 |         for batch_idx, data in enumerate(test_loader):
149 |             scores, _, _ = model(data)
150 |             pred_test.append(scores.detach().cpu().numpy())
151 |         pred_test = np.vstack(pred_test)
152 |     # Subtract previously seen items from predicted scores
153 |     test_scores = pred_test - 987654321 * X_test[test_users,:]
154 |     results_cVAE = util.evaluate(X_test, test_scores, test_dict)
155 |     return results_cVAE
156 | 
def run_VLM(X_train_subset, train_users, X_meta, X_val, X_test, val_dict, test_dict, side_info = True):
    """Train the TensorFlow VLM baseline and evaluate it on the test users.

    Every epoch first updates only the user factors (``Mu_Zu`` / ``lsdev_Zu``)
    of the test users, then runs a full optimisation pass over the training
    users. After the last epoch the test users are scored, items already
    present in ``X_test`` are pushed down by a large constant, and the result
    is handed to ``util.evaluate``.

    Args:
        X_train_subset: sparse matrix with one row per entry of ``train_users``.
        train_users: row indices (into a matrix shaped like ``X_val``) that the
            rows of ``X_train_subset`` belong to. Not modified by this function.
        X_meta: sparse tag-by-item matrix; replaced by a single all-zero tag
            row when ``side_info`` is False.
        X_val: sparse user-by-item matrix (same shape as ``X_test``).
        X_test: sparse user-by-item matrix holding the test users' known items.
        val_dict: not used by this function.
        test_dict: dict whose keys are the row indices of the test users.
        side_info: whether ``X_meta`` should be used.

    Returns:
        Whatever ``util.evaluate(X_test, test_scores, test_dict)`` returns.
    """
    # Parameters for VLM
    var_prior = 1.0
    lr = 5e-3
    reg = 1e-9
    num_factors = 100
    batch_size = 512
    num_epochs = 1500

    if not side_info:
        # The model still expects an item-tag matrix: give it one empty tag
        X_meta = csr_matrix((1,X_train_subset.shape[1]))

    test_users = list(test_dict.keys())

    # X_train_subset only has relevant non-zero rows at the moment
    # We want a matrix with zeroes everywhere but these training vectors in the right spot
    X_train_full = lil_matrix(X_val.shape).astype(np.int32)
    X_train_full[train_users,:] = X_train_subset
    X_train_full = X_train_full.tocsr()
    X_all = (X_train_full+X_val+X_test).toarray().astype(np.float32)
    video_metadata_array = X_meta.T.todense().astype(np.float32)

    ##############################
    # TRAINING PROCEDURE FOR VLM #
    ##############################
    tf.reset_default_graph()

    # Instantiate TensorFlow Execution DAG
    with tf.Graph().as_default():
        # Generate model
        model = VLM(X_test.shape[0], # Num users
                    X_test.shape[1], # Num items
                    X_meta.shape[0], # Num tags
                    num_factors,
                    var_prior,
                    reg,
                    video_metadata_array)

        # Initialise model - only the scoring logits and the loss are needed here
        _, batch_logits_validation, _, avg_loss, _, _, _ = model.construct_graph()

        # Optimisation procedure for training users
        train_op = tf.train.AdamOptimizer(learning_rate=lr)\
        .minimize(avg_loss, global_step=tf.Variable(0, name='global_step_1', trainable=False))

        # Optimisation procedure for validation and test users (keep items/tags fixed)
        train_op_validation = tf.train.AdamOptimizer(learning_rate=lr)\
        .minimize(avg_loss,
                  var_list = [model.Mu_Zu, model.lsdev_Zu],
                  global_step=tf.Variable(0, name='global_step_1_validation', trainable=False))

        ####Start####
        # Must be created after the optimisers so their slot variables are initialised too
        init = tf.global_variables_initializer()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # Initialise session
        with tf.Session(config=config) as sess:

            sess.run(init)
            # For every epoch
            progress_bar = trange(num_epochs)
            for epoch_ind in progress_bar:
                ####################################
                ## COMPUTATIONS FOR TEST SET #
                ####################################
                ###Optimize parameters for test users ####
                for st_index in range(0, len(test_users), batch_size):
                    # Put batch into the right format
                    end_index = min(st_index + batch_size, len(test_users))
                    user_indices = test_users[st_index:end_index]
                    # Optimise user factors only, on the test users' known items
                    sess.run(train_op_validation, feed_dict = {model.users_ph : user_indices, model.played_videos_ph : X_all[user_indices]})

                ##################################
                ## COMPUTATIONS FOR TRAINING SET #
                ##################################
                avg_loss_dataset = 0
                num_batches = 0
                # Draw a shuffled *copy*: shuffling the caller's train_users in place
                # would break its row alignment with X_train_subset after this call
                epoch_users = np.random.permutation(train_users)
                ## For every training batch
                for st_index in range(0, len(epoch_users), batch_size):
                    # Put batch into the right format
                    end_index = min(st_index + batch_size, len(epoch_users))
                    user_indices = epoch_users[st_index:end_index]
                    # Optimise all parameters for training data
                    _, loss_val = sess.run([train_op, avg_loss], feed_dict = {model.users_ph : user_indices, model.played_videos_ph : X_all[user_indices,:]})
                    avg_loss_dataset += loss_val
                    num_batches += 1
                # Average out loss
                avg_loss_dataset = avg_loss_dataset / max(num_batches, 1)
                ####Summary####
                progress_bar.set_postfix(loss = avg_loss_dataset)

            # Score all items for the test users
            predictions_test = sess.run(batch_logits_validation, feed_dict = {model.users_ph: test_users})
    # Clear model, memory and such
    tf.reset_default_graph()

    # Subtract previously seen items from predicted scores
    test_scores = predictions_test - 987654321 * X_test[test_users,:]
    VLM_results = util.evaluate(X_test, test_scores, test_dict)
    return VLM_results
274 | 
def run_VLM_PyTorch(X_train, train_users, X_meta, X_val, X_test, val_dict, test_dict, side_info, eval_style = 'strong'):
    """Train the PyTorch VLM baseline and evaluate it on the test users.

    The model is first fitted on the training users with early stopping on the
    training loss. When ``eval_style == 'strong'`` the user factors
    (``Mu_Zu`` / ``lsdev_Zu``) are then fitted on the test users' rows of
    ``X_test`` while all other parameters are left out of the optimiser.
    Finally the test users are scored without sampling noise, items already
    present in ``X_test`` are pushed down by a large constant, and the result
    is handed to ``util.evaluate``.

    Args:
        X_train: sparse matrix with one row per entry of ``train_users``.
        train_users: integer user indices matching the rows of ``X_train``.
        X_meta: sparse tag-by-item matrix.
        X_val: sparse user-by-item matrix; only its shape is used here.
        X_test: sparse user-by-item matrix holding the test users' known items.
        val_dict: not used by this function.
        test_dict: dict whose keys are the row indices of the test users.
        side_info: forwarded to the model; also decides whether the tag
            embeddings are part of the regulariser.
        eval_style: ``'strong'`` fits user factors for the test users before
            scoring; any other value skips that step.

    Returns:
        Whatever ``util.evaluate(X_test, test_scores, test_dict)`` returns.
    """
    # Parameters for VLM
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    params = {
        'num_users': X_val.shape[0],
        'num_items': X_val.shape[1],
        'num_tags': X_meta.shape[0],
        'num_factors': 100,
        'var_prior': 1.0,
        'reg': 1e-9,
        'device': device,
        'item_tag_mat': X_meta.T.astype(np.float32),
        'side_info': side_info
    }
    lr = 3e-3
    num_epochs = 5000
    batch_size = 1024
    tol = 1e-8
    patience = 50

    # Instantiate model
    model = VLM_PyTorch(params).to(device)

    # Multi-GPU if possible
    multi_gpu = torch.cuda.device_count() > 1
    if multi_gpu:
        model = torch.nn.DataParallel(model)
        batch_size = int(batch_size * 2)
    # Handle on the underlying module, whose embeddings are needed directly
    core_model = model.module if multi_gpu else model

    def compute_kl_div(lsdev_Zu_batch, Mu_Zu_batch, num_factors, var_prior):
        # KL divergence between the per-user posterior and the prior
        sdev_Zu_batch = torch.exp(lsdev_Zu_batch)
        comp1 = num_factors * (0.5 * np.log(var_prior) - lsdev_Zu_batch)
        comp2 = (num_factors / (2 * var_prior)) * sdev_Zu_batch.pow(2)
        comp3 = (1.0 / (2 * var_prior)) * torch.sum(Mu_Zu_batch.pow(2), dim = 1)
        comp4 = (num_factors / 2.0)
        return comp1 + comp2 + comp3 - comp4

    def loss_function(x, scores, Mu_Zu, lsdev_Zu, num_factors = params['num_factors'], var_prior = params['var_prior'], reg = params['reg']):
        # Negative per-item ELBO averaged over the batch, plus L2 penalty on item (and tag) embeddings
        scores = scores.masked_fill(~x,.0)
        batch_conditional_log_likelihood = torch.sum(scores, dim = 1)
        batch_kl_div = compute_kl_div(lsdev_Zu, Mu_Zu, num_factors, var_prior)
        # Clamp so that a user without any item does not turn the loss into inf/NaN
        items_per_user = torch.sum(x, dim = 1, dtype = torch.float).clamp(min = 1.0)
        batch_elbo = (1.0 / items_per_user) * (batch_conditional_log_likelihood - batch_kl_div)
        penalty = torch.norm(core_model.Mu_Zv.weight, 2)
        if side_info:
            penalty = penalty + torch.norm(core_model.Mu_Zt.weight, 2)
        return -1 * torch.mean(batch_elbo) + reg * penalty

    def fit(loader, optimizer, desc):
        # Optimise until the epoch loss stops improving by 'tol' for 'patience' epochs
        t = trange(num_epochs, desc = desc)
        best_loss, best_epoch = np.inf, -1
        for epoch in t:
            # Put the model into training mode
            model.train()
            loss_value = 0
            for data, users in loader:
                # Clear gradients
                optimizer.zero_grad()
                # Get predictions
                scores, Mu_Zu, lsdev_Zu = model(users)
                # Compute loss
                loss = loss_function(data, scores, Mu_Zu, lsdev_Zu)
                # Back-propagate
                loss.backward()
                loss_value += loss.item()
                optimizer.step()
            epoch_loss = loss_value / len(loader.dataset)
            t.set_postfix(loss = epoch_loss)

            # Early stopping - are we improving by at least 'tol'?
            if (best_loss - epoch_loss) > tol:
                best_loss, best_epoch = epoch_loss, epoch
            # If we're not improving, have we improved at all in the past 'patience' epochs?
            if (epoch - best_epoch) > patience:
                print('Converged after {0} epochs, stopping...'.format(epoch))
                break

    # Set up data for training (np.array copies, so the caller's indices are never aliased)
    train_tensor = torch.from_numpy(X_train.toarray().astype(bool)).to(device)
    train_user_ids = torch.from_numpy(np.array(train_users, dtype = np.int64)).to(device)
    train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(train_tensor, train_user_ids), batch_size, shuffle=True)

    # Optimise everything for training data
    fit(train_loader, torch.optim.Adam(model.parameters(),lr=lr), 'Train')

    del train_tensor, train_user_ids, train_loader
    torch.cuda.empty_cache()

    test_user_array = np.array(list(test_dict.keys()), dtype = np.int64)

    # If we have distinct train/test users - we should learn vectors for test users
    if eval_style == 'strong':
        test_tensor = torch.from_numpy(X_test[test_user_array,:].toarray().astype(bool)).to(device)
        test_user_ids = torch.from_numpy(test_user_array).to(device)
        test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(test_tensor, test_user_ids), batch_size, shuffle=True)
        # Only optimise user vectors for test data
        user_optimizer = torch.optim.Adam([core_model.Mu_Zu.weight, core_model.lsdev_Zu.weight],lr=lr)
        fit(test_loader, user_optimizer, 'Test ')
        del test_tensor, test_user_ids, test_loader

    # Scores for test set
    torch.cuda.empty_cache()
    with torch.no_grad():
        score_loader = torch.utils.data.DataLoader(torch.from_numpy(test_user_array).to(device), batch_size, shuffle=False)
        pred_test = []
        for users in score_loader:
            scores, _, _ = model(users, add_noise = False)
            pred_test.append(scores.detach().cpu().numpy())
        pred_test = np.vstack(pred_test)

    # Subtract previously seen items from predicted scores
    test_scores = pred_test - 987654321 * X_test[test_user_array,:]
    results_VLM = util.evaluate(X_test, test_scores, test_dict)
    return results_VLM
444 | 
def run_SLIM(X_train, train_users, X_meta, X_val, X_test, val_dict, test_dict, side_info, eval_style = 'strong'):
    """Grid-search SLIM on the validation users and evaluate the best model on the test users.

    For every (L1, L2, alpha) combination a SLIM model is trained on
    ``X_train`` (with ``alpha * X_meta`` stacked underneath when ``side_info``
    is set), converted to a dense item-by-item matrix via a round-trip through
    ``slim_model.csr`` on disk, and scored by NDCG@100 on the validation
    users. The configuration with the highest NDCG (the latest one on ties)
    is then used to score the test users.

    Args:
        X_train: sparse user-by-item training matrix.
        train_users: not used by this function.
        X_meta: sparse matrix with the same number of columns as ``X_train``.
        X_val: sparse user-by-item matrix holding the validation users' known items.
        X_test: sparse user-by-item matrix holding the test users' known items.
        val_dict: dict whose keys are the row indices of the validation users.
        test_dict: dict whose keys are the row indices of the test users.
        side_info: whether to append ``X_meta`` to the training matrix and
            search over ``alpha``.
        eval_style: not used by this function.

    Returns:
        Whatever ``util.evaluate(X_test, test_scores, test_dict)`` returns.

    Raises:
        RuntimeError: if no configuration produced a comparable (non-NaN) NDCG.
    """
    num_items = X_train.shape[1]

    def read_csr(filename):
        # One matrix row per text line; the first space-separated token is
        # skipped, the rest are read as alternating (column, value) pairs
        all_rows = []
        all_cols = []
        all_vals = []
        with open(filename, 'r') as f:
            for i, line in enumerate(f):
                strs = line.split(' ')
                cols = [int(s) for s in strs[1::2]]
                vals = [float(s) for s in strs[2::2]]
                all_cols.extend(cols)
                all_vals.extend(vals)
                all_rows.extend([i] * len(cols))
        all_rows = np.array(all_rows, dtype=np.int64)
        all_cols = np.array(all_cols, dtype=np.int64)
        all_vals = np.array(all_vals, dtype=np.float32)
        return coo_matrix((all_vals, (all_rows, all_cols)), shape = (num_items, num_items))

    # Validation inputs do not depend on the hyper-parameters - slice them once
    val_users = list(val_dict.keys())
    X_val_users = X_val[val_users,:]

    # Values for grid-search
    optimal_model_SLIM = None
    best_values = None
    best_NDCG = -np.inf
    l1_values = [0, 2.5, 5.0, 10.0, 20]
    l2_values = [0, 5.0, 10.0, 20, 50, 100]
    al_values = [.5, 1.0, 2.5, 5.0, 10.0] if side_info else [1.0]
    for l1r, l2r, alpha in itertools.product(l1_values, l2_values, al_values):
        print('L1: {0}\tL2: {1}\tAlpha: {2}'.format(l1r,l2r, alpha))
        # Set up parameters
        params = {'algo':'cd', 'nthreads':16, 'l1r':l1r, 'l2r':l2r}

        # Build training matrix
        trainmat = X_train
        if side_info:
            trainmat = vstack((trainmat, alpha * X_meta))
        trainmat = SLIMatrix(trainmat)

        # Train model
        model = SLIM()
        model.train(params, trainmat)
        print('Converting out of SLIM format...')
        # To CSR works, but densifying it crashes sometimes? Very strange
        # Work-around by writing to disk and reading in
        model.save_model(modelfname='slim_model.csr', mapfname='slim_map.csr')
        S_SLIM = read_csr('slim_model.csr')
        print('... done!')
        S_SLIM = S_SLIM.todense()

        # Evaluate on validation data
        print('Evaluating...')
        val_scores = X_val_users @ S_SLIM - 987654321 * X_val_users

        # Evaluate and pretty print
        NDCG = util.evaluate(X_val, val_scores, val_dict)[1][100]

        print('\tNDCG@100:\t{0}'.format(NDCG))
        # '>=' keeps the latest configuration on ties; a NaN score is never selected
        if NDCG >= best_NDCG:
            best_NDCG = NDCG
            optimal_model_SLIM = S_SLIM
            best_values = (l1r, l2r, alpha)

    print('Best grid-search values:', best_values)

    if optimal_model_SLIM is None:
        raise RuntimeError('SLIM grid-search did not yield a model with a valid NDCG@100')

    # Compute prediction scores for all test users - subtract already seen items
    test_users = list(test_dict.keys())
    X_test_users = X_test[test_users,:]
    test_scores = X_test_users @ optimal_model_SLIM - 987654321 * X_test_users

    # Evaluate and pretty print
    results_SLIM = util.evaluate(X_test, test_scores, test_dict)
    return results_SLIM
516 | 


--------------------------------------------------------------------------------