├── __init__.py ├── lazy ├── __init__.py ├── lazy.py ├── utils.py ├── pop.py ├── markov_model.py └── user_knn.py ├── helpers ├── __init__.py ├── sparse_layer.py ├── early_stopping.py ├── data_handling.py ├── evaluation.py └── command_parser.py ├── word2vec ├── __init__.py └── ltm.py ├── factorization ├── __init__.py ├── bprmf.py ├── fism.py ├── fossil.py ├── fpmc.py └── mf_base.py ├── neural_networks ├── __init__.py ├── target_selection.py ├── update_manager.py ├── sequence_noise.py ├── recurrent_layers.py ├── rnn_one_hot.py ├── stacked_denoising_autoencoder.py ├── rnn_margin.py ├── rnn_sampling.py ├── fism_cluster.py └── rnn_cluster.py ├── requirements.txt ├── Dockerfile ├── LICENSE ├── train.py ├── test.py ├── preprocess.py └── README.md /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lazy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /word2vec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /factorization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /neural_networks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Theano>=0.8.2 2 | Gensim==0.13.1 3 | pandas==0.19.2 4 | -------------------------------------------------------------------------------- /helpers/sparse_layer.py: -------------------------------------------------------------------------------- 1 | import lasagne 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | class SparseLayer(lasagne.layers.DenseLayer): 7 | 8 | def __init__(self, incoming, **kwargs): 9 | super(SparseLayer, self).__init__(incoming, **kwargs) 10 | 11 | def get_output_for(self, input, **kwargs): 12 | 13 | activation = theano.sparse.structured_dot(input, self.W) 14 | if self.b is not None: 15 | activation = activation + self.b 16 | return self.nonlinearity(activation) -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | from ubuntu:16.04 2 | 3 | run apt-get -yqq update 4 | run apt-get install -yqq python-dev python-pip python-nose g++ libopenblas-dev python-numpy python-scipy 5 | 6 | add . 
/root/sequence-based-recommendations 7 | workdir /root/sequence-based-recommendations 8 | 9 | run pip install --upgrade pip 10 | run pip install -r requirements.txt 11 | 12 | 13 | run pip install -r https://raw.githubusercontent.com/Lasagne/Lasagne/master/requirements.txt && \ 14 | pip install https://github.com/Lasagne/Lasagne/archive/master.zip -------------------------------------------------------------------------------- /lazy/lazy.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | 6 | class Lazy(object): 7 | """Base for Lazy object. 8 | """ 9 | def __init__(self): 10 | super(Lazy, self).__init__() 11 | 12 | self.name = "Lazy base" 13 | 14 | def prepare_model(self, dataset): 15 | '''Must be called before using top_k_recommendations 16 | ''' 17 | raise NotImplementedError 18 | 19 | 20 | def load(self, *args, **kwargs): 21 | '''Nothing to do here 22 | ''' 23 | return None 24 | 25 | def top_k_recommendations(self, sequence, k=10, **kwargs): 26 | ''' Receives a sequence of (id, rating), and produces k recommendations (as a list of ids) 27 | ''' 28 | raise NotImplementedError -------------------------------------------------------------------------------- /lazy/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import scipy.sparse as ssp 4 | 5 | def top_k(values, k, exclude=[]): 6 | ''' Return the indices of the k items with the highest value in the list of values. 7 | Exclude the ids from the list "exclude". 8 | ''' 9 | 10 | # Put low similarity to viewed items to exclude them from recommendations 11 | values[exclude] = -np.inf 12 | 13 | return list(np.argpartition(-values, range(k))[:k]) 14 | 15 | def get_sparse_vector(ids, length, values=None): 16 | '''Converts a list of ids into a sparse vector of length "length" where the elements corresponding to the ids are given the values in "values". 17 | If "values" is None, the elements are set to 1. 18 | ''' 19 | n = len(ids) 20 | 21 | if values is None: 22 | return ssp.coo_matrix((np.ones(n), (ids,np.zeros(n))), (length, 1)).tocsc() 23 | else: 24 | return ssp.coo_matrix((values, (ids,np.zeros(n))), (length, 1)).tocsc() -------------------------------------------------------------------------------- /lazy/pop.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import collections 3 | import numpy as np 4 | import scipy.sparse as ssp 5 | from copy import deepcopy 6 | import os 7 | from .lazy import Lazy 8 | from .utils import top_k, get_sparse_vector 9 | 10 | 11 | class Pop(Lazy): 12 | """ 13 | """ 14 | def __init__(self, **kwargs): 15 | super(Pop, self).__init__(**kwargs) 16 | self.name = "Pop" 17 | 18 | def _get_model_filename(self, *args): 19 | return "pop" 20 | 21 | def prepare_model(self, dataset): 22 | '''Count the number of occurrences of each item in the training set, to be used as popularity scores.
23 | ''' 24 | 25 | self._items_pop = np.zeros(dataset.n_items) 26 | for triplet in dataset.training_set_triplets(): 27 | self._items_pop[triplet['item_id']] += 1 28 | 29 | def top_k_recommendations(self, sequence, k=10, exclude=None, **kwargs): 30 | 31 | if exclude is None: 32 | exclude = [] 33 | 34 | items_pop = deepcopy(self._items_pop) 35 | 36 | items_pop[exclude] = -np.inf 37 | items_pop[[i[0] for i in sequence]] = -np.inf 38 | 39 | return list(np.argpartition(-items_pop, range(k))[:k]) 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Robin Devooght 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /lazy/markov_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import collections 3 | import numpy as np 4 | import scipy.sparse as ssp 5 | from copy import deepcopy 6 | from .lazy import Lazy 7 | from .utils import top_k, get_sparse_vector 8 | 9 | 10 | class MarkovModel(Lazy): 11 | """ 12 | """ 13 | def __init__(self, **kwargs): 14 | super(MarkovModel, self).__init__(**kwargs) 15 | 16 | self.previous_recommendations = dict() 17 | 18 | self.name = "MarkovModel" 19 | 20 | def _get_model_filename(self, *args): 21 | return "MM" 22 | 23 | def prepare_model(self, dataset): 24 | '''Load the data from the training file into a format adapted for the MM predictions. 
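For intuition, a minimal standalone sketch of the same first-order transition counting, independent of this class (plain dicts and collections.Counter; the toy sequences are made up):
    import collections
    sequences = [[1, 2, 3], [1, 2, 4], [5, 2, 3]]            # toy training sequences of item ids
    # one {item: next_item} dict per sequence, as built below
    transitions = [{s[i]: s[i + 1] for i in range(len(s) - 1)} for s in sequences]
    # items observed right after item 2, with their counts
    successors = collections.Counter(t[2] for t in transitions if 2 in t)
    print(successors.most_common(2))                          # [(3, 2), (4, 1)]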
25 | ''' 26 | self.n_items = dataset.n_items 27 | 28 | self.sequences = [] 29 | 30 | with open(dataset.training_set.filename, 'r') as f: 31 | for sequence in f: 32 | sequence = sequence.split() 33 | items = map(int, sequence[1::2]) 34 | s = dict() 35 | for i in range(len(items)-1): 36 | s[items[i]] = items[i+1] 37 | self.sequences.append(s) 38 | 39 | def get_all_recommendations(self, item): 40 | all_recommendations = [] 41 | for s in self.sequences: 42 | if item in s: 43 | all_recommendations.append(s[item]) 44 | all_recommendations = collections.Counter(all_recommendations) 45 | del all_recommendations[None] 46 | self.previous_recommendations[item] = all_recommendations 47 | 48 | 49 | def top_k_recommendations(self, sequence, k=10, exclude=None, **kwargs): 50 | if exclude is None: 51 | exclude = [] 52 | 53 | last_item = int(sequence[-1][0]) 54 | if last_item not in self.previous_recommendations: 55 | self.get_all_recommendations(last_item) 56 | 57 | all_recommendations = deepcopy(self.previous_recommendations[last_item]) 58 | for s in sequence: 59 | all_recommendations[int(s[0])] = 0 60 | for i in exclude: 61 | all_recommendations[i] = 0 62 | 63 | ranking = np.zeros(self.n_items) 64 | for i, x in enumerate(all_recommendations.most_common(k)): 65 | ranking[x[0]] = k-i 66 | return np.argpartition(-ranking, range(k))[:k] 67 | -------------------------------------------------------------------------------- /neural_networks/target_selection.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | import random 4 | 5 | def target_selection_command_parser(parser): 6 | parser.add_argument('--n_targets', help='Number of targets (Only for RNN with hinge, logit or logsig loss).', default=1, type=int) 7 | parser.add_argument('--shuffle_targets', help='Instead of picking the next items in the sequence as the target(s), the targets are picked randomly in the remaining sequence.', action='store_true') 8 | parser.add_argument('--rand_test_target', help='Use the exact same procedure for target selection during training and testing. Otherwise shuffling and bias are used only during training.', action='store_true') 9 | parser.add_argument('--target_bias', help='Popular item are picked as item with a lower probability. Targets are skipped with a probability proportional to (number_of_views)^bias. Set negative bias to avoid this procedure.', default=-1., type=float) 10 | 11 | def get_target_selection(args): 12 | return SelectTargets(n_targets=args.n_targets, shuffle=args.shuffle_targets, bias=args.target_bias, determinist_test=(not args.rand_test_target)) 13 | 14 | 15 | class SelectTargets(object): 16 | def __init__(self, n_targets=1, shuffle=False, bias=-1, determinist_test=True): 17 | super(SelectTargets, self).__init__() 18 | self.n_targets = n_targets 19 | self.shuffle = shuffle 20 | self.bias = bias 21 | self.determinist_test = determinist_test 22 | 23 | @property 24 | def name(self): 25 | 26 | name = "nt"+str(self.n_targets) 27 | 28 | if self.bias >= 0.: 29 | name += '_tb'+str(self.bias) 30 | if self.shuffle: 31 | name += "_shufT" 32 | return name 33 | 34 | 35 | def set_dataset(self, dataset): 36 | 37 | if self.bias >= 0.: 38 | pop = np.maximum(1, dataset.item_popularity) 39 | self.keep_prob = np.power(min(pop) / pop, self.bias) 40 | 41 | def __call__(self, remaining_sequence, test=False): 42 | ''' Receives the sequence of item that are not read by the RNN and chooses the target(s) among them. 
43 | the test parameter indicates whether this is the training or the testing phase. 44 | If test is True and self.determinist_test is True, no shuffle nor bias is performed 45 | ''' 46 | 47 | if not (test and self.determinist_test): 48 | if self.shuffle: 49 | random.shuffle(remaining_sequence) 50 | if self.bias >= 0.: 51 | remaining_sequence = [i for i in remaining_sequence if (np.random.random() <= self.keep_prob[i[0]])] 52 | 53 | return remaining_sequence[:min(len(remaining_sequence), self.n_targets)] -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import time 7 | import lasagne 8 | import random 9 | import helpers.command_parser as parse 10 | from helpers.data_handling import DataHandler 11 | 12 | def training_command_parser(parser): 13 | parser.add_argument('--tshuffle', help='Shuffle sequences during training.', action='store_true') 14 | 15 | parser.add_argument('--extended_set', help='Use extended training set (contains first half of validation and test set).', action='store_true') 16 | 17 | parser.add_argument('-d', dest='dataset', help='Directory name of the dataset.', default='', type=str) 18 | parser.add_argument('--dir', help='Directory name to save model.', default='', type=str) 19 | parser.add_argument('--save', choices=['All', 'Best', 'None'], help='Policy for saving models.', default='Best') 20 | parser.add_argument('--metrics', help='Metrics for validation, comma separated', default='sps', type=str) 21 | parser.add_argument('--time_based_progress', help='Follow progress based on time rather than iterations.', action='store_true') 22 | parser.add_argument('--load_last_model', help='Load Last model before starting training.', action='store_true') 23 | parser.add_argument('--progress', help='Progress intervals', default='2.', type=str) 24 | parser.add_argument('--mpi', help='Max progress intervals', default=np.inf, type=float) 25 | parser.add_argument('--max_iter', help='Max number of iterations', default=np.inf, type=float) 26 | parser.add_argument('--max_time', help='Max training time in seconds', default=np.inf, type=float) 27 | parser.add_argument('--min_iter', help='Min number of iterations before showing progress', default=0., type=float) 28 | 29 | def num(s): 30 | try: 31 | return int(s) 32 | except ValueError: 33 | return float(s) 34 | 35 | def main(): 36 | 37 | 38 | args = parse.command_parser(parse.predictor_command_parser, training_command_parser, parse.early_stopping_command_parser) 39 | 40 | predictor = parse.get_predictor(args) 41 | 42 | 43 | dataset = DataHandler(dirname=args.dataset, extended_training_set=args.extended_set, shuffle_training=args.tshuffle) 44 | 45 | predictor.prepare_model(dataset) 46 | predictor.train(dataset, 47 | save_dir=dataset.dirname + "models/" + args.dir, 48 | time_based_progress=args.time_based_progress, 49 | progress=num(args.progress), 50 | autosave=args.save, 51 | max_progress_interval=args.mpi, 52 | max_iter = args.max_iter, 53 | min_iterations=args.min_iter, 54 | max_time=args.max_time, 55 | early_stopping=parse.get_early_stopper(args), 56 | load_last_model=args.load_last_model, 57 | validation_metrics=args.metrics.split(',')) 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /lazy/user_knn.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import scipy.sparse as ssp 5 | import os.path 6 | from .lazy import Lazy 7 | from .utils import top_k, get_sparse_vector 8 | 9 | 10 | class UserKNN(Lazy): 11 | """ 12 | """ 13 | def __init__(self, similarity_measure='cosine', neighborhood_size=80, **kwargs): 14 | super(UserKNN, self).__init__(**kwargs) 15 | 16 | self.similarity_measure = similarity_measure 17 | self.neighborhood_size = neighborhood_size 18 | 19 | self.name = "UserKNN" 20 | 21 | def _get_model_filename(self, *args): 22 | return "UKNN_ns"+str(self.neighborhood_size)+"_"+self.similarity_measure 23 | 24 | def prepare_model(self, dataset): 25 | '''Load the data from the training file into a format adapted for the KNN methods. 26 | ''' 27 | filename = dataset.dirname + 'data/train_set_triplets' 28 | if os.path.isfile(filename + '.npy'): 29 | file_content = np.load(filename + '.npy') 30 | else: 31 | file_content = np.loadtxt(filename) 32 | np.save(filename, file_content) 33 | 34 | #self.user_item = ssp.coo_matrix((file_content[:,2], (file_content[:,0], file_content[:,1]))).tocsr() 35 | self.binary_user_item = ssp.coo_matrix((np.ones(file_content.shape[0]), (file_content[:,0], file_content[:,1]))).tocsr() 36 | 37 | del file_content 38 | 39 | self.n_items = self.binary_user_item.shape[1] 40 | self.n_users = self.binary_user_item.shape[0] 41 | 42 | def _items_count_per_user(self): 43 | if not hasattr(self, '__items_count_per_user'): 44 | self.__items_count_per_user = np.asarray(self.binary_user_item.sum(axis=1)).ravel() 45 | return self.__items_count_per_user 46 | 47 | def similarity_with_users(self, sequence): 48 | '''Compute the similarity of each user with the sequence recieved in parameter 49 | ''' 50 | sparse_sequence = get_sparse_vector([i[0] for i in sequence], self.n_items) 51 | overlap = self.binary_user_item.dot(sparse_sequence).toarray().ravel() 52 | overlap[overlap != 0] /= np.sqrt(self._items_count_per_user()[overlap != 0]) 53 | return overlap 54 | 55 | def top_k_recommendations(self, sequence, k=10, exclude=None, **kwargs): 56 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 57 | ''' 58 | if exclude is None: 59 | exclude = [] 60 | 61 | sim_with_users = self.similarity_with_users(sequence) 62 | nearest_neighbors = top_k(sim_with_users, self.neighborhood_size) 63 | sim_with_users = get_sparse_vector(nearest_neighbors, self.n_users, values=sim_with_users[nearest_neighbors]) 64 | sim_with_items = self.binary_user_item.T.dot(sim_with_users).toarray().ravel() 65 | 66 | sim_with_items[exclude] = -np.inf 67 | sim_with_items[[i[0] for i in sequence]] = -np.inf 68 | 69 | return list(np.argpartition(-sim_with_items, range(k))[:k]) -------------------------------------------------------------------------------- /helpers/early_stopping.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | def early_stopping_command_parser(parser): 5 | parser.add_argument('--es_m', dest='early_stopping_method', choices=['WorstTimesX', 'StopAfterN', 'None'], help='Early stopping method', default='None') 6 | parser.add_argument('--es_n', help='N parameter (for StopAfterN)', default=5, type=int) 7 | parser.add_argument('--es_x', help='X parameter (for WorstTimesX)', default=2., type=float) 8 | parser.add_argument('--es_min_wait', help='Mininum wait before 
stopping (for WorstTimesX)', default=1., type=float) 9 | parser.add_argument('--es_LiB', help='Lower is better for validation score.', action='store_true') 10 | 11 | def get_early_stopper(args): 12 | if args.early_stopping_method == 'StopAfterN': 13 | return StopAfterN(n = args.es_n, higher_is_better=(not args.es_LiB)) 14 | elif args.early_stopping_method == 'WorstTimesX': 15 | return WaitWorstCaseTimesX(x = args.es_x, min_wait=args.es_min_wait, higher_is_better=(not args.es_LiB)) 16 | else: 17 | return None 18 | 19 | class EarlyStopperBase(object): 20 | def __init__(self, higher_is_better=True): 21 | super(EarlyStopperBase, self).__init__() 22 | 23 | self.higher_is_better = higher_is_better 24 | 25 | def __call__(self, epochs, val_costs): 26 | 27 | if not self.higher_is_better: 28 | val_costs = [-i for i in val_costs] 29 | 30 | return self.decideStopping(epochs, val_costs) 31 | 32 | def decideStopping(self, epochs, val_costs): 33 | pass 34 | 35 | class StopAfterN(EarlyStopperBase): 36 | ''' Stops after N consecutively non improving cost 37 | ''' 38 | def __init__(self, n=3, **kwargs): 39 | super(StopAfterN, self).__init__(**kwargs) 40 | 41 | self.n = n 42 | 43 | def decideStopping(self, epochs, val_costs): 44 | 45 | if len(val_costs) <= self.n: 46 | return False 47 | 48 | for i in range(self.n): 49 | if val_costs[-1-i] > val_costs[-2-i]: 50 | return False 51 | 52 | return True 53 | 54 | 55 | class WaitWorstCaseTimesX(EarlyStopperBase): 56 | ''' Stops if the number of epochs since the best cost is X times larger than the maximum number of epochs between two consecutive best. 57 | ''' 58 | 59 | def __init__(self, x=2., min_wait=1., **kwargs): 60 | super(WaitWorstCaseTimesX, self).__init__(**kwargs) 61 | 62 | self.x = x 63 | self.min_wait = min_wait 64 | 65 | def decideStopping(self, epochs, val_costs): 66 | 67 | # find longest wait between two best scores 68 | last_best = val_costs[0] 69 | last_best_epoch = epochs[0] 70 | longest_wait = 0 71 | for epoch, cost in zip(epochs[1:], val_costs[1:]): 72 | if cost > last_best: 73 | wait = epoch - last_best_epoch 74 | last_best_epoch = epoch 75 | last_best = cost 76 | if wait > longest_wait: 77 | longest_wait = wait 78 | 79 | current_wait = epochs[-1] - last_best_epoch 80 | 81 | if longest_wait == 0: 82 | return current_wait > self.min_wait 83 | 84 | print('current wait : ', round(current_wait, 3), ' longest wait : ', round(longest_wait, 3), ' ratio : ', current_wait/longest_wait, ' / ', self.x) 85 | 86 | return current_wait > max(self.min_wait, longest_wait*self.x) -------------------------------------------------------------------------------- /neural_networks/update_manager.py: -------------------------------------------------------------------------------- 1 | import lasagne 2 | 3 | def update_manager_command_parser(parser): 4 | parser.add_argument('--u_m', dest='update_manager', choices=['adagrad', 'adadelta', 'rmsprop', 'nesterov', 'adam'], help='Update mechanism', default='adam') 5 | parser.add_argument('--u_l', help='Learning rate', default=0.001, type=float) 6 | parser.add_argument('--u_rho', help='rho parameter for Adadelta and RMSProp (momentum for Nesterov momentum)', default=0.9, type=float) 7 | parser.add_argument('--u_b1', help='Beta 1 parameter for Adam', default=0.9, type=float) 8 | parser.add_argument('--u_b2', help='Beta 2 parameter for Adam', default=0.999, type=float) 9 | 10 | def get_update_manager(args): 11 | if args.update_manager == 'adagrad': 12 | return Adagrad(learning_rate = args.u_l) 13 | elif args.update_manager == 
'adadelta': 14 | return Adadelta(learning_rate = args.u_l, rho = args.u_rho) 15 | elif args.update_manager == 'rmsprop': 16 | return RMSProp(learning_rate = args.u_l, rho = args.u_rho) 17 | elif args.update_manager == 'nesterov': 18 | return NesterovMomentum(learning_rate = args.u_l, momentum = args.u_rho) 19 | elif args.update_manager == 'adam': 20 | return Adam(learning_rate = args.u_l, beta1 = args.u_b1, beta2 = args.u_b2) 21 | else: 22 | raise ValueError('Unknown update option') 23 | 24 | class Adagrad(object): 25 | 26 | def __init__(self, learning_rate=0.1, **kwargs): 27 | super(Adagrad, self).__init__(**kwargs) 28 | 29 | self.learning_rate = learning_rate 30 | self.name = 'Ug_lr'+str(self.learning_rate) 31 | 32 | def __call__(self, cost, params): 33 | return lasagne.updates.adagrad(cost, params, self.learning_rate) 34 | 35 | class Adadelta(object): 36 | 37 | def __init__(self, learning_rate=1.0, rho=0.9, **kwargs): 38 | super(Adadelta, self).__init__(**kwargs) 39 | 40 | self.learning_rate = learning_rate 41 | self.rho = rho 42 | self.name = 'Ud_lr'+str(self.learning_rate)+'_rho'+str(self.rho) 43 | 44 | def __call__(self, cost, params): 45 | return lasagne.updates.adadelta(cost, params, self.learning_rate, rho=self.rho) 46 | 47 | class RMSProp(object): 48 | 49 | def __init__(self, learning_rate=1.0, rho=0.9, **kwargs): 50 | super(RMSProp, self).__init__(**kwargs) 51 | 52 | self.learning_rate = learning_rate 53 | self.rho = rho 54 | self.name = 'Ur_lr'+str(self.learning_rate)+'_rho'+str(self.rho) 55 | 56 | def __call__(self, cost, params): 57 | return lasagne.updates.rmsprop(cost, params, self.learning_rate, rho=self.rho) 58 | 59 | class NesterovMomentum(object): 60 | 61 | def __init__(self, learning_rate=1.0, momentum=0.9, **kwargs): 62 | super(NesterovMomentum, self).__init__(**kwargs) 63 | 64 | self.learning_rate = learning_rate 65 | self.momentum = momentum 66 | self.name = 'Un_lr'+str(self.learning_rate)+'_m'+str(self.momentum) 67 | 68 | def __call__(self, cost, params): 69 | return lasagne.updates.nesterov_momentum(cost, params, self.learning_rate, momentum=self.momentum) 70 | 71 | class Adam(object): 72 | 73 | def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, **kwargs): 74 | super(Adam, self).__init__(**kwargs) 75 | 76 | self.learning_rate = learning_rate 77 | self.beta1 = beta1 78 | self.beta2 = beta2 79 | self.name = 'Ua_lr'+str(self.learning_rate)+'_b1'+str(self.beta1)+'_b2'+str(self.beta2) 80 | 81 | def __call__(self, cost, params): 82 | return lasagne.updates.adam(cost, params, self.learning_rate, beta1=self.beta1, beta2=self.beta2) -------------------------------------------------------------------------------- /neural_networks/sequence_noise.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | def sequence_noise_command_parser(parser): 5 | parser.add_argument('--n_dropout', help='Dropout probability', default=0., type=float) 6 | parser.add_argument('--n_swap', help="Probability of swapping two consecutive items", default=0., type=float) 7 | parser.add_argument('--n_shuf', help="Probability of swapping two random items", default=0., type=float) 8 | parser.add_argument('--n_shuf_std', help="The distance between the two items to be swapped is drawn from a normal distribution whose std is defined by this parameter", default=5., type=float) 9 | parser.add_argument('--n_ratings', help='Probability of changing the rating.', default=0., type=float) 10 | 11 | def 
get_sequence_noise(args): 12 | return SequenceNoise(dropout=args.n_dropout, swap=args.n_swap, ratings_perturb=args.n_ratings, shuf=args.n_shuf, shuf_std=args.n_shuf_std) 13 | 14 | 15 | class SequenceNoise(object): 16 | def __init__(self, dropout=0., swap=0., ratings_perturb=0., shuf=0., shuf_std=0.): 17 | super(SequenceNoise, self).__init__() 18 | self.dropout = dropout 19 | self.swap = swap 20 | self.ratings_perturb = ratings_perturb 21 | self.shuf = shuf 22 | self.shuf_std = shuf_std 23 | 24 | self._check_param_validity() 25 | self._set_name() 26 | 27 | 28 | def _set_name(self): 29 | name = [] 30 | if self.dropout > 0: 31 | name.append("do"+str(self.dropout)) 32 | 33 | if self.swap > 0: 34 | name.append("sw"+str(self.swap)) 35 | 36 | if self.ratings_perturb > 0: 37 | name.append("rp"+str(self.ratings_perturb)) 38 | 39 | if self.shuf > 0: 40 | name.append("sh"+str(self.shuf)+"-"+str(self.shuf_std)) 41 | 42 | self.name = "_".join(name) 43 | 44 | def _check_param_validity(self): 45 | if self.dropout < 0. or self.dropout >= 1.: 46 | raise ValueError('Dropout should be in [0,1)') 47 | if self.swap < 0. or self.swap >= 1.: 48 | raise ValueError('Swapping probability should be in [0,1)') 49 | if self.ratings_perturb < 0. or self.ratings_perturb >= 1.: 50 | raise ValueError('Rating perturbation probability should be in [0,1)') 51 | 52 | def __call__(self, sequence_generator): 53 | """Recieves a generator of sequences in the form ([(item, rating), (item, rating), ...], user) and generates sequences in the same format, 54 | after potentially applying dropout, item swapping and ratings modifications. 55 | """ 56 | 57 | while True: 58 | 59 | sequence, user = next(sequence_generator) 60 | 61 | # Dropout 62 | if self.dropout > 0.: 63 | sequence = [i for i in sequence if (np.random.random() >= self.dropout)] 64 | if len(sequence) < 2: 65 | continue 66 | 67 | # Perturb the order 68 | if self.swap > 0.: 69 | i = 0 70 | while i < len(sequence) - 1: 71 | if np.random.random() < self.swap: 72 | tmp = sequence[i] 73 | sequence[i] = sequence[i+1] 74 | sequence[i+1] = tmp 75 | i+=1 # Don't allow to swap twice the same item 76 | i += 1 77 | 78 | # Shuffle 79 | if self.shuf > 0.: 80 | for i in range(len(sequence)): 81 | if np.random.random() < self.shuf: 82 | other_item = max(0, min(len(sequence)-1, int(np.random.randn()*self.shuf_std)+i)) 83 | sequence[i], sequence[other_item] = sequence[other_item], sequence[i] 84 | 85 | # Perturb ratings 86 | if self.ratings_perturb > 0: 87 | for i in range(len(sequence)): 88 | if np.random.random() < self.ratings_perturb: 89 | if np.random.random() < 0.5: 90 | sequence[i][1] = min(5, sequence[i][1] + 0.5) 91 | else: 92 | sequence[i][1] = max(1, sequence[i][1] - 0.5) 93 | 94 | yield sequence, user 95 | 96 | -------------------------------------------------------------------------------- /neural_networks/recurrent_layers.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | import lasagne 6 | from sparse_lstm import * 7 | 8 | def recurrent_layers_command_parser(parser): 9 | parser.add_argument('--r_t', dest='recurrent_layer_type', choices=['LSTM', 'GRU', 'Vanilla'], help='Type of recurrent layer', default='GRU') 10 | parser.add_argument('--r_l', help="Layers' size, (eg: 100-50-50)", default="50", type=str) 11 | parser.add_argument('--r_bi', help='Bidirectional layers.', action='store_true') 12 | parser.add_argument('--r_emb', 
help='Add an embedding layer before the RNN. Takes the size of the embedding as parameter, a size<1 means no embedding layer.', type=int, default=0) 13 | 14 | def get_recurrent_layers(args): 15 | return RecurrentLayers(layer_type=args.recurrent_layer_type, layers=map(int, args.r_l.split('-')), bidirectional=args.r_bi, embedding_size=args.r_emb) 16 | 17 | 18 | class RecurrentLayers(object): 19 | def __init__(self, layer_type="LSTM", layers=[32], bidirectional=False, embedding_size=0, grad_clipping=100): 20 | super(RecurrentLayers, self).__init__() 21 | self.layer_type = layer_type 22 | self.layers = layers 23 | self.bidirectional = bidirectional 24 | self.embedding_size = embedding_size 25 | self.grad_clip=grad_clipping 26 | self.set_name() 27 | 28 | def set_name(self): 29 | 30 | self.name = "" 31 | if self.bidirectional: 32 | self.name += "b"+self.layer_type+"_" 33 | elif self.layer_type != "LSTM": 34 | self.name += self.layer_type+"_" 35 | 36 | self.name += "gc"+str(self.grad_clip)+"_" 37 | if self.embedding_size > 0: 38 | self.name += "e"+str(self.embedding_size) 39 | self.name += "h"+('-'.join(map(str,self.layers))) 40 | 41 | 42 | def __call__(self, input_layer, mask_layer, true_input_size=None, only_return_final=True): 43 | 44 | if true_input_size is None and self.embedding_size > 0: 45 | raise ValueError('Embedding layer only works with sparse inputs') 46 | 47 | if self.embedding_size > 0: 48 | in_int32 = lasagne.layers.ExpressionLayer(input_layer, lambda x: x.astype('int32')) # change type of input 49 | l_emb = lasagne.layers.flatten(lasagne.layers.EmbeddingLayer(in_int32, input_size=true_input_size, output_size=self.embedding_size), outdim=3) 50 | l_rec = self.get_recurrent_layers(l_emb, mask_layer, true_input_size=None, only_return_final=only_return_final) 51 | else: 52 | l_rec = self.get_recurrent_layers(input_layer, mask_layer, true_input_size=true_input_size, only_return_final=only_return_final) 53 | 54 | return l_rec 55 | 56 | 57 | def get_recurrent_layers(self, input_layer, mask_layer, true_input_size=None, only_return_final=True): 58 | 59 | orf = False 60 | prev_layer = input_layer 61 | for i, h in enumerate(self.layers): 62 | if i == len(self.layers) - 1: 63 | orf = only_return_final 64 | prev_layer = self.get_one_layer(prev_layer, mask_layer, h, true_input_size, orf) 65 | 66 | true_input_size = None # Second layer is always densely encoded 67 | 68 | return prev_layer 69 | 70 | 71 | 72 | def get_one_layer(self, input_layer, mask_layer, n_hidden, true_input_size, only_return_final): 73 | if self.bidirectional: 74 | forward = self.get_unidirectional_layer(input_layer, mask_layer, n_hidden, true_input_size, only_return_final, backwards=False) 75 | backward = self.get_unidirectional_layer(input_layer, mask_layer, n_hidden, true_input_size, only_return_final, backwards=True) 76 | return lasagne.layers.ConcatLayer([forward, backward], axis = -1) 77 | else: 78 | return self.get_unidirectional_layer(input_layer, mask_layer, n_hidden, true_input_size, only_return_final, backwards=False) 79 | 80 | def get_unidirectional_layer(self, input_layer, mask_layer, n_hidden, true_input_size, only_return_final, backwards=False): 81 | if true_input_size is not None: 82 | if self.layer_type == "LSTM": 83 | layer = LSTMLayerOHEInput 84 | elif self.layer_type == "GRU": 85 | layer = GRULayerOHEInput 86 | elif self.layer_type == "Vanilla": 87 | layer = VanillaLayerOHEInput 88 | else: 89 | raise ValueError('Unknown layer type') 90 | 91 | return layer(input_layer, n_hidden, true_input_size, 
mask_input=mask_layer, grad_clipping=self.grad_clip, 92 | learn_init=True, only_return_final=only_return_final, backwards=backwards) 93 | else: 94 | if self.layer_type == "LSTM": 95 | layer = lasagne.layers.LSTMLayer 96 | elif self.layer_type == "GRU": 97 | layer = lasagne.layers.GRULayer 98 | elif self.layer_type == "Vanilla": 99 | layer = lasagne.layers.RecurrentLayer 100 | else: 101 | raise ValueError('Unknown layer type') 102 | 103 | return layer(input_layer, n_hidden, mask_input=mask_layer, grad_clipping=self.grad_clip, 104 | learn_init=True, only_return_final=only_return_final, backwards=backwards) 105 | -------------------------------------------------------------------------------- /neural_networks/rnn_one_hot.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import cPickle 8 | import random 9 | from time import time 10 | import rnn_base as rnn 11 | from sparse_lstm import * 12 | 13 | class RNNOneHot(rnn.RNNBase): 14 | """RNNOneHot are recurrent neural networks that do not depend on the factorization: they are based on one-hot encoding. 15 | 16 | The parameters specific to the RNNOneHot are: 17 | diversity_bias: a float in [0, inf) that tunes how the cost function of the network is biased towards less seen movies. 18 | In practice, the classification error given by the categorical cross-entropy is divided by exp(diversity_bias * popularity (on a scale from 1 to 10)). 19 | This will reduce the error associated to movies with a lot of views, putting therefore more importance on the ability of the network to correctly predict the rare movies. 20 | A diversity_bias of 0 produces the normal behavior, with no bias. 
21 | """ 22 | def __init__(self, diversity_bias=0.0, regularization=0.0, **kwargs): 23 | super(RNNOneHot, self).__init__(**kwargs) 24 | 25 | self.diversity_bias = np.cast[theano.config.floatX](diversity_bias) 26 | 27 | self.regularization = regularization 28 | 29 | self.name = "RNN with categorical cross entropy" 30 | 31 | def _get_model_filename(self, epochs): 32 | '''Return the name of the file to save the current model 33 | ''' 34 | filename = "rnn_cce_db"+str(self.diversity_bias)+"_r"+str(self.regularization)+"_"+self._common_filename(epochs) 35 | return filename 36 | 37 | def _prepare_networks(self, n_items): 38 | ''' Prepares the building blocks of the RNN, but does not compile them: 39 | ''' 40 | 41 | self.n_items = n_items 42 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 43 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length, self._input_size())) 44 | # The input is completed by a mask to inform the LSTM of the length of the sequence 45 | self.l_mask = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length)) 46 | 47 | # recurrent layer 48 | if not self.use_movies_features: 49 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=self.n_items + self._n_optional_features(), only_return_final=True) 50 | else: 51 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=None, only_return_final=True) 52 | 53 | # l_last_slice gets the last output of the recurrent layer 54 | l_last_slice = l_recurrent 55 | # l_last_slice = lasagne.layers.SliceLayer(l_recurrent, -1, 1) 56 | 57 | # Theano tensor for the targets 58 | target = T.ivector('target_output') 59 | target_popularity = T.fvector('target_popularity') 60 | self.exclude = T.fmatrix('excluded_items') 61 | self.theano_inputs = [self.l_in.input_var, self.l_mask.input_var, target, target_popularity, self.exclude] 62 | 63 | 64 | # The sliced output is then passed through linear layer to obtain the right output size 65 | self.l_out = lasagne.layers.DenseLayer(l_last_slice, num_units=self.n_items, nonlinearity=lasagne.nonlinearities.softmax) 66 | 67 | # lasagne.layers.get_output produces a variable for the output of the net 68 | network_output = lasagne.layers.get_output(self.l_out) 69 | 70 | # loss function 71 | self.cost = (T.nnet.categorical_crossentropy(network_output, target) / target_popularity).mean() 72 | 73 | if self.regularization > 0.: 74 | self.cost += self.regularization * lasagne.regularization.l2(self.l_out.b) 75 | # self.cost += self.regularization * lasagne.regularization.regularize_layer_params(self.l_out, lasagne.regularization.l2) 76 | elif self.regularization < 0.: 77 | self.cost -= self.regularization * lasagne.regularization.l1(self.l_out.b) 78 | # self.cost -= self.regularization * lasagne.regularization.regularize_layer_params(self.l_out, lasagne.regularization.l1) 79 | 80 | 81 | 82 | 83 | def _prepare_input(self, sequences): 84 | ''' Sequences is a list of [user_id, input_sequence, targets] 85 | ''' 86 | 87 | batch_size = len(sequences) 88 | 89 | # Shape return variables 90 | X = np.zeros((batch_size, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN 91 | mask = np.zeros((batch_size, self.max_length)) # mask of the input (to deal with sequences of different length) 92 | Y = np.zeros((batch_size,), dtype='int32') # output target 93 | pop = np.zeros((batch_size,)) # output target 94 | exclude = np.zeros((batch_size, self.n_items), 
dtype=theano.config.floatX) 95 | 96 | 97 | for i, sequence in enumerate(sequences): 98 | user_id, in_seq, target = sequence 99 | seq_features = np.array(map(lambda x: self._get_features(x, user_id), in_seq)) 100 | X[i, :len(in_seq), :] = seq_features # Copy sequences into X 101 | mask[i, :len(in_seq)] = 1 102 | Y[i] = target[0][0] # id of the first and only target 103 | pop[i] = self.dataset.item_popularity[target[0][0]] ** self.diversity_bias 104 | exclude[i, [j[0] for j in in_seq]] = 1 105 | 106 | return (X, mask.astype(theano.config.floatX), Y, pop.astype(theano.config.floatX), exclude) 107 | -------------------------------------------------------------------------------- /factorization/bprmf.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from mf_base import MFBase 12 | 13 | class BPRMF(MFBase): 14 | ''' Implementation of the algorithm presented in "BPR: Bayesian personalized ranking from implicit feedback", by Rendle S. et al., 2009. 15 | 16 | The adaptive sampling algorithm is adapted from "Improving pairwise learning for item recommendation from implicit feedback", by Rendle S. et al., 2014 17 | ''' 18 | 19 | def __init__(self, k = 32, adaptive_sampling=True, sampling_bias=500, **kwargs): 20 | 21 | super(BPRMF, self).__init__(**kwargs) 22 | 23 | self.name = 'BPRMF' 24 | self.k = k 25 | self.adaptive_sampling = adaptive_sampling 26 | self.sampling_bias = sampling_bias # lambda parameter in "Improving pairwise learning for item recommendation from implicit feedback", by Rendle S. et al., 2014 27 | 28 | def _get_model_filename(self, epochs): 29 | '''Return the name of the file to save the current model 30 | ''' 31 | filename = "bprmf_ne"+str(epochs)+"_lr"+str(self.init_learning_rate)+"_an"+str(self.annealing_rate)+"_k"+str(self.k)+"_reg"+str(self.reg)+"_ini"+str(self.init_sigma) 32 | if self.adaptive_sampling: 33 | filename += "_as"+str(self.sampling_bias) 34 | return filename+".npz" 35 | 36 | def init_model(self): 37 | ''' Initialize the model parameters 38 | ''' 39 | self.V = self.init_sigma * np.random.randn(self.n_users, self.k).astype(np.float32) 40 | self.H = self.init_sigma * np.random.randn(self.n_items, self.k).astype(np.float32) 41 | self.bias = np.zeros(self.n_items).astype(np.float32) 42 | 43 | def sgd_step(self, user, true_item, false_item): 44 | ''' Make one SGD update, given that the interaction between user and true_item exists, 45 | but the one between user and false_item does not. 46 | user, true_item and false_item are all user or item ids. 
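For intuition, a standalone sketch of the error term computed below: delta equals the logistic sigmoid of (x_false - x_true), so the step is small when the pair is already ranked correctly and close to 1 when it is inverted (the scores are made up):
    import math
    def bpr_delta(x_true, x_false):
        d = min(10, max(-10, x_false - x_true))  # same clipping as in the code below
        return 1 - 1 / (1 + math.exp(d))
    print(bpr_delta(2.0, -1.0))                  # ~0.047: correct order, small step
    print(bpr_delta(0.0, 0.0))                   # 0.5: undecided pair
    print(bpr_delta(-1.0, 2.0))                  # ~0.953: wrong order, large step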
47 | 48 | return error 49 | ''' 50 | 51 | # Compute error 52 | x_true = self.bias[true_item] + np.dot(self.V[user, :], self.H[true_item, :]) 53 | x_false = self.bias[false_item] + np.dot(self.V[user, :], self.H[false_item, :]) 54 | delta = 1 - 1 / (1 + math.exp(min(10, max(-10, x_false - x_true)))) # Bound x_true - x_false in [-10, 10] to avoid overflow 55 | 56 | # Update CF 57 | V_mem = self.V[user, :] 58 | self.V[user, :] += self.learning_rate * ( delta * (self.H[true_item, :] - self.H[false_item, :]) - self.reg * self.V[user, :]) 59 | self.H[true_item, :] += self.learning_rate * ( delta * V_mem - self.reg * self.H[true_item, :]) 60 | self.H[false_item, :] += self.learning_rate * ( -delta * V_mem - self.reg / 10 * self.H[false_item, :]) 61 | self.bias[true_item] += self.learning_rate * (delta - self.reg * self.bias[true_item]) 62 | self.bias[false_item] += self.learning_rate * (- delta - self.reg * self.bias[false_item]) 63 | 64 | return delta 65 | 66 | def compute_factor_rankings(self): 67 | '''Rank items according to each factor in order to do adaptive sampling 68 | ''' 69 | 70 | self.ranks = np.argsort(self.H, axis=0) 71 | self.var = np.var(self.H, axis=0) 72 | 73 | def get_training_sample(self): 74 | '''Pick a random triplet from self.triplets and a random false next item. 75 | returns a tuple of ids : (user, true_item, false_item) 76 | ''' 77 | 78 | user_id = random.randrange(self.n_users) 79 | while self.users[user_id,1] < 2: 80 | user_id = random.randrange(self.n_users) 81 | user_items = self.items[self.users[user_id,0]:self.users[user_id,0]+self.users[user_id,1]] 82 | true_item = random.choice(user_items) 83 | if self.adaptive_sampling: 84 | while True: 85 | rank = np.random.exponential(scale=self.sampling_bias) 86 | while rank >= self.n_items: 87 | rank = np.random.exponential(scale=self.sampling_bias) 88 | factor_signs = np.sign(self.V[user_id, :]) 89 | factor_prob = np.abs(self.V[user_id, :]) * self.var 90 | f = np.random.choice(self.k, p=factor_prob/sum(factor_prob)) 91 | false_item = self.ranks[int(rank) * factor_signs[f],f] 92 | if false_item not in user_items: 93 | break 94 | else: 95 | false_item = random.randrange(self.n_items) 96 | while false_item in user_items: 97 | false_item = random.randrange(self.n_items) 98 | 99 | return (user_id, true_item, false_item) 100 | 101 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 102 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 103 | ''' 104 | 105 | if exclude is None: 106 | exclude = [] 107 | 108 | last_item = sequence[-1][0] 109 | output = self.bias + np.dot(self.V[user_id, :], self.H.T) 110 | 111 | # Put low similarity to viewed items to exclude them from recommendations 112 | output[[i[0] for i in sequence]] = -np.inf 113 | output[exclude] = -np.inf 114 | 115 | # find top k according to output 116 | return list(np.argpartition(-output, range(k))[:k]) 117 | 118 | def training_step(self, iterations): 119 | if self.adaptive_sampling and iterations%int(self.n_items * np.log(self.n_items)) == 0: 120 | self.compute_factor_rankings() 121 | 122 | # Train with a new batch 123 | return self.sgd_step(*self.get_training_sample()) 124 | 125 | def save(self, filename): 126 | '''Save the parameters of a network into a file 127 | ''' 128 | print('Save model in ' + filename) 129 | if not os.path.exists(os.path.dirname(filename)): 130 | os.makedirs(os.path.dirname(filename)) 131 | np.savez(filename, V=self.V, H=self.H, bias=self.bias) 132 | 133 | 134 | def load(self, 
filename): 135 | '''Load parameters values form a file 136 | ''' 137 | f = np.load(filename) 138 | self.V = f['V'] 139 | self.H = f['H'] 140 | self.bias = f['bias'] -------------------------------------------------------------------------------- /neural_networks/stacked_denoising_autoencoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import cPickle 8 | import random 9 | import re 10 | import glob 11 | from time import time 12 | from .rnn_base import RNNBase 13 | 14 | def log_softmax(x): 15 | xdev = x - x.max(1, keepdims=True) 16 | return xdev - T.log(T.sum(T.exp(xdev), axis=1, keepdims=True)) 17 | 18 | def categorical_crossentropy_logdomain(log_predictions, targets): 19 | return -T.sum(targets * log_predictions, axis=1) 20 | 21 | class StackedDenoisingAutoencoder(RNNBase): 22 | """Base for Feed forward neural networks object. 23 | """ 24 | def __init__(self, layers=[20], input_dropout=0.2, dropout=0.5, **kwargs): 25 | super(StackedDenoisingAutoencoder, self).__init__(**kwargs) 26 | 27 | self.layers = layers 28 | self.input_dropout = input_dropout 29 | self.dropout = dropout 30 | 31 | self.name = "Stacked Denoising Autoencoder" 32 | 33 | 34 | def _get_model_filename(self, epochs): 35 | '''Return the name of the file to save the current model 36 | ''' 37 | filename = "sda_bs"+str(self.batch_size)+"_ne"+str(epochs) 38 | filename += "_h"+('-'.join(map(str,self.layers))) 39 | filename += "_" + self.updater.name 40 | if not self.use_ratings_features: 41 | filename += "_nf" 42 | if self.use_ratings_features: 43 | filename += "_rf" 44 | return filename 45 | 46 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None, **kwargs): 47 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 48 | ''' 49 | 50 | # Compile network if needed 51 | if not hasattr(self, 'predict_function'): 52 | self._compile_predict_function() 53 | 54 | # Prepare RNN input 55 | X = np.zeros((1, self._input_size())) # input of the RNN 56 | X[0, :] = self._one_hot_encoding([i[0] for i in sequence]) 57 | 58 | # Run RNN 59 | output = self.predict_function(X.astype(theano.config.floatX))[0] 60 | 61 | # Put low similarity to viewed items to exclude them from recommendations 62 | output[[i[0] for i in sequence]] = -np.inf 63 | output[exclude] = -np.inf 64 | 65 | # find top k according to output 66 | return list(np.argpartition(-output, range(k))[:k]) 67 | 68 | def _prepare_networks(self, n_items): 69 | ''' Prepares the building blocks of the RNN, but does not compile them: 70 | self.l_in : input layer 71 | self.target : target of the network 72 | self.l_out : output of the network 73 | self.cost : cost function 74 | ''' 75 | 76 | self.n_items = n_items 77 | 78 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 79 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self._input_size())) 80 | # hidden_layer = lasagne.layers.dropout(self.l_in, p=self.input_dropout) 81 | hidden_layer = self.l_in 82 | 83 | # Build hidden layers 84 | for l in self.layers: 85 | hidden_layer = lasagne.layers.DenseLayer(hidden_layer, num_units=l) 86 | if self.dropout: 87 | hidden_layer = lasagne.layers.dropout(hidden_layer, p=self.dropout) 88 | 89 | # The sliced output is then passed through linear layer to obtain the right output size 90 | self.l_out = 
lasagne.layers.DenseLayer(hidden_layer, num_units=self.n_items, nonlinearity=lasagne.nonlinearities.sigmoid) 91 | 92 | # lasagne.layers.get_output produces a variable for the output of the net 93 | network_output = lasagne.layers.get_output(self.l_out) 94 | 95 | # loss function 96 | self.targets = T.fmatrix('multiple_target_output') 97 | self.theano_inputs = [self.l_in.input_var, self.targets] 98 | 99 | self.cost = T.sqr(network_output - self.targets).mean() 100 | 101 | def _compile_predict_function(self): 102 | ''' Compile self.predict, the deterministic rnn that output the prediction at the end of the sequence 103 | ''' 104 | print("Compiling...") 105 | deterministic_output = lasagne.layers.get_output(self.l_out, deterministic=True) 106 | self.predict_function = theano.function([self.l_in.input_var], deterministic_output, allow_input_downcast=True) 107 | print("Compilation done.") 108 | 109 | def _compile_test_function(self): 110 | ''' Compile self.test_function, the deterministic rnn that output the precision@10 111 | ''' 112 | print("Compiling test...") 113 | deterministic_output = lasagne.layers.get_output(self.l_out, deterministic=True) 114 | if self.interactions_are_unique: 115 | deterministic_output *= (1 - self.l_in.input_var) 116 | theano_test_function = theano.function(self.theano_inputs, deterministic_output, allow_input_downcast=True, name="Test_function", on_unused_input='ignore') 117 | 118 | def test_function(theano_inputs, k=10): 119 | output = theano_test_function(*theano_inputs) 120 | ids = np.argpartition(-output, range(k), axis=-1)[0, :k] 121 | 122 | return ids 123 | 124 | self.test_function = test_function 125 | 126 | def _gen_mini_batch(self, sequence_generator, test=False, **kwargs): 127 | ''' Takes a sequence generator and produce a mini batch generator. 128 | The mini batch have a size defined by self.batch_size, and have format of the input layer of the rnn. 129 | 130 | Assuming that the length of the sequence is bigger than the size of the batch, each batch is created based on one sequence. 
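For intuition, a minimal numpy sketch of the denoising setup used for one training example (one-hot input with items randomly dropped, full one-hot target; the item ids, sizes and dropout rate are toy values):
    import numpy as np
    n_items, input_dropout = 6, 0.2
    user_items = [0, 2, 5]                       # items of one training sequence
    target = np.zeros(n_items)
    target[user_items] = 1
    kept = [i for i in user_items if np.random.random() >= input_dropout]
    corrupted = np.zeros(n_items)
    corrupted[kept] = 1                          # the network must reconstruct the dropped items
    print(corrupted, target)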
131 | ''' 132 | 133 | while True: 134 | 135 | # Shape return variables 136 | X = np.zeros((self.batch_size, self._input_size())) # input of the RNN 137 | Y = np.zeros((self.batch_size, self._input_size())) # Target of the RNN 138 | 139 | for j in range(self.batch_size): 140 | 141 | sequence, user_id = next(sequence_generator) 142 | if not test: 143 | X[j,:] = self._one_hot_encoding([i[0] for i in sequence if (np.random.random() >= self.input_dropout)]) 144 | Y[j, :] = self._one_hot_encoding([i[0] for i in sequence]) 145 | yield (X.astype(theano.config.floatX),Y.astype(theano.config.floatX)) 146 | else: 147 | X[j, :] = self._one_hot_encoding([i[0] for i in sequence[:len(sequence)/2]]) 148 | Y[j, :] = self._one_hot_encoding(sequence[len(sequence)/2][0]) 149 | yield (X.astype(theano.config.floatX),Y.astype(theano.config.floatX)), [i[0] for i in sequence[len(sequence)/2:]] 150 | 151 | def _one_hot_encoding(self, ids): 152 | ohe = np.zeros(self._input_size()) 153 | ohe[ids] = 1 154 | return ohe 155 | 156 | def _input_size(self): 157 | ''' Returns the number of input neurons 158 | ''' 159 | return self.n_items 160 | 161 | -------------------------------------------------------------------------------- /factorization/fism.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from mf_base import MFBase 12 | 13 | class FISM(MFBase): 14 | ''' Implementation of the algorithm presented in "FISM : Factored Item Similarity Models for Top-N Recommender Systems", by Santosh K.. et al., 2013. 15 | ''' 16 | 17 | def __init__(self, k = 100, alpha=0.5, loss="auc", **kwargs): 18 | 19 | super(FISM, self).__init__(**kwargs) 20 | 21 | self.name = 'FISM' 22 | self.k = k 23 | self.loss = loss 24 | if loss not in ['RMSE', 'BPR']: 25 | raise ValueError('Unknown loss for FISM: ', loss) 26 | self.alpha = alpha 27 | 28 | def _get_model_filename(self, epochs): 29 | '''Return the name of the file to save the current model 30 | ''' 31 | filename = "fism_" + self.loss + "_ne"+str(epochs)+"_lr"+str(self.init_learning_rate)+"_an"+str(self.annealing_rate)+"_k"+str(self.k)+"_reg"+str(self.reg)+"_ini"+str(self.init_sigma) 32 | 33 | return filename+".npz" 34 | 35 | def init_model(self): 36 | ''' Initialize the model parameters 37 | ''' 38 | self.V = self.init_sigma * np.random.randn(self.n_items, self.k).astype(np.float32) 39 | self.H = self.init_sigma * np.random.randn(self.n_items, self.k).astype(np.float32) 40 | self.bias = np.zeros(self.n_items).astype(np.float32) 41 | 42 | def item_score(self, user_items, item = None): 43 | ''' Compute the prediction score of the FISM model for the item "item", based on the list of items "user_items". 44 | ''' 45 | if item is not None: 46 | return self.bias[item] + np.power(len(user_items), -self.alpha) * np.dot(self.V[user_items, :].sum(axis=0), self.H[item, :]) 47 | else: 48 | return self.bias + np.power(len(user_items), -self.alpha) * np.dot(self.V[user_items, :].sum(axis=0), self.H.T) 49 | 50 | def auc_sgd_step(self, user_items, true_item, false_item): 51 | ''' Make one SGD update, given that the interaction between user and true_item exists, 52 | but the one between user and false_item does not. 53 | user, true_item and false_item are all user or item ids. 
54 | 55 | return error 56 | ''' 57 | 58 | # Compute error 59 | x_true = self.item_score(user_items, true_item) 60 | x_false = self.item_score(user_items, false_item) 61 | delta = 1 - 1 / (1 + math.exp(min(10, max(-10, x_false - x_true)))) # Original BPR error 62 | #delta = (x_true - x_false - 1) # error proposed in the FISM paper 63 | 64 | # Update CF 65 | V_sum = self.V[user_items, :].sum(axis=0) 66 | self.V[user_items, :] += self.learning_rate * ( delta * np.power(len(user_items), -self.alpha) * (self.H[true_item, :] - self.H[false_item, :]) - self.reg * self.V[user_items, :]) 67 | self.H[true_item, :] += self.learning_rate * ( delta * np.power(len(user_items), -self.alpha) * V_sum - self.reg * self.H[true_item, :]) 68 | self.H[false_item, :] += self.learning_rate * ( -delta * np.power(len(user_items), -self.alpha) * V_sum - self.reg * self.H[false_item, :]) 69 | self.bias[true_item] += self.learning_rate * (delta - self.reg * self.bias[true_item]) 70 | self.bias[false_item] += self.learning_rate * (- delta - self.reg * self.bias[false_item]) 71 | 72 | return delta 73 | 74 | def rmse_sgd_step(self, user_items, item, rating): 75 | ''' 76 | 77 | return error 78 | ''' 79 | 80 | # Compute error 81 | prediction = self.item_score(user_items, item) 82 | delta = (rating - prediction) # error proposed in the FISM paper 83 | 84 | print(delta) 85 | if delta != delta: 86 | raise ValueError('NaN') 87 | 88 | # print(prediction) 89 | # y = raw_input() 90 | 91 | # Update CF 92 | V_sum = self.V[user_items, :].sum(axis=0) 93 | 94 | self.V[user_items, :] += self.learning_rate * ( delta * np.power(len(user_items), -self.alpha) * self.H[item, :] - self.reg * self.V[user_items, :]) 95 | self.H[item, :] += self.learning_rate * ( delta * np.power(len(user_items), -self.alpha) * V_sum - self.reg * self.H[item, :]) 96 | self.bias[item] += self.learning_rate * ( delta - self.reg * self.bias[item]) 97 | 98 | return delta 99 | 100 | def get_auc_training_sample(self): 101 | '''Pick a random triplet from self.triplets and a random false next item. 
102 | returns a tuple of ids : (user_items, true_item, false_item) 103 | ''' 104 | 105 | user_id = random.randrange(self.n_users) 106 | while self.users[user_id,1] < 2: 107 | user_id = random.randrange(self.n_users) 108 | user_items = self.items[self.users[user_id,0]:self.users[user_id,0]+self.users[user_id,1]] 109 | 110 | true_item = random.choice(user_items) 111 | 112 | false_item = random.randrange(self.n_items) 113 | while false_item in user_items: 114 | false_item = random.randrange(self.n_items) 115 | 116 | return ([i for i in user_items if i is not true_item], true_item, false_item) 117 | 118 | def get_rmse_training_sample(self): 119 | 120 | neg_to_pos_ratio = 3 121 | user_items, true_item, false_item = self.get_auc_training_sample() 122 | 123 | if random.random() < 1 / (neg_to_pos_ratio + 1): 124 | return (user_items, true_item, 1) 125 | else: 126 | return (user_items, false_item, 0) 127 | 128 | 129 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 130 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 131 | ''' 132 | 133 | if exclude is None: 134 | exclude = [] 135 | 136 | user_items = [i[0] for i in sequence] 137 | output = self.item_score(user_items) 138 | 139 | # Put low similarity to viewed items to exclude them from recommendations 140 | output[[i[0] for i in sequence]] = -np.inf 141 | output[exclude] = -np.inf 142 | 143 | # find top k according to output 144 | return list(np.argpartition(-output, range(k))[:k]) 145 | 146 | def training_step(self, iterations): 147 | if self.loss == "BPR": 148 | return self.auc_sgd_step(*self.get_auc_training_sample()) 149 | else: 150 | return self.rmse_sgd_step(*self.get_rmse_training_sample()) 151 | 152 | def save(self, filename): 153 | '''Save the parameters of a network into a file 154 | ''' 155 | print('Save model in ' + filename) 156 | if not os.path.exists(os.path.dirname(filename)): 157 | os.makedirs(os.path.dirname(filename)) 158 | np.savez(filename, V=self.V, H=self.H, bias=self.bias) 159 | 160 | 161 | def load(self, filename): 162 | '''Load parameters values form a file 163 | ''' 164 | f = np.load(filename) 165 | self.V = f['V'] 166 | self.H = f['H'] 167 | self.bias = f['bias'] -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import time 8 | import random 9 | import argparse 10 | import re 11 | import glob 12 | import sys 13 | import os 14 | import copy 15 | # import matplotlib.pyplot as plt 16 | from helpers.data_handling import DataHandler 17 | from helpers import evaluation 18 | import helpers.command_parser as parse 19 | 20 | 21 | def get_file_name(predictor, args): 22 | return args.dir + re.sub('_ml'+str(args.max_length), '_ml'+str(args.training_max_length), predictor._get_model_filename(args.number_of_batches)) 23 | 24 | def find_models(predictor, dataset, args): 25 | if args.method == "UKNN" or args.method == "MM" or args.method == "POP": 26 | return None 27 | 28 | file = dataset.dirname + "models/" + get_file_name(predictor, args) 29 | print(file) 30 | if args.number_of_batches == "*": 31 | file = np.array(glob.glob(file)) 32 | 33 | return file 34 | 35 | def save_file_name(predictor, dataset, args): 36 | if not args.save: 37 | return None 38 | else: 39 | file = re.sub('_ne\*_', '_', 
dataset.dirname + 'results/' + get_file_name(predictor, args)) 40 | return file 41 | 42 | def run_tests(predictor, model_file, dataset, args, get_full_recommendation_list=False, k=10): 43 | # Load model 44 | predictor.load(model_file) 45 | #predictor.load_last(os.path.dirname(model_file) + '/') 46 | # Prepare evaluator 47 | evaluator = evaluation.Evaluator(dataset, k=k) 48 | 49 | if get_full_recommendation_list: 50 | k = dataset.n_items 51 | 52 | count = 0 53 | nb_of_dp = [] 54 | start = time.clock() 55 | for sequence, user_id in dataset.test_set(epochs=1): 56 | count += 1 57 | num_viewed = int(len(sequence) / 2) 58 | viewed = sequence[:num_viewed] 59 | goal = [i[0] for i in sequence[num_viewed:]] 60 | 61 | if args.clusters > 0: 62 | recommendations, n = predictor.top_k_recommendations(viewed, user_id=user_id, k=k) 63 | nb_of_dp.append(n) 64 | else: 65 | recommendations = predictor.top_k_recommendations(viewed, user_id=user_id, k=k) 66 | 67 | evaluator.add_instance(goal, recommendations) 68 | 69 | if len(goal) == 0: 70 | raise ValueError 71 | end = time.clock() 72 | print('Timer: ', end-start) 73 | if len(nb_of_dp) == 0: 74 | evaluator.nb_of_dp = dataset.n_items 75 | else: 76 | evaluator.nb_of_dp = np.mean(nb_of_dp) 77 | return evaluator 78 | 79 | def print_results(ev, metrics, plot=True, file=None, n_batches=None, print_full_rank_comparison=False): 80 | 81 | for m in metrics: 82 | if m not in ev.metrics: 83 | raise ValueError('Unkown metric: ' + m) 84 | 85 | print(m+'@'+str(ev.k)+': ', ev.metrics[m]()) 86 | 87 | if file != None: 88 | if not os.path.exists(os.path.dirname(file)): 89 | os.makedirs(os.path.dirname(file)) 90 | with open(file, "a") as f: 91 | f.write(str(n_batches)+"\t".join(map(str, [ev.metrics[m]() for m in metrics])) + "\n") 92 | if print_full_rank_comparison: 93 | with open(file+"_full_rank", "a") as f: 94 | for data in ev.get_rank_comparison(): 95 | f.write("\t".join(map(str, data)) + "\n") 96 | else: 97 | print("-\t" + "\t".join(map(str, [ev.metrics[m]() for m in metrics])), file=sys.stderr) 98 | if print_full_rank_comparison: 99 | with open(file+"_full_rank", "a") as f: 100 | for data in ev.get_rank_comparison(): 101 | f.write("\t".join(map(str, data)) + "\n") 102 | 103 | def extract_number_of_epochs(filename): 104 | m = re.search('_ne([0-9]+(\.[0-9]+)?)_', filename) 105 | return float(m.group(1)) 106 | 107 | def get_last_tested_batch(filename): 108 | '''If the output file exist already, it will look at the content of the file and return the last batch that was tested. 109 | This is used to avoid testing to times the same model. 110 | ''' 111 | 112 | if filename is not None and os.path.isfile(filename): 113 | with open(filename) as f: 114 | for line in f: 115 | pass 116 | return float(line.split()[0]) 117 | else: 118 | return 0 119 | 120 | def test_command_parser(parser): 121 | 122 | parser.add_argument('-d', dest='dataset', help='Directory name of the dataset.', default='', type=str) 123 | parser.add_argument('-i', dest='number_of_batches', help='Number of epochs, if not set it will compare all the available models', default=-1, type=int) 124 | parser.add_argument('-k', dest='nb_of_predictions', help='Number of predictions to make. 
It is the "k" in "prec@k", "rec@k", etc.', default=10, type=int) 125 | parser.add_argument('--metrics', help='List of metrics to compute, comma separated', default='sps,recall,item_coverage,user_coverage,blockbuster_share', type=str) 126 | parser.add_argument('--save', help='Save results to a file', action='store_true') 127 | parser.add_argument('--dir', help='Model directory.', default="", type=str) 128 | parser.add_argument('--save_rank', help='Save the full comparison of goal and prediction ranking.', action='store_true') 129 | 130 | def main(): 131 | 132 | args = parse.command_parser(parse.predictor_command_parser, test_command_parser) 133 | 134 | args.training_max_length = args.max_length 135 | # args.max_length = int(DATA_HANDLER.max_length/2) 136 | if args.number_of_batches == -1: 137 | args.number_of_batches = "*" 138 | 139 | dataset = DataHandler(dirname=args.dataset) 140 | predictor = parse.get_predictor(args) 141 | predictor.prepare_model(dataset) 142 | file = find_models(predictor, dataset, args) 143 | 144 | if args.number_of_batches == "*" and args.method != "UKNN" and args.method != "MM" and args.method != "POP": 145 | 146 | output_file = save_file_name(predictor, dataset, args) 147 | 148 | last_tested_batch = get_last_tested_batch(output_file) 149 | batches = np.array(map(extract_number_of_epochs, file)) 150 | sorted_ids = np.argsort(batches) 151 | batches = batches[sorted_ids] 152 | file = file[sorted_ids] 153 | for i, f in enumerate(file): 154 | if batches[i] > last_tested_batch: 155 | evaluator = run_tests(predictor, f, dataset, args, get_full_recommendation_list=args.save_rank, k=args.nb_of_predictions) 156 | print('-------------------') 157 | print('(',i+1 ,'/', len(file),') results on ' + f) 158 | print_results(evaluator, args.metrics.split(','), plot=False, file=output_file, n_batches=batches[i], print_full_rank_comparison=args.save_rank) 159 | else: 160 | evaluator = run_tests(predictor, file, dataset, args, get_full_recommendation_list=args.save_rank, k=args.nb_of_predictions) 161 | print_results(evaluator, args.metrics.split(','), file=save_file_name(predictor, dataset, args), print_full_rank_comparison=args.save_rank) 162 | 163 | if __name__ == '__main__': 164 | main() -------------------------------------------------------------------------------- /factorization/fossil.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from mf_base import MFBase 12 | 13 | class Fossil(MFBase): 14 | ''' Implementation of the algorithm presented in "Fusing Similarity Models with Markov Chains for Sparse Sequential Recommendation", by He R. and McAuley J., 2016. 
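# Illustrative sketch (not part of the original file): when number_of_batches is "*",
# main() in test.py above globs all saved checkpoints and evaluates them in epoch
# order, using the "_ne<epochs>_" token embedded in each filename. A standalone
# version of that ordering, with hypothetical filenames:
import re

def epochs_of(filename):
    m = re.search(r'_ne([0-9]+(\.[0-9]+)?)_', filename)
    return float(m.group(1))

checkpoints = ['fpmc_ne10.0_lr0.05_reg0.0025_ini1.npz', 'fpmc_ne2.5_lr0.05_reg0.0025_ini1.npz']
print(sorted(checkpoints, key=epochs_of))  # earliest epochs first, as in main()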
15 | ''' 16 | 17 | def __init__(self, k = 32, order=1, alpha=0.2, **kwargs): 18 | 19 | super(Fossil, self).__init__(**kwargs) 20 | 21 | self.name = 'Fossil' 22 | self.k = k 23 | self.order = order #markov chain order 24 | self.alpha = alpha 25 | 26 | def _get_model_filename(self, epochs): 27 | '''Return the name of the file to save the current model 28 | ''' 29 | filename = "fossil_ne"+str(epochs)+"_lr"+str(self.init_learning_rate)+"_an"+str(self.annealing_rate)+"_k"+str(self.k)+"_o"+str(self.order)+"_reg"+str(self.reg)+"_ini"+str(self.init_sigma) 30 | 31 | return filename+".npz" 32 | 33 | def init_model(self): 34 | ''' Initialize the model parameters 35 | ''' 36 | self.V = self.init_sigma * np.random.randn(self.n_items, self.k).astype(np.float32) 37 | self.H = self.init_sigma * np.random.randn(self.n_items, self.k).astype(np.float32) 38 | self.eta = self.init_sigma * np.random.randn(self.n_users, self.order).astype(np.float32) 39 | self.eta_bias = np.zeros(self.order).astype(np.float32) 40 | self.bias = np.zeros(self.n_items).astype(np.float32) 41 | 42 | def item_score(self, user_id, user_items, item=None): 43 | ''' Compute the prediction score of the Fossil model for the item "item", based on the list of items "user_items". 44 | ''' 45 | 46 | long_term = np.power(len(user_items), -self.alpha) * self.V[user_items, :].sum(axis=0) 47 | effective_order = min(self.order, len(user_items)) 48 | if user_id is None: 49 | short_term = np.dot((self.eta_bias + self.eta.mean(axis=0))[:effective_order], self.V[user_items[:-effective_order-1:-1], :]) 50 | else: 51 | short_term = np.dot((self.eta_bias + self.eta[user_id, :])[:effective_order], self.V[user_items[:-effective_order-1:-1], :]) 52 | 53 | if item is not None: 54 | return self.bias[item] + np.dot(long_term + short_term, self.H[item, :]) 55 | else: 56 | return self.bias + np.dot(long_term + short_term, self.H.T) 57 | 58 | def sgd_step(self, user_id, user_items, false_item): 59 | ''' Make one SGD update, given that the interaction between user and true_item exists, 60 | but the one between user and false_item does not. 
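# Illustrative sketch (not part of the original file): item_score above adds a
# FISM-style long-term term (the sum of V over the whole history, damped by
# len(history)**-alpha) to a short-term term that weights the V vectors of the last
# `order` items by the per-user eta (the original also adds a shared eta_bias).
# A toy numpy version with made-up shapes, assuming the history holds at least
# `order` items:
import numpy as np

n_items, k, order, alpha = 6, 4, 2, 0.2
rng = np.random.RandomState(0)
V, H = rng.randn(n_items, k), rng.randn(n_items, k)
bias, eta = np.zeros(n_items), rng.randn(order)

history = [3, 1, 4]                                  # item ids, oldest to newest
long_term = len(history) ** -alpha * V[history].sum(axis=0)
short_term = eta.dot(V[history[::-1][:order]])       # last `order` items, newest first
scores = bias + (long_term + short_term).dot(H.T)    # one score per candidate item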
61 | 62 | return error 63 | ''' 64 | 65 | true_item = user_items[-1] 66 | user_items = user_items[:-1] 67 | effective_order = min(self.order, len(user_items)) 68 | 69 | long_term = np.power(len(user_items), -self.alpha) * self.V[user_items, :].sum(axis=0) 70 | short_term = np.dot((self.eta_bias + self.eta[user_id, :])[:effective_order], self.V[user_items[:-effective_order-1:-1], :]) 71 | 72 | # Compute error 73 | x_true = self.item_score(user_id, user_items, true_item) 74 | x_false = self.item_score(user_id, user_items, false_item) 75 | delta = 1 / (1 + math.exp(-min(10, max(-10, x_false - x_true)))) # sigmoid of the error 76 | 77 | # Compute Update 78 | V_update = self.learning_rate * ( delta * np.power(len(user_items), -self.alpha) * (self.H[true_item, :] - self.H[false_item, :]) - self.reg * self.V[user_items, :]) 79 | V_update2 = self.learning_rate * delta * np.outer((self.eta_bias + self.eta[user_id, :])[:effective_order], self.H[true_item, :] - self.H[false_item, :]) 80 | H_true_up = self.learning_rate * ( delta * (long_term + short_term) - self.reg * self.H[true_item, :]) 81 | H_false_up = self.learning_rate * ( -delta * (long_term + short_term) - self.reg * self.H[false_item, :]) 82 | bias_true_up = self.learning_rate * (delta - self.reg * self.bias[true_item]) 83 | bias_false_up = self.learning_rate * (- delta - self.reg * self.bias[false_item]) 84 | eta_bias_up = self.learning_rate * (delta * np.dot(self.V[user_items[:-effective_order-1:-1], :], self.H[true_item, :] - self.H[false_item, :]) - self.reg * self.eta_bias[:effective_order]) 85 | eta_up = self.learning_rate * (delta * np.dot(self.V[user_items[:-effective_order-1:-1], :], self.H[true_item, :] - self.H[false_item, :]) - self.reg * self.eta[user_id, :effective_order]) 86 | 87 | 88 | # Update 89 | self.V[user_items, :] += V_update 90 | self.V[user_items[:-effective_order-1:-1], :] += V_update2 91 | self.H[true_item, :] += H_true_up 92 | self.H[false_item, :] += H_false_up 93 | self.bias[true_item] += bias_true_up 94 | self.bias[false_item] += bias_false_up 95 | self.eta_bias[:effective_order] += eta_bias_up 96 | self.eta[user_id, :effective_order] += eta_up 97 | 98 | return delta 99 | 100 | def get_training_sample(self): 101 | '''Pick a random triplet from self.triplets and a random false next item. 
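# Illustrative sketch (not part of the original file): the sampler described in this
# docstring (implemented just below) draws a user with at least two interactions,
# cuts the sequence at a random position, and rejection-samples a "false" next item
# that the user has not seen up to that point. A standalone toy version, assuming
# `histories` is a hypothetical list of per-user item-id lists and every user has
# consumed far fewer than n_items items:
import random

def sample_triplet(histories, n_items):
    user = random.randrange(len(histories))
    while len(histories[user]) < 2:
        user = random.randrange(len(histories))
    items = histories[user]
    t = random.randrange(1, len(items))      # items[:t] is the history, items[t] the true next
    false_item = random.randrange(n_items)
    while false_item in items[:t + 1]:       # reject items the user already interacted with
        false_item = random.randrange(n_items)
    return user, items[:t + 1], false_item   # prefix ends with the true next item, as below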
102 | returns a tuple of ids : (user_items, true_item, false_item) 103 | ''' 104 | 105 | user_id = random.randrange(self.n_users) 106 | while self.users[user_id,1] < 2: 107 | user_id = random.randrange(self.n_users) 108 | user_items = self.items[self.users[user_id,0]:self.users[user_id,0]+self.users[user_id,1]] 109 | 110 | t = random.randrange(1, len(user_items)) 111 | 112 | false_item = random.randrange(self.n_items) 113 | while false_item in user_items[:t+1]: 114 | false_item = random.randrange(self.n_items) 115 | 116 | return (user_id, user_items[:t+1], false_item) 117 | 118 | 119 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 120 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 121 | ''' 122 | 123 | if exclude is None: 124 | exclude = [] 125 | 126 | user_items = [i[0] for i in sequence] 127 | output = self.item_score(user_id, user_items) 128 | 129 | # Put low similarity to viewed items to exclude them from recommendations 130 | output[[i[0] for i in sequence]] = -np.inf 131 | output[exclude] = -np.inf 132 | 133 | # find top k according to output 134 | return list(np.argpartition(-output, range(k))[:k]) 135 | 136 | def training_step(self, iterations): 137 | return self.sgd_step(*self.get_training_sample()) 138 | 139 | def save(self, filename): 140 | '''Save the parameters of a network into a file 141 | ''' 142 | print('Save model in ' + filename) 143 | if not os.path.exists(os.path.dirname(filename)): 144 | os.makedirs(os.path.dirname(filename)) 145 | np.savez(filename, V=self.V, H=self.H, bias=self.bias, eta=self.eta, eta_bias=self.eta_bias) 146 | 147 | def load(self, filename): 148 | '''Load parameters values form a file 149 | ''' 150 | f = np.load(filename) 151 | self.V = f['V'] 152 | self.H = f['H'] 153 | self.bias = f['bias'] 154 | self.eta = f['eta'] 155 | self.eta_bias = f['eta_bias'] -------------------------------------------------------------------------------- /neural_networks/rnn_margin.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import cPickle 8 | import random 9 | from time import time 10 | import rnn_base as rnn 11 | from sparse_lstm import * 12 | 13 | class RNNMargin(rnn.RNNBase): 14 | ''' 15 | 16 | OPTIONS 17 | ------- 18 | balance: float, default 1, balance between the weight of false negative and false positive on the loss function. 19 | e.g. if balance = 1, both have the same weight, 20 | if balance = 0, only false negative have an impact, 21 | if balance = 2, false positive have twice as much weight as false negative. 22 | popularity_based: bool, default False, choose wether the target value of negatives depends on their popularity. 23 | if False, the target value of all negatives is 0 (versus 1 for the positives) 24 | if True, the target value of item i is min(1 - p_i, (1 - min_access) * p_i / min_access), where p_i = fraction of users who consumed that item. 25 | min_access: float in (0,1), default 0.05, parameter used in the formula for the target value of negatives in the popularity based case. 26 | Represent the minimum fraction of users that has access to any item. 27 | e.g. min_access=0.05 means that there is no item accessible by less than 5% of the users. 28 | n_targets: int or inf, default 1, number of items in the continuation of the sequence that will be used as positive target. 
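# Illustrative sketch (not part of the original file): with popularity_based=True the
# target of a negative item i is min(1 - p_i, (1 - min_access) * p_i / min_access),
# where p_i is the fraction of users who consumed item i (see _default_target further
# down). A toy numpy version with made-up popularities:
import numpy as np

min_access = 0.05
p = np.array([0.001, 0.05, 0.30, 0.90])   # hypothetical fraction of users per item
negative_targets = np.minimum(1 - p, (1 - min_access) * p / min_access)
# Items rarer than min_access get a target well below 1 - p_i; items consumed by at
# least a fraction min_access of the users get the full 1 - p_i.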
29 | 30 | ''' 31 | 32 | def __init__(self, loss_function="hinge", balance=1., popularity_based=False, min_access=0.05, n_targets=1, **kwargs): 33 | super(RNNMargin, self).__init__(**kwargs) 34 | 35 | self.balance = balance 36 | self.popularity_based = popularity_based 37 | self.min_access = min_access 38 | self.n_targets = n_targets 39 | if loss_function is None: 40 | loss_function = "hinge" 41 | self.loss_function_name = loss_function 42 | if loss_function == "hinge": 43 | self.loss_function = self._hinge_loss 44 | elif loss_function == "logit": 45 | self.loss_function = self._logit_loss 46 | elif loss_function == "logsig": 47 | self.loss_function = self._logsigmoid_loss 48 | else: 49 | raise ValueError('Unknown loss function') 50 | 51 | self.name = "RNN multi-targets" 52 | 53 | def _get_model_filename(self, epochs): 54 | '''Return the name of the file to save the current model 55 | ''' 56 | filename = "rnn_multitarget_"+self.loss_function_name+"_b"+str(self.balance) 57 | if self.popularity_based: 58 | filename += '_pb_ma'+str(self.min_access) 59 | return filename + "_" + self._common_filename(epochs) 60 | 61 | def _hinge_loss(self, predictions, targets, weights): 62 | return T.nnet.relu((predictions - targets) * weights).sum(axis=-1) 63 | 64 | def _logit_loss(self, predictions, targets, weights): 65 | return (T.nnet.sigmoid(predictions - targets) * weights).sum(axis=-1) 66 | 67 | def _logsigmoid_loss(self, predictions, targets, weights): 68 | return -T.log(T.nnet.sigmoid((targets - predictions) * weights)).sum(axis=-1) 69 | 70 | def _prepare_networks(self, n_items): 71 | ''' Prepares the building blocks of the RNN, but does not compile them: 72 | self.l_in : input layer 73 | self.l_mask : mask of the input layer 74 | self.target : target of the network 75 | self.l_out : output of the network 76 | self.cost : cost function 77 | ''' 78 | 79 | self.n_items = n_items 80 | 81 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 82 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length, self._input_size())) 83 | # The input is completed by a mask to inform the LSTM of the length of the sequence 84 | self.l_mask = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length)) 85 | 86 | # recurrent layer 87 | if not self.use_movies_features: 88 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=self.n_items + self._n_optional_features(), only_return_final=True) 89 | else: 90 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=None, only_return_final=True) 91 | 92 | # l_last_slice gets the last output of the recurrent layer 93 | l_last_slice = l_recurrent 94 | # l_last_slice = lasagne.layers.SliceLayer(l_recurrent, -1, 1) 95 | 96 | # Theano tensor for the targets 97 | target = T.fmatrix('multiple_target_output') 98 | target_weight = T.fmatrix('target_weight') 99 | self.exclude = T.fmatrix('excluded_items') 100 | self.theano_inputs = [self.l_in.input_var, self.l_mask.input_var, target, target_weight, self.exclude] 101 | 102 | # The sliced output is then passed through linear layer to obtain the right output size 103 | self.l_out = lasagne.layers.DenseLayer(l_last_slice, num_units=self.n_items, nonlinearity=None) 104 | 105 | # lasagne.layers.get_output produces a variable for the output of the net 106 | network_output = lasagne.layers.get_output(self.l_out) 107 | 108 | # loss function 109 | self.cost = self.loss_function(network_output, target, target_weight).mean() 
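# Illustrative sketch (not part of the original file): a plain-numpy equivalent of the
# hinge cost defined in _hinge_loss above. The weights are built in _prepare_input
# below: target items receive weight -1 (penalise scoring below their target value)
# and the remaining items receive a small positive weight (penalise scoring above
# their target).
import numpy as np

def hinge_loss(predictions, targets, weights):
    return np.maximum((predictions - targets) * weights, 0.0).sum(axis=-1)

preds   = np.array([[0.2, -0.3, 0.8]])
targets = np.array([[1.0,  0.0, 0.0]])    # item 0 is the positive target
weights = np.array([[-1.0, 0.01, 0.01]])  # -1 on the positive, small weight on negatives
print(hinge_loss(preds, targets, weights))  # penalises item 0 (below 1.0) and item 2 (above 0.0)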
110 | 111 | 112 | def _prepare_input(self, sequences): 113 | ''' Sequences is a list of [user_id, input_sequence, targets] 114 | ''' 115 | 116 | batch_size = len(sequences) 117 | 118 | # Shape return variables 119 | X = np.zeros((batch_size, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN 120 | mask = np.zeros((batch_size, self.max_length)) # mask of the input (to deal with sequences of different length) 121 | Y = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 122 | weight = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 123 | exclude = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 124 | 125 | 126 | for i, sequence in enumerate(sequences): 127 | user_id, in_seq, target = sequence 128 | seq_features = np.array(map(lambda x: self._get_features(x, user_id), in_seq)) 129 | X[i, :len(in_seq), :] = seq_features # Copy sequences into X 130 | mask[i, :len(in_seq)] = 1 131 | exclude[i, [j[0] for j in in_seq]] = 1 132 | 133 | # compute weight for false positive 134 | w = self.balance * len(target) / (self.n_items - len(target) - len(in_seq)) 135 | 136 | weight[i,:] = w 137 | weight[i, [t[0] for t in target]] = -1 138 | if self.interactions_are_unique: 139 | weight[i, [t[0] for t in in_seq]] = 0 140 | 141 | 142 | Y[i, :] = self._default_target() 143 | Y[i, [t[0] for t in target]] = 1 144 | if self.interactions_are_unique: 145 | Y[i, [t[0] for t in in_seq]] = 0 146 | 147 | 148 | # weight *= 10e3 149 | return (X, mask.astype(theano.config.floatX), Y, weight, exclude) 150 | 151 | def _default_target(self): 152 | 153 | if not hasattr(self, '__default_target'): 154 | if not self.popularity_based: 155 | self.__default_target = np.zeros(self.n_items) 156 | else: 157 | num_users = self.dataset.training_set.n_users 158 | view_prob = self.dataset.item_popularity / num_users 159 | self.__default_target = np.minimum(1 - view_prob, (1 - self.min_access) * view_prob / self.min_access) 160 | 161 | return self.__default_target 162 | -------------------------------------------------------------------------------- /factorization/fpmc.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from helpers import evaluation 12 | from mf_base import MFBase 13 | 14 | class FPMC(MFBase): 15 | ''' Implementation of the algorithm presented in "Factorizing personalized Markov chains for next-basket recommendation", by Rendle S. et al., 2010. 16 | 17 | The adaptive sampling algorithm is adapted from "Improving pairwise learning for item recommendation from implicit feedback", by Rendle S. et al., 2014 18 | ''' 19 | 20 | def __init__(self, k_cf = 32, k_mc = 32, adaptive_sampling=True, sampling_bias=500, **kwargs): 21 | 22 | super(FPMC, self).__init__(**kwargs) 23 | 24 | self.name = 'FPMC' 25 | self.k_cf = k_cf 26 | self.k_mc = k_mc 27 | self.adaptive_sampling = adaptive_sampling 28 | self.sampling_bias = sampling_bias # lambda parameter in "Improving pairwise learning for item recommendation from implicit feedback", by Rendle S. 
et al., 2014 29 | self.max_length = np.inf # For compatibility with the RNNs 30 | 31 | def _get_model_filename(self, epochs): 32 | '''Return the name of the file to save the current model 33 | ''' 34 | filename = "fpmc_ne"+str(epochs)+"_lr"+str(self.init_learning_rate)+"_an"+str(self.annealing_rate)+"_kcf"+str(self.k_cf)+"_kmc"+str(self.k_mc)+"_reg"+str(self.reg)+"_ini"+str(self.init_sigma) 35 | if self.adaptive_sampling: 36 | filename += "_as"+str(self.sampling_bias) 37 | return filename+".npz" 38 | 39 | def init_model(self): 40 | ''' Initialize the model parameters 41 | ''' 42 | self.V_user_item = self.init_sigma * np.random.randn(self.n_users, self.k_cf).astype(np.float32) 43 | self.V_item_user = self.init_sigma * np.random.randn(self.n_items, self.k_cf).astype(np.float32) 44 | self.V_prev_next = self.init_sigma * np.random.randn(self.n_items, self.k_mc).astype(np.float32) 45 | self.V_next_prev = self.init_sigma * np.random.randn(self.n_items, self.k_mc).astype(np.float32) 46 | 47 | def sgd_step(self, user, prev_item, true_next, false_next): 48 | ''' Make one SGD update, given that the transition from prev_item to true_next exist in user history, 49 | But the transition prev_item to false_next does not exist. 50 | user, prev_item, true_next and false_next are all user or item ids. 51 | 52 | return error 53 | ''' 54 | 55 | # Compute error 56 | x_true = np.dot(self.V_user_item[user, :], self.V_item_user[true_next, :]) + np.dot(self.V_prev_next[prev_item, :], self.V_next_prev[true_next, :]) 57 | x_false = np.dot(self.V_user_item[user, :], self.V_item_user[false_next, :]) + np.dot(self.V_prev_next[prev_item, :], self.V_next_prev[false_next, :]) 58 | delta = 1 - 1 / (1 + math.exp(min(10, max(-10, x_false - x_true)))) # Bound x_true - x_false in [-10, 10] to avoid overflow 59 | 60 | # Update CF 61 | V_user_item_mem = self.V_user_item[user, :] 62 | self.V_user_item[user, :] += self.learning_rate * ( delta * (self.V_item_user[true_next, :] - self.V_item_user[false_next, :]) - self.reg * self.V_user_item[user, :]) 63 | self.V_item_user[true_next, :] += self.learning_rate * ( delta * V_user_item_mem - self.reg * self.V_item_user[true_next, :]) 64 | self.V_item_user[false_next, :] += self.learning_rate * ( -delta * V_user_item_mem - self.reg * self.V_item_user[false_next, :]) 65 | 66 | # Update MC 67 | V_prev_next_mem = self.V_prev_next[prev_item, :] 68 | self.V_prev_next[prev_item, :] += self.learning_rate * ( delta * (self.V_next_prev[true_next, :] - self.V_next_prev[false_next, :]) - self.reg * self.V_prev_next[prev_item, :]) 69 | self.V_next_prev[true_next, :] += self.learning_rate * ( delta * V_prev_next_mem - self.reg * self.V_next_prev[true_next, :]) 70 | self.V_next_prev[false_next, :] += self.learning_rate * ( -delta * V_prev_next_mem - self.reg * self.V_next_prev[false_next, :]) 71 | 72 | return delta 73 | 74 | def compute_factor_rankings(self): 75 | '''Rank items according to each factor in order to do adaptive sampling 76 | ''' 77 | 78 | CF_rank = np.argsort(self.V_item_user, axis=0) 79 | MC_rank = np.argsort(self.V_next_prev, axis=0) 80 | self.ranks = np.concatenate((CF_rank, MC_rank), axis=1) 81 | 82 | CF_var = np.var(self.V_item_user, axis=0) 83 | MC_var = np.var(self.V_next_prev, axis=0) 84 | self.var = np.concatenate((CF_var, MC_var)) 85 | 86 | def get_training_sample(self): 87 | '''Pick a random triplet from self.triplets and a random false next item. 
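# Illustrative sketch (not part of the original file): with adaptive_sampling the
# negative item is drawn following Rendle et al., 2014 (implemented just below): a
# rank is sampled from an exponential distribution so that highly-ranked items are
# picked more often, and the latent factor used to rank the items is chosen with
# probability proportional to |context factor| * factor variance (computed in
# compute_factor_rankings above). A toy numpy version of those two draws:
import numpy as np

rng = np.random.RandomState(0)
n_items, sampling_bias = 100, 20.0
context = rng.randn(8)        # concatenated user and previous-item factors
factor_var = rng.rand(8)      # per-factor variance of the item factors

rank = rng.exponential(scale=sampling_bias)
while rank >= n_items:        # resample until the rank is a valid item position
    rank = rng.exponential(scale=sampling_bias)

prob = np.abs(context) * factor_var
f = rng.choice(len(prob), p=prob / prob.sum())
# The negative item is then the item sitting at position int(rank) in the ranking
# along factor f, on the side given by the sign of context[f].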
88 | returns a tuple of ids : (user, prev_item, true_next, false_next) 89 | ''' 90 | 91 | # user_id, prev_item, true_next = random.choice(self.triplets) 92 | user_id = random.randrange(self.n_users) 93 | while self.users[user_id,1] < 2: 94 | user_id = random.randrange(self.n_users) 95 | r = random.randrange(self.users[user_id,1]-1) 96 | prev_item = self.items[self.users[user_id,0]+r] 97 | true_next = self.items[self.users[user_id,0]+r+1] 98 | if self.adaptive_sampling: 99 | while True: 100 | rank = np.random.exponential(scale=self.sampling_bias) 101 | while rank >= self.n_items: 102 | rank = np.random.exponential(scale=self.sampling_bias) 103 | factor_signs = np.sign(np.concatenate((self.V_user_item[user_id, :], self.V_prev_next[prev_item, :]))) 104 | factor_prob = np.abs(np.concatenate((self.V_user_item[user_id, :], self.V_prev_next[prev_item, :]))) * self.var 105 | f = np.random.choice(self.k_cf+self.k_mc, p=factor_prob/sum(factor_prob)) 106 | false_next = self.ranks[int(rank) * factor_signs[f],f] 107 | if false_next != true_next: 108 | break 109 | else: 110 | false_next = random.randrange(self.n_items-1) 111 | if false_next >= true_next: # To make sure false_next != true_next 112 | false_next += 1 113 | 114 | return (user_id, prev_item, true_next, false_next) 115 | 116 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 117 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 118 | ''' 119 | 120 | if exclude is None: 121 | exclude = [] 122 | 123 | last_item = sequence[-1][0] 124 | output = np.dot(self.V_user_item[user_id, :], self.V_item_user.T) + np.dot(self.V_prev_next[last_item, :], self.V_next_prev.T) 125 | 126 | # Put low similarity to viewed items to exclude them from recommendations 127 | output[[i[0] for i in sequence]] = -np.inf 128 | output[exclude] = -np.inf 129 | 130 | # find top k according to output 131 | return list(np.argpartition(-output, range(k))[:k]) 132 | 133 | def training_step(self, iterations): 134 | if self.adaptive_sampling and iterations%int(self.n_items * np.log(self.n_items)) == 0: 135 | self.compute_factor_rankings() 136 | 137 | # Train with a new batch 138 | return self.sgd_step(*self.get_training_sample()) 139 | 140 | def save(self, filename): 141 | '''Save the parameters of a network into a file 142 | ''' 143 | print('Save model in ' + filename) 144 | if not os.path.exists(os.path.dirname(filename)): 145 | os.makedirs(os.path.dirname(filename)) 146 | np.savez(filename, V_user_item=self.V_user_item, V_item_user=self.V_item_user, V_prev_next=self.V_prev_next, V_next_prev=self.V_next_prev) 147 | 148 | def load(self, filename): 149 | '''Load parameters values form a file 150 | ''' 151 | f = np.load(filename) 152 | self.V_user_item = f['V_user_item'] 153 | self.V_item_user = f['V_item_user'] 154 | self.V_prev_next = f['V_prev_next'] 155 | self.V_next_prev = f['V_next_prev'] -------------------------------------------------------------------------------- /helpers/data_handling.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | import random 6 | import os.path 7 | 8 | # Data directory 9 | DEFAULT_DIR = '../../data/' 10 | 11 | 12 | class DataHandler(object): 13 | ''' Prepare data for the differents algorithms. 
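# Illustrative sketch (not part of the original file): top_k_recommendations in
# fpmc.py above (and its FISM/Fossil counterparts) excludes already-seen items by
# setting their score to -inf, then takes the k best items with np.argpartition,
# which only sorts the first k positions instead of the whole score vector. A toy
# version:
import numpy as np

scores = np.array([0.1, 0.9, 0.4, 0.8, 0.2])
seen = [1]                     # ids of items already in the sequence / excluded
scores[seen] = -np.inf
k = 2
top = list(np.argpartition(-scores, range(k))[:k])
print(top)   # [3, 2]: the two best remaining items, in decreasing score order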
14 | Give easy access to training, validation and test set and to information about the dataset 15 | such as number of users, items and interactions. 16 | ''' 17 | 18 | def __init__(self, dirname, extended_training_set=False, shuffle_training=False): 19 | ''' 20 | 21 | Parameter 22 | --------- 23 | 24 | dirname: str 25 | Directory where the dataset can be found. 26 | If dirname does not correspond to an existing path, DEFAULT_DIR+dirname will be looked for. 27 | If both dirname and DEFAULT_DIR+dirname are existing path, a warning will be printed. 28 | The directory should contains at least the following sub folders: 29 | - data/ where the dataset files can be found 30 | - models/ where the models are stored during training 31 | - results/ where the results are stored during testing 32 | 33 | extended_training_set: boolean 34 | If True, the extended training set is used, i.e. the file "train_set_sequences+" is loaded instead of "train_set_sequences". 35 | 36 | shuffle_training: boolean 37 | If True, the order of the training sequences is shuffled between each pass. 38 | ''' 39 | super(DataHandler, self).__init__() 40 | 41 | self.dirname = self._get_path(dirname) 42 | 43 | self.extended_training_set = extended_training_set 44 | if extended_training_set: 45 | self.training_set = SequenceGenerator(self.dirname+'data/train_set_sequences+', shuffle=shuffle_training) 46 | else: 47 | self.training_set = SequenceGenerator(self.dirname+'data/train_set_sequences', shuffle=shuffle_training) 48 | self.validation_set = SequenceGenerator(self.dirname+'data/val_set_sequences') 49 | self.test_set = SequenceGenerator(self.dirname+'data/test_set_sequences') 50 | 51 | self._load_stats() 52 | 53 | def training_set_triplets(self): 54 | with open(self.dirname + 'data/train_set_triplets') as f: 55 | for line in f: 56 | line = line.split() 57 | yield {'user_id': int(line[0]), 'item_id': int(line[1]), 'rating': float(line[2])} 58 | 59 | @property 60 | def item_popularity(self): 61 | '''Returns the number of occurences of an item in the training set. 62 | ''' 63 | 64 | if not hasattr(self.training_set, '_item_pop'): 65 | if os.path.isfile(self.dirname + 'data/training_set_item_popularity.npy'): 66 | self.training_set._item_pop = np.load(self.dirname + 'data/training_set_item_popularity.npy') 67 | else: 68 | self.training_set._item_pop = np.zeros(self.n_items) 69 | with open(self.dirname + 'data/train_set_triplets') as f: 70 | for line in f: 71 | self.training_set._item_pop[int(line.split()[1])] += 1 72 | np.save(self.dirname + 'data/training_set_item_popularity.npy', self.training_set._item_pop) 73 | 74 | return self.training_set._item_pop 75 | 76 | def _get_path(self, dirname): 77 | ''' Choose between dirname and DEFAULT_DIR+dirname. 78 | ''' 79 | if os.path.exists(dirname) and not os.path.exists(DEFAULT_DIR+dirname+'/'): 80 | return dirname 81 | if not os.path.exists(dirname) and os.path.exists(DEFAULT_DIR+dirname+'/'): 82 | return DEFAULT_DIR+dirname+'/' 83 | if os.path.exists(dirname) and os.path.exists(DEFAULT_DIR+dirname+'/'): 84 | print('WARNING: ambiguous directory name, both "'+dirname+'" and "'+DEFAULT_DIR+dirname+'" exist. 
"'+dirname+'" is used.') 85 | return dirname 86 | 87 | raise ValueError('Dataset not found') 88 | 89 | def _load_stats(self): 90 | ''' Load informations about the dataset from dirname/data/stats 91 | ''' 92 | with open(self.dirname+'data/stats', 'r') as f: 93 | _ = f.readline() # Line with column titles 94 | self.n_users, self.n_items, self.n_interactions, self.longest_sequence = map(int, f.readline().split()[1:]) 95 | self.training_set.n_users, self.training_set.n_items, self.training_set.n_interactions, self.training_set.longest_sequence = map(int, f.readline().split()[1:]) 96 | self.validation_set.n_users, self.validation_set.n_items, self.validation_set.n_interactions, self.validation_set.longest_sequence = map(int, f.readline().split()[1:]) 97 | self.test_set.n_users, self.test_set.n_items, self.test_set.n_interactions, self.test_set.longest_sequence = map(int, f.readline().split()[1:]) 98 | 99 | if self.extended_training_set: 100 | #Those values are unfortunately inexact 101 | self.training_set.n_users, self.training_set.n_items = self.n_users, self.n_items 102 | self.training_set.n_interactions += (self.validation_set.n_interactions + self.test_set.n_interactions)//2 103 | 104 | class SequenceGenerator(object): 105 | """docstring for SequenceGenerator""" 106 | def __init__(self, filename, shuffle=False): 107 | super(SequenceGenerator, self).__init__() 108 | self.filename = filename 109 | self.shuffle = shuffle 110 | self.epochs = 0. 111 | 112 | def load(self): 113 | 114 | self.lines = [] 115 | # self.max_length = 0 116 | # self.max_user_id = 0 117 | # self.max_item_id = 0 118 | 119 | with open(self.filename, 'r') as f: 120 | for sequence in f: 121 | self.lines.append(sequence) 122 | # self.max_length = max(self.max_length, (len(sequence.split()) - 1) / 2) 123 | # self.max_user_id = max(self.max_user_id, int(sequence.split()[0])) 124 | # self.max_item_id = max(self.max_item_id, max(map(int, sequence.split()[1::2]))) 125 | 126 | def __call__(self, min_length=2, max_length=None, length_choice='max', subsequence='contiguous', epochs=np.inf): 127 | if not hasattr(self, 'lines'): 128 | self.load() 129 | 130 | counter = 0 131 | self.epochs = 0. 132 | while counter < epochs: 133 | counter += 1 134 | print("Opening file ({})".format(counter)) 135 | if self.shuffle: 136 | random.shuffle(self.lines) 137 | for j, sequence in enumerate(self.lines): 138 | 139 | self.epochs = counter - 1 + j / len(self.lines) 140 | 141 | # Express sequence as a list of tuples (movie_id, rating) 142 | sequence = sequence.split() 143 | user_id = sequence[0] 144 | sequence = sequence[1:] 145 | sequence = [[int(sequence[2*i]), float(sequence[2*i + 1])] for i in range(int(len(sequence) / 2))] 146 | 147 | # Determine length of the sequence to be returned 148 | if max_length == None: 149 | this_max_length = len(sequence) 150 | else: 151 | this_max_length = max_length 152 | 153 | if len(sequence) < min_length: 154 | continue 155 | if (length_choice == 'random'): 156 | length = np.random.randint(min_length, min(this_max_length, len(sequence)) + 1) 157 | elif (length_choice == 'max'): 158 | length = min(this_max_length, len(sequence)) 159 | else: 160 | raise ValueError('Unrecognised length_choice option. 
Authorised values are "random" and "max" ') 161 | 162 | # Extract subsequence if needed 163 | if (length < len(sequence)): 164 | if subsequence == 'random': 165 | sequence = [ sequence[i] for i in sorted(random.sample(xrange(len(sequence)), length)) ] 166 | elif subsequence == 'contiguous': 167 | start = np.random.randint(0, len(sequence) - length + 1) 168 | sequence = sequence[start:start+length] 169 | elif subsequence == 'begining': 170 | sequence = sequence[:length] 171 | else: 172 | raise ValueError('Unrecognised subsequence option. Authorised values are "random", "contiguous" and "begining".') 173 | 174 | yield sequence, user_id 175 | -------------------------------------------------------------------------------- /neural_networks/rnn_sampling.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import cPickle 8 | import random 9 | from bisect import bisect 10 | from time import time 11 | import rnn_base as rnn 12 | from sparse_lstm import * 13 | 14 | class RNNSampling(rnn.RNNBase): 15 | """RNNSampling have a loss function that uses a sampling procedure. 16 | BPR or TOP1 17 | """ 18 | def __init__(self, loss_function="Blackout", sampling=32, last_layer_tanh=False, last_layer_init=1., diversity_bias=0.0, sampling_bias=0., **kwargs): 19 | ''' 20 | Parameters 21 | ---------- 22 | loss_function: "BPR" or "TOP1" or "Blackout" 23 | Choice between 3 loss functions: 24 | - BPR, as used in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016 25 | - TOP1, defined in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016 26 | - Blackout, discriminative loss function defined in "BlackOut: Speeding up Recurrent Neural Network Language Models With Very Large Vocabularies", Ji, S. et al., 2015 (equation 6) 27 | 28 | sampling: integer > 0 or float in (0,1) 29 | Number of items to sample in the computation of the loss function. 30 | If sampling is a float in (0,1), it is interpreted as the fraction of items to use. 31 | sampling_bias: float 32 | Items are sampled with a probability proportional to their frequency to the power of the sampling_bias. 
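# Illustrative sketch (not part of the original file): sampling an item with
# probability proportional to popularity ** sampling_bias can be done with a
# cumulative sum and bisect, which is what _popularity_sample does further down.
# A standalone toy version with made-up popularity counts:
import random
from bisect import bisect
import numpy as np

item_popularity = np.array([100., 10., 1., 50.])
sampling_bias = 0.5
cumsum = np.cumsum(np.power(item_popularity, sampling_bias))
sampled_item = bisect(cumsum, random.uniform(0, cumsum[-1]))   # index of the drawn item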
33 | 34 | ''' 35 | super(RNNSampling, self).__init__(**kwargs) 36 | 37 | self.last_layer_init = last_layer_init 38 | self.last_layer_tanh =last_layer_tanh 39 | self.diversity_bias = diversity_bias 40 | self.sampling = sampling 41 | self.sampling_bias = sampling_bias 42 | if loss_function is None: 43 | loss_function = "Blackout" 44 | self.loss_function_name = loss_function 45 | if loss_function == "BPR": 46 | self.loss_function = self._BPR_loss 47 | elif loss_function == "BPRI": 48 | self.loss_function = self._BPRI_loss 49 | elif loss_function == "TOP1": 50 | self.loss_function = self._TOP1_loss 51 | elif loss_function == "Blackout": 52 | self.loss_function = self._blackout_loss 53 | else: 54 | raise ValueError("Unknown loss function") 55 | 56 | 57 | self.name = "RNN with sampling loss" 58 | 59 | def _get_model_filename(self, epochs): 60 | '''Return the name of the file to save the current model 61 | ''' 62 | filename = "rnn_sampling_"+self.loss_function_name+"_" 63 | if self.sampling_bias > 0.: 64 | filename += "p" + str(self.sampling_bias) 65 | filename += "s"+str(self.sampling)+"_ini"+str(self.last_layer_init)+"_db"+str(self.diversity_bias) 66 | return filename + "_" + self._common_filename(epochs) 67 | 68 | def _blackout_loss(self, predictions, targets): 69 | predictions = T.nnet.softmax(predictions) 70 | pos = T.nnet.categorical_crossentropy(predictions, targets) 71 | neg = T.log(1 - predictions) 72 | return pos - neg[:, targets.shape[0]:].sum(axis=-1) 73 | 74 | def _BPRI_loss(self, predictions, targets): 75 | if self.last_layer_tanh: 76 | predictions = T.tanh(predictions) 77 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, targets.shape[0]:] 78 | return (T.log(T.nnet.sigmoid(diff))).mean(axis=-1) 79 | 80 | def _BPR_loss(self, predictions, targets): 81 | if self.last_layer_tanh: 82 | predictions = T.tanh(predictions) 83 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, targets.shape[0]:] 84 | return -(T.log(T.nnet.sigmoid(-diff))).mean(axis=-1) 85 | 86 | def _TOP1_loss(self, predictions, targets): 87 | if self.last_layer_tanh: 88 | predictions = T.tanh(predictions) 89 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, targets.shape[0]:] 90 | reg = T.sqr(predictions[:, targets.shape[0]:]) 91 | return (T.nnet.sigmoid(diff) + T.nnet.sigmoid(reg)).mean(axis=-1) 92 | 93 | def _prepare_networks(self, n_items): 94 | ''' Prepares the building blocks of the RNN, but does not compile them: 95 | self.l_in : input layer 96 | self.l_mask : mask of the input layer 97 | self.target : target of the network 98 | self.l_out : output of the network 99 | self.cost : cost function 100 | ''' 101 | 102 | self.n_items = n_items 103 | if self.sampling < 1: 104 | self.effective_sampling = int(self.sampling * self.n_items) 105 | else: 106 | self.effective_sampling = int(self.sampling) 107 | 108 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 109 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length, self._input_size())) 110 | # The input is completed by a mask to inform the LSTM of the length of the sequence 111 | self.l_mask = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length)) 112 | 113 | # recurrent layer 114 | if not self.use_movies_features: 115 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=self.n_items + self._n_optional_features(), only_return_final=True) 116 | else: 117 | l_recurrent = self.recurrent_layer(self.l_in, 
self.l_mask, true_input_size=None, only_return_final=True) 118 | 119 | # l_last_slice gets the last output of the recurrent layer 120 | l_last_slice = l_recurrent 121 | # l_last_slice = lasagne.layers.SliceLayer(l_recurrent, -1, 1) 122 | 123 | # Theano tensor for the targets 124 | target = T.ivector('target_output') 125 | samples = T.ivector('samples') 126 | self.exclude = T.fmatrix('excluded_items') 127 | target_popularity = T.fvector('target_popularity') 128 | self.theano_inputs = [self.l_in.input_var, self.l_mask.input_var, target, samples, target_popularity, self.exclude] 129 | 130 | # The sliced output is then passed through linear layer to obtain the right output size 131 | self.l_out = BlackoutLayer(l_last_slice, num_units=self.n_items, num_outputs=self.sampling, nonlinearity=None, W=lasagne.init.GlorotUniform(gain=self.last_layer_init)) 132 | 133 | # lasagne.layers.get_output produces a variable for the output of the net 134 | network_output = lasagne.layers.get_output(self.l_out, targets = target, samples=samples) 135 | 136 | # loss function 137 | self.cost = (self.loss_function(network_output,np.arange(self.batch_size)) / target_popularity).mean() 138 | 139 | 140 | def _compile_test_function(self): 141 | ''' Differs from base test function because of the added softmax operation 142 | ''' 143 | print("Compiling test...") 144 | deterministic_output = T.nnet.softmax(lasagne.layers.get_output(self.l_out, deterministic=True)) 145 | if self.interactions_are_unique: 146 | deterministic_output *= (1 - self.exclude) 147 | 148 | theano_test_function = theano.function(self.theano_inputs, deterministic_output, allow_input_downcast=True, name="Test_function", on_unused_input='ignore') 149 | 150 | def precision_test_function(theano_inputs, k=10): 151 | output = theano_test_function(*theano_inputs) 152 | ids = np.argpartition(-output, range(k), axis=-1)[0, :k] 153 | 154 | return ids 155 | 156 | self.test_function = precision_test_function 157 | print("Compilation done.") 158 | 159 | def _popularity_sample(self): 160 | if not hasattr(self, '_cumsum'): 161 | self._cumsum = np.cumsum(np.power(self.dataset.item_popularity, self.sampling_bias)) 162 | 163 | return bisect(self._cumsum, random.uniform(0, self._cumsum[-1])) 164 | 165 | def _prepare_input(self, sequences): 166 | ''' Sequences is a list of [user_id, input_sequence, targets] 167 | ''' 168 | 169 | batch_size = len(sequences) 170 | 171 | # Shape return variables 172 | X = np.zeros((batch_size, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN 173 | mask = np.zeros((batch_size, self.max_length)) # mask of the input (to deal with sequences of different length) 174 | Y = np.zeros((batch_size,), dtype='int32') # output target 175 | pop = np.zeros((batch_size,)) # output target popularity 176 | exclude = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 177 | 178 | 179 | for i, sequence in enumerate(sequences): 180 | user_id, in_seq, target = sequence 181 | seq_features = np.array(map(lambda x: self._get_features(x, user_id), in_seq)) 182 | X[i, :len(in_seq), :] = seq_features # Copy sequences into X 183 | mask[i, :len(in_seq)] = 1 184 | Y[i] = target[0][0] # id of the first and only target 185 | exclude[i, [j[0] for j in in_seq]] = 1 186 | pop[i] = self.dataset.item_popularity[target[0][0]] ** self.diversity_bias 187 | 188 | if self.sampling_bias > 0: 189 | samples = np.array([self._popularity_sample() for i in range(self.effective_sampling)], dtype='int32') 190 | else: 191 | samples = 
np.random.choice(self.n_items, self.effective_sampling).astype('int32') 192 | 193 | 194 | return (X, mask.astype(theano.config.floatX), Y, samples, pop.astype(theano.config.floatX), exclude) 195 | -------------------------------------------------------------------------------- /helpers/evaluation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | import scipy.sparse as ssp 5 | import os.path 6 | import theano 7 | import theano.tensor as T 8 | import random 9 | import operator 10 | import collections 11 | #import matplotlib.pyplot as plt 12 | 13 | # Plot multiple figures at the same time 14 | #plt.ion() 15 | 16 | class Evaluator(object): 17 | '''Evaluator is a class to compute metrics on tests 18 | 19 | It is used by first adding a series of "instances" : pairs of goals and predictions, then metrics can be computed on the ensemble of instances: 20 | average precision, percentage of instance with a correct prediction, etc. 21 | 22 | It can also return the set of correct predictions. 23 | ''' 24 | def __init__(self, dataset, k=10): 25 | super(Evaluator, self).__init__() 26 | self.instances = [] 27 | self.dataset = dataset 28 | self.k = k 29 | 30 | self.metrics = {'sps': self.short_term_prediction_success, 31 | 'recall': self.average_recall, 32 | 'precision': self.average_precision, 33 | 'ndcg': self.average_ndcg, 34 | 'item_coverage': self.item_coverage, 35 | 'user_coverage': self.user_coverage, 36 | 'assr': self.assr, 37 | 'blockbuster_share': self.blockbuster_share} 38 | 39 | def add_instance(self, goal, predictions): 40 | self.instances.append([goal, predictions]) 41 | 42 | def _load_interaction_matrix(self): 43 | '''Load the training set as an interaction matrix between items and users in a sparse format. 44 | ''' 45 | filename = self.dataset.dirname + 'data/train_set_triplets' 46 | if os.path.isfile(filename + '.npy'): 47 | file_content = np.load(filename + '.npy') 48 | else: 49 | file_content = np.loadtxt(filename) 50 | np.save(filename, file_content) 51 | 52 | self._interactions = ssp.coo_matrix((np.ones(file_content.shape[0]), (file_content[:,1], file_content[:,0]))).tocsr() 53 | 54 | def _intra_list_similarity(self, items): 55 | '''Compute the intra-list similarity of a list of items. 56 | ''' 57 | if not hasattr(self, "_interactions"): 58 | self._load_interaction_matrix() 59 | 60 | norm = np.sqrt(np.asarray(self._interactions[items, :].sum(axis=1)).ravel()) 61 | sims = self._interactions[items, :].dot(self._interactions[items, :].T).toarray() 62 | S = 0 63 | for i in range(len(items)): 64 | for j in range(i): 65 | S += sims[i, j] / norm[i] / norm[j] 66 | 67 | return S 68 | 69 | def average_intra_list_similarity(self): 70 | '''Return the average intra-list similarity, as defined in "Auralist: Introducing Serendipity into Music Recommendation" 71 | ''' 72 | 73 | ILS = 0 74 | for goal, prediction in self.instances: 75 | if len(prediction) > 0: 76 | ILS += self._intra_list_similarity(prediction[:min(len(prediction), self.k)]) 77 | 78 | return ILS / len(self.instances) 79 | 80 | 81 | def blockbuster_share(self): 82 | '''Return the percentage of correct long term predictions that are about items in the top 1% of the most popular items. 
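# Illustrative sketch (not part of the original file): the Evaluator collects
# (goal, prediction) pairs through add_instance and then averages metrics at k over
# them. A minimal recall@k and sps@k computation over such pairs, mirroring
# average_recall and short_term_prediction_success below:
instances = [([4, 7, 9], [7, 1, 4]),   # (goal items, ranked predictions)
             ([2, 3],    [5, 6, 8])]
k = 3
recall = sum(len(set(g) & set(p[:k])) / float(len(g)) for g, p in instances) / len(instances)
sps = sum(int(g[0] in p[:k]) for g, p in instances) / float(len(instances))
print(recall, sps)   # 0.333..., 0.5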
83 | ''' 84 | 85 | correct_predictions = self.get_correct_predictions() 86 | nb_pop_items = self.dataset.n_items // 100 87 | pop_items = np.argpartition(-self.dataset.item_popularity, nb_pop_items)[:nb_pop_items] 88 | 89 | if len(correct_predictions) == 0: 90 | return 0 91 | return len([i for i in correct_predictions if i in pop_items])/len(correct_predictions) 92 | 93 | def average_novelty(self): 94 | '''Return the average novelty measure, as defined in "Auralist: Introducing Serendipity into Music Recommendation" 95 | ''' 96 | 97 | nb_of_ratings = sum(self.dataset.item_popularity) 98 | 99 | novelty = 0 100 | for goal, prediction in self.instances: 101 | if len(prediction) > 0: 102 | novelty += sum(map(np.log2, self.dataset.item_popularity[prediction[:min(len(prediction), self.k)]] / nb_of_ratings)) / min(len(prediction), self.k) 103 | 104 | return -novelty / len(self.instances) 105 | 106 | def average_precision(self): 107 | '''Return the average number of correct predictions per instance. 108 | ''' 109 | precision = 0 110 | for goal, prediction in self.instances: 111 | if len(prediction) > 0: 112 | precision += float(len(set(goal) & set(prediction[:min(len(prediction), self.k)]))) / min(len(prediction), self.k) 113 | 114 | return precision / len(self.instances) 115 | 116 | def average_recall(self): 117 | '''Return the average recall. 118 | ''' 119 | recall = 0 120 | for goal, prediction in self.instances: 121 | if len(goal) > 0: 122 | recall += float(len(set(goal) & set(prediction[:min(len(prediction), self.k)]))) / len(goal) 123 | 124 | return recall / len(self.instances) 125 | 126 | def average_ndcg(self): 127 | ndcg = 0. 128 | for goal, prediction in self.instances: 129 | if len(prediction) > 0: 130 | dcg = 0. 131 | max_dcg = 0. 132 | for i, p in enumerate(prediction[:min(len(prediction), self.k)]): 133 | if i < len(goal): 134 | max_dcg += 1. / np.log2(2 + i) 135 | 136 | if p in goal: 137 | dcg += 1. / np.log2(2 + i) 138 | 139 | ndcg += dcg/max_dcg 140 | 141 | return ndcg / len(self.instances) 142 | 143 | def short_term_prediction_success(self): 144 | '''Return the percentage of instances for which the first goal was in the predictions 145 | ''' 146 | score = 0 147 | for goal, prediction in self.instances: 148 | score += int(goal[0] in prediction[:min(len(prediction), self.k)]) 149 | 150 | return score / len(self.instances) 151 | 152 | def sps(self): 153 | return self.short_term_prediction_success() 154 | 155 | def user_coverage(self): 156 | '''Return the percentage of instances for which at least one of the goals was in the predictions 157 | ''' 158 | score = 0 159 | for goal, prediction in self.instances: 160 | score += int(len(set(goal) & set(prediction[:min(len(prediction), self.k)])) > 0) 161 | 162 | return score / len(self.instances) 163 | 164 | def get_all_goals(self): 165 | '''Return a concatenation of the goals of each instances 166 | ''' 167 | return [g for goal, _ in self.instances for g in goal] 168 | 169 | def get_strict_goals(self): 170 | '''Return a concatenation of the strict goals (i.e. 
the first goal) of each instances 171 | ''' 172 | return [goal[0] for goal, _ in self.instances] 173 | 174 | def get_all_predictions(self): 175 | '''Return a concatenation of the predictions of each instances 176 | ''' 177 | return [p for _, prediction in self.instances for p in prediction[:min(len(prediction), self.k)]] 178 | 179 | def get_correct_predictions(self): 180 | '''Return a concatenation of the correct predictions of each instances 181 | ''' 182 | correct_predictions = [] 183 | for goal, prediction in self.instances: 184 | correct_predictions.extend(list(set(goal) & set(prediction[:min(len(prediction), self.k)]))) 185 | return correct_predictions 186 | 187 | def item_coverage(self): 188 | return len(set(self.get_correct_predictions())) 189 | 190 | def get_correct_strict_predictions(self): 191 | '''Return a concatenation of the strictly correct predictions of each instances (i.e. predicted the first goal) 192 | ''' 193 | correct_predictions = [] 194 | for goal, prediction in self.instances: 195 | correct_predictions.extend(list(set([goal[0]]) & set(prediction[:min(len(prediction), self.k)]))) 196 | return correct_predictions 197 | 198 | def get_rank_comparison(self): 199 | '''Returns a list of tuple of the form (position of the item in the list of goals, position of the item in the recommendations) 200 | ''' 201 | all_positions = [] 202 | for goal, prediction in self.instances: 203 | position_in_predictions = np.argsort(prediction)[goal] 204 | all_positions.extend(list(enumerate(position_in_predictions))) 205 | 206 | return all_positions 207 | 208 | def assr(self): 209 | '''Returns the average search space reduction. 210 | It is defined as the number of items in the dataset divided by the average number of dot products made during testing. 211 | ''' 212 | 213 | if hasattr(self, 'nb_of_dp') and self.nb_of_dp > 0: 214 | return self.dataset.n_items / self.nb_of_dp 215 | else: 216 | return 1 # If nb_of_dp is not defined, clustering is probably not used, return default assr: 1 217 | 218 | class DistributionCharacteristics(object): 219 | """DistributionCharacteristics computes and plot certain characteristics of a list of movies, such as the distribution of popularity. 
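# Illustrative sketch (not part of the original file): the binary NDCG@k computed in
# average_ndcg above credits 1 / log2(2 + i) when the item at position i is in the
# goal set, and normalises by the ideal DCG for that number of goal items. A toy
# version for a single instance, assuming at least k predictions are available:
import numpy as np

goal, prediction, k = [4, 9], [9, 1, 4, 7], 3
dcg = sum(1.0 / np.log2(2 + i) for i, p in enumerate(prediction[:k]) if p in goal)
idcg = sum(1.0 / np.log2(2 + i) for i in range(min(len(goal), k)))
print(dcg / idcg)   # ~0.92 for this instance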
220 | """ 221 | def __init__(self, movies): 222 | super(DistributionCharacteristics, self).__init__() 223 | self.movies = collections.Counter(movies) 224 | 225 | def plot_frequency_distribution(self): 226 | '''Plot the number of items versus the frequency 227 | ''' 228 | frequencies = self.movies.values() 229 | freq_distribution = collections.Counter(frequencies) 230 | #plt.figure() 231 | #plt.loglog(freq_distribution.keys(), freq_distribution.values(), '.') 232 | #plt.show() 233 | 234 | def plot_popularity_distribution(self): 235 | '''Bar plot of the number of movies in each popularity category 236 | ''' 237 | 238 | bars = np.zeros(10) 239 | for key, val in self.movies.items(): 240 | popularity_index = OTHER_FEATURES[key, 3] - 1 # minus 1 to shift from 1-based to 0-based counting 241 | bars[popularity_index] += val 242 | 243 | # plt.figure() 244 | # plt.bar(np.arange(10) + 0.5, bars, width=1) 245 | # plt.show() 246 | 247 | def number_of_movies(self): 248 | return len(self.movies) 249 | 250 | -------------------------------------------------------------------------------- /factorization/mf_base.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from helpers import evaluation 12 | 13 | class MFBase(object): 14 | '''Base class for methods based on matrix factorization 15 | ''' 16 | 17 | def __init__(self, reg = 0.0025, learning_rate = 0.05, annealing=1., init_sigma = 1): 18 | self.name = 'Base for matrix factorization' 19 | self.reg = reg 20 | self.learning_rate = learning_rate # self.learning_rate will change due to annealing. 
21 | self.init_learning_rate = learning_rate # self.init_learning_rate keeps the original value (for filename) 22 | self.annealing_rate = annealing 23 | self.init_sigma = init_sigma 24 | self.max_length = np.inf # For compatibility with the RNNs 25 | 26 | self.metrics = {'recall': {'direction': 1}, 27 | 'sps': {'direction': 1}, 28 | 'user_coverage' : {'direction': 1}, 29 | 'item_coverage' : {'direction': 1}, 30 | 'ndcg' : {'direction': 1}, 31 | 'blockbuster_share' : {'direction': -1} 32 | } 33 | 34 | def prepare_model(self, dataset): 35 | '''Must be called before using train, load or top_k_recommendations 36 | ''' 37 | self.dataset = dataset 38 | self.n_items = dataset.n_items 39 | self.n_users = dataset.n_users 40 | 41 | def change_data_format(self, dataset): 42 | '''Gets a generator of data in the sequence format and save data in the csr format 43 | ''' 44 | 45 | self.users = np.zeros((self.n_users,2), dtype=np.int32) 46 | self.items = np.zeros(dataset.training_set.n_interactions, dtype=np.int32) 47 | cursor = 0 48 | with open(dataset.training_set.filename, 'r') as f: 49 | for sequence in f: 50 | sequence = sequence.split() 51 | items = map(int, sequence[1::2]) 52 | self.users[int(sequence[0]), :] = [cursor, len(items)] 53 | self.items[cursor:cursor+len(items)] = items 54 | cursor += len(items) 55 | 56 | def get_pareto_front(self, metrics, metrics_names): 57 | costs = np.zeros((len(metrics[metrics_names[0]]), len(metrics_names))) 58 | for i, m in enumerate(metrics_names): 59 | costs[:, i] = np.array(metrics[m]) * self.metrics[m]['direction'] 60 | is_efficient = np.ones(costs.shape[0], dtype = bool) 61 | for i, c in enumerate(costs): 62 | if is_efficient[i]: 63 | is_efficient[is_efficient] = np.any(costs[is_efficient]>=c, axis=1) 64 | return np.where(is_efficient)[0].tolist() 65 | 66 | def _compute_validation_metrics(self, metrics): 67 | ev = evaluation.Evaluator(self.dataset, k=10) 68 | for sequence, user_id in self.dataset.validation_set(epochs=1): 69 | top_k = self.top_k_recommendations(sequence[:len(sequence)//2], user_id=int(user_id)) 70 | goal = [i[0] for i in sequence[len(sequence)//2:]] 71 | ev.add_instance(goal, top_k) 72 | 73 | metrics['recall'].append(ev.average_recall()) 74 | metrics['sps'].append(ev.sps()) 75 | metrics['ndcg'].append(ev.average_ndcg()) 76 | metrics['user_coverage'].append(ev.user_coverage()) 77 | metrics['item_coverage'].append(ev.item_coverage()) 78 | metrics['blockbuster_share'].append(ev.blockbuster_share()) 79 | 80 | return metrics 81 | 82 | def train(self, dataset, 83 | max_time=np.inf, 84 | progress=2.0, 85 | time_based_progress=False, 86 | autosave='All', 87 | save_dir='', 88 | min_iterations=0, 89 | max_iter=np.inf, 90 | max_progress_interval=np.inf, 91 | load_last_model=False, 92 | early_stopping=None, 93 | validation_metrics=['sps']): 94 | '''Train the model based on the sequence given by the training_set 95 | 96 | max_time is used to set the maximumn amount of time (in seconds) that the training can last before being stop. 97 | By default, max_time=np.inf, which means that the training will last until the training_set runs out, or the user interrupt the program. 98 | 99 | progress is used to set when progress information should be printed during training. It can be either an int or a float: 100 | integer : print at linear intervals specified by the value of progress (i.e. : progress, 2*progress, 3*progress, ...) 101 | float : print at geometric intervals specified by the value of progress (i.e. : progress, progress^2, progress^3, ...) 
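# Illustrative sketch (not part of the original file): get_pareto_front above first
# multiplies each validation metric by its 'direction' so that larger is always
# better, then discards dominated checkpoints; the autosave='Best' branch of the
# training loop below keeps only the checkpoints on that Pareto front. A toy numpy
# version with two metrics (sps: higher is better, blockbuster_share: lower is better):
import numpy as np

sps = np.array([0.10, 0.12, 0.11])
blockbuster_share = np.array([0.30, 0.35, 0.20])
costs = np.stack([sps * 1, blockbuster_share * -1], axis=1)   # apply the directions

is_efficient = np.ones(len(costs), dtype=bool)
for i, c in enumerate(costs):
    if is_efficient[i]:
        is_efficient[is_efficient] = np.any(costs[is_efficient] >= c, axis=1)
print(np.where(is_efficient)[0])   # [1 2]: checkpoint 0 is dominated by checkpoint 2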
102 | 103 | max_progress_interval can be used to have geometric intervals in the begining then switch to linear intervals. 104 | It ensures, independently of the progress parameter, that progress is shown at least every max_progress_interval. 105 | 106 | time_based_progress is used to choose between using number of iterations or time as a progress indicator. True means time (in seconds) is used, False means number of iterations. 107 | 108 | autosave is used to set whether the model should be saved during training. It can take several values: 109 | All : the model will be saved each time progress info is printed. 110 | Best : save only the best model so far 111 | None : does not save 112 | 113 | min_iterations is used to set a minimum of iterations before printing the first information (and saving the model). 114 | 115 | save_dir is the path to the directory where models are saved. 116 | 117 | load_last_model: if true, find the latest model in the directory where models should be saved, and load it before starting training. 118 | 119 | early_stopping : should be a callable that will recieve the list of validation error and the corresponding epochs and return a boolean indicating whether the learning should stop. 120 | ''' 121 | 122 | # Change data format 123 | self.change_data_format(dataset) 124 | #del dataset.training_set.lines 125 | 126 | if len(set(validation_metrics) & set(self.metrics.keys())) < len(validation_metrics): 127 | raise ValueError('Incorrect validation metrics. Metrics must be chosen among: ' + ', '.join(self.metrics.keys())) 128 | 129 | # Load last model if needed, else initialise the model 130 | iterations = 0 131 | epochs_offset = 0 132 | if load_last_model: 133 | epochs_offset = self.load_last(save_dir) 134 | if epochs_offset == 0: 135 | self.init_model() 136 | 137 | start_time = time() 138 | next_save = int(progress) 139 | train_costs = [] 140 | current_train_cost = [] 141 | epochs = [] 142 | metrics = {name:[] for name in self.metrics.keys()} 143 | filename = {} 144 | 145 | while (time() - start_time < max_time and iterations < max_iter): 146 | 147 | cost = self.training_step(iterations) 148 | 149 | current_train_cost.append(cost) 150 | 151 | # Cool learning rate 152 | if iterations % dataset.training_set.n_interactions == 0: 153 | self.learning_rate *= self.annealing_rate 154 | 155 | # Check if it is time to save the model 156 | iterations += 1 157 | 158 | if time_based_progress: 159 | progress_indicator = int(time() - start_time) 160 | else: 161 | progress_indicator = iterations 162 | 163 | if progress_indicator >= next_save: 164 | 165 | if progress_indicator >= min_iterations: 166 | 167 | # Save current epoch 168 | epochs.append(epochs_offset + iterations / dataset.training_set.n_interactions) 169 | 170 | # Average train cost 171 | train_costs.append(np.mean(current_train_cost)) 172 | current_train_cost = [] 173 | 174 | # Compute validation cost 175 | metrics = self._compute_validation_metrics(metrics) 176 | 177 | # Print info 178 | self._print_progress(iterations, epochs[-1], start_time, train_costs, metrics, validation_metrics) 179 | 180 | # Save model 181 | run_nb = len(metrics[self.metrics.keys()[0]])-1 182 | if autosave == 'All': 183 | filename[run_nb] = save_dir + self._get_model_filename(round(epochs[-1], 3)) 184 | self.save(filename[run_nb]) 185 | elif autosave == 'Best': 186 | pareto_runs = self.get_pareto_front(metrics, validation_metrics) 187 | if run_nb in pareto_runs: 188 | filename[run_nb] = save_dir + self._get_model_filename(round(epochs[-1], 3)) 
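# This run is on the Pareto front of the validation metrics: save it, then delete previously saved models that are no longer Pareto-optimal.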
189 | self.save(filename[run_nb]) 190 | to_delete = [r for r in filename if r not in pareto_runs] 191 | for run in to_delete: 192 | try: 193 | os.remove(filename[run]) 194 | print('Deleted ', filename[run]) 195 | except OSError: 196 | print('Warning : Previous model could not be deleted') 197 | del filename[run] 198 | 199 | if early_stopping is not None: 200 | # Stop if early stopping is triggered for all the validation metrics 201 | if all([early_stopping(epochs, metrics[m]) for m in validation_metrics]): 202 | break 203 | 204 | 205 | # Compute next checkpoint 206 | if isinstance(progress, int): 207 | next_save += min(progress, max_progress_interval) 208 | else: 209 | next_save += min(max_progress_interval, next_save * (progress - 1)) 210 | 211 | best_run = np.argmax(np.array(metrics[validation_metrics[0]]) * self.metrics[validation_metrics[0]]['direction']) 212 | return ({m: metrics[m][best_run] for m in self.metrics.keys()}, time()-start_time, filename[best_run]) 213 | 214 | def _print_progress(self, iterations, epochs, start_time, train_costs, metrics, validation_metrics): 215 | '''Print learning progress in terminal 216 | ''' 217 | print(self.name, iterations, "batchs, ", epochs, " epochs in", time() - start_time, "s") 218 | print("Last train cost : ", train_costs[-1]) 219 | for m in self.metrics: 220 | print(m, ': ', metrics[m][-1]) 221 | if m in validation_metrics: 222 | print('Best ', m, ': ', max(np.array(metrics[m])*self.metrics[m]['direction'])*self.metrics[m]['direction']) 223 | print('-----------------') 224 | 225 | # Print on stderr for easier recording of progress 226 | print(iterations, epochs, time() - start_time, train_costs[-1], ' '.join(map(str, [metrics[m][-1] for m in self.metrics])), file=sys.stderr) 227 | 228 | def load_last(self, save_dir): 229 | '''Load last model from dir 230 | ''' 231 | def extract_number_of_epochs(filename): 232 | m = re.search('_ne([0-9]+(\.[0-9]+)?)_', filename) 233 | return float(m.group(1)) 234 | 235 | # Get all the models for this RNN 236 | file = save_dir + self._get_model_filename("*") 237 | file = np.array(glob.glob(file)) 238 | 239 | if len(file) == 0: 240 | print('No previous model, starting from scratch') 241 | return 0 242 | 243 | # Find last model and load it 244 | last_batch = np.amax(np.array(map(extract_number_of_epochs, file))) 245 | last_model = save_dir + self._get_model_filename(last_batch) 246 | print('Starting from model ' + last_model) 247 | self.load(last_model) 248 | 249 | return last_batch -------------------------------------------------------------------------------- /helpers/command_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from neural_networks.rnn_one_hot import RNNOneHot 3 | from neural_networks.rnn_cluster import RNNCluster 4 | from neural_networks.fism_cluster import FISMCluster 5 | from neural_networks.rnn_margin import RNNMargin 6 | from neural_networks.rnn_sampling import RNNSampling 7 | from lazy.pop import Pop 8 | from lazy.markov_model import MarkovModel 9 | from lazy.user_knn import UserKNN 10 | from neural_networks.stacked_denoising_autoencoder import StackedDenoisingAutoencoder 11 | from factorization.bprmf import BPRMF 12 | from factorization.fism import FISM 13 | from factorization.fossil import Fossil 14 | from factorization.fpmc import FPMC 15 | from word2vec.ltm import LTM 16 | from helpers.early_stopping import early_stopping_command_parser, get_early_stopper 17 | from neural_networks.recurrent_layers import 
recurrent_layers_command_parser, get_recurrent_layers 18 | from neural_networks.update_manager import update_manager_command_parser, get_update_manager 19 | from neural_networks.sequence_noise import sequence_noise_command_parser, get_sequence_noise 20 | from neural_networks.target_selection import target_selection_command_parser, get_target_selection 21 | 22 | def command_parser(*sub_command_parser): 23 | ''' *sub_command_parser should be callables that will add arguments to the command parser 24 | ''' 25 | 26 | parser = argparse.ArgumentParser() 27 | 28 | for scp in sub_command_parser: 29 | scp(parser) 30 | 31 | args = parser.parse_args() 32 | return args 33 | 34 | def predictor_command_parser(parser): 35 | parser.add_argument('-m', dest='method', choices=['RNN', 'SDA', 'BPRMF', 'FPMC', 'FISM', 'Fossil', 'LTM', 'UKNN', 'MM', 'POP'], 36 | help='Method', default='RNN') 37 | parser.add_argument('-b', dest='batch_size', help='Batch size', default=16, type=int) 38 | parser.add_argument('-l', dest='learning_rate', help='Learning rate', default=0.01, type=float) 39 | parser.add_argument('-r', dest='regularization', help='Regularization (positive for L2, negative for L1)', default=0., type=float) 40 | parser.add_argument('-g', dest='gradient_clipping', help='Gradient clipping', default=100, type=int) 41 | parser.add_argument('-H', dest='hidden', help='Number of hidden neurons (for LTM and BPRMF)', default=20, type=int) 42 | parser.add_argument('-L', dest='layers', help='Layers (for SDA)', default="20", type=str) 43 | parser.add_argument('--loss', help='Loss function, choose between TOP1, BPR and Blackout (Sampling), or hinge, logit and logsig (multi-targets), or CCE (Categorical cross-entropy)', default='CCE', type=str) 44 | parser.add_argument('--sampling', help='Number of sample for the computation of the loss in RNNSampling', default=32.0, type=float) 45 | parser.add_argument('--sampling_bias', help='Sampling bias for cluster methods. 0. means uniform sampling, 1. 
means proportional to the item frequency', default=0., type=float) 46 | parser.add_argument('--db', dest='diversity_bias', help='Diversity bias (for RNN with CCE, TOP1, BPR or Blackout loss)', default=0.0, type=float) 47 | parser.add_argument('--in_do', dest='input_dropout', help='Input dropout (for SDA)', default=0.2, type=float) 48 | parser.add_argument('--do', dest='dropout', help='Dropout (for SDA)', default=0.5, type=float) 49 | parser.add_argument('--rf', help='Use rating features.', action='store_true') 50 | parser.add_argument('--mf', help='Use movie features.', action='store_true') 51 | parser.add_argument('--uf', help='Use users features.', action='store_true') 52 | parser.add_argument('--ns', help='Neighborhood size (for UKNN).', default=80, type=int) 53 | parser.add_argument('--pb', help='Popularity based (for RNNMargin).', action='store_true') 54 | parser.add_argument('--balance', help='Balance between false positive and false negative error (for RNNMargin).', default=1., type=float) 55 | parser.add_argument('--min_access', help='Estimation of minimum access probability (for RNNMargin).', default=0.05, type=float) 56 | parser.add_argument('--k_cf', help='Number of features for the CF factorization (for FPMC).', default=32, type=int) 57 | parser.add_argument('--k_mc', help='Number of features for the MC factorization (for FPMC).', default=32, type=int) 58 | parser.add_argument('--init_sigma', help='Sigma of the gaussian initialization (for FPMC)', default=1, type=float) 59 | parser.add_argument('--fpmc_bias', help='Sampling bias (for FPMC)', default=100., type=float) 60 | parser.add_argument('--no_adaptive_sampling', help='No adaptive sampling (for FPMC)', action='store_true') 61 | parser.add_argument('--cooling', help='Simulated annealing', default=1., type=float) 62 | parser.add_argument('--ltm_damping', help='Temporal damping (for LTM)', default=0.8, type=float) 63 | parser.add_argument('--ltm_window', help='Window for word2vec (for LTM)', default=5, type=int) 64 | parser.add_argument('--ltm_no_trajectory', help='Do not use users trajectory in LTM, just use word2vec', action='store_true') 65 | parser.add_argument('--max_length', help='Maximum length of sequences during training (for RNNs)', default=30, type=int) 66 | parser.add_argument('--repeated_interactions', help='The model can recommend items with which the user already interacted', action='store_true') 67 | parser.add_argument('--fism_alpha', help='Alpha parameter in FISM', default=0.2, type=float) 68 | parser.add_argument('--fossil_order', help='Order of the markov chains in Fossil', default=1, type=int) 69 | 70 | parser.add_argument('--c_sampling', help='Number of sample for the clustering loss. If unset, the same samples are used for the recommendation loss and for the clustering loss.', default=-1, type=int) 71 | parser.add_argument('--ignore_clusters', help='Don\'t use clusters during test. Useful to observe the influence of clustering', action='store_true') 72 | parser.add_argument('--clusters', help='Number of clusters. 
If unset, no clustering is used', default=-1, type=int) 73 | parser.add_argument('--init_scale', help='Initial scale of the softmax and sigmoid in the clustering method.', default=1., type=float) 74 | parser.add_argument('--scale_growing_rate', help='Rate of the geometric growth of the sigmoid/softmax scale in the clustering method.', default=1., type=float) 75 | parser.add_argument('--max_scale', help='Max scale of the softmax and sigmoid in the clustering method.', default=50, type=float) 76 | parser.add_argument('--csn', help='Cluster selection noise', default=0., type=float) 77 | parser.add_argument('--cluster_type', choices=['softmax', 'mix', 'sigmoid'], help='Type of clusters. Softmax puts every item in 1 and only 1 cluster. Sigmoid allow puts items in 0 to n clusters. Mix puts items in 1 to n clusters.', default='mix', type=str) 78 | 79 | update_manager_command_parser(parser) 80 | recurrent_layers_command_parser(parser) 81 | sequence_noise_command_parser(parser) 82 | target_selection_command_parser(parser) 83 | 84 | def get_predictor(args): 85 | args.layers = map(int, args.layers.split('-')) 86 | 87 | updater = get_update_manager(args) 88 | recurrent_layer = get_recurrent_layers(args) 89 | sequence_noise = get_sequence_noise(args) 90 | target_selection = get_target_selection(args) 91 | 92 | if args.method == "MF": 93 | return Factorization() 94 | elif args.method == "BPRMF": 95 | return BPRMF(k=args.hidden, reg = args.regularization, learning_rate = args.learning_rate, annealing=args.cooling, init_sigma = args.init_sigma, adaptive_sampling=(not args.no_adaptive_sampling), sampling_bias=args.fpmc_bias) 96 | elif args.method == "FISM": 97 | if args.clusters > 0: 98 | return FISMCluster(h=args.hidden, reg=args.regularization, alpha=args.fism_alpha, loss=args.loss, interactions_are_unique=(not args.repeated_interactions), predict_with_clusters=(not args.ignore_clusters), sampling_bias=args.sampling_bias, sampling=args.sampling, cluster_sampling=args.c_sampling, init_scale=args.init_scale, scale_growing_rate=args.scale_growing_rate, max_scale=args.max_scale, n_clusters=args.clusters, cluster_type=args.cluster_type, updater=updater, target_selection=target_selection, sequence_noise=sequence_noise, recurrent_layer=recurrent_layer, use_ratings_features=args.rf, use_movies_features=args.mf, use_users_features=args.uf, batch_size=args.batch_size) 99 | else: 100 | return FISM(k=args.hidden, reg = args.regularization, learning_rate = args.learning_rate, annealing=args.cooling, init_sigma = args.init_sigma, loss=args.loss, alpha=args.fism_alpha) 101 | elif args.method == "Fossil": 102 | return Fossil(k=args.hidden, order=args.fossil_order, reg = args.regularization, learning_rate = args.learning_rate, annealing=args.cooling, init_sigma = args.init_sigma, alpha=args.fism_alpha) 103 | elif args.method == "FPMC": 104 | return FPMC(k_cf = args.k_cf, k_mc = args.k_mc, reg = args.regularization, learning_rate = args.learning_rate, annealing=args.cooling, init_sigma = args.init_sigma, adaptive_sampling=(not args.no_adaptive_sampling), sampling_bias=args.fpmc_bias) 105 | elif args.method == "LTM": 106 | return LTM(k = args.hidden, alpha = args.ltm_damping, window = args.ltm_window, learning_rate=args.learning_rate, use_trajectory=(not args.ltm_no_trajectory)) 107 | elif args.method == "UKNN": 108 | return UserKNN(neighborhood_size=args.ns) 109 | elif args.method == "POP": 110 | return Pop() 111 | elif args.method == "MM": 112 | return MarkovModel() 113 | elif args.method == 'RNN': 114 | if args.clusters 
> 0: 115 | return RNNCluster(interactions_are_unique=(not args.repeated_interactions), max_length=args.max_length, cluster_selection_noise=args.csn, loss=args.loss, predict_with_clusters=(not args.ignore_clusters), sampling_bias=args.sampling_bias, sampling=args.sampling, cluster_sampling=args.c_sampling, init_scale=args.init_scale, scale_growing_rate=args.scale_growing_rate, max_scale=args.max_scale, n_clusters=args.clusters, cluster_type=args.cluster_type, updater=updater, target_selection=target_selection, sequence_noise=sequence_noise, recurrent_layer=recurrent_layer, use_ratings_features=args.rf, use_movies_features=args.mf, use_users_features=args.uf, batch_size=args.batch_size) 116 | elif args.loss == 'CCE': 117 | return RNNOneHot(interactions_are_unique=(not args.repeated_interactions), max_length=args.max_length, diversity_bias=args.diversity_bias, regularization=args.regularization, updater=updater, target_selection=target_selection, sequence_noise=sequence_noise, recurrent_layer=recurrent_layer, use_ratings_features=args.rf, use_movies_features=args.mf, use_users_features=args.uf, batch_size=args.batch_size) 118 | elif args.loss in ['hinge', 'logit', 'logsig']: 119 | return RNNMargin(interactions_are_unique=(not args.repeated_interactions), loss_function=args.loss, balance = args.balance, popularity_based = args.pb, min_access = args.min_access, target_selection=target_selection, sequence_noise=sequence_noise, recurrent_layer=recurrent_layer, max_length=args.max_length, updater=updater, use_ratings_features=args.rf, use_movies_features=args.mf, use_users_features=args.uf, batch_size=args.batch_size) 120 | elif args.loss in ['BPR', 'TOP1', 'Blackout']: 121 | return RNNSampling(interactions_are_unique=(not args.repeated_interactions), loss_function=args.loss, diversity_bias=args.diversity_bias, sampling=args.sampling, sampling_bias=args.sampling_bias, recurrent_layer=recurrent_layer, max_length=args.max_length, updater=updater, target_selection=target_selection, sequence_noise=sequence_noise, use_ratings_features=args.rf, use_movies_features=args.mf, use_users_features=args.uf, batch_size=args.batch_size) 122 | else: 123 | raise ValueError('Unknown loss for the RNN model') 124 | elif args.method == "SDA": 125 | return StackedDenoisingAutoencoder(interactions_are_unique=(not args.repeated_interactions), layers = args.layers, input_dropout=args.input_dropout, dropout=args.dropout, updater=updater, batch_size=args.batch_size, use_ratings_features=args.rf) 126 | 127 | 128 | -------------------------------------------------------------------------------- /word2vec/ltm.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from gensim.models.word2vec import Word2Vec 12 | from helpers import evaluation 13 | 14 | 15 | class LTM(object): 16 | """ Implementation of the algorithm proposed in "Latent Trajectory Modeling : A Light and Efficient Way to Introduce Time in Recommender Systems" by Guardia-Sebaoun, E. et al., 2015. 17 | """ 18 | def __init__(self, use_trajectory=True, alpha=0.8, k = 32, window = 5, learning_rate=0.025): 19 | ''' 20 | 21 | parameters 22 | ---------- 23 | use_trajectory: boolean 24 | If True, the original LTM algorithm is used. 
25 | Else the users features are not computed, and the predictions are made only by taking the items with the closest word2vec representation from the (window/2) last item in the sequence. 26 | alpha: float in (0,1) 27 | temporal damping parameter from "Apprentissage de trajectoires temporelles pour la recommandation dans les communautes d'utilisateurs", by Guardia-Sebaoun, E. et al. 28 | k : int > 0 29 | number of dimension for the word2vec embedding 30 | window : int > 0 31 | window size for the word2vec embedding 32 | learning_rate: float 33 | initial learning rate for word2vec. (alpha parameter in the gensim implementation of word2vec) 34 | ''' 35 | super(LTM, self).__init__() 36 | self.use_trajectory = use_trajectory 37 | self.alpha = alpha 38 | self.k = k 39 | self.window = window 40 | self.learning_rate = learning_rate 41 | 42 | self.name = 'Latent Trajectory Modeling' 43 | self.max_length = np.inf # For compatibility with the RNNs 44 | 45 | self.metrics = {'recall': {'direction': 1}, 46 | 'sps': {'direction': 1}, 47 | 'user_coverage' : {'direction': 1}, 48 | 'item_coverage' : {'direction': 1}, 49 | 'ndcg' : {'direction': 1}, 50 | 'blockbuster_share' : {'direction': -1} 51 | } 52 | 53 | 54 | def _get_model_filename(self, epochs): 55 | '''Return the name of the file to save the current model 56 | ''' 57 | filename = "ltm_ne"+str(epochs)+"_lr"+str(self.learning_rate)+"_k"+str(self.k)+"_w"+str(self.window) 58 | if self.use_trajectory: 59 | filename += "_ut"+str(self.alpha) 60 | return filename 61 | 62 | def user_features(self, sequence): 63 | '''Compute the transition features of the users based on his sequence of items. 64 | ''' 65 | features = np.zeros(self.k) 66 | for i in range(1,len(sequence)): 67 | features = self.alpha * features + (1 - self.alpha) * (self.w2v_model[str(sequence[i][0])] - self.w2v_model[str(sequence[i-1][0])]) 68 | 69 | return features 70 | 71 | def prepare_model(self, dataset): 72 | ''' For compatibility with other methods. 
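Nothing is precomputed here: the word2vec vocabulary and embeddings are built in train().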
73 | ''' 74 | pass 75 | 76 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 77 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 78 | ''' 79 | 80 | if exclude is None: 81 | exclude = [] 82 | 83 | if self.use_trajectory: 84 | f = self.user_features(sequence) 85 | else: 86 | f = np.mean(np.array([self.w2v_model[str(sequence[-i-1][0])] for i in range(self.window//2)]), axis=0) # average over last window/2 items 87 | 88 | top = self.w2v_model.similar_by_vector(f, topn=k+len(sequence)+len(exclude)) 89 | top = [int(i[0]) for i in top if int(i[0]) not in exclude] 90 | s = [i[0] for i in sequence] 91 | top = [i for i in top if i not in s] 92 | return top[:k] 93 | 94 | # # f = f / np.sqrt(np.sum(np.square(f))) 95 | # # dist = np.dot(self.w2v_model.syn0, f) 96 | # dist = -np.dot(self.w2v_model.syn0, f) / np.sum(np.square(self.w2v_model.syn0), axis=-1) 97 | # # dist = np.sum(np.square(self.w2v_model.syn0 - f), axis=-1) 98 | 99 | # # Put low similarity to viewed items to exclude them from recommendations 100 | # dist[[self.w2v_model.vocab[str(i)].index for i in exclude]] = np.inf 101 | # dist[[self.w2v_model.vocab[str(i[0])].index for i in sequence]] = np.inf 102 | 103 | # # find top k according to dist 104 | # return [int(self.w2v_model.index2word[i]) for i in list(np.argpartition(dist, range(k))[:k])] 105 | 106 | def word2vec_training_generator(self, dataset): 107 | '''Take a generator of sequences and produce a generator in the format used by gensim word2vec module 108 | ''' 109 | for sequence, user_id in dataset.training_set(epochs=1): 110 | yield [str(i[0]) for i in sequence] 111 | 112 | def set_dataset(self, dataset): 113 | self.dataset = dataset 114 | 115 | def train(self, dataset, 116 | max_time=np.inf, 117 | progress=2.0, 118 | time_based_progress=False, 119 | autosave='All', 120 | save_dir='', 121 | min_iterations=0, 122 | max_iter=np.inf, 123 | max_progress_interval=np.inf, 124 | load_last_model=False, 125 | early_stopping=None, 126 | validation_metrics=['sps']): 127 | '''Train the model based on the sequence given by the training_set 128 | 129 | !!!! Contrary to what the train function of other algorithms, here an iteration is equivalent to one epoch !!!!!!! 130 | 131 | max_time is used to set the maximumn amount of time (in seconds) that the training can last before being stop. 132 | By default, max_time=np.inf, which means that the training will last until the training_set runs out, or the user interrupt the program. 133 | 134 | progress is used to set when progress information should be printed during training. It can be either an int or a float: 135 | integer : print at linear intervals specified by the value of progress (i.e. : progress, 2*progress, 3*progress, ...) 136 | float : print at geometric intervals specified by the value of progress (i.e. : progress, progress^2, progress^3, ...) 137 | 138 | max_progress_interval can be used to have geometric intervals in the begining then switch to linear intervals. 139 | It ensures, independently of the progress parameter, that progress is shown at least every max_progress_interval. 140 | 141 | time_based_progress is used to choose between using number of iterations or time as a progress indicator. True means time (in seconds) is used, False means number of iterations. 142 | 143 | autosave is used to set whether the model should be saved during training. It can take several values: 144 | All : the model will be saved each time progress info is printed. 
145 | Best : save only the best model so far 146 | None : does not save 147 | 148 | min_iterations is used to set a minimum of iterations before printing the first information (and saving the model). 149 | 150 | save_dir is the path to the directory where models are saved. 151 | 152 | load_last_model: if true, find the latest model in the directory where models should be saved, and load it before starting training. 153 | 154 | early_stopping : should be a callable that will recieve the list of validation error and the corresponding epochs and return a boolean indicating whether the learning should stop. 155 | ''' 156 | 157 | self.set_dataset(dataset) 158 | 159 | if len(set(validation_metrics) & set(self.metrics.keys())) < len(validation_metrics): 160 | raise ValueError('Incorrect validation metrics. Metrics must be chosen among: ' + ', '.join(self.metrics.keys())) 161 | 162 | # Load last model if needed, else initialise the model 163 | iterations = 0 164 | epochs_offset = 0 165 | if load_last_model: 166 | epochs_offset = self.load_last(save_dir) 167 | if not hasattr(self, 'w2v_model'): 168 | self.w2v_model = Word2Vec(iter = 1, min_count = 1, size=self.k, window=self.window, alpha=self.learning_rate, sg=0) 169 | self.w2v_model.build_vocab([map(str, range(dataset.n_items))]) 170 | 171 | start_time = time() 172 | next_save = int(progress) 173 | epochs = [] 174 | metrics = {name:[] for name in self.metrics.keys()} 175 | filename = {} 176 | 177 | while (time() - start_time < max_time and iterations < max_iter): 178 | 179 | # Train one epoch 180 | self.w2v_model.train(self.word2vec_training_generator(dataset)) 181 | 182 | # Check if it is time to save the model 183 | iterations += 1 184 | 185 | if time_based_progress: 186 | progress_indicator = int(time() - start_time) 187 | else: 188 | progress_indicator = iterations 189 | 190 | if progress_indicator >= next_save: 191 | 192 | if progress_indicator >= min_iterations: 193 | 194 | # Save current epoch 195 | epochs.append(epochs_offset + iterations) 196 | 197 | # Compute validation cost 198 | metrics = self._compute_validation_metrics(metrics) 199 | 200 | # Print info 201 | self._print_progress(iterations, epochs[-1], start_time, metrics, validation_metrics) 202 | 203 | # Save model 204 | run_nb = len(metrics[self.metrics.keys()[0]])-1 205 | if autosave == 'All': 206 | filename[run_nb] = save_dir + self._get_model_filename(round(epochs[-1], 3)) 207 | self.save(filename[run_nb]) 208 | elif autosave == 'Best': 209 | pareto_runs = self.get_pareto_front(metrics, validation_metrics) 210 | if run_nb in pareto_runs: 211 | filename[run_nb] = save_dir + self._get_model_filename(round(epochs[-1], 3)) 212 | self.save(filename[run_nb]) 213 | to_delete = [r for r in filename if r not in pareto_runs] 214 | for run in to_delete: 215 | try: 216 | os.remove(filename[run]) 217 | except OSError: 218 | print('Warning : Previous model could not be deleted') 219 | del filename[run] 220 | 221 | if early_stopping is not None: 222 | # Stop if early stopping is triggered for all the validation metrics 223 | if all([early_stopping(epochs, metrics[m]) for m in validation_metrics]): 224 | break 225 | 226 | 227 | # Compute next checkpoint 228 | if isinstance(progress, int): 229 | next_save += min(progress, max_progress_interval) 230 | else: 231 | next_save += min(max_progress_interval, next_save * (progress - 1)) 232 | 233 | def get_pareto_front(self, metrics, metrics_names): 234 | costs = np.zeros((len(metrics[metrics_names[0]]), len(metrics_names))) 235 | for i, m in 
enumerate(metrics_names): 236 | costs[:, i] = np.array(metrics[m]) * self.metrics[m]['direction'] 237 | is_efficient = np.ones(costs.shape[0], dtype = bool) 238 | for i, c in enumerate(costs): 239 | if is_efficient[i]: 240 | is_efficient[is_efficient] = np.any(costs[is_efficient]>=c, axis=1) 241 | return np.where(is_efficient)[0].tolist() 242 | 243 | def _compute_validation_metrics(self, metrics): 244 | 245 | ev = evaluation.Evaluator(self.dataset, k=10) 246 | for sequence, user_id in self.dataset.validation_set(epochs=1): 247 | top_k = self.top_k_recommendations(sequence[:len(sequence)//2], user_id=int(user_id)) 248 | goal = [i[0] for i in sequence[len(sequence)//2:]] 249 | ev.add_instance(goal, top_k) 250 | 251 | metrics['recall'].append(ev.average_recall()) 252 | metrics['sps'].append(ev.sps()) 253 | metrics['ndcg'].append(ev.average_ndcg()) 254 | metrics['user_coverage'].append(ev.user_coverage()) 255 | metrics['item_coverage'].append(ev.item_coverage()) 256 | metrics['blockbuster_share'].append(ev.blockbuster_share()) 257 | 258 | return metrics 259 | 260 | def _print_progress(self, iterations, epochs, start_time, metrics, validation_metrics): 261 | '''Print learning progress in terminal 262 | ''' 263 | print(self.name, iterations, "batchs, ", epochs, " epochs in", time() - start_time, "s") 264 | for m in self.metrics: 265 | print(m, ': ', metrics[m][-1]) 266 | if m in validation_metrics: 267 | print('Best ', m, ': ', max(np.array(metrics[m])*self.metrics[m]['direction'])*self.metrics[m]['direction']) 268 | print('-----------------') 269 | 270 | # Print on stderr for easier recording of progress 271 | print(iterations, epochs, time() - start_time, 'n/a', ' '.join(map(str, [metrics[m][-1] for m in self.metrics])), file=sys.stderr) 272 | 273 | def save(self, filename): 274 | '''Save the word2vec object into a file 275 | ''' 276 | print('Save model in ' + filename) 277 | self.w2v_model.save(filename) 278 | 279 | def load_last(self, save_dir): 280 | '''Load last model from dir 281 | ''' 282 | def extract_number_of_epochs(filename): 283 | m = re.search('_ne([0-9]+(\.[0-9]+)?)_', filename) 284 | return float(m.group(1)) 285 | 286 | # Get all the models for this RNN 287 | file = save_dir + self._get_model_filename("*") 288 | file = np.array(glob.glob(file)) 289 | 290 | if len(file) == 0: 291 | print('No previous model, starting from scratch') 292 | return 0 293 | 294 | # Find last model and load it 295 | last_batch = np.amax(np.array(map(extract_number_of_epochs, file))) 296 | last_model = save_dir + self._get_model_filename(last_batch) 297 | print('Starting from model ' + last_model) 298 | self.load(last_model) 299 | 300 | return last_batch 301 | 302 | 303 | def load(self, filename): 304 | '''Load parameters values form a file 305 | ''' 306 | self.w2v_model = Word2Vec.load(filename) -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import random 6 | import argparse 7 | import os 8 | import sys 9 | from shutil import copyfile 10 | 11 | def command_parser(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-f', dest='filename', help='Input file', required=True, type=str) 14 | parser.add_argument('--columns', help='Order of the columns in the file (eg: "uirt"), u for user, i for item, t for timestamp, r for rating. 
If r is not present a default rating of 1 is given to all interaction. If t is not present interactions are assumed to be in chronological order. Extra columns are ignored. Default: uit', default="uit", type=str) 15 | parser.add_argument('--sep', help='Separator between the column. If unspecified pandas will try to guess the separator', default="\s+", type=str) 16 | parser.add_argument('--min_user_activity', help='Users with less interactions than this will be removed from the dataset. Default: 2', default=2, type=int) 17 | parser.add_argument('--min_item_pop', help='Items with less interactions than this will be removed from the dataset. Default: 5', default=5, type=int) 18 | parser.add_argument('--val_size', help='Number of users to put in the validation set. If in (0,1) it will be interpreted as the fraction of total number of users. Default: 0.1', default=0.1, type=float) 19 | parser.add_argument('--test_size', help='Number of users to put in the test set. If in (0,1) it will be interpreted as the fraction of total number of users. Default: 0.1', default=0.1, type=float) 20 | parser.add_argument('--seed', help='Seed for the random train/val/test split', default=1, type=int) 21 | 22 | args = parser.parse_args() 23 | args.dirname = os.path.dirname(os.path.abspath(args.filename)) + "/" 24 | return args 25 | 26 | def warn_user(dirname): 27 | '''Ask user if he's sure to create files in that directory. 28 | ''' 29 | print('This program will create a lot of files and directories in ' + dirname) 30 | answer = raw_input('Are you sure that you want to do that ? [y/n]') 31 | if answer != "y": 32 | sys.exit(0) 33 | 34 | def create_dirs(dirname): 35 | if not os.path.exists(dirname + "data"): 36 | os.makedirs(dirname + "data") 37 | 38 | if not os.path.exists(dirname + "models"): 39 | os.makedirs(dirname + "models") 40 | 41 | if not os.path.exists(dirname + "results"): 42 | os.makedirs(dirname + "results") 43 | 44 | def load_data(filename, columns, separator): 45 | ''' Load the data from filename and sort it according to timestamp. 46 | Returns a dataframe with 3 columns: user_id, item_id, rating 47 | ''' 48 | 49 | print('Load data...') 50 | data = pd.read_csv(filename, sep=separator, names=list(columns), index_col=False, usecols=range(len(columns))) 51 | 52 | if 'r' not in columns: 53 | # Add a column of default ratings 54 | data['r'] = 1 55 | 56 | if 't' in columns: 57 | # sort according to the timestamp column 58 | if data['t'].dtype == np.int64: # probably a timestamp 59 | data['t'] = pd.to_datetime(data['t'], unit='s') 60 | else: 61 | data['t'] = pd.to_datetime(data['t']) 62 | print('Sort data in chronological order...') 63 | data.sort_values('t', inplace=True) 64 | 65 | return data 66 | 67 | def remove_rare_elements(data, min_user_activity, min_item_popularity): 68 | '''Removes user and items that appears in too few interactions. 69 | min_user_activity is the minimum number of interaction that a user should have. 70 | min_item_popularity is the minimum number of interaction that an item should have. 71 | NB: the constraint on item might not be strictly satisfied because rare users and items are removed in alternance, 72 | and the last removal of inactive users might create new rare items. 
73 | ''' 74 | 75 | print('Remove inactive users and rare items...') 76 | 77 | #Remove inactive users a first time 78 | user_activity = data.groupby('u').size() 79 | data = data[np.in1d(data.u, user_activity[user_activity >= min_user_activity].index)] 80 | #Remove unpopular items 81 | item_popularity = data.groupby('i').size() 82 | data = data[np.in1d(data.i, item_popularity[item_popularity >= min_item_popularity].index)] 83 | #Remove users that might have passed below the activity threshold due to the removal of rare items 84 | user_activity = data.groupby('u').size() 85 | data = data[np.in1d(data.u, user_activity[user_activity >= min_user_activity].index)] 86 | 87 | return data 88 | 89 | def save_index_mapping(data, separator, dirname): 90 | ''' Save the mapping of original user and item ids to numerical consecutive ids in dirname. 91 | NB: some users and items might have been removed in previous steps and will therefore not appear in the mapping. 92 | ''' 93 | 94 | separator = "\t" 95 | 96 | 97 | # Pandas categorical type will create the numerical ids we want 98 | print('Map original users and items ids to consecutive numerical ids...') 99 | data['u_original'] = data['u'].astype('category') 100 | data['i_original'] = data['i'].astype('category') 101 | data['u'] = data['u_original'].cat.codes 102 | data['i'] = data['i_original'].cat.codes 103 | 104 | print('Save ids mapping to file...') 105 | user_mapping = pd.DataFrame({'original_id' : data['u_original'], 'new_id': data['u']}) 106 | user_mapping.sort_values('original_id', inplace=True) 107 | user_mapping.drop_duplicates(subset='original_id', inplace=True) 108 | user_mapping.to_csv(dirname+"data/user_id_mapping", sep=separator, index=False) 109 | 110 | item_mapping = pd.DataFrame({'original_id' : data['i_original'], 'new_id': data['i']}) 111 | item_mapping.sort_values('original_id', inplace=True) 112 | item_mapping.drop_duplicates(subset='original_id', inplace=True) 113 | item_mapping.to_csv(dirname+"data/item_id_mapping", sep=separator, index=False) 114 | 115 | return data 116 | 117 | def split_data(data, nb_val_users, nb_test_users, dirname): 118 | '''Splits the data set into training, validation and test sets. 119 | Each user is in one and only one set. 120 | nb_val_users is the number of users to put in the validation set. 121 | nb_test_users is the number of users to put in the test set. 
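If nb_val_users or nb_test_users is in (0,1), it is interpreted as a fraction of the total number of users.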
122 | ''' 123 | nb_users = data['u'].nunique() 124 | 125 | # check if nb_val_user is specified as a fraction 126 | if nb_val_users < 1: 127 | nb_val_users = round(nb_val_users * nb_users) 128 | if nb_test_users < 1: 129 | nb_test_users = round(nb_test_users * nb_users) 130 | nb_test_users = int(nb_test_users) 131 | nb_val_users = int(nb_val_users) 132 | 133 | if nb_users <= nb_val_users+nb_test_users: 134 | raise ValueError('Not enough users in the dataset: choose less users for validation and test splits') 135 | 136 | def extract_n_users(df, n): 137 | users_ids = np.random.choice(df['u'].unique(), n) 138 | n_set = df[df['u'].isin(users_ids)] 139 | remain_set = df.drop(n_set.index) 140 | return n_set, remain_set 141 | 142 | print('Split data into training, validation and test sets...') 143 | test_set, tmp_set = extract_n_users(data, nb_test_users) 144 | val_set, train_set = extract_n_users(tmp_set, nb_val_users) 145 | 146 | print('Save training, validation and test sets in the triplets format...') 147 | train_set.to_csv(dirname + "data/train_set_triplets", sep="\t", columns=['u', 'i', 'r'], index=False, header=False) 148 | val_set.to_csv(dirname + "data/val_set_triplets", sep="\t", columns=['u', 'i', 'r'], index=False, header=False) 149 | test_set.to_csv(dirname + "data/test_set_triplets", sep="\t", columns=['u', 'i', 'r'], index=False, header=False) 150 | 151 | return train_set, val_set, test_set 152 | 153 | def gen_sequences(data, half=False): 154 | '''Generates sequences of user actions from data. 155 | each sequence has the format [user_id, first_item_id, first_item_rating, 2nd_item_id, 2nd_item_rating, ...]. 156 | If half is True, cut the sequences to half their true length (useful to produce the extended training set). 157 | ''' 158 | data = data.sort_values('u', kind="mergesort") # Mergesort is stable and keeps the time ordering 159 | seq = [] 160 | prev_id = -1 161 | for u, i, r in zip(data['u'], data['i'], data['r']): 162 | if u != prev_id: 163 | if len(seq) > 3: 164 | if half: 165 | seq = seq[:1+2*int((len(seq) - 1)/4)] 166 | yield seq 167 | prev_id = u 168 | seq = [u] 169 | seq.extend([i,r]) 170 | if half: 171 | seq = seq[:1+2*int((len(seq) - 1)/4)] 172 | yield seq 173 | 174 | def make_sequence_format(train_set, val_set, test_set, dirname): 175 | '''Convert the train/validation/test sets in the sequence format and save them. 176 | Also create the extended training sequences, which countains the first half of the sequences of users in the validation and test sets. 
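The extended training set is saved as "data/train_set_sequences+", next to the other sequence files.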
177 | ''' 178 | 179 | print('Save the training set in the sequences format...') 180 | with open(dirname+"data/train_set_sequences", "w") as f: 181 | for s in gen_sequences(train_set): 182 | f.write(' '.join(map(str, s)) + "\n") 183 | 184 | print('Save the validation set in the sequences format...') 185 | with open(dirname+"data/val_set_sequences", "w") as f: 186 | for s in gen_sequences(val_set): 187 | f.write(' '.join(map(str, s)) + "\n") 188 | 189 | print('Save the test set in the sequences format...') 190 | with open(dirname+"data/test_set_sequences", "w") as f: 191 | for s in gen_sequences(test_set): 192 | f.write(' '.join(map(str, s)) + "\n") 193 | 194 | # sequences+ contains all the sequences of train_set_sequences plus half the sequences of val and test sets 195 | print('Save the extended training set in the sequences format...') 196 | copyfile(dirname+"data/train_set_sequences", dirname+"data/train_set_sequences+") 197 | with open(dirname+"data/train_set_sequences+", "a") as f: 198 | for s in gen_sequences(val_set, half=True): 199 | f.write(' '.join(map(str, s)) + "\n") 200 | for s in gen_sequences(test_set, half=True): 201 | f.write(' '.join(map(str, s)) + "\n") 202 | 203 | def save_data_stats(data, train_set, val_set, test_set, dirname): 204 | print('Save stats...') 205 | 206 | def _get_stats(df): 207 | return "\t".join(map(str, [df['u'].nunique(), df['i'].nunique(), len(df.index), df.groupby('u').size().max()])) 208 | 209 | with open(dirname+"data/stats", "w") as f: 210 | f.write("set\tn_users\tn_items\tn_interactions\tlongest_sequence\n") 211 | f.write("Full\t"+ _get_stats(data) + "\n") 212 | f.write("Train\t"+ _get_stats(train_set) + "\n") 213 | f.write("Val\t"+ _get_stats(val_set) + "\n") 214 | f.write("Test\t"+ _get_stats(test_set) + "\n") 215 | 216 | def make_readme(dirname, val_set, test_set): 217 | data_readme = '''The following files were automatically generated by preprocess.py 218 | 219 | user_id_mapping 220 | mapping between the users ids in the original dataset and the new users ids. 221 | the first column contains the new id and the second the original id. 222 | Inactive users might have been deleted from the original, and they will therefore not appear in the id mapping. 223 | 224 | item_id_mapping 225 | Idem for item ids. 226 | 227 | train_set_triplets 228 | Training set in the triplets format. 229 | Each line is a user item interaction in the form (user_id, item_id, rating). 230 | Interactions are listed in chronological order. 231 | 232 | train_set_sequences 233 | Training set in the sequence format. 234 | Each line contains all the interactions of a user in the form (user_id, first_item_id, first_rating, 2nd_item_id, 2nd_rating, ...). 235 | 236 | train_set_sequences+ 237 | Extended training set in the sequence format. 238 | The extended training set contains all the training set plus the first half of the interactions of each users in the validation and testing set. 239 | 240 | val_set_triplets 241 | Validation set in the triplets format 242 | 243 | val_set_triplets 244 | Validation set in the sequence format 245 | 246 | test_set_triplets 247 | Test set in the triplets format 248 | 249 | test_set_triplets 250 | Test set in the sequence format 251 | 252 | stats 253 | Contains some informations about the dataset. 254 | 255 | The training, validation and test sets are obtain by randomly partitioning the users and all their interactions into 3 sets. 256 | The validation set contains {n_val} users, the test_set {n_test} users and the train set all the other users. 
257 | 258 | '''.format(n_val=str(val_set['u'].nunique()), n_test=str(test_set['u'].nunique())) 259 | 260 | results_readme = '''The format of the results file is the following 261 | Each line correspond to one model, with the fields being: 262 | Number of epochs 263 | precision 264 | sps 265 | user coverage 266 | number of unique items in the test set 267 | number of unique items in the recommendations 268 | number of unique items in the succesful recommendations 269 | number of unique items in the short-term test set (when the goal is to predict precisely the next item) 270 | number of unique items in the successful short-term recommendations 271 | recall 272 | NDCG 273 | NB: all the metrics are computed "@10" 274 | ''' 275 | 276 | with open(dirname+"data/README", "w") as f: 277 | f.write(data_readme) 278 | with open(dirname+"results/README", "w") as f: 279 | f.write(results_readme) 280 | 281 | def main(): 282 | 283 | args = command_parser() 284 | np.random.seed(seed=args.seed) 285 | warn_user(args.dirname) 286 | create_dirs(args.dirname) 287 | data = load_data(args.filename, args.columns, args.sep) 288 | data = remove_rare_elements(data, args.min_user_activity, args.min_item_pop) 289 | data = save_index_mapping(data, args.sep, args.dirname) 290 | train_set, val_set, test_set = split_data(data, args.val_size, args.test_size, args.dirname) 291 | make_sequence_format(train_set, val_set, test_set, args.dirname) 292 | save_data_stats(data, train_set, val_set, test_set, args.dirname) 293 | make_readme(args.dirname, val_set, test_set) 294 | 295 | print('Data ready!') 296 | 297 | print(data.head(10)) 298 | 299 | if __name__ == '__main__': 300 | main() -------------------------------------------------------------------------------- /neural_networks/fism_cluster.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import scipy.sparse as sp 7 | import theano.sparse 8 | import lasagne 9 | import cPickle 10 | import os 11 | import sys 12 | import random 13 | from time import time 14 | from rnn_cluster import RNNCluster 15 | from sparse_lstm import * 16 | from helpers import evaluation 17 | from helpers.sparse_layer import SparseLayer 18 | 19 | class FISMCluster(RNNCluster): 20 | """FISMCluster combines FISM with item clustering. 21 | 22 | Parameters 23 | ---------- 24 | 25 | h: int 26 | Size of the embedding. 27 | 28 | alpha: float 29 | Exponant of the normalization term in FISM 30 | 31 | reg: float 32 | Regularization coefficient. If reg > 0, L2 regularization is used, otherwise L1 regularization is used with coef -reg. 33 | 34 | FISMCluster is built on top of RNNCluster, all the parameters associated to the clustering are described in RNNCluster. 
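Note: the max_length argument is accepted for compatibility with the RNN models but is not used; FISM always builds the user representation from the full interaction history.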
35 | """ 36 | def __init__(self, h=100, alpha=0.5, reg=0.00025, max_length=np.inf, **kwargs): 37 | super(FISMCluster, self).__init__(max_length=np.inf, **kwargs) 38 | 39 | self.n_hidden = h 40 | self.alpha = alpha 41 | self.reg = reg 42 | self.target_selection.shuffle = True 43 | self.name = "FISM Cluster with categorical cross entropy" 44 | self.recurrent_layer.name = "" 45 | 46 | def _get_model_filename(self, epochs): 47 | '''Return the name of the file to save the current model 48 | ''' 49 | filename = "fism_clusters"+str(self.n_clusters)+"_sc"+str(self.init_scale) 50 | 51 | if self.scale_growing_rate != 1.: 52 | filename += "-"+str(self.scale_growing_rate)+"-"+str(self.max_scale) 53 | 54 | filename += "_h"+ str(self.n_hidden) + "_a" + str(self.alpha) +"_" 55 | if self.sampling_bias > 0.: 56 | filename += "p" + str(self.sampling_bias) 57 | filename += "s"+str(self.n_samples) 58 | 59 | if self.n_cluster_samples > 0: 60 | filename += "_" 61 | if self.sampling_bias > 0.: 62 | filename += "p" + str(self.sampling_bias) 63 | filename += "cs"+str(self.n_cluster_samples) 64 | 65 | if self.cluster_type == 'softmax': 66 | filename += "_softmax" 67 | elif self.cluster_type == 'mix': 68 | filename += "_mix" 69 | 70 | if self.cluster_selection_noise > 0.: 71 | filename += '_n' + str(self.cluster_selection_noise) 72 | 73 | if self.reg != 0.: 74 | filename += '_r' + str(self.reg) 75 | 76 | filename += "_c" + self.loss 77 | 78 | return filename+"_"+self._common_filename(epochs) 79 | 80 | def _prepare_networks(self, n_items): 81 | ''' Prepares the building blocks of the RNN, but does not compile them: 82 | self.l_in : input layer 83 | self.l_mask : mask of the input layer 84 | self.target : target of the network 85 | self.l_out : output of the network 86 | self.cost : cost function 87 | ''' 88 | 89 | self.n_items = n_items 90 | 91 | # Theano tensor for the targets 92 | input_var = theano.sparse.csr_matrix('input_var') 93 | self.target = T.ivector('target_output') 94 | self.exclude = T.fmatrix('excluded_items') 95 | self.samples = T.ivector('samples') 96 | self.cluster_samples = T.ivector('cluster_samples') 97 | 98 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 99 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.n_items), input_var=input_var) 100 | 101 | l_user_rep = SparseLayer(self.l_in, num_units=self.n_hidden, nonlinearity=None, b=None) 102 | 103 | self.user_representation_layer = l_user_rep 104 | 105 | # The sliced output is then passed through linear layer to obtain the right output size 106 | self.l_out = BlackoutLayer(l_user_rep, num_units=self.n_items, num_outputs=self.n_samples, nonlinearity=None, W=lasagne.init.GlorotUniform()) 107 | 108 | # lasagne.layers.get_output produces a variable for the output of the net 109 | network_output = lasagne.layers.get_output(self.l_out, targets = self.target, samples=self.samples) 110 | 111 | # loss function 112 | self.cost = self._loss(network_output,self.batch_size).mean() 113 | if self.reg > 0.: 114 | self.cost += self.reg * lasagne.regularization.regularize_network_params(self.l_out, lasagne.regularization.l2) 115 | elif self.reg < 0.: 116 | self.cost -= self.reg * lasagne.regularization.regularize_network_params(self.l_out, lasagne.regularization.l1) 117 | 118 | 119 | # Cluster learning 120 | self.T_scale = theano.shared(self.effective_scale) 121 | scaled_softmax = lambda x: lasagne.nonlinearities.softmax(x*self.T_scale) 122 | 123 | self.cluster_selection_layer = 
lasagne.layers.DenseLayer(l_user_rep, b=None, num_units=self.n_clusters, nonlinearity=None) 124 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer) 125 | if self.cluster_selection_noise > 0.: 126 | cluster_selection = cluster_selection + self._srng.normal(cluster_selection.shape, avg=0.0, std=self.cluster_selection_noise) 127 | cluster_selection = scaled_softmax(cluster_selection) 128 | 129 | self.cluster_repartition = theano.shared((0.1 * np.random.randn(self.n_items, self.n_clusters)).astype(theano.config.floatX)) 130 | if self.cluster_type == 'softmax': 131 | target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 132 | elif self.cluster_type == 'mix': 133 | target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) + \ 134 | T.nnet.sigmoid(self.T_scale*self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 135 | else: 136 | target_and_samples_clusters = T.nnet.sigmoid(self.T_scale*self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 137 | cluster_score = cluster_selection.dot(target_and_samples_clusters.T) 138 | 139 | self.cost_clusters = self._loss(cluster_score, self.batch_size).mean() 140 | 141 | def _compile_train_function(self): 142 | ''' Compile self.train. 143 | self.train recieves a sequence and a target for every steps of the sequence, 144 | compute error on every steps, update parameter and return global cost (i.e. the error). 145 | ''' 146 | print("Compiling train...") 147 | # Compute AdaGrad updates for training 148 | all_params = lasagne.layers.get_all_params(self.l_out, trainable=True) 149 | updates = self.updater(self.cost, all_params) 150 | 151 | params_clusters = self.cluster_selection_layer.get_params(trainable=True) 152 | params_clusters.append(self.cluster_repartition) 153 | updates.update(self.updater(self.cost_clusters, params_clusters)) 154 | # Compile network 155 | self.train_function = theano.function([self.l_in.input_var, self.target, self.samples, self.cluster_samples, self.exclude], self.cost, updates=updates, allow_input_downcast=True, name="Train_function", on_unused_input='ignore') 156 | print("Compilation done.") 157 | 158 | def _get_hard_clusters(self): 159 | if self.cluster_type == 'softmax': 160 | return lasagne.nonlinearities.softmax(100. * self.cluster_repartition) 161 | elif self.cluster_type == 'mix': 162 | # Clipping is used to avoid the sum of sigmoid and softmax to produce a cluster indicator of 2 163 | return (lasagne.nonlinearities.softmax(100. * self.cluster_repartition) + T.nnet.sigmoid(100. * self.cluster_repartition)).clip(0,1) 164 | else: 165 | return T.nnet.sigmoid(100. 
* self.cluster_repartition) 166 | 167 | def _compile_predict_function(self): 168 | ''' Compile self.predict, the deterministic rnn that output the prediction at the end of the sequence 169 | ''' 170 | print("Compiling predict...") 171 | if self.predict_with_clusters: 172 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer, deterministic=True)[0, :].argmax() 173 | user_representation = lasagne.layers.get_output(self.user_representation_layer, deterministic=True) 174 | theano_predict_function = theano.function([self.l_in.input_var], [user_representation, cluster_selection], allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') 175 | 176 | def cluster_predict_function(sequence, k, exclude): 177 | u, c = theano_predict_function(sequence) 178 | c = int(c) 179 | scores = u[0].dot(self.clusters_embeddings[c]) + self.clusters_bias[c] 180 | 181 | cluster_index_exclude = [] 182 | for i in exclude: 183 | if i in self.clusters_reverse_index[c]: 184 | cluster_index_exclude.append(self.clusters_reverse_index[c][i]) 185 | scores[cluster_index_exclude] = -np.inf 186 | 187 | # find top k according to output 188 | effective_k = min(k, len(self.clusters[c])) 189 | return list(self.clusters[c][np.argpartition(-scores, range(effective_k))[:effective_k]]), len(self.clusters[c]) 190 | 191 | self.predict_function = cluster_predict_function 192 | else: 193 | items_score = lasagne.nonlinearities.softmax(lasagne.layers.get_output(self.l_out, deterministic=True)) 194 | 195 | user_representation = lasagne.layers.get_output(self.user_representation_layer, deterministic=True) 196 | theano_predict_function = theano.function([self.l_in.input_var], user_representation, allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') 197 | 198 | def no_cluster_predict_function(sequence, k, exclude): 199 | u = theano_predict_function(sequence) 200 | scores = u[0].dot(self.l_out.W.get_value(borrow=True)) + self.l_out.b.get_value(borrow=True) 201 | 202 | scores[exclude] = -np.inf 203 | 204 | # find top k according to output 205 | return list(np.argpartition(-scores, range(k))[:k]), self.n_items 206 | 207 | # theano_predict_function = theano.function([self.l_in.input_var], items_score, allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') 208 | 209 | # def no_cluster_predict_function(sequence, k, exclude): 210 | # scores = theano_predict_function(sequence)[0] 211 | # scores[exclude] = -np.inf 212 | 213 | # # find top k according to output 214 | # return list(np.argpartition(-scores, range(k))[:k]), self.n_items 215 | 216 | self.predict_function = no_cluster_predict_function 217 | 218 | print("Compilation done.") 219 | 220 | def _compile_test_function(self): 221 | ''' Compile self.test_function, the deterministic rnn that output the precision@10 222 | ''' 223 | print("Compiling test...") 224 | 225 | items_score1 = lasagne.nonlinearities.softmax(lasagne.layers.get_output(self.l_out, deterministic=True)) 226 | 227 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer, deterministic=True)[0, :].argmax() 228 | items_clusters = self._get_hard_clusters() 229 | used_items = items_clusters[:,cluster_selection] 230 | items_score2 = items_score1 * used_items 231 | 232 | if self.interactions_are_unique: 233 | items_score1 *= (1 - self.exclude) 234 | items_score2 *= (1 - self.exclude) 235 | 236 | theano_test_function = theano.function([self.l_in.input_var, self.target, self.samples, self.cluster_samples, self.exclude], [items_score1, 
items_score2, cluster_selection, used_items.sum()], allow_input_downcast=True, name="Test_function", on_unused_input='ignore') 237 | 238 | def precision_test_function(theano_inputs): 239 | k = 10 240 | scores1, scores2, c_select, n_used_items = theano_test_function(*theano_inputs) 241 | ids1 = np.argpartition(-scores1, range(k), axis=-1)[0, :k] 242 | ids2 = np.argpartition(-scores2, range(k), axis=-1)[0, :k] 243 | 244 | return ids1, ids2, c_select, n_used_items 245 | 246 | self.test_function = precision_test_function 247 | 248 | print("Compilation done.") 249 | 250 | def _prepare_input(self, sequences): 251 | ''' Sequences is a list of [user_id, input_sequence, targets] 252 | ''' 253 | 254 | batch_size = len(sequences) 255 | 256 | # Shape return variables 257 | X = sp.lil_matrix((batch_size, self.n_items), dtype=theano.config.floatX) 258 | Y = np.zeros((batch_size,), dtype='int32') # output target 259 | exclude = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 260 | 261 | 262 | for i, sequence in enumerate(sequences): 263 | user_id, in_seq, target = sequence 264 | for j in in_seq: 265 | X[i, j[0]] = 1./len(in_seq)**self.alpha 266 | Y[i] = target[0][0] # id of the first and only target 267 | exclude[i, [j[0] for j in in_seq]] = 1 268 | 269 | if self.sampling_bias > 0.: 270 | samples = np.array([self._popularity_sample() for i in range(self.n_samples)], dtype='int32') 271 | if self.n_cluster_samples > 0: 272 | cluster_samples = np.array([self._popularity_sample() for i in range(self.n_cluster_samples)], dtype='int32') 273 | else: 274 | cluster_samples = samples 275 | else: 276 | samples = np.random.choice(self.n_items, self.n_samples).astype('int32') 277 | if self.n_cluster_samples > 0: 278 | cluster_samples = np.random.choice(self.n_items, self.n_cluster_samples).astype('int32') 279 | else: 280 | cluster_samples = samples 281 | 282 | # scale 283 | if not hasattr(self, '_last_epoch'): 284 | self._last_epoch = self.dataset.training_set.epochs 285 | else: 286 | if self.dataset.training_set.epochs > self._last_epoch+1 and self.scale_growing_rate != 1.: 287 | self.effective_scale *= self.scale_growing_rate ** int(self.dataset.training_set.epochs - self._last_epoch) 288 | self._last_epoch += int(self.dataset.training_set.epochs - self._last_epoch) 289 | print("New scale: ", self.effective_scale) 290 | self.T_scale.set_value(self.effective_scale) 291 | 292 | return (X.tocsr(), Y, samples, cluster_samples, exclude) 293 | 294 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 295 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 296 | ''' 297 | 298 | if exclude is None: 299 | exclude = [] 300 | 301 | # Compile network if needed 302 | if not hasattr(self, 'predict_function'): 303 | self._compile_predict_function() 304 | 305 | # Prepare RNN input 306 | max_length_seq = sequence[-min(self.max_length, len(sequence)):] 307 | X = sp.lil_matrix((1, self.n_items), dtype=theano.config.floatX) 308 | for j in sequence: 309 | X[0, j[0]] = 1./len(sequence)**self.alpha 310 | 311 | # Run RNN 312 | if self.interactions_are_unique: 313 | should_exclude = [i[0] for i in sequence] 314 | else: 315 | should_exclude = [] 316 | should_exclude.extend(exclude) 317 | return self.predict_function(X.tocsr(), k, should_exclude) 318 | 319 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![No Maintenance 
Intended](http://unmaintained.tech/badge.svg)](http://unmaintained.tech/)
2 | 
3 | # Collaborative filtering based on sequences
4 | This Python library includes multiple collaborative filtering algorithms that make use of the sequence of actions of the user: they not only use the fact that a user rated a certain item, but also the order in which the items were rated.
5 | Some standard algorithms that do not use sequence information are also present for easier comparison.
6 | 
7 | All these algorithms aim to solve the "item recommendation" or "top-N recommendation" problem, which means that they are not designed to predict rating values, but only to predict which items are of interest for a given user.
8 | 
9 | Our code was used to produce the experiments in "[Collaborative Filtering with Recurrent Neural Networks](https://arxiv.org/abs/1608.07400)" and "[Long and Short-Term Recommendations with Recurrent
10 | Neural Networks](http://iridia.ulb.ac.be/~rdevooght/papers/UMAP__Long_and_short_term_with_RNN.pdf)".
11 | If you use this code in your research, please cite us:
12 | ````
13 | @inproceedings{Rec_with_RNN,
14 | author = {Devooght, Robin and Bersini, Hugues},
15 | title = {Long and Short-Term Recommendations with Recurrent Neural Networks},
16 | booktitle = {Proceedings of the 25th Conference on User Modeling, Adaptation and Personalization},
17 | series = {UMAP '17},
18 | year = {2017},
19 | isbn = {978-1-4503-4635-1},
20 | location = {Bratislava, Slovakia},
21 | pages = {13--21},
22 | numpages = {9},
23 | url = {http://doi.acm.org/10.1145/3079628.3079670},
24 | doi = {10.1145/3079628.3079670},
25 | acmid = {3079670},
26 | publisher = {ACM},
27 | }
28 | ````
29 | 
30 | ## Installation
31 | The library has many dependencies: numpy/scipy, Theano and Lasagne for the neural networks, Gensim for word2vec and pandas for the data manipulation.
32 | 
33 | Numpy, scipy and Theano can sometimes be difficult to install, and we recommend looking at Theano's installation tutorial: http://deeplearning.net/software/theano/install.html
34 | Gensim and pandas are easily installed with pip. Lasagne is also installed with pip, but you have to specify the version >=0.2.dev1.
35 | 
36 | On Ubuntu, the following commands should install everything that you need:
37 | ````
38 | sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libopenblas-dev git
39 | sudo pip install Theano pandas gensim https://github.com/Lasagne/Lasagne/archive/master.zip
40 | ````
41 | 
42 | ## Usage
43 | The library is designed to be used from the command line through three scripts:
44 | * preprocess.py for the preparation of the dataset
45 | * train.py for training models
46 | * test.py for testing models
47 | 
48 | Calling these scripts with the `--help` option will display the available options (e.g. `python preprocess.py --help`).
49 | 
50 | ### preprocess.py
51 | 
52 | This script takes a file containing a dataset of user/item interactions, splits it into training/validation/test sets and saves them in the format used by train.py and test.py.
53 | The original dataset must be in a format where each line corresponds to a single user/item interaction.
54 | 
55 | The only required argument is `-f path/to/dataset`, which is used to specify the original dataset. The script will create subfolders named "data", "models" and "results" in the folder containing the original dataset.
"data" is used by preprocess.py to store all the files it produces, "models" is used by train.py to store the trained models and "results" is used by test.py to store the results of the tests. 56 | 57 | The optional arguments are the following: 58 | 59 | Option | Desciption 60 | ------ | ---------- 61 | `--columns` | Order of the columns in the file (eg: "uirt"), u for user, i for item, t for timestamp, r for rating. If r is not present a default rating of 1 is given to all interaction. If t is not present interactions are assumed to be in chronological order. Extra columns are ignored. Default: uit 62 | `--sep` | Separator between the column. If unspecified pandas will try to guess the separator 63 | `--min_user_activity` | Users with less interactions than this will be removed from the dataset. Default: 2 64 | `--min_item_pop` | Items with less interactions than this will be removed from the dataset. Default: 5 65 | `--val_size` | Number of users to put in the validation set. If in (0,1) it will be interpreted as the fraction of total number of users. Default: 0.1 66 | `--test_size` | Number of users to put in the test set. If in (0,1) it will be interpreted as the fraction of total number of users. Default: 0.1 67 | `--seed` | Seed for the random train/val/test split 68 | 69 | #### Example 1 70 | In the movielens 1M dataset each line has the following format: 71 | ```` 72 | UserID::MovieID::Rating::Timestamp 73 | ```` 74 | To process it you have to specify the order of the columns, in this case uirt (for user, item, rating, timestamp), and the separator ("::"). If you want to use a hundred users for the validation set and a hundred others for the test set, you'll have to use the following command: 75 | ```` 76 | python preprocess.py -f path/to/ratings.dat --columns uirt --sep :: --val_size 100 --test_size 100 77 | ```` 78 | #### Example 2 79 | Consider a dataset where each line has the following format: 80 | ```` 81 | timestamp, user_id, some_useless_data, item_id, more_useless_data 82 | ```` 83 | You can specify the order of columns with "tuxiy" where x and y are placeholder names for the columns that will be discarted by the script. Using "tuxi" will also work, as all the columns not mentioned are discarded. As no rating column is present, each interaction will recieve the rating "1". If you also want for example to remove users with less than 10 interactions, use the following command: 84 | ```` 85 | python preprocess.py -f path/to/file --columns tuxi --min_user_activity 10 86 | ```` 87 | 88 | ### train.py 89 | 90 | This script is used to train models and offers many options regarding when to save new models and when to stop training. 91 | The basic usage is the following: 92 | ```` 93 | python train.py -d path/to/dataset/ -m Method_name 94 | ```` 95 | 96 | The argument `-d` is used to specify the path to the folder that contains the "data", "models" and "results" subfolders created by preprocess.py. 97 | If you have multiple datasets with a partly common path (e.g. path/to/dataset1/, path/to/dataset2/, etc.) you can specify this common path in the variable DEFAULT_DIR of helpers/data_handling.py. For example, setting DEFAULT_DIR = "path/to/" and using the argument `-d dataset1` will look for the dataset in "path/to/dataset1/". 98 | 99 | The optional arguments are the following: 100 | 101 | Option | Desciption 102 | ------ | ---------- 103 | `--dir dirname/` | Name of the subfolder of "path/to/dataset/models/" in which to save the model. 
By default it will be saved directly in the models/ folder, but using subfolders can be useful when many models are tested.
104 | `--progress {int or float}` | Number of iterations (or seconds) between two evaluations of the model on the validation set. When the model is evaluated, progress is shown on the command line, and the model might be saved (depending on the `--save` option). A float value means that the evaluations happen at geometric intervals (rather than linear ones). Default: 2.0
105 | `--metrics value` | Metrics computed on the validation set, separated by commas. Available metrics are recall, sps, ndcg, item\_coverage, user\_coverage and blockbuster\_share. Default: sps.
106 | `--save [All, Best, None]` | Policy for saving models. If "None", no model is saved. If "All", the current model is saved each time the model is evaluated on the validation set, and no model is destroyed. If "Best", the current model is only saved if it improves over the previous best results on the validation set, and the previous best model is deleted. If "Best" and multiple metrics are used, all the Pareto-optimal models are saved.
107 | `--time_based_progress` | Base the interval between two evaluations on the number of elapsed seconds rather than on the number of iterations.
108 | `--mpi value` | Max number of iterations (or seconds) between two evaluations (useful when using geometric intervals). Default: inf.
109 | `--max_iter value` | Max number of iterations (default: inf).
110 | `--max_time value` | Max training time in seconds (default: inf).
111 | `--min_iter value` | Min number of iterations before making the first evaluation (default: 0).
112 | `--extended_set` | Use the extended training set (contains the first half of the validation and test sets). This is necessary for factorization-based methods such as BPRMF and FPMC because they need to build a model for every user.
113 | `--tshuffle` | Shuffle the order of sequences between epochs.
114 | `--load_last_model` | Load the last model before starting training (it will search for a model built with the same options and take the one with the largest number of epochs).
115 | `--es_m [WorstTimesX, StopAfterN, None]` | Early stopping method (by default none is used, and training continues until max_iter or max_time is reached). WorstTimesX will stop training if the time since the last best score on the validation set is longer than X times the longest interval between two consecutive best scores. StopAfterN will stop training if the model has not improved over the last N evaluations on the validation set.
116 | `--es_n N` | N parameter for StopAfterN (default: 5).
117 | `--es_x X` | X parameter for WorstTimesX (default: 2).
118 | `--es_min_wait num_epochs` | Minimum number of epochs before stopping (for WorstTimesX). Default: 1.
119 | `--es_LiB` | Lower is better for the validation score. By default a higher validation score is considered better, but if that is not the case you can use this option.
120 | 
121 | The options specific to each method are explained in the Methods section.
122 | 
123 | ### test.py
124 | 
125 | This script tests the models built with train.py on the test set.
126 | The basic usage is:
127 | ````
128 | python test.py -d path/to/dataset/ -m Method_name
129 | ````
130 | The argument `-d` works in the same way as with train.py, and the precise model to test is specified by the `--dir` option and the method-specific options.
131 | If multiple models fit the options (they are in the same subfolder and were trained with the same method and the same options), they are all evaluated one after the other, unless the argument `-i epoch_number` is also specified, which will select the model based on the number of epochs.
132 | 
133 | `--metrics` allows you to specify the list of metrics to compute, separated by commas. By default the metrics are: sps, recall, item\_coverage, user\_coverage, blockbuster_share.
134 | The "blockbuster share" is the percentage of correct recommendations among the 1% most popular items.
135 | The other available metrics are the sps, the ndcg and the assr (when clustering is used).
136 | 
137 | All the metrics are computed "@k", with k=10 by default. k can be changed using the `-k` option.
138 | 
139 | When the `--save` option is used, the results are saved in a file in "path/to/dataset/results/".
140 | The results of each model form a line of the file, and each line contains the number of epochs followed by the metrics specified by `--metrics`.
141 | 
142 | When testing a method based on clustering, the option `--ignore_clusters` can be used to test how the method performs without clusters.
143 | 
144 | ## Methods
145 | 
146 | The available methods are:
147 | * [Recurrent Neural Networks](#recurrent-neural-networks)
148 | * [Stacked Denoising Autoencoder](#stacked-denoising-autoencoders)
149 | * [Latent Trajectory Modeling/word2vec](#latent-trajectory-modeling)
150 | * [BPR-MF](#bpr-mf)
151 | * [FPMC](#fpmc)
152 | * [FISM](#fism)
153 | * [Fossil](#fossil)
154 | * [Markov Chains](#markov-chain)
155 | * [User KNN](#user-knn)
156 | * [Popularity baseline](#pop)
157 | 
158 | ### Neural Networks
159 | #### Recurrent Neural Networks
160 | 
161 | Use it with `-m RNN`.
162 | The RNN has many options for changing the type/size/number of layers, the training procedure and the objective function, and some options are specific to a particular objective function.
163 | 
164 | ##### Layers
165 | 
166 | Option | Description
167 | ------ | ----------
168 | `--r_t [LSTM, GRU, Vanilla]` | Type of recurrent layer (default is GRU)
169 | `--r_l size_of_layer1-size_of_layer2-etc.` | Size and number of layers. For example, `--r_l 100-50-50` creates a layer with 50 hidden neurons on top of another layer with 50 hidden neurons on top of a layer with 100 hidden neurons. Default: 32.
170 | `--r_bi` | Use bidirectional layers.
171 | `--r_emb size` | Adds an embedding layer before the recurrent layer. By default no embedding layer is used, but it is advised to use one (e.g. `--r_emb 100`).
172 | 
173 | ##### Update mechanism
174 | 
175 | Option | Description
176 | ------ | ----------
177 | `--u_m [adagrad, adadelta, rmsprop, nesterov, adam]` | Update mechanism (see [Lasagne doc](http://lasagne.readthedocs.io/en/latest/modules/updates.html)). Default is adam.
178 | `--u_l float` | Learning rate (default: 0.001). The default learning rate works well with adam. For adagrad `--u_l 0.1` is advised.
179 | `--u_rho float` | rho parameter for Adadelta and RMSProp, or momentum for Nesterov momentum (default: 0.9).
180 | `--u_b1 float` | Beta 1 parameter for Adam (default: 0.9).
181 | `--u_b2 float` | Beta 2 parameter for Adam (default: 0.999).
182 | 
183 | ##### Noise
184 | 
185 | Option | Description
186 | ------ | ----------
187 | `--n_dropout P` | Dropout probability (default: 0.)
188 | `--n_shuf P` | Probability that an item is swapped with another one (default: 0.).
189 | `--n_shuf_std STD` | If an item is swapped, the position of the other item is drawn from a normal distribution whose std is defined by this parameter (default: 5.).
190 | 
191 | ##### Other options
192 | 
193 | Option | Description
194 | ------ | ----------
195 | `-b int` | Size of the mini-batches (default: 16)
196 | `--max_length int` | Maximum length of sequences (default: 200)
197 | `-g val` | Gradient clipping (default: 100)
198 | `--repeated_interactions` | Use when a user can interact multiple times with the same item. If not set, the items that the user already saw are never recommended.
199 | 
200 | ##### Objective functions
201 | 
202 | Option | Description
203 | ------ | ----------
204 | `--loss [CCE, Blackout, TOP1, BPR, hinge, logit, logsig]` | Objective function. CCE is the categorical cross-entropy; BPR, TOP1 and Blackout are based on sampling; and hinge, logit and logsig allow multiple targets. Default is CCE.
205 | `-r float` | *Only for CCE*. Add a regularization term. A positive value will use L2 regularization and a negative value will use L1. Default: 0.
206 | `--db float` | *Only for CCE, Blackout, BPR and TOP1*. Increase the diversity bias to put more pressure on learning correct recommendations for infrequent items (default: 0.).
207 | `--sampling float or int` | *Only for Blackout, BPR and TOP1*. Number of items to sample in the error computation. Use a float in [0,1] to express it as a fraction of the number of items in the catalog, or an int > 0 to specify the number of samples directly. Default: 32.
208 | `--n_targets N` | *Only for hinge, logit and logsig*. Number of items in the sequence that are used as targets. Default: 1.
209 | 
210 | ##### Clustering
211 | 
212 | It is possible to combine RNNs with an item-clustering method. This leads to faster predictions on large datasets and creates meaningful item clusters.
213 | In order to use it, add the option `--clusters nb_of_clusters`.
214 | For example, `python train.py -d path/to/dataset/ -m RNN --loss BPR --clusters 10` will train an RNN with the BPR loss and 10 clusters of items.
215 | Note that the clustering is only compatible with sampling-based losses (BPR, Blackout and TOP1).
216 | It also works with `--loss CCE`, but a sampling version of CCE is then used instead of the normal categorical cross-entropy.
217 | 
218 | 
219 | #### Stacked Denoising Autoencoders
220 | 
221 | Use it with `-m SDAE`.
222 | SDAE shares the RNN options described in "[Update mechanism](#update-mechanism)" and "[Other options](#other-options)".
223 | 
224 | Option | Description
225 | ------ | ----------
226 | `--L size_of_layer1-size_of_layer2-etc.` | Size and number of layers. For example, `--L 50-32-50` creates a layer with 50 hidden neurons on top of another layer with 32 hidden neurons on top of a layer with 50 hidden neurons. Default: 20.
227 | `--in_do float` | Dropout rate applied to the input layer of the SDAE (default: 0.2).
228 | `--do float` | Dropout rate applied to the hidden layers of the SDAE (default: 0.5).
229 | 
230 | #### Latent Trajectory Modeling
231 | 
232 | Use it with `-m LTM`.
233 | LTM is a method based on word2vec, described in "[Latent Trajectory Modeling: A Light and Efficient Way to Introduce Time in Recommender Systems](http://dl.acm.org/citation.cfm?id=2799676)".
234 | LTM works in two steps: it first produces an embedding of the items with the word2vec algorithm using the sequences of items in the training set, then it estimates for each user a translation vector that would best explain the trajectory of that user in the embedded space.
235 | Predictions are made by finding the closest items to the user's last item, translated by the user's translation vector.
236 | Our implementation is mainly a wrapper around [Gensim's word2vec implementation](https://radimrehurek.com/gensim/models/word2vec.html).
237 | 
238 | Option | Description
239 | ------ | ----------
240 | `-H int` | Number of neurons (default: 20).
241 | `--ltm_window int` | Size of word2vec's window (default: 5).
242 | `--ltm_damping float` | Temporal damping (default: 0.8).
243 | `--ltm_no_trajectory` | Use this option to make predictions directly with word2vec, without the trajectory estimation proposed in the LTM paper.
244 | 
245 | ### Factorization-based
246 | #### FPMC
247 | 
248 | FPMC is a method combining factorized Markov chains with the factorization of the user-item matrix (see "Factorizing personalized Markov chains for next-basket recommendation" by Rendle et al. in *Proceedings of WWW'10*).
249 | Use it with `-m FPMC`.
250 | 
251 | Option | Description
252 | ------ | ----------
253 | `--k_cf int` | Rank of the user-item matrix factorization (default: 32).
254 | `--k_mc int` | Rank of the factorized Markov chain (default: 32).
255 | `-l val` | Learning rate (default: 0.01).
256 | `--cooling val` | Multiplicative factor applied to the learning rate after each epoch (default: 1).
257 | `--init_sigma val` | Standard deviation of the Gaussian initialization (default: 1).
258 | `--fpmc_bias val` | Sampling bias (default: 100). By default the SGD process uses adaptive sampling to speed up learning. This parameter controls how much the sampling is biased towards high-error items.
259 | `--no_adaptive_sampling` | No adaptive sampling.
260 | `-r float` | Add a regularization term. A positive value will use L2 regularization and a negative value will use L1. Default: 0.
261 | 
262 | #### BPR-MF
263 | 
264 | BPR-MF is a matrix factorization method based on the BPR loss (see "BPR: Bayesian personalized ranking from implicit feedback" by Rendle et al. in *Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence*).
265 | Use it with `-m BPRMF`.
266 | 
267 | Option | Description
268 | ------ | ----------
269 | `-H int` | Rank of the user-item matrix factorization (default: 20).
270 | `-l val` | Learning rate (default: 0.01).
271 | `--cooling val` | Multiplicative factor applied to the learning rate after each epoch (default: 1).
272 | `--init_sigma val` | Standard deviation of the Gaussian initialization (default: 1).
273 | `--fpmc_bias val` | Sampling bias (default: 100). By default the SGD process uses adaptive sampling to speed up learning. This parameter controls how much the sampling is biased towards high-error items.
274 | `--no_adaptive_sampling` | No adaptive sampling.
275 | `-r float` | Add a regularization term. A positive value will use L2 regularization and a negative value will use L1. Default: 0.
276 | 
277 | #### FISM
278 | 
279 | FISM is a method based on item-item factorization (see "FISM: factored item similarity models for top-n recommender systems" by Kabbur et al. in *Proceedings of SIGKDD'13*).
280 | It has the advantage over BPR-MF that it does not build a representation for each user.
This leads to smaller models, and the ability to make recommendations to new users.
281 | Use it with `-m FISM --loss [BPR, RMSE]`.
282 | 
283 | Option | Description
284 | ------ | ----------
285 | `--loss [BPR, RMSE]` | Loss function. "BPR" is the same loss as for BPR-MF, "RMSE" optimizes the squared error. This cannot be left at its default value because the default loss is CCE, which is not compatible with FISM.
286 | `-H int` | Rank of the matrix factorization (default: 20).
287 | `--fism_alpha float` | Alpha parameter in FISM (default: 0.2).
288 | `-l val` | Learning rate (default: 0.01).
289 | `--cooling val` | Multiplicative factor applied to the learning rate after each epoch (default: 1).
290 | `--init_sigma val` | Standard deviation of the Gaussian initialization (default: 1).
291 | `-r float` | Add a regularization term. A positive value will use L2 regularization and a negative value will use L1. Default: 0.
292 | 
293 | FISM can be combined with item clustering in the same way as the RNN.
294 | To do so, add the option `--clusters nb_of_clusters`.
295 | When using clustering, a completely different implementation is used, which is based on Theano instead of Numpy.
296 | This has some implications on the available options:
297 | * The loss must be chosen among CCE, BPR, Blackout and TOP1 instead of BPR and RMSE.
298 | * The number of samples for each training step can be specified using `--sampling nb_of_samples`.
299 | * The update mechanism is controlled by the options defined in [Update mechanism](#update-mechanism) instead of `-l` and `--cooling`.
300 | 
301 | #### Fossil
302 | 
303 | Fossil combines FISM with factorized Markov chains (see "Fusing Similarity Models with Markov Chains for Sparse Sequential Recommendation" by He and McAuley in *Proceedings of ICDM'16*).
304 | Unlike FPMC, Fossil can use higher-order Markov chains.
305 | Use it with `-m Fossil`.
306 | 
307 | Option | Description
308 | ------ | ----------
309 | `-H int` | Rank of the matrix factorization (default: 20).
310 | `--fism_alpha float` | Alpha parameter in FISM (default: 0.2).
311 | `--fossil_order int` | Order of the Markov chains in Fossil (default: 1).
312 | `-l val` | Learning rate (default: 0.01).
313 | `--cooling val` | Multiplicative factor applied to the learning rate after each epoch (default: 1).
314 | `--init_sigma val` | Standard deviation of the Gaussian initialization (default: 1).
315 | `-r float` | Add a regularization term. A positive value will use L2 regularization and a negative value will use L1. Default: 0.
316 | 
317 | ### Lazy
318 | 
319 | Lazy methods do not build models; they make recommendations directly from the dataset.
320 | They should therefore not be used with `train.py`, but only with `test.py`.
321 | 
322 | #### POP
323 | 
324 | Use it with `-m POP`.
325 | Always predicts the most popular items.
326 | 
327 | #### Markov Chain
328 | 
329 | Use it with `-m MM`.
330 | Recommends the items that most often follow the last item in the user's sequence.
331 | 
332 | #### User KNN
333 | 
334 | Use it with `-m UKNN`.
335 | User-based nearest neighbors approach.
336 | The similarity measure between users is the cosine similarity: #number-of-common-items / sqrt(#number-of-items-of-user-a * #number-of-items-of-user-b). A short illustration in plain Python is given after the options table below.
337 | 
338 | Option | Description
339 | ------ | ----------
340 | `--ns int` | Neighborhood size (default: 80).
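
The sketch below simply restates that similarity in Python for clarity; it is an illustration, not the code used by `lazy/user_knn.py`, and the function name and example values are made up:

````
import numpy as np

def user_cosine_similarity(items_a, items_b):
    # Users are represented by the sets of items they interacted with:
    # number of common items, normalized by the geometric mean of the set sizes.
    a, b = set(items_a), set(items_b)
    return len(a & b) / np.sqrt(len(a) * len(b))

print(user_cosine_similarity([1, 2, 3, 4], [2, 3, 5]))  # 2 / sqrt(4 * 3) ~= 0.577
````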
341 | 342 | -------------------------------------------------------------------------------- /neural_networks/rnn_cluster.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import cPickle 8 | import os 9 | import sys 10 | import random 11 | from bisect import bisect 12 | from time import time 13 | import rnn_base as rnn 14 | from sparse_lstm import * 15 | from helpers import evaluation 16 | from theano.sandbox.rng_mrg import MRG_RandomStreams 17 | 18 | 19 | class RNNCluster(rnn.RNNBase): 20 | """RNNCluster combines sampling-based RNN with item clustering. 21 | 22 | Parameters 23 | ---------- 24 | n_clusters: int 25 | Number of clusters 26 | 27 | loss: "Blackout", "CCE", "BPR" or "BPRelu" 28 | Determines the loss function, among: 29 | - BPR, as used in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016 30 | - TOP1, defined in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016 31 | - Blackout, discriminative loss function defined in "BlackOut: Speeding up Recurrent Neural Network Language Models With Very Large Vocabularies", Ji, S. et al., 2015 (equation 6) 32 | - BPRelu, approximation of BPR based on relu/hinge non-linearities 33 | - CCE, categorical cross-entropy computed on the set of samples 34 | 35 | cluster_type: "mix", "softmax" or "sigmoid" 36 | Determines whether items can belong to multiple clusters. 37 | - mix, items belong to at least one cluster, possibly many. 38 | - softmax, items belong to one and only one cluster. 39 | - sigmoid, items belong to zero, one or multiple clusters. 40 | 41 | sampling: int 42 | Number of samples. 43 | 44 | cluster_sampling: int 45 | If cluster_sampling > 0, the recommendation loss and the clustering loss use different samples. 46 | In that case, cluster_sampling is the number of samples used by the clustering loss. 47 | 48 | sampling_bias: float 49 | Items are sampled with a probability proportional to their frequency to the power of the sampling_bias. 50 | 51 | predict_with_clusters: bool 52 | Set to false during testing if you want to ignore the clustering. 53 | 54 | cluster_selection_noise: float 55 | If cluster_selection_noise > 0, a random gaussian noise (whose std is cluster_selection_noise) is added to the cluster selection output during training. 56 | Can help to explore a large number of clusters. 57 | 58 | init_scale: float 59 | Initial scale of the softmax and sigmoid functions used in the cluster selection process. 60 | 61 | scale_growing_rate: float 62 | After each training epoch, the scale of the softmax and sigmoid functions is multiplied by the scale_growing_rate. 63 | 64 | max_scale: float 65 | Maximum allowed scale. 66 | 67 | See classes SequenceNoise, RecurrentLayers, SelectTargets and update manager for options common to the other RNN methods. 
68 | """ 69 | 70 | def __init__(self, n_clusters=10, loss="Blackout", cluster_type='mix', sampling=100, cluster_sampling=-1, sampling_bias=0., predict_with_clusters=True, cluster_selection_noise=0., init_scale=1., scale_growing_rate=1., max_scale=50, **kwargs): 71 | super(RNNCluster, self).__init__(**kwargs) 72 | 73 | self.n_clusters = n_clusters 74 | self.init_scale = np.cast[theano.config.floatX](init_scale) 75 | self.effective_scale = np.cast[theano.config.floatX](init_scale) 76 | self.scale_growing_rate = np.cast[theano.config.floatX](scale_growing_rate) 77 | self.max_scale = np.cast[theano.config.floatX](max_scale) 78 | self.cluster_type = cluster_type 79 | self.sampling_bias = sampling_bias 80 | self.loss = loss 81 | self.cluster_selection_noise = cluster_selection_noise 82 | 83 | self.predict_with_clusters = predict_with_clusters 84 | 85 | if self.loss == "Blackout": 86 | self._loss = self._blackout_loss 87 | elif self.loss == 'lin': 88 | self._loss = self._lin_loss 89 | elif self.loss == 'BPRelu': 90 | self._loss = self._BPRelu_loss 91 | elif self.loss == 'BPR': 92 | self._loss = self._BPR_loss 93 | elif self.loss == 'TOP1': 94 | self._loss = self._TOP1_loss 95 | elif self.loss == 'CCE': 96 | self._loss = self._cce_loss 97 | else: 98 | raise ValueError('Unknown cluster loss') 99 | 100 | 101 | self.n_samples = int(sampling) 102 | self.n_cluster_samples = int(cluster_sampling) 103 | 104 | self._srng = MRG_RandomStreams(lasagne.random.get_rng().randint(1, 2147462579)) 105 | 106 | 107 | self.name = "RNN Cluster with categorical cross entropy" 108 | 109 | self.metrics = {'recall': {'direction': 1}, 110 | 'cluster_recall': {'direction': 1}, 111 | 'sps': {'direction': 1}, 112 | 'cluster_sps': {'direction': 1}, 113 | 'ignored_items': {'direction': -1}, 114 | 'assr': {'direction': 1}, 115 | 'cluster_use': {'direction': 1}, 116 | 'cluster_use_std': {'direction': -1}, 117 | 'cluster_size': {'direction': 1} 118 | } 119 | 120 | def _get_model_filename(self, epochs): 121 | '''Return the name of the file to save the current model 122 | ''' 123 | filename = "rnn_clusters"+str(self.n_clusters)+"_sc"+str(self.init_scale) 124 | 125 | if self.scale_growing_rate != 1.: 126 | filename += "-"+str(self.scale_growing_rate)+"-"+str(self.max_scale) 127 | 128 | filename+="_" 129 | if self.sampling_bias > 0.: 130 | filename += "p" + str(self.sampling_bias) 131 | filename += "s"+str(self.n_samples) 132 | 133 | if self.n_cluster_samples > 0: 134 | filename += "_" 135 | if self.sampling_bias > 0.: 136 | filename += "p" + str(self.sampling_bias) 137 | filename += "cs"+str(self.n_cluster_samples) 138 | 139 | if self.cluster_type == 'softmax': 140 | filename += "_softmax" 141 | elif self.cluster_type == 'mix': 142 | filename += "_mix" 143 | 144 | if self.cluster_selection_noise > 0.: 145 | filename += '_n' + str(self.cluster_selection_noise) 146 | 147 | filename += "_c" + self.loss 148 | 149 | return filename+"_"+self._common_filename(epochs) 150 | 151 | def _blackout_loss(self, predictions, n_targets): 152 | targets = np.arange(n_targets) 153 | predictions = T.nnet.softmax(predictions) 154 | pos = T.nnet.categorical_crossentropy(predictions, targets) 155 | neg = T.log(1 - predictions) 156 | return pos - neg[:, targets.shape[0]:].sum(axis=-1) 157 | 158 | def _cce_loss(self, predictions, n_targets): 159 | targets = np.arange(n_targets) 160 | predictions = T.nnet.softmax(predictions) 161 | pos = T.nnet.categorical_crossentropy(predictions, targets) 162 | return pos 163 | 164 | def _lin_loss(self, predictions, 
n_targets): 165 | neg = predictions[:, n_targets:].sum(axis=-1) 166 | pos = T.diag(predictions) 167 | return neg - pos 168 | 169 | def _BPR_loss(self, predictions, n_targets): 170 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, n_targets:] 171 | return -(T.log(T.nnet.sigmoid(-diff))).mean(axis=-1) 172 | 173 | def _BPRelu_loss(self, predictions, n_targets): 174 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, n_targets:] 175 | return lasagne.nonlinearities.leaky_rectify(diff+0.5).mean(axis=-1) 176 | 177 | def _TOP1_loss(self, predictions, n_targets): 178 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, n_targets:] 179 | reg = T.sqr(predictions[:, n_targets:]) 180 | return (T.nnet.sigmoid(diff) + T.nnet.sigmoid(reg)).mean(axis=-1) 181 | 182 | def _create_ini_clusters(self): 183 | c = 0.1 * np.random.randn(self.n_items, self.n_clusters) 184 | # c = -2 * np.random.random((self.n_items, self.n_clusters)) - 1 185 | # for i, j in enumerate(np.random.choice(self.n_clusters, self.n_items)): 186 | # c[i,j] *= -1 187 | 188 | # print(np.round(c[:5, :], 2)) 189 | return c.astype(theano.config.floatX) 190 | 191 | def _prepare_networks(self, n_items): 192 | ''' Prepares the building blocks of the RNN, but does not compile them: 193 | self.l_in : input layer 194 | self.l_mask : mask of the input layer 195 | self.target : target of the network 196 | self.l_out : output of the network 197 | self.cost : cost function 198 | ''' 199 | 200 | self.n_items = n_items 201 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 202 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length, self._input_size())) 203 | # The input is completed by a mask to inform the LSTM of the length of the sequence 204 | self.l_mask = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length)) 205 | 206 | # recurrent layer 207 | if not self.use_movies_features: 208 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=self.n_items + self._n_optional_features(), only_return_final=True) 209 | else: 210 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=None, only_return_final=True) 211 | 212 | 213 | # Theano tensor for the targets 214 | self.target = T.ivector('target_output') 215 | self.exclude = T.fmatrix('excluded_items') 216 | self.samples = T.ivector('samples') 217 | self.cluster_samples = T.ivector('cluster_samples') 218 | 219 | self.user_representation_layer = l_recurrent 220 | 221 | # The sliced output is then passed through linear layer to obtain the right output size 222 | self.l_out = BlackoutLayer(l_recurrent, num_units=self.n_items, num_outputs=self.n_samples, nonlinearity=None, W=lasagne.init.GlorotUniform()) 223 | 224 | # lasagne.layers.get_output produces a variable for the output of the net 225 | network_output = lasagne.layers.get_output(self.l_out, targets = self.target, samples=self.samples) 226 | 227 | # loss function 228 | self.cost = self._loss(network_output,self.batch_size).mean() 229 | 230 | 231 | # Cluster learning 232 | self.T_scale = theano.shared(self.effective_scale) 233 | scaled_softmax = lambda x: lasagne.nonlinearities.softmax(x*self.T_scale) 234 | 235 | self.cluster_selection_layer = lasagne.layers.DenseLayer(l_recurrent, b=None, num_units=self.n_clusters, nonlinearity=None) 236 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer) 237 | if self.cluster_selection_noise > 0.: 238 | 
cluster_selection = cluster_selection + self._srng.normal(cluster_selection.shape, avg=0.0, std=self.cluster_selection_noise) 239 | cluster_selection = scaled_softmax(cluster_selection) 240 | 241 | self.cluster_repartition = theano.shared(self._create_ini_clusters()) 242 | if self.cluster_type == 'softmax': 243 | target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 244 | elif self.cluster_type == 'mix': 245 | target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) + \ 246 | T.nnet.sigmoid(self.T_scale*self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 247 | else: 248 | target_and_samples_clusters = T.nnet.sigmoid(self.T_scale*self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 249 | cluster_score = cluster_selection.dot(target_and_samples_clusters.T) 250 | 251 | self.cost_clusters = self._loss(cluster_score, self.batch_size).mean() 252 | 253 | 254 | 255 | 256 | 257 | 258 | def _compile_train_function(self): 259 | ''' Compile self.train. 260 | self.train recieves a sequence and a target for every steps of the sequence, 261 | compute error on every steps, update parameter and return global cost (i.e. the error). 262 | ''' 263 | print("Compiling train...") 264 | # Compute AdaGrad updates for training 265 | all_params = lasagne.layers.get_all_params(self.l_out, trainable=True) 266 | updates = self.updater(self.cost, all_params) 267 | 268 | params_clusters = self.cluster_selection_layer.get_params(trainable=True) 269 | params_clusters.append(self.cluster_repartition) 270 | updates.update(self.updater(self.cost_clusters, params_clusters)) 271 | # Compile network 272 | self.train_function = theano.function([self.l_in.input_var, self.l_mask.input_var, self.target, self.samples, self.cluster_samples, self.exclude], self.cost, updates=updates, allow_input_downcast=True, name="Train_function", on_unused_input='ignore') 273 | print("Compilation done.") 274 | 275 | def _get_hard_clusters(self): 276 | if self.cluster_type == 'softmax': 277 | return lasagne.nonlinearities.softmax(100. * self.cluster_repartition) 278 | elif self.cluster_type == 'mix': 279 | # Clipping is used to avoid the sum of sigmoid and softmax to produce a cluster indicator of 2 280 | return (lasagne.nonlinearities.softmax(100. * self.cluster_repartition) + T.nnet.sigmoid(100. * self.cluster_repartition)).clip(0,1) 281 | else: 282 | return T.nnet.sigmoid(100. 
* self.cluster_repartition) 283 | 284 | def _compile_predict_function(self): 285 | ''' Compile self.predict, the deterministic rnn that output the prediction at the end of the sequence 286 | ''' 287 | print("Compiling predict...") 288 | if self.predict_with_clusters: 289 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer, deterministic=True)[0, :].argmax() 290 | user_representation = lasagne.layers.get_output(self.user_representation_layer, deterministic=True) 291 | theano_predict_function = theano.function([self.l_in.input_var, self.l_mask.input_var], [user_representation, cluster_selection], allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') 292 | 293 | def cluster_predict_function(sequence, mask, k, exclude): 294 | u, c = theano_predict_function(sequence, mask) 295 | scores = u[0].dot(self.clusters_embeddings[c]) + self.clusters_bias[c] 296 | 297 | cluster_index_exclude = [] 298 | for i in exclude: 299 | if i in self.clusters_reverse_index[c]: 300 | cluster_index_exclude.append(self.clusters_reverse_index[c][i]) 301 | scores[cluster_index_exclude] = -np.inf 302 | 303 | # find top k according to output 304 | effective_k = min(k, len(self.clusters[c])) 305 | return list(self.clusters[c][np.argpartition(-scores, range(effective_k))[:effective_k]]), len(self.clusters[c]) 306 | 307 | self.predict_function = cluster_predict_function 308 | else: 309 | items_score = lasagne.nonlinearities.softmax(lasagne.layers.get_output(self.l_out, deterministic=True)) 310 | 311 | user_representation = lasagne.layers.get_output(self.user_representation_layer, deterministic=True) 312 | theano_predict_function = theano.function([self.l_in.input_var, self.l_mask.input_var], user_representation, allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') 313 | 314 | def no_cluster_predict_function(sequence, mask, k, exclude): 315 | u = theano_predict_function(sequence, mask) 316 | scores = u[0].dot(self.l_out.W.get_value(borrow=True)) + self.l_out.b.get_value(borrow=True) 317 | 318 | scores[exclude] = -np.inf 319 | 320 | # find top k according to output 321 | return list(np.argpartition(-scores, range(k))[:k]), self.n_items 322 | 323 | self.predict_function = no_cluster_predict_function 324 | 325 | print("Compilation done.") 326 | 327 | def _compile_test_function(self): 328 | ''' Compile self.test_function, the deterministic rnn that output the precision@10 329 | ''' 330 | print("Compiling test...") 331 | 332 | items_score1 = lasagne.nonlinearities.softmax(lasagne.layers.get_output(self.l_out, deterministic=True)) 333 | 334 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer, deterministic=True)[0, :].argmax() 335 | items_clusters = self._get_hard_clusters() 336 | used_items = items_clusters[:,cluster_selection] 337 | items_score2 = items_score1 * used_items 338 | 339 | if self.interactions_are_unique: 340 | items_score1 *= (1 - self.exclude) 341 | items_score2 *= (1 - self.exclude) 342 | 343 | theano_test_function = theano.function([self.l_in.input_var, self.l_mask.input_var, self.target, self.samples, self.cluster_samples, self.exclude], [items_score1, items_score2, cluster_selection, used_items.sum()], allow_input_downcast=True, name="Test_function", on_unused_input='ignore') 344 | 345 | def precision_test_function(theano_inputs): 346 | k = 10 347 | scores1, scores2, c_select, n_used_items = theano_test_function(*theano_inputs) 348 | ids1 = np.argpartition(-scores1, range(k), axis=-1)[0, :k] 349 | ids2 = 
np.argpartition(-scores2, range(k), axis=-1)[0, :k] 350 | 351 | return ids1, ids2, c_select, n_used_items 352 | 353 | self.test_function = precision_test_function 354 | 355 | print("Compilation done.") 356 | 357 | def _popularity_sample(self): 358 | if not hasattr(self, '_cumsum'): 359 | self._cumsum = np.cumsum(np.power(self.dataset.item_popularity, self.sampling_bias)) 360 | 361 | return bisect(self._cumsum, random.uniform(0, self._cumsum[-1])) 362 | 363 | def _prepare_input(self, sequences): 364 | ''' Sequences is a list of [user_id, input_sequence, targets] 365 | ''' 366 | 367 | batch_size = len(sequences) 368 | 369 | # Shape return variables 370 | X = np.zeros((batch_size, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN 371 | mask = np.zeros((batch_size, self.max_length)) # mask of the input (to deal with sequences of different length) 372 | Y = np.zeros((batch_size,), dtype='int32') # output target 373 | exclude = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 374 | 375 | 376 | for i, sequence in enumerate(sequences): 377 | user_id, in_seq, target = sequence 378 | seq_features = np.array(map(lambda x: self._get_features(x, user_id), in_seq)) 379 | X[i, :len(in_seq), :] = seq_features # Copy sequences into X 380 | mask[i, :len(in_seq)] = 1 381 | Y[i] = target[0][0] # id of the first and only target 382 | exclude[i, [j[0] for j in in_seq]] = 1 383 | 384 | if self.sampling_bias > 0.: 385 | samples = np.array([self._popularity_sample() for i in range(self.n_samples)], dtype='int32') 386 | if self.n_cluster_samples > 0: 387 | cluster_samples = np.array([self._popularity_sample() for i in range(self.n_cluster_samples)], dtype='int32') 388 | else: 389 | cluster_samples = samples 390 | else: 391 | samples = np.random.choice(self.n_items, self.n_samples).astype('int32') 392 | if self.n_cluster_samples > 0: 393 | cluster_samples = np.random.choice(self.n_items, self.n_cluster_samples).astype('int32') 394 | else: 395 | cluster_samples = samples 396 | 397 | # scale 398 | if not hasattr(self, '_last_epoch'): 399 | self._last_epoch = self.dataset.training_set.epochs 400 | else: 401 | if self.dataset.training_set.epochs > self._last_epoch+1 and self.scale_growing_rate != 1.: 402 | self.effective_scale *= self.scale_growing_rate ** int(self.dataset.training_set.epochs - self._last_epoch) 403 | self._last_epoch += int(self.dataset.training_set.epochs - self._last_epoch) 404 | print("New scale: ", self.effective_scale) 405 | self.T_scale.set_value(self.effective_scale) 406 | 407 | return (X, mask.astype(theano.config.floatX), Y, samples, cluster_samples, exclude) 408 | 409 | def _compute_validation_metrics(self, metrics): 410 | clusters = np.zeros(self.n_clusters, dtype="int") 411 | used_items = [] 412 | ev = evaluation.Evaluator(self.dataset, k=10) 413 | ev_clusters = evaluation.Evaluator(self.dataset, k=10) 414 | for batch, goal in self._gen_mini_batch(self.dataset.validation_set(epochs=1), test=True): 415 | pred1, pred2, cl, i = self.test_function(batch) 416 | ev.add_instance(goal, pred1) 417 | ev_clusters.add_instance(goal, pred2) 418 | clusters[cl] += 1 419 | used_items.append(i) 420 | 421 | if self.cluster_type == 'softmax': 422 | ignored_items = 0 423 | cluster_size = np.histogram(self.cluster_repartition.get_value(borrow=True).argmax(axis=1), bins=range(self.n_clusters+1))[0].tolist() 424 | elif self.cluster_type == 'mix': 425 | ignored_items = 0 426 | sig_clusters = self.cluster_repartition.get_value(borrow=True) > 0. 
427 | softmax_clusters = self.cluster_repartition.get_value(borrow=True).argmax(axis=1) 428 | for i in range(self.n_items): 429 | sig_clusters[i, softmax_clusters[i]] = True 430 | cluster_size = sig_clusters.sum(axis=0) 431 | else: 432 | ignored_items = (self.cluster_repartition.get_value(borrow=True).max(axis=1) < 0.).sum() 433 | cluster_size = (self.cluster_repartition.get_value(borrow=True) > 0.).sum(axis=0) 434 | 435 | metrics['recall'].append(ev.average_recall()) 436 | metrics['cluster_recall'].append(ev_clusters.average_recall()) 437 | metrics['sps'].append(ev.sps()) 438 | metrics['cluster_sps'].append(ev_clusters.sps()) 439 | metrics['assr'].append(self.n_items / np.mean(used_items)) 440 | metrics['ignored_items'].append(ignored_items) 441 | metrics['cluster_use'].append(clusters) 442 | metrics['cluster_use_std'].append(np.std(clusters)) 443 | metrics['cluster_size'].append(cluster_size) 444 | 445 | return metrics 446 | 447 | def _print_progress(self, iterations, epochs, start_time, train_costs, metrics, validation_metrics): 448 | '''Print learning progress in terminal 449 | ''' 450 | print(self.name, iterations, "batchs, ", epochs, " epochs in", time() - start_time, "s") 451 | print("Last train cost : ", train_costs[-1]) 452 | for m in self.metrics.keys(): 453 | print(m, ': ', metrics[m][-1]) 454 | if m in validation_metrics: 455 | print('Best ', m, ': ', max(np.array(metrics[m])*self.metrics[m]['direction'])*self.metrics[m]['direction']) 456 | print('-----------------') 457 | 458 | # Print on stderr for easier recording of progress 459 | print(iterations, epochs, time() - start_time, train_costs[-1], metrics['sps'][-1], metrics['cluster_sps'][-1], metrics['recall'][-1], metrics['cluster_recall'][-1], metrics['assr'][-1], metrics['ignored_items'][-1], metrics['cluster_use_std'][-1], file=sys.stderr) 460 | 461 | def prepare_tests(self): 462 | '''Take the soft clustering and make actual clusters. 
463 | ''' 464 | cluster_membership = self.cluster_repartition.get_value(borrow=True) 465 | item_embeddings = self.l_out.W.get_value(borrow=True) 466 | item_bias = self.l_out.b.get_value(borrow=True) 467 | self.clusters = [[] for i in range(self.n_clusters)] 468 | for i in range(cluster_membership.shape[0]): 469 | no_cluster = True 470 | best_cluster = 0 471 | best_val = cluster_membership[i, 0] 472 | for j in range(self.n_clusters): 473 | if cluster_membership[i,j] > 0: 474 | self.clusters[j].append(i) 475 | no_cluster = False 476 | elif cluster_membership[i,j] > best_val: 477 | best_val = cluster_membership[i,j] 478 | best_cluster = j 479 | if no_cluster: 480 | self.clusters[best_cluster].append(i) 481 | 482 | self.clusters = [np.array(c) for c in self.clusters] 483 | self.clusters_reverse_index = [] 484 | for c in self.clusters: 485 | self.clusters_reverse_index.append({c[j]: j for j in range(len(c))}) 486 | self.clusters_embeddings = [item_embeddings[:, c] for c in self.clusters] 487 | self.clusters_bias = [item_bias[c] for c in self.clusters] 488 | 489 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 490 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 491 | ''' 492 | 493 | if exclude is None: 494 | exclude = [] 495 | 496 | # Compile network if needed 497 | if not hasattr(self, 'predict_function'): 498 | self._compile_predict_function() 499 | 500 | # Prepare RNN input 501 | max_length_seq = sequence[-min(self.max_length, len(sequence)):] 502 | X = np.zeros((1, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN 503 | X[0, :len(max_length_seq), :] = np.array(map(lambda x: self._get_features(x, user_id), max_length_seq)) 504 | mask = np.zeros((1, self.max_length)) # mask of the input (to deal with sequences of different length) 505 | mask[0, :len(max_length_seq)] = 1 506 | 507 | # Run RNN 508 | if self.interactions_are_unique: 509 | should_exclude = [i[0] for i in sequence] 510 | else: 511 | should_exclude = [] 512 | should_exclude.extend(exclude) 513 | return self.predict_function(X, mask.astype(theano.config.floatX), k, should_exclude) 514 | 515 | def save(self, filename): 516 | '''Save the parameters of a network into a file 517 | ''' 518 | print('Save model in ' + filename) 519 | if not os.path.exists(os.path.dirname(filename)): 520 | os.makedirs(os.path.dirname(filename)) 521 | param = lasagne.layers.get_all_param_values(self.l_out) 522 | param.append(self.cluster_repartition.get_value(borrow=True)) 523 | param.append([p.get_value(borrow=True) for p in self.cluster_selection_layer.get_params()]) 524 | f = file(filename, 'wb') 525 | cPickle.dump(param,f,protocol=cPickle.HIGHEST_PROTOCOL) 526 | f.close() 527 | 528 | def load(self, filename): 529 | '''Load parameters values form a file 530 | ''' 531 | f = file(filename, 'rb') 532 | param = cPickle.load(f) 533 | f.close() 534 | lasagne.layers.set_all_param_values(self.l_out, [i.astype(theano.config.floatX) for i in param[:-2]]) 535 | self.cluster_repartition.set_value(param[-2]) 536 | for p, v in zip(self.cluster_selection_layer.get_params(), param[-1]): 537 | p.set_value(v) 538 | 539 | self.prepare_tests() 540 | --------------------------------------------------------------------------------
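
As a closing illustration of the clustering mechanism documented in the `RNNCluster` docstring above, the standalone NumPy sketch below shows how the scaled softmax/sigmoid nonlinearities turn the learned item/cluster matrix into (near-)hard memberships as the scale grows, mirroring the `cluster_type` options and the growing `T_scale`. It is a simplified illustration only, not part of the library, and the helper name `soft_membership` is made up:

````
import numpy as np

def soft_membership(repartition, scale, cluster_type="mix"):
    # repartition: (n_items, n_clusters) matrix of raw membership scores,
    # analogous to RNNCluster.cluster_repartition.
    scaled = scale * repartition
    softmax = np.exp(scaled - scaled.max(axis=1, keepdims=True))
    softmax /= softmax.sum(axis=1, keepdims=True)
    sigmoid = 1.0 / (1.0 + np.exp(-scaled))
    if cluster_type == "softmax":   # exactly one cluster per item
        return softmax
    if cluster_type == "sigmoid":   # zero, one or several clusters per item
        return sigmoid
    # "mix": softmax guarantees at least one cluster, sigmoid allows several;
    # clipping avoids membership indicators larger than 1 (cf. _get_hard_clusters).
    return np.clip(softmax + sigmoid, 0.0, 1.0)

rng = np.random.RandomState(0)
repartition = 0.1 * rng.randn(4, 3)      # 4 items, 3 clusters (cf. _create_ini_clusters)
for scale in (1.0, 10.0, 100.0):         # the scale grows by scale_growing_rate each epoch
    print(np.round(soft_membership(repartition, scale), 2))
````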