├── __init__.py ├── lazy ├── __init__.py ├── lazy.py ├── utils.py ├── pop.py ├── markov_model.py └── user_knn.py ├── helpers ├── __init__.py ├── sparse_layer.py ├── early_stopping.py ├── data_handling.py ├── evaluation.py └── command_parser.py ├── word2vec ├── __init__.py └── ltm.py ├── factorization ├── __init__.py ├── bprmf.py ├── fism.py ├── fossil.py ├── fpmc.py └── mf_base.py ├── neural_networks ├── __init__.py ├── target_selection.py ├── update_manager.py ├── sequence_noise.py ├── recurrent_layers.py ├── rnn_one_hot.py ├── stacked_denoising_autoencoder.py ├── rnn_margin.py ├── rnn_sampling.py ├── fism_cluster.py └── rnn_cluster.py ├── requirements.txt ├── Dockerfile ├── LICENSE ├── train.py ├── test.py ├── preprocess.py └── README.md /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lazy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /word2vec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /factorization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /neural_networks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Theano>=0.8.2 2 | Gensim==0.13.1 3 | pandas==0.19.2 4 | -------------------------------------------------------------------------------- /helpers/sparse_layer.py: -------------------------------------------------------------------------------- 1 | import lasagne 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | class SparseLayer(lasagne.layers.DenseLayer): 7 | 8 | def __init__(self, incoming, **kwargs): 9 | super(SparseLayer, self).__init__(incoming, **kwargs) 10 | 11 | def get_output_for(self, input, **kwargs): 12 | 13 | activation = theano.sparse.structured_dot(input, self.W) 14 | if self.b is not None: 15 | activation = activation + self.b 16 | return self.nonlinearity(activation) -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | from ubuntu:16.04 2 | 3 | run apt-get -yqq update 4 | run apt-get install -yqq python-dev python-pip python-nose g++ libopenblas-dev python-numpy python-scipy 5 | 6 | add . 
/root/sequence-based-recommendations 7 | workdir /root/sequence-based-recommendations 8 | 9 | run pip install --upgrade pip 10 | run pip install -r requirements.txt 11 | 12 | 13 | run pip install -r https://raw.githubusercontent.com/Lasagne/Lasagne/master/requirements.txt && \ 14 | pip install https://github.com/Lasagne/Lasagne/archive/master.zip -------------------------------------------------------------------------------- /lazy/lazy.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | 6 | class Lazy(object): 7 | """Base for Lazy object. 8 | """ 9 | def __init__(self): 10 | super(Lazy, self).__init__() 11 | 12 | self.name = "Lazy base" 13 | 14 | def prepare_model(self, dataset): 15 | '''Must be called before using top_k_recommendations 16 | ''' 17 | raise NotImplementedError 18 | 19 | 20 | def load(self, *args, **kwargs): 21 | '''Nothing to do here 22 | ''' 23 | return None 24 | 25 | def top_k_recommendations(self, sequence, k=10, **kwargs): 26 | ''' Receives a sequence of (id, rating), and produces k recommendations (as a list of ids) 27 | ''' 28 | raise NotImplementedError -------------------------------------------------------------------------------- /lazy/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import scipy.sparse as ssp 4 | 5 | def top_k(values, k, exclude=[]): 6 | ''' Return the indices of the k items with the highest value in the list of values. 7 | Exclude the ids from the list "exclude". 8 | ''' 9 | 10 | # Put low similarity to viewed items to exclude them from recommendations 11 | values[exclude] = -np.inf 12 | 13 | return list(np.argpartition(-values, range(k))[:k]) 14 | 15 | def get_sparse_vector(ids, length, values=None): 16 | '''Converts a list of ids into a sparse vector of length "length" where the elements corresponding to the ids are given the values in "values". 17 | If "values" is None, the elements are set to 1. 18 | ''' 19 | n = len(ids) 20 | 21 | if values is None: 22 | return ssp.coo_matrix((np.ones(n), (ids,np.zeros(n))), (length, 1)).tocsc() 23 | else: 24 | return ssp.coo_matrix((values, (ids,np.zeros(n))), (length, 1)).tocsc() -------------------------------------------------------------------------------- /lazy/pop.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import collections 3 | import numpy as np 4 | import scipy.sparse as ssp 5 | from copy import deepcopy 6 | import os 7 | from .lazy import Lazy 8 | from .utils import top_k, get_sparse_vector 9 | 10 | 11 | class Pop(Lazy): 12 | """ 13 | """ 14 | def __init__(self, **kwargs): 15 | super(Pop, self).__init__(**kwargs) 16 | self.name = "Pop" 17 | 18 | def _get_model_filename(self, *args): 19 | return "pop" 20 | 21 | def prepare_model(self, dataset): 22 | '''Count the number of occurrences of each item in the training set, to be used as popularity scores.
23 | ''' 24 | 25 | self._items_pop = np.zeros(dataset.n_items) 26 | for triplet in dataset.training_set_triplets(): 27 | self._items_pop[triplet['item_id']] += 1 28 | 29 | def top_k_recommendations(self, sequence, k=10, exclude=None, **kwargs): 30 | 31 | if exclude is None: 32 | exclude = [] 33 | 34 | items_pop = deepcopy(self._items_pop) 35 | 36 | items_pop[exclude] = -np.inf 37 | items_pop[[i[0] for i in sequence]] = -np.inf 38 | 39 | return list(np.argpartition(-items_pop, range(k))[:k]) 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Robin Devooght 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /lazy/markov_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import collections 3 | import numpy as np 4 | import scipy.sparse as ssp 5 | from copy import deepcopy 6 | from .lazy import Lazy 7 | from .utils import top_k, get_sparse_vector 8 | 9 | 10 | class MarkovModel(Lazy): 11 | """ 12 | """ 13 | def __init__(self, **kwargs): 14 | super(MarkovModel, self).__init__(**kwargs) 15 | 16 | self.previous_recommendations = dict() 17 | 18 | self.name = "MarkovModel" 19 | 20 | def _get_model_filename(self, *args): 21 | return "MM" 22 | 23 | def prepare_model(self, dataset): 24 | '''Load the data from the training file into a format adapted for the MM predictions. 
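For intuition, a minimal standalone sketch of the same first-order transition counting, independent of this class (plain dicts and collections.Counter; the toy sequences are made up):
    import collections
    sequences = [[1, 2, 3], [1, 2, 4], [5, 2, 3]]            # toy training sequences of item ids
    # one {item: next_item} dict per sequence, as built below
    transitions = [{s[i]: s[i + 1] for i in range(len(s) - 1)} for s in sequences]
    # items observed right after item 2, with their counts
    successors = collections.Counter(t[2] for t in transitions if 2 in t)
    print(successors.most_common(2))                          # [(3, 2), (4, 1)]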
25 | ''' 26 | self.n_items = dataset.n_items 27 | 28 | self.sequences = [] 29 | 30 | with open(dataset.training_set.filename, 'r') as f: 31 | for sequence in f: 32 | sequence = sequence.split() 33 | items = map(int, sequence[1::2]) 34 | s = dict() 35 | for i in range(len(items)-1): 36 | s[items[i]] = items[i+1] 37 | self.sequences.append(s) 38 | 39 | def get_all_recommendations(self, item): 40 | all_recommendations = [] 41 | for s in self.sequences: 42 | if item in s: 43 | all_recommendations.append(s[item]) 44 | all_recommendations = collections.Counter(all_recommendations) 45 | del all_recommendations[None] 46 | self.previous_recommendations[item] = all_recommendations 47 | 48 | 49 | def top_k_recommendations(self, sequence, k=10, exclude=None, **kwargs): 50 | if exclude is None: 51 | exclude = [] 52 | 53 | last_item = int(sequence[-1][0]) 54 | if last_item not in self.previous_recommendations: 55 | self.get_all_recommendations(last_item) 56 | 57 | all_recommendations = deepcopy(self.previous_recommendations[last_item]) 58 | for s in sequence: 59 | all_recommendations[int(s[0])] = 0 60 | for i in exclude: 61 | all_recommendations[i] = 0 62 | 63 | ranking = np.zeros(self.n_items) 64 | for i, x in enumerate(all_recommendations.most_common(k)): 65 | ranking[x[0]] = k-i 66 | return np.argpartition(-ranking, range(k))[:k] 67 | -------------------------------------------------------------------------------- /neural_networks/target_selection.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | import random 4 | 5 | def target_selection_command_parser(parser): 6 | parser.add_argument('--n_targets', help='Number of targets (Only for RNN with hinge, logit or logsig loss).', default=1, type=int) 7 | parser.add_argument('--shuffle_targets', help='Instead of picking the next items in the sequence as the target(s), the targets are picked randomly in the remaining sequence.', action='store_true') 8 | parser.add_argument('--rand_test_target', help='Use the exact same procedure for target selection during training and testing. Otherwise shuffling and bias are used only during training.', action='store_true') 9 | parser.add_argument('--target_bias', help='Popular item are picked as item with a lower probability. Targets are skipped with a probability proportional to (number_of_views)^bias. Set negative bias to avoid this procedure.', default=-1., type=float) 10 | 11 | def get_target_selection(args): 12 | return SelectTargets(n_targets=args.n_targets, shuffle=args.shuffle_targets, bias=args.target_bias, determinist_test=(not args.rand_test_target)) 13 | 14 | 15 | class SelectTargets(object): 16 | def __init__(self, n_targets=1, shuffle=False, bias=-1, determinist_test=True): 17 | super(SelectTargets, self).__init__() 18 | self.n_targets = n_targets 19 | self.shuffle = shuffle 20 | self.bias = bias 21 | self.determinist_test = determinist_test 22 | 23 | @property 24 | def name(self): 25 | 26 | name = "nt"+str(self.n_targets) 27 | 28 | if self.bias >= 0.: 29 | name += '_tb'+str(self.bias) 30 | if self.shuffle: 31 | name += "_shufT" 32 | return name 33 | 34 | 35 | def set_dataset(self, dataset): 36 | 37 | if self.bias >= 0.: 38 | pop = np.maximum(1, dataset.item_popularity) 39 | self.keep_prob = np.power(min(pop) / pop, self.bias) 40 | 41 | def __call__(self, remaining_sequence, test=False): 42 | ''' Receives the sequence of item that are not read by the RNN and chooses the target(s) among them. 
43 | the test parameter indicates whether this is the training or the testing phase. 44 | If test is True and self.determinist_test is True, no shuffle nor bias is performed 45 | ''' 46 | 47 | if not (test and self.determinist_test): 48 | if self.shuffle: 49 | random.shuffle(remaining_sequence) 50 | if self.bias >= 0.: 51 | remaining_sequence = [i for i in remaining_sequence if (np.random.random() <= self.keep_prob[i[0]])] 52 | 53 | return remaining_sequence[:min(len(remaining_sequence), self.n_targets)] -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import time 7 | import lasagne 8 | import random 9 | import helpers.command_parser as parse 10 | from helpers.data_handling import DataHandler 11 | 12 | def training_command_parser(parser): 13 | parser.add_argument('--tshuffle', help='Shuffle sequences during training.', action='store_true') 14 | 15 | parser.add_argument('--extended_set', help='Use extended training set (contains first half of validation and test set).', action='store_true') 16 | 17 | parser.add_argument('-d', dest='dataset', help='Directory name of the dataset.', default='', type=str) 18 | parser.add_argument('--dir', help='Directory name to save model.', default='', type=str) 19 | parser.add_argument('--save', choices=['All', 'Best', 'None'], help='Policy for saving models.', default='Best') 20 | parser.add_argument('--metrics', help='Metrics for validation, comma separated', default='sps', type=str) 21 | parser.add_argument('--time_based_progress', help='Follow progress based on time rather than iterations.', action='store_true') 22 | parser.add_argument('--load_last_model', help='Load Last model before starting training.', action='store_true') 23 | parser.add_argument('--progress', help='Progress intervals', default='2.', type=str) 24 | parser.add_argument('--mpi', help='Max progress intervals', default=np.inf, type=float) 25 | parser.add_argument('--max_iter', help='Max number of iterations', default=np.inf, type=float) 26 | parser.add_argument('--max_time', help='Max training time in seconds', default=np.inf, type=float) 27 | parser.add_argument('--min_iter', help='Min number of iterations before showing progress', default=0., type=float) 28 | 29 | def num(s): 30 | try: 31 | return int(s) 32 | except ValueError: 33 | return float(s) 34 | 35 | def main(): 36 | 37 | 38 | args = parse.command_parser(parse.predictor_command_parser, training_command_parser, parse.early_stopping_command_parser) 39 | 40 | predictor = parse.get_predictor(args) 41 | 42 | 43 | dataset = DataHandler(dirname=args.dataset, extended_training_set=args.extended_set, shuffle_training=args.tshuffle) 44 | 45 | predictor.prepare_model(dataset) 46 | predictor.train(dataset, 47 | save_dir=dataset.dirname + "models/" + args.dir, 48 | time_based_progress=args.time_based_progress, 49 | progress=num(args.progress), 50 | autosave=args.save, 51 | max_progress_interval=args.mpi, 52 | max_iter = args.max_iter, 53 | min_iterations=args.min_iter, 54 | max_time=args.max_time, 55 | early_stopping=parse.get_early_stopper(args), 56 | load_last_model=args.load_last_model, 57 | validation_metrics=args.metrics.split(',')) 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /lazy/user_knn.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import scipy.sparse as ssp 5 | import os.path 6 | from .lazy import Lazy 7 | from .utils import top_k, get_sparse_vector 8 | 9 | 10 | class UserKNN(Lazy): 11 | """ 12 | """ 13 | def __init__(self, similarity_measure='cosine', neighborhood_size=80, **kwargs): 14 | super(UserKNN, self).__init__(**kwargs) 15 | 16 | self.similarity_measure = similarity_measure 17 | self.neighborhood_size = neighborhood_size 18 | 19 | self.name = "UserKNN" 20 | 21 | def _get_model_filename(self, *args): 22 | return "UKNN_ns"+str(self.neighborhood_size)+"_"+self.similarity_measure 23 | 24 | def prepare_model(self, dataset): 25 | '''Load the data from the training file into a format adapted for the KNN methods. 26 | ''' 27 | filename = dataset.dirname + 'data/train_set_triplets' 28 | if os.path.isfile(filename + '.npy'): 29 | file_content = np.load(filename + '.npy') 30 | else: 31 | file_content = np.loadtxt(filename) 32 | np.save(filename, file_content) 33 | 34 | #self.user_item = ssp.coo_matrix((file_content[:,2], (file_content[:,0], file_content[:,1]))).tocsr() 35 | self.binary_user_item = ssp.coo_matrix((np.ones(file_content.shape[0]), (file_content[:,0], file_content[:,1]))).tocsr() 36 | 37 | del file_content 38 | 39 | self.n_items = self.binary_user_item.shape[1] 40 | self.n_users = self.binary_user_item.shape[0] 41 | 42 | def _items_count_per_user(self): 43 | if not hasattr(self, '__items_count_per_user'): 44 | self.__items_count_per_user = np.asarray(self.binary_user_item.sum(axis=1)).ravel() 45 | return self.__items_count_per_user 46 | 47 | def similarity_with_users(self, sequence): 48 | '''Compute the similarity of each user with the sequence recieved in parameter 49 | ''' 50 | sparse_sequence = get_sparse_vector([i[0] for i in sequence], self.n_items) 51 | overlap = self.binary_user_item.dot(sparse_sequence).toarray().ravel() 52 | overlap[overlap != 0] /= np.sqrt(self._items_count_per_user()[overlap != 0]) 53 | return overlap 54 | 55 | def top_k_recommendations(self, sequence, k=10, exclude=None, **kwargs): 56 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 57 | ''' 58 | if exclude is None: 59 | exclude = [] 60 | 61 | sim_with_users = self.similarity_with_users(sequence) 62 | nearest_neighbors = top_k(sim_with_users, self.neighborhood_size) 63 | sim_with_users = get_sparse_vector(nearest_neighbors, self.n_users, values=sim_with_users[nearest_neighbors]) 64 | sim_with_items = self.binary_user_item.T.dot(sim_with_users).toarray().ravel() 65 | 66 | sim_with_items[exclude] = -np.inf 67 | sim_with_items[[i[0] for i in sequence]] = -np.inf 68 | 69 | return list(np.argpartition(-sim_with_items, range(k))[:k]) -------------------------------------------------------------------------------- /helpers/early_stopping.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | def early_stopping_command_parser(parser): 5 | parser.add_argument('--es_m', dest='early_stopping_method', choices=['WorstTimesX', 'StopAfterN', 'None'], help='Early stopping method', default='None') 6 | parser.add_argument('--es_n', help='N parameter (for StopAfterN)', default=5, type=int) 7 | parser.add_argument('--es_x', help='X parameter (for WorstTimesX)', default=2., type=float) 8 | parser.add_argument('--es_min_wait', help='Mininum wait before 
stopping (for WorstTimesX)', default=1., type=float) 9 | parser.add_argument('--es_LiB', help='Lower is better for validation score.', action='store_true') 10 | 11 | def get_early_stopper(args): 12 | if args.early_stopping_method == 'StopAfterN': 13 | return StopAfterN(n = args.es_n, higher_is_better=(not args.es_LiB)) 14 | elif args.early_stopping_method == 'WorstTimesX': 15 | return WaitWorstCaseTimesX(x = args.es_x, min_wait=args.es_min_wait, higher_is_better=(not args.es_LiB)) 16 | else: 17 | return None 18 | 19 | class EarlyStopperBase(object): 20 | def __init__(self, higher_is_better=True): 21 | super(EarlyStopperBase, self).__init__() 22 | 23 | self.higher_is_better = higher_is_better 24 | 25 | def __call__(self, epochs, val_costs): 26 | 27 | if not self.higher_is_better: 28 | val_costs = [-i for i in val_costs] 29 | 30 | return self.decideStopping(epochs, val_costs) 31 | 32 | def decideStopping(self, epochs, val_costs): 33 | pass 34 | 35 | class StopAfterN(EarlyStopperBase): 36 | ''' Stops after N consecutively non improving cost 37 | ''' 38 | def __init__(self, n=3, **kwargs): 39 | super(StopAfterN, self).__init__(**kwargs) 40 | 41 | self.n = n 42 | 43 | def decideStopping(self, epochs, val_costs): 44 | 45 | if len(val_costs) <= self.n: 46 | return False 47 | 48 | for i in range(self.n): 49 | if val_costs[-1-i] > val_costs[-2-i]: 50 | return False 51 | 52 | return True 53 | 54 | 55 | class WaitWorstCaseTimesX(EarlyStopperBase): 56 | ''' Stops if the number of epochs since the best cost is X times larger than the maximum number of epochs between two consecutive best. 57 | ''' 58 | 59 | def __init__(self, x=2., min_wait=1., **kwargs): 60 | super(WaitWorstCaseTimesX, self).__init__(**kwargs) 61 | 62 | self.x = x 63 | self.min_wait = min_wait 64 | 65 | def decideStopping(self, epochs, val_costs): 66 | 67 | # find longest wait between two best scores 68 | last_best = val_costs[0] 69 | last_best_epoch = epochs[0] 70 | longest_wait = 0 71 | for epoch, cost in zip(epochs[1:], val_costs[1:]): 72 | if cost > last_best: 73 | wait = epoch - last_best_epoch 74 | last_best_epoch = epoch 75 | last_best = cost 76 | if wait > longest_wait: 77 | longest_wait = wait 78 | 79 | current_wait = epochs[-1] - last_best_epoch 80 | 81 | if longest_wait == 0: 82 | return current_wait > self.min_wait 83 | 84 | print('current wait : ', round(current_wait, 3), ' longest wait : ', round(longest_wait, 3), ' ratio : ', current_wait/longest_wait, ' / ', self.x) 85 | 86 | return current_wait > max(self.min_wait, longest_wait*self.x) -------------------------------------------------------------------------------- /neural_networks/update_manager.py: -------------------------------------------------------------------------------- 1 | import lasagne 2 | 3 | def update_manager_command_parser(parser): 4 | parser.add_argument('--u_m', dest='update_manager', choices=['adagrad', 'adadelta', 'rmsprop', 'nesterov', 'adam'], help='Update mechanism', default='adam') 5 | parser.add_argument('--u_l', help='Learning rate', default=0.001, type=float) 6 | parser.add_argument('--u_rho', help='rho parameter for Adadelta and RMSProp (momentum for Nesterov momentum)', default=0.9, type=float) 7 | parser.add_argument('--u_b1', help='Beta 1 parameter for Adam', default=0.9, type=float) 8 | parser.add_argument('--u_b2', help='Beta 2 parameter for Adam', default=0.999, type=float) 9 | 10 | def get_update_manager(args): 11 | if args.update_manager == 'adagrad': 12 | return Adagrad(learning_rate = args.u_l) 13 | elif args.update_manager == 
'adadelta': 14 | return Adadelta(learning_rate = args.u_l, rho = args.u_rho) 15 | elif args.update_manager == 'rmsprop': 16 | return RMSProp(learning_rate = args.u_l, rho = args.u_rho) 17 | elif args.update_manager == 'nesterov': 18 | return NesterovMomentum(learning_rate = args.u_l, momentum = args.u_rho) 19 | elif args.update_manager == 'adam': 20 | return Adam(learning_rate = args.u_l, beta1 = args.u_b1, beta2 = args.u_b2) 21 | else: 22 | raise ValueError('Unknown update option') 23 | 24 | class Adagrad(object): 25 | 26 | def __init__(self, learning_rate=0.1, **kwargs): 27 | super(Adagrad, self).__init__(**kwargs) 28 | 29 | self.learning_rate = learning_rate 30 | self.name = 'Ug_lr'+str(self.learning_rate) 31 | 32 | def __call__(self, cost, params): 33 | return lasagne.updates.adagrad(cost, params, self.learning_rate) 34 | 35 | class Adadelta(object): 36 | 37 | def __init__(self, learning_rate=1.0, rho=0.9, **kwargs): 38 | super(Adadelta, self).__init__(**kwargs) 39 | 40 | self.learning_rate = learning_rate 41 | self.rho = rho 42 | self.name = 'Ud_lr'+str(self.learning_rate)+'_rho'+str(self.rho) 43 | 44 | def __call__(self, cost, params): 45 | return lasagne.updates.adadelta(cost, params, self.learning_rate, rho=self.rho) 46 | 47 | class RMSProp(object): 48 | 49 | def __init__(self, learning_rate=1.0, rho=0.9, **kwargs): 50 | super(RMSProp, self).__init__(**kwargs) 51 | 52 | self.learning_rate = learning_rate 53 | self.rho = rho 54 | self.name = 'Ur_lr'+str(self.learning_rate)+'_rho'+str(self.rho) 55 | 56 | def __call__(self, cost, params): 57 | return lasagne.updates.rmsprop(cost, params, self.learning_rate, rho=self.rho) 58 | 59 | class NesterovMomentum(object): 60 | 61 | def __init__(self, learning_rate=1.0, momentum=0.9, **kwargs): 62 | super(NesterovMomentum, self).__init__(**kwargs) 63 | 64 | self.learning_rate = learning_rate 65 | self.momentum = momentum 66 | self.name = 'Un_lr'+str(self.learning_rate)+'_m'+str(self.momentum) 67 | 68 | def __call__(self, cost, params): 69 | return lasagne.updates.nesterov_momentum(cost, params, self.learning_rate, momentum=self.momentum) 70 | 71 | class Adam(object): 72 | 73 | def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, **kwargs): 74 | super(Adam, self).__init__(**kwargs) 75 | 76 | self.learning_rate = learning_rate 77 | self.beta1 = beta1 78 | self.beta2 = beta2 79 | self.name = 'Ua_lr'+str(self.learning_rate)+'_b1'+str(self.beta1)+'_b2'+str(self.beta2) 80 | 81 | def __call__(self, cost, params): 82 | return lasagne.updates.adam(cost, params, self.learning_rate, beta1=self.beta1, beta2=self.beta2) -------------------------------------------------------------------------------- /neural_networks/sequence_noise.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | def sequence_noise_command_parser(parser): 5 | parser.add_argument('--n_dropout', help='Dropout probability', default=0., type=float) 6 | parser.add_argument('--n_swap', help="Probability of swapping two consecutive items", default=0., type=float) 7 | parser.add_argument('--n_shuf', help="Probability of swapping two random items", default=0., type=float) 8 | parser.add_argument('--n_shuf_std', help="The distance between the two items to be swapped is drawn from a normal distribution whose std is defined by this parameter", default=5., type=float) 9 | parser.add_argument('--n_ratings', help='Probability of changing the rating.', default=0., type=float) 10 | 11 | def 
get_sequence_noise(args): 12 | return SequenceNoise(dropout=args.n_dropout, swap=args.n_swap, ratings_perturb=args.n_ratings, shuf=args.n_shuf, shuf_std=args.n_shuf_std) 13 | 14 | 15 | class SequenceNoise(object): 16 | def __init__(self, dropout=0., swap=0., ratings_perturb=0., shuf=0., shuf_std=0.): 17 | super(SequenceNoise, self).__init__() 18 | self.dropout = dropout 19 | self.swap = swap 20 | self.ratings_perturb = ratings_perturb 21 | self.shuf = shuf 22 | self.shuf_std = shuf_std 23 | 24 | self._check_param_validity() 25 | self._set_name() 26 | 27 | 28 | def _set_name(self): 29 | name = [] 30 | if self.dropout > 0: 31 | name.append("do"+str(self.dropout)) 32 | 33 | if self.swap > 0: 34 | name.append("sw"+str(self.swap)) 35 | 36 | if self.ratings_perturb > 0: 37 | name.append("rp"+str(self.ratings_perturb)) 38 | 39 | if self.shuf > 0: 40 | name.append("sh"+str(self.shuf)+"-"+str(self.shuf_std)) 41 | 42 | self.name = "_".join(name) 43 | 44 | def _check_param_validity(self): 45 | if self.dropout < 0. or self.dropout >= 1.: 46 | raise ValueError('Dropout should be in [0,1)') 47 | if self.swap < 0. or self.swap >= 1.: 48 | raise ValueError('Swapping probability should be in [0,1)') 49 | if self.ratings_perturb < 0. or self.ratings_perturb >= 1.: 50 | raise ValueError('Rating perturbation probability should be in [0,1)') 51 | 52 | def __call__(self, sequence_generator): 53 | """Recieves a generator of sequences in the form ([(item, rating), (item, rating), ...], user) and generates sequences in the same format, 54 | after potentially applying dropout, item swapping and ratings modifications. 55 | """ 56 | 57 | while True: 58 | 59 | sequence, user = next(sequence_generator) 60 | 61 | # Dropout 62 | if self.dropout > 0.: 63 | sequence = [i for i in sequence if (np.random.random() >= self.dropout)] 64 | if len(sequence) < 2: 65 | continue 66 | 67 | # Perturb the order 68 | if self.swap > 0.: 69 | i = 0 70 | while i < len(sequence) - 1: 71 | if np.random.random() < self.swap: 72 | tmp = sequence[i] 73 | sequence[i] = sequence[i+1] 74 | sequence[i+1] = tmp 75 | i+=1 # Don't allow to swap twice the same item 76 | i += 1 77 | 78 | # Shuffle 79 | if self.shuf > 0.: 80 | for i in range(len(sequence)): 81 | if np.random.random() < self.shuf: 82 | other_item = max(0, min(len(sequence)-1, int(np.random.randn()*self.shuf_std)+i)) 83 | sequence[i], sequence[other_item] = sequence[other_item], sequence[i] 84 | 85 | # Perturb ratings 86 | if self.ratings_perturb > 0: 87 | for i in range(len(sequence)): 88 | if np.random.random() < self.ratings_perturb: 89 | if np.random.random() < 0.5: 90 | sequence[i][1] = min(5, sequence[i][1] + 0.5) 91 | else: 92 | sequence[i][1] = max(1, sequence[i][1] - 0.5) 93 | 94 | yield sequence, user 95 | 96 | -------------------------------------------------------------------------------- /neural_networks/recurrent_layers.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | import lasagne 6 | from sparse_lstm import * 7 | 8 | def recurrent_layers_command_parser(parser): 9 | parser.add_argument('--r_t', dest='recurrent_layer_type', choices=['LSTM', 'GRU', 'Vanilla'], help='Type of recurrent layer', default='GRU') 10 | parser.add_argument('--r_l', help="Layers' size, (eg: 100-50-50)", default="50", type=str) 11 | parser.add_argument('--r_bi', help='Bidirectional layers.', action='store_true') 12 | parser.add_argument('--r_emb', 
help='Add an embedding layer before the RNN. Takes the size of the embedding as parameter, a size<1 means no embedding layer.', type=int, default=0) 13 | 14 | def get_recurrent_layers(args): 15 | return RecurrentLayers(layer_type=args.recurrent_layer_type, layers=map(int, args.r_l.split('-')), bidirectional=args.r_bi, embedding_size=args.r_emb) 16 | 17 | 18 | class RecurrentLayers(object): 19 | def __init__(self, layer_type="LSTM", layers=[32], bidirectional=False, embedding_size=0, grad_clipping=100): 20 | super(RecurrentLayers, self).__init__() 21 | self.layer_type = layer_type 22 | self.layers = layers 23 | self.bidirectional = bidirectional 24 | self.embedding_size = embedding_size 25 | self.grad_clip=grad_clipping 26 | self.set_name() 27 | 28 | def set_name(self): 29 | 30 | self.name = "" 31 | if self.bidirectional: 32 | self.name += "b"+self.layer_type+"_" 33 | elif self.layer_type != "LSTM": 34 | self.name += self.layer_type+"_" 35 | 36 | self.name += "gc"+str(self.grad_clip)+"_" 37 | if self.embedding_size > 0: 38 | self.name += "e"+str(self.embedding_size) 39 | self.name += "h"+('-'.join(map(str,self.layers))) 40 | 41 | 42 | def __call__(self, input_layer, mask_layer, true_input_size=None, only_return_final=True): 43 | 44 | if true_input_size is None and self.embedding_size > 0: 45 | raise ValueError('Embedding layer only works with sparse inputs') 46 | 47 | if self.embedding_size > 0: 48 | in_int32 = lasagne.layers.ExpressionLayer(input_layer, lambda x: x.astype('int32')) # change type of input 49 | l_emb = lasagne.layers.flatten(lasagne.layers.EmbeddingLayer(in_int32, input_size=true_input_size, output_size=self.embedding_size), outdim=3) 50 | l_rec = self.get_recurrent_layers(l_emb, mask_layer, true_input_size=None, only_return_final=only_return_final) 51 | else: 52 | l_rec = self.get_recurrent_layers(input_layer, mask_layer, true_input_size=true_input_size, only_return_final=only_return_final) 53 | 54 | return l_rec 55 | 56 | 57 | def get_recurrent_layers(self, input_layer, mask_layer, true_input_size=None, only_return_final=True): 58 | 59 | orf = False 60 | prev_layer = input_layer 61 | for i, h in enumerate(self.layers): 62 | if i == len(self.layers) - 1: 63 | orf = only_return_final 64 | prev_layer = self.get_one_layer(prev_layer, mask_layer, h, true_input_size, orf) 65 | 66 | true_input_size = None # Second layer is always densely encoded 67 | 68 | return prev_layer 69 | 70 | 71 | 72 | def get_one_layer(self, input_layer, mask_layer, n_hidden, true_input_size, only_return_final): 73 | if self.bidirectional: 74 | forward = self.get_unidirectional_layer(input_layer, mask_layer, n_hidden, true_input_size, only_return_final, backwards=False) 75 | backward = self.get_unidirectional_layer(input_layer, mask_layer, n_hidden, true_input_size, only_return_final, backwards=True) 76 | return lasagne.layers.ConcatLayer([forward, backward], axis = -1) 77 | else: 78 | return self.get_unidirectional_layer(input_layer, mask_layer, n_hidden, true_input_size, only_return_final, backwards=False) 79 | 80 | def get_unidirectional_layer(self, input_layer, mask_layer, n_hidden, true_input_size, only_return_final, backwards=False): 81 | if true_input_size is not None: 82 | if self.layer_type == "LSTM": 83 | layer = LSTMLayerOHEInput 84 | elif self.layer_type == "GRU": 85 | layer = GRULayerOHEInput 86 | elif self.layer_type == "Vanilla": 87 | layer = VanillaLayerOHEInput 88 | else: 89 | raise ValueError('Unknown layer type') 90 | 91 | return layer(input_layer, n_hidden, true_input_size, 
mask_input=mask_layer, grad_clipping=self.grad_clip, 92 | learn_init=True, only_return_final=only_return_final, backwards=backwards) 93 | else: 94 | if self.layer_type == "LSTM": 95 | layer = lasagne.layers.LSTMLayer 96 | elif self.layer_type == "GRU": 97 | layer = lasagne.layers.GRULayer 98 | elif self.layer_type == "Vanilla": 99 | layer = lasagne.layers.RecurrentLayer 100 | else: 101 | raise ValueError('Unknown layer type') 102 | 103 | return layer(input_layer, n_hidden, mask_input=mask_layer, grad_clipping=self.grad_clip, 104 | learn_init=True, only_return_final=only_return_final, backwards=backwards) 105 | -------------------------------------------------------------------------------- /neural_networks/rnn_one_hot.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import cPickle 8 | import random 9 | from time import time 10 | import rnn_base as rnn 11 | from sparse_lstm import * 12 | 13 | class RNNOneHot(rnn.RNNBase): 14 | """RNNOneHot are recurrent neural networks that do not depend on the factorization: they are based on one-hot encoding. 15 | 16 | The parameters specific to the RNNOneHot are: 17 | diversity_bias: a float in [0, inf) that tunes how the cost function of the network is biased towards less seen movies. 18 | In practice, the classification error given by the categorical cross-entropy is divided by exp(diversity_bias * popularity (on a scale from 1 to 10)). 19 | This will reduce the error associated to movies with a lot of views, putting therefore more importance on the ability of the network to correctly predict the rare movies. 20 | A diversity_bias of 0 produces the normal behavior, with no bias. 
21 | """ 22 | def __init__(self, diversity_bias=0.0, regularization=0.0, **kwargs): 23 | super(RNNOneHot, self).__init__(**kwargs) 24 | 25 | self.diversity_bias = np.cast[theano.config.floatX](diversity_bias) 26 | 27 | self.regularization = regularization 28 | 29 | self.name = "RNN with categorical cross entropy" 30 | 31 | def _get_model_filename(self, epochs): 32 | '''Return the name of the file to save the current model 33 | ''' 34 | filename = "rnn_cce_db"+str(self.diversity_bias)+"_r"+str(self.regularization)+"_"+self._common_filename(epochs) 35 | return filename 36 | 37 | def _prepare_networks(self, n_items): 38 | ''' Prepares the building blocks of the RNN, but does not compile them: 39 | ''' 40 | 41 | self.n_items = n_items 42 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 43 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length, self._input_size())) 44 | # The input is completed by a mask to inform the LSTM of the length of the sequence 45 | self.l_mask = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length)) 46 | 47 | # recurrent layer 48 | if not self.use_movies_features: 49 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=self.n_items + self._n_optional_features(), only_return_final=True) 50 | else: 51 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=None, only_return_final=True) 52 | 53 | # l_last_slice gets the last output of the recurrent layer 54 | l_last_slice = l_recurrent 55 | # l_last_slice = lasagne.layers.SliceLayer(l_recurrent, -1, 1) 56 | 57 | # Theano tensor for the targets 58 | target = T.ivector('target_output') 59 | target_popularity = T.fvector('target_popularity') 60 | self.exclude = T.fmatrix('excluded_items') 61 | self.theano_inputs = [self.l_in.input_var, self.l_mask.input_var, target, target_popularity, self.exclude] 62 | 63 | 64 | # The sliced output is then passed through linear layer to obtain the right output size 65 | self.l_out = lasagne.layers.DenseLayer(l_last_slice, num_units=self.n_items, nonlinearity=lasagne.nonlinearities.softmax) 66 | 67 | # lasagne.layers.get_output produces a variable for the output of the net 68 | network_output = lasagne.layers.get_output(self.l_out) 69 | 70 | # loss function 71 | self.cost = (T.nnet.categorical_crossentropy(network_output, target) / target_popularity).mean() 72 | 73 | if self.regularization > 0.: 74 | self.cost += self.regularization * lasagne.regularization.l2(self.l_out.b) 75 | # self.cost += self.regularization * lasagne.regularization.regularize_layer_params(self.l_out, lasagne.regularization.l2) 76 | elif self.regularization < 0.: 77 | self.cost -= self.regularization * lasagne.regularization.l1(self.l_out.b) 78 | # self.cost -= self.regularization * lasagne.regularization.regularize_layer_params(self.l_out, lasagne.regularization.l1) 79 | 80 | 81 | 82 | 83 | def _prepare_input(self, sequences): 84 | ''' Sequences is a list of [user_id, input_sequence, targets] 85 | ''' 86 | 87 | batch_size = len(sequences) 88 | 89 | # Shape return variables 90 | X = np.zeros((batch_size, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN 91 | mask = np.zeros((batch_size, self.max_length)) # mask of the input (to deal with sequences of different length) 92 | Y = np.zeros((batch_size,), dtype='int32') # output target 93 | pop = np.zeros((batch_size,)) # output target 94 | exclude = np.zeros((batch_size, self.n_items), 
dtype=theano.config.floatX) 95 | 96 | 97 | for i, sequence in enumerate(sequences): 98 | user_id, in_seq, target = sequence 99 | seq_features = np.array(map(lambda x: self._get_features(x, user_id), in_seq)) 100 | X[i, :len(in_seq), :] = seq_features # Copy sequences into X 101 | mask[i, :len(in_seq)] = 1 102 | Y[i] = target[0][0] # id of the first and only target 103 | pop[i] = self.dataset.item_popularity[target[0][0]] ** self.diversity_bias 104 | exclude[i, [j[0] for j in in_seq]] = 1 105 | 106 | return (X, mask.astype(theano.config.floatX), Y, pop.astype(theano.config.floatX), exclude) 107 | -------------------------------------------------------------------------------- /factorization/bprmf.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from mf_base import MFBase 12 | 13 | class BPRMF(MFBase): 14 | ''' Implementation of the algorithm presented in "BPR: Bayesian personalized ranking from implicit feedback", by Rendle S. et al., 2009. 15 | 16 | The adaptive sampling algorithm is adapted from "Improving pairwise learning for item recommendation from implicit feedback", by Rendle S. et al., 2014 17 | ''' 18 | 19 | def __init__(self, k = 32, adaptive_sampling=True, sampling_bias=500, **kwargs): 20 | 21 | super(BPRMF, self).__init__(**kwargs) 22 | 23 | self.name = 'BPRMF' 24 | self.k = k 25 | self.adaptive_sampling = adaptive_sampling 26 | self.sampling_bias = sampling_bias # lambda parameter in "Improving pairwise learning for item recommendation from implicit feedback", by Rendle S. et al., 2014 27 | 28 | def _get_model_filename(self, epochs): 29 | '''Return the name of the file to save the current model 30 | ''' 31 | filename = "bprmf_ne"+str(epochs)+"_lr"+str(self.init_learning_rate)+"_an"+str(self.annealing_rate)+"_k"+str(self.k)+"_reg"+str(self.reg)+"_ini"+str(self.init_sigma) 32 | if self.adaptive_sampling: 33 | filename += "_as"+str(self.sampling_bias) 34 | return filename+".npz" 35 | 36 | def init_model(self): 37 | ''' Initialize the model parameters 38 | ''' 39 | self.V = self.init_sigma * np.random.randn(self.n_users, self.k).astype(np.float32) 40 | self.H = self.init_sigma * np.random.randn(self.n_items, self.k).astype(np.float32) 41 | self.bias = np.zeros(self.n_items).astype(np.float32) 42 | 43 | def sgd_step(self, user, true_item, false_item): 44 | ''' Make one SGD update, given that the interaction between user and true_item exists, 45 | but the one between user and false_item does not. 46 | user, true_item and false_item are all user or item ids. 
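For intuition, a standalone sketch of the error term computed below: delta equals the logistic sigmoid of (x_false - x_true), so the step is small when the pair is already ranked correctly and close to 1 when it is inverted (the scores are made up):
    import math
    def bpr_delta(x_true, x_false):
        d = min(10, max(-10, x_false - x_true))  # same clipping as in the code below
        return 1 - 1 / (1 + math.exp(d))
    print(bpr_delta(2.0, -1.0))                  # ~0.047: correct order, small step
    print(bpr_delta(0.0, 0.0))                   # 0.5: undecided pair
    print(bpr_delta(-1.0, 2.0))                  # ~0.953: wrong order, large step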
47 | 48 | return error 49 | ''' 50 | 51 | # Compute error 52 | x_true = self.bias[true_item] + np.dot(self.V[user, :], self.H[true_item, :]) 53 | x_false = self.bias[false_item] + np.dot(self.V[user, :], self.H[false_item, :]) 54 | delta = 1 - 1 / (1 + math.exp(min(10, max(-10, x_false - x_true)))) # Bound x_true - x_false in [-10, 10] to avoid overflow 55 | 56 | # Update CF 57 | V_mem = self.V[user, :] 58 | self.V[user, :] += self.learning_rate * ( delta * (self.H[true_item, :] - self.H[false_item, :]) - self.reg * self.V[user, :]) 59 | self.H[true_item, :] += self.learning_rate * ( delta * V_mem - self.reg * self.H[true_item, :]) 60 | self.H[false_item, :] += self.learning_rate * ( -delta * V_mem - self.reg / 10 * self.H[false_item, :]) 61 | self.bias[true_item] += self.learning_rate * (delta - self.reg * self.bias[true_item]) 62 | self.bias[false_item] += self.learning_rate * (- delta - self.reg * self.bias[false_item]) 63 | 64 | return delta 65 | 66 | def compute_factor_rankings(self): 67 | '''Rank items according to each factor in order to do adaptive sampling 68 | ''' 69 | 70 | self.ranks = np.argsort(self.H, axis=0) 71 | self.var = np.var(self.H, axis=0) 72 | 73 | def get_training_sample(self): 74 | '''Pick a random triplet from self.triplets and a random false next item. 75 | returns a tuple of ids : (user, true_item, false_item) 76 | ''' 77 | 78 | user_id = random.randrange(self.n_users) 79 | while self.users[user_id,1] < 2: 80 | user_id = random.randrange(self.n_users) 81 | user_items = self.items[self.users[user_id,0]:self.users[user_id,0]+self.users[user_id,1]] 82 | true_item = random.choice(user_items) 83 | if self.adaptive_sampling: 84 | while True: 85 | rank = np.random.exponential(scale=self.sampling_bias) 86 | while rank >= self.n_items: 87 | rank = np.random.exponential(scale=self.sampling_bias) 88 | factor_signs = np.sign(self.V[user_id, :]) 89 | factor_prob = np.abs(self.V[user_id, :]) * self.var 90 | f = np.random.choice(self.k, p=factor_prob/sum(factor_prob)) 91 | false_item = self.ranks[int(rank) * factor_signs[f],f] 92 | if false_item not in user_items: 93 | break 94 | else: 95 | false_item = random.randrange(self.n_items) 96 | while false_item in user_items: 97 | false_item = random.randrange(self.n_items) 98 | 99 | return (user_id, true_item, false_item) 100 | 101 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 102 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 103 | ''' 104 | 105 | if exclude is None: 106 | exclude = [] 107 | 108 | last_item = sequence[-1][0] 109 | output = self.bias + np.dot(self.V[user_id, :], self.H.T) 110 | 111 | # Put low similarity to viewed items to exclude them from recommendations 112 | output[[i[0] for i in sequence]] = -np.inf 113 | output[exclude] = -np.inf 114 | 115 | # find top k according to output 116 | return list(np.argpartition(-output, range(k))[:k]) 117 | 118 | def training_step(self, iterations): 119 | if self.adaptive_sampling and iterations%int(self.n_items * np.log(self.n_items)) == 0: 120 | self.compute_factor_rankings() 121 | 122 | # Train with a new batch 123 | return self.sgd_step(*self.get_training_sample()) 124 | 125 | def save(self, filename): 126 | '''Save the parameters of a network into a file 127 | ''' 128 | print('Save model in ' + filename) 129 | if not os.path.exists(os.path.dirname(filename)): 130 | os.makedirs(os.path.dirname(filename)) 131 | np.savez(filename, V=self.V, H=self.H, bias=self.bias) 132 | 133 | 134 | def load(self, 
filename): 135 | '''Load parameters values form a file 136 | ''' 137 | f = np.load(filename) 138 | self.V = f['V'] 139 | self.H = f['H'] 140 | self.bias = f['bias'] -------------------------------------------------------------------------------- /neural_networks/stacked_denoising_autoencoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import cPickle 8 | import random 9 | import re 10 | import glob 11 | from time import time 12 | from .rnn_base import RNNBase 13 | 14 | def log_softmax(x): 15 | xdev = x - x.max(1, keepdims=True) 16 | return xdev - T.log(T.sum(T.exp(xdev), axis=1, keepdims=True)) 17 | 18 | def categorical_crossentropy_logdomain(log_predictions, targets): 19 | return -T.sum(targets * log_predictions, axis=1) 20 | 21 | class StackedDenoisingAutoencoder(RNNBase): 22 | """Base for Feed forward neural networks object. 23 | """ 24 | def __init__(self, layers=[20], input_dropout=0.2, dropout=0.5, **kwargs): 25 | super(StackedDenoisingAutoencoder, self).__init__(**kwargs) 26 | 27 | self.layers = layers 28 | self.input_dropout = input_dropout 29 | self.dropout = dropout 30 | 31 | self.name = "Stacked Denoising Autoencoder" 32 | 33 | 34 | def _get_model_filename(self, epochs): 35 | '''Return the name of the file to save the current model 36 | ''' 37 | filename = "sda_bs"+str(self.batch_size)+"_ne"+str(epochs) 38 | filename += "_h"+('-'.join(map(str,self.layers))) 39 | filename += "_" + self.updater.name 40 | if not self.use_ratings_features: 41 | filename += "_nf" 42 | if self.use_ratings_features: 43 | filename += "_rf" 44 | return filename 45 | 46 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None, **kwargs): 47 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 48 | ''' 49 | 50 | # Compile network if needed 51 | if not hasattr(self, 'predict_function'): 52 | self._compile_predict_function() 53 | 54 | # Prepare RNN input 55 | X = np.zeros((1, self._input_size())) # input of the RNN 56 | X[0, :] = self._one_hot_encoding([i[0] for i in sequence]) 57 | 58 | # Run RNN 59 | output = self.predict_function(X.astype(theano.config.floatX))[0] 60 | 61 | # Put low similarity to viewed items to exclude them from recommendations 62 | output[[i[0] for i in sequence]] = -np.inf 63 | output[exclude] = -np.inf 64 | 65 | # find top k according to output 66 | return list(np.argpartition(-output, range(k))[:k]) 67 | 68 | def _prepare_networks(self, n_items): 69 | ''' Prepares the building blocks of the RNN, but does not compile them: 70 | self.l_in : input layer 71 | self.target : target of the network 72 | self.l_out : output of the network 73 | self.cost : cost function 74 | ''' 75 | 76 | self.n_items = n_items 77 | 78 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 79 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self._input_size())) 80 | # hidden_layer = lasagne.layers.dropout(self.l_in, p=self.input_dropout) 81 | hidden_layer = self.l_in 82 | 83 | # Build hidden layers 84 | for l in self.layers: 85 | hidden_layer = lasagne.layers.DenseLayer(hidden_layer, num_units=l) 86 | if self.dropout: 87 | hidden_layer = lasagne.layers.dropout(hidden_layer, p=self.dropout) 88 | 89 | # The sliced output is then passed through linear layer to obtain the right output size 90 | self.l_out = 
lasagne.layers.DenseLayer(hidden_layer, num_units=self.n_items, nonlinearity=lasagne.nonlinearities.sigmoid) 91 | 92 | # lasagne.layers.get_output produces a variable for the output of the net 93 | network_output = lasagne.layers.get_output(self.l_out) 94 | 95 | # loss function 96 | self.targets = T.fmatrix('multiple_target_output') 97 | self.theano_inputs = [self.l_in.input_var, self.targets] 98 | 99 | self.cost = T.sqr(network_output - self.targets).mean() 100 | 101 | def _compile_predict_function(self): 102 | ''' Compile self.predict, the deterministic rnn that output the prediction at the end of the sequence 103 | ''' 104 | print("Compiling...") 105 | deterministic_output = lasagne.layers.get_output(self.l_out, deterministic=True) 106 | self.predict_function = theano.function([self.l_in.input_var], deterministic_output, allow_input_downcast=True) 107 | print("Compilation done.") 108 | 109 | def _compile_test_function(self): 110 | ''' Compile self.test_function, the deterministic rnn that output the precision@10 111 | ''' 112 | print("Compiling test...") 113 | deterministic_output = lasagne.layers.get_output(self.l_out, deterministic=True) 114 | if self.interactions_are_unique: 115 | deterministic_output *= (1 - self.l_in.input_var) 116 | theano_test_function = theano.function(self.theano_inputs, deterministic_output, allow_input_downcast=True, name="Test_function", on_unused_input='ignore') 117 | 118 | def test_function(theano_inputs, k=10): 119 | output = theano_test_function(*theano_inputs) 120 | ids = np.argpartition(-output, range(k), axis=-1)[0, :k] 121 | 122 | return ids 123 | 124 | self.test_function = test_function 125 | 126 | def _gen_mini_batch(self, sequence_generator, test=False, **kwargs): 127 | ''' Takes a sequence generator and produce a mini batch generator. 128 | The mini batch have a size defined by self.batch_size, and have format of the input layer of the rnn. 129 | 130 | Assuming that the length of the sequence is bigger than the size of the batch, each batch is created based on one sequence. 
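For intuition, a minimal numpy sketch of the denoising setup used for one training example (one-hot input with items randomly dropped, full one-hot target; the item ids, sizes and dropout rate are toy values):
    import numpy as np
    n_items, input_dropout = 6, 0.2
    user_items = [0, 2, 5]                       # items of one training sequence
    target = np.zeros(n_items)
    target[user_items] = 1
    kept = [i for i in user_items if np.random.random() >= input_dropout]
    corrupted = np.zeros(n_items)
    corrupted[kept] = 1                          # the network must reconstruct the dropped items
    print(corrupted, target)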
131 | ''' 132 | 133 | while True: 134 | 135 | # Shape return variables 136 | X = np.zeros((self.batch_size, self._input_size())) # input of the RNN 137 | Y = np.zeros((self.batch_size, self._input_size())) # Target of the RNN 138 | 139 | for j in range(self.batch_size): 140 | 141 | sequence, user_id = next(sequence_generator) 142 | if not test: 143 | X[j,:] = self._one_hot_encoding([i[0] for i in sequence if (np.random.random() >= self.input_dropout)]) 144 | Y[j, :] = self._one_hot_encoding([i[0] for i in sequence]) 145 | yield (X.astype(theano.config.floatX),Y.astype(theano.config.floatX)) 146 | else: 147 | X[j, :] = self._one_hot_encoding([i[0] for i in sequence[:len(sequence)/2]]) 148 | Y[j, :] = self._one_hot_encoding(sequence[len(sequence)/2][0]) 149 | yield (X.astype(theano.config.floatX),Y.astype(theano.config.floatX)), [i[0] for i in sequence[len(sequence)/2:]] 150 | 151 | def _one_hot_encoding(self, ids): 152 | ohe = np.zeros(self._input_size()) 153 | ohe[ids] = 1 154 | return ohe 155 | 156 | def _input_size(self): 157 | ''' Returns the number of input neurons 158 | ''' 159 | return self.n_items 160 | 161 | -------------------------------------------------------------------------------- /factorization/fism.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from mf_base import MFBase 12 | 13 | class FISM(MFBase): 14 | ''' Implementation of the algorithm presented in "FISM : Factored Item Similarity Models for Top-N Recommender Systems", by Santosh K.. et al., 2013. 15 | ''' 16 | 17 | def __init__(self, k = 100, alpha=0.5, loss="auc", **kwargs): 18 | 19 | super(FISM, self).__init__(**kwargs) 20 | 21 | self.name = 'FISM' 22 | self.k = k 23 | self.loss = loss 24 | if loss not in ['RMSE', 'BPR']: 25 | raise ValueError('Unknown loss for FISM: ', loss) 26 | self.alpha = alpha 27 | 28 | def _get_model_filename(self, epochs): 29 | '''Return the name of the file to save the current model 30 | ''' 31 | filename = "fism_" + self.loss + "_ne"+str(epochs)+"_lr"+str(self.init_learning_rate)+"_an"+str(self.annealing_rate)+"_k"+str(self.k)+"_reg"+str(self.reg)+"_ini"+str(self.init_sigma) 32 | 33 | return filename+".npz" 34 | 35 | def init_model(self): 36 | ''' Initialize the model parameters 37 | ''' 38 | self.V = self.init_sigma * np.random.randn(self.n_items, self.k).astype(np.float32) 39 | self.H = self.init_sigma * np.random.randn(self.n_items, self.k).astype(np.float32) 40 | self.bias = np.zeros(self.n_items).astype(np.float32) 41 | 42 | def item_score(self, user_items, item = None): 43 | ''' Compute the prediction score of the FISM model for the item "item", based on the list of items "user_items". 44 | ''' 45 | if item is not None: 46 | return self.bias[item] + np.power(len(user_items), -self.alpha) * np.dot(self.V[user_items, :].sum(axis=0), self.H[item, :]) 47 | else: 48 | return self.bias + np.power(len(user_items), -self.alpha) * np.dot(self.V[user_items, :].sum(axis=0), self.H.T) 49 | 50 | def auc_sgd_step(self, user_items, true_item, false_item): 51 | ''' Make one SGD update, given that the interaction between user and true_item exists, 52 | but the one between user and false_item does not. 53 | user, true_item and false_item are all user or item ids. 
54 | 55 | return error 56 | ''' 57 | 58 | # Compute error 59 | x_true = self.item_score(user_items, true_item) 60 | x_false = self.item_score(user_items, false_item) 61 | delta = 1 - 1 / (1 + math.exp(min(10, max(-10, x_false - x_true)))) # Original BPR error 62 | #delta = (x_true - x_false - 1) # error proposed in the FISM paper 63 | 64 | # Update CF 65 | V_sum = self.V[user_items, :].sum(axis=0) 66 | self.V[user_items, :] += self.learning_rate * ( delta * np.power(len(user_items), -self.alpha) * (self.H[true_item, :] - self.H[false_item, :]) - self.reg * self.V[user_items, :]) 67 | self.H[true_item, :] += self.learning_rate * ( delta * np.power(len(user_items), -self.alpha) * V_sum - self.reg * self.H[true_item, :]) 68 | self.H[false_item, :] += self.learning_rate * ( -delta * np.power(len(user_items), -self.alpha) * V_sum - self.reg * self.H[false_item, :]) 69 | self.bias[true_item] += self.learning_rate * (delta - self.reg * self.bias[true_item]) 70 | self.bias[false_item] += self.learning_rate * (- delta - self.reg * self.bias[false_item]) 71 | 72 | return delta 73 | 74 | def rmse_sgd_step(self, user_items, item, rating): 75 | ''' 76 | 77 | return error 78 | ''' 79 | 80 | # Compute error 81 | prediction = self.item_score(user_items, item) 82 | delta = (rating - prediction) # error proposed in the FISM paper 83 | 84 | print(delta) 85 | if delta != delta: 86 | raise ValueError('NaN') 87 | 88 | # print(prediction) 89 | # y = raw_input() 90 | 91 | # Update CF 92 | V_sum = self.V[user_items, :].sum(axis=0) 93 | 94 | self.V[user_items, :] += self.learning_rate * ( delta * np.power(len(user_items), -self.alpha) * self.H[item, :] - self.reg * self.V[user_items, :]) 95 | self.H[item, :] += self.learning_rate * ( delta * np.power(len(user_items), -self.alpha) * V_sum - self.reg * self.H[item, :]) 96 | self.bias[item] += self.learning_rate * ( delta - self.reg * self.bias[item]) 97 | 98 | return delta 99 | 100 | def get_auc_training_sample(self): 101 | '''Pick a random triplet from self.triplets and a random false next item. 
102 | returns a tuple of ids : (user_items, true_item, false_item) 103 | ''' 104 | 105 | user_id = random.randrange(self.n_users) 106 | while self.users[user_id,1] < 2: 107 | user_id = random.randrange(self.n_users) 108 | user_items = self.items[self.users[user_id,0]:self.users[user_id,0]+self.users[user_id,1]] 109 | 110 | true_item = random.choice(user_items) 111 | 112 | false_item = random.randrange(self.n_items) 113 | while false_item in user_items: 114 | false_item = random.randrange(self.n_items) 115 | 116 | return ([i for i in user_items if i is not true_item], true_item, false_item) 117 | 118 | def get_rmse_training_sample(self): 119 | 120 | neg_to_pos_ratio = 3 121 | user_items, true_item, false_item = self.get_auc_training_sample() 122 | 123 | if random.random() < 1 / (neg_to_pos_ratio + 1): 124 | return (user_items, true_item, 1) 125 | else: 126 | return (user_items, false_item, 0) 127 | 128 | 129 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 130 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 131 | ''' 132 | 133 | if exclude is None: 134 | exclude = [] 135 | 136 | user_items = [i[0] for i in sequence] 137 | output = self.item_score(user_items) 138 | 139 | # Put low similarity to viewed items to exclude them from recommendations 140 | output[[i[0] for i in sequence]] = -np.inf 141 | output[exclude] = -np.inf 142 | 143 | # find top k according to output 144 | return list(np.argpartition(-output, range(k))[:k]) 145 | 146 | def training_step(self, iterations): 147 | if self.loss == "BPR": 148 | return self.auc_sgd_step(*self.get_auc_training_sample()) 149 | else: 150 | return self.rmse_sgd_step(*self.get_rmse_training_sample()) 151 | 152 | def save(self, filename): 153 | '''Save the parameters of a network into a file 154 | ''' 155 | print('Save model in ' + filename) 156 | if not os.path.exists(os.path.dirname(filename)): 157 | os.makedirs(os.path.dirname(filename)) 158 | np.savez(filename, V=self.V, H=self.H, bias=self.bias) 159 | 160 | 161 | def load(self, filename): 162 | '''Load parameters values form a file 163 | ''' 164 | f = np.load(filename) 165 | self.V = f['V'] 166 | self.H = f['H'] 167 | self.bias = f['bias'] -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import time 8 | import random 9 | import argparse 10 | import re 11 | import glob 12 | import sys 13 | import os 14 | import copy 15 | # import matplotlib.pyplot as plt 16 | from helpers.data_handling import DataHandler 17 | from helpers import evaluation 18 | import helpers.command_parser as parse 19 | 20 | 21 | def get_file_name(predictor, args): 22 | return args.dir + re.sub('_ml'+str(args.max_length), '_ml'+str(args.training_max_length), predictor._get_model_filename(args.number_of_batches)) 23 | 24 | def find_models(predictor, dataset, args): 25 | if args.method == "UKNN" or args.method == "MM" or args.method == "POP": 26 | return None 27 | 28 | file = dataset.dirname + "models/" + get_file_name(predictor, args) 29 | print(file) 30 | if args.number_of_batches == "*": 31 | file = np.array(glob.glob(file)) 32 | 33 | return file 34 | 35 | def save_file_name(predictor, dataset, args): 36 | if not args.save: 37 | return None 38 | else: 39 | file = re.sub('_ne\*_', '_', 
dataset.dirname + 'results/' + get_file_name(predictor, args)) 40 | return file 41 | 42 | def run_tests(predictor, model_file, dataset, args, get_full_recommendation_list=False, k=10): 43 | # Load model 44 | predictor.load(model_file) 45 | #predictor.load_last(os.path.dirname(model_file) + '/') 46 | # Prepare evaluator 47 | evaluator = evaluation.Evaluator(dataset, k=k) 48 | 49 | if get_full_recommendation_list: 50 | k = dataset.n_items 51 | 52 | count = 0 53 | nb_of_dp = [] 54 | start = time.clock() 55 | for sequence, user_id in dataset.test_set(epochs=1): 56 | count += 1 57 | num_viewed = int(len(sequence) / 2) 58 | viewed = sequence[:num_viewed] 59 | goal = [i[0] for i in sequence[num_viewed:]] 60 | 61 | if args.clusters > 0: 62 | recommendations, n = predictor.top_k_recommendations(viewed, user_id=user_id, k=k) 63 | nb_of_dp.append(n) 64 | else: 65 | recommendations = predictor.top_k_recommendations(viewed, user_id=user_id, k=k) 66 | 67 | evaluator.add_instance(goal, recommendations) 68 | 69 | if len(goal) == 0: 70 | raise ValueError 71 | end = time.clock() 72 | print('Timer: ', end-start) 73 | if len(nb_of_dp) == 0: 74 | evaluator.nb_of_dp = dataset.n_items 75 | else: 76 | evaluator.nb_of_dp = np.mean(nb_of_dp) 77 | return evaluator 78 | 79 | def print_results(ev, metrics, plot=True, file=None, n_batches=None, print_full_rank_comparison=False): 80 | 81 | for m in metrics: 82 | if m not in ev.metrics: 83 | raise ValueError('Unkown metric: ' + m) 84 | 85 | print(m+'@'+str(ev.k)+': ', ev.metrics[m]()) 86 | 87 | if file != None: 88 | if not os.path.exists(os.path.dirname(file)): 89 | os.makedirs(os.path.dirname(file)) 90 | with open(file, "a") as f: 91 | f.write(str(n_batches)+"\t".join(map(str, [ev.metrics[m]() for m in metrics])) + "\n") 92 | if print_full_rank_comparison: 93 | with open(file+"_full_rank", "a") as f: 94 | for data in ev.get_rank_comparison(): 95 | f.write("\t".join(map(str, data)) + "\n") 96 | else: 97 | print("-\t" + "\t".join(map(str, [ev.metrics[m]() for m in metrics])), file=sys.stderr) 98 | if print_full_rank_comparison: 99 | with open(file+"_full_rank", "a") as f: 100 | for data in ev.get_rank_comparison(): 101 | f.write("\t".join(map(str, data)) + "\n") 102 | 103 | def extract_number_of_epochs(filename): 104 | m = re.search('_ne([0-9]+(\.[0-9]+)?)_', filename) 105 | return float(m.group(1)) 106 | 107 | def get_last_tested_batch(filename): 108 | '''If the output file exist already, it will look at the content of the file and return the last batch that was tested. 109 | This is used to avoid testing to times the same model. 110 | ''' 111 | 112 | if filename is not None and os.path.isfile(filename): 113 | with open(filename) as f: 114 | for line in f: 115 | pass 116 | return float(line.split()[0]) 117 | else: 118 | return 0 119 | 120 | def test_command_parser(parser): 121 | 122 | parser.add_argument('-d', dest='dataset', help='Directory name of the dataset.', default='', type=str) 123 | parser.add_argument('-i', dest='number_of_batches', help='Number of epochs, if not set it will compare all the available models', default=-1, type=int) 124 | parser.add_argument('-k', dest='nb_of_predictions', help='Number of predictions to make. 
It is the "k" in "prec@k", "rec@k", etc.', default=10, type=int) 125 | parser.add_argument('--metrics', help='List of metrics to compute, comma separated', default='sps,recall,item_coverage,user_coverage,blockbuster_share', type=str) 126 | parser.add_argument('--save', help='Save results to a file', action='store_true') 127 | parser.add_argument('--dir', help='Model directory.', default="", type=str) 128 | parser.add_argument('--save_rank', help='Save the full comparison of goal and prediction ranking.', action='store_true') 129 | 130 | def main(): 131 | 132 | args = parse.command_parser(parse.predictor_command_parser, test_command_parser) 133 | 134 | args.training_max_length = args.max_length 135 | # args.max_length = int(DATA_HANDLER.max_length/2) 136 | if args.number_of_batches == -1: 137 | args.number_of_batches = "*" 138 | 139 | dataset = DataHandler(dirname=args.dataset) 140 | predictor = parse.get_predictor(args) 141 | predictor.prepare_model(dataset) 142 | file = find_models(predictor, dataset, args) 143 | 144 | if args.number_of_batches == "*" and args.method != "UKNN" and args.method != "MM" and args.method != "POP": 145 | 146 | output_file = save_file_name(predictor, dataset, args) 147 | 148 | last_tested_batch = get_last_tested_batch(output_file) 149 | batches = np.array(map(extract_number_of_epochs, file)) 150 | sorted_ids = np.argsort(batches) 151 | batches = batches[sorted_ids] 152 | file = file[sorted_ids] 153 | for i, f in enumerate(file): 154 | if batches[i] > last_tested_batch: 155 | evaluator = run_tests(predictor, f, dataset, args, get_full_recommendation_list=args.save_rank, k=args.nb_of_predictions) 156 | print('-------------------') 157 | print('(',i+1 ,'/', len(file),') results on ' + f) 158 | print_results(evaluator, args.metrics.split(','), plot=False, file=output_file, n_batches=batches[i], print_full_rank_comparison=args.save_rank) 159 | else: 160 | evaluator = run_tests(predictor, file, dataset, args, get_full_recommendation_list=args.save_rank, k=args.nb_of_predictions) 161 | print_results(evaluator, args.metrics.split(','), file=save_file_name(predictor, dataset, args), print_full_rank_comparison=args.save_rank) 162 | 163 | if __name__ == '__main__': 164 | main() -------------------------------------------------------------------------------- /factorization/fossil.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from mf_base import MFBase 12 | 13 | class Fossil(MFBase): 14 | ''' Implementation of the algorithm presented in "Fusing Similarity Models with Markov Chains for Sparse Sequential Recommendation", by He R. and McAuley J., 2016. 
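# Illustrative sketch (not part of the original file): when number_of_batches is "*",
# main() in test.py above globs all saved checkpoints and evaluates them in epoch
# order, using the "_ne<epochs>_" token embedded in each filename. A standalone
# version of that ordering, with hypothetical filenames:
import re

def epochs_of(filename):
    m = re.search(r'_ne([0-9]+(\.[0-9]+)?)_', filename)
    return float(m.group(1))

checkpoints = ['fpmc_ne10.0_lr0.05_reg0.0025_ini1.npz', 'fpmc_ne2.5_lr0.05_reg0.0025_ini1.npz']
print(sorted(checkpoints, key=epochs_of))  # earliest epochs first, as in main()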
15 | ''' 16 | 17 | def __init__(self, k = 32, order=1, alpha=0.2, **kwargs): 18 | 19 | super(Fossil, self).__init__(**kwargs) 20 | 21 | self.name = 'Fossil' 22 | self.k = k 23 | self.order = order #markov chain order 24 | self.alpha = alpha 25 | 26 | def _get_model_filename(self, epochs): 27 | '''Return the name of the file to save the current model 28 | ''' 29 | filename = "fossil_ne"+str(epochs)+"_lr"+str(self.init_learning_rate)+"_an"+str(self.annealing_rate)+"_k"+str(self.k)+"_o"+str(self.order)+"_reg"+str(self.reg)+"_ini"+str(self.init_sigma) 30 | 31 | return filename+".npz" 32 | 33 | def init_model(self): 34 | ''' Initialize the model parameters 35 | ''' 36 | self.V = self.init_sigma * np.random.randn(self.n_items, self.k).astype(np.float32) 37 | self.H = self.init_sigma * np.random.randn(self.n_items, self.k).astype(np.float32) 38 | self.eta = self.init_sigma * np.random.randn(self.n_users, self.order).astype(np.float32) 39 | self.eta_bias = np.zeros(self.order).astype(np.float32) 40 | self.bias = np.zeros(self.n_items).astype(np.float32) 41 | 42 | def item_score(self, user_id, user_items, item=None): 43 | ''' Compute the prediction score of the Fossil model for the item "item", based on the list of items "user_items". 44 | ''' 45 | 46 | long_term = np.power(len(user_items), -self.alpha) * self.V[user_items, :].sum(axis=0) 47 | effective_order = min(self.order, len(user_items)) 48 | if user_id is None: 49 | short_term = np.dot((self.eta_bias + self.eta.mean(axis=0))[:effective_order], self.V[user_items[:-effective_order-1:-1], :]) 50 | else: 51 | short_term = np.dot((self.eta_bias + self.eta[user_id, :])[:effective_order], self.V[user_items[:-effective_order-1:-1], :]) 52 | 53 | if item is not None: 54 | return self.bias[item] + np.dot(long_term + short_term, self.H[item, :]) 55 | else: 56 | return self.bias + np.dot(long_term + short_term, self.H.T) 57 | 58 | def sgd_step(self, user_id, user_items, false_item): 59 | ''' Make one SGD update, given that the interaction between user and true_item exists, 60 | but the one between user and false_item does not. 
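# Illustrative sketch (not part of the original file): item_score above adds a
# FISM-style long-term term (the sum of V over the whole history, damped by
# len(history)**-alpha) to a short-term term that weights the V vectors of the last
# `order` items by the per-user eta (the original also adds a shared eta_bias).
# A toy numpy version with made-up shapes, assuming the history holds at least
# `order` items:
import numpy as np

n_items, k, order, alpha = 6, 4, 2, 0.2
rng = np.random.RandomState(0)
V, H = rng.randn(n_items, k), rng.randn(n_items, k)
bias, eta = np.zeros(n_items), rng.randn(order)

history = [3, 1, 4]                                  # item ids, oldest to newest
long_term = len(history) ** -alpha * V[history].sum(axis=0)
short_term = eta.dot(V[history[::-1][:order]])       # last `order` items, newest first
scores = bias + (long_term + short_term).dot(H.T)    # one score per candidate item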
61 | 62 | return error 63 | ''' 64 | 65 | true_item = user_items[-1] 66 | user_items = user_items[:-1] 67 | effective_order = min(self.order, len(user_items)) 68 | 69 | long_term = np.power(len(user_items), -self.alpha) * self.V[user_items, :].sum(axis=0) 70 | short_term = np.dot((self.eta_bias + self.eta[user_id, :])[:effective_order], self.V[user_items[:-effective_order-1:-1], :]) 71 | 72 | # Compute error 73 | x_true = self.item_score(user_id, user_items, true_item) 74 | x_false = self.item_score(user_id, user_items, false_item) 75 | delta = 1 / (1 + math.exp(-min(10, max(-10, x_false - x_true)))) # sigmoid of the error 76 | 77 | # Compute Update 78 | V_update = self.learning_rate * ( delta * np.power(len(user_items), -self.alpha) * (self.H[true_item, :] - self.H[false_item, :]) - self.reg * self.V[user_items, :]) 79 | V_update2 = self.learning_rate * delta * np.outer((self.eta_bias + self.eta[user_id, :])[:effective_order], self.H[true_item, :] - self.H[false_item, :]) 80 | H_true_up = self.learning_rate * ( delta * (long_term + short_term) - self.reg * self.H[true_item, :]) 81 | H_false_up = self.learning_rate * ( -delta * (long_term + short_term) - self.reg * self.H[false_item, :]) 82 | bias_true_up = self.learning_rate * (delta - self.reg * self.bias[true_item]) 83 | bias_false_up = self.learning_rate * (- delta - self.reg * self.bias[false_item]) 84 | eta_bias_up = self.learning_rate * (delta * np.dot(self.V[user_items[:-effective_order-1:-1], :], self.H[true_item, :] - self.H[false_item, :]) - self.reg * self.eta_bias[:effective_order]) 85 | eta_up = self.learning_rate * (delta * np.dot(self.V[user_items[:-effective_order-1:-1], :], self.H[true_item, :] - self.H[false_item, :]) - self.reg * self.eta[user_id, :effective_order]) 86 | 87 | 88 | # Update 89 | self.V[user_items, :] += V_update 90 | self.V[user_items[:-effective_order-1:-1], :] += V_update2 91 | self.H[true_item, :] += H_true_up 92 | self.H[false_item, :] += H_false_up 93 | self.bias[true_item] += bias_true_up 94 | self.bias[false_item] += bias_false_up 95 | self.eta_bias[:effective_order] += eta_bias_up 96 | self.eta[user_id, :effective_order] += eta_up 97 | 98 | return delta 99 | 100 | def get_training_sample(self): 101 | '''Pick a random triplet from self.triplets and a random false next item. 
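# Illustrative sketch (not part of the original file): the sampler described in this
# docstring (implemented just below) draws a user with at least two interactions,
# cuts the sequence at a random position, and rejection-samples a "false" next item
# that the user has not seen up to that point. A standalone toy version, assuming
# `histories` is a hypothetical list of per-user item-id lists and every user has
# consumed far fewer than n_items items:
import random

def sample_triplet(histories, n_items):
    user = random.randrange(len(histories))
    while len(histories[user]) < 2:
        user = random.randrange(len(histories))
    items = histories[user]
    t = random.randrange(1, len(items))      # items[:t] is the history, items[t] the true next
    false_item = random.randrange(n_items)
    while false_item in items[:t + 1]:       # reject items the user already interacted with
        false_item = random.randrange(n_items)
    return user, items[:t + 1], false_item   # prefix ends with the true next item, as below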
102 | returns a tuple of ids : (user_items, true_item, false_item) 103 | ''' 104 | 105 | user_id = random.randrange(self.n_users) 106 | while self.users[user_id,1] < 2: 107 | user_id = random.randrange(self.n_users) 108 | user_items = self.items[self.users[user_id,0]:self.users[user_id,0]+self.users[user_id,1]] 109 | 110 | t = random.randrange(1, len(user_items)) 111 | 112 | false_item = random.randrange(self.n_items) 113 | while false_item in user_items[:t+1]: 114 | false_item = random.randrange(self.n_items) 115 | 116 | return (user_id, user_items[:t+1], false_item) 117 | 118 | 119 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 120 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 121 | ''' 122 | 123 | if exclude is None: 124 | exclude = [] 125 | 126 | user_items = [i[0] for i in sequence] 127 | output = self.item_score(user_id, user_items) 128 | 129 | # Put low similarity to viewed items to exclude them from recommendations 130 | output[[i[0] for i in sequence]] = -np.inf 131 | output[exclude] = -np.inf 132 | 133 | # find top k according to output 134 | return list(np.argpartition(-output, range(k))[:k]) 135 | 136 | def training_step(self, iterations): 137 | return self.sgd_step(*self.get_training_sample()) 138 | 139 | def save(self, filename): 140 | '''Save the parameters of a network into a file 141 | ''' 142 | print('Save model in ' + filename) 143 | if not os.path.exists(os.path.dirname(filename)): 144 | os.makedirs(os.path.dirname(filename)) 145 | np.savez(filename, V=self.V, H=self.H, bias=self.bias, eta=self.eta, eta_bias=self.eta_bias) 146 | 147 | def load(self, filename): 148 | '''Load parameters values form a file 149 | ''' 150 | f = np.load(filename) 151 | self.V = f['V'] 152 | self.H = f['H'] 153 | self.bias = f['bias'] 154 | self.eta = f['eta'] 155 | self.eta_bias = f['eta_bias'] -------------------------------------------------------------------------------- /neural_networks/rnn_margin.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import cPickle 8 | import random 9 | from time import time 10 | import rnn_base as rnn 11 | from sparse_lstm import * 12 | 13 | class RNNMargin(rnn.RNNBase): 14 | ''' 15 | 16 | OPTIONS 17 | ------- 18 | balance: float, default 1, balance between the weight of false negative and false positive on the loss function. 19 | e.g. if balance = 1, both have the same weight, 20 | if balance = 0, only false negative have an impact, 21 | if balance = 2, false positive have twice as much weight as false negative. 22 | popularity_based: bool, default False, choose wether the target value of negatives depends on their popularity. 23 | if False, the target value of all negatives is 0 (versus 1 for the positives) 24 | if True, the target value of item i is min(1 - p_i, (1 - min_access) * p_i / min_access), where p_i = fraction of users who consumed that item. 25 | min_access: float in (0,1), default 0.05, parameter used in the formula for the target value of negatives in the popularity based case. 26 | Represent the minimum fraction of users that has access to any item. 27 | e.g. min_access=0.05 means that there is no item accessible by less than 5% of the users. 28 | n_targets: int or inf, default 1, number of items in the continuation of the sequence that will be used as positive target. 
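# Illustrative sketch (not part of the original file): with popularity_based=True the
# target of a negative item i is min(1 - p_i, (1 - min_access) * p_i / min_access),
# where p_i is the fraction of users who consumed item i (see _default_target further
# down). A toy numpy version with made-up popularities:
import numpy as np

min_access = 0.05
p = np.array([0.001, 0.05, 0.30, 0.90])   # hypothetical fraction of users per item
negative_targets = np.minimum(1 - p, (1 - min_access) * p / min_access)
# Items rarer than min_access get a target well below 1 - p_i; items consumed by at
# least a fraction min_access of the users get the full 1 - p_i.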
29 | 30 | ''' 31 | 32 | def __init__(self, loss_function="hinge", balance=1., popularity_based=False, min_access=0.05, n_targets=1, **kwargs): 33 | super(RNNMargin, self).__init__(**kwargs) 34 | 35 | self.balance = balance 36 | self.popularity_based = popularity_based 37 | self.min_access = min_access 38 | self.n_targets = n_targets 39 | if loss_function is None: 40 | loss_function = "hinge" 41 | self.loss_function_name = loss_function 42 | if loss_function == "hinge": 43 | self.loss_function = self._hinge_loss 44 | elif loss_function == "logit": 45 | self.loss_function = self._logit_loss 46 | elif loss_function == "logsig": 47 | self.loss_function = self._logsigmoid_loss 48 | else: 49 | raise ValueError('Unknown loss function') 50 | 51 | self.name = "RNN multi-targets" 52 | 53 | def _get_model_filename(self, epochs): 54 | '''Return the name of the file to save the current model 55 | ''' 56 | filename = "rnn_multitarget_"+self.loss_function_name+"_b"+str(self.balance) 57 | if self.popularity_based: 58 | filename += '_pb_ma'+str(self.min_access) 59 | return filename + "_" + self._common_filename(epochs) 60 | 61 | def _hinge_loss(self, predictions, targets, weights): 62 | return T.nnet.relu((predictions - targets) * weights).sum(axis=-1) 63 | 64 | def _logit_loss(self, predictions, targets, weights): 65 | return (T.nnet.sigmoid(predictions - targets) * weights).sum(axis=-1) 66 | 67 | def _logsigmoid_loss(self, predictions, targets, weights): 68 | return -T.log(T.nnet.sigmoid((targets - predictions) * weights)).sum(axis=-1) 69 | 70 | def _prepare_networks(self, n_items): 71 | ''' Prepares the building blocks of the RNN, but does not compile them: 72 | self.l_in : input layer 73 | self.l_mask : mask of the input layer 74 | self.target : target of the network 75 | self.l_out : output of the network 76 | self.cost : cost function 77 | ''' 78 | 79 | self.n_items = n_items 80 | 81 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 82 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length, self._input_size())) 83 | # The input is completed by a mask to inform the LSTM of the length of the sequence 84 | self.l_mask = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length)) 85 | 86 | # recurrent layer 87 | if not self.use_movies_features: 88 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=self.n_items + self._n_optional_features(), only_return_final=True) 89 | else: 90 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=None, only_return_final=True) 91 | 92 | # l_last_slice gets the last output of the recurrent layer 93 | l_last_slice = l_recurrent 94 | # l_last_slice = lasagne.layers.SliceLayer(l_recurrent, -1, 1) 95 | 96 | # Theano tensor for the targets 97 | target = T.fmatrix('multiple_target_output') 98 | target_weight = T.fmatrix('target_weight') 99 | self.exclude = T.fmatrix('excluded_items') 100 | self.theano_inputs = [self.l_in.input_var, self.l_mask.input_var, target, target_weight, self.exclude] 101 | 102 | # The sliced output is then passed through linear layer to obtain the right output size 103 | self.l_out = lasagne.layers.DenseLayer(l_last_slice, num_units=self.n_items, nonlinearity=None) 104 | 105 | # lasagne.layers.get_output produces a variable for the output of the net 106 | network_output = lasagne.layers.get_output(self.l_out) 107 | 108 | # loss function 109 | self.cost = self.loss_function(network_output, target, target_weight).mean() 
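# Illustrative sketch (not part of the original file): a plain-numpy equivalent of the
# hinge cost defined in _hinge_loss above. The weights are built in _prepare_input
# below: target items receive weight -1 (penalise scoring below their target value)
# and the remaining items receive a small positive weight (penalise scoring above
# their target).
import numpy as np

def hinge_loss(predictions, targets, weights):
    return np.maximum((predictions - targets) * weights, 0.0).sum(axis=-1)

preds   = np.array([[0.2, -0.3, 0.8]])
targets = np.array([[1.0,  0.0, 0.0]])    # item 0 is the positive target
weights = np.array([[-1.0, 0.01, 0.01]])  # -1 on the positive, small weight on negatives
print(hinge_loss(preds, targets, weights))  # penalises item 0 (below 1.0) and item 2 (above 0.0)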
110 | 111 | 112 | def _prepare_input(self, sequences): 113 | ''' Sequences is a list of [user_id, input_sequence, targets] 114 | ''' 115 | 116 | batch_size = len(sequences) 117 | 118 | # Shape return variables 119 | X = np.zeros((batch_size, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN 120 | mask = np.zeros((batch_size, self.max_length)) # mask of the input (to deal with sequences of different length) 121 | Y = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 122 | weight = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 123 | exclude = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 124 | 125 | 126 | for i, sequence in enumerate(sequences): 127 | user_id, in_seq, target = sequence 128 | seq_features = np.array(map(lambda x: self._get_features(x, user_id), in_seq)) 129 | X[i, :len(in_seq), :] = seq_features # Copy sequences into X 130 | mask[i, :len(in_seq)] = 1 131 | exclude[i, [j[0] for j in in_seq]] = 1 132 | 133 | # compute weight for false positive 134 | w = self.balance * len(target) / (self.n_items - len(target) - len(in_seq)) 135 | 136 | weight[i,:] = w 137 | weight[i, [t[0] for t in target]] = -1 138 | if self.interactions_are_unique: 139 | weight[i, [t[0] for t in in_seq]] = 0 140 | 141 | 142 | Y[i, :] = self._default_target() 143 | Y[i, [t[0] for t in target]] = 1 144 | if self.interactions_are_unique: 145 | Y[i, [t[0] for t in in_seq]] = 0 146 | 147 | 148 | # weight *= 10e3 149 | return (X, mask.astype(theano.config.floatX), Y, weight, exclude) 150 | 151 | def _default_target(self): 152 | 153 | if not hasattr(self, '__default_target'): 154 | if not self.popularity_based: 155 | self.__default_target = np.zeros(self.n_items) 156 | else: 157 | num_users = self.dataset.training_set.n_users 158 | view_prob = self.dataset.item_popularity / num_users 159 | self.__default_target = np.minimum(1 - view_prob, (1 - self.min_access) * view_prob / self.min_access) 160 | 161 | return self.__default_target 162 | -------------------------------------------------------------------------------- /factorization/fpmc.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from helpers import evaluation 12 | from mf_base import MFBase 13 | 14 | class FPMC(MFBase): 15 | ''' Implementation of the algorithm presented in "Factorizing personalized Markov chains for next-basket recommendation", by Rendle S. et al., 2010. 16 | 17 | The adaptive sampling algorithm is adapted from "Improving pairwise learning for item recommendation from implicit feedback", by Rendle S. et al., 2014 18 | ''' 19 | 20 | def __init__(self, k_cf = 32, k_mc = 32, adaptive_sampling=True, sampling_bias=500, **kwargs): 21 | 22 | super(FPMC, self).__init__(**kwargs) 23 | 24 | self.name = 'FPMC' 25 | self.k_cf = k_cf 26 | self.k_mc = k_mc 27 | self.adaptive_sampling = adaptive_sampling 28 | self.sampling_bias = sampling_bias # lambda parameter in "Improving pairwise learning for item recommendation from implicit feedback", by Rendle S. 
et al., 2014 29 | self.max_length = np.inf # For compatibility with the RNNs 30 | 31 | def _get_model_filename(self, epochs): 32 | '''Return the name of the file to save the current model 33 | ''' 34 | filename = "fpmc_ne"+str(epochs)+"_lr"+str(self.init_learning_rate)+"_an"+str(self.annealing_rate)+"_kcf"+str(self.k_cf)+"_kmc"+str(self.k_mc)+"_reg"+str(self.reg)+"_ini"+str(self.init_sigma) 35 | if self.adaptive_sampling: 36 | filename += "_as"+str(self.sampling_bias) 37 | return filename+".npz" 38 | 39 | def init_model(self): 40 | ''' Initialize the model parameters 41 | ''' 42 | self.V_user_item = self.init_sigma * np.random.randn(self.n_users, self.k_cf).astype(np.float32) 43 | self.V_item_user = self.init_sigma * np.random.randn(self.n_items, self.k_cf).astype(np.float32) 44 | self.V_prev_next = self.init_sigma * np.random.randn(self.n_items, self.k_mc).astype(np.float32) 45 | self.V_next_prev = self.init_sigma * np.random.randn(self.n_items, self.k_mc).astype(np.float32) 46 | 47 | def sgd_step(self, user, prev_item, true_next, false_next): 48 | ''' Make one SGD update, given that the transition from prev_item to true_next exist in user history, 49 | But the transition prev_item to false_next does not exist. 50 | user, prev_item, true_next and false_next are all user or item ids. 51 | 52 | return error 53 | ''' 54 | 55 | # Compute error 56 | x_true = np.dot(self.V_user_item[user, :], self.V_item_user[true_next, :]) + np.dot(self.V_prev_next[prev_item, :], self.V_next_prev[true_next, :]) 57 | x_false = np.dot(self.V_user_item[user, :], self.V_item_user[false_next, :]) + np.dot(self.V_prev_next[prev_item, :], self.V_next_prev[false_next, :]) 58 | delta = 1 - 1 / (1 + math.exp(min(10, max(-10, x_false - x_true)))) # Bound x_true - x_false in [-10, 10] to avoid overflow 59 | 60 | # Update CF 61 | V_user_item_mem = self.V_user_item[user, :] 62 | self.V_user_item[user, :] += self.learning_rate * ( delta * (self.V_item_user[true_next, :] - self.V_item_user[false_next, :]) - self.reg * self.V_user_item[user, :]) 63 | self.V_item_user[true_next, :] += self.learning_rate * ( delta * V_user_item_mem - self.reg * self.V_item_user[true_next, :]) 64 | self.V_item_user[false_next, :] += self.learning_rate * ( -delta * V_user_item_mem - self.reg * self.V_item_user[false_next, :]) 65 | 66 | # Update MC 67 | V_prev_next_mem = self.V_prev_next[prev_item, :] 68 | self.V_prev_next[prev_item, :] += self.learning_rate * ( delta * (self.V_next_prev[true_next, :] - self.V_next_prev[false_next, :]) - self.reg * self.V_prev_next[prev_item, :]) 69 | self.V_next_prev[true_next, :] += self.learning_rate * ( delta * V_prev_next_mem - self.reg * self.V_next_prev[true_next, :]) 70 | self.V_next_prev[false_next, :] += self.learning_rate * ( -delta * V_prev_next_mem - self.reg * self.V_next_prev[false_next, :]) 71 | 72 | return delta 73 | 74 | def compute_factor_rankings(self): 75 | '''Rank items according to each factor in order to do adaptive sampling 76 | ''' 77 | 78 | CF_rank = np.argsort(self.V_item_user, axis=0) 79 | MC_rank = np.argsort(self.V_next_prev, axis=0) 80 | self.ranks = np.concatenate((CF_rank, MC_rank), axis=1) 81 | 82 | CF_var = np.var(self.V_item_user, axis=0) 83 | MC_var = np.var(self.V_next_prev, axis=0) 84 | self.var = np.concatenate((CF_var, MC_var)) 85 | 86 | def get_training_sample(self): 87 | '''Pick a random triplet from self.triplets and a random false next item. 
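# Illustrative sketch (not part of the original file): with adaptive_sampling the
# negative item is drawn following Rendle et al., 2014 (implemented just below): a
# rank is sampled from an exponential distribution so that highly-ranked items are
# picked more often, and the latent factor used to rank the items is chosen with
# probability proportional to |context factor| * factor variance (computed in
# compute_factor_rankings above). A toy numpy version of those two draws:
import numpy as np

rng = np.random.RandomState(0)
n_items, sampling_bias = 100, 20.0
context = rng.randn(8)        # concatenated user and previous-item factors
factor_var = rng.rand(8)      # per-factor variance of the item factors

rank = rng.exponential(scale=sampling_bias)
while rank >= n_items:        # resample until the rank is a valid item position
    rank = rng.exponential(scale=sampling_bias)

prob = np.abs(context) * factor_var
f = rng.choice(len(prob), p=prob / prob.sum())
# The negative item is then the item sitting at position int(rank) in the ranking
# along factor f, on the side given by the sign of context[f].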
88 | returns a tuple of ids : (user, prev_item, true_next, false_next) 89 | ''' 90 | 91 | # user_id, prev_item, true_next = random.choice(self.triplets) 92 | user_id = random.randrange(self.n_users) 93 | while self.users[user_id,1] < 2: 94 | user_id = random.randrange(self.n_users) 95 | r = random.randrange(self.users[user_id,1]-1) 96 | prev_item = self.items[self.users[user_id,0]+r] 97 | true_next = self.items[self.users[user_id,0]+r+1] 98 | if self.adaptive_sampling: 99 | while True: 100 | rank = np.random.exponential(scale=self.sampling_bias) 101 | while rank >= self.n_items: 102 | rank = np.random.exponential(scale=self.sampling_bias) 103 | factor_signs = np.sign(np.concatenate((self.V_user_item[user_id, :], self.V_prev_next[prev_item, :]))) 104 | factor_prob = np.abs(np.concatenate((self.V_user_item[user_id, :], self.V_prev_next[prev_item, :]))) * self.var 105 | f = np.random.choice(self.k_cf+self.k_mc, p=factor_prob/sum(factor_prob)) 106 | false_next = self.ranks[int(rank) * factor_signs[f],f] 107 | if false_next != true_next: 108 | break 109 | else: 110 | false_next = random.randrange(self.n_items-1) 111 | if false_next >= true_next: # To make sure false_next != true_next 112 | false_next += 1 113 | 114 | return (user_id, prev_item, true_next, false_next) 115 | 116 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 117 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 118 | ''' 119 | 120 | if exclude is None: 121 | exclude = [] 122 | 123 | last_item = sequence[-1][0] 124 | output = np.dot(self.V_user_item[user_id, :], self.V_item_user.T) + np.dot(self.V_prev_next[last_item, :], self.V_next_prev.T) 125 | 126 | # Put low similarity to viewed items to exclude them from recommendations 127 | output[[i[0] for i in sequence]] = -np.inf 128 | output[exclude] = -np.inf 129 | 130 | # find top k according to output 131 | return list(np.argpartition(-output, range(k))[:k]) 132 | 133 | def training_step(self, iterations): 134 | if self.adaptive_sampling and iterations%int(self.n_items * np.log(self.n_items)) == 0: 135 | self.compute_factor_rankings() 136 | 137 | # Train with a new batch 138 | return self.sgd_step(*self.get_training_sample()) 139 | 140 | def save(self, filename): 141 | '''Save the parameters of a network into a file 142 | ''' 143 | print('Save model in ' + filename) 144 | if not os.path.exists(os.path.dirname(filename)): 145 | os.makedirs(os.path.dirname(filename)) 146 | np.savez(filename, V_user_item=self.V_user_item, V_item_user=self.V_item_user, V_prev_next=self.V_prev_next, V_next_prev=self.V_next_prev) 147 | 148 | def load(self, filename): 149 | '''Load parameters values form a file 150 | ''' 151 | f = np.load(filename) 152 | self.V_user_item = f['V_user_item'] 153 | self.V_item_user = f['V_item_user'] 154 | self.V_prev_next = f['V_prev_next'] 155 | self.V_next_prev = f['V_next_prev'] -------------------------------------------------------------------------------- /helpers/data_handling.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | import random 6 | import os.path 7 | 8 | # Data directory 9 | DEFAULT_DIR = '../../data/' 10 | 11 | 12 | class DataHandler(object): 13 | ''' Prepare data for the differents algorithms. 
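# Illustrative sketch (not part of the original file): top_k_recommendations in
# fpmc.py above (and its FISM/Fossil counterparts) excludes already-seen items by
# setting their score to -inf, then takes the k best items with np.argpartition,
# which only sorts the first k positions instead of the whole score vector. A toy
# version:
import numpy as np

scores = np.array([0.1, 0.9, 0.4, 0.8, 0.2])
seen = [1]                     # ids of items already in the sequence / excluded
scores[seen] = -np.inf
k = 2
top = list(np.argpartition(-scores, range(k))[:k])
print(top)   # [3, 2]: the two best remaining items, in decreasing score order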
14 | Give easy access to training, validation and test set and to information about the dataset 15 | such as number of users, items and interactions. 16 | ''' 17 | 18 | def __init__(self, dirname, extended_training_set=False, shuffle_training=False): 19 | ''' 20 | 21 | Parameter 22 | --------- 23 | 24 | dirname: str 25 | Directory where the dataset can be found. 26 | If dirname does not correspond to an existing path, DEFAULT_DIR+dirname will be looked for. 27 | If both dirname and DEFAULT_DIR+dirname are existing path, a warning will be printed. 28 | The directory should contains at least the following sub folders: 29 | - data/ where the dataset files can be found 30 | - models/ where the models are stored during training 31 | - results/ where the results are stored during testing 32 | 33 | extended_training_set: boolean 34 | If True, the extended training set is used, i.e. the file "train_set_sequences+" is loaded instead of "train_set_sequences". 35 | 36 | shuffle_training: boolean 37 | If True, the order of the training sequences is shuffled between each pass. 38 | ''' 39 | super(DataHandler, self).__init__() 40 | 41 | self.dirname = self._get_path(dirname) 42 | 43 | self.extended_training_set = extended_training_set 44 | if extended_training_set: 45 | self.training_set = SequenceGenerator(self.dirname+'data/train_set_sequences+', shuffle=shuffle_training) 46 | else: 47 | self.training_set = SequenceGenerator(self.dirname+'data/train_set_sequences', shuffle=shuffle_training) 48 | self.validation_set = SequenceGenerator(self.dirname+'data/val_set_sequences') 49 | self.test_set = SequenceGenerator(self.dirname+'data/test_set_sequences') 50 | 51 | self._load_stats() 52 | 53 | def training_set_triplets(self): 54 | with open(self.dirname + 'data/train_set_triplets') as f: 55 | for line in f: 56 | line = line.split() 57 | yield {'user_id': int(line[0]), 'item_id': int(line[1]), 'rating': float(line[2])} 58 | 59 | @property 60 | def item_popularity(self): 61 | '''Returns the number of occurences of an item in the training set. 62 | ''' 63 | 64 | if not hasattr(self.training_set, '_item_pop'): 65 | if os.path.isfile(self.dirname + 'data/training_set_item_popularity.npy'): 66 | self.training_set._item_pop = np.load(self.dirname + 'data/training_set_item_popularity.npy') 67 | else: 68 | self.training_set._item_pop = np.zeros(self.n_items) 69 | with open(self.dirname + 'data/train_set_triplets') as f: 70 | for line in f: 71 | self.training_set._item_pop[int(line.split()[1])] += 1 72 | np.save(self.dirname + 'data/training_set_item_popularity.npy', self.training_set._item_pop) 73 | 74 | return self.training_set._item_pop 75 | 76 | def _get_path(self, dirname): 77 | ''' Choose between dirname and DEFAULT_DIR+dirname. 78 | ''' 79 | if os.path.exists(dirname) and not os.path.exists(DEFAULT_DIR+dirname+'/'): 80 | return dirname 81 | if not os.path.exists(dirname) and os.path.exists(DEFAULT_DIR+dirname+'/'): 82 | return DEFAULT_DIR+dirname+'/' 83 | if os.path.exists(dirname) and os.path.exists(DEFAULT_DIR+dirname+'/'): 84 | print('WARNING: ambiguous directory name, both "'+dirname+'" and "'+DEFAULT_DIR+dirname+'" exist. 
"'+dirname+'" is used.') 85 | return dirname 86 | 87 | raise ValueError('Dataset not found') 88 | 89 | def _load_stats(self): 90 | ''' Load informations about the dataset from dirname/data/stats 91 | ''' 92 | with open(self.dirname+'data/stats', 'r') as f: 93 | _ = f.readline() # Line with column titles 94 | self.n_users, self.n_items, self.n_interactions, self.longest_sequence = map(int, f.readline().split()[1:]) 95 | self.training_set.n_users, self.training_set.n_items, self.training_set.n_interactions, self.training_set.longest_sequence = map(int, f.readline().split()[1:]) 96 | self.validation_set.n_users, self.validation_set.n_items, self.validation_set.n_interactions, self.validation_set.longest_sequence = map(int, f.readline().split()[1:]) 97 | self.test_set.n_users, self.test_set.n_items, self.test_set.n_interactions, self.test_set.longest_sequence = map(int, f.readline().split()[1:]) 98 | 99 | if self.extended_training_set: 100 | #Those values are unfortunately inexact 101 | self.training_set.n_users, self.training_set.n_items = self.n_users, self.n_items 102 | self.training_set.n_interactions += (self.validation_set.n_interactions + self.test_set.n_interactions)//2 103 | 104 | class SequenceGenerator(object): 105 | """docstring for SequenceGenerator""" 106 | def __init__(self, filename, shuffle=False): 107 | super(SequenceGenerator, self).__init__() 108 | self.filename = filename 109 | self.shuffle = shuffle 110 | self.epochs = 0. 111 | 112 | def load(self): 113 | 114 | self.lines = [] 115 | # self.max_length = 0 116 | # self.max_user_id = 0 117 | # self.max_item_id = 0 118 | 119 | with open(self.filename, 'r') as f: 120 | for sequence in f: 121 | self.lines.append(sequence) 122 | # self.max_length = max(self.max_length, (len(sequence.split()) - 1) / 2) 123 | # self.max_user_id = max(self.max_user_id, int(sequence.split()[0])) 124 | # self.max_item_id = max(self.max_item_id, max(map(int, sequence.split()[1::2]))) 125 | 126 | def __call__(self, min_length=2, max_length=None, length_choice='max', subsequence='contiguous', epochs=np.inf): 127 | if not hasattr(self, 'lines'): 128 | self.load() 129 | 130 | counter = 0 131 | self.epochs = 0. 132 | while counter < epochs: 133 | counter += 1 134 | print("Opening file ({})".format(counter)) 135 | if self.shuffle: 136 | random.shuffle(self.lines) 137 | for j, sequence in enumerate(self.lines): 138 | 139 | self.epochs = counter - 1 + j / len(self.lines) 140 | 141 | # Express sequence as a list of tuples (movie_id, rating) 142 | sequence = sequence.split() 143 | user_id = sequence[0] 144 | sequence = sequence[1:] 145 | sequence = [[int(sequence[2*i]), float(sequence[2*i + 1])] for i in range(int(len(sequence) / 2))] 146 | 147 | # Determine length of the sequence to be returned 148 | if max_length == None: 149 | this_max_length = len(sequence) 150 | else: 151 | this_max_length = max_length 152 | 153 | if len(sequence) < min_length: 154 | continue 155 | if (length_choice == 'random'): 156 | length = np.random.randint(min_length, min(this_max_length, len(sequence)) + 1) 157 | elif (length_choice == 'max'): 158 | length = min(this_max_length, len(sequence)) 159 | else: 160 | raise ValueError('Unrecognised length_choice option. 
Authorised values are "random" and "max" ') 161 | 162 | # Extract subsequence if needed 163 | if (length < len(sequence)): 164 | if subsequence == 'random': 165 | sequence = [ sequence[i] for i in sorted(random.sample(xrange(len(sequence)), length)) ] 166 | elif subsequence == 'contiguous': 167 | start = np.random.randint(0, len(sequence) - length + 1) 168 | sequence = sequence[start:start+length] 169 | elif subsequence == 'begining': 170 | sequence = sequence[:length] 171 | else: 172 | raise ValueError('Unrecognised subsequence option. Authorised values are "random", "contiguous" and "begining".') 173 | 174 | yield sequence, user_id 175 | -------------------------------------------------------------------------------- /neural_networks/rnn_sampling.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import cPickle 8 | import random 9 | from bisect import bisect 10 | from time import time 11 | import rnn_base as rnn 12 | from sparse_lstm import * 13 | 14 | class RNNSampling(rnn.RNNBase): 15 | """RNNSampling have a loss function that uses a sampling procedure. 16 | BPR or TOP1 17 | """ 18 | def __init__(self, loss_function="Blackout", sampling=32, last_layer_tanh=False, last_layer_init=1., diversity_bias=0.0, sampling_bias=0., **kwargs): 19 | ''' 20 | Parameters 21 | ---------- 22 | loss_function: "BPR" or "TOP1" or "Blackout" 23 | Choice between 3 loss functions: 24 | - BPR, as used in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016 25 | - TOP1, defined in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016 26 | - Blackout, discriminative loss function defined in "BlackOut: Speeding up Recurrent Neural Network Language Models With Very Large Vocabularies", Ji, S. et al., 2015 (equation 6) 27 | 28 | sampling: integer > 0 or float in (0,1) 29 | Number of items to sample in the computation of the loss function. 30 | If sampling is a float in (0,1), it is interpreted as the fraction of items to use. 31 | sampling_bias: float 32 | Items are sampled with a probability proportional to their frequency to the power of the sampling_bias. 
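# Illustrative sketch (not part of the original file): sampling an item with
# probability proportional to popularity ** sampling_bias can be done with a
# cumulative sum and bisect, which is what _popularity_sample does further down.
# A standalone toy version with made-up popularity counts:
import random
from bisect import bisect
import numpy as np

item_popularity = np.array([100., 10., 1., 50.])
sampling_bias = 0.5
cumsum = np.cumsum(np.power(item_popularity, sampling_bias))
sampled_item = bisect(cumsum, random.uniform(0, cumsum[-1]))   # index of the drawn item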
33 | 34 | ''' 35 | super(RNNSampling, self).__init__(**kwargs) 36 | 37 | self.last_layer_init = last_layer_init 38 | self.last_layer_tanh =last_layer_tanh 39 | self.diversity_bias = diversity_bias 40 | self.sampling = sampling 41 | self.sampling_bias = sampling_bias 42 | if loss_function is None: 43 | loss_function = "Blackout" 44 | self.loss_function_name = loss_function 45 | if loss_function == "BPR": 46 | self.loss_function = self._BPR_loss 47 | elif loss_function == "BPRI": 48 | self.loss_function = self._BPRI_loss 49 | elif loss_function == "TOP1": 50 | self.loss_function = self._TOP1_loss 51 | elif loss_function == "Blackout": 52 | self.loss_function = self._blackout_loss 53 | else: 54 | raise ValueError("Unknown loss function") 55 | 56 | 57 | self.name = "RNN with sampling loss" 58 | 59 | def _get_model_filename(self, epochs): 60 | '''Return the name of the file to save the current model 61 | ''' 62 | filename = "rnn_sampling_"+self.loss_function_name+"_" 63 | if self.sampling_bias > 0.: 64 | filename += "p" + str(self.sampling_bias) 65 | filename += "s"+str(self.sampling)+"_ini"+str(self.last_layer_init)+"_db"+str(self.diversity_bias) 66 | return filename + "_" + self._common_filename(epochs) 67 | 68 | def _blackout_loss(self, predictions, targets): 69 | predictions = T.nnet.softmax(predictions) 70 | pos = T.nnet.categorical_crossentropy(predictions, targets) 71 | neg = T.log(1 - predictions) 72 | return pos - neg[:, targets.shape[0]:].sum(axis=-1) 73 | 74 | def _BPRI_loss(self, predictions, targets): 75 | if self.last_layer_tanh: 76 | predictions = T.tanh(predictions) 77 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, targets.shape[0]:] 78 | return (T.log(T.nnet.sigmoid(diff))).mean(axis=-1) 79 | 80 | def _BPR_loss(self, predictions, targets): 81 | if self.last_layer_tanh: 82 | predictions = T.tanh(predictions) 83 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, targets.shape[0]:] 84 | return -(T.log(T.nnet.sigmoid(-diff))).mean(axis=-1) 85 | 86 | def _TOP1_loss(self, predictions, targets): 87 | if self.last_layer_tanh: 88 | predictions = T.tanh(predictions) 89 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, targets.shape[0]:] 90 | reg = T.sqr(predictions[:, targets.shape[0]:]) 91 | return (T.nnet.sigmoid(diff) + T.nnet.sigmoid(reg)).mean(axis=-1) 92 | 93 | def _prepare_networks(self, n_items): 94 | ''' Prepares the building blocks of the RNN, but does not compile them: 95 | self.l_in : input layer 96 | self.l_mask : mask of the input layer 97 | self.target : target of the network 98 | self.l_out : output of the network 99 | self.cost : cost function 100 | ''' 101 | 102 | self.n_items = n_items 103 | if self.sampling < 1: 104 | self.effective_sampling = int(self.sampling * self.n_items) 105 | else: 106 | self.effective_sampling = int(self.sampling) 107 | 108 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 109 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length, self._input_size())) 110 | # The input is completed by a mask to inform the LSTM of the length of the sequence 111 | self.l_mask = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length)) 112 | 113 | # recurrent layer 114 | if not self.use_movies_features: 115 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=self.n_items + self._n_optional_features(), only_return_final=True) 116 | else: 117 | l_recurrent = self.recurrent_layer(self.l_in, 
self.l_mask, true_input_size=None, only_return_final=True) 118 | 119 | # l_last_slice gets the last output of the recurrent layer 120 | l_last_slice = l_recurrent 121 | # l_last_slice = lasagne.layers.SliceLayer(l_recurrent, -1, 1) 122 | 123 | # Theano tensor for the targets 124 | target = T.ivector('target_output') 125 | samples = T.ivector('samples') 126 | self.exclude = T.fmatrix('excluded_items') 127 | target_popularity = T.fvector('target_popularity') 128 | self.theano_inputs = [self.l_in.input_var, self.l_mask.input_var, target, samples, target_popularity, self.exclude] 129 | 130 | # The sliced output is then passed through linear layer to obtain the right output size 131 | self.l_out = BlackoutLayer(l_last_slice, num_units=self.n_items, num_outputs=self.sampling, nonlinearity=None, W=lasagne.init.GlorotUniform(gain=self.last_layer_init)) 132 | 133 | # lasagne.layers.get_output produces a variable for the output of the net 134 | network_output = lasagne.layers.get_output(self.l_out, targets = target, samples=samples) 135 | 136 | # loss function 137 | self.cost = (self.loss_function(network_output,np.arange(self.batch_size)) / target_popularity).mean() 138 | 139 | 140 | def _compile_test_function(self): 141 | ''' Differs from base test function because of the added softmax operation 142 | ''' 143 | print("Compiling test...") 144 | deterministic_output = T.nnet.softmax(lasagne.layers.get_output(self.l_out, deterministic=True)) 145 | if self.interactions_are_unique: 146 | deterministic_output *= (1 - self.exclude) 147 | 148 | theano_test_function = theano.function(self.theano_inputs, deterministic_output, allow_input_downcast=True, name="Test_function", on_unused_input='ignore') 149 | 150 | def precision_test_function(theano_inputs, k=10): 151 | output = theano_test_function(*theano_inputs) 152 | ids = np.argpartition(-output, range(k), axis=-1)[0, :k] 153 | 154 | return ids 155 | 156 | self.test_function = precision_test_function 157 | print("Compilation done.") 158 | 159 | def _popularity_sample(self): 160 | if not hasattr(self, '_cumsum'): 161 | self._cumsum = np.cumsum(np.power(self.dataset.item_popularity, self.sampling_bias)) 162 | 163 | return bisect(self._cumsum, random.uniform(0, self._cumsum[-1])) 164 | 165 | def _prepare_input(self, sequences): 166 | ''' Sequences is a list of [user_id, input_sequence, targets] 167 | ''' 168 | 169 | batch_size = len(sequences) 170 | 171 | # Shape return variables 172 | X = np.zeros((batch_size, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN 173 | mask = np.zeros((batch_size, self.max_length)) # mask of the input (to deal with sequences of different length) 174 | Y = np.zeros((batch_size,), dtype='int32') # output target 175 | pop = np.zeros((batch_size,)) # output target popularity 176 | exclude = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 177 | 178 | 179 | for i, sequence in enumerate(sequences): 180 | user_id, in_seq, target = sequence 181 | seq_features = np.array(map(lambda x: self._get_features(x, user_id), in_seq)) 182 | X[i, :len(in_seq), :] = seq_features # Copy sequences into X 183 | mask[i, :len(in_seq)] = 1 184 | Y[i] = target[0][0] # id of the first and only target 185 | exclude[i, [j[0] for j in in_seq]] = 1 186 | pop[i] = self.dataset.item_popularity[target[0][0]] ** self.diversity_bias 187 | 188 | if self.sampling_bias > 0: 189 | samples = np.array([self._popularity_sample() for i in range(self.effective_sampling)], dtype='int32') 190 | else: 191 | samples = 
np.random.choice(self.n_items, self.effective_sampling).astype('int32') 192 | 193 | 194 | return (X, mask.astype(theano.config.floatX), Y, samples, pop.astype(theano.config.floatX), exclude) 195 | -------------------------------------------------------------------------------- /helpers/evaluation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | import scipy.sparse as ssp 5 | import os.path 6 | import theano 7 | import theano.tensor as T 8 | import random 9 | import operator 10 | import collections 11 | #import matplotlib.pyplot as plt 12 | 13 | # Plot multiple figures at the same time 14 | #plt.ion() 15 | 16 | class Evaluator(object): 17 | '''Evaluator is a class to compute metrics on tests 18 | 19 | It is used by first adding a series of "instances" : pairs of goals and predictions, then metrics can be computed on the ensemble of instances: 20 | average precision, percentage of instance with a correct prediction, etc. 21 | 22 | It can also return the set of correct predictions. 23 | ''' 24 | def __init__(self, dataset, k=10): 25 | super(Evaluator, self).__init__() 26 | self.instances = [] 27 | self.dataset = dataset 28 | self.k = k 29 | 30 | self.metrics = {'sps': self.short_term_prediction_success, 31 | 'recall': self.average_recall, 32 | 'precision': self.average_precision, 33 | 'ndcg': self.average_ndcg, 34 | 'item_coverage': self.item_coverage, 35 | 'user_coverage': self.user_coverage, 36 | 'assr': self.assr, 37 | 'blockbuster_share': self.blockbuster_share} 38 | 39 | def add_instance(self, goal, predictions): 40 | self.instances.append([goal, predictions]) 41 | 42 | def _load_interaction_matrix(self): 43 | '''Load the training set as an interaction matrix between items and users in a sparse format. 44 | ''' 45 | filename = self.dataset.dirname + 'data/train_set_triplets' 46 | if os.path.isfile(filename + '.npy'): 47 | file_content = np.load(filename + '.npy') 48 | else: 49 | file_content = np.loadtxt(filename) 50 | np.save(filename, file_content) 51 | 52 | self._interactions = ssp.coo_matrix((np.ones(file_content.shape[0]), (file_content[:,1], file_content[:,0]))).tocsr() 53 | 54 | def _intra_list_similarity(self, items): 55 | '''Compute the intra-list similarity of a list of items. 56 | ''' 57 | if not hasattr(self, "_interactions"): 58 | self._load_interaction_matrix() 59 | 60 | norm = np.sqrt(np.asarray(self._interactions[items, :].sum(axis=1)).ravel()) 61 | sims = self._interactions[items, :].dot(self._interactions[items, :].T).toarray() 62 | S = 0 63 | for i in range(len(items)): 64 | for j in range(i): 65 | S += sims[i, j] / norm[i] / norm[j] 66 | 67 | return S 68 | 69 | def average_intra_list_similarity(self): 70 | '''Return the average intra-list similarity, as defined in "Auralist: Introducing Serendipity into Music Recommendation" 71 | ''' 72 | 73 | ILS = 0 74 | for goal, prediction in self.instances: 75 | if len(prediction) > 0: 76 | ILS += self._intra_list_similarity(prediction[:min(len(prediction), self.k)]) 77 | 78 | return ILS / len(self.instances) 79 | 80 | 81 | def blockbuster_share(self): 82 | '''Return the percentage of correct long term predictions that are about items in the top 1% of the most popular items. 
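# Illustrative sketch (not part of the original file): the Evaluator collects
# (goal, prediction) pairs through add_instance and then averages metrics at k over
# them. A minimal recall@k and sps@k computation over such pairs, mirroring
# average_recall and short_term_prediction_success below:
instances = [([4, 7, 9], [7, 1, 4]),   # (goal items, ranked predictions)
             ([2, 3],    [5, 6, 8])]
k = 3
recall = sum(len(set(g) & set(p[:k])) / float(len(g)) for g, p in instances) / len(instances)
sps = sum(int(g[0] in p[:k]) for g, p in instances) / float(len(instances))
print(recall, sps)   # 0.333..., 0.5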
83 | ''' 84 | 85 | correct_predictions = self.get_correct_predictions() 86 | nb_pop_items = self.dataset.n_items // 100 87 | pop_items = np.argpartition(-self.dataset.item_popularity, nb_pop_items)[:nb_pop_items] 88 | 89 | if len(correct_predictions) == 0: 90 | return 0 91 | return len([i for i in correct_predictions if i in pop_items])/len(correct_predictions) 92 | 93 | def average_novelty(self): 94 | '''Return the average novelty measure, as defined in "Auralist: Introducing Serendipity into Music Recommendation" 95 | ''' 96 | 97 | nb_of_ratings = sum(self.dataset.item_popularity) 98 | 99 | novelty = 0 100 | for goal, prediction in self.instances: 101 | if len(prediction) > 0: 102 | novelty += sum(map(np.log2, self.dataset.item_popularity[prediction[:min(len(prediction), self.k)]] / nb_of_ratings)) / min(len(prediction), self.k) 103 | 104 | return -novelty / len(self.instances) 105 | 106 | def average_precision(self): 107 | '''Return the average number of correct predictions per instance. 108 | ''' 109 | precision = 0 110 | for goal, prediction in self.instances: 111 | if len(prediction) > 0: 112 | precision += float(len(set(goal) & set(prediction[:min(len(prediction), self.k)]))) / min(len(prediction), self.k) 113 | 114 | return precision / len(self.instances) 115 | 116 | def average_recall(self): 117 | '''Return the average recall. 118 | ''' 119 | recall = 0 120 | for goal, prediction in self.instances: 121 | if len(goal) > 0: 122 | recall += float(len(set(goal) & set(prediction[:min(len(prediction), self.k)]))) / len(goal) 123 | 124 | return recall / len(self.instances) 125 | 126 | def average_ndcg(self): 127 | ndcg = 0. 128 | for goal, prediction in self.instances: 129 | if len(prediction) > 0: 130 | dcg = 0. 131 | max_dcg = 0. 132 | for i, p in enumerate(prediction[:min(len(prediction), self.k)]): 133 | if i < len(goal): 134 | max_dcg += 1. / np.log2(2 + i) 135 | 136 | if p in goal: 137 | dcg += 1. / np.log2(2 + i) 138 | 139 | ndcg += dcg/max_dcg 140 | 141 | return ndcg / len(self.instances) 142 | 143 | def short_term_prediction_success(self): 144 | '''Return the percentage of instances for which the first goal was in the predictions 145 | ''' 146 | score = 0 147 | for goal, prediction in self.instances: 148 | score += int(goal[0] in prediction[:min(len(prediction), self.k)]) 149 | 150 | return score / len(self.instances) 151 | 152 | def sps(self): 153 | return self.short_term_prediction_success() 154 | 155 | def user_coverage(self): 156 | '''Return the percentage of instances for which at least one of the goals was in the predictions 157 | ''' 158 | score = 0 159 | for goal, prediction in self.instances: 160 | score += int(len(set(goal) & set(prediction[:min(len(prediction), self.k)])) > 0) 161 | 162 | return score / len(self.instances) 163 | 164 | def get_all_goals(self): 165 | '''Return a concatenation of the goals of each instances 166 | ''' 167 | return [g for goal, _ in self.instances for g in goal] 168 | 169 | def get_strict_goals(self): 170 | '''Return a concatenation of the strict goals (i.e. 
the first goal) of each instances 171 | ''' 172 | return [goal[0] for goal, _ in self.instances] 173 | 174 | def get_all_predictions(self): 175 | '''Return a concatenation of the predictions of each instances 176 | ''' 177 | return [p for _, prediction in self.instances for p in prediction[:min(len(prediction), self.k)]] 178 | 179 | def get_correct_predictions(self): 180 | '''Return a concatenation of the correct predictions of each instances 181 | ''' 182 | correct_predictions = [] 183 | for goal, prediction in self.instances: 184 | correct_predictions.extend(list(set(goal) & set(prediction[:min(len(prediction), self.k)]))) 185 | return correct_predictions 186 | 187 | def item_coverage(self): 188 | return len(set(self.get_correct_predictions())) 189 | 190 | def get_correct_strict_predictions(self): 191 | '''Return a concatenation of the strictly correct predictions of each instances (i.e. predicted the first goal) 192 | ''' 193 | correct_predictions = [] 194 | for goal, prediction in self.instances: 195 | correct_predictions.extend(list(set([goal[0]]) & set(prediction[:min(len(prediction), self.k)]))) 196 | return correct_predictions 197 | 198 | def get_rank_comparison(self): 199 | '''Returns a list of tuple of the form (position of the item in the list of goals, position of the item in the recommendations) 200 | ''' 201 | all_positions = [] 202 | for goal, prediction in self.instances: 203 | position_in_predictions = np.argsort(prediction)[goal] 204 | all_positions.extend(list(enumerate(position_in_predictions))) 205 | 206 | return all_positions 207 | 208 | def assr(self): 209 | '''Returns the average search space reduction. 210 | It is defined as the number of items in the dataset divided by the average number of dot products made during testing. 211 | ''' 212 | 213 | if hasattr(self, 'nb_of_dp') and self.nb_of_dp > 0: 214 | return self.dataset.n_items / self.nb_of_dp 215 | else: 216 | return 1 # If nb_of_dp is not defined, clustering is probably not used, return default assr: 1 217 | 218 | class DistributionCharacteristics(object): 219 | """DistributionCharacteristics computes and plot certain characteristics of a list of movies, such as the distribution of popularity. 
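# Illustrative sketch (not part of the original file): the binary NDCG@k computed in
# average_ndcg above credits 1 / log2(2 + i) when the item at position i is in the
# goal set, and normalises by the ideal DCG for that number of goal items. A toy
# version for a single instance, assuming at least k predictions are available:
import numpy as np

goal, prediction, k = [4, 9], [9, 1, 4, 7], 3
dcg = sum(1.0 / np.log2(2 + i) for i, p in enumerate(prediction[:k]) if p in goal)
idcg = sum(1.0 / np.log2(2 + i) for i in range(min(len(goal), k)))
print(dcg / idcg)   # ~0.92 for this instance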
220 | """ 221 | def __init__(self, movies): 222 | super(DistributionCharacteristics, self).__init__() 223 | self.movies = collections.Counter(movies) 224 | 225 | def plot_frequency_distribution(self): 226 | '''Plot the number of items versus the frequency 227 | ''' 228 | frequencies = self.movies.values() 229 | freq_distribution = collections.Counter(frequencies) 230 | #plt.figure() 231 | #plt.loglog(freq_distribution.keys(), freq_distribution.values(), '.') 232 | #plt.show() 233 | 234 | def plot_popularity_distribution(self): 235 | '''Bar plot of the number of movies in each popularity category 236 | ''' 237 | 238 | bars = np.zeros(10) 239 | for key, val in self.movies.items(): 240 | popularity_index = OTHER_FEATURES[key, 3] - 1 # minus 1 to shift from 1-based to 0-based counting 241 | bars[popularity_index] += val 242 | 243 | # plt.figure() 244 | # plt.bar(np.arange(10) + 0.5, bars, width=1) 245 | # plt.show() 246 | 247 | def number_of_movies(self): 248 | return len(self.movies) 249 | 250 | -------------------------------------------------------------------------------- /factorization/mf_base.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from helpers import evaluation 12 | 13 | class MFBase(object): 14 | '''Base class for methods based on matrix factorization 15 | ''' 16 | 17 | def __init__(self, reg = 0.0025, learning_rate = 0.05, annealing=1., init_sigma = 1): 18 | self.name = 'Base for matrix factorization' 19 | self.reg = reg 20 | self.learning_rate = learning_rate # self.learning_rate will change due to annealing. 
21 | self.init_learning_rate = learning_rate # self.init_learning_rate keeps the original value (for filename) 22 | self.annealing_rate = annealing 23 | self.init_sigma = init_sigma 24 | self.max_length = np.inf # For compatibility with the RNNs 25 | 26 | self.metrics = {'recall': {'direction': 1}, 27 | 'sps': {'direction': 1}, 28 | 'user_coverage' : {'direction': 1}, 29 | 'item_coverage' : {'direction': 1}, 30 | 'ndcg' : {'direction': 1}, 31 | 'blockbuster_share' : {'direction': -1} 32 | } 33 | 34 | def prepare_model(self, dataset): 35 | '''Must be called before using train, load or top_k_recommendations 36 | ''' 37 | self.dataset = dataset 38 | self.n_items = dataset.n_items 39 | self.n_users = dataset.n_users 40 | 41 | def change_data_format(self, dataset): 42 | '''Gets a generator of data in the sequence format and save data in the csr format 43 | ''' 44 | 45 | self.users = np.zeros((self.n_users,2), dtype=np.int32) 46 | self.items = np.zeros(dataset.training_set.n_interactions, dtype=np.int32) 47 | cursor = 0 48 | with open(dataset.training_set.filename, 'r') as f: 49 | for sequence in f: 50 | sequence = sequence.split() 51 | items = map(int, sequence[1::2]) 52 | self.users[int(sequence[0]), :] = [cursor, len(items)] 53 | self.items[cursor:cursor+len(items)] = items 54 | cursor += len(items) 55 | 56 | def get_pareto_front(self, metrics, metrics_names): 57 | costs = np.zeros((len(metrics[metrics_names[0]]), len(metrics_names))) 58 | for i, m in enumerate(metrics_names): 59 | costs[:, i] = np.array(metrics[m]) * self.metrics[m]['direction'] 60 | is_efficient = np.ones(costs.shape[0], dtype = bool) 61 | for i, c in enumerate(costs): 62 | if is_efficient[i]: 63 | is_efficient[is_efficient] = np.any(costs[is_efficient]>=c, axis=1) 64 | return np.where(is_efficient)[0].tolist() 65 | 66 | def _compute_validation_metrics(self, metrics): 67 | ev = evaluation.Evaluator(self.dataset, k=10) 68 | for sequence, user_id in self.dataset.validation_set(epochs=1): 69 | top_k = self.top_k_recommendations(sequence[:len(sequence)//2], user_id=int(user_id)) 70 | goal = [i[0] for i in sequence[len(sequence)//2:]] 71 | ev.add_instance(goal, top_k) 72 | 73 | metrics['recall'].append(ev.average_recall()) 74 | metrics['sps'].append(ev.sps()) 75 | metrics['ndcg'].append(ev.average_ndcg()) 76 | metrics['user_coverage'].append(ev.user_coverage()) 77 | metrics['item_coverage'].append(ev.item_coverage()) 78 | metrics['blockbuster_share'].append(ev.blockbuster_share()) 79 | 80 | return metrics 81 | 82 | def train(self, dataset, 83 | max_time=np.inf, 84 | progress=2.0, 85 | time_based_progress=False, 86 | autosave='All', 87 | save_dir='', 88 | min_iterations=0, 89 | max_iter=np.inf, 90 | max_progress_interval=np.inf, 91 | load_last_model=False, 92 | early_stopping=None, 93 | validation_metrics=['sps']): 94 | '''Train the model based on the sequence given by the training_set 95 | 96 | max_time is used to set the maximumn amount of time (in seconds) that the training can last before being stop. 97 | By default, max_time=np.inf, which means that the training will last until the training_set runs out, or the user interrupt the program. 98 | 99 | progress is used to set when progress information should be printed during training. It can be either an int or a float: 100 | integer : print at linear intervals specified by the value of progress (i.e. : progress, 2*progress, 3*progress, ...) 101 | float : print at geometric intervals specified by the value of progress (i.e. : progress, progress^2, progress^3, ...) 
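# Illustrative sketch (not part of the original file): get_pareto_front above first
# multiplies each validation metric by its 'direction' so that larger is always
# better, then discards dominated checkpoints; the autosave='Best' branch of the
# training loop below keeps only the checkpoints on that Pareto front. A toy numpy
# version with two metrics (sps: higher is better, blockbuster_share: lower is better):
import numpy as np

sps = np.array([0.10, 0.12, 0.11])
blockbuster_share = np.array([0.30, 0.35, 0.20])
costs = np.stack([sps * 1, blockbuster_share * -1], axis=1)   # apply the directions

is_efficient = np.ones(len(costs), dtype=bool)
for i, c in enumerate(costs):
    if is_efficient[i]:
        is_efficient[is_efficient] = np.any(costs[is_efficient] >= c, axis=1)
print(np.where(is_efficient)[0])   # [1 2]: checkpoint 0 is dominated by checkpoint 2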
102 | 103 | max_progress_interval can be used to have geometric intervals in the begining then switch to linear intervals. 104 | It ensures, independently of the progress parameter, that progress is shown at least every max_progress_interval. 105 | 106 | time_based_progress is used to choose between using number of iterations or time as a progress indicator. True means time (in seconds) is used, False means number of iterations. 107 | 108 | autosave is used to set whether the model should be saved during training. It can take several values: 109 | All : the model will be saved each time progress info is printed. 110 | Best : save only the best model so far 111 | None : does not save 112 | 113 | min_iterations is used to set a minimum of iterations before printing the first information (and saving the model). 114 | 115 | save_dir is the path to the directory where models are saved. 116 | 117 | load_last_model: if true, find the latest model in the directory where models should be saved, and load it before starting training. 118 | 119 | early_stopping : should be a callable that will recieve the list of validation error and the corresponding epochs and return a boolean indicating whether the learning should stop. 120 | ''' 121 | 122 | # Change data format 123 | self.change_data_format(dataset) 124 | #del dataset.training_set.lines 125 | 126 | if len(set(validation_metrics) & set(self.metrics.keys())) < len(validation_metrics): 127 | raise ValueError('Incorrect validation metrics. Metrics must be chosen among: ' + ', '.join(self.metrics.keys())) 128 | 129 | # Load last model if needed, else initialise the model 130 | iterations = 0 131 | epochs_offset = 0 132 | if load_last_model: 133 | epochs_offset = self.load_last(save_dir) 134 | if epochs_offset == 0: 135 | self.init_model() 136 | 137 | start_time = time() 138 | next_save = int(progress) 139 | train_costs = [] 140 | current_train_cost = [] 141 | epochs = [] 142 | metrics = {name:[] for name in self.metrics.keys()} 143 | filename = {} 144 | 145 | while (time() - start_time < max_time and iterations < max_iter): 146 | 147 | cost = self.training_step(iterations) 148 | 149 | current_train_cost.append(cost) 150 | 151 | # Cool learning rate 152 | if iterations % dataset.training_set.n_interactions == 0: 153 | self.learning_rate *= self.annealing_rate 154 | 155 | # Check if it is time to save the model 156 | iterations += 1 157 | 158 | if time_based_progress: 159 | progress_indicator = int(time() - start_time) 160 | else: 161 | progress_indicator = iterations 162 | 163 | if progress_indicator >= next_save: 164 | 165 | if progress_indicator >= min_iterations: 166 | 167 | # Save current epoch 168 | epochs.append(epochs_offset + iterations / dataset.training_set.n_interactions) 169 | 170 | # Average train cost 171 | train_costs.append(np.mean(current_train_cost)) 172 | current_train_cost = [] 173 | 174 | # Compute validation cost 175 | metrics = self._compute_validation_metrics(metrics) 176 | 177 | # Print info 178 | self._print_progress(iterations, epochs[-1], start_time, train_costs, metrics, validation_metrics) 179 | 180 | # Save model 181 | run_nb = len(metrics[self.metrics.keys()[0]])-1 182 | if autosave == 'All': 183 | filename[run_nb] = save_dir + self._get_model_filename(round(epochs[-1], 3)) 184 | self.save(filename[run_nb]) 185 | elif autosave == 'Best': 186 | pareto_runs = self.get_pareto_front(metrics, validation_metrics) 187 | if run_nb in pareto_runs: 188 | filename[run_nb] = save_dir + self._get_model_filename(round(epochs[-1], 3)) 
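# This run is on the Pareto front of the validation metrics: save it, then delete previously saved models that are no longer Pareto-optimal.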
189 | self.save(filename[run_nb]) 190 | to_delete = [r for r in filename if r not in pareto_runs] 191 | for run in to_delete: 192 | try: 193 | os.remove(filename[run]) 194 | print('Deleted ', filename[run]) 195 | except OSError: 196 | print('Warning : Previous model could not be deleted') 197 | del filename[run] 198 | 199 | if early_stopping is not None: 200 | # Stop if early stopping is triggered for all the validation metrics 201 | if all([early_stopping(epochs, metrics[m]) for m in validation_metrics]): 202 | break 203 | 204 | 205 | # Compute next checkpoint 206 | if isinstance(progress, int): 207 | next_save += min(progress, max_progress_interval) 208 | else: 209 | next_save += min(max_progress_interval, next_save * (progress - 1)) 210 | 211 | best_run = np.argmax(np.array(metrics[validation_metrics[0]]) * self.metrics[validation_metrics[0]]['direction']) 212 | return ({m: metrics[m][best_run] for m in self.metrics.keys()}, time()-start_time, filename[best_run]) 213 | 214 | def _print_progress(self, iterations, epochs, start_time, train_costs, metrics, validation_metrics): 215 | '''Print learning progress in terminal 216 | ''' 217 | print(self.name, iterations, "batchs, ", epochs, " epochs in", time() - start_time, "s") 218 | print("Last train cost : ", train_costs[-1]) 219 | for m in self.metrics: 220 | print(m, ': ', metrics[m][-1]) 221 | if m in validation_metrics: 222 | print('Best ', m, ': ', max(np.array(metrics[m])*self.metrics[m]['direction'])*self.metrics[m]['direction']) 223 | print('-----------------') 224 | 225 | # Print on stderr for easier recording of progress 226 | print(iterations, epochs, time() - start_time, train_costs[-1], ' '.join(map(str, [metrics[m][-1] for m in self.metrics])), file=sys.stderr) 227 | 228 | def load_last(self, save_dir): 229 | '''Load last model from dir 230 | ''' 231 | def extract_number_of_epochs(filename): 232 | m = re.search('_ne([0-9]+(\.[0-9]+)?)_', filename) 233 | return float(m.group(1)) 234 | 235 | # Get all the models for this RNN 236 | file = save_dir + self._get_model_filename("*") 237 | file = np.array(glob.glob(file)) 238 | 239 | if len(file) == 0: 240 | print('No previous model, starting from scratch') 241 | return 0 242 | 243 | # Find last model and load it 244 | last_batch = np.amax(np.array(map(extract_number_of_epochs, file))) 245 | last_model = save_dir + self._get_model_filename(last_batch) 246 | print('Starting from model ' + last_model) 247 | self.load(last_model) 248 | 249 | return last_batch -------------------------------------------------------------------------------- /helpers/command_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from neural_networks.rnn_one_hot import RNNOneHot 3 | from neural_networks.rnn_cluster import RNNCluster 4 | from neural_networks.fism_cluster import FISMCluster 5 | from neural_networks.rnn_margin import RNNMargin 6 | from neural_networks.rnn_sampling import RNNSampling 7 | from lazy.pop import Pop 8 | from lazy.markov_model import MarkovModel 9 | from lazy.user_knn import UserKNN 10 | from neural_networks.stacked_denoising_autoencoder import StackedDenoisingAutoencoder 11 | from factorization.bprmf import BPRMF 12 | from factorization.fism import FISM 13 | from factorization.fossil import Fossil 14 | from factorization.fpmc import FPMC 15 | from word2vec.ltm import LTM 16 | from helpers.early_stopping import early_stopping_command_parser, get_early_stopper 17 | from neural_networks.recurrent_layers import 
recurrent_layers_command_parser, get_recurrent_layers 18 | from neural_networks.update_manager import update_manager_command_parser, get_update_manager 19 | from neural_networks.sequence_noise import sequence_noise_command_parser, get_sequence_noise 20 | from neural_networks.target_selection import target_selection_command_parser, get_target_selection 21 | 22 | def command_parser(*sub_command_parser): 23 | ''' *sub_command_parser should be callables that will add arguments to the command parser 24 | ''' 25 | 26 | parser = argparse.ArgumentParser() 27 | 28 | for scp in sub_command_parser: 29 | scp(parser) 30 | 31 | args = parser.parse_args() 32 | return args 33 | 34 | def predictor_command_parser(parser): 35 | parser.add_argument('-m', dest='method', choices=['RNN', 'SDA', 'BPRMF', 'FPMC', 'FISM', 'Fossil', 'LTM', 'UKNN', 'MM', 'POP'], 36 | help='Method', default='RNN') 37 | parser.add_argument('-b', dest='batch_size', help='Batch size', default=16, type=int) 38 | parser.add_argument('-l', dest='learning_rate', help='Learning rate', default=0.01, type=float) 39 | parser.add_argument('-r', dest='regularization', help='Regularization (positive for L2, negative for L1)', default=0., type=float) 40 | parser.add_argument('-g', dest='gradient_clipping', help='Gradient clipping', default=100, type=int) 41 | parser.add_argument('-H', dest='hidden', help='Number of hidden neurons (for LTM and BPRMF)', default=20, type=int) 42 | parser.add_argument('-L', dest='layers', help='Layers (for SDA)', default="20", type=str) 43 | parser.add_argument('--loss', help='Loss function, choose between TOP1, BPR and Blackout (Sampling), or hinge, logit and logsig (multi-targets), or CCE (Categorical cross-entropy)', default='CCE', type=str) 44 | parser.add_argument('--sampling', help='Number of sample for the computation of the loss in RNNSampling', default=32.0, type=float) 45 | parser.add_argument('--sampling_bias', help='Sampling bias for cluster methods. 0. means uniform sampling, 1. 
means proportional to the item frequency', default=0., type=float) 46 | parser.add_argument('--db', dest='diversity_bias', help='Diversity bias (for RNN with CCE, TOP1, BPR or Blackout loss)', default=0.0, type=float) 47 | parser.add_argument('--in_do', dest='input_dropout', help='Input dropout (for SDA)', default=0.2, type=float) 48 | parser.add_argument('--do', dest='dropout', help='Dropout (for SDA)', default=0.5, type=float) 49 | parser.add_argument('--rf', help='Use rating features.', action='store_true') 50 | parser.add_argument('--mf', help='Use movie features.', action='store_true') 51 | parser.add_argument('--uf', help='Use users features.', action='store_true') 52 | parser.add_argument('--ns', help='Neighborhood size (for UKNN).', default=80, type=int) 53 | parser.add_argument('--pb', help='Popularity based (for RNNMargin).', action='store_true') 54 | parser.add_argument('--balance', help='Balance between false positive and false negative error (for RNNMargin).', default=1., type=float) 55 | parser.add_argument('--min_access', help='Estimation of minimum access probability (for RNNMargin).', default=0.05, type=float) 56 | parser.add_argument('--k_cf', help='Number of features for the CF factorization (for FPMC).', default=32, type=int) 57 | parser.add_argument('--k_mc', help='Number of features for the MC factorization (for FPMC).', default=32, type=int) 58 | parser.add_argument('--init_sigma', help='Sigma of the gaussian initialization (for FPMC)', default=1, type=float) 59 | parser.add_argument('--fpmc_bias', help='Sampling bias (for FPMC)', default=100., type=float) 60 | parser.add_argument('--no_adaptive_sampling', help='No adaptive sampling (for FPMC)', action='store_true') 61 | parser.add_argument('--cooling', help='Simulated annealing', default=1., type=float) 62 | parser.add_argument('--ltm_damping', help='Temporal damping (for LTM)', default=0.8, type=float) 63 | parser.add_argument('--ltm_window', help='Window for word2vec (for LTM)', default=5, type=int) 64 | parser.add_argument('--ltm_no_trajectory', help='Do not use users trajectory in LTM, just use word2vec', action='store_true') 65 | parser.add_argument('--max_length', help='Maximum length of sequences during training (for RNNs)', default=30, type=int) 66 | parser.add_argument('--repeated_interactions', help='The model can recommend items with which the user already interacted', action='store_true') 67 | parser.add_argument('--fism_alpha', help='Alpha parameter in FISM', default=0.2, type=float) 68 | parser.add_argument('--fossil_order', help='Order of the markov chains in Fossil', default=1, type=int) 69 | 70 | parser.add_argument('--c_sampling', help='Number of sample for the clustering loss. If unset, the same samples are used for the recommendation loss and for the clustering loss.', default=-1, type=int) 71 | parser.add_argument('--ignore_clusters', help='Don\'t use clusters during test. Useful to observe the influence of clustering', action='store_true') 72 | parser.add_argument('--clusters', help='Number of clusters. 
If unset, no clustering is used', default=-1, type=int) 73 | parser.add_argument('--init_scale', help='Initial scale of the softmax and sigmoid in the clustering method.', default=1., type=float) 74 | parser.add_argument('--scale_growing_rate', help='Rate of the geometric growth of the sigmoid/softmax scale in the clustering method.', default=1., type=float) 75 | parser.add_argument('--max_scale', help='Max scale of the softmax and sigmoid in the clustering method.', default=50, type=float) 76 | parser.add_argument('--csn', help='Cluster selection noise', default=0., type=float) 77 | parser.add_argument('--cluster_type', choices=['softmax', 'mix', 'sigmoid'], help='Type of clusters. Softmax puts every item in 1 and only 1 cluster. Sigmoid allow puts items in 0 to n clusters. Mix puts items in 1 to n clusters.', default='mix', type=str) 78 | 79 | update_manager_command_parser(parser) 80 | recurrent_layers_command_parser(parser) 81 | sequence_noise_command_parser(parser) 82 | target_selection_command_parser(parser) 83 | 84 | def get_predictor(args): 85 | args.layers = map(int, args.layers.split('-')) 86 | 87 | updater = get_update_manager(args) 88 | recurrent_layer = get_recurrent_layers(args) 89 | sequence_noise = get_sequence_noise(args) 90 | target_selection = get_target_selection(args) 91 | 92 | if args.method == "MF": 93 | return Factorization() 94 | elif args.method == "BPRMF": 95 | return BPRMF(k=args.hidden, reg = args.regularization, learning_rate = args.learning_rate, annealing=args.cooling, init_sigma = args.init_sigma, adaptive_sampling=(not args.no_adaptive_sampling), sampling_bias=args.fpmc_bias) 96 | elif args.method == "FISM": 97 | if args.clusters > 0: 98 | return FISMCluster(h=args.hidden, reg=args.regularization, alpha=args.fism_alpha, loss=args.loss, interactions_are_unique=(not args.repeated_interactions), predict_with_clusters=(not args.ignore_clusters), sampling_bias=args.sampling_bias, sampling=args.sampling, cluster_sampling=args.c_sampling, init_scale=args.init_scale, scale_growing_rate=args.scale_growing_rate, max_scale=args.max_scale, n_clusters=args.clusters, cluster_type=args.cluster_type, updater=updater, target_selection=target_selection, sequence_noise=sequence_noise, recurrent_layer=recurrent_layer, use_ratings_features=args.rf, use_movies_features=args.mf, use_users_features=args.uf, batch_size=args.batch_size) 99 | else: 100 | return FISM(k=args.hidden, reg = args.regularization, learning_rate = args.learning_rate, annealing=args.cooling, init_sigma = args.init_sigma, loss=args.loss, alpha=args.fism_alpha) 101 | elif args.method == "Fossil": 102 | return Fossil(k=args.hidden, order=args.fossil_order, reg = args.regularization, learning_rate = args.learning_rate, annealing=args.cooling, init_sigma = args.init_sigma, alpha=args.fism_alpha) 103 | elif args.method == "FPMC": 104 | return FPMC(k_cf = args.k_cf, k_mc = args.k_mc, reg = args.regularization, learning_rate = args.learning_rate, annealing=args.cooling, init_sigma = args.init_sigma, adaptive_sampling=(not args.no_adaptive_sampling), sampling_bias=args.fpmc_bias) 105 | elif args.method == "LTM": 106 | return LTM(k = args.hidden, alpha = args.ltm_damping, window = args.ltm_window, learning_rate=args.learning_rate, use_trajectory=(not args.ltm_no_trajectory)) 107 | elif args.method == "UKNN": 108 | return UserKNN(neighborhood_size=args.ns) 109 | elif args.method == "POP": 110 | return Pop() 111 | elif args.method == "MM": 112 | return MarkovModel() 113 | elif args.method == 'RNN': 114 | if args.clusters 
> 0: 115 | return RNNCluster(interactions_are_unique=(not args.repeated_interactions), max_length=args.max_length, cluster_selection_noise=args.csn, loss=args.loss, predict_with_clusters=(not args.ignore_clusters), sampling_bias=args.sampling_bias, sampling=args.sampling, cluster_sampling=args.c_sampling, init_scale=args.init_scale, scale_growing_rate=args.scale_growing_rate, max_scale=args.max_scale, n_clusters=args.clusters, cluster_type=args.cluster_type, updater=updater, target_selection=target_selection, sequence_noise=sequence_noise, recurrent_layer=recurrent_layer, use_ratings_features=args.rf, use_movies_features=args.mf, use_users_features=args.uf, batch_size=args.batch_size) 116 | elif args.loss == 'CCE': 117 | return RNNOneHot(interactions_are_unique=(not args.repeated_interactions), max_length=args.max_length, diversity_bias=args.diversity_bias, regularization=args.regularization, updater=updater, target_selection=target_selection, sequence_noise=sequence_noise, recurrent_layer=recurrent_layer, use_ratings_features=args.rf, use_movies_features=args.mf, use_users_features=args.uf, batch_size=args.batch_size) 118 | elif args.loss in ['hinge', 'logit', 'logsig']: 119 | return RNNMargin(interactions_are_unique=(not args.repeated_interactions), loss_function=args.loss, balance = args.balance, popularity_based = args.pb, min_access = args.min_access, target_selection=target_selection, sequence_noise=sequence_noise, recurrent_layer=recurrent_layer, max_length=args.max_length, updater=updater, use_ratings_features=args.rf, use_movies_features=args.mf, use_users_features=args.uf, batch_size=args.batch_size) 120 | elif args.loss in ['BPR', 'TOP1', 'Blackout']: 121 | return RNNSampling(interactions_are_unique=(not args.repeated_interactions), loss_function=args.loss, diversity_bias=args.diversity_bias, sampling=args.sampling, sampling_bias=args.sampling_bias, recurrent_layer=recurrent_layer, max_length=args.max_length, updater=updater, target_selection=target_selection, sequence_noise=sequence_noise, use_ratings_features=args.rf, use_movies_features=args.mf, use_users_features=args.uf, batch_size=args.batch_size) 122 | else: 123 | raise ValueError('Unknown loss for the RNN model') 124 | elif args.method == "SDA": 125 | return StackedDenoisingAutoencoder(interactions_are_unique=(not args.repeated_interactions), layers = args.layers, input_dropout=args.input_dropout, dropout=args.dropout, updater=updater, batch_size=args.batch_size, use_ratings_features=args.rf) 126 | 127 | 128 | -------------------------------------------------------------------------------- /word2vec/ltm.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import numpy as np 4 | import math 5 | import random 6 | import re 7 | import os 8 | import glob 9 | import sys 10 | from time import time 11 | from gensim.models.word2vec import Word2Vec 12 | from helpers import evaluation 13 | 14 | 15 | class LTM(object): 16 | """ Implementation of the algorithm proposed in "Latent Trajectory Modeling : A Light and Efficient Way to Introduce Time in Recommender Systems" by Guardia-Sebaoun, E. et al., 2015. 17 | """ 18 | def __init__(self, use_trajectory=True, alpha=0.8, k = 32, window = 5, learning_rate=0.025): 19 | ''' 20 | 21 | parameters 22 | ---------- 23 | use_trajectory: boolean 24 | If True, the original LTM algorithm is used. 
25 | Else the users features are not computed, and the predictions are made only by taking the items with the closest word2vec representation from the (window/2) last item in the sequence. 26 | alpha: float in (0,1) 27 | temporal damping parameter from "Apprentissage de trajectoires temporelles pour la recommandation dans les communautes d'utilisateurs", by Guardia-Sebaoun, E. et al. 28 | k : int > 0 29 | number of dimension for the word2vec embedding 30 | window : int > 0 31 | window size for the word2vec embedding 32 | learning_rate: float 33 | initial learning rate for word2vec. (alpha parameter in the gensim implementation of word2vec) 34 | ''' 35 | super(LTM, self).__init__() 36 | self.use_trajectory = use_trajectory 37 | self.alpha = alpha 38 | self.k = k 39 | self.window = window 40 | self.learning_rate = learning_rate 41 | 42 | self.name = 'Latent Trajectory Modeling' 43 | self.max_length = np.inf # For compatibility with the RNNs 44 | 45 | self.metrics = {'recall': {'direction': 1}, 46 | 'sps': {'direction': 1}, 47 | 'user_coverage' : {'direction': 1}, 48 | 'item_coverage' : {'direction': 1}, 49 | 'ndcg' : {'direction': 1}, 50 | 'blockbuster_share' : {'direction': -1} 51 | } 52 | 53 | 54 | def _get_model_filename(self, epochs): 55 | '''Return the name of the file to save the current model 56 | ''' 57 | filename = "ltm_ne"+str(epochs)+"_lr"+str(self.learning_rate)+"_k"+str(self.k)+"_w"+str(self.window) 58 | if self.use_trajectory: 59 | filename += "_ut"+str(self.alpha) 60 | return filename 61 | 62 | def user_features(self, sequence): 63 | '''Compute the transition features of the users based on his sequence of items. 64 | ''' 65 | features = np.zeros(self.k) 66 | for i in range(1,len(sequence)): 67 | features = self.alpha * features + (1 - self.alpha) * (self.w2v_model[str(sequence[i][0])] - self.w2v_model[str(sequence[i-1][0])]) 68 | 69 | return features 70 | 71 | def prepare_model(self, dataset): 72 | ''' For compatibility with other methods. 
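Nothing is precomputed here: the word2vec vocabulary and embeddings are built in train().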
73 | ''' 74 | pass 75 | 76 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 77 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 78 | ''' 79 | 80 | if exclude is None: 81 | exclude = [] 82 | 83 | if self.use_trajectory: 84 | f = self.user_features(sequence) 85 | else: 86 | f = np.mean(np.array([self.w2v_model[str(sequence[-i-1][0])] for i in range(self.window//2)]), axis=0) # average over last window/2 items 87 | 88 | top = self.w2v_model.similar_by_vector(f, topn=k+len(sequence)+len(exclude)) 89 | top = [int(i[0]) for i in top if int(i[0]) not in exclude] 90 | s = [i[0] for i in sequence] 91 | top = [i for i in top if i not in s] 92 | return top[:k] 93 | 94 | # # f = f / np.sqrt(np.sum(np.square(f))) 95 | # # dist = np.dot(self.w2v_model.syn0, f) 96 | # dist = -np.dot(self.w2v_model.syn0, f) / np.sum(np.square(self.w2v_model.syn0), axis=-1) 97 | # # dist = np.sum(np.square(self.w2v_model.syn0 - f), axis=-1) 98 | 99 | # # Put low similarity to viewed items to exclude them from recommendations 100 | # dist[[self.w2v_model.vocab[str(i)].index for i in exclude]] = np.inf 101 | # dist[[self.w2v_model.vocab[str(i[0])].index for i in sequence]] = np.inf 102 | 103 | # # find top k according to dist 104 | # return [int(self.w2v_model.index2word[i]) for i in list(np.argpartition(dist, range(k))[:k])] 105 | 106 | def word2vec_training_generator(self, dataset): 107 | '''Take a generator of sequences and produce a generator in the format used by gensim word2vec module 108 | ''' 109 | for sequence, user_id in dataset.training_set(epochs=1): 110 | yield [str(i[0]) for i in sequence] 111 | 112 | def set_dataset(self, dataset): 113 | self.dataset = dataset 114 | 115 | def train(self, dataset, 116 | max_time=np.inf, 117 | progress=2.0, 118 | time_based_progress=False, 119 | autosave='All', 120 | save_dir='', 121 | min_iterations=0, 122 | max_iter=np.inf, 123 | max_progress_interval=np.inf, 124 | load_last_model=False, 125 | early_stopping=None, 126 | validation_metrics=['sps']): 127 | '''Train the model based on the sequence given by the training_set 128 | 129 | !!!! Contrary to what the train function of other algorithms, here an iteration is equivalent to one epoch !!!!!!! 130 | 131 | max_time is used to set the maximumn amount of time (in seconds) that the training can last before being stop. 132 | By default, max_time=np.inf, which means that the training will last until the training_set runs out, or the user interrupt the program. 133 | 134 | progress is used to set when progress information should be printed during training. It can be either an int or a float: 135 | integer : print at linear intervals specified by the value of progress (i.e. : progress, 2*progress, 3*progress, ...) 136 | float : print at geometric intervals specified by the value of progress (i.e. : progress, progress^2, progress^3, ...) 137 | 138 | max_progress_interval can be used to have geometric intervals in the begining then switch to linear intervals. 139 | It ensures, independently of the progress parameter, that progress is shown at least every max_progress_interval. 140 | 141 | time_based_progress is used to choose between using number of iterations or time as a progress indicator. True means time (in seconds) is used, False means number of iterations. 142 | 143 | autosave is used to set whether the model should be saved during training. It can take several values: 144 | All : the model will be saved each time progress info is printed. 
145 | Best : save only the best model so far 146 | None : does not save 147 | 148 | min_iterations is used to set a minimum of iterations before printing the first information (and saving the model). 149 | 150 | save_dir is the path to the directory where models are saved. 151 | 152 | load_last_model: if true, find the latest model in the directory where models should be saved, and load it before starting training. 153 | 154 | early_stopping : should be a callable that will recieve the list of validation error and the corresponding epochs and return a boolean indicating whether the learning should stop. 155 | ''' 156 | 157 | self.set_dataset(dataset) 158 | 159 | if len(set(validation_metrics) & set(self.metrics.keys())) < len(validation_metrics): 160 | raise ValueError('Incorrect validation metrics. Metrics must be chosen among: ' + ', '.join(self.metrics.keys())) 161 | 162 | # Load last model if needed, else initialise the model 163 | iterations = 0 164 | epochs_offset = 0 165 | if load_last_model: 166 | epochs_offset = self.load_last(save_dir) 167 | if not hasattr(self, 'w2v_model'): 168 | self.w2v_model = Word2Vec(iter = 1, min_count = 1, size=self.k, window=self.window, alpha=self.learning_rate, sg=0) 169 | self.w2v_model.build_vocab([map(str, range(dataset.n_items))]) 170 | 171 | start_time = time() 172 | next_save = int(progress) 173 | epochs = [] 174 | metrics = {name:[] for name in self.metrics.keys()} 175 | filename = {} 176 | 177 | while (time() - start_time < max_time and iterations < max_iter): 178 | 179 | # Train one epoch 180 | self.w2v_model.train(self.word2vec_training_generator(dataset)) 181 | 182 | # Check if it is time to save the model 183 | iterations += 1 184 | 185 | if time_based_progress: 186 | progress_indicator = int(time() - start_time) 187 | else: 188 | progress_indicator = iterations 189 | 190 | if progress_indicator >= next_save: 191 | 192 | if progress_indicator >= min_iterations: 193 | 194 | # Save current epoch 195 | epochs.append(epochs_offset + iterations) 196 | 197 | # Compute validation cost 198 | metrics = self._compute_validation_metrics(metrics) 199 | 200 | # Print info 201 | self._print_progress(iterations, epochs[-1], start_time, metrics, validation_metrics) 202 | 203 | # Save model 204 | run_nb = len(metrics[self.metrics.keys()[0]])-1 205 | if autosave == 'All': 206 | filename[run_nb] = save_dir + self._get_model_filename(round(epochs[-1], 3)) 207 | self.save(filename[run_nb]) 208 | elif autosave == 'Best': 209 | pareto_runs = self.get_pareto_front(metrics, validation_metrics) 210 | if run_nb in pareto_runs: 211 | filename[run_nb] = save_dir + self._get_model_filename(round(epochs[-1], 3)) 212 | self.save(filename[run_nb]) 213 | to_delete = [r for r in filename if r not in pareto_runs] 214 | for run in to_delete: 215 | try: 216 | os.remove(filename[run]) 217 | except OSError: 218 | print('Warning : Previous model could not be deleted') 219 | del filename[run] 220 | 221 | if early_stopping is not None: 222 | # Stop if early stopping is triggered for all the validation metrics 223 | if all([early_stopping(epochs, metrics[m]) for m in validation_metrics]): 224 | break 225 | 226 | 227 | # Compute next checkpoint 228 | if isinstance(progress, int): 229 | next_save += min(progress, max_progress_interval) 230 | else: 231 | next_save += min(max_progress_interval, next_save * (progress - 1)) 232 | 233 | def get_pareto_front(self, metrics, metrics_names): 234 | costs = np.zeros((len(metrics[metrics_names[0]]), len(metrics_names))) 235 | for i, m in 
enumerate(metrics_names): 236 | costs[:, i] = np.array(metrics[m]) * self.metrics[m]['direction'] 237 | is_efficient = np.ones(costs.shape[0], dtype = bool) 238 | for i, c in enumerate(costs): 239 | if is_efficient[i]: 240 | is_efficient[is_efficient] = np.any(costs[is_efficient]>=c, axis=1) 241 | return np.where(is_efficient)[0].tolist() 242 | 243 | def _compute_validation_metrics(self, metrics): 244 | 245 | ev = evaluation.Evaluator(self.dataset, k=10) 246 | for sequence, user_id in self.dataset.validation_set(epochs=1): 247 | top_k = self.top_k_recommendations(sequence[:len(sequence)//2], user_id=int(user_id)) 248 | goal = [i[0] for i in sequence[len(sequence)//2:]] 249 | ev.add_instance(goal, top_k) 250 | 251 | metrics['recall'].append(ev.average_recall()) 252 | metrics['sps'].append(ev.sps()) 253 | metrics['ndcg'].append(ev.average_ndcg()) 254 | metrics['user_coverage'].append(ev.user_coverage()) 255 | metrics['item_coverage'].append(ev.item_coverage()) 256 | metrics['blockbuster_share'].append(ev.blockbuster_share()) 257 | 258 | return metrics 259 | 260 | def _print_progress(self, iterations, epochs, start_time, metrics, validation_metrics): 261 | '''Print learning progress in terminal 262 | ''' 263 | print(self.name, iterations, "batchs, ", epochs, " epochs in", time() - start_time, "s") 264 | for m in self.metrics: 265 | print(m, ': ', metrics[m][-1]) 266 | if m in validation_metrics: 267 | print('Best ', m, ': ', max(np.array(metrics[m])*self.metrics[m]['direction'])*self.metrics[m]['direction']) 268 | print('-----------------') 269 | 270 | # Print on stderr for easier recording of progress 271 | print(iterations, epochs, time() - start_time, 'n/a', ' '.join(map(str, [metrics[m][-1] for m in self.metrics])), file=sys.stderr) 272 | 273 | def save(self, filename): 274 | '''Save the word2vec object into a file 275 | ''' 276 | print('Save model in ' + filename) 277 | self.w2v_model.save(filename) 278 | 279 | def load_last(self, save_dir): 280 | '''Load last model from dir 281 | ''' 282 | def extract_number_of_epochs(filename): 283 | m = re.search('_ne([0-9]+(\.[0-9]+)?)_', filename) 284 | return float(m.group(1)) 285 | 286 | # Get all the models for this RNN 287 | file = save_dir + self._get_model_filename("*") 288 | file = np.array(glob.glob(file)) 289 | 290 | if len(file) == 0: 291 | print('No previous model, starting from scratch') 292 | return 0 293 | 294 | # Find last model and load it 295 | last_batch = np.amax(np.array(map(extract_number_of_epochs, file))) 296 | last_model = save_dir + self._get_model_filename(last_batch) 297 | print('Starting from model ' + last_model) 298 | self.load(last_model) 299 | 300 | return last_batch 301 | 302 | 303 | def load(self, filename): 304 | '''Load parameters values form a file 305 | ''' 306 | self.w2v_model = Word2Vec.load(filename) -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import random 6 | import argparse 7 | import os 8 | import sys 9 | from shutil import copyfile 10 | 11 | def command_parser(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-f', dest='filename', help='Input file', required=True, type=str) 14 | parser.add_argument('--columns', help='Order of the columns in the file (eg: "uirt"), u for user, i for item, t for timestamp, r for rating. 
If r is not present a default rating of 1 is given to all interaction. If t is not present interactions are assumed to be in chronological order. Extra columns are ignored. Default: uit', default="uit", type=str) 15 | parser.add_argument('--sep', help='Separator between the column. If unspecified pandas will try to guess the separator', default="\s+", type=str) 16 | parser.add_argument('--min_user_activity', help='Users with less interactions than this will be removed from the dataset. Default: 2', default=2, type=int) 17 | parser.add_argument('--min_item_pop', help='Items with less interactions than this will be removed from the dataset. Default: 5', default=5, type=int) 18 | parser.add_argument('--val_size', help='Number of users to put in the validation set. If in (0,1) it will be interpreted as the fraction of total number of users. Default: 0.1', default=0.1, type=float) 19 | parser.add_argument('--test_size', help='Number of users to put in the test set. If in (0,1) it will be interpreted as the fraction of total number of users. Default: 0.1', default=0.1, type=float) 20 | parser.add_argument('--seed', help='Seed for the random train/val/test split', default=1, type=int) 21 | 22 | args = parser.parse_args() 23 | args.dirname = os.path.dirname(os.path.abspath(args.filename)) + "/" 24 | return args 25 | 26 | def warn_user(dirname): 27 | '''Ask user if he's sure to create files in that directory. 28 | ''' 29 | print('This program will create a lot of files and directories in ' + dirname) 30 | answer = raw_input('Are you sure that you want to do that ? [y/n]') 31 | if answer != "y": 32 | sys.exit(0) 33 | 34 | def create_dirs(dirname): 35 | if not os.path.exists(dirname + "data"): 36 | os.makedirs(dirname + "data") 37 | 38 | if not os.path.exists(dirname + "models"): 39 | os.makedirs(dirname + "models") 40 | 41 | if not os.path.exists(dirname + "results"): 42 | os.makedirs(dirname + "results") 43 | 44 | def load_data(filename, columns, separator): 45 | ''' Load the data from filename and sort it according to timestamp. 46 | Returns a dataframe with 3 columns: user_id, item_id, rating 47 | ''' 48 | 49 | print('Load data...') 50 | data = pd.read_csv(filename, sep=separator, names=list(columns), index_col=False, usecols=range(len(columns))) 51 | 52 | if 'r' not in columns: 53 | # Add a column of default ratings 54 | data['r'] = 1 55 | 56 | if 't' in columns: 57 | # sort according to the timestamp column 58 | if data['t'].dtype == np.int64: # probably a timestamp 59 | data['t'] = pd.to_datetime(data['t'], unit='s') 60 | else: 61 | data['t'] = pd.to_datetime(data['t']) 62 | print('Sort data in chronological order...') 63 | data.sort_values('t', inplace=True) 64 | 65 | return data 66 | 67 | def remove_rare_elements(data, min_user_activity, min_item_popularity): 68 | '''Removes user and items that appears in too few interactions. 69 | min_user_activity is the minimum number of interaction that a user should have. 70 | min_item_popularity is the minimum number of interaction that an item should have. 71 | NB: the constraint on item might not be strictly satisfied because rare users and items are removed in alternance, 72 | and the last removal of inactive users might create new rare items. 
73 | ''' 74 | 75 | print('Remove inactive users and rare items...') 76 | 77 | #Remove inactive users a first time 78 | user_activity = data.groupby('u').size() 79 | data = data[np.in1d(data.u, user_activity[user_activity >= min_user_activity].index)] 80 | #Remove unpopular items 81 | item_popularity = data.groupby('i').size() 82 | data = data[np.in1d(data.i, item_popularity[item_popularity >= min_item_popularity].index)] 83 | #Remove users that might have passed below the activity threshold due to the removal of rare items 84 | user_activity = data.groupby('u').size() 85 | data = data[np.in1d(data.u, user_activity[user_activity >= min_user_activity].index)] 86 | 87 | return data 88 | 89 | def save_index_mapping(data, separator, dirname): 90 | ''' Save the mapping of original user and item ids to numerical consecutive ids in dirname. 91 | NB: some users and items might have been removed in previous steps and will therefore not appear in the mapping. 92 | ''' 93 | 94 | separator = "\t" 95 | 96 | 97 | # Pandas categorical type will create the numerical ids we want 98 | print('Map original users and items ids to consecutive numerical ids...') 99 | data['u_original'] = data['u'].astype('category') 100 | data['i_original'] = data['i'].astype('category') 101 | data['u'] = data['u_original'].cat.codes 102 | data['i'] = data['i_original'].cat.codes 103 | 104 | print('Save ids mapping to file...') 105 | user_mapping = pd.DataFrame({'original_id' : data['u_original'], 'new_id': data['u']}) 106 | user_mapping.sort_values('original_id', inplace=True) 107 | user_mapping.drop_duplicates(subset='original_id', inplace=True) 108 | user_mapping.to_csv(dirname+"data/user_id_mapping", sep=separator, index=False) 109 | 110 | item_mapping = pd.DataFrame({'original_id' : data['i_original'], 'new_id': data['i']}) 111 | item_mapping.sort_values('original_id', inplace=True) 112 | item_mapping.drop_duplicates(subset='original_id', inplace=True) 113 | item_mapping.to_csv(dirname+"data/item_id_mapping", sep=separator, index=False) 114 | 115 | return data 116 | 117 | def split_data(data, nb_val_users, nb_test_users, dirname): 118 | '''Splits the data set into training, validation and test sets. 119 | Each user is in one and only one set. 120 | nb_val_users is the number of users to put in the validation set. 121 | nb_test_users is the number of users to put in the test set. 
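If nb_val_users or nb_test_users is in (0,1), it is interpreted as a fraction of the total number of users.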
122 | ''' 123 | nb_users = data['u'].nunique() 124 | 125 | # check if nb_val_user is specified as a fraction 126 | if nb_val_users < 1: 127 | nb_val_users = round(nb_val_users * nb_users) 128 | if nb_test_users < 1: 129 | nb_test_users = round(nb_test_users * nb_users) 130 | nb_test_users = int(nb_test_users) 131 | nb_val_users = int(nb_val_users) 132 | 133 | if nb_users <= nb_val_users+nb_test_users: 134 | raise ValueError('Not enough users in the dataset: choose less users for validation and test splits') 135 | 136 | def extract_n_users(df, n): 137 | users_ids = np.random.choice(df['u'].unique(), n) 138 | n_set = df[df['u'].isin(users_ids)] 139 | remain_set = df.drop(n_set.index) 140 | return n_set, remain_set 141 | 142 | print('Split data into training, validation and test sets...') 143 | test_set, tmp_set = extract_n_users(data, nb_test_users) 144 | val_set, train_set = extract_n_users(tmp_set, nb_val_users) 145 | 146 | print('Save training, validation and test sets in the triplets format...') 147 | train_set.to_csv(dirname + "data/train_set_triplets", sep="\t", columns=['u', 'i', 'r'], index=False, header=False) 148 | val_set.to_csv(dirname + "data/val_set_triplets", sep="\t", columns=['u', 'i', 'r'], index=False, header=False) 149 | test_set.to_csv(dirname + "data/test_set_triplets", sep="\t", columns=['u', 'i', 'r'], index=False, header=False) 150 | 151 | return train_set, val_set, test_set 152 | 153 | def gen_sequences(data, half=False): 154 | '''Generates sequences of user actions from data. 155 | each sequence has the format [user_id, first_item_id, first_item_rating, 2nd_item_id, 2nd_item_rating, ...]. 156 | If half is True, cut the sequences to half their true length (useful to produce the extended training set). 157 | ''' 158 | data = data.sort_values('u', kind="mergesort") # Mergesort is stable and keeps the time ordering 159 | seq = [] 160 | prev_id = -1 161 | for u, i, r in zip(data['u'], data['i'], data['r']): 162 | if u != prev_id: 163 | if len(seq) > 3: 164 | if half: 165 | seq = seq[:1+2*int((len(seq) - 1)/4)] 166 | yield seq 167 | prev_id = u 168 | seq = [u] 169 | seq.extend([i,r]) 170 | if half: 171 | seq = seq[:1+2*int((len(seq) - 1)/4)] 172 | yield seq 173 | 174 | def make_sequence_format(train_set, val_set, test_set, dirname): 175 | '''Convert the train/validation/test sets in the sequence format and save them. 176 | Also create the extended training sequences, which countains the first half of the sequences of users in the validation and test sets. 
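The extended training set is saved as "data/train_set_sequences+", next to the other sequence files.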
177 | ''' 178 | 179 | print('Save the training set in the sequences format...') 180 | with open(dirname+"data/train_set_sequences", "w") as f: 181 | for s in gen_sequences(train_set): 182 | f.write(' '.join(map(str, s)) + "\n") 183 | 184 | print('Save the validation set in the sequences format...') 185 | with open(dirname+"data/val_set_sequences", "w") as f: 186 | for s in gen_sequences(val_set): 187 | f.write(' '.join(map(str, s)) + "\n") 188 | 189 | print('Save the test set in the sequences format...') 190 | with open(dirname+"data/test_set_sequences", "w") as f: 191 | for s in gen_sequences(test_set): 192 | f.write(' '.join(map(str, s)) + "\n") 193 | 194 | # sequences+ contains all the sequences of train_set_sequences plus half the sequences of val and test sets 195 | print('Save the extended training set in the sequences format...') 196 | copyfile(dirname+"data/train_set_sequences", dirname+"data/train_set_sequences+") 197 | with open(dirname+"data/train_set_sequences+", "a") as f: 198 | for s in gen_sequences(val_set, half=True): 199 | f.write(' '.join(map(str, s)) + "\n") 200 | for s in gen_sequences(test_set, half=True): 201 | f.write(' '.join(map(str, s)) + "\n") 202 | 203 | def save_data_stats(data, train_set, val_set, test_set, dirname): 204 | print('Save stats...') 205 | 206 | def _get_stats(df): 207 | return "\t".join(map(str, [df['u'].nunique(), df['i'].nunique(), len(df.index), df.groupby('u').size().max()])) 208 | 209 | with open(dirname+"data/stats", "w") as f: 210 | f.write("set\tn_users\tn_items\tn_interactions\tlongest_sequence\n") 211 | f.write("Full\t"+ _get_stats(data) + "\n") 212 | f.write("Train\t"+ _get_stats(train_set) + "\n") 213 | f.write("Val\t"+ _get_stats(val_set) + "\n") 214 | f.write("Test\t"+ _get_stats(test_set) + "\n") 215 | 216 | def make_readme(dirname, val_set, test_set): 217 | data_readme = '''The following files were automatically generated by preprocess.py 218 | 219 | user_id_mapping 220 | mapping between the users ids in the original dataset and the new users ids. 221 | the first column contains the new id and the second the original id. 222 | Inactive users might have been deleted from the original, and they will therefore not appear in the id mapping. 223 | 224 | item_id_mapping 225 | Idem for item ids. 226 | 227 | train_set_triplets 228 | Training set in the triplets format. 229 | Each line is a user item interaction in the form (user_id, item_id, rating). 230 | Interactions are listed in chronological order. 231 | 232 | train_set_sequences 233 | Training set in the sequence format. 234 | Each line contains all the interactions of a user in the form (user_id, first_item_id, first_rating, 2nd_item_id, 2nd_rating, ...). 235 | 236 | train_set_sequences+ 237 | Extended training set in the sequence format. 238 | The extended training set contains all the training set plus the first half of the interactions of each users in the validation and testing set. 239 | 240 | val_set_triplets 241 | Validation set in the triplets format 242 | 243 | val_set_triplets 244 | Validation set in the sequence format 245 | 246 | test_set_triplets 247 | Test set in the triplets format 248 | 249 | test_set_triplets 250 | Test set in the sequence format 251 | 252 | stats 253 | Contains some informations about the dataset. 254 | 255 | The training, validation and test sets are obtain by randomly partitioning the users and all their interactions into 3 sets. 256 | The validation set contains {n_val} users, the test_set {n_test} users and the train set all the other users. 
257 | 258 | '''.format(n_val=str(val_set['u'].nunique()), n_test=str(test_set['u'].nunique())) 259 | 260 | results_readme = '''The format of the results file is the following 261 | Each line correspond to one model, with the fields being: 262 | Number of epochs 263 | precision 264 | sps 265 | user coverage 266 | number of unique items in the test set 267 | number of unique items in the recommendations 268 | number of unique items in the succesful recommendations 269 | number of unique items in the short-term test set (when the goal is to predict precisely the next item) 270 | number of unique items in the successful short-term recommendations 271 | recall 272 | NDCG 273 | NB: all the metrics are computed "@10" 274 | ''' 275 | 276 | with open(dirname+"data/README", "w") as f: 277 | f.write(data_readme) 278 | with open(dirname+"results/README", "w") as f: 279 | f.write(results_readme) 280 | 281 | def main(): 282 | 283 | args = command_parser() 284 | np.random.seed(seed=args.seed) 285 | warn_user(args.dirname) 286 | create_dirs(args.dirname) 287 | data = load_data(args.filename, args.columns, args.sep) 288 | data = remove_rare_elements(data, args.min_user_activity, args.min_item_pop) 289 | data = save_index_mapping(data, args.sep, args.dirname) 290 | train_set, val_set, test_set = split_data(data, args.val_size, args.test_size, args.dirname) 291 | make_sequence_format(train_set, val_set, test_set, args.dirname) 292 | save_data_stats(data, train_set, val_set, test_set, args.dirname) 293 | make_readme(args.dirname, val_set, test_set) 294 | 295 | print('Data ready!') 296 | 297 | print(data.head(10)) 298 | 299 | if __name__ == '__main__': 300 | main() -------------------------------------------------------------------------------- /neural_networks/fism_cluster.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import scipy.sparse as sp 7 | import theano.sparse 8 | import lasagne 9 | import cPickle 10 | import os 11 | import sys 12 | import random 13 | from time import time 14 | from rnn_cluster import RNNCluster 15 | from sparse_lstm import * 16 | from helpers import evaluation 17 | from helpers.sparse_layer import SparseLayer 18 | 19 | class FISMCluster(RNNCluster): 20 | """FISMCluster combines FISM with item clustering. 21 | 22 | Parameters 23 | ---------- 24 | 25 | h: int 26 | Size of the embedding. 27 | 28 | alpha: float 29 | Exponant of the normalization term in FISM 30 | 31 | reg: float 32 | Regularization coefficient. If reg > 0, L2 regularization is used, otherwise L1 regularization is used with coef -reg. 33 | 34 | FISMCluster is built on top of RNNCluster, all the parameters associated to the clustering are described in RNNCluster. 
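Note: the max_length argument is accepted for compatibility with the RNN models but is not used; FISM always builds the user representation from the full interaction history.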
35 | """ 36 | def __init__(self, h=100, alpha=0.5, reg=0.00025, max_length=np.inf, **kwargs): 37 | super(FISMCluster, self).__init__(max_length=np.inf, **kwargs) 38 | 39 | self.n_hidden = h 40 | self.alpha = alpha 41 | self.reg = reg 42 | self.target_selection.shuffle = True 43 | self.name = "FISM Cluster with categorical cross entropy" 44 | self.recurrent_layer.name = "" 45 | 46 | def _get_model_filename(self, epochs): 47 | '''Return the name of the file to save the current model 48 | ''' 49 | filename = "fism_clusters"+str(self.n_clusters)+"_sc"+str(self.init_scale) 50 | 51 | if self.scale_growing_rate != 1.: 52 | filename += "-"+str(self.scale_growing_rate)+"-"+str(self.max_scale) 53 | 54 | filename += "_h"+ str(self.n_hidden) + "_a" + str(self.alpha) +"_" 55 | if self.sampling_bias > 0.: 56 | filename += "p" + str(self.sampling_bias) 57 | filename += "s"+str(self.n_samples) 58 | 59 | if self.n_cluster_samples > 0: 60 | filename += "_" 61 | if self.sampling_bias > 0.: 62 | filename += "p" + str(self.sampling_bias) 63 | filename += "cs"+str(self.n_cluster_samples) 64 | 65 | if self.cluster_type == 'softmax': 66 | filename += "_softmax" 67 | elif self.cluster_type == 'mix': 68 | filename += "_mix" 69 | 70 | if self.cluster_selection_noise > 0.: 71 | filename += '_n' + str(self.cluster_selection_noise) 72 | 73 | if self.reg != 0.: 74 | filename += '_r' + str(self.reg) 75 | 76 | filename += "_c" + self.loss 77 | 78 | return filename+"_"+self._common_filename(epochs) 79 | 80 | def _prepare_networks(self, n_items): 81 | ''' Prepares the building blocks of the RNN, but does not compile them: 82 | self.l_in : input layer 83 | self.l_mask : mask of the input layer 84 | self.target : target of the network 85 | self.l_out : output of the network 86 | self.cost : cost function 87 | ''' 88 | 89 | self.n_items = n_items 90 | 91 | # Theano tensor for the targets 92 | input_var = theano.sparse.csr_matrix('input_var') 93 | self.target = T.ivector('target_output') 94 | self.exclude = T.fmatrix('excluded_items') 95 | self.samples = T.ivector('samples') 96 | self.cluster_samples = T.ivector('cluster_samples') 97 | 98 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 99 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.n_items), input_var=input_var) 100 | 101 | l_user_rep = SparseLayer(self.l_in, num_units=self.n_hidden, nonlinearity=None, b=None) 102 | 103 | self.user_representation_layer = l_user_rep 104 | 105 | # The sliced output is then passed through linear layer to obtain the right output size 106 | self.l_out = BlackoutLayer(l_user_rep, num_units=self.n_items, num_outputs=self.n_samples, nonlinearity=None, W=lasagne.init.GlorotUniform()) 107 | 108 | # lasagne.layers.get_output produces a variable for the output of the net 109 | network_output = lasagne.layers.get_output(self.l_out, targets = self.target, samples=self.samples) 110 | 111 | # loss function 112 | self.cost = self._loss(network_output,self.batch_size).mean() 113 | if self.reg > 0.: 114 | self.cost += self.reg * lasagne.regularization.regularize_network_params(self.l_out, lasagne.regularization.l2) 115 | elif self.reg < 0.: 116 | self.cost -= self.reg * lasagne.regularization.regularize_network_params(self.l_out, lasagne.regularization.l1) 117 | 118 | 119 | # Cluster learning 120 | self.T_scale = theano.shared(self.effective_scale) 121 | scaled_softmax = lambda x: lasagne.nonlinearities.softmax(x*self.T_scale) 122 | 123 | self.cluster_selection_layer = 
lasagne.layers.DenseLayer(l_user_rep, b=None, num_units=self.n_clusters, nonlinearity=None) 124 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer) 125 | if self.cluster_selection_noise > 0.: 126 | cluster_selection = cluster_selection + self._srng.normal(cluster_selection.shape, avg=0.0, std=self.cluster_selection_noise) 127 | cluster_selection = scaled_softmax(cluster_selection) 128 | 129 | self.cluster_repartition = theano.shared((0.1 * np.random.randn(self.n_items, self.n_clusters)).astype(theano.config.floatX)) 130 | if self.cluster_type == 'softmax': 131 | target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 132 | elif self.cluster_type == 'mix': 133 | target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) + \ 134 | T.nnet.sigmoid(self.T_scale*self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 135 | else: 136 | target_and_samples_clusters = T.nnet.sigmoid(self.T_scale*self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 137 | cluster_score = cluster_selection.dot(target_and_samples_clusters.T) 138 | 139 | self.cost_clusters = self._loss(cluster_score, self.batch_size).mean() 140 | 141 | def _compile_train_function(self): 142 | ''' Compile self.train. 143 | self.train recieves a sequence and a target for every steps of the sequence, 144 | compute error on every steps, update parameter and return global cost (i.e. the error). 145 | ''' 146 | print("Compiling train...") 147 | # Compute AdaGrad updates for training 148 | all_params = lasagne.layers.get_all_params(self.l_out, trainable=True) 149 | updates = self.updater(self.cost, all_params) 150 | 151 | params_clusters = self.cluster_selection_layer.get_params(trainable=True) 152 | params_clusters.append(self.cluster_repartition) 153 | updates.update(self.updater(self.cost_clusters, params_clusters)) 154 | # Compile network 155 | self.train_function = theano.function([self.l_in.input_var, self.target, self.samples, self.cluster_samples, self.exclude], self.cost, updates=updates, allow_input_downcast=True, name="Train_function", on_unused_input='ignore') 156 | print("Compilation done.") 157 | 158 | def _get_hard_clusters(self): 159 | if self.cluster_type == 'softmax': 160 | return lasagne.nonlinearities.softmax(100. * self.cluster_repartition) 161 | elif self.cluster_type == 'mix': 162 | # Clipping is used to avoid the sum of sigmoid and softmax to produce a cluster indicator of 2 163 | return (lasagne.nonlinearities.softmax(100. * self.cluster_repartition) + T.nnet.sigmoid(100. * self.cluster_repartition)).clip(0,1) 164 | else: 165 | return T.nnet.sigmoid(100. 
* self.cluster_repartition) 166 | 167 | def _compile_predict_function(self): 168 | ''' Compile self.predict, the deterministic rnn that output the prediction at the end of the sequence 169 | ''' 170 | print("Compiling predict...") 171 | if self.predict_with_clusters: 172 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer, deterministic=True)[0, :].argmax() 173 | user_representation = lasagne.layers.get_output(self.user_representation_layer, deterministic=True) 174 | theano_predict_function = theano.function([self.l_in.input_var], [user_representation, cluster_selection], allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') 175 | 176 | def cluster_predict_function(sequence, k, exclude): 177 | u, c = theano_predict_function(sequence) 178 | c = int(c) 179 | scores = u[0].dot(self.clusters_embeddings[c]) + self.clusters_bias[c] 180 | 181 | cluster_index_exclude = [] 182 | for i in exclude: 183 | if i in self.clusters_reverse_index[c]: 184 | cluster_index_exclude.append(self.clusters_reverse_index[c][i]) 185 | scores[cluster_index_exclude] = -np.inf 186 | 187 | # find top k according to output 188 | effective_k = min(k, len(self.clusters[c])) 189 | return list(self.clusters[c][np.argpartition(-scores, range(effective_k))[:effective_k]]), len(self.clusters[c]) 190 | 191 | self.predict_function = cluster_predict_function 192 | else: 193 | items_score = lasagne.nonlinearities.softmax(lasagne.layers.get_output(self.l_out, deterministic=True)) 194 | 195 | user_representation = lasagne.layers.get_output(self.user_representation_layer, deterministic=True) 196 | theano_predict_function = theano.function([self.l_in.input_var], user_representation, allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') 197 | 198 | def no_cluster_predict_function(sequence, k, exclude): 199 | u = theano_predict_function(sequence) 200 | scores = u[0].dot(self.l_out.W.get_value(borrow=True)) + self.l_out.b.get_value(borrow=True) 201 | 202 | scores[exclude] = -np.inf 203 | 204 | # find top k according to output 205 | return list(np.argpartition(-scores, range(k))[:k]), self.n_items 206 | 207 | # theano_predict_function = theano.function([self.l_in.input_var], items_score, allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') 208 | 209 | # def no_cluster_predict_function(sequence, k, exclude): 210 | # scores = theano_predict_function(sequence)[0] 211 | # scores[exclude] = -np.inf 212 | 213 | # # find top k according to output 214 | # return list(np.argpartition(-scores, range(k))[:k]), self.n_items 215 | 216 | self.predict_function = no_cluster_predict_function 217 | 218 | print("Compilation done.") 219 | 220 | def _compile_test_function(self): 221 | ''' Compile self.test_function, the deterministic rnn that output the precision@10 222 | ''' 223 | print("Compiling test...") 224 | 225 | items_score1 = lasagne.nonlinearities.softmax(lasagne.layers.get_output(self.l_out, deterministic=True)) 226 | 227 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer, deterministic=True)[0, :].argmax() 228 | items_clusters = self._get_hard_clusters() 229 | used_items = items_clusters[:,cluster_selection] 230 | items_score2 = items_score1 * used_items 231 | 232 | if self.interactions_are_unique: 233 | items_score1 *= (1 - self.exclude) 234 | items_score2 *= (1 - self.exclude) 235 | 236 | theano_test_function = theano.function([self.l_in.input_var, self.target, self.samples, self.cluster_samples, self.exclude], [items_score1, 
items_score2, cluster_selection, used_items.sum()], allow_input_downcast=True, name="Test_function", on_unused_input='ignore') 237 | 238 | def precision_test_function(theano_inputs): 239 | k = 10 240 | scores1, scores2, c_select, n_used_items = theano_test_function(*theano_inputs) 241 | ids1 = np.argpartition(-scores1, range(k), axis=-1)[0, :k] 242 | ids2 = np.argpartition(-scores2, range(k), axis=-1)[0, :k] 243 | 244 | return ids1, ids2, c_select, n_used_items 245 | 246 | self.test_function = precision_test_function 247 | 248 | print("Compilation done.") 249 | 250 | def _prepare_input(self, sequences): 251 | ''' Sequences is a list of [user_id, input_sequence, targets] 252 | ''' 253 | 254 | batch_size = len(sequences) 255 | 256 | # Shape return variables 257 | X = sp.lil_matrix((batch_size, self.n_items), dtype=theano.config.floatX) 258 | Y = np.zeros((batch_size,), dtype='int32') # output target 259 | exclude = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 260 | 261 | 262 | for i, sequence in enumerate(sequences): 263 | user_id, in_seq, target = sequence 264 | for j in in_seq: 265 | X[i, j[0]] = 1./len(in_seq)**self.alpha 266 | Y[i] = target[0][0] # id of the first and only target 267 | exclude[i, [j[0] for j in in_seq]] = 1 268 | 269 | if self.sampling_bias > 0.: 270 | samples = np.array([self._popularity_sample() for i in range(self.n_samples)], dtype='int32') 271 | if self.n_cluster_samples > 0: 272 | cluster_samples = np.array([self._popularity_sample() for i in range(self.n_cluster_samples)], dtype='int32') 273 | else: 274 | cluster_samples = samples 275 | else: 276 | samples = np.random.choice(self.n_items, self.n_samples).astype('int32') 277 | if self.n_cluster_samples > 0: 278 | cluster_samples = np.random.choice(self.n_items, self.n_cluster_samples).astype('int32') 279 | else: 280 | cluster_samples = samples 281 | 282 | # scale 283 | if not hasattr(self, '_last_epoch'): 284 | self._last_epoch = self.dataset.training_set.epochs 285 | else: 286 | if self.dataset.training_set.epochs > self._last_epoch+1 and self.scale_growing_rate != 1.: 287 | self.effective_scale *= self.scale_growing_rate ** int(self.dataset.training_set.epochs - self._last_epoch) 288 | self._last_epoch += int(self.dataset.training_set.epochs - self._last_epoch) 289 | print("New scale: ", self.effective_scale) 290 | self.T_scale.set_value(self.effective_scale) 291 | 292 | return (X.tocsr(), Y, samples, cluster_samples, exclude) 293 | 294 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 295 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 296 | ''' 297 | 298 | if exclude is None: 299 | exclude = [] 300 | 301 | # Compile network if needed 302 | if not hasattr(self, 'predict_function'): 303 | self._compile_predict_function() 304 | 305 | # Prepare RNN input 306 | max_length_seq = sequence[-min(self.max_length, len(sequence)):] 307 | X = sp.lil_matrix((1, self.n_items), dtype=theano.config.floatX) 308 | for j in sequence: 309 | X[0, j[0]] = 1./len(sequence)**self.alpha 310 | 311 | # Run RNN 312 | if self.interactions_are_unique: 313 | should_exclude = [i[0] for i in sequence] 314 | else: 315 | should_exclude = [] 316 | should_exclude.extend(exclude) 317 | return self.predict_function(X.tocsr(), k, should_exclude) 318 | 319 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![No Maintenance 
Intended](http://unmaintained.tech/badge.svg)](http://unmaintained.tech/)
2 | 
3 | # Collaborative filtering based on sequences
4 | This Python library includes multiple collaborative filtering algorithms that make use of the sequence of actions of the user: they not only use the fact that a user rated a certain item, but also the order in which the items were rated.
5 | Some standard algorithms that do not use sequence information are also present for easier comparison.
6 | 
7 | All these algorithms aim to solve the "item recommendation" or "top-N recommendation" problem, which means that they are not designed to predict rating values, but only to predict which items are of interest for a given user.
8 | 
9 | Our code was used to produce the experiments in "[Collaborative Filtering with Recurrent Neural Networks](https://arxiv.org/abs/1608.07400)" and "[Long and Short-Term Recommendations with Recurrent
10 | Neural Networks](http://iridia.ulb.ac.be/~rdevooght/papers/UMAP__Long_and_short_term_with_RNN.pdf)".
11 | If you use this code in your research, please cite us:
12 | ````
13 | @inproceedings{Rec_with_RNN,
14 | author = {Devooght, Robin and Bersini, Hugues},
15 | title = {Long and Short-Term Recommendations with Recurrent Neural Networks},
16 | booktitle = {Proceedings of the 25th Conference on User Modeling, Adaptation and Personalization},
17 | series = {UMAP '17},
18 | year = {2017},
19 | isbn = {978-1-4503-4635-1},
20 | location = {Bratislava, Slovakia},
21 | pages = {13--21},
22 | numpages = {9},
23 | url = {http://doi.acm.org/10.1145/3079628.3079670},
24 | doi = {10.1145/3079628.3079670},
25 | acmid = {3079670},
26 | publisher = {ACM},
27 | }
28 | ````
29 | 
30 | ## Installation
31 | The library has many dependencies: numpy/scipy, Theano and Lasagne for the neural networks, Gensim for word2vec and pandas for the data manipulation.
32 | 
33 | Numpy, scipy and Theano can sometimes be difficult to install, and we recommend looking at Theano's installation tutorial: http://deeplearning.net/software/theano/install.html
34 | Gensim and pandas are easily installed with pip. Lasagne is also installed with pip, but you have to specify the version >=0.2.dev1.
35 | 
36 | On Ubuntu, the following commands should install everything that you need:
37 | ````
38 | sudo apt-get install python-numpy python-scipy python-dev python-pip python-nose g++ libopenblas-dev git
39 | sudo pip install Theano pandas gensim https://github.com/Lasagne/Lasagne/archive/master.zip
40 | ````
41 | 
42 | ## Usage
43 | The library is designed to be used from the command line through three scripts:
44 | * preprocess.py for the preparation of the dataset
45 | * train.py for training models
46 | * test.py for testing models
47 | 
48 | Calling these scripts with the `--help` option will display the available options (e.g. `python preprocess.py --help`).
49 | 
50 | ### preprocess.py
51 | 
52 | This script takes a file containing a dataset of user/item interactions, splits it into training/validation/test sets and saves them in the format used by train.py and test.py.
53 | The original dataset must be in a format where each line corresponds to a single user/item interaction.
54 | 
55 | The only required argument is `-f path/to/dataset`, which is used to specify the original dataset. The script will create subfolders named "data", "models" and "results" in the folder containing the original dataset.
"data" is used by preprocess.py to store all the files it produces, "models" is used by train.py to store the trained models and "results" is used by test.py to store the results of the tests. 56 | 57 | The optional arguments are the following: 58 | 59 | Option | Desciption 60 | ------ | ---------- 61 | `--columns` | Order of the columns in the file (eg: "uirt"), u for user, i for item, t for timestamp, r for rating. If r is not present a default rating of 1 is given to all interaction. If t is not present interactions are assumed to be in chronological order. Extra columns are ignored. Default: uit 62 | `--sep` | Separator between the column. If unspecified pandas will try to guess the separator 63 | `--min_user_activity` | Users with less interactions than this will be removed from the dataset. Default: 2 64 | `--min_item_pop` | Items with less interactions than this will be removed from the dataset. Default: 5 65 | `--val_size` | Number of users to put in the validation set. If in (0,1) it will be interpreted as the fraction of total number of users. Default: 0.1 66 | `--test_size` | Number of users to put in the test set. If in (0,1) it will be interpreted as the fraction of total number of users. Default: 0.1 67 | `--seed` | Seed for the random train/val/test split 68 | 69 | #### Example 1 70 | In the movielens 1M dataset each line has the following format: 71 | ```` 72 | UserID::MovieID::Rating::Timestamp 73 | ```` 74 | To process it you have to specify the order of the columns, in this case uirt (for user, item, rating, timestamp), and the separator ("::"). If you want to use a hundred users for the validation set and a hundred others for the test set, you'll have to use the following command: 75 | ```` 76 | python preprocess.py -f path/to/ratings.dat --columns uirt --sep :: --val_size 100 --test_size 100 77 | ```` 78 | #### Example 2 79 | Consider a dataset where each line has the following format: 80 | ```` 81 | timestamp, user_id, some_useless_data, item_id, more_useless_data 82 | ```` 83 | You can specify the order of columns with "tuxiy" where x and y are placeholder names for the columns that will be discarted by the script. Using "tuxi" will also work, as all the columns not mentioned are discarded. As no rating column is present, each interaction will recieve the rating "1". If you also want for example to remove users with less than 10 interactions, use the following command: 84 | ```` 85 | python preprocess.py -f path/to/file --columns tuxi --min_user_activity 10 86 | ```` 87 | 88 | ### train.py 89 | 90 | This script is used to train models and offers many options regarding when to save new models and when to stop training. 91 | The basic usage is the following: 92 | ```` 93 | python train.py -d path/to/dataset/ -m Method_name 94 | ```` 95 | 96 | The argument `-d` is used to specify the path to the folder that contains the "data", "models" and "results" subfolders created by preprocess.py. 97 | If you have multiple datasets with a partly common path (e.g. path/to/dataset1/, path/to/dataset2/, etc.) you can specify this common path in the variable DEFAULT_DIR of helpers/data_handling.py. For example, setting DEFAULT_DIR = "path/to/" and using the argument `-d dataset1` will look for the dataset in "path/to/dataset1/". 98 | 99 | The optional arguments are the following: 100 | 101 | Option | Desciption 102 | ------ | ---------- 103 | `--dir dirname/` | Name of the subfolder of "path/to/dataset/models/" in which to save the model. 
By default it will be saved directly in the models/ folder, but using subfolders can be useful when many models are tested.
104 | `--progress {int or float}` | Number of iterations (or seconds) between two evaluations of the model on the validation set. When the model is evaluated, progress is shown on the command line, and the model might be saved (depending on the `--save` option). A float value means that the evaluations happen at geometric intervals (rather than linear ones). Default: 2.0
105 | `--metrics value` | Metrics computed on the validation set, separated by commas. Available metrics are recall, sps, ndcg, item\_coverage, user\_coverage and blockbuster\_share. Default: sps.
106 | `--save [All, Best, None]` | Policy for saving models. If "None", no model is saved. If "All", the current model is saved each time the model is evaluated on the validation set, and no model is destroyed. If "Best", the current model is only saved if it improves over the previous best results on the validation set, and the previous best model is deleted. If "Best" and multiple metrics are used, all the Pareto-optimal models are saved.
107 | `--time_based_progress` | Base the interval between two evaluations on the number of elapsed seconds rather than on the number of iterations.
108 | `--mpi value` | Max number of iterations (or seconds) between two evaluations (useful when using geometric intervals). Default: inf.
109 | `--max_iter value` | Max number of iterations (default: inf).
110 | `--max_time value` | Max training time in seconds (default: inf).
111 | `--min_iter value` | Min number of iterations before making the first evaluation (default: 0).
112 | `--extended_set` | Use the extended training set (contains the first half of the validation and test sets). This is necessary for factorization-based methods such as BPRMF and FPMC because they need to build a model for every user.
113 | `--tshuffle` | Shuffle the order of sequences between epochs.
114 | `--load_last_model` | Load the last model before starting training (it will search for a model built with the same options and take the one with the largest number of epochs).
115 | `--es_m [WorstTimesX, StopAfterN, None]` | Early stopping method (by default none is used, and training continues until max_iter or max_time is reached). WorstTimesX will stop training if the time since the last best score on the validation set is longer than X times the longest interval between two consecutive best scores. StopAfterN will stop training if the model has not improved over the last N evaluations on the validation set.
116 | `--es_n N` | N parameter for StopAfterN (default: 5).
117 | `--es_x X` | X parameter for WorstTimesX (default: 2).
118 | `--es_min_wait num_epochs` | Minimum number of epochs before stopping (for WorstTimesX). Default: 1.
119 | `--es_LiB` | Lower is better for the validation score. By default a higher validation score is considered better, but if that is not the case you can use this option.
120 | 
121 | The options specific to each method are explained in the Methods section.
122 | 
123 | ### test.py
124 | 
125 | This script tests the models built with train.py on the test set.
126 | The basic usage is:
127 | ````
128 | python test.py -d path/to/dataset/ -m Method_name
129 | ````
130 | The argument `-d` works in the same way as with train.py, and the precise model to test is specified by the `--dir` option and the method-specific options.
131 | If multiple models fit the options (they are in the same subfolder and were trained with the same method and the same options), they are all evaluated one after the other, unless the argument `-i epoch_number` is also specified, which will select the model based on the number of epochs.
132 | 
133 | `--metrics` allows you to specify the list of metrics to compute, separated by commas. By default the metrics are: sps, recall, item\_coverage, user\_coverage, blockbuster_share.
134 | The "blockbuster share" is the percentage of correct recommendations among the 1% most popular items.
135 | The other available metrics are the sps, the ndcg and the assr (when clustering is used).
136 | 
137 | All the metrics are computed "@k", with k=10 by default. k can be changed using the `-k` option.
138 | 
139 | When the `--save` option is used, the results are saved in a file in "path/to/dataset/results/".
140 | The results of each model form a line of the file, and each line contains the number of epochs followed by the metrics specified by `--metrics`.
141 | 
142 | When testing a method based on clustering, the option `--ignore_clusters` can be used to test how the method performs without clusters.
143 | 
144 | ## Methods
145 | 
146 | The available methods are:
147 | * [Recurrent Neural Networks](#recurrent-neural-networks)
148 | * [Stacked Denoising Autoencoder](#stacked-denoising-autoencoders)
149 | * [Latent Trajectory Modeling/word2vec](#latent-trajectory-modeling)
150 | * [BPR-MF](#bpr-mf)
151 | * [FPMC](#fpmc)
152 | * [FISM](#fism)
153 | * [Fossil](#fossil)
154 | * [Markov Chains](#markov-chain)
155 | * [User KNN](#user-knn)
156 | * [Popularity baseline](#pop)
157 | 
158 | ### Neural Networks
159 | #### Recurrent Neural Networks
160 | 
161 | Use it with `-m RNN`.
162 | The RNN has many options for changing the type/size/number of layers, the training procedure and the objective function, and some options are specific to a particular objective function.
163 | 
164 | ##### Layers
165 | 
166 | Option | Description
167 | ------ | ----------
168 | `--r_t [LSTM, GRU, Vanilla]` | Type of recurrent layer (default is GRU)
169 | `--r_l size_of_layer1-size_of_layer2-etc.` | Size and number of layers. For example, `--r_l 100-50-50` creates a layer with 50 hidden neurons on top of another layer with 50 hidden neurons on top of a layer with 100 hidden neurons. Default: 32.
170 | `--r_bi` | Use bidirectional layers.
171 | `--r_emb size` | Adds an embedding layer before the recurrent layer. By default no embedding layer is used, but it is advised to use one (e.g. `--r_emb 100`).
172 | 
173 | ##### Update mechanism
174 | 
175 | Option | Description
176 | ------ | ----------
177 | `--u_m [adagrad, adadelta, rmsprop, nesterov, adam]` | Update mechanism (see [Lasagne doc](http://lasagne.readthedocs.io/en/latest/modules/updates.html)). Default is adam.
178 | `--u_l float` | Learning rate (default: 0.001). The default learning rate works well with adam. For adagrad `--u_l 0.1` is advised.
179 | `--u_rho float` | rho parameter for Adadelta and RMSProp, or momentum for Nesterov momentum (default: 0.9).
180 | `--u_b1 float` | Beta 1 parameter for Adam (default: 0.9).
181 | `--u_b2 float` | Beta 2 parameter for Adam (default: 0.999).
182 | 
183 | ##### Noise
184 | 
185 | Option | Description
186 | ------ | ----------
187 | `--n_dropout P` | Dropout probability (default: 0.)
188 | `--n_shuf P` | Probability that an item is swapped with another one (default: 0.).
189 | `--n_shuf_std STD` | If an item is swapped, the position of the other item is drawn from a normal distribution whose std is defined by this parameter (default: 5.).
190 | 
191 | ##### Other options
192 | 
193 | Option | Description
194 | ------ | ----------
195 | `-b int` | Size of the mini-batches (default: 16)
196 | `--max_length int` | Maximum length of sequences (default: 200)
197 | `-g val` | Gradient clipping (default: 100)
198 | `--repeated_interactions` | Use when a user can interact multiple times with the same item. If not set, the items that the user already saw are never recommended.
199 | 
200 | ##### Objective functions
201 | 
202 | Option | Description
203 | ------ | ----------
204 | `--loss [CCE, Blackout, TOP1, BPR, hinge, logit, logsig]` | Objective function. CCE is the categorical cross-entropy; BPR, TOP1 and Blackout are based on sampling; and hinge, logit and logsig allow multiple targets. Default is CCE.
205 | `-r float` | *Only for CCE*. Add a regularization term. A positive value will use L2 regularization and a negative value will use L1. Default: 0.
206 | `--db float` | *Only for CCE, Blackout, BPR and TOP1*. Increase the diversity bias to put more pressure on learning correct recommendations for infrequent items (default: 0.).
207 | `--sampling float or int` | *Only for Blackout, BPR and TOP1*. Number of items to sample in the error computation. Use a float in [0,1] to express it as a fraction of the number of items in the catalog, or an int > 0 to specify the number of samples directly. Default: 32.
208 | `--n_targets N` | *Only for hinge, logit and logsig*. Number of items in the sequence that are used as targets. Default: 1.
209 | 
210 | ##### Clustering
211 | 
212 | It is possible to combine RNNs with an item-clustering method. This leads to faster predictions on large datasets and creates meaningful item clusters.
213 | In order to use it, add the option `--clusters nb_of_clusters`.
214 | For example, `python train.py -d path/to/dataset/ -m RNN --loss BPR --clusters 10` will train an RNN with the BPR loss and 10 clusters of items.
215 | Note that the clustering is only compatible with sampling-based losses (BPR, Blackout and TOP1).
216 | It also works with `--loss CCE`, but a sampling version of CCE is then used instead of the normal categorical cross-entropy.
217 | 
218 | 
219 | #### Stacked Denoising Autoencoders
220 | 
221 | Use it with `-m SDAE`.
222 | SDAE shares the RNN options described in "[Update mechanism](#update-mechanism)" and "[Other options](#other-options)".
223 | 
224 | Option | Description
225 | ------ | ----------
226 | `--L size_of_layer1-size_of_layer2-etc.` | Size and number of layers. For example, `--L 50-32-50` creates a layer with 50 hidden neurons on top of another layer with 32 hidden neurons on top of a layer with 50 hidden neurons. Default: 20.
227 | `--in_do float` | Dropout rate applied to the input layer of the SDAE (default: 0.2).
228 | `--do float` | Dropout rate applied to the hidden layers of the SDAE (default: 0.5).
229 | 
230 | #### Latent Trajectory Modeling
231 | 
232 | Use it with `-m LTM`.
233 | LTM is a method based on word2vec, described in "[Latent Trajectory Modeling: A Light and Efficient Way to Introduce Time in Recommender Systems](http://dl.acm.org/citation.cfm?id=2799676)".
234 | LTM works in two steps: it first produces an embedding of the items with the word2vec algorithm using the sequences of items in the training set, then it estimates for each user a translation vector that would best explain the trajectory of that user in the embedded space.
235 | Predictions are made by finding the closest items to the user's last item, translated by the user's translation vector.
236 | Our implementation is mainly a wrapper around [Gensim's word2vec implementation](https://radimrehurek.com/gensim/models/word2vec.html).
237 | 
238 | Option | Description
239 | ------ | ----------
240 | `-H int` | Number of neurons (default: 20).
241 | `--ltm_window int` | Size of word2vec's window (default: 5).
242 | `--ltm_damping float` | Temporal damping (default: 0.8).
243 | `--ltm_no_trajectory` | Use this option to make predictions directly with word2vec, without the trajectory estimation proposed in the LTM paper.
244 | 
245 | ### Factorization-based
246 | #### FPMC
247 | 
248 | FPMC is a method combining factorized Markov chains with the factorization of the user-item matrix (see "Factorizing personalized Markov chains for next-basket recommendation" by Rendle et al. in *Proceedings of WWW'10*).
249 | Use it with `-m FPMC`.
250 | 
251 | Option | Description
252 | ------ | ----------
253 | `--k_cf int` | Rank of the user-item matrix factorization (default: 32).
254 | `--k_mc int` | Rank of the factorized Markov chain (default: 32).
255 | `-l val` | Learning rate (default: 0.01).
256 | `--cooling val` | Multiplicative factor applied to the learning rate after each epoch (default: 1).
257 | `--init_sigma val` | Standard deviation of the Gaussian initialization (default: 1).
258 | `--fpmc_bias val` | Sampling bias (default: 100). By default the SGD process uses adaptive sampling to speed up learning. This parameter controls how much the sampling is biased towards high-error items.
259 | `--no_adaptive_sampling` | No adaptive sampling.
260 | `-r float` | Add a regularization term. A positive value will use L2 regularization and a negative value will use L1. Default: 0.
261 | 
262 | #### BPR-MF
263 | 
264 | BPR-MF is a matrix factorization method based on the BPR loss (see "BPR: Bayesian personalized ranking from implicit feedback" by Rendle et al. in *Proceedings of the Twenty-Fifth Conference on Uncertainty in Artificial Intelligence*).
265 | Use it with `-m BPRMF`.
266 | 
267 | Option | Description
268 | ------ | ----------
269 | `-H int` | Rank of the user-item matrix factorization (default: 20).
270 | `-l val` | Learning rate (default: 0.01).
271 | `--cooling val` | Multiplicative factor applied to the learning rate after each epoch (default: 1).
272 | `--init_sigma val` | Standard deviation of the Gaussian initialization (default: 1).
273 | `--fpmc_bias val` | Sampling bias (default: 100). By default the SGD process uses adaptive sampling to speed up learning. This parameter controls how much the sampling is biased towards high-error items.
274 | `--no_adaptive_sampling` | No adaptive sampling.
275 | `-r float` | Add a regularization term. A positive value will use L2 regularization and a negative value will use L1. Default: 0.
276 | 
277 | #### FISM
278 | 
279 | FISM is a method based on item-item factorization (see "FISM: factored item similarity models for top-n recommender systems" by Kabbur et al. in *Proceedings of SIGKDD'13*).
280 | It has the advantage over BPR-MF that it does not build a representation for each user.
This leads to smaller models, and the ability to make recommendations to new users.
281 | Use it with `-m FISM --loss [BPR, RMSE]`.
282 | 
283 | Option | Description
284 | ------ | ----------
285 | `--loss [BPR, RMSE]` | Loss function. "BPR" is the same loss as for BPR-MF, "RMSE" optimizes the squared error. This cannot be left at its default value because the default loss is CCE, which is not compatible with FISM.
286 | `-H int` | Rank of the matrix factorization (default: 20).
287 | `--fism_alpha float` | Alpha parameter in FISM (default: 0.2).
288 | `-l val` | Learning rate (default: 0.01).
289 | `--cooling val` | Multiplicative factor applied to the learning rate after each epoch (default: 1).
290 | `--init_sigma val` | Standard deviation of the Gaussian initialization (default: 1).
291 | `-r float` | Add a regularization term. A positive value will use L2 regularization and a negative value will use L1. Default: 0.
292 | 
293 | FISM can be combined with item clustering in the same way as the RNN.
294 | To do so, add the option `--clusters nb_of_clusters`.
295 | When using clustering, a completely different implementation is used, which is based on Theano instead of Numpy.
296 | This has some implications on the available options:
297 | * The loss must be chosen among CCE, BPR, Blackout and TOP1 instead of BPR and RMSE.
298 | * The number of samples for each training step can be specified using `--sampling nb_of_samples`.
299 | * The update mechanism is controlled by the options defined in [Update mechanism](#update-mechanism) instead of `-l` and `--cooling`.
300 | 
301 | #### Fossil
302 | 
303 | Fossil combines FISM with factorized Markov chains (see "Fusing Similarity Models with Markov Chains for Sparse Sequential Recommendation" by He and McAuley in *Proceedings of ICDM'16*).
304 | Unlike FPMC, Fossil can use higher-order Markov chains.
305 | Use it with `-m Fossil`.
306 | 
307 | Option | Description
308 | ------ | ----------
309 | `-H int` | Rank of the matrix factorization (default: 20).
310 | `--fism_alpha float` | Alpha parameter in FISM (default: 0.2).
311 | `--fossil_order int` | Order of the Markov chains in Fossil (default: 1).
312 | `-l val` | Learning rate (default: 0.01).
313 | `--cooling val` | Multiplicative factor applied to the learning rate after each epoch (default: 1).
314 | `--init_sigma val` | Standard deviation of the Gaussian initialization (default: 1).
315 | `-r float` | Add a regularization term. A positive value will use L2 regularization and a negative value will use L1. Default: 0.
316 | 
317 | ### Lazy
318 | 
319 | Lazy methods do not build models; they make recommendations directly from the dataset.
320 | They should therefore not be used with `train.py`, but only with `test.py`.
321 | 
322 | #### POP
323 | 
324 | Use it with `-m POP`.
325 | Always predicts the most popular items.
326 | 
327 | #### Markov Chain
328 | 
329 | Use it with `-m MM`.
330 | Recommends the items that most often follow the last item in the user's sequence.
331 | 
332 | #### User KNN
333 | 
334 | Use it with `-m UKNN`.
335 | User-based nearest neighbors approach.
336 | The similarity measure between users is the cosine similarity: #number-of-common-items / sqrt(#number-of-items-of-user-a * #number-of-items-of-user-b). A short illustration in plain Python is given after the options table below.
337 | 
338 | Option | Description
339 | ------ | ----------
340 | `--ns int` | Neighborhood size (default: 80).
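
The sketch below simply restates that similarity in Python for clarity; it is an illustration, not the code used by `lazy/user_knn.py`, and the function name and example values are made up:

````
import numpy as np

def user_cosine_similarity(items_a, items_b):
    # Users are represented by the sets of items they interacted with:
    # number of common items, normalized by the geometric mean of the set sizes.
    a, b = set(items_a), set(items_b)
    return len(a & b) / np.sqrt(len(a) * len(b))

print(user_cosine_similarity([1, 2, 3, 4], [2, 3, 5]))  # 2 / sqrt(4 * 3) ~= 0.577
````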
341 | 342 | -------------------------------------------------------------------------------- /neural_networks/rnn_cluster.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | import lasagne 7 | import cPickle 8 | import os 9 | import sys 10 | import random 11 | from bisect import bisect 12 | from time import time 13 | import rnn_base as rnn 14 | from sparse_lstm import * 15 | from helpers import evaluation 16 | from theano.sandbox.rng_mrg import MRG_RandomStreams 17 | 18 | 19 | class RNNCluster(rnn.RNNBase): 20 | """RNNCluster combines sampling-based RNN with item clustering. 21 | 22 | Parameters 23 | ---------- 24 | n_clusters: int 25 | Number of clusters 26 | 27 | loss: "Blackout", "CCE", "BPR" or "BPRelu" 28 | Determines the loss function, among: 29 | - BPR, as used in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016 30 | - TOP1, defined in "Session-based Recommendations with Recurrent Neural Networks", Hidasi, B. et al., 2016 31 | - Blackout, discriminative loss function defined in "BlackOut: Speeding up Recurrent Neural Network Language Models With Very Large Vocabularies", Ji, S. et al., 2015 (equation 6) 32 | - BPRelu, approximation of BPR based on relu/hinge non-linearities 33 | - CCE, categorical cross-entropy computed on the set of samples 34 | 35 | cluster_type: "mix", "softmax" or "sigmoid" 36 | Determines whether items can belong to multiple clusters. 37 | - mix, items belong to at least one cluster, possibly many. 38 | - softmax, items belong to one and only one cluster. 39 | - sigmoid, items belong to zero, one or multiple clusters. 40 | 41 | sampling: int 42 | Number of samples. 43 | 44 | cluster_sampling: int 45 | If cluster_sampling > 0, the recommendation loss and the clustering loss use different samples. 46 | In that case, cluster_sampling is the number of samples used by the clustering loss. 47 | 48 | sampling_bias: float 49 | Items are sampled with a probability proportional to their frequency to the power of the sampling_bias. 50 | 51 | predict_with_clusters: bool 52 | Set to false during testing if you want to ignore the clustering. 53 | 54 | cluster_selection_noise: float 55 | If cluster_selection_noise > 0, a random gaussian noise (whose std is cluster_selection_noise) is added to the cluster selection output during training. 56 | Can help to explore a large number of clusters. 57 | 58 | init_scale: float 59 | Initial scale of the softmax and sigmoid functions used in the cluster selection process. 60 | 61 | scale_growing_rate: float 62 | After each training epoch, the scale of the softmax and sigmoid functions is multiplied by the scale_growing_rate. 63 | 64 | max_scale: float 65 | Maximum allowed scale. 66 | 67 | See classes SequenceNoise, RecurrentLayers, SelectTargets and update manager for options common to the other RNN methods. 
68 | """ 69 | 70 | def __init__(self, n_clusters=10, loss="Blackout", cluster_type='mix', sampling=100, cluster_sampling=-1, sampling_bias=0., predict_with_clusters=True, cluster_selection_noise=0., init_scale=1., scale_growing_rate=1., max_scale=50, **kwargs): 71 | super(RNNCluster, self).__init__(**kwargs) 72 | 73 | self.n_clusters = n_clusters 74 | self.init_scale = np.cast[theano.config.floatX](init_scale) 75 | self.effective_scale = np.cast[theano.config.floatX](init_scale) 76 | self.scale_growing_rate = np.cast[theano.config.floatX](scale_growing_rate) 77 | self.max_scale = np.cast[theano.config.floatX](max_scale) 78 | self.cluster_type = cluster_type 79 | self.sampling_bias = sampling_bias 80 | self.loss = loss 81 | self.cluster_selection_noise = cluster_selection_noise 82 | 83 | self.predict_with_clusters = predict_with_clusters 84 | 85 | if self.loss == "Blackout": 86 | self._loss = self._blackout_loss 87 | elif self.loss == 'lin': 88 | self._loss = self._lin_loss 89 | elif self.loss == 'BPRelu': 90 | self._loss = self._BPRelu_loss 91 | elif self.loss == 'BPR': 92 | self._loss = self._BPR_loss 93 | elif self.loss == 'TOP1': 94 | self._loss = self._TOP1_loss 95 | elif self.loss == 'CCE': 96 | self._loss = self._cce_loss 97 | else: 98 | raise ValueError('Unknown cluster loss') 99 | 100 | 101 | self.n_samples = int(sampling) 102 | self.n_cluster_samples = int(cluster_sampling) 103 | 104 | self._srng = MRG_RandomStreams(lasagne.random.get_rng().randint(1, 2147462579)) 105 | 106 | 107 | self.name = "RNN Cluster with categorical cross entropy" 108 | 109 | self.metrics = {'recall': {'direction': 1}, 110 | 'cluster_recall': {'direction': 1}, 111 | 'sps': {'direction': 1}, 112 | 'cluster_sps': {'direction': 1}, 113 | 'ignored_items': {'direction': -1}, 114 | 'assr': {'direction': 1}, 115 | 'cluster_use': {'direction': 1}, 116 | 'cluster_use_std': {'direction': -1}, 117 | 'cluster_size': {'direction': 1} 118 | } 119 | 120 | def _get_model_filename(self, epochs): 121 | '''Return the name of the file to save the current model 122 | ''' 123 | filename = "rnn_clusters"+str(self.n_clusters)+"_sc"+str(self.init_scale) 124 | 125 | if self.scale_growing_rate != 1.: 126 | filename += "-"+str(self.scale_growing_rate)+"-"+str(self.max_scale) 127 | 128 | filename+="_" 129 | if self.sampling_bias > 0.: 130 | filename += "p" + str(self.sampling_bias) 131 | filename += "s"+str(self.n_samples) 132 | 133 | if self.n_cluster_samples > 0: 134 | filename += "_" 135 | if self.sampling_bias > 0.: 136 | filename += "p" + str(self.sampling_bias) 137 | filename += "cs"+str(self.n_cluster_samples) 138 | 139 | if self.cluster_type == 'softmax': 140 | filename += "_softmax" 141 | elif self.cluster_type == 'mix': 142 | filename += "_mix" 143 | 144 | if self.cluster_selection_noise > 0.: 145 | filename += '_n' + str(self.cluster_selection_noise) 146 | 147 | filename += "_c" + self.loss 148 | 149 | return filename+"_"+self._common_filename(epochs) 150 | 151 | def _blackout_loss(self, predictions, n_targets): 152 | targets = np.arange(n_targets) 153 | predictions = T.nnet.softmax(predictions) 154 | pos = T.nnet.categorical_crossentropy(predictions, targets) 155 | neg = T.log(1 - predictions) 156 | return pos - neg[:, targets.shape[0]:].sum(axis=-1) 157 | 158 | def _cce_loss(self, predictions, n_targets): 159 | targets = np.arange(n_targets) 160 | predictions = T.nnet.softmax(predictions) 161 | pos = T.nnet.categorical_crossentropy(predictions, targets) 162 | return pos 163 | 164 | def _lin_loss(self, predictions, 
n_targets): 165 | neg = predictions[:, n_targets:].sum(axis=-1) 166 | pos = T.diag(predictions) 167 | return neg - pos 168 | 169 | def _BPR_loss(self, predictions, n_targets): 170 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, n_targets:] 171 | return -(T.log(T.nnet.sigmoid(-diff))).mean(axis=-1) 172 | 173 | def _BPRelu_loss(self, predictions, n_targets): 174 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, n_targets:] 175 | return lasagne.nonlinearities.leaky_rectify(diff+0.5).mean(axis=-1) 176 | 177 | def _TOP1_loss(self, predictions, n_targets): 178 | diff = (predictions - T.diag(predictions).dimshuffle([0,'x']))[:, n_targets:] 179 | reg = T.sqr(predictions[:, n_targets:]) 180 | return (T.nnet.sigmoid(diff) + T.nnet.sigmoid(reg)).mean(axis=-1) 181 | 182 | def _create_ini_clusters(self): 183 | c = 0.1 * np.random.randn(self.n_items, self.n_clusters) 184 | # c = -2 * np.random.random((self.n_items, self.n_clusters)) - 1 185 | # for i, j in enumerate(np.random.choice(self.n_clusters, self.n_items)): 186 | # c[i,j] *= -1 187 | 188 | # print(np.round(c[:5, :], 2)) 189 | return c.astype(theano.config.floatX) 190 | 191 | def _prepare_networks(self, n_items): 192 | ''' Prepares the building blocks of the RNN, but does not compile them: 193 | self.l_in : input layer 194 | self.l_mask : mask of the input layer 195 | self.target : target of the network 196 | self.l_out : output of the network 197 | self.cost : cost function 198 | ''' 199 | 200 | self.n_items = n_items 201 | # The input is composed of to parts : the on-hot encoding of the movie, and the features of the movie 202 | self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length, self._input_size())) 203 | # The input is completed by a mask to inform the LSTM of the length of the sequence 204 | self.l_mask = lasagne.layers.InputLayer(shape=(self.batch_size, self.max_length)) 205 | 206 | # recurrent layer 207 | if not self.use_movies_features: 208 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=self.n_items + self._n_optional_features(), only_return_final=True) 209 | else: 210 | l_recurrent = self.recurrent_layer(self.l_in, self.l_mask, true_input_size=None, only_return_final=True) 211 | 212 | 213 | # Theano tensor for the targets 214 | self.target = T.ivector('target_output') 215 | self.exclude = T.fmatrix('excluded_items') 216 | self.samples = T.ivector('samples') 217 | self.cluster_samples = T.ivector('cluster_samples') 218 | 219 | self.user_representation_layer = l_recurrent 220 | 221 | # The sliced output is then passed through linear layer to obtain the right output size 222 | self.l_out = BlackoutLayer(l_recurrent, num_units=self.n_items, num_outputs=self.n_samples, nonlinearity=None, W=lasagne.init.GlorotUniform()) 223 | 224 | # lasagne.layers.get_output produces a variable for the output of the net 225 | network_output = lasagne.layers.get_output(self.l_out, targets = self.target, samples=self.samples) 226 | 227 | # loss function 228 | self.cost = self._loss(network_output,self.batch_size).mean() 229 | 230 | 231 | # Cluster learning 232 | self.T_scale = theano.shared(self.effective_scale) 233 | scaled_softmax = lambda x: lasagne.nonlinearities.softmax(x*self.T_scale) 234 | 235 | self.cluster_selection_layer = lasagne.layers.DenseLayer(l_recurrent, b=None, num_units=self.n_clusters, nonlinearity=None) 236 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer) 237 | if self.cluster_selection_noise > 0.: 238 | 
cluster_selection = cluster_selection + self._srng.normal(cluster_selection.shape, avg=0.0, std=self.cluster_selection_noise) 239 | cluster_selection = scaled_softmax(cluster_selection) 240 | 241 | self.cluster_repartition = theano.shared(self._create_ini_clusters()) 242 | if self.cluster_type == 'softmax': 243 | target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 244 | elif self.cluster_type == 'mix': 245 | target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) + \ 246 | T.nnet.sigmoid(self.T_scale*self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 247 | else: 248 | target_and_samples_clusters = T.nnet.sigmoid(self.T_scale*self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) 249 | cluster_score = cluster_selection.dot(target_and_samples_clusters.T) 250 | 251 | self.cost_clusters = self._loss(cluster_score, self.batch_size).mean() 252 | 253 | 254 | 255 | 256 | 257 | 258 | def _compile_train_function(self): 259 | ''' Compile self.train. 260 | self.train recieves a sequence and a target for every steps of the sequence, 261 | compute error on every steps, update parameter and return global cost (i.e. the error). 262 | ''' 263 | print("Compiling train...") 264 | # Compute AdaGrad updates for training 265 | all_params = lasagne.layers.get_all_params(self.l_out, trainable=True) 266 | updates = self.updater(self.cost, all_params) 267 | 268 | params_clusters = self.cluster_selection_layer.get_params(trainable=True) 269 | params_clusters.append(self.cluster_repartition) 270 | updates.update(self.updater(self.cost_clusters, params_clusters)) 271 | # Compile network 272 | self.train_function = theano.function([self.l_in.input_var, self.l_mask.input_var, self.target, self.samples, self.cluster_samples, self.exclude], self.cost, updates=updates, allow_input_downcast=True, name="Train_function", on_unused_input='ignore') 273 | print("Compilation done.") 274 | 275 | def _get_hard_clusters(self): 276 | if self.cluster_type == 'softmax': 277 | return lasagne.nonlinearities.softmax(100. * self.cluster_repartition) 278 | elif self.cluster_type == 'mix': 279 | # Clipping is used to avoid the sum of sigmoid and softmax to produce a cluster indicator of 2 280 | return (lasagne.nonlinearities.softmax(100. * self.cluster_repartition) + T.nnet.sigmoid(100. * self.cluster_repartition)).clip(0,1) 281 | else: 282 | return T.nnet.sigmoid(100. 
* self.cluster_repartition) 283 | 284 | def _compile_predict_function(self): 285 | ''' Compile self.predict, the deterministic rnn that output the prediction at the end of the sequence 286 | ''' 287 | print("Compiling predict...") 288 | if self.predict_with_clusters: 289 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer, deterministic=True)[0, :].argmax() 290 | user_representation = lasagne.layers.get_output(self.user_representation_layer, deterministic=True) 291 | theano_predict_function = theano.function([self.l_in.input_var, self.l_mask.input_var], [user_representation, cluster_selection], allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') 292 | 293 | def cluster_predict_function(sequence, mask, k, exclude): 294 | u, c = theano_predict_function(sequence, mask) 295 | scores = u[0].dot(self.clusters_embeddings[c]) + self.clusters_bias[c] 296 | 297 | cluster_index_exclude = [] 298 | for i in exclude: 299 | if i in self.clusters_reverse_index[c]: 300 | cluster_index_exclude.append(self.clusters_reverse_index[c][i]) 301 | scores[cluster_index_exclude] = -np.inf 302 | 303 | # find top k according to output 304 | effective_k = min(k, len(self.clusters[c])) 305 | return list(self.clusters[c][np.argpartition(-scores, range(effective_k))[:effective_k]]), len(self.clusters[c]) 306 | 307 | self.predict_function = cluster_predict_function 308 | else: 309 | items_score = lasagne.nonlinearities.softmax(lasagne.layers.get_output(self.l_out, deterministic=True)) 310 | 311 | user_representation = lasagne.layers.get_output(self.user_representation_layer, deterministic=True) 312 | theano_predict_function = theano.function([self.l_in.input_var, self.l_mask.input_var], user_representation, allow_input_downcast=True, name="Predict_function", on_unused_input='ignore') 313 | 314 | def no_cluster_predict_function(sequence, mask, k, exclude): 315 | u = theano_predict_function(sequence, mask) 316 | scores = u[0].dot(self.l_out.W.get_value(borrow=True)) + self.l_out.b.get_value(borrow=True) 317 | 318 | scores[exclude] = -np.inf 319 | 320 | # find top k according to output 321 | return list(np.argpartition(-scores, range(k))[:k]), self.n_items 322 | 323 | self.predict_function = no_cluster_predict_function 324 | 325 | print("Compilation done.") 326 | 327 | def _compile_test_function(self): 328 | ''' Compile self.test_function, the deterministic rnn that output the precision@10 329 | ''' 330 | print("Compiling test...") 331 | 332 | items_score1 = lasagne.nonlinearities.softmax(lasagne.layers.get_output(self.l_out, deterministic=True)) 333 | 334 | cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer, deterministic=True)[0, :].argmax() 335 | items_clusters = self._get_hard_clusters() 336 | used_items = items_clusters[:,cluster_selection] 337 | items_score2 = items_score1 * used_items 338 | 339 | if self.interactions_are_unique: 340 | items_score1 *= (1 - self.exclude) 341 | items_score2 *= (1 - self.exclude) 342 | 343 | theano_test_function = theano.function([self.l_in.input_var, self.l_mask.input_var, self.target, self.samples, self.cluster_samples, self.exclude], [items_score1, items_score2, cluster_selection, used_items.sum()], allow_input_downcast=True, name="Test_function", on_unused_input='ignore') 344 | 345 | def precision_test_function(theano_inputs): 346 | k = 10 347 | scores1, scores2, c_select, n_used_items = theano_test_function(*theano_inputs) 348 | ids1 = np.argpartition(-scores1, range(k), axis=-1)[0, :k] 349 | ids2 = 
np.argpartition(-scores2, range(k), axis=-1)[0, :k] 350 | 351 | return ids1, ids2, c_select, n_used_items 352 | 353 | self.test_function = precision_test_function 354 | 355 | print("Compilation done.") 356 | 357 | def _popularity_sample(self): 358 | if not hasattr(self, '_cumsum'): 359 | self._cumsum = np.cumsum(np.power(self.dataset.item_popularity, self.sampling_bias)) 360 | 361 | return bisect(self._cumsum, random.uniform(0, self._cumsum[-1])) 362 | 363 | def _prepare_input(self, sequences): 364 | ''' Sequences is a list of [user_id, input_sequence, targets] 365 | ''' 366 | 367 | batch_size = len(sequences) 368 | 369 | # Shape return variables 370 | X = np.zeros((batch_size, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN 371 | mask = np.zeros((batch_size, self.max_length)) # mask of the input (to deal with sequences of different length) 372 | Y = np.zeros((batch_size,), dtype='int32') # output target 373 | exclude = np.zeros((batch_size, self.n_items), dtype=theano.config.floatX) 374 | 375 | 376 | for i, sequence in enumerate(sequences): 377 | user_id, in_seq, target = sequence 378 | seq_features = np.array(map(lambda x: self._get_features(x, user_id), in_seq)) 379 | X[i, :len(in_seq), :] = seq_features # Copy sequences into X 380 | mask[i, :len(in_seq)] = 1 381 | Y[i] = target[0][0] # id of the first and only target 382 | exclude[i, [j[0] for j in in_seq]] = 1 383 | 384 | if self.sampling_bias > 0.: 385 | samples = np.array([self._popularity_sample() for i in range(self.n_samples)], dtype='int32') 386 | if self.n_cluster_samples > 0: 387 | cluster_samples = np.array([self._popularity_sample() for i in range(self.n_cluster_samples)], dtype='int32') 388 | else: 389 | cluster_samples = samples 390 | else: 391 | samples = np.random.choice(self.n_items, self.n_samples).astype('int32') 392 | if self.n_cluster_samples > 0: 393 | cluster_samples = np.random.choice(self.n_items, self.n_cluster_samples).astype('int32') 394 | else: 395 | cluster_samples = samples 396 | 397 | # scale 398 | if not hasattr(self, '_last_epoch'): 399 | self._last_epoch = self.dataset.training_set.epochs 400 | else: 401 | if self.dataset.training_set.epochs > self._last_epoch+1 and self.scale_growing_rate != 1.: 402 | self.effective_scale *= self.scale_growing_rate ** int(self.dataset.training_set.epochs - self._last_epoch) 403 | self._last_epoch += int(self.dataset.training_set.epochs - self._last_epoch) 404 | print("New scale: ", self.effective_scale) 405 | self.T_scale.set_value(self.effective_scale) 406 | 407 | return (X, mask.astype(theano.config.floatX), Y, samples, cluster_samples, exclude) 408 | 409 | def _compute_validation_metrics(self, metrics): 410 | clusters = np.zeros(self.n_clusters, dtype="int") 411 | used_items = [] 412 | ev = evaluation.Evaluator(self.dataset, k=10) 413 | ev_clusters = evaluation.Evaluator(self.dataset, k=10) 414 | for batch, goal in self._gen_mini_batch(self.dataset.validation_set(epochs=1), test=True): 415 | pred1, pred2, cl, i = self.test_function(batch) 416 | ev.add_instance(goal, pred1) 417 | ev_clusters.add_instance(goal, pred2) 418 | clusters[cl] += 1 419 | used_items.append(i) 420 | 421 | if self.cluster_type == 'softmax': 422 | ignored_items = 0 423 | cluster_size = np.histogram(self.cluster_repartition.get_value(borrow=True).argmax(axis=1), bins=range(self.n_clusters+1))[0].tolist() 424 | elif self.cluster_type == 'mix': 425 | ignored_items = 0 426 | sig_clusters = self.cluster_repartition.get_value(borrow=True) > 0. 
427 | softmax_clusters = self.cluster_repartition.get_value(borrow=True).argmax(axis=1) 428 | for i in range(self.n_items): 429 | sig_clusters[i, softmax_clusters[i]] = True 430 | cluster_size = sig_clusters.sum(axis=0) 431 | else: 432 | ignored_items = (self.cluster_repartition.get_value(borrow=True).max(axis=1) < 0.).sum() 433 | cluster_size = (self.cluster_repartition.get_value(borrow=True) > 0.).sum(axis=0) 434 | 435 | metrics['recall'].append(ev.average_recall()) 436 | metrics['cluster_recall'].append(ev_clusters.average_recall()) 437 | metrics['sps'].append(ev.sps()) 438 | metrics['cluster_sps'].append(ev_clusters.sps()) 439 | metrics['assr'].append(self.n_items / np.mean(used_items)) 440 | metrics['ignored_items'].append(ignored_items) 441 | metrics['cluster_use'].append(clusters) 442 | metrics['cluster_use_std'].append(np.std(clusters)) 443 | metrics['cluster_size'].append(cluster_size) 444 | 445 | return metrics 446 | 447 | def _print_progress(self, iterations, epochs, start_time, train_costs, metrics, validation_metrics): 448 | '''Print learning progress in terminal 449 | ''' 450 | print(self.name, iterations, "batchs, ", epochs, " epochs in", time() - start_time, "s") 451 | print("Last train cost : ", train_costs[-1]) 452 | for m in self.metrics.keys(): 453 | print(m, ': ', metrics[m][-1]) 454 | if m in validation_metrics: 455 | print('Best ', m, ': ', max(np.array(metrics[m])*self.metrics[m]['direction'])*self.metrics[m]['direction']) 456 | print('-----------------') 457 | 458 | # Print on stderr for easier recording of progress 459 | print(iterations, epochs, time() - start_time, train_costs[-1], metrics['sps'][-1], metrics['cluster_sps'][-1], metrics['recall'][-1], metrics['cluster_recall'][-1], metrics['assr'][-1], metrics['ignored_items'][-1], metrics['cluster_use_std'][-1], file=sys.stderr) 460 | 461 | def prepare_tests(self): 462 | '''Take the soft clustering and make actual clusters. 
463 | ''' 464 | cluster_membership = self.cluster_repartition.get_value(borrow=True) 465 | item_embeddings = self.l_out.W.get_value(borrow=True) 466 | item_bias = self.l_out.b.get_value(borrow=True) 467 | self.clusters = [[] for i in range(self.n_clusters)] 468 | for i in range(cluster_membership.shape[0]): 469 | no_cluster = True 470 | best_cluster = 0 471 | best_val = cluster_membership[i, 0] 472 | for j in range(self.n_clusters): 473 | if cluster_membership[i,j] > 0: 474 | self.clusters[j].append(i) 475 | no_cluster = False 476 | elif cluster_membership[i,j] > best_val: 477 | best_val = cluster_membership[i,j] 478 | best_cluster = j 479 | if no_cluster: 480 | self.clusters[best_cluster].append(i) 481 | 482 | self.clusters = [np.array(c) for c in self.clusters] 483 | self.clusters_reverse_index = [] 484 | for c in self.clusters: 485 | self.clusters_reverse_index.append({c[j]: j for j in range(len(c))}) 486 | self.clusters_embeddings = [item_embeddings[:, c] for c in self.clusters] 487 | self.clusters_bias = [item_bias[c] for c in self.clusters] 488 | 489 | def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None): 490 | ''' Recieves a sequence of (id, rating), and produces k recommendations (as a list of ids) 491 | ''' 492 | 493 | if exclude is None: 494 | exclude = [] 495 | 496 | # Compile network if needed 497 | if not hasattr(self, 'predict_function'): 498 | self._compile_predict_function() 499 | 500 | # Prepare RNN input 501 | max_length_seq = sequence[-min(self.max_length, len(sequence)):] 502 | X = np.zeros((1, self.max_length, self._input_size()), dtype=self._input_type) # input of the RNN 503 | X[0, :len(max_length_seq), :] = np.array(map(lambda x: self._get_features(x, user_id), max_length_seq)) 504 | mask = np.zeros((1, self.max_length)) # mask of the input (to deal with sequences of different length) 505 | mask[0, :len(max_length_seq)] = 1 506 | 507 | # Run RNN 508 | if self.interactions_are_unique: 509 | should_exclude = [i[0] for i in sequence] 510 | else: 511 | should_exclude = [] 512 | should_exclude.extend(exclude) 513 | return self.predict_function(X, mask.astype(theano.config.floatX), k, should_exclude) 514 | 515 | def save(self, filename): 516 | '''Save the parameters of a network into a file 517 | ''' 518 | print('Save model in ' + filename) 519 | if not os.path.exists(os.path.dirname(filename)): 520 | os.makedirs(os.path.dirname(filename)) 521 | param = lasagne.layers.get_all_param_values(self.l_out) 522 | param.append(self.cluster_repartition.get_value(borrow=True)) 523 | param.append([p.get_value(borrow=True) for p in self.cluster_selection_layer.get_params()]) 524 | f = file(filename, 'wb') 525 | cPickle.dump(param,f,protocol=cPickle.HIGHEST_PROTOCOL) 526 | f.close() 527 | 528 | def load(self, filename): 529 | '''Load parameters values form a file 530 | ''' 531 | f = file(filename, 'rb') 532 | param = cPickle.load(f) 533 | f.close() 534 | lasagne.layers.set_all_param_values(self.l_out, [i.astype(theano.config.floatX) for i in param[:-2]]) 535 | self.cluster_repartition.set_value(param[-2]) 536 | for p, v in zip(self.cluster_selection_layer.get_params(), param[-1]): 537 | p.set_value(v) 538 | 539 | self.prepare_tests() 540 | --------------------------------------------------------------------------------
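
As a closing illustration of the clustering mechanism documented in the `RNNCluster` docstring above, the standalone NumPy sketch below shows how the scaled softmax/sigmoid nonlinearities turn the learned item/cluster matrix into (near-)hard memberships as the scale grows, mirroring the `cluster_type` options and the growing `T_scale`. It is a simplified illustration only, not part of the library, and the helper name `soft_membership` is made up:

````
import numpy as np

def soft_membership(repartition, scale, cluster_type="mix"):
    # repartition: (n_items, n_clusters) matrix of raw membership scores,
    # analogous to RNNCluster.cluster_repartition.
    scaled = scale * repartition
    softmax = np.exp(scaled - scaled.max(axis=1, keepdims=True))
    softmax /= softmax.sum(axis=1, keepdims=True)
    sigmoid = 1.0 / (1.0 + np.exp(-scaled))
    if cluster_type == "softmax":   # exactly one cluster per item
        return softmax
    if cluster_type == "sigmoid":   # zero, one or several clusters per item
        return sigmoid
    # "mix": softmax guarantees at least one cluster, sigmoid allows several;
    # clipping avoids membership indicators larger than 1 (cf. _get_hard_clusters).
    return np.clip(softmax + sigmoid, 0.0, 1.0)

rng = np.random.RandomState(0)
repartition = 0.1 * rng.randn(4, 3)      # 4 items, 3 clusters (cf. _create_ini_clusters)
for scale in (1.0, 10.0, 100.0):         # the scale grows by scale_growing_rate each epoch
    print(np.round(soft_membership(repartition, scale), 2))
````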