├── .gitignore
├── README.md
├── autorec.png
├── download_data.sh
├── main.py
├── model.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
ml-1m
*.zip
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AutoRec: Autoencoders Meet Collaborative Filtering
This is a TensorFlow implementation of [AutoRec](http://users.cecs.anu.edu.au/~u5098633/papers/www15.pdf), an autoencoder approach to collaborative filtering: each training example is a user's partially observed rating vector, and the network learns to reconstruct the observed entries. It trains on the [MovieLens 1M dataset](https://grouplens.org/datasets/movielens/).

"AutoRec: Autoencoders Meet Collaborative Filtering". Suvash Sedhain, Aditya Krishna Menon, Scott Sanner, and Lexing Xie. WWW 2015.

# Requirements
* numpy
* pandas
* scikit-learn
* tensorflow
* tensorlayer

# Usage
```
./download_data.sh
python main.py
```
--------------------------------------------------------------------------------
/autorec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npow/AutoRec/d336b1b80b425f03ada489428d0225f92c1d13dc/autorec.png
--------------------------------------------------------------------------------
/download_data.sh:
--------------------------------------------------------------------------------
#!/bin/bash

wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
unzip ml-1m.zip
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from model import AutoRec
from utils import load_ratings

flags = tf.app.flags
flags.DEFINE_string('input_dir', 'ml-1m', 'input directory containing MovieLens files')
flags.DEFINE_integer('batch_size', 256, 'batch size')
flags.DEFINE_integer('hidden_size', 500, 'hidden size')
flags.DEFINE_integer('n_epochs', 50, 'number of epochs')
flags.DEFINE_float('lr', 0.005, 'learning rate')
flags.DEFINE_float('penalty', 1, 'regularization penalty')
flags.DEFINE_float('keep', 0.9, 'dropout keep probability')
flags.DEFINE_integer('random_state', 1234, 'random state seed')
FLAGS = flags.FLAGS

def main(_):
    data = load_ratings('%s/ratings.dat' % FLAGS.input_dir, random_state=FLAGS.random_state)
    # Pass flag values explicitly instead of unpacking the private
    # FLAGS.__flags dict, which is not stable across TensorFlow versions.
    model = AutoRec(data=data,
                    batch_size=FLAGS.batch_size,
                    hidden_size=FLAGS.hidden_size,
                    lr=FLAGS.lr,
                    penalty=FLAGS.penalty,
                    keep=FLAGS.keep)
    model.train(n_epochs=FLAGS.n_epochs)

if __name__ == '__main__':
    tf.app.run()
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
import tensorlayer as tl

class AutoRec:
    def __init__(self,
                 data,
                 batch_size=256,
                 hidden_size=500,
                 lr=0.001,
                 penalty=0.001,
                 keep=0.9,
                 **kwargs
                 ):
        tf.reset_default_graph()
        tl.layers.clear_layers_name()
        num_movies = data['ratings'].shape[1]

        self.sess = tf.Session()
        self.data = data
        self.batch_size = batch_size

        # r holds the raw ratings; input_mask selects the entries fed to the
        # encoder, and output_mask selects the entries scored by the loss.
        self.r = tf.placeholder(dtype=tf.float32, shape=[None, num_movies], name='r')
        self.input_mask = tf.placeholder(dtype=tf.float32, shape=[None, num_movies], name='input_mask')
        self.output_mask = tf.placeholder(dtype=tf.float32, shape=[None, num_movies], name='output_mask')

        l_in = tl.layers.InputLayer(self.r * self.input_mask, name='input')
        l_in = tl.layers.DropoutLayer(l_in, keep=keep, name='dropout')
        l_encoder = tl.layers.DenseLayer(l_in,
                                         n_units=hidden_size,
                                         name='encoder',
                                         act=tf.nn.sigmoid,
                                         W_init=tf.truncated_normal_initializer(mean=0, stddev=0.05))
        l_decoder = tl.layers.DenseLayer(l_encoder,
                                         n_units=num_movies,
                                         name='decoder',
                                         act=tl.activation.identity,
                                         W_init=tf.truncated_normal_initializer(mean=0, stddev=0.05))
        self.network = l_decoder
        self.r_pred = l_decoder.outputs
        W_encoder = tl.layers.get_variables_with_name('encoder/W:0')[0]
        W_decoder = tl.layers.get_variables_with_name('decoder/W:0')[0]

        # Squared error over observed entries only, plus an L2 penalty on the
        # weight matrices (the lambda/2 * ||W||_F^2 term from the paper).
        cost_reconstruction = tf.reduce_sum(tf.multiply(self.r - self.r_pred, self.output_mask) ** 2)
        cost_penalty = tf.reduce_sum(W_encoder ** 2) + tf.reduce_sum(W_decoder ** 2)

        self.cost = cost_reconstruction + penalty * 0.5 * cost_penalty
        self.train_op = tf.train.AdamOptimizer(lr).minimize(self.cost, var_list=self.network.all_params)

        tl.layers.initialize_global_variables(self.sess)

    def get_batch(self, dataset, index, n_batches):
        # The last batch takes whatever rows remain.
        if index == n_batches - 1:
            r = dataset[self.batch_size * index:]
        else:
            r = dataset[self.batch_size * index:self.batch_size * (index + 1)]
        input_mask = (r != 0).astype(np.float32)
        return r, input_mask

    def train(self, n_epochs=100, shuffle_batch=True):
        # Ceiling division covers the final partial batch without ever
        # scheduling an empty one.
        n_train_batches = (len(self.data['ratings']) + self.batch_size - 1) // self.batch_size
        for epoch in range(n_epochs):
            total_cost = 0
            # Materialize as a list: np.random.shuffle cannot shuffle a range.
            minibatch_indices = list(range(n_train_batches))
            if shuffle_batch:
                np.random.shuffle(minibatch_indices)
            for minibatch_index in minibatch_indices:
                r, input_mask = self.get_batch(self.data['ratings'], minibatch_index, n_train_batches)
                feed_dict = {self.r: r, self.input_mask: input_mask, self.output_mask: input_mask}
                feed_dict.update(self.network.all_drop)  # enable dropout
                cost, _ = self.sess.run([self.cost, self.train_op], feed_dict=feed_dict)
                total_cost += cost
            print('epoch: %s, total_cost: %s' % (epoch, total_cost))
            for k in ['train', 'val', 'test']:
                self.test_model(k)

    def set_default_ratings(self, dataset, r_pred, input_mask):
        # Users/movies never seen during training get a neutral rating of 3
        # instead of the model's extrapolation.
        unseen_users = dataset['users'] - self.data['train']['users']
        unseen_movies = dataset['movies'] - self.data['train']['movies']
        for user in unseen_users:
            for movie in unseen_movies:
                if input_mask[user, movie] == 1:
                    r_pred[user, movie] = 3

    def test_model(self, k):
        ratings = self.data['ratings']
        dataset = self.data[k]
        input_mask = dataset['mask']
        output_mask = (ratings != 0).astype(np.float32)
        feed_dict = {
            self.r: ratings,
            self.input_mask: input_mask,
            self.output_mask: output_mask,
        }
        dp_dict = tl.utils.dict_to_one(self.network.all_drop)  # disable dropout
        feed_dict.update(dp_dict)
        # Run the bare tensor: wrapping it in a list would return a one-element
        # list and break the array indexing below.
        r_pred = self.sess.run(self.r_pred, feed_dict=feed_dict)
        self.set_default_ratings(dataset, r_pred, input_mask)
        # RMSE over all observed ratings, with only this split's ratings fed in.
        rmse = np.sqrt(np.sum(np.multiply(r_pred - ratings, output_mask) ** 2) / np.count_nonzero(output_mask))
        print('%s RMSE: %f' % (k.upper(), rmse))
--------------------------------------------------------------------------------
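The cost defined in model.py above is AutoRec's objective: squared reconstruction error counted only on observed ratings, plus an L2 weight penalty. A minimal NumPy sketch of the same computation, for reference (the function name `masked_autorec_cost` is illustrative and not part of the repo):

```
import numpy as np

def masked_autorec_cost(r, r_pred, output_mask, W_encoder, W_decoder, penalty):
    # Squared error only where output_mask is 1, i.e. on observed ratings.
    reconstruction = np.sum(((r - r_pred) * output_mask) ** 2)
    # L2 penalty on both weight matrices, scaled by penalty/2 as in the paper.
    l2 = np.sum(W_encoder ** 2) + np.sum(W_decoder ** 2)
    return reconstruction + 0.5 * penalty * l2
```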
/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def load_ratings(fname, random_state=42):
    # '::' is a multi-character separator, which needs the python parser engine.
    ratings = pd.read_csv(fname, sep='::', engine='python',
                          names=['userId', 'movieId', 'rating', 'timestamp'])

    indices = np.arange(len(ratings))
    train_val_indices, test_indices = train_test_split(indices, test_size=0.1, random_state=random_state)
    train_indices, val_indices = train_test_split(train_val_indices, test_size=0.1, random_state=random_state)

    # Map the sparse MovieLens ids onto dense, zero-based matrix indices.
    movie_idxs = {}
    user_idxs = {}
    def get_user_idx(user_id):
        if user_id not in user_idxs:
            user_idxs[user_id] = len(user_idxs)
        return user_idxs[user_id]

    def get_movie_idx(movie_id):
        if movie_id not in movie_idxs:
            movie_idxs[movie_id] = len(movie_idxs)
        return movie_idxs[movie_id]

    num_users = ratings.userId.nunique()
    num_movies = ratings.movieId.nunique()
    data = {
        'ratings': np.zeros((num_users, num_movies), dtype=np.float32),
        'train': {
            'mask': np.zeros((num_users, num_movies), dtype=np.float32),
            'users': set(),
            'movies': set(),
        },
        'val': {
            'mask': np.zeros((num_users, num_movies), dtype=np.float32),
            'users': set(),
            'movies': set(),
        },
        'test': {
            'mask': np.zeros((num_users, num_movies), dtype=np.float32),
            'users': set(),
            'movies': set(),
        },
    }

    # Fill the dense ratings matrix and the per-split masks.
    for split_indices, k in [(train_indices, 'train'), (val_indices, 'val'), (test_indices, 'test')]:
        for row in ratings.iloc[split_indices].itertuples():
            user_idx = get_user_idx(row.userId)
            movie_idx = get_movie_idx(row.movieId)
            data['ratings'][user_idx, movie_idx] = row.rating
            data[k]['mask'][user_idx, movie_idx] = 1
            data[k]['users'].add(user_idx)
            data[k]['movies'].add(movie_idx)

    return data
--------------------------------------------------------------------------------
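Putting the pieces together, a programmatic entry point equivalent to what main.py does looks like the sketch below (hyperparameter values copied from the flag defaults in main.py; this snippet is illustrative, not a file in the repo):

```
from model import AutoRec
from utils import load_ratings

# Build the dense ratings matrix and the train/val/test masks.
data = load_ratings('ml-1m/ratings.dat', random_state=1234)

# Defaults mirror the flags defined in main.py.
model = AutoRec(data=data, batch_size=256, hidden_size=500,
                lr=0.005, penalty=1.0, keep=0.9)
model.train(n_epochs=50)
```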