├── .gitignore ├── LICENSE ├── R ├── fossil.R └── preprocess.R ├── README.md ├── data └── movielens_trunc.csv └── img ├── fossil_model.png └── fossil_model2.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Seong Hyun Hwang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /R/fossil.R: -------------------------------------------------------------------------------- 1 | library(plyr) 2 | library(data.table) 3 | library(Matrix) 4 | library(doMC) 5 | library(pryr) 6 | library(hashmap) 7 | 8 | registerDoMC(detectCores()-1) 9 | 10 | # FOSSIL: Factorized Sequential Prediction with Item Similarity Models 11 | # https://arxiv.org/pdf/1609.09152.pdf 12 | 13 | # Score either a true or a false product (prod) given user_id, user_prods, and the model parameters 14 | prod_score <- function(user_id, user_prods, prod, 15 | V, H, bias, eta_bias, eta, 16 | alpha, mc_order) { 17 | 18 | long_term_dynamics <- (length(user_prods) ^ (-alpha)) * apply(V[user_prods,,drop=FALSE], 2, sum) # vector of length k 19 | min_order <- min(mc_order, length(user_prods)) 20 | 21 | rev_user_prods <- rev(user_prods[-(1:(length(user_prods) - min_order))]) 22 | if (min_order == length(user_prods)) rev_user_prods <- rev(user_prods) 23 | 24 | if (length(user_id) == 0) { 25 | short_term_dynamics <- matrix((eta_bias + apply(eta, 2, mean))[1:min_order], nrow = 1) %*% V[rev_user_prods,,drop=FALSE] 26 | } else { 27 | short_term_dynamics <- matrix((eta_bias + eta[user_id,])[1:min_order], nrow = 1) %*% V[rev_user_prods,,drop=FALSE] 28 | } 29 | 30 | if (length(prod) != 0) { 31 | return(bias[prod] + apply((long_term_dynamics + short_term_dynamics) * H[prod,], 1, sum)) 32 | } else { 33 | return(bias + (long_term_dynamics + short_term_dynamics) %*% t(H)) 34 | } 35 | } 36 | 37 | # Train the model parameters 38 | # dat2: a data.table with column names usr, prod (duplicates rows allowed) 39 | fossil <- function(dat2, k_dim = 32, mc_order = 1, 40 | alpha = 0.5, reg = 0, init_sigma = 1, 41 | learning_rate = 0.5, learning_rate_decay = 1, 42 | maxiters = 100, maxtime = Inf, 43 | seed = 123) { 44 | 45 | starttime <- Sys.time() 46 | 47 | # n_users: total number of users 48 | # n_prods: total number of products 49 | n_users <- uniqueN(dat2[,usr]) 50 | n_prods <- uniqueN(dat2[,prod]) 51 | 52 | # Initialize the model parameters 53 | set.seed(seed) 54 | V <- init_sigma * matrix(rnorm(n_prods * k_dim), nrow = n_prods, ncol = k_dim) 55 | H <- init_sigma * matrix(rnorm(n_prods * k_dim), nrow = n_prods, ncol = k_dim) 56 | eta <- init_sigma * matrix(rnorm(n_users * mc_order), nrow = n_users, ncol = mc_order) 57 | eta_bias <- vector('numeric', length = mc_order) 58 | bias <- vector('numeric', length = n_prods) 59 | 60 | # Training starts here... 
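# Note: the sampling loop below reads all_users and usr2idx (built in R/preprocess.R) from the
# global environment and expects dat2 to carry the prod_idx column added there, so run
# R/preprocess.R before calling fossil().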
61 | iters <- 1 62 | avg_cost <- vector('numeric', maxiters) 63 | current_absolute_cost <- vector('numeric', maxiters) 64 | current_delta <- vector('numeric', maxiters) 65 | #avg_recall <- vector('numeric', maxiters) 66 | #avg_sps <- vector('numeric', maxiters) 67 | #avg_uc <- vector('numeric', maxiters) 68 | #avg_bbs <- vector('numeric', maxiters) 69 | 70 | while (iters <= maxiters && Sys.time() - starttime < 60*maxtime) { 71 | 72 | # Pick a random training sample: (user_id, true_prod, false_prod) 73 | #cat('Picking a random training sample\n') 74 | user <- sample(all_users, 1) 75 | user_id <- usr2idx[[user]] 76 | user_prod_ids <- dat2[usr == user, prod_idx] 77 | 78 | rand <- sample(1:length(user_prod_ids), 1) 79 | 80 | false_prod <- sample(1:n_prods, 1) 81 | #false_prod <- sample(unique(dat2[,prod_idx]), 1) 82 | while (false_prod %in% user_prod_ids[1:rand]) { 83 | false_prod <- sample(1:n_prods, 1) 84 | #false_prod <- sample(unique(dat2[,prod_idx]), 1) 85 | } 86 | 87 | user_prods <- user_prod_ids[1:rand] 88 | 89 | # Learning rate decay 90 | if (iters %% 10 == 0) { 91 | learning_rate <- learning_rate * learning_rate_decay 92 | } 93 | 94 | # Update model parameters using stochastic gradient descent one training sample at a time 95 | true_prod <- user_prods[length(user_prods)] 96 | user_prods <- user_prods[1:(length(user_prods)-1)] 97 | min_order <- min(mc_order, length(user_prods)) 98 | 99 | long_term_dynamics <- (length(user_prods) ^ (-alpha)) * apply(V[user_prods,,drop=FALSE], 2, sum) 100 | 101 | rev_user_prods <- rev(user_prods[-(1:(length(user_prods) - min_order))]) 102 | if (min_order == length(user_prods)) rev_user_prods <- rev(user_prods) 103 | 104 | short_term_dynamics <- matrix((eta_bias + eta[user_id,])[1:min_order], nrow = 1) %*% V[rev_user_prods,,drop=FALSE] 105 | 106 | # Compute absolute error and delta (sigmoid of error) 107 | #cat('Computing the error\n') 108 | x_true <- prod_score(user_id, user_prods, true_prod, V, H, bias, eta_bias, eta, alpha, mc_order) 109 | x_false <- prod_score(user_id, user_prods, false_prod, V, H, bias, eta_bias, eta, alpha, mc_order) 110 | absolute_error <- x_false - x_true 111 | delta <- 1 / (1 + exp(-min(10, max(-10, absolute_error)))) 112 | 113 | # Compute the updates 114 | # long_term_dynamics + short_term_dynamics = personalized weighting factor 115 | V_update <- learning_rate * (delta * (length(user_prods) ^ (-alpha)) * (H[true_prod,,drop=FALSE] - H[false_prod,,drop=FALSE])[rep(1, length(user_prods)),] - reg * V[user_prods,,drop=FALSE]) # matrix 116 | V_update2 <- learning_rate * delta * outer((eta_bias + eta[user_id,])[1:min_order], H[true_prod,] - H[false_prod,]) # matrix 117 | H_true_up <- learning_rate * (delta * (long_term_dynamics + short_term_dynamics) - reg * H[true_prod,]) 118 | H_false_up <- learning_rate * (-delta * (long_term_dynamics + short_term_dynamics) - reg * H[false_prod,]) 119 | bias_true_up <- learning_rate * (delta - reg * bias[true_prod]) 120 | bias_false_up <- learning_rate * (-delta - reg * bias[false_prod]) 121 | eta_bias_up <- learning_rate * (delta * apply((V[rev_user_prods,,drop=FALSE] * (H[true_prod,,drop=FALSE] - H[false_prod,,drop=FALSE])[rep(1, length(rev_user_prods)),]), 1, sum) - reg * eta_bias[1:min_order]) 122 | eta_up <- learning_rate * (delta * apply((V[rev_user_prods,,drop=FALSE] * (H[true_prod,,drop=FALSE] - H[false_prod,,drop=FALSE])[rep(1, length(rev_user_prods)),]), 1, sum) - reg * eta[user_id, 1:min_order]) 123 | 124 | # Update the model parameters 125 | #cat('Updating the model parameters\n') 126 | 
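# S-BPR stochastic gradient step: delta = sigmoid(x_false - x_true) weights each update below, so
# the parameters are nudged toward ranking the observed (true) item above the sampled negative
# (false) item, while reg applies L2 shrinkage toward zero.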
V[user_prods,] <- V[user_prods,] + V_update 127 | V[rev_user_prods,] <- V[rev_user_prods,] + V_update2 128 | H[true_prod,] <- H[true_prod,] + H_true_up 129 | H[false_prod,] <- H[false_prod,] + H_false_up 130 | bias[true_prod] <- bias[true_prod] + bias_true_up 131 | bias[false_prod] <- bias[false_prod] + bias_false_up 132 | eta_bias[1:min_order] <- eta_bias[1:min_order] + eta_bias_up 133 | eta[user_id, 1:min_order] <- eta[user_id, 1:min_order] + eta_up 134 | 135 | current_absolute_cost[iters] <- absolute_error 136 | current_delta[iters] <- delta 137 | avg_cost[iters] <- mean(current_absolute_cost[1:iters]) 138 | 139 | cat('User: ', user, '\n') 140 | cat('User ID: ', user_id, '\n') 141 | cat('True Product ID: ', true_prod, '\n') 142 | cat('False Product ID: ', false_prod, '\n') 143 | cat('Iteration: ', iters, '\n') 144 | cat('Average Error: ', avg_cost[iters], '\n') 145 | cat('Time Progressed: ', Sys.time() - starttime, '\n') 146 | cat('\n\n') 147 | 148 | iters <- iters + 1 149 | } 150 | 151 | return(list(avg_cost = avg_cost, 152 | current_absolute_cost = current_absolute_cost, 153 | current_delta = current_delta, 154 | V = V, 155 | H = H, 156 | bias = bias, 157 | eta_bias = eta_bias, 158 | eta = eta, 159 | alpha = alpha, 160 | mc_order = mc_order)) 161 | } 162 | 163 | # Recommend top k products for every user 164 | # users: a vector of users to recommend products for 165 | top_k <- function(dat2, users, V, H, bias, eta_bias, eta, alpha, mc_order, 166 | top_k = 10, excluded_prods = NULL, parallel = FALSE) { 167 | 168 | dat_trunc <- dat2[usr %in% users] 169 | dat_trunc <- split(dat_trunc, by = 'usr') 170 | recommend <- llply(dat_trunc, function(x) { 171 | user <- unique(x[,usr]) 172 | uid <- unique(x[,usr_idx]) 173 | viewed_prod_ids <- x[,prod_idx] 174 | scores <- prod_score(user_id = uid, user_prods = viewed_prod_ids, prod = NULL, 175 | V = V, H = H, bias = bias, 176 | eta_bias = eta_bias, eta = eta, 177 | alpha = alpha, mc_order = mc_order) 178 | #scores[,viewed_prod_ids] <- -Inf 179 | if (length(excluded_prods) != 0) { 180 | excluded_prod_ids <- prod2idx[[excluded_prods]] 181 | scores[,excluded_prod_ids] <- -Inf 182 | } 183 | ranked_prod_ids <- order(scores, decreasing = TRUE)[1:top_k] 184 | output <- data.table(usr = user, 185 | usr_idx = uid, 186 | prod = idx2prod[[ranked_prod_ids]], 187 | prod_idx = ranked_prod_ids, 188 | score = scores[ranked_prod_ids]) 189 | return(output) 190 | }, .parallel = parallel) 191 | return(rbindlist(recommend)) 192 | } 193 | 194 |
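# Usage sketch: a minimal example of the intended workflow, assuming R/preprocess.R has been run
# first so that train, all_users, usr2idx, prod2idx and idx2prod exist in the global environment
# (fossil() and top_k() read these globals rather than taking them as arguments). The parameter
# values below are illustrative, not tuned.
model <- fossil(train, k_dim = 32, mc_order = 1, alpha = 0.5, reg = 0,
                learning_rate = 0.5, maxiters = 1000)
recs <- top_k(train, users = head(all_users, 5),
              V = model$V, H = model$H, bias = model$bias,
              eta_bias = model$eta_bias, eta = model$eta,
              alpha = model$alpha, mc_order = model$mc_order, top_k = 10)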
-------------------------------------------------------------------------------- /R/preprocess.R: -------------------------------------------------------------------------------- 1 | library(plyr) 2 | library(data.table) 3 | library(Matrix) 4 | library(doMC) 5 | library(pryr) 6 | library(hashmap) 7 | 8 | options(scipen = 15) 9 | 10 | # Load the dataset: MovieLens 1M 11 | dat <- fread("data/movielens_trunc.csv") 12 | dat2 <- dat[, .(usr, prod)] 13 | 14 | # Remove users with low rating activity and rare items 15 | set.seed(321) 16 | min_usr <- 1 17 | min_prod <- 5 18 | dat2 <- dat2[usr %in% dat2[, .N, usr][N > min_usr, usr]] 19 | dat2 <- dat2[prod %in% dat2[, .N, prod][N > min_prod, prod]] 20 | dat2 <- dat2[usr %in% dat2[, .N, usr][N > min_usr, usr]] 21 | 22 | # Optional: randomly sample 1000 users 23 | # dat2 <- dat2[usr %in% sample(unique(dat2[, usr]), 1000, replace = FALSE)] 24 | 25 | # Split data into train, dev, test 26 | # for every user, put the last 5 percent of products in test 27 | # and the last ~15 percent of the remaining products in dev 28 | split_data <- function(dat2, test_prop = 0.05, dev_prop = 0.15, seed = 123, parallel = FALSE) { 29 | set.seed(seed) 30 | dat_trunc <- split(dat2, by = 'usr') 31 | dat_split <- llply(dat_trunc, function(x) { 32 | n <- x[,.N] 33 | test_idx <- tail(seq_len(n), floor(n*test_prop)) 34 | dev_idx <- tail(setdiff(seq_len(n), test_idx), floor(length(setdiff(seq_len(n), test_idx)) * dev_prop / (1-test_prop))) 35 | return(list(test = x[test_idx], 36 | dev = x[dev_idx], 37 | train = x[setdiff(seq_len(n), c(dev_idx, test_idx))])) 38 | }, .parallel = parallel) 39 | return(dat_split) 40 | } 41 | 42 | dat_split <- split_data(dat2, test_prop = 0.05, dev_prop = 0.15, seed = 321, parallel = TRUE) 43 | 44 | train <- rbindlist(llply(dat_split, function(x) x$train)) 45 | dev <- rbindlist(llply(dat_split, function(x) x$dev)) 46 | test <- rbindlist(llply(dat_split, function(x) x$test)) 47 | 48 | # rm(dat_split) 49 | # rm(dat2) 50 | 51 | # Look-up hashmaps for users and products 52 | all_prods <- unique(train[,prod]) 53 | prod2idx <- hashmap(key = all_prods, values = 1:length(all_prods)) 54 | idx2prod <- hashmap(key = 1:length(all_prods), values = all_prods) 55 | 56 | all_users <- unique(train[,usr]) 57 | usr2idx <- hashmap(key = all_users, values = 1:length(all_users)) 58 | idx2usr <- hashmap(key = 1:length(all_users), values = all_users) 59 | 60 | train[, usr_idx := usr2idx[[usr]]] 61 | train[, prod_idx := prod2idx[[prod]]] 62 | 63 | # Check: train should now contain 4 columns (usr, prod, usr_idx, prod_idx) 64 | # head(train) 65 | 66 | # Retrieve the top 1% of products as popular products 67 | pop_prods <- train[, .N, by = .(prod, prod_idx)][order(-N)][1:floor(.N * 0.01)] 68 | 69 | # If a product in dev or test set does not appear in training, make sure to assign a fixed prod_idx of zero 70 | # and add that prod_idx to product hashmaps. Alternatively, as done here, remove all products that do not appear in training 71 | dev[, usr_idx := usr2idx[[usr]]] 72 | dev[, prod_idx := prod2idx[[prod]]] 73 | dev <- dev[!is.na(prod_idx)] 74 | 75 | test[, usr_idx := usr2idx[[usr]]] 76 | test[, prod_idx := prod2idx[[prod]]] 77 | test <- test[!is.na(prod_idx)] 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FOSSIL: Factorized Sequential Prediction with Item Similarity Models 2 | Paper: [Fusing Similarity Models with Markov Chains for Sparse Sequential Recommendation](https://arxiv.org/pdf/1609.09152.pdf) (He and McAuley, 2016) 3 | 4 | This repository contains my implementation of FOSSIL in R, which predicts a user's next purchase from their past purchase history. The algorithm uses a similarity-based method to reduce sparsity in modeling user preferences, as well as higher-order Markov chains to smooth user preferences across multiple time steps. 5 | 6 | ![FOSSIL](/img/fossil_model.png) 7 | 8 | ### Main Ideas 9 | FOSSIL models both long-term **user preference** (matrix factorization) and short-term **sequential dynamics** (Markov chains). Previously, there had been several attempts to model sequential dynamics and combine them with matrix factorization: 10 | 11 | - Factorized Markov Chains (FMC), which factorize the item-item transition matrix (non-personalized) 12 | - Tensor Factorization (TF; Rendle et al. 2008), where the data are represented as a tensor whose entries capture the likelihood that a user transitions from one item to another 13 | - Factorized Personalized Markov Chains (FPMC; Rendle et al. 2010), which factorize the user-item matrix and consider first-order Markov chains. 14 | 15 | FOSSIL is similar in spirit to FPMC, but it takes inspiration from **FISM** (Factored Item Similarity Models for Top-N Recommender Systems; Kabbur et al. 2013) and factorizes the item-item matrix, which reduces sparsity by not having to model users explicitly, and it relaxes FPMC's first-order Markov chain assumption to higher orders. 16 | 17 | ![FOSSIL2](/img/fossil_model2.png)
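In the notation of the code in *R/fossil.R* (`V` and `H` are the two k-dimensional item embedding matrices, `bias` the item biases, `eta_bias` and `eta` the shared and user-specific Markov weights), the score that `prod_score()` assigns to a candidate item $j$ for a user $u$ with ordered history $S$ is roughly

$$ x_{u,j} = b_j + \Big\langle\, |S|^{-\alpha} \sum_{i \in S} V_i \;+\; \sum_{l=1}^{\min(L,\,|S|)} \big(\eta_l + \eta_{u,l}\big)\, V_{S_{|S|+1-l}},\; H_j \Big\rangle $$

where $L$ is `mc_order`, $\alpha$ is the decay exponent `alpha`, and $S_{|S|+1-l}$ denotes the $l$-th most recent item in the history.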
18 | 19 | Since we are interested in top-k recommendation (typically k = 10), the algorithm minimizes the S-BPR (Sequential Bayesian Personalized Ranking) loss with stochastic gradient descent, one training sample at a time, to optimize the model parameters. S-BPR uses a sigmoid function to model the probability that the true (observed) item is ranked higher than a sampled false item, given the user and the model parameters, assuming independence across users and time steps. 20 | 21 | ### Instructions 22 | Run the *preprocess.R* script on the *movielens_trunc.csv* file, then run *fossil.R* on the training set it produces. I'm going to upload the Python version of the algorithm as well as functions for evaluation metrics soon, so stay tuned. 23 | 24 | -------------------------------------------------------------------------------- /img/fossil_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stathwang/FOSSIL/28dcbce0845738d8ed2d89b4acc041cf77e5c184/img/fossil_model.png -------------------------------------------------------------------------------- /img/fossil_model2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stathwang/FOSSIL/28dcbce0845738d8ed2d89b4acc041cf77e5c184/img/fossil_model2.png --------------------------------------------------------------------------------