├── .gitignore ├── LICENSE ├── R ├── fossil.R └── preprocess.R ├── README.md ├── data └── movielens_trunc.csv └── img ├── fossil_model.png └── fossil_model2.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Seong Hyun Hwang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /R/fossil.R: -------------------------------------------------------------------------------- 1 | library(plyr) 2 | library(data.table) 3 | library(Matrix) 4 | library(doMC) 5 | library(pryr) 6 | library(hashmap) 7 | 8 | registerDoMC(detectCores()-1) 9 | 10 | # FOSSIL: Factorized Sequential Prediction with Item Similarity Models 11 | # https://arxiv.org/pdf/1609.09152.pdf 12 | 13 | # Score either a true or a false product (prod) given user_id, user_prods, and the model parameters 14 | prod_score <- function(user_id, user_prods, prod, 15 | V, H, bias, eta_bias, eta, 16 | alpha, mc_order) { 17 | 18 | long_term_dynamics <- (length(user_prods) ^ (-alpha)) * apply(V[user_prods,,drop=FALSE], 2, sum) # vector of length k 19 | min_order <- min(mc_order, length(user_prods)) 20 | 21 | rev_user_prods <- rev(user_prods[-(1:(length(user_prods) - min_order))]) 22 | if (min_order == length(user_prods)) rev_user_prods <- rev(user_prods) 23 | 24 | if (length(user_id) == 0) { 25 | short_term_dynamics <- matrix((eta_bias + apply(eta, 2, mean))[1:min_order], nrow = 1) %*% V[rev_user_prods,,drop=FALSE] 26 | } else { 27 | short_term_dynamics <- matrix((eta_bias + eta[user_id,])[1:min_order], nrow = 1) %*% V[rev_user_prods,,drop=FALSE] 28 | } 29 | 30 | if (length(prod) != 0) { 31 | return(bias[prod] + apply((long_term_dynamics + short_term_dynamics) * H[prod,], 1, sum)) 32 | } else { 33 | return(bias + (long_term_dynamics + short_term_dynamics) %*% t(H)) 34 | } 35 | } 36 | 37 | # Train the model parameters 38 | # dat2: a data.table with column names usr, prod (duplicates rows allowed) 39 | fossil <- function(dat2, k_dim = 32, mc_order = 1, 40 | alpha = 0.5, reg = 0, init_sigma = 1, 41 | learning_rate = 0.5, learning_rate_decay = 1, 42 | maxiters = 100, maxtime = Inf, 43 | seed = 123) { 44 | 45 | starttime <- Sys.time() 46 | 47 | # n_users: total number of users 48 | # n_prods: total number of products 49 | n_users <- uniqueN(dat2[,usr]) 50 | n_prods <- uniqueN(dat2[,prod]) 51 | 52 | # Initialize the model parameters 53 | set.seed(seed) 54 | V <- init_sigma * matrix(rnorm(n_prods * k_dim), nrow = n_prods, ncol = k_dim) 55 | H <- init_sigma * matrix(rnorm(n_prods * k_dim), nrow = n_prods, ncol = k_dim) 56 | eta <- init_sigma * matrix(rnorm(n_users * mc_order), nrow = n_users, ncol = mc_order) 57 | eta_bias <- vector('numeric', length = mc_order) 58 | bias <- vector('numeric', length = n_prods) 59 | 60 | # Training starts here... 
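# Note: the sampling loop below reads all_users and usr2idx (built in R/preprocess.R) from the
# global environment and expects dat2 to carry the prod_idx column added there, so run
# R/preprocess.R before calling fossil().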
61 | iters <- 1 62 | avg_cost <- vector('numeric', maxiters) 63 | current_absolute_cost <- vector('numeric', maxiters) 64 | current_delta <- vector('numeric', maxiters) 65 | #avg_recall <- vector('numeric', maxiters) 66 | #avg_sps <- vector('numeric', maxiters) 67 | #avg_uc <- vector('numeric', maxiters) 68 | #avg_bbs <- vector('numeric', maxiters) 69 | 70 | while (iters <= maxiters && Sys.time() - starttime < 60*maxtime) { 71 | 72 | # Pick a random training sample: (user_id, true_prod, false_prod) 73 | #cat('Picking a random training sample\n') 74 | user <- sample(all_users, 1) 75 | user_id <- usr2idx[[user]] 76 | user_prod_ids <- dat2[usr == user, prod_idx] 77 | 78 | rand <- sample(1:length(user_prod_ids), 1) 79 | 80 | false_prod <- sample(1:n_prods, 1) 81 | #false_prod <- sample(unique(dat2[,prod_idx]), 1) 82 | while (false_prod %in% user_prod_ids[1:rand]) { 83 | false_prod <- sample(1:n_prods, 1) 84 | #false_prod <- sample(unique(dat2[,prod_idx]), 1) 85 | } 86 | 87 | user_prods <- user_prod_ids[1:rand] 88 | 89 | # Learning rate decay 90 | if (iters %% 10 == 0) { 91 | learning_rate <- learning_rate * learning_rate_decay 92 | } 93 | 94 | # Update model parameters using stochastic gradient descent one training sample at a time 95 | true_prod <- user_prods[length(user_prods)] 96 | user_prods <- user_prods[1:(length(user_prods)-1)] 97 | min_order <- min(mc_order, length(user_prods)) 98 | 99 | long_term_dynamics <- (length(user_prods) ^ (-alpha)) * apply(V[user_prods,,drop=FALSE], 2, sum) 100 | 101 | rev_user_prods <- rev(user_prods[-(1:(length(user_prods) - min_order))]) 102 | if (min_order == length(user_prods)) rev_user_prods <- rev(user_prods) 103 | 104 | short_term_dynamics <- matrix((eta_bias + eta[user_id,])[1:min_order], nrow = 1) %*% V[rev_user_prods,,drop=FALSE] 105 | 106 | # Compute absolute error and delta (sigmoid of error) 107 | #cat('Computing the error\n') 108 | x_true <- prod_score(user_id, user_prods, true_prod, V, H, bias, eta_bias, eta, alpha, mc_order) 109 | x_false <- prod_score(user_id, user_prods, false_prod, V, H, bias, eta_bias, eta, alpha, mc_order) 110 | absolute_error <- x_false - x_true 111 | delta <- 1 / (1 + exp(-min(10, max(-10, absolute_error)))) 112 | 113 | # Compute the updates 114 | # long_term_dynamics + short_term_dynamics = personalized weighting factor 115 | V_update <- learning_rate * (delta * (length(user_prods) ^ (-alpha)) * (H[true_prod,,drop=FALSE] - H[false_prod,,drop=FALSE])[rep(1, length(user_prods)),] - reg * V[user_prods,,drop=FALSE]) # matrix 116 | V_update2 <- learning_rate * delta * outer((eta_bias + eta[user_id,])[1:min_order], H[true_prod,] - H[false_prod,]) # matrix 117 | H_true_up <- learning_rate * (delta * (long_term_dynamics + short_term_dynamics) - reg * H[true_prod,]) 118 | H_false_up <- learning_rate * (-delta * (long_term_dynamics + short_term_dynamics) - reg * H[false_prod,]) 119 | bias_true_up <- learning_rate * (delta - reg * bias[true_prod]) 120 | bias_false_up <- learning_rate * (-delta - reg * bias[false_prod]) 121 | eta_bias_up <- learning_rate * (delta * apply((V[rev_user_prods,,drop=FALSE] * (H[true_prod,,drop=FALSE] - H[false_prod,,drop=FALSE])[rep(1, length(rev_user_prods)),]), 1, sum) - reg * eta_bias[1:min_order]) 122 | eta_up <- learning_rate * (delta * apply((V[rev_user_prods,,drop=FALSE] * (H[true_prod,,drop=FALSE] - H[false_prod,,drop=FALSE])[rep(1, length(rev_user_prods)),]), 1, sum) - reg * eta[user_id, 1:min_order]) 123 | 124 | # Update the model parameters 125 | #cat('Updating the model parameters\n') 126 | 
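# S-BPR stochastic gradient step: delta = sigmoid(x_false - x_true) weights each update below, so
# the parameters are nudged toward ranking the observed (true) item above the sampled negative
# (false) item, while reg applies L2 shrinkage toward zero.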
V[user_prods,] <- V[user_prods,] + V_update 127 | V[rev_user_prods,] <- V[rev_user_prods,] + V_update2 128 | H[true_prod,] <- H[true_prod,] + H_true_up 129 | H[false_prod,] <- H[false_prod,] + H_false_up 130 | bias[true_prod] <- bias[true_prod] + bias_true_up 131 | bias[false_prod] <- bias[false_prod] + bias_false_up 132 | eta_bias[1:min_order] <- eta_bias[1:min_order] + eta_bias_up 133 | eta[user_id, 1:min_order] <- eta[user_id, 1:min_order] + eta_up 134 | 135 | current_absolute_cost[iters] <- absolute_error 136 | current_delta[iters] <- delta 137 | avg_cost[iters] <- mean(current_absolute_cost[1:iters]) 138 | 139 | cat('User: ', user, '\n') 140 | cat('User ID: ', user_id, '\n') 141 | cat('True Product ID: ', true_prod, '\n') 142 | cat('False Product ID: ', false_prod, '\n') 143 | cat('Iteration: ', iters, '\n') 144 | cat('Average Error: ', avg_cost[iters], '\n') 145 | cat('Time Progressed: ', Sys.time() - starttime, '\n') 146 | cat('\n\n') 147 | 148 | iters <- iters + 1 149 | } 150 | 151 | return(list(avg_cost = avg_cost, 152 | current_absolute_cost = current_absolute_cost, 153 | current_delta = current_delta, 154 | V = V, 155 | H = H, 156 | bias = bias, 157 | eta_bias = eta_bias, 158 | eta = eta, 159 | alpha = alpha, 160 | mc_order = mc_order)) 161 | } 162 | 163 | # Recommend top k products for every user 164 | # users: a vector of users to recommend products for 165 | top_k <- function(dat2, users, V, H, bias, eta_bias, eta, alpha, mc_order, 166 | top_k = 10, excluded_prods = NULL, parallel = FALSE) { 167 | 168 | dat_trunc <- dat2[usr %in% users] 169 | dat_trunc <- split(dat_trunc, by = 'usr') 170 | recommend <- llply(dat_trunc, function(x) { 171 | user <- unique(x[,usr]) 172 | uid <- unique(x[,usr_idx]) 173 | viewed_prod_ids <- x[,prod_idx] 174 | scores <- prod_score(user_id = uid, user_prods = viewed_prod_ids, prod = NULL, 175 | V = V, H = H, bias = bias, 176 | eta_bias = eta_bias, eta = eta, 177 | alpha = alpha, mc_order = mc_order) 178 | #scores[,viewed_prod_ids] <- -Inf 179 | if (length(excluded_prods) != 0) { 180 | excluded_prod_ids <- prod2idx[[excluded_prods]] 181 | scores[,excluded_prod_ids] <- -Inf 182 | } 183 | ranked_prod_ids <- order(scores, decreasing = TRUE)[1:top_k] 184 | output <- data.table(usr = user, 185 | usr_idx = uid, 186 | prod = idx2prod[[ranked_prod_ids]], 187 | prod_idx = ranked_prod_ids, 188 | score = scores[ranked_prod_ids]) 189 | return(output) 190 | }, .parallel = parallel) 191 | return(rbindlist(recommend)) 192 | } 193 | 194 |
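# Usage sketch: a minimal example of the intended workflow, assuming R/preprocess.R has been run
# first so that train, all_users, usr2idx, prod2idx and idx2prod exist in the global environment
# (fossil() and top_k() read these globals rather than taking them as arguments). The parameter
# values below are illustrative, not tuned.
model <- fossil(train, k_dim = 32, mc_order = 1, alpha = 0.5, reg = 0,
                learning_rate = 0.5, maxiters = 1000)
recs <- top_k(train, users = head(all_users, 5),
              V = model$V, H = model$H, bias = model$bias,
              eta_bias = model$eta_bias, eta = model$eta,
              alpha = model$alpha, mc_order = model$mc_order, top_k = 10)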
-------------------------------------------------------------------------------- /R/preprocess.R: -------------------------------------------------------------------------------- 1 | library(plyr) 2 | library(data.table) 3 | library(Matrix) 4 | library(doMC) 5 | library(pryr) 6 | library(hashmap) 7 | 8 | options(scipen = 15) 9 | 10 | # Load the dataset: MovieLens 1M 11 | dat <- fread("data/movielens_trunc.csv") 12 | dat2 <- dat[, .(usr, prod)] 13 | 14 | # Remove users with low rating activity and rare items 15 | set.seed(321) 16 | min_usr <- 1 17 | min_prod <- 5 18 | dat2 <- dat2[usr %in% dat2[, .N, usr][N > min_usr, usr]] 19 | dat2 <- dat2[prod %in% dat2[, .N, prod][N > min_prod, prod]] 20 | dat2 <- dat2[usr %in% dat2[, .N, usr][N > min_usr, usr]] 21 | 22 | # Optional: randomly sample 1000 users 23 | # dat2 <- dat2[usr %in% sample(unique(dat2[, usr]), 1000, replace = FALSE)] 24 | 25 | # Split data into train, dev, test 26 | # for every user, put the last 5 percent of products in test 27 | # and the last ~15 percent of the remaining products in dev 28 | split_data <- function(dat2, test_prop = 0.05, dev_prop = 0.15, seed = 123, parallel = FALSE) { 29 | set.seed(seed) 30 | dat_trunc <- split(dat2, by = 'usr') 31 | dat_split <- llply(dat_trunc, function(x) { 32 | n <- x[,.N] 33 | test_idx <- tail(seq_len(n), floor(n*test_prop)) 34 | dev_idx <- tail(setdiff(seq_len(n), test_idx), floor(length(setdiff(seq_len(n), test_idx)) * dev_prop / (1-test_prop))) 35 | return(list(test = x[test_idx], 36 | dev = x[dev_idx], 37 | train = x[setdiff(seq_len(n), c(dev_idx, test_idx))])) 38 | }, .parallel = parallel) 39 | return(dat_split) 40 | } 41 | 42 | dat_split <- split_data(dat2, test_prop = 0.05, dev_prop = 0.15, seed = 321, parallel = TRUE) 43 | 44 | train <- rbindlist(llply(dat_split, function(x) x$train)) 45 | dev <- rbindlist(llply(dat_split, function(x) x$dev)) 46 | test <- rbindlist(llply(dat_split, function(x) x$test)) 47 | 48 | # rm(dat_split) 49 | # rm(dat2) 50 | 51 | # Look-up hashmaps for users and products 52 | all_prods <- unique(train[,prod]) 53 | prod2idx <- hashmap(key = all_prods, values = 1:length(all_prods)) 54 | idx2prod <- hashmap(key = 1:length(all_prods), values = all_prods) 55 | 56 | all_users <- unique(train[,usr]) 57 | usr2idx <- hashmap(key = all_users, values = 1:length(all_users)) 58 | idx2usr <- hashmap(key = 1:length(all_users), values = all_users) 59 | 60 | train[, usr_idx := usr2idx[[usr]]] 61 | train[, prod_idx := prod2idx[[prod]]] 62 | 63 | # Check: train should now contain 4 columns (usr, prod, usr_idx, prod_idx) 64 | # head(train) 65 | 66 | # Retrieve the top 1% of products as popular products 67 | pop_prods <- train[, .N, by = .(prod, prod_idx)][order(-N)][1:floor(.N * 0.01)] 68 | 69 | # If a product in dev or test set does not appear in training, make sure to assign a fixed prod_idx of zero 70 | # and add that prod_idx to product hashmaps. Alternatively, as done here, remove all products that do not appear in training 71 | dev[, usr_idx := usr2idx[[usr]]] 72 | dev[, prod_idx := prod2idx[[prod]]] 73 | dev <- dev[!is.na(prod_idx)] 74 | 75 | test[, usr_idx := usr2idx[[usr]]] 76 | test[, prod_idx := prod2idx[[prod]]] 77 | test <- test[!is.na(prod_idx)] 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FOSSIL: Factorized Sequential Prediction with Item Similarity Models 2 | Paper: [Fusing Similarity Models with Markov Chains for Sparse Sequential Recommendation](https://arxiv.org/pdf/1609.09152.pdf) (He and McAuley, 2016) 3 | 4 | This repository contains my implementation of FOSSIL in R, which predicts a user's next purchase from their past purchase history. The algorithm uses a similarity-based method to reduce sparsity in modeling user preferences, as well as higher-order Markov chains to smooth user preferences across multiple time steps. 5 | 6 | ![FOSSIL](/img/fossil_model.png) 7 | 8 | ### Main Ideas 9 | FOSSIL models both long-term **user preference** (matrix factorization) and short-term **sequential dynamics** (Markov chains). Previously, there had been several attempts to model sequential dynamics and combine them with matrix factorization: 10 | 11 | - Factorized Markov Chains (FMC), which factorize the item-item transition matrix (non-personalized) 12 | - Tensor Factorization (TF; Rendle et al. 2008), where the data are represented as a tensor whose entries capture the likelihood that a user transitions from one item to another 13 | - Factorized Personalized Markov Chains (FPMC; Rendle et al. 2010), which factorize the user-item matrix and consider first-order Markov chains. 14 | 15 | FOSSIL is similar in spirit to FPMC, but it takes inspiration from **FISM** (Factored Item Similarity Models for Top-N Recommender Systems; Kabbur et al. 2013) and factorizes the item-item matrix, which reduces sparsity by not having to model users explicitly, and it relaxes FPMC's first-order Markov chain assumption to higher orders. 16 | 17 | ![FOSSIL2](/img/fossil_model2.png)
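In the notation of the code in *R/fossil.R* (`V` and `H` are the two k-dimensional item embedding matrices, `bias` the item biases, `eta_bias` and `eta` the shared and user-specific Markov weights), the score that `prod_score()` assigns to a candidate item $j$ for a user $u$ with ordered history $S$ is roughly

$$ x_{u,j} = b_j + \Big\langle\, |S|^{-\alpha} \sum_{i \in S} V_i \;+\; \sum_{l=1}^{\min(L,\,|S|)} \big(\eta_l + \eta_{u,l}\big)\, V_{S_{|S|+1-l}},\; H_j \Big\rangle $$

where $L$ is `mc_order`, $\alpha$ is the decay exponent `alpha`, and $S_{|S|+1-l}$ denotes the $l$-th most recent item in the history.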
18 | 19 | Since we are interested in top-k recommendation (typically k = 10), the algorithm minimizes the S-BPR (Sequential Bayesian Personalized Ranking) loss with stochastic gradient descent, one training sample at a time, to optimize the model parameters. S-BPR uses a sigmoid function to model the probability that the true (observed) item is ranked higher than a sampled false item, given the user and the model parameters, assuming independence across users and time steps. 20 | 21 | ### Instructions 22 | Run the *preprocess.R* script on the *movielens_trunc.csv* file, then run *fossil.R* on the training set it produces. I'm going to upload the Python version of the algorithm as well as functions for evaluation metrics soon, so stay tuned. 23 | 24 | -------------------------------------------------------------------------------- /img/fossil_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stathwang/FOSSIL/28dcbce0845738d8ed2d89b4acc041cf77e5c184/img/fossil_model.png -------------------------------------------------------------------------------- /img/fossil_model2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stathwang/FOSSIL/28dcbce0845738d8ed2d89b4acc041cf77e5c184/img/fossil_model2.png --------------------------------------------------------------------------------