├── LICENSE
├── README.md
├── recsys-binary-implicit-keras.R
├── tensorflow-wide-n-deep.R
└── triplet-loss-keras
    ├── data-movielens.R
    ├── metric-auc.R
    ├── plot-model.R
    ├── triplet-loss-bpr-keras.R
    └── triplet-loss-margin-keras.R

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018, Nan Xiao

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# deep-learning-recipes

[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
![License: MIT](https://img.shields.io/github/license/nanxstats/deep-learning-recipes.svg)

R implementations of selected machine learning methods, built with deep learning frameworks (Keras, TensorFlow).

Principles: clean, self-contained, minimal dependencies, works with the latest framework versions.

## Contents

- Triplet losses for implicit feedback recommender systems. [[blog post](https://nanx.me/blog/post/triplet-loss-r-keras/)] [[code](triplet-loss-keras)]
- Matrix factorization for binary implicit feedback data. [[blog post](https://nanx.me/blog/post/recsys-binary-implicit-feedback-r-keras/)] [[code](recsys-binary-implicit-keras.R)]
- "Wide and deep" model for regression/classification. [[blog post](https://nanx.me/blog/post/building-my-first-deep-learning-machine/)] [[code](tensorflow-wide-n-deep.R)]
--------------------------------------------------------------------------------
/recsys-binary-implicit-keras.R:
--------------------------------------------------------------------------------
library("keras")
library("magrittr")
library("progress")
library("hrbrthemes")
library("ggsci")

use_virtualenv("~/tensorflow/venv/")

# 1. preprocess data ------------------------------------------------------------

# see the full dataset at https://github.com/nanxstats/MEF/
url_drug <- "https://raw.githubusercontent.com/nanxstats/MEF/master/data/drug.txt"
url_adr <- "https://raw.githubusercontent.com/nanxstats/MEF/master/data/adr.txt"
url_pair <- "https://raw.githubusercontent.com/nanxstats/MEF/master/data/association.txt"

df_drug <- read.table(url_drug, sep = "\t", as.is = TRUE)
df_adr <- read.table(url_adr, sep = "\t", as.is = TRUE)
df_pair <- read.table(url_pair, sep = "\t", as.is = TRUE)

names(df_drug) <- c("drug_id", "drugbank_id", "kegg_id", "name", "smiles")
names(df_adr) <- c("adr_id", "name")
names(df_pair) <- c("drug_id", "adr_id")

# clean up drug id
clean_drug_id <- function(x) gsub("-", "", as.character(x))
df_drug$drug_id <- clean_drug_id(df_drug$drug_id)
df_pair$drug_id <- clean_drug_id(df_pair$drug_id)

# map drug id and adr id to 0-based integers (for the embedding inputs)
for (i in 1L:nrow(df_pair)) {
  df_pair$"drug_id"[i] <- which(df_pair$"drug_id"[i] == df_drug$"drug_id") - 1L
  df_pair$"adr_id"[i] <- which(df_pair$"adr_id"[i] == df_adr$"adr_id") - 1L
}

# complete the unobserved rows as class 0
pair <- expand.grid(
  "drug_id" = unique(df_pair$drug_id),
  "adr_id" = unique(df_pair$adr_id),
  stringsAsFactors = FALSE
)
pair$"class" <- 0L

# set known drug-ADR association pairs as class 1
pb <- progress_bar$new(
  format = "[:bar] :percent eta: :eta", total = nrow(df_pair)
)
for (i in 1L:nrow(df_pair)) {
  pb$tick()
  pair[intersect(
    which(df_pair$drug_id[i] == pair$drug_id),
    which(df_pair$adr_id[i] == pair$adr_id)
  ), "class"] <- 1L
}

# convert to integer
pair$drug_id <- as.integer(pair$drug_id)
pair$adr_id <- as.integer(pair$adr_id)

# shuffle rows
set.seed(42)
pair <- pair[sample(1L:nrow(pair)), ]
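
# sanity check (an illustrative addition, not in the original workflow):
# the completed pair grid is heavily imbalanced, which is why class_weight
# is set when fitting the model below
prop_pos <- mean(pair$class == 1L)
cat(sprintf(
  "positive rate: %.4f (about one positive per %.0f pairs)\n",
  prop_pos, 1 / prop_pos
))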

# 2. matrix factorization with Keras --------------------------------------------

# basic settings
n_drug <- length(unique(df_pair$drug_id))
n_adr <- length(unique(df_pair$adr_id))
k <- 10 # number of latent factors to learn

# input layers
input_drug <- layer_input(shape = c(1))
input_adr <- layer_input(shape = c(1))

# embedding and flatten layers
embed_drug <- input_drug %>%
  layer_embedding(input_dim = n_drug, output_dim = k, input_length = 1) %>%
  layer_flatten()
embed_adr <- input_adr %>%
  layer_embedding(input_dim = n_adr, output_dim = k, input_length = 1) %>%
  layer_flatten()

# dot product and output layer (can be replaced by an arbitrary DNN architecture)
pred <- layer_dot(list(embed_drug, embed_adr), axes = -1) %>%
  layer_dense(units = 1, activation = "sigmoid")

# define model inputs/outputs
model <- keras_model(inputs = c(input_drug, input_adr), outputs = pred)
model %>% compile(
  loss = "binary_crossentropy",
  metrics = "binary_accuracy",
  optimizer = optimizer_rmsprop() # the most stable optimizer here
)

# inspect the model
summary(model)

# train the model
history <- model %>% fit(
  x = list(
    matrix(pair$drug_id, ncol = 1),
    matrix(pair$adr_id, ncol = 1)
  ),
  y = matrix(pair$class, ncol = 1),
  class_weight = list("1" = 50.0, "0" = 1.0), # deal with imbalanced classes
  epochs = 20,
  batch_size = 2000, # needs some tuning
  validation_split = 0.2
)

# plot training history
plot(history) +
  theme_ipsum() +
  scale_color_startrek() +
  scale_fill_startrek()
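
# a hedged usage sketch (an illustrative addition, not in the original
# script): score a few drug-ADR pairs with the fitted model; predict()
# returns the estimated association probability for each pair
new_pairs <- pair[1:5, ]
model %>% predict(
  list(
    matrix(new_pairs$drug_id, ncol = 1),
    matrix(new_pairs$adr_id, ncol = 1)
  )
)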
--------------------------------------------------------------------------------
/tensorflow-wide-n-deep.R:
--------------------------------------------------------------------------------
library("msaenet")
library("tensorflow")
library("tfestimators")

# load tensorflow-gpu in the virtualenv folder
use_virtualenv("~/tensorflow/venv/")

# generate synthetic data for binary classification:
# 1 million observations x 100 features (20 useful)
n <- 1e+6L
p <- 1e+2L
sim <- msaenet.sim.binomial(n = n, p = p, snr = 1, coef = rep(1, 20), p.train = 0.5)

# create input data: features
df_tr <- as.data.frame(sim$x.tr)
df_te <- as.data.frame(sim$x.te)

# declare the feature types so tensorflow recognizes them
feat <- vector("list", ncol(df_tr))
for (i in 1L:length(feat)) feat[[i]] <- column_numeric(paste0("V", i))

wide_columns <- feature_columns(feat)
deep_columns <- feature_columns(feat)

# define the "wide and deep" model
model <- dnn_linear_combined_classifier(
  linear_feature_columns = wide_columns,
  linear_optimizer = "Ftrl",
  dnn_feature_columns = deep_columns,
  dnn_optimizer = "Adam",
  dnn_hidden_units = c(50, 20, 10),
  dnn_dropout = 0.5
)

# add the response to the input data
df_tr$y <- sim$y.tr
df_te$y <- sim$y.te

# input data constructor function
constructed_input_fn <- function(dataset)
  input_fn(dataset, features = -y, response = y)

train_input_fn <- constructed_input_fn(df_tr)
eval_input_fn <- constructed_input_fn(df_te)

# train the model
train(model, input_fn = train_input_fn, steps = 1e+4)
# 2018-08-19 20:50:13.846538: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1484] Adding visible gpu devices: 0
# 2018-08-19 20:50:13.846567: I tensorflow/core/common_runtime/gpu/gpu_device.cc:965] Device interconnect StreamExecutor with strength 1 edge matrix:
# 2018-08-19 20:50:13.846572: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971]      0
# 2018-08-19 20:50:13.846575: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] 0:   N
# 2018-08-19 20:50:13.846661: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1097] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 9492 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1)
# Training 3907/10000 [============>....................] - ETA: 2m - loss: 12.08
# Training completed after 3907 steps but 10000 steps was specified
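
# conceptual note (a toy sketch with stand-in numbers, not part of the
# original script): the wide & deep classifier adds a linear (wide) logit
# to a DNN (deep) logit, then applies the sigmoid
sigmoid <- function(z) 1 / (1 + exp(-z))
x_new <- rnorm(p) # one synthetic observation
w_wide <- rnorm(p) # stand-in linear weights, not the fitted ones
logit_wide <- sum(w_wide * x_new)
logit_deep <- 0.3 # stand-in output of the DNN tower
sigmoid(logit_wide + logit_deep) # combined predicted probability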

# evaluate the model on the test set
metrics <- evaluate(model, input_fn = eval_input_fn, steps = 1e+4)
str(metrics)
# Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 11 variables:
#  $ accuracy            : num 0.745
#  $ accuracy_baseline   : num 0.501
#  $ auc                 : num 0.828
#  $ auc_precision_recall: num 0.828
#  $ average_loss        : num 0.507
#  $ label/mean          : num 0.501
#  $ loss                : num 64.9
#  $ precision           : num 0.744
#  $ prediction/mean     : num 0.502
#  $ recall              : num 0.748
#  $ global_step         : num 3907
--------------------------------------------------------------------------------
/triplet-loss-keras/data-movielens.R:
--------------------------------------------------------------------------------
read_movielens <- function() {
  url_uabase <- "http://files.grouplens.org/datasets/movielens/ml-100k/ua.base"
  url_uatest <- "http://files.grouplens.org/datasets/movielens/ml-100k/ua.test"
  train <- read.table(url_uabase)
  test <- read.table(url_uatest)
  names(train) <- names(test) <- c("user_id", "item_id", "rating", "timestamp")
  train$"timestamp" <- test$"timestamp" <- NULL
  list("train" = train, "test" = test)
}

binarize_ratings <- function(df) {
  idx_pos <- df$rating >= 4L
  idx_neg <- df$rating < 4L
  df$rating[idx_pos] <- 1L
  df$rating[idx_neg] <- 0L
  df
}

get_movielens_data <- function() {
  movielens <- read_movielens()
  train <- movielens$train
  test <- movielens$test

  # binarize the 1 to 5 star ratings
  train <- binarize_ratings(train)
  test <- binarize_ratings(test)

  list("train" = train, "test" = test)
}

# generate triplets
# (a random non-positive item serves as the negative item)
get_triplets <- function(df) {
  # first, simply extract all positive pairs
  idx_pos <- which(df$rating == 1L)
  user_id <- df$user_id[idx_pos]
  item_id_pos <- df$item_id[idx_pos]

  df_ret <- data.frame(user_id, item_id_pos)
  df_ret$item_id_neg <- NA

  # sample from the non-positive items for each unique user id
  # (assumes each user has more candidate negatives than positives)
  for (i in unique(df_ret$user_id)) {
    idx_user <- which(df_ret$user_id == i)
    df_user <- df_ret[idx_user, ]
    item_id_np <- setdiff(unique(df$item_id), df_user$item_id_pos)
    df_ret$item_id_neg[idx_user] <- sample(item_id_np, size = length(idx_user))
  }
  df_ret
}
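
# usage sketch (illustrative; commented out since this file is sourced
# as a helper by the triplet-loss scripts):
# movielens <- get_movielens_data()
# head(get_triplets(movielens$train))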
--------------------------------------------------------------------------------
/triplet-loss-keras/metric-auc.R:
--------------------------------------------------------------------------------
library("keras")
library("magrittr")
library("pROC")

# predict outcome probabilities from the learned embeddings
predict_prob <- function(model, user_id, item_id) {
  user_matrix <- model %>% get_layer("embed_user") %>% get_weights() %>% extract2(1L)
  user_vector <- user_matrix[user_id + 1L, , drop = FALSE]
  item_matrix <- model %>% get_layer("embed_item") %>% get_weights() %>% extract2(1L)
  item_vectors <- item_matrix[item_id + 1L, , drop = FALSE]

  sigmoid <- function(x) 1 / (1 + exp(-x))
  scores <- user_vector %*% t(item_vectors) %>% sigmoid() %>% as.vector()
  scores
}

# compute an "average AUC" for predictions
auc_avg <- function(model, df) {

  # all user ids in the set to evaluate on
  user_id <- unique(df$user_id)
  n_users <- length(user_id)

  # all possible items in the test set
  n_items <- max(df$item_id)
  item_id <- 1L:n_items

  # for each user, make predictions on all possible items and compute
  # the AUC, until the AUC values for all users are computed;
  # then average all the AUC values
  scores <- rep(NA, n_users)

  for (i in 1L:n_users) {
    prob <- predict_prob(model, user_id[i], item_id)
    label <- rep(0L, n_items)
    # fill in positive labels
    item_id_pos <- df$item_id[which(df$user_id == user_id[i] & df$rating == 1L)]
    if (length(item_id_pos) >= 1L) {
      label[item_id_pos] <- 1L
      # the argument `direction` is essential here:
      # unlike sklearn.metrics.roc_auc_score, pROC always reports
      # an AUC > 0.5 when direction = "auto", so the mean of these
      # AUC values would be biased significantly above 0.5
      scores[i] <- as.numeric(auc(label, prob, direction = "<"))
    } else {
      # all-zero exception handling: AUC requires labels with two levels
      scores[i] <- NA
    }
  }

  mean(scores, na.rm = TRUE)
}
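
# a quick numeric illustration of the direction pitfall (toy values,
# commented out since this file is sourced as a helper): with
# direction = "auto", pROC flips the comparison to whichever side
# gives the larger AUC
# auc(c(0, 1, 1, 0), c(0.9, 0.1, 0.2, 0.8))                  # reports 1
# auc(c(0, 1, 1, 0), c(0.9, 0.1, 0.2, 0.8), direction = "<") # reports 0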
--------------------------------------------------------------------------------
/triplet-loss-keras/plot-model.R:
--------------------------------------------------------------------------------
library("reticulate")

# assumes `model` exists in the workspace,
# built by one of the triplet-loss scripts

# write to png (low-res)
k <- import("keras")

plot_model <- k$utils$plot_model
plot_model(model, to_file = "triplet-loss-model-keras.png", show_shapes = TRUE, show_layer_names = TRUE)

# write to pdf
pydot <- import("pydot")

model_to_dot <- k$utils$vis_utils$model_to_dot
g <- model_to_dot(model)
g$write_pdf("triplet-loss-model-keras.pdf")
--------------------------------------------------------------------------------
/triplet-loss-keras/triplet-loss-bpr-keras.R:
--------------------------------------------------------------------------------
# Matrix factorization with BPR triplet loss and Keras --------------------------

library("keras")
library("magrittr")
library("reshape2")
library("ggplot2")
library("gridExtra")
library("hrbrthemes")
library("ggsci")

use_virtualenv("~/tensorflow/venv/")

# 1. Define BPR triplet loss ----------------------------------------------------

# identity loss: a workaround for the Keras loss definition so a custom
# triplet loss can be used -- there is no true label; we just want to
# minimize the BPR triplet loss to learn the embeddings
loss_identity <- function(y_true, y_pred) k_mean(y_pred - 0 * y_true)

# BPR triplet loss
loss_bpr_triplet <- function(x) {
  embed_user <- x[[1]]
  embed_item_positive <- x[[2]]
  embed_item_negative <- x[[3]]

  loss <- 1.0 - k_sigmoid(
    k_sum(embed_user * embed_item_positive, axis = -1, keepdims = TRUE) -
      k_sum(embed_user * embed_item_negative, axis = -1, keepdims = TRUE)
  )

  loss
}
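
# numeric intuition (toy vectors added for illustration, not learned
# embeddings): the loss shrinks as the positive score pulls ahead of
# the negative score
u <- c(0.5, -0.2)
v_pos <- c(0.4, 0.1)
v_neg <- c(-0.3, 0.6)
1 - 1 / (1 + exp(-(sum(u * v_pos) - sum(u * v_neg)))) # ~0.39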

# build and compile the model with BPR triplet loss
build_model <- function(n_user, n_item, n_factor) {

  # input layer for users
  input_user <- layer_input(shape = c(1), name = "input_user")

  # input layers for items (positive and negative)
  input_item_positive <- layer_input(shape = c(1), name = "input_item_positive")
  input_item_negative <- layer_input(shape = c(1), name = "input_item_negative")

  # embedding layer for users
  embed_user <- input_user %>%
    layer_embedding(
      input_dim = n_user, output_dim = n_factor,
      input_length = 1, name = "embed_user"
    ) %>%
    layer_flatten()

  # embedding layer shared by positive and negative items
  layer_embed_item <- layer_embedding(
    input_dim = n_item, output_dim = n_factor,
    input_length = 1, name = "embed_item"
  )

  embed_item_positive <- input_item_positive %>%
    layer_embed_item() %>%
    layer_flatten()
  embed_item_negative <- input_item_negative %>%
    layer_embed_item() %>%
    layer_flatten()

  # BPR triplet loss is the output
  loss <- list(embed_user, embed_item_positive, embed_item_negative) %>%
    layer_lambda(loss_bpr_triplet, output_shape = c(1))

  # define model inputs/outputs
  model <- keras_model(
    inputs = c(input_user, input_item_positive, input_item_negative),
    outputs = loss
  )

  # compile model
  model %>% compile(loss = loss_identity, optimizer = optimizer_nadam())

  model
}

# 2. Prepare the data ------------------------------------------------------------

# set model parameters

k <- 100 # number of latent factors to learn
n_epochs <- 20 # number of epochs

# read data
source("data-movielens.R")
movielens <- get_movielens_data()

# prepare training and test data
data_train <- movielens$train
data_test <- movielens$test

# model constants: remember to include all ids from train and test
n_user <- length(unique(c(data_train$user_id, data_test$user_id))) + 1L
n_item <- length(unique(c(data_train$item_id, data_test$item_id))) + 1L

# prepare the test triplets
triplets_test <- get_triplets(data_test)

# 3. Train the model -------------------------------------------------------------

# build the model
model <- build_model(n_user, n_item, k)

# inspect the model
summary(model)

# sanity check: untrained model's auc should be around 0.5
source("metric-auc.R")
auc_avg(model, data_test)

# training loop
train_loss <- test_loss <- train_auc <- test_auc <- rep(NA, n_epochs)

for (epoch in 1L:n_epochs) {
  cat("Epoch", epoch, "\n")

  # sample triplets from the training data
  triplets_train <- get_triplets(data_train)

  history_train <- model %>%
    fit(
      x = list(
        "input_user" = matrix(triplets_train$user_id, ncol = 1),
        "input_item_positive" = matrix(triplets_train$item_id_pos, ncol = 1),
        "input_item_negative" = matrix(triplets_train$item_id_neg, ncol = 1)
      ),
      y = matrix(1, nrow = length(triplets_train$user_id), ncol = 1),
      batch_size = 64, epochs = 1, verbose = 1, shuffle = TRUE
    )

  train_loss[epoch] <- history_train$metrics$loss
  train_auc[epoch] <- auc_avg(model, data_train)
  cat("AUC train:", train_auc[epoch], "\n")

  history_test <- model %>%
    evaluate(
      x = list(
        "input_user" = matrix(triplets_test$user_id, ncol = 1),
        "input_item_positive" = matrix(triplets_test$item_id_pos, ncol = 1),
        "input_item_negative" = matrix(triplets_test$item_id_neg, ncol = 1)
      ),
      y = matrix(1.0, nrow = length(triplets_test$user_id), ncol = 1),
      batch_size = 64, verbose = 0
    )

  test_loss[epoch] <- unname(history_test)
  test_auc[epoch] <- auc_avg(model, data_test)
  cat("AUC test:", test_auc[epoch], "\n")
}
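
# a hedged sketch (an illustrative addition): rank all items for one user
# by the learned embedding scores and list the top 10 item ids
user <- data_test$user_id[1]
prob <- predict_prob(model, user, 1L:(n_item - 1L))
head(order(prob, decreasing = TRUE), 10)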

# 4. Plot loss and user-averaged AUC ---------------------------------------------

df_loss <-
  data.frame("epoch" = 1L:n_epochs, "train" = train_loss, "test" = test_loss)
df_loss <-
  melt(df_loss, id.vars = "epoch", variable.name = "data", value.name = "loss")
p_loss <- ggplot(df_loss, aes(x = epoch, y = loss)) +
  geom_point(aes(fill = data), shape = 21, colour = "#333333") +
  geom_smooth(aes(colour = data), span = 0.5, se = FALSE, show.legend = FALSE) +
  theme_ipsum() +
  theme(plot.margin = unit(c(1, 1, 0, 1), "cm")) +
  scale_fill_tron() +
  scale_color_tron()

df_auc <-
  data.frame("epoch" = 1L:n_epochs, "train" = train_auc, "test" = test_auc)
df_auc <-
  melt(df_auc, id.vars = "epoch", variable.name = "data", value.name = "auc")
p_auc <- ggplot(df_auc, aes(x = epoch, y = auc)) +
  geom_point(aes(fill = data), shape = 21, colour = "#333333") +
  geom_smooth(aes(colour = data), span = 0.5, se = FALSE, show.legend = FALSE) +
  theme_ipsum() +
  theme(plot.margin = unit(c(0, 1, 1, 1), "cm")) +
  scale_fill_tron() +
  scale_color_tron()

p <- grid.arrange(p_loss, p_auc, nrow = 2)

ggsave("triplet-loss-bpr-movielens.png", p, width = 9, height = 6)
--------------------------------------------------------------------------------
/triplet-loss-keras/triplet-loss-margin-keras.R:
--------------------------------------------------------------------------------
# Matrix factorization with margin-based triplet loss and Keras -----------------

library("keras")
library("magrittr")
library("reshape2")
library("ggplot2")
library("gridExtra")
library("hrbrthemes")
library("ggsci")

use_virtualenv("~/tensorflow/venv/")

# 1. Define margin-based triplet loss --------------------------------------------

# identity loss: a workaround for the Keras loss definition so a custom
# triplet loss can be used -- there is no true label; we just want to
# minimize the margin-based triplet loss to learn the embeddings
loss_identity <- function(y_true, y_pred) k_mean(y_pred - 0 * y_true)

# margin-based triplet loss
loss_margin_triplet <- function(x) {
  embed_user <- x[[1]]
  embed_item_positive <- x[[2]]
  embed_item_negative <- x[[3]]

  loss <- k_maximum(
    0.0,
    k_sum(embed_user * embed_item_negative, axis = -1, keepdims = TRUE) -
      k_sum(embed_user * embed_item_positive, axis = -1, keepdims = TRUE) +
      10.0 # the margin
  )

  loss
}
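
# numeric intuition (toy vectors added for illustration): the loss hits
# zero only once the positive score beats the negative score by the
# margin of 10
u <- c(0.5, -0.2)
v_pos <- c(0.4, 0.1)
v_neg <- c(-0.3, 0.6)
max(0, sum(u * v_neg) - sum(u * v_pos) + 10) # 9.55, margin not yet met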

# build and compile the model with margin-based triplet loss
build_model <- function(n_user, n_item, n_factor) {

  # input layer for users
  input_user <- layer_input(shape = c(1), name = "input_user")

  # input layers for items (positive and negative)
  input_item_positive <- layer_input(shape = c(1), name = "input_item_positive")
  input_item_negative <- layer_input(shape = c(1), name = "input_item_negative")

  # embedding layer for users
  embed_user <- input_user %>%
    layer_embedding(
      input_dim = n_user, output_dim = n_factor,
      input_length = 1, name = "embed_user"
    ) %>%
    layer_flatten()

  # embedding layer shared by positive and negative items
  layer_embed_item <- layer_embedding(
    input_dim = n_item, output_dim = n_factor,
    input_length = 1, name = "embed_item"
  )

  embed_item_positive <- input_item_positive %>%
    layer_embed_item() %>%
    layer_flatten()
  embed_item_negative <- input_item_negative %>%
    layer_embed_item() %>%
    layer_flatten()

  # margin-based triplet loss is the output
  loss <- list(embed_user, embed_item_positive, embed_item_negative) %>%
    layer_lambda(loss_margin_triplet, output_shape = c(1))

  # define model inputs/outputs
  model <- keras_model(
    inputs = c(input_user, input_item_positive, input_item_negative),
    outputs = loss
  )

  # compile model
  model %>% compile(loss = loss_identity, optimizer = optimizer_nadam())

  model
}

# 2. Prepare the data ------------------------------------------------------------

# set model parameters

k <- 100 # number of latent factors to learn
n_epochs <- 20 # number of epochs

# read data
source("data-movielens.R")
movielens <- get_movielens_data()

# prepare training and test data
data_train <- movielens$train
data_test <- movielens$test

# model constants: remember to include all ids from train and test
n_user <- length(unique(c(data_train$user_id, data_test$user_id))) + 1L
n_item <- length(unique(c(data_train$item_id, data_test$item_id))) + 1L

# prepare the test triplets
triplets_test <- get_triplets(data_test)

# 3. Train the model -------------------------------------------------------------

# build the model
model <- build_model(n_user, n_item, k)

# inspect the model
summary(model)

# sanity check: untrained model's auc should be around 0.5
source("metric-auc.R")
auc_avg(model, data_test)

# training loop
train_loss <- test_loss <- train_auc <- test_auc <- rep(NA, n_epochs)

for (epoch in 1L:n_epochs) {
  cat("Epoch", epoch, "\n")

  # sample triplets from the training data
  triplets_train <- get_triplets(data_train)

  history_train <- model %>%
    fit(
      x = list(
        "input_user" = matrix(triplets_train$user_id, ncol = 1),
        "input_item_positive" = matrix(triplets_train$item_id_pos, ncol = 1),
        "input_item_negative" = matrix(triplets_train$item_id_neg, ncol = 1)
      ),
      y = matrix(1, nrow = length(triplets_train$user_id), ncol = 1),
      batch_size = 64, epochs = 1, verbose = 1, shuffle = TRUE
    )

  train_loss[epoch] <- history_train$metrics$loss
  train_auc[epoch] <- auc_avg(model, data_train)
  cat("AUC train:", train_auc[epoch], "\n")

  history_test <- model %>%
    evaluate(
      x = list(
        "input_user" = matrix(triplets_test$user_id, ncol = 1),
        "input_item_positive" = matrix(triplets_test$item_id_pos, ncol = 1),
        "input_item_negative" = matrix(triplets_test$item_id_neg, ncol = 1)
      ),
      y = matrix(1.0, nrow = length(triplets_test$user_id), ncol = 1),
      batch_size = 64, verbose = 0
    )

  test_loss[epoch] <- unname(history_test)
  test_auc[epoch] <- auc_avg(model, data_test)
  cat("AUC test:", test_auc[epoch], "\n")
}

# 4. Plot loss and user-averaged AUC ---------------------------------------------

df_loss <-
  data.frame("epoch" = 1L:n_epochs, "train" = train_loss, "test" = test_loss)
df_loss <-
  melt(df_loss, id.vars = "epoch", variable.name = "data", value.name = "loss")
p_loss <- ggplot(df_loss, aes(x = epoch, y = loss)) +
  geom_point(aes(fill = data), shape = 21, colour = "#333333") +
  geom_smooth(aes(colour = data), span = 0.5, se = FALSE, show.legend = FALSE) +
  theme_ipsum() +
  theme(plot.margin = unit(c(1, 1, 0, 1), "cm")) +
  scale_fill_startrek() +
  scale_color_startrek()

df_auc <-
  data.frame("epoch" = 1L:n_epochs, "train" = train_auc, "test" = test_auc)
df_auc <-
  melt(df_auc, id.vars = "epoch", variable.name = "data", value.name = "auc")
p_auc <- ggplot(df_auc, aes(x = epoch, y = auc)) +
  geom_point(aes(fill = data), shape = 21, colour = "#333333") +
  geom_smooth(aes(colour = data), span = 0.5, se = FALSE, show.legend = FALSE) +
  theme_ipsum() +
  theme(plot.margin = unit(c(0, 1, 1, 1), "cm")) +
  scale_fill_startrek() +
  scale_color_startrek()

p <- grid.arrange(p_loss, p_auc, nrow = 2)

ggsave("triplet-loss-margin-movielens.png", p, width = 9, height = 6)
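
# a hedged follow-up sketch (an illustrative addition): persist the learned
# item embeddings for downstream use, e.g., nearest-neighbor lookups
item_embeddings <- model %>%
  get_layer("embed_item") %>%
  get_weights() %>%
  extract2(1L)
saveRDS(item_embeddings, "item-embeddings-margin.rds")
--------------------------------------------------------------------------------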