├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── R ├── RcppExports.R ├── deepboost-data.R ├── deepboost-grid-search.R └── deepboost.R ├── README.md ├── data ├── adult.rda ├── australian.rda ├── banana.rda ├── bupa.rda ├── coli2000.rda ├── haberman.rda ├── heart.rda ├── magic.rda ├── pima.rda └── sonar.rda ├── deepboost.Rproj ├── demo ├── 00Index ├── deepboostGrid.R └── experiments.R ├── man ├── Deepboost-class.Rd ├── Deepboost.Rd ├── adult.Rd ├── australian.Rd ├── banana.Rd ├── bupa.Rd ├── coli2000.Rd ├── deepboost.default.Rd ├── deepboost.evaluate.Rd ├── deepboost.formula.Rd ├── deepboost.gridSearch.Rd ├── deepboost.predict.Rd ├── deepboost.print.Rd ├── deepboost.train.Rd ├── haberman.Rd ├── heart.Rd ├── magic.Rd ├── pima.Rd ├── predict-Deepboost-method.Rd ├── show-Deepboost-method.Rd └── sonar.Rd ├── src ├── Makevars ├── Makevars.win ├── RcppExports.cpp ├── boost.cc ├── boost.h ├── deepboost_C.cc ├── deepboost_C.h ├── deepboost_R.cpp ├── deepboost_converters.cpp ├── deepboost_converters.h ├── tree.cc ├── tree.h └── types.h └── tests ├── testthat.R └── testthat └── test_basic.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | *.o 4 | *.so 5 | *.dll -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Sample .travis.yml for R projects 2 | 3 | language: r 4 | warnings_are_errors: true 5 | sudo: required 6 | 7 | r_packages: 8 | - covr 9 | 10 | after_success: 11 | - Rscript -e 'library(covr); codecov()' 12 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: deepboost 2 | Type: Package 3 | Title: Deep Boosting Ensemble Modeling 4 | Version: 0.1.6 5 | Date: 2017-11-08 6 | Author: Daniel Marcous [aut, cre], Yotam Sandbank [aut], Google Inc. [cph] 7 | Maintainer: Daniel Marcous <dmarcous@gmail.com> 8 | Authors@R: c( 9 | person("Daniel", "Marcous", email = "dmarcous@gmail.com", role = c("aut","cre")), 10 | person("Yotam", "Sandbank", email = "yotamsandbank@gmail.com", role = "aut"), 11 | person("Google Inc.", role = "cph") 12 | ) 13 | Description: Provides deep boosting model training, evaluation, prediction and 14 | hyper-parameter optimisation using grid search and cross validation. 15 | Based on Google's Deep Boosting algorithm and Google's C++ implementation. 16 | Cortes, C., Mohri, M., & Syed, U. (2014) <http://jmlr.org/proceedings/papers/v32/cortesb14.pdf>.
17 | URL: https://github.com/dmarcous/CRAN_deepboost 18 | BugReports: https://github.com/dmarcous/CRAN_deepboost/issues 19 | License: Apache License (== 2.0) 20 | LazyData: TRUE 21 | Suggests: 22 | testthat, 23 | ada, 24 | caret 25 | Depends: 26 | R (>= 3.1) 27 | Imports: 28 | Rcpp (>= 0.12.2), 29 | methods 30 | LinkingTo: Rcpp 31 | RoxygenNote: 6.0.1 -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(deepboost) 4 | export(deepboost.default) 5 | export(deepboost.evaluate) 6 | export(deepboost.formula) 7 | export(deepboost.gridSearch) 8 | export(deepboost.predict) 9 | export(deepboost.print) 10 | export(deepboost.train) 11 | exportMethods(predict) 12 | exportMethods(show) 13 | import(methods) 14 | importFrom(Rcpp,evalCpp) 15 | importFrom(stats,contrasts) 16 | importFrom(stats,model.matrix) 17 | importFrom(stats,model.response) 18 | useDynLib(deepboost) 19 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | Train_R <- function(data, tree_depth, num_iter, beta, lambda, loss_type, verbose) { 5 | .Call('_deepboost_Train_R', PACKAGE = 'deepboost', data, tree_depth, num_iter, beta, lambda, loss_type, verbose) 6 | } 7 | 8 | Predict_R <- function(newdata, model) { 9 | .Call('_deepboost_Predict_R', PACKAGE = 'deepboost', newdata, model) 10 | } 11 | 12 | PredictProbabilities_R <- function(newdata, model) { 13 | .Call('_deepboost_PredictProbabilities_R', PACKAGE = 'deepboost', newdata, model) 14 | } 15 | 16 | Evaluate_R <- function(data, model) { 17 | .Call('_deepboost_Evaluate_R', PACKAGE = 'deepboost', data, model) 18 | } 19 | 20 | -------------------------------------------------------------------------------- /R/deepboost-data.R: -------------------------------------------------------------------------------- 1 | #' Adult humans 2 | #' 3 | #' A dataset containing adult population personal details 4 | #' 5 | #' @format A data frame with 32560 rows and 15 variables: 6 | #' \describe{ 7 | #' \item{Adm.clerical}{unknown} 8 | #' \item{Bachelors}{person has a bachelor's degree} 9 | #' \item{Male}{gender} 10 | #' \item{Never.married}{person was never married} 11 | #' \item{Not.in.family}{is person a part of a family} 12 | #' \item{State.gov}{state} 13 | #' \item{United.States}{is from the United States} 14 | #' \item{White}{is white} 15 | #' \item{X..50K}{unknown} 16 | #' \item{X0}{unknown} 17 | #' \item{X13}{unknown} 18 | #' \item{X2174}{unknown} 19 | #' \item{X39}{unknown} 20 | #' \item{X40}{unknown} 21 | #' \item{X77516}{unknown} 22 | #' } 23 | #' @source \url{https://archive.ics.uci.edu/ml/datasets/Adult/} 24 | "adult" 25 | 26 | #' Australian 27 | "australian" 28 | 29 | #' banana 30 | "banana" 31 | 32 | #' bupa 33 | "bupa" 34 | 35 | #' coli2000 36 | "coli2000" 37 | 38 | #' haberman 39 | "haberman" 40 | 41 | #' heart 42 | "heart" 43 | 44 | #' magic 45 | "magic" 46 | 47 | #' pima 48 | "pima" 49 | 50 | #' sonar 51 | "sonar" 52 | -------------------------------------------------------------------------------- /R/deepboost-grid-search.R: -------------------------------------------------------------------------------- 1 | #' Returns optimised parameter list for
deepboost model on given data 2 | #' @param formula An R Formula object see : ?formula 3 | #' @param data input data.frame as training for model 4 | #' @param k number of folds (default = 10) for cross validation optimisation 5 | #' @param seed for random split to train / test (default 666) 6 | #' @param logging_level print extra data while training 0 - no data, 1 - gridSearch data (default), 2 - all data 7 | #' @details Finds optimised parameters for deepboost training 8 | #' using grid search techniques over: 9 | #' - predefined, battle-tested candidate parameter values 10 | #' - cross validation over k folds 11 | #' @return vector with average accuracy for chosen parameters, and a list of the best parameter combination: (accuracy, (num_iter, beta, lambda, loss_type)) 12 | #' @examples 13 | #' deepboost.gridSearch(y ~ ., 14 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), k=2) 15 | #' @export 16 | deepboost.gridSearch <- function(formula, data, k=10, seed=666, logging_level=1) { 17 | 18 | if (!(is.numeric(k)) || k <= 1 || !(k%%1==0)) 19 | { 20 | stop("ERROR_parameter_setting : k must be >= 2 and integer (Default : 10)" ) 21 | } 22 | 23 | if (!(is.numeric(logging_level)) || logging_level < 0 || logging_level > 2 || !(logging_level%%1==0)) 24 | { 25 | stop("ERROR_parameter_setting : logging_level must be integer (0 / 1 / 2) (Default : 1)" ) 26 | } 27 | 28 | verbose <- ifelse(logging_level>1,TRUE,FALSE) 29 | 30 | num_iter_vals = c(5,10,25,50) 31 | beta_vals = c(2^-0, 2^-1, 2^-2, 2^-3, 2^-4, 2^-5, 2^-6) 32 | lambda_vals = c(0.0001, 0.005, 0.01, 0.05, 0.1, 0.5) 33 | loss_type_vals = c("l","e") 34 | dpbGrid <- expand.grid(num_iter = num_iter_vals, 35 | beta = beta_vals, 36 | lambda = lambda_vals, 37 | loss_type = loss_type_vals) 38 | 39 | set.seed(seed) 40 | 41 | # Randomly shuffle the data 42 | data<-data[sample(nrow(data)),] 43 | 44 | folds <- cut(seq(1,nrow(data)),breaks=k,labels=FALSE) 45 | best_acc <- -Inf 46 | avg_acc <- 0 47 | 48 | for(combination in 1:nrow(dpbGrid)){ 49 | num_iter <- dpbGrid[combination,"num_iter"] 50 | beta <- dpbGrid[combination,"beta"] 51 | lambda <- dpbGrid[combination,"lambda"] 52 | loss_type <- as.character(dpbGrid[combination,"loss_type"]) 53 | acc <- 0 54 | 55 | for(fold in 1:k){ 56 | testIndexes <- which(folds==fold,arr.ind=TRUE) 57 | testData <- data[testIndexes, ] 58 | trainData <- data[-testIndexes, ] 59 | 60 | eval_model <- deepboost.formula(formula, trainData, num_iter = num_iter, beta = beta, lambda = lambda, loss_type = loss_type, verbose=verbose) 61 | acc <- acc + sum(predict(eval_model, testData) == testData[,length(testData)]) / nrow(testData) 62 | } 63 | acc <- acc / k 64 | if(acc > best_acc){ 65 | best_acc <- acc 66 | best_num_iter <- num_iter 67 | best_lambda <- lambda 68 | best_beta <- beta 69 | best_loss_type <- loss_type 70 | } 71 | avg_acc <- avg_acc + acc 72 | 73 | } 74 | avg_acc <- avg_acc / nrow(dpbGrid) 75 | 76 | if(logging_level > 0) 77 | { 78 | print(paste0("average accuracy : ", avg_acc)) 79 | print(paste0("accuracy: ", best_acc, ", num_iter: ", best_num_iter, ", beta: ", best_beta, ", lambda: ", best_lambda, ", loss_type: ", best_loss_type)) 80 | } 81 | 82 | RET <- 83 | c(avg_acc, 84 | list(best_num_iter, 85 | best_beta, 86 | best_lambda, 87 | best_loss_type)) 88 | 89 | return(RET) 90 | } 91 | -------------------------------------------------------------------------------- /R/deepboost.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib deepboost 2 | #'
@importFrom Rcpp evalCpp 3 | #' @importFrom stats contrasts model.matrix model.response 4 | #' @import methods 5 | NULL 6 | 7 | #' An S4 class to represent a deepboost model. 8 | #' 9 | #' @slot tree_depth maximum depth for a single decision tree in the model 10 | #' @slot num_iter number of iterations = number of trees in ensemble 11 | #' @slot beta regularisation for scores (L1) 12 | #' @slot lambda regularisation for tree depth 13 | #' @slot loss_type "l" logistic, "e" exponential 14 | #' @slot verbose print extra data while training TRUE / FALSE 15 | #' @slot examples data.frame with instances used for model training 16 | #' @slot model Deepboost model as used by C code serialised to R List 17 | #' @slot classes a vector of factors representing the classes used for classification with this model 18 | setClass("Deepboost", 19 | slots = list( 20 | tree_depth = "numeric", 21 | num_iter = "numeric", 22 | beta = "numeric", 23 | lambda= "numeric", 24 | loss_type = "character", 25 | verbose = "logical", 26 | examples = "data.frame", 27 | model = "list", 28 | classes = "character" 29 | )) 30 | 31 | #' Trains a deepboost model 32 | #' 33 | #' @param object A Deepboost S4 class object 34 | #' @param data input data.frame as training for model 35 | #' @param tree_depth maximum depth for a single decision tree in the model 36 | #' @param num_iter number of iterations = number of trees in ensemble 37 | #' @param beta regularisation for scores (L1) 38 | #' @param lambda regularisation for tree depth 39 | #' @param loss_type - "l" logistic, "e" exponential 40 | #' @param verbose - print extra data while training TRUE / FALSE 41 | #' @param classes a vector of factors representing the classes used for classification with this model 42 | #' @details (beta,lambda) = (0,0) - adaboost, (>0,0) - L1, (0,>0) deepboost, (>0, >0) deepboost+L1 43 | #' @return A trained Deepboost model 44 | #' @export 45 | deepboost.train <- function(object, data, 46 | tree_depth, 47 | num_iter, 48 | beta, 49 | lambda, 50 | loss_type, 51 | verbose, 52 | classes) { 53 | # set slots 54 | RET = new("Deepboost") 55 | 56 | # Check parameter validity 57 | if (!(is.numeric(tree_depth)) || tree_depth <= 0 || !(tree_depth%%1==0)) 58 | { 59 | stop("ERROR_parameter_setting : tree_depth must be >= 1 and integer (Default : 5)" ) 60 | } 61 | RET@tree_depth = as.integer(tree_depth) 62 | 63 | # Check parameter validity 64 | if (!(is.numeric(num_iter)) || num_iter <= 0 || !(num_iter%%1==0)) 65 | { 66 | stop("ERROR_parameter_setting : num_iter must be >= 1 and integer (Default : 1)" ) 67 | } 68 | RET@num_iter = as.integer(num_iter) 69 | 70 | # (beta, lambda) = 71 | # (0,0) - adaboost, (>0,0) - L1, (0,>0) deepboost, (>0, >0) deepboost+L1 72 | 73 | # Check parameter validity 74 | if (!(is.numeric(beta)) || beta < 0.0) 75 | { 76 | stop("ERROR_parameter_setting : beta must be >= 0 and double (Default : 0.0)" ) 77 | } 78 | RET@beta = as.double(beta) 79 | 80 | # Check parameter validity 81 | if (!(is.numeric(lambda)) || lambda < 0.0) 82 | { 83 | stop("ERROR_parameter_setting : lambda must be >= 0 and double (Default : 0.05)" ) 84 | } 85 | RET@lambda = as.double(lambda) 86 | 87 | # Check parameter validity 88 | if (!(is.character(loss_type)) || (loss_type != "l" && loss_type != "e")) 89 | { 90 | stop("ERROR_parameter_setting : loss_type must be \"l\" - logistic or \"e\" - exponential (Default : \"l\")" ) 91 | } 92 | RET@loss_type = as.character(loss_type) 93 | 94 | if (!(is.logical(verbose))) 95 | { 96 | stop("ERROR_parameter_setting : verbose must be boolean
(TRUE / FALSE) (Default : TRUE)" ) 97 | } 98 | RET@verbose = verbose 99 | 100 | RET@examples = data 101 | RET@classes = classes 102 | 103 | # call training 104 | model = Train_R(RET@examples, 105 | RET@tree_depth, RET@num_iter, RET@beta, RET@lambda, RET@loss_type, RET@verbose) 106 | 107 | RET@model = model 108 | 109 | return(RET) 110 | } 111 | 112 | #' Predicts instances responses based on a deepboost model 113 | #' 114 | #' @param object A Deepboost S4 class object 115 | #' @param newdata A data.frame to predict responses for 116 | #' @param type Type of prediction : "terms" - for class labels, "response" for probabilities 117 | #' @return A vector of responses 118 | #' @examples 119 | #' dpb <- deepboost(y ~ ., 120 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 121 | #' num_iter=2,tree_depth=2) 122 | #' deepboost.predict(dpb,data.frame(x1=rep(c(1,1,1,0),5),x2=rep(c(1,1,1,1),5))) 123 | #' @export 124 | deepboost.predict <- function(object, newdata, type="terms") { 125 | # Check parameter validity 126 | if (!(is.character(type)) || (type != "terms" && type != "response")) 127 | { 128 | stop("ERROR_deepboost.predict : type must be \"terms\" - labels or \"response\" - probabilities" ) 129 | } 130 | 131 | if (type == "terms") 132 | { 133 | labels <- 134 | Predict_R(newdata, 135 | object@model) 136 | 137 | labels <- unlist(labels) 138 | labels[labels==1] <- object@classes[1] 139 | labels[labels==-1] <- object@classes[2] 140 | results = labels 141 | } 142 | else if (type == "response") 143 | { 144 | probabilities <- 145 | PredictProbabilities_R(newdata, 146 | object@model) 147 | 148 | probabilities <- unlist(probabilities) 149 | probMat <- matrix(nrow=length(probabilities),ncol=2) 150 | probMat[,1] <- probabilities 151 | probMat[,2] <- 1.0-probabilities 152 | results = probMat 153 | } 154 | 155 | return (results) 156 | } 157 | 158 | #' Evaluates and prints statistics for a deepboost model on the train set 159 | #' 160 | #' @param object A Deepboost S4 class object 161 | #' @return A list with model statistics; prints the model evaluation strings to the console 162 | #' @examples 163 | #' dpb <- deepboost(y ~ ., 164 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 165 | #' num_iter=2,tree_depth=2) 166 | #' deepboost.print(dpb) 167 | #' @export 168 | deepboost.print <- function(object) { 169 | model_stats <- deepboost.evaluate(object, object@examples) 170 | print(paste("Model error:",model_stats[["error"]])) 171 | print(paste("Average tree size:",model_stats[["avg_tree_size"]])) 172 | print(paste("Number of trees:",model_stats[["num_trees"]])) 173 | return (model_stats) 174 | } 175 | 176 | #' Evaluates and prints statistics for a deepboost model 177 | #' 178 | #' @param object A Deepboost S4 class object 179 | #' @param data a \code{data.frame} object to evaluate with the model 180 | #' @return a list with model statistics - error, avg_tree_size, num_trees 181 | #' @examples 182 | #' dpb <- deepboost(y ~ ., 183 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 184 | #' num_iter=2,tree_depth=2) 185 | #' deepboost.evaluate(dpb,data.frame(x1=rep(c(1,1,1,0),2),x2=rep(c(1,1,1,1),2))) 186 | #' @export 187 | deepboost.evaluate <- function(object, data) { 188 | model_stats <- 189 | Evaluate_R(data, 190 | object@model) 191 | return (model_stats) 192 | } 193 | 194 | #' Empty Deepboost S4 class object with default settings 195 | Deepboost <- new("Deepboost", 196 | examples = data.frame(), 197 | model =
list() 198 | ) 199 | 200 | #' Main function for deepboost model creation 201 | #' 202 | #' @param x A data.frame of samples' values 203 | #' @param y A vector of samples' labels 204 | #' @param instance_weights The weight of each example 205 | #' @param tree_depth maximum depth for a single decision tree in the model 206 | #' @param num_iter number of iterations = number of trees in ensemble 207 | #' @param beta regularisation for scores (L1) 208 | #' @param lambda regularisation for tree depth 209 | #' @param loss_type - "l" logistic, "e" exponential 210 | #' @param verbose - print extra data while training TRUE / FALSE 211 | #' @return A trained Deepboost model 212 | #' @examples 213 | #' deepboost.default(data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2)), 214 | #' factor(rep(c(0,0,0,1),2)),num_iter=1) 215 | #' deepboost.default(data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2)), 216 | #' factor(rep(c(0,0,0,1),2)), 217 | #' num_iter=2, beta=0.1, lambda=0.00125) 218 | #' @export 219 | deepboost.default <- function(x, y, instance_weights = NULL, 220 | tree_depth = 5, 221 | num_iter = 1, 222 | beta = 0.0, 223 | lambda= 0.05, 224 | loss_type = "l", 225 | verbose = TRUE 226 | ) { 227 | # initialize weights 228 | n <- nrow(x) 229 | if(is.null(instance_weights)) 230 | { 231 | instance_weights <- rep(1/n, n) 232 | } 233 | # make response either 1 or -1 234 | y <- factor(y) 235 | if (length(levels(y))!=2) 236 | { 237 | stop("ERROR_data : response must be binary" ) 238 | } 239 | classes = levels(y) 240 | levels(y) <- c(1,-1) 241 | # create data 242 | data <- data.frame(x) 243 | data['label'] <- y 244 | data['weight'] <- instance_weights 245 | 246 | fit <- deepboost.train(Deepboost, data, 247 | tree_depth, 248 | num_iter, 249 | beta, 250 | lambda, 251 | loss_type, 252 | verbose, 253 | classes) 254 | 255 | return (fit) 256 | } 257 | 258 | #' Main function for deepboost model creation 259 | #' 260 | #' @param formula An R Formula object see : ?formula 261 | #' @param data A data.frame of samples to train on 262 | #' @param instance_weights The weight of each example 263 | #' @param tree_depth maximum depth for a single decision tree in the model 264 | #' @param num_iter number of iterations = number of trees in ensemble 265 | #' @param beta regularisation for scores (L1) 266 | #' @param lambda regularisation for tree depth 267 | #' @param loss_type - "l" logistic, "e" exponential 268 | #' @param verbose - print extra data while training TRUE / FALSE 269 | #' @return A trained Deepboost model 270 | #' @examples 271 | #' deepboost(y ~ ., 272 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 273 | #' num_iter=1) 274 | #' deepboost(y ~ ., 275 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 276 | #' num_iter=2, beta=0.1, lambda=0.00125) 277 | #' @export 278 | deepboost <- function(formula, data, 279 | instance_weights = NULL, 280 | tree_depth = 5, 281 | num_iter = 1, 282 | beta = 0.0, 283 | lambda= 0.05, 284 | loss_type = "l", 285 | verbose = TRUE) { 286 | deepboost.formula(formula, data, 287 | instance_weights, 288 | tree_depth, 289 | num_iter, 290 | beta, 291 | lambda, 292 | loss_type, 293 | verbose) 294 | } 295 | 296 | #' Main function for deepboost model creation, using a formula 297 | #' 298 | #' @param formula An R Formula object see : ?formula 299 | #' @param data A data.frame of samples to train on 300 | #' @param instance_weights The weight of each example 301 | #' @param tree_depth maximum depth for a single
decision tree in the model 302 | #' @param num_iter number of iterations = number of trees in ensemble 303 | #' @param beta regularisation for scores (L1) 304 | #' @param lambda regularisation for tree depth 305 | #' @param loss_type - "l" logistic, "e" exponential 306 | #' @param verbose - print extra data while training TRUE / FALSE 307 | #' @return A trained Deepboost model 308 | #' @examples 309 | #' deepboost.formula(y ~ ., 310 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 311 | #' num_iter=1) 312 | #' deepboost.formula(y ~ ., 313 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 314 | #' num_iter=2, beta=0.1, lambda=0.00125) 315 | #' @export 316 | deepboost.formula <- function(formula, data, instance_weights = NULL, 317 | tree_depth = 5, 318 | num_iter = 1, 319 | beta = 0.0, 320 | lambda= 0.05, 321 | loss_type = "l", 322 | verbose = TRUE) { 323 | # initialize weights 324 | n <- nrow(data) 325 | if(is.null(instance_weights)) 326 | { 327 | instance_weights <- rep(1/n, n) 328 | } 329 | # parse formula 330 | cl <- match.call() 331 | mf <- match.call(expand.dots = FALSE) 332 | m <- match(c("formula", "data"), names(mf), 0L) 333 | mf <- mf[c(1L, m)] 334 | mf$drop.unused.levels <- TRUE 335 | mf[[1L]] <- quote(stats::model.frame) 336 | mf <- eval(mf, parent.frame()) 337 | mt <- attr(mf, "terms") 338 | y <- factor(model.response(mf)) 339 | x <- model.matrix(mt, mf, contrasts) 340 | # make response either 1 or -1 341 | if (length(levels(y))!=2) 342 | { 343 | stop("ERROR_data : response must be binary" ) 344 | } 345 | classes = levels(y) 346 | levels(y) <- c(1,-1) 347 | # create data 348 | data <- data.frame(x[,-1]) 349 | data['label'] <- y 350 | data['weight'] <- instance_weights 351 | 352 | fit <- deepboost.train(Deepboost, data, 353 | tree_depth, 354 | num_iter, 355 | beta, 356 | lambda, 357 | loss_type, 358 | verbose, 359 | classes) 360 | 361 | return (fit) 362 | } 363 | 364 | #' Predict method for Deepboost model 365 | #' 366 | #' Predicted values based on deepboost model object. 367 | #' 368 | #' @param object Object of class "Deepboost" 369 | #' @param newdata takes \code{data.frame}. 370 | #' @param type Type of prediction 371 | #' 372 | #' @details 373 | #' The \code{type} option selects the prediction type : "terms" returns 374 | #' class labels, "response" returns a matrix of 375 | #' class probabilities. 376 | #' @examples 377 | #' dpb <- deepboost(y ~ ., 378 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 379 | #' num_iter=2,tree_depth=2) 380 | #' predict(dpb,data.frame(x1=rep(c(1,1,1,0),2),x2=rep(c(1,1,1,1),2))) 381 | #' @export 382 | setMethod("predict", signature = "Deepboost", 383 | definition = function(object, newdata, type="terms") { 384 | deepboost.predict(object, newdata, type) 385 | }) 386 | 387 | #' Print method for Deepboost model 388 | #' Evaluates a trained deepboost model object.
389 | #' 390 | #' @param object Object of class "Deepboost" 391 | #' 392 | #' @details 393 | #' Prints : 394 | #' Model error: X 395 | #' Average tree size: Y 396 | #' Number of trees: Z 397 | #' @examples 398 | #' dpb <- deepboost(y ~ ., 399 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 400 | #' num_iter=2,tree_depth=2) 401 | #' print(dpb) 402 | #' @export 403 | setMethod("show", signature = "Deepboost", 404 | definition = function(object) { 405 | deepboost.print(object) 406 | }) 407 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deepboost modeling 2 | 3 | [![Travis-CI Build Status](https://travis-ci.org/dmarcous/CRAN_deepboost.svg?branch=master)](https://travis-ci.org/dmarcous/CRAN_deepboost) 4 | [![rstudio mirror downloads](http://cranlogs.r-pkg.org/badges/grand-total/deepboost)](https://github.com/metacran/cranlogs.app) 5 | [![cran version](http://www.r-pkg.org/badges/version/deepboost)](https://CRAN.R-project.org/package=deepboost) 6 | [![codecov.io](https://codecov.io/github/dmarcous/CRAN_deepboost/coverage.svg?branch=master)](https://codecov.io/github/dmarcous/CRAN_deepboost?branch=master) 7 | 8 | Provides deepboost model training, evaluation, prediction and hyper-parameter optimisation using grid search and cross validation. 9 | 10 | ## Details 11 | 12 | Based on Google's Deep Boosting algorithm by Cortes et al. 13 | 14 | See [this paper](http://jmlr.org/proceedings/papers/v32/cortesb14.pdf) for details. 15 | 16 | Adapted from Google's C++ deepboost implementation : 17 | 18 | https://github.com/google/deepboost 19 | 20 | Another version of the package, using the original unmodified algorithm, exists at : 21 | 22 | 23 | 24 | ## Installation 25 | 26 | From CRAN : 27 | 28 | install.packages("deepboost") 29 | 30 | ## Examples 31 | 32 | Choosing parameters for a deepboost model : 33 | 34 | best_params <- deepboost.gridSearch(formula, data) 35 | 36 | Training a deepboost model : 37 | 38 | boost <- deepboost(formula, data, 39 | num_iter = best_params[2][[1]], 40 | beta = best_params[3][[1]], 41 | lambda = best_params[4][[1]], 42 | loss_type = best_params[5][[1]] 43 | ) 44 | 45 | Print trained model evaluation statistics : 46 | 47 | print(boost) 48 | 49 | Classifying using a trained deepboost model : 50 | 51 | labels <- predict(boost, newdata) 52 | 53 | See Help / demo directory for advanced usage.
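Predicting class probabilities instead of labels (a minimal sketch; `boost` and `newdata` are the objects from the examples above, and per `?deepboost.predict` the returned matrix holds the probability of the first class level in column 1 and its complement in column 2) :

    probs <- predict(boost, newdata, type = "response")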
54 | 55 | ## Credits 56 | 57 | R Package written and maintained by : 58 | 59 | Daniel Marcous 60 | 61 | Yotam Sandbank 62 | -------------------------------------------------------------------------------- /data/adult.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/adult.rda -------------------------------------------------------------------------------- /data/australian.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/australian.rda -------------------------------------------------------------------------------- /data/banana.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/banana.rda -------------------------------------------------------------------------------- /data/bupa.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/bupa.rda -------------------------------------------------------------------------------- /data/coli2000.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/coli2000.rda -------------------------------------------------------------------------------- /data/haberman.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/haberman.rda -------------------------------------------------------------------------------- /data/heart.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/heart.rda -------------------------------------------------------------------------------- /data/magic.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/magic.rda -------------------------------------------------------------------------------- /data/pima.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/pima.rda -------------------------------------------------------------------------------- /data/sonar.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/sonar.rda -------------------------------------------------------------------------------- /deepboost.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | 
StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,collate,namespace,vignette 23 | -------------------------------------------------------------------------------- /demo/00Index: -------------------------------------------------------------------------------- 1 | experiments Comparing Deepboost to Adaboost over sample datasets 2 | deepboostGrid Finding optimised parameters for Deepboost using grid search and 10-fold CV 3 | -------------------------------------------------------------------------------- /demo/deepboostGrid.R: -------------------------------------------------------------------------------- 1 | library(deepboost) 2 | 3 | data("sonar") 4 | formula <- R ~ . 5 | best_params <- 6 | deepboost.gridSearch(formula, sonar) 7 | 8 | boost <- deepboost(formula, sonar, 9 | num_iter = best_params[2][[1]], 10 | beta = best_params[3][[1]], 11 | lambda = best_params[4][[1]], 12 | loss_type = best_params[5][[1]] 13 | ) 14 | 15 | print(boost) 16 | 17 | preds <- predict(boost, sonar) 18 | -------------------------------------------------------------------------------- /demo/experiments.R: -------------------------------------------------------------------------------- 1 | 2 | library(caret) 3 | library(ada) 4 | library(deepboost) 5 | 6 | # read datasets 7 | data("adult") 8 | data("australian") 9 | data("banana") 10 | data("bupa") 11 | data("coli2000") 12 | data("haberman") 13 | data("heart") 14 | data("magic") 15 | data("pima") 16 | data("sonar") 17 | 18 | # create lists of datasets and formulas 19 | datasets <- list(adult=adult, aust=australian, banana=banana, bupa=bupa, coli=coli2000, 20 | haber=haberman, heart=heart, magic=magic, pima=pima, sonar=sonar) 21 | formulas <- list(X..50K ~ X39 + X77516 + X13 + X2174 + X0 + X40, 22 | X0.3 ~ ., 23 | X.1.0 ~ ., 24 | X1 ~ ., 25 | X0.45 ~ ., 26 | negative ~ ., 27 | X2.2 ~ ., 28 | g ~ ., 29 | tested_positive ~ ., 30 | R ~ .)
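# Experimental protocol, summarising the loops below: for each ensemble size
# and each selected dataset, run 5 repetitions of 10-fold CV; within each
# repetition the neighbouring fold l = (k %% 10) + 1 is used to tune
# hyper-parameters (nu for adaboost, beta/lambda for deepboost), while fold k
# is held out for the timed accuracy measurement.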
31 | 32 | results <- data.frame(dataset = numeric(0), ensemble_size = numeric(0), ada_acc = numeric(0), ada_sd = numeric(0), 33 | ada_time = numeric(0), deep_acc = numeric(0), deep_sd = numeric(0), deep_time = numeric(0), 34 | t_test = numeric(0)) 35 | # for each number of iterations 36 | for(num_iter in c(5,10,20,50)){ 37 | # for each data set 38 | for(i in c(2,4,6,7,9,10)){ 39 | ds <- datasets[[i]] 40 | levels(ds[,length(ds)]) <- c(1,-1) 41 | formula <- formulas[[i]] 42 | ada_acc <- rep(0,5) 43 | deep_acc <- rep(0,5) 44 | ada_t <- 0 45 | deep_t <- 0 46 | # 5 different 10-fold splits 47 | for(j in 1:5){ 48 | flds <- createFolds(1:nrow(ds), k = 10) 49 | for(k in 1:10){ 50 | l <- (k%%10)+1 51 | eval_train <- ds[-flds[[l]],] 52 | eval_test <- ds[flds[[l]],] 53 | train <- ds[-flds[[k]],] 54 | test <- ds[flds[[k]],] 55 | 56 | beta_vals = c(2^-0, 2^-1, 2^-2, 2^-3, 2^-4, 2^-5, 2^-6) 57 | lambda_vals = c(0.0001, 0.005, 0.01, 0.05, 0.1, 0.5) 58 | dpbGrid <- expand.grid(beta = beta_vals, 59 | lambda = lambda_vals) 60 | 61 | # train ADABOOST 62 | best_acc = 0 63 | best_nu = 0 64 | for(nu in beta_vals){ 65 | eval_model <- ada(formula, eval_train, iter = num_iter, nu=nu) 66 | acc <- sum(predict(eval_model, eval_test) == eval_test[,length(eval_test)]) / nrow(eval_test) 67 | if(acc > best_acc){ 68 | best_acc <- acc 69 | best_nu <- nu 70 | } 71 | } 72 | 73 | t <- Sys.time() 74 | ab_model <- ada(formula, train, iter = num_iter, nu=best_nu) 75 | ada_acc[j] <- ada_acc[j] + sum(predict(ab_model, test) == test[,length(test)]) / nrow(test) 76 | ada_t <- ada_t + round(difftime(Sys.time(), t, units = "secs"), 2) 77 | 78 | 79 | # train DEEPBOOST 80 | best_acc = 0 81 | best_lambda = 0 82 | best_beta = 0 83 | for(grow in 1:nrow(dpbGrid)){ 84 | beta <- dpbGrid[grow,"beta"] 85 | lambda <- dpbGrid[grow,"lambda"] 86 | eval_model <- deepboost.formula(formula, eval_train, num_iter = num_iter, beta = beta, lambda = lambda, verbose = F) 87 | acc <- sum(predict(eval_model, eval_test) == eval_test[,length(eval_test)]) / nrow(eval_test) 88 | if(acc > best_acc){ 89 | best_acc <- acc 90 | best_lambda <- lambda 91 | best_beta <- beta 92 | } 93 | } 94 | 95 | t <- Sys.time() 96 | db_model <- deepboost.formula(formula, train, num_iter = num_iter, beta = best_beta, lambda = best_lambda, verbose = F) 97 | deep_acc[j] <- deep_acc[j] + sum(predict(db_model, test) == test[,length(test)]) / nrow(test) 98 | deep_t <- deep_t + round(difftime(Sys.time(), t, units = "secs"), 2) 99 | } 100 | ada_acc[j] <- ada_acc[j]/10.0 101 | deep_acc[j] <- deep_acc[j]/10.0 102 | } 103 | # calculate results 104 | ada_acc_mean <- round(mean(ada_acc), 4) 105 | #ada_auc_mean <- mean(ada_auc) 106 | deep_acc_mean <- round(mean(deep_acc), 4) 107 | #deep_auc_mean <- mean(deep_auc) 108 | ada_acc_sd <- round(sd(ada_acc), 6) 109 | #ada_auc_sd <- sd(ada_auc) 110 | deep_acc_sd <- round(sd(deep_acc), 6) 111 | #deep_auc_sd <- sd(deep_auc) 112 | acc_t_test <- t.test(ada_acc, deep_acc, paired=TRUE)$p.value < 0.05 113 | #auc_t_test <- t.test(ada_auc, deep_auc, paired=TRUE)$p.value < 0.05 114 | 115 | # print to file 116 | fname <- paste('./', names(datasets)[i], num_iter, ".res", sep='') 117 | res <- data.frame(dataset = names(datasets)[i], ensemble_size = num_iter, ada_acc = ada_acc_mean, 118 | ada_sd = ada_acc_sd, ada_time = ada_t, deep_acc = deep_acc_mean, 119 | deep_sd = deep_acc_sd, deep_time = deep_t, t_test = acc_t_test) 120 | write.csv(res, fname, row.names = FALSE) 121 | print(paste(ada_t+deep_t, 'seconds for dataset:', names(datasets)[i], ',ensemble size:', num_iter)) 122
| results <- rbind(results, res) 123 | } 124 | } 125 | write.csv(results, './results.txt', row.names = FALSE) 126 | -------------------------------------------------------------------------------- /man/Deepboost-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \docType{class} 4 | \name{Deepboost-class} 5 | \alias{Deepboost-class} 6 | \title{An S4 class to represent a deepboost model.} 7 | \description{ 8 | An S4 class to represent a deepboost model. 9 | } 10 | \section{Slots}{ 11 | 12 | \describe{ 13 | \item{\code{tree_depth}}{maximum depth for a single decision tree in the model} 14 | 15 | \item{\code{num_iter}}{number of iterations = number of trees in ensemble} 16 | 17 | \item{\code{beta}}{regularisation for scores (L1)} 18 | 19 | \item{\code{lambda}}{regularisation for tree depth} 20 | 21 | \item{\code{loss_type}}{"l" logistic, "e" exponential} 22 | 23 | \item{\code{verbose}}{print extra data while training TRUE / FALSE} 24 | 25 | \item{\code{examples}}{data.frame with instances used for model training} 26 | 27 | \item{\code{model}}{Deepboost model as used by C code serialised to R List} 28 | 29 | \item{\code{classes}}{a vector of factors representing the classes used for classification with this model} 30 | }} 31 | 32 | -------------------------------------------------------------------------------- /man/Deepboost.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost} 4 | \alias{deepboost} 5 | \title{Main function for deepboost model creation} 6 | \usage{ 7 | deepboost(formula, data, instance_weights = NULL, tree_depth = 5, 8 | num_iter = 1, beta = 0, lambda = 0.05, loss_type = "l", 9 | verbose = TRUE) 10 | } 11 | \arguments{ 12 | \item{formula}{An R Formula object see : ?formula} 13 | 14 | \item{data}{A data.frame of samples to train on} 15 | 16 | \item{instance_weights}{The weight of each example} 17 | 18 | \item{tree_depth}{maximum depth for a single decision tree in the model} 19 | 20 | \item{num_iter}{number of iterations = number of trees in ensemble} 21 | 22 | \item{beta}{regularisation for scores (L1)} 23 | 24 | \item{lambda}{regularisation for tree depth} 25 | 26 | \item{loss_type}{- "l" logistic, "e" exponential} 27 | 28 | \item{verbose}{- print extra data while training TRUE / FALSE} 29 | } 30 | \value{ 31 | A trained Deepboost model 32 | } 33 | \description{ 34 | Main function for deepboost model creation 35 | } 36 | \examples{ 37 | deepboost(y ~ ., 38 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 39 | num_iter=1) 40 | deepboost(y ~ ., 41 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 42 | num_iter=2, beta=0.1, lambda=0.00125) 43 | } 44 | -------------------------------------------------------------------------------- /man/adult.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{adult} 5 | \alias{adult} 6 | \title{Adult humans} 7 | \format{A data frame with 32560 rows and 15 variables: 8 | \describe{ 9 | \item{Adm.clerical}{unknown} 10 | \item{Bachelors}{person has a bachelor's degree} 11 | \item{Male}{gender} 12 | \item{Never.married}{person was never married} 13 |
\item{Not.in.family}{is person a part of a family} 14 | \item{State.gov}{state} 15 | \item{United.States}{is from the United States} 16 | \item{White}{is white} 17 | \item{X..50K}{unknown} 18 | \item{X0}{unknown} 19 | \item{X13}{unknown} 20 | \item{X2174}{unknown} 21 | \item{X39}{unknown} 22 | \item{X40}{unknown} 23 | \item{X77516}{unknown} 24 | }} 25 | \source{ 26 | \url{https://archive.ics.uci.edu/ml/datasets/Adult/} 27 | } 28 | \usage{ 29 | adult 30 | } 31 | \description{ 32 | A dataset containing adult population personal details 33 | } 34 | \keyword{datasets} 35 | -------------------------------------------------------------------------------- /man/australian.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{australian} 5 | \alias{australian} 6 | \title{Australian} 7 | \format{An object of class \code{data.frame} with 689 rows and 15 columns.} 8 | \usage{ 9 | australian 10 | } 11 | \description{ 12 | Australian 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/banana.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{banana} 5 | \alias{banana} 6 | \title{banana} 7 | \format{An object of class \code{data.frame} with 5299 rows and 3 columns.} 8 | \usage{ 9 | banana 10 | } 11 | \description{ 12 | banana 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/bupa.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{bupa} 5 | \alias{bupa} 6 | \title{bupa} 7 | \format{An object of class \code{data.frame} with 344 rows and 7 columns.} 8 | \usage{ 9 | bupa 10 | } 11 | \description{ 12 | bupa 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/coli2000.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{coli2000} 5 | \alias{coli2000} 6 | \title{coli2000} 7 | \format{An object of class \code{data.frame} with 9821 rows and 86 columns.} 8 | \usage{ 9 | coli2000 10 | } 11 | \description{ 12 | coli2000 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/deepboost.default.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.default} 4 | \alias{deepboost.default} 5 | \title{Main function for deepboost model creation} 6 | \usage{ 7 | deepboost.default(x, y, instance_weights = NULL, tree_depth = 5, 8 | num_iter = 1, beta = 0, lambda = 0.05, loss_type = "l", 9 | verbose = TRUE) 10 | } 11 | \arguments{ 12 | \item{x}{A data.frame of samples' values} 13 | 14 | \item{y}{A vector of samples' labels} 15 | 16 | \item{instance_weights}{The weight of each example} 17 | 18 | \item{tree_depth}{maximum depth for a single decision tree in
the model} 19 | 20 | \item{num_iter}{number of iterations = number of trees in ensemble} 21 | 22 | \item{beta}{regularisation for scores (L1)} 23 | 24 | \item{lambda}{regularisation for tree depth} 25 | 26 | \item{loss_type}{- "l" logistic, "e" exponential} 27 | 28 | \item{verbose}{- print extra data while training TRUE / FALSE} 29 | } 30 | \value{ 31 | A trained Deepboost model 32 | } 33 | \description{ 34 | Main function for deepboost model creation 35 | } 36 | \examples{ 37 | deepboost.default(data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2)), 38 | factor(rep(c(0,0,0,1),2)),num_iter=1) 39 | deepboost.default(data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2)), 40 | factor(rep(c(0,0,0,1),2)), 41 | num_iter=2, beta=0.1, lambda=0.00125) 42 | } 43 | -------------------------------------------------------------------------------- /man/deepboost.evaluate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.evaluate} 4 | \alias{deepboost.evaluate} 5 | \title{Evaluates and prints statistics for a deepboost model} 6 | \usage{ 7 | deepboost.evaluate(object, data) 8 | } 9 | \arguments{ 10 | \item{object}{A Deepboost S4 class object} 11 | 12 | \item{data}{a \code{data.frame} object to evaluate with the model} 13 | } 14 | \value{ 15 | a list with model statistics - error, avg_tree_size, num_trees 16 | } 17 | \description{ 18 | Evaluates and prints statistics for a deepboost model 19 | } 20 | \examples{ 21 | dpb <- deepboost(y ~ ., 22 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 23 | num_iter=2,tree_depth=2) 24 | deepboost.evaluate(dpb,data.frame(x1=rep(c(1,1,1,0),2),x2=rep(c(1,1,1,1),2))) 25 | } 26 | -------------------------------------------------------------------------------- /man/deepboost.formula.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.formula} 4 | \alias{deepboost.formula} 5 | \title{Main function for deepboost model creation, using a formula} 6 | \usage{ 7 | deepboost.formula(formula, data, instance_weights = NULL, tree_depth = 5, 8 | num_iter = 1, beta = 0, lambda = 0.05, loss_type = "l", 9 | verbose = TRUE) 10 | } 11 | \arguments{ 12 | \item{formula}{An R Formula object see : ?formula} 13 | 14 | \item{data}{A data.frame of samples to train on} 15 | 16 | \item{instance_weights}{The weight of each example} 17 | 18 | \item{tree_depth}{maximum depth for a single decision tree in the model} 19 | 20 | \item{num_iter}{number of iterations = number of trees in ensemble} 21 | 22 | \item{beta}{regularisation for scores (L1)} 23 | 24 | \item{lambda}{regularisation for tree depth} 25 | 26 | \item{loss_type}{- "l" logistic, "e" exponential} 27 | 28 | \item{verbose}{- print extra data while training TRUE / FALSE} 29 | } 30 | \value{ 31 | A trained Deepboost model 32 | } 33 | \description{ 34 | Main function for deepboost model creation, using a formula 35 | } 36 | \examples{ 37 | deepboost.formula(y ~ ., 38 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 39 | num_iter=1) 40 | deepboost.formula(y ~ ., 41 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 42 | num_iter=2, beta=0.1, lambda=0.00125) 43 | } 44 |
-------------------------------------------------------------------------------- /man/deepboost.gridSearch.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-grid-search.R 3 | \name{deepboost.gridSearch} 4 | \alias{deepboost.gridSearch} 5 | \title{Returns optimised parameter list for deepboost model on given data} 6 | \usage{ 7 | deepboost.gridSearch(formula, data, k = 10, seed = 666, logging_level = 1) 8 | } 9 | \arguments{ 10 | \item{formula}{An R Formula object see : ?formula} 11 | 12 | \item{data}{input data.frame as training for model} 13 | 14 | \item{k}{number of folds (default = 10) for cross validation optimisation} 15 | 16 | \item{seed}{for random split to train / test (default 666)} 17 | 18 | \item{logging_level}{print extra data while training 0 - no data, 1 - gridSearch data (default), 2 - all data} 19 | } 20 | \value{ 21 | vector with average accuracy for chosen parameters, and a list of the best parameter combination: (accuracy, (num_iter, beta, lambda, loss_type)) 22 | } 23 | \description{ 24 | Returns optimised parameter list for deepboost model on given data 25 | } 26 | \details{ 27 | Finds optimised parameters for deepboost training 28 | using grid search techniques over: 29 | - predefined, battle-tested candidate parameter values 30 | - cross validation over k folds 31 | } 32 | \examples{ 33 | deepboost.gridSearch(y ~ ., 34 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), k=2) 35 | } 36 | -------------------------------------------------------------------------------- /man/deepboost.predict.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.predict} 4 | \alias{deepboost.predict} 5 | \title{Predicts instances responses based on a deepboost model} 6 | \usage{ 7 | deepboost.predict(object, newdata, type = "terms") 8 | } 9 | \arguments{ 10 | \item{object}{A Deepboost S4 class object} 11 | 12 | \item{newdata}{A data.frame to predict responses for} 13 | 14 | \item{type}{Type of prediction : "terms" - for class labels, "response" for probabilities} 15 | } 16 | \value{ 17 | A vector of responses 18 | } 19 | \description{ 20 | Predicts instances responses based on a deepboost model 21 | } 22 | \examples{ 23 | dpb <- deepboost(y ~ ., 24 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 25 | num_iter=2,tree_depth=2) 26 | deepboost.predict(dpb,data.frame(x1=rep(c(1,1,1,0),5),x2=rep(c(1,1,1,1),5))) 27 | } 28 | -------------------------------------------------------------------------------- /man/deepboost.print.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.print} 4 | \alias{deepboost.print} 5 | \title{Evaluates and prints statistics for a deepboost model on the train set} 6 | \usage{ 7 | deepboost.print(object) 8 | } 9 | \arguments{ 10 | \item{object}{A Deepboost S4 class object} 11 | } 12 | \value{ 13 | A list with model statistics; prints the model evaluation strings to the console 14 | } 15 | \description{ 16 | Evaluates and prints statistics for a deepboost model on the train set 17 | } 18 | \examples{ 19 | dpb <- deepboost(y ~ ., 20 |
data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 21 | num_iter=2,tree_depth=2) 22 | deepboost.print(dpb) 23 | } 24 | -------------------------------------------------------------------------------- /man/deepboost.train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.train} 4 | \alias{deepboost.train} 5 | \title{Trains a deepboost model} 6 | \usage{ 7 | deepboost.train(object, data, tree_depth, num_iter, beta, lambda, loss_type, 8 | verbose, classes) 9 | } 10 | \arguments{ 11 | \item{object}{A Deepboost S4 class object} 12 | 13 | \item{data}{input data.frame as training for model} 14 | 15 | \item{tree_depth}{maximum depth for a single decision tree in the model} 16 | 17 | \item{num_iter}{number of iterations = number of trees in ensemble} 18 | 19 | \item{beta}{regularisation for scores (L1)} 20 | 21 | \item{lambda}{regularisation for tree depth} 22 | 23 | \item{loss_type}{- "l" logistic, "e" exponential} 24 | 25 | \item{verbose}{- print extra data while training TRUE / FALSE} 26 | 27 | \item{classes}{a vector of factors representing the classes used for classification with this model} 28 | } 29 | \value{ 30 | A trained Deepboost model 31 | } 32 | \description{ 33 | Trains a deepboost model 34 | } 35 | \details{ 36 | (beta,lambda) = (0,0) - adaboost, (>0,0) - L1, (0,>0) deepboost, (>0, >0) deepboost+L1 37 | } 38 | -------------------------------------------------------------------------------- /man/haberman.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{haberman} 5 | \alias{haberman} 6 | \title{haberman} 7 | \format{An object of class \code{data.frame} with 305 rows and 4 columns.} 8 | \usage{ 9 | haberman 10 | } 11 | \description{ 12 | haberman 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/heart.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{heart} 5 | \alias{heart} 6 | \title{heart} 7 | \format{An object of class \code{data.frame} with 269 rows and 14 columns.} 8 | \usage{ 9 | heart 10 | } 11 | \description{ 12 | heart 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/magic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{magic} 5 | \alias{magic} 6 | \title{magic} 7 | \format{An object of class \code{data.frame} with 19019 rows and 11 columns.} 8 | \usage{ 9 | magic 10 | } 11 | \description{ 12 | magic 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/pima.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{pima} 5 | \alias{pima} 6 | \title{pima} 7 | \format{An object of class \code{data.frame} with 767 rows and 9 columns.} 8 |
\usage{ 9 | pima 10 | } 11 | \description{ 12 | pima 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/predict-Deepboost-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \docType{methods} 4 | \name{predict,Deepboost-method} 5 | \alias{predict,Deepboost-method} 6 | \title{Predict method for Deepboost model} 7 | \usage{ 8 | \S4method{predict}{Deepboost}(object, newdata, type = "terms") 9 | } 10 | \arguments{ 11 | \item{object}{Object of class "Deepboost"} 12 | 13 | \item{newdata}{takes \code{data.frame}.} 14 | 15 | \item{type}{Type of prediction} 16 | } 17 | \description{ 18 | Predicted values based on deepboost model object. 19 | } 20 | \details{ 21 | The \code{type} option selects the prediction type : "terms" returns 22 | class labels, "response" returns a matrix of 23 | class probabilities. 24 | } 25 | \examples{ 26 | dpb <- deepboost(y ~ ., 27 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 28 | num_iter=2,tree_depth=2) 29 | predict(dpb,data.frame(x1=rep(c(1,1,1,0),2),x2=rep(c(1,1,1,1),2))) 30 | } 31 | -------------------------------------------------------------------------------- /man/show-Deepboost-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \docType{methods} 4 | \name{show,Deepboost-method} 5 | \alias{show,Deepboost-method} 6 | \title{Print method for Deepboost model 7 | Evaluates a trained deepboost model object.} 8 | \usage{ 9 | \S4method{show}{Deepboost}(object) 10 | } 11 | \arguments{ 12 | \item{object}{Object of class "Deepboost"} 13 | } 14 | \description{ 15 | Print method for Deepboost model 16 | Evaluates a trained deepboost model object.
17 | } 18 | \details{ 19 | Prints : 20 | Model error: X 21 | Average tree size: Y 22 | Number of trees: Z 23 | } 24 | \examples{ 25 | dpb <- deepboost(y ~ ., 26 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 27 | num_iter=2,tree_depth=2) 28 | print(dpb) 29 | } 30 | -------------------------------------------------------------------------------- /man/sonar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{sonar} 5 | \alias{sonar} 6 | \title{sonar} 7 | \format{An object of class \code{data.frame} with 207 rows and 61 columns.} 8 | \usage{ 9 | sonar 10 | } 11 | \description{ 12 | sonar 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | # require c++11 2 | CXX_STD = CXX11 3 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | # require c++11 2 | CXX_STD = CXX11 3 | 4 | OBJECTS = ./tree.o ./boost.o ./deepboost_C.o ./deepboost_converters.o ./deepboost_R.o ./RcppExports.o 5 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include <Rcpp.h> 5 | 6 | using namespace Rcpp; 7 | 8 | // Train_R 9 | Rcpp::List Train_R(DataFrame data, int tree_depth, int num_iter, double beta, double lambda, char loss_type, bool verbose); 10 | RcppExport SEXP _deepboost_Train_R(SEXP dataSEXP, SEXP tree_depthSEXP, SEXP num_iterSEXP, SEXP betaSEXP, SEXP lambdaSEXP, SEXP loss_typeSEXP, SEXP verboseSEXP) { 11 | BEGIN_RCPP 12 | Rcpp::RObject rcpp_result_gen; 13 | Rcpp::RNGScope rcpp_rngScope_gen; 14 | Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP); 15 | Rcpp::traits::input_parameter< int >::type tree_depth(tree_depthSEXP); 16 | Rcpp::traits::input_parameter< int >::type num_iter(num_iterSEXP); 17 | Rcpp::traits::input_parameter< double >::type beta(betaSEXP); 18 | Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); 19 | Rcpp::traits::input_parameter< char >::type loss_type(loss_typeSEXP); 20 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 21 | rcpp_result_gen = Rcpp::wrap(Train_R(data, tree_depth, num_iter, beta, lambda, loss_type, verbose)); 22 | return rcpp_result_gen; 23 | END_RCPP 24 | } 25 | // Predict_R 26 | Rcpp::List Predict_R(DataFrame newdata, Rcpp::List model); 27 | RcppExport SEXP _deepboost_Predict_R(SEXP newdataSEXP, SEXP modelSEXP) { 28 | BEGIN_RCPP 29 | Rcpp::RObject rcpp_result_gen; 30 | Rcpp::RNGScope rcpp_rngScope_gen; 31 | Rcpp::traits::input_parameter< DataFrame >::type newdata(newdataSEXP); 32 | Rcpp::traits::input_parameter< Rcpp::List >::type model(modelSEXP); 33 | rcpp_result_gen = Rcpp::wrap(Predict_R(newdata, model)); 34 | return rcpp_result_gen; 35 | END_RCPP 36 | } 37 | // PredictProbabilities_R 38 | Rcpp::List PredictProbabilities_R(DataFrame newdata, Rcpp::List model); 39 | RcppExport SEXP _deepboost_PredictProbabilities_R(SEXP newdataSEXP, SEXP modelSEXP) { 40 | BEGIN_RCPP 41 |
Rcpp::RObject rcpp_result_gen; 42 | Rcpp::RNGScope rcpp_rngScope_gen; 43 | Rcpp::traits::input_parameter< DataFrame >::type newdata(newdataSEXP); 44 | Rcpp::traits::input_parameter< Rcpp::List >::type model(modelSEXP); 45 | rcpp_result_gen = Rcpp::wrap(PredictProbabilities_R(newdata, model)); 46 | return rcpp_result_gen; 47 | END_RCPP 48 | } 49 | // Evaluate_R 50 | Rcpp::List Evaluate_R(DataFrame data, Rcpp::List model); 51 | RcppExport SEXP _deepboost_Evaluate_R(SEXP dataSEXP, SEXP modelSEXP) { 52 | BEGIN_RCPP 53 | Rcpp::RObject rcpp_result_gen; 54 | Rcpp::RNGScope rcpp_rngScope_gen; 55 | Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP); 56 | Rcpp::traits::input_parameter< Rcpp::List >::type model(modelSEXP); 57 | rcpp_result_gen = Rcpp::wrap(Evaluate_R(data, model)); 58 | return rcpp_result_gen; 59 | END_RCPP 60 | } 61 | 62 | static const R_CallMethodDef CallEntries[] = { 63 | {"_deepboost_Train_R", (DL_FUNC) &_deepboost_Train_R, 7}, 64 | {"_deepboost_Predict_R", (DL_FUNC) &_deepboost_Predict_R, 2}, 65 | {"_deepboost_PredictProbabilities_R", (DL_FUNC) &_deepboost_PredictProbabilities_R, 2}, 66 | {"_deepboost_Evaluate_R", (DL_FUNC) &_deepboost_Evaluate_R, 2}, 67 | {NULL, NULL, 0} 68 | }; 69 | 70 | RcppExport void R_init_deepboost(DllInfo *dll) { 71 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 72 | R_useDynamicSymbols(dll, FALSE); 73 | } 74 | -------------------------------------------------------------------------------- /src/boost.cc: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | #include "boost.h" 18 | 19 | #include <cmath> 20 | //#include <iostream> 21 | #include <utility> 22 | 23 | #include "tree.h" 24 | 25 | 26 | float ComputeEta(float wgtd_error, float tree_size, float alpha, float beta, float lambda) { 27 | wgtd_error = std::fmax(wgtd_error, kTolerance); // Helps with division by zero. 28 | const float error_term = 29 | (1 - wgtd_error) * std::exp(alpha) - wgtd_error * std::exp(-alpha); 30 | const float complexity_penalty = ComplexityPenalty(tree_size, beta, lambda); 31 | const float ratio = complexity_penalty / wgtd_error; 32 | float eta; 33 | if (std::fabs(error_term) <= 2 * complexity_penalty) { 34 | eta = -alpha; 35 | } else if (error_term > 2 * complexity_penalty) { 36 | eta = std::log(-ratio + std::sqrt(ratio * ratio + (1 - wgtd_error)/wgtd_error)); 37 | } else { 38 | eta = std::log(ratio + std::sqrt(ratio * ratio + (1 - wgtd_error)/wgtd_error)); 39 | } 40 | return eta; 41 | } 42 | 43 | // TODO(usyed): examples is passed by non-const reference because the example 44 | // weights need to be changed. This is bad style. 
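// The per-iteration flow of the function below, step by step:
//   1. On the first call, initialize the loss-specific weight normalizer.
//   2. Scan the trees already in the model and keep the one whose objective
//      gradient has the largest magnitude (the most promising coordinate).
//   3. Greedily grow one new candidate tree; it replaces the incumbent if
//      its gradient magnitude is strictly larger (or the model is empty).
//   4. Step the winner's weight by ComputeEta(): a new tree is appended with
//      weight eta, an existing tree gets eta added to its current alpha.
//   5. Re-weight every example according to its margin under the updated
//      model, then renormalize so the weights again sum to one.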
45 | void AddTreeToModel(vector<Example>& examples, Model* model, char loss_type, float beta, float lambda, int tree_depth) { 46 | // Initialize normalizer 47 | static float normalizer; 48 | if (model->empty()) { 49 | if (loss_type == 'e') { 50 | normalizer = std::exp(1) * static_cast<float>(examples.size()); 51 | } else if (loss_type == 'l') { 52 | normalizer = 53 | static_cast<float>(examples.size()) / (std::log(2) * (1 + std::exp(-1))); 54 | } 55 | } 56 | InitializeTreeData(examples, normalizer); 57 | int best_old_tree_idx = -1; 58 | float wgtd_error, gradient, best_wgtd_error = 0, best_gradient = 0; 59 | 60 | // Find best old tree 61 | bool old_tree_is_best = false; 62 | for (int i = 0; i < model->size(); ++i) { 63 | const float alpha = (*model)[i].first; 64 | if (std::fabs(alpha) < kTolerance) continue; // Skip zeroed-out weights. 65 | const Tree& old_tree = (*model)[i].second; 66 | wgtd_error = EvaluateTreeWgtd(examples, old_tree); 67 | int sign_edge = (wgtd_error >= 0.5) ? 1 : -1; 68 | gradient = Gradient(wgtd_error, old_tree.size(), alpha, sign_edge, beta, lambda); 69 | if (std::fabs(gradient) >= std::fabs(best_gradient)) { 70 | best_gradient = gradient; 71 | best_wgtd_error = wgtd_error; 72 | best_old_tree_idx = i; 73 | old_tree_is_best = true; 74 | } 75 | } 76 | 77 | // Find best new tree 78 | Tree new_tree = TrainTree(examples, beta, lambda, tree_depth); 79 | wgtd_error = EvaluateTreeWgtd(examples, new_tree); 80 | gradient = Gradient(wgtd_error, new_tree.size(), 0, -1, beta, lambda); 81 | if (model->empty() || std::fabs(gradient) > std::fabs(best_gradient)) { 82 | best_gradient = gradient; 83 | best_wgtd_error = wgtd_error; 84 | old_tree_is_best = false; 85 | } 86 | 87 | // Update model weights 88 | float alpha = 0; 89 | const Tree* tree; 90 | if (old_tree_is_best) { 91 | alpha = (*model)[best_old_tree_idx].first; 92 | tree = &((*model)[best_old_tree_idx].second); 93 | } else { 94 | alpha = 0; 95 | tree = &(new_tree); 96 | } 97 | const float eta = ComputeEta(best_wgtd_error, tree->size(), alpha, beta, lambda); 98 | if (old_tree_is_best) { 99 | (*model)[best_old_tree_idx].first += eta; 100 | } else { 101 | model->push_back(make_pair(eta, new_tree)); 102 | } 103 | 104 | // Update example weights and compute normalizer 105 | const float old_normalizer = normalizer; 106 | normalizer = 0; 107 | for (Example& example : examples) { 108 | const float u = eta * example.label * ClassifyExample(example, *tree); 109 | if (loss_type == 'e') { 110 | example.weight = example.weight * std::exp(-u) * old_normalizer; 111 | } else if (loss_type == 'l') { 112 | const float z = (1 - std::log(2) * example.weight * old_normalizer) / 113 | (std::log(2) * example.weight * old_normalizer); 114 | example.weight = 1 / (std::log(2) * (1 + z * std::exp(u))); 115 | } 116 | normalizer += example.weight; 117 | } 118 | 119 | // Renormalize example weights 120 | // TODO(usyed): Two loops is inefficient. 
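// Dividing by the freshly accumulated normalizer restores the invariant that
// the example weights sum to one, so the next iteration's tree growing and
// weighted-error computations see a proper distribution.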
for (Example& example : examples) { 122 | example.weight /= normalizer; 123 | } 124 | } 125 | 126 | Probability ComputeExampleClassProbability(const Example& example, const Model& model) { 127 | float score = 0; 128 | float sumOfWeights = 0; 129 | float probability = 0; 130 | for (const pair<float, Tree>& wgtd_tree : model) { 131 | score += wgtd_tree.first * ClassifyExample(example, wgtd_tree.second); 132 | sumOfWeights += wgtd_tree.first; 133 | } 134 | probability = ((score/sumOfWeights) + 1) / 2.0; 135 | return probability; 136 | } 137 | 138 | Label ClassifyExample(const Example& example, const Model& model) { 139 | float score = 0; 140 | score = ComputeExampleClassProbability(example, model); 141 | if (score < 0.5) { 142 | return -1; 143 | } else { 144 | return 1; 145 | } 146 | } 147 | 148 | void EvaluateModel(const vector<Example>& examples, const Model& model, 149 | float* error, float* avg_tree_size, int* num_trees) { 150 | float incorrect = 0; 151 | for (const Example& example : examples) { 152 | if (example.label != ClassifyExample(example, model)) { 153 | ++incorrect; 154 | } 155 | } 156 | *num_trees = 0; 157 | int sum_tree_size = 0; 158 | for (const pair<float, Tree>& wgtd_tree : model) { 159 | if (std::fabs(wgtd_tree.first) >= kTolerance) { 160 | ++(*num_trees); 161 | sum_tree_size += wgtd_tree.second.size(); 162 | } 163 | } 164 | *error = (incorrect / examples.size()); 165 | *avg_tree_size = static_cast<float>(sum_tree_size) / *num_trees; 166 | } 167 | -------------------------------------------------------------------------------- /src/boost.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | #ifndef BOOST_H_ 18 | #define BOOST_H_ 19 | 20 | #include "types.h" 21 | 22 | // Either add a new tree to model or update the weight of an existing tree in 23 | // model. The tree and weight are selected via approximate coordinate descent on 24 | // the objective, where the "approximate" indicates that we do not search all 25 | // trees but instead grow trees greedily. 26 | void AddTreeToModel(vector<Example>& examples, Model* model, char loss_type, float beta, float lambda, int tree_depth); 27 | 28 | // Compute example probability with model. 29 | Probability ComputeExampleClassProbability(const Example& example, const Model& model); 30 | 31 | // Classify example with model. 32 | Label ClassifyExample(const Example& example, const Model& model); 33 | 34 | // Compute the error of model on examples. Also compute the number of trees in 35 | // model and their average size. 36 | void EvaluateModel(const vector<Example>& examples, const Model& model, 37 | float* error, float* avg_tree_size, int* num_trees); 38 | 39 | // Return the optimal weight to add to a tree that will maximally decrease the 40 | // objective. 
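// wgtd_error is the tree's weighted training error, tree_size its number of
// nodes, alpha its current weight, and beta/lambda the complexity-penalty
// coefficients. When the potential improvement is within twice the
// complexity penalty, the returned step is -alpha, i.e. the tree's weight is
// reset to exactly zero (such trees are then skipped during evaluation).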
41 | float ComputeEta(float wgtd_error, float tree_size, float alpha, float beta, float lambda); 42 | 43 | #endif  // BOOST_H_ 44 | -------------------------------------------------------------------------------- /src/deepboost_C.cc: -------------------------------------------------------------------------------- 1 | /* 2 | Written by: 3 | Daniel Marcous, Yotam Sandbank 4 | */ 5 | 6 | #include "deepboost_C.h" 7 | #include "boost.h" 8 | #include "tree.h" 9 | 10 | #include <Rcpp.h> 11 | 12 | using namespace Rcpp; 13 | 14 | 15 | // Train a deepboost model on the given examples, using 16 | // num_iter iterations (which does not necessarily mean num_iter trees) 17 | void Train(vector<Example>* train_examples, Model* model, int tree_depth, 18 | int num_iter, float beta, float lambda, char loss_type, bool verbose) { 19 | 20 | 21 | // Train the model 22 | for (int iter = 1; iter <= num_iter; ++iter) { 23 | AddTreeToModel(*train_examples, model, loss_type, beta, lambda, tree_depth); 24 | if (verbose) { 25 | float error, avg_tree_size; 26 | int num_trees; 27 | EvaluateModel(*train_examples, *model, &error, &avg_tree_size, 28 | &num_trees); 29 | Rcpp::Rcout << "Iteration: " << iter 30 | << ", error: " << error 31 | << ", avg tree size: " << avg_tree_size 32 | << ", num trees: " << num_trees 33 | << std::endl; 34 | } 35 | } 36 | } 37 | 38 | 39 | // Classify examples using model 40 | vector