├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── R ├── RcppExports.R ├── deepboost-data.R ├── deepboost-grid-search.R └── deepboost.R ├── README.md ├── data ├── adult.rda ├── australian.rda ├── banana.rda ├── bupa.rda ├── coli2000.rda ├── haberman.rda ├── heart.rda ├── magic.rda ├── pima.rda └── sonar.rda ├── deepboost.Rproj ├── demo ├── 00Index ├── deepboostGrid.R └── experiments.R ├── man ├── Deepboost-class.Rd ├── Deepboost.Rd ├── adult.Rd ├── australian.Rd ├── banana.Rd ├── bupa.Rd ├── coli2000.Rd ├── deepboost.default.Rd ├── deepboost.evaluate.Rd ├── deepboost.formula.Rd ├── deepboost.gridSearch.Rd ├── deepboost.predict.Rd ├── deepboost.print.Rd ├── deepboost.train.Rd ├── haberman.Rd ├── heart.Rd ├── magic.Rd ├── pima.Rd ├── predict-Deepboost-method.Rd ├── show-Deepboost-method.Rd └── sonar.Rd ├── src ├── Makevars ├── Makevars.win ├── RcppExports.cpp ├── boost.cc ├── boost.h ├── deepboost_C.cc ├── deepboost_C.h ├── deepboost_R.cpp ├── deepboost_converters.cpp ├── deepboost_converters.h ├── tree.cc ├── tree.h └── types.h └── tests ├── testthat.R └── testthat └── test_basic.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | *.o 4 | *.so 5 | *.dll -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Sample .travis.yml for R projects 2 | 3 | language: r 4 | warnings_are_errors: true 5 | sudo: required 6 | 7 | r_packages: 8 | - covr 9 | 10 | after_success: 11 | - Rscript -e 'library(covr); codecov()' 12 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: deepboost 2 | Type: Package 3 | Title: Deep Boosting Ensemble Modeling 4 | Version: 0.1.6 5 | Date: 2017-11-08 6 | Author: Daniel Marcous [aut, cre], Yotam Sandbank [aut], Google Inc. [cph] 7 | Maintainer: Daniel Marcous <dmarcous@gmail.com> 8 | Authors@R: c( 9 | person("Daniel", "Marcous", email = "dmarcous@gmail.com", role = c("aut","cre")), 10 | person("Yotam", "Sandbank", email = "yotamsandbank@gmail.com", role = "aut"), 11 | person("Google Inc.", role = "cph") 12 | ) 13 | Description: Provides deep boosting model training, evaluation, prediction and 14 | hyper-parameter optimisation using grid search and cross validation. 15 | Based on Google's Deep Boosting algorithm and Google's C++ implementation. 16 | Cortes, C., Mohri, M., & Syed, U. (2014) <http://jmlr.org/proceedings/papers/v32/cortesb14.pdf>.
17 | URL: https://github.com/dmarcous/CRAN_deepboost 18 | BugReports: https://github.com/dmarcous/CRAN_deepboost/issues 19 | License: Apache License (== 2.0) 20 | LazyData: TRUE 21 | Suggests: 22 | testthat, 23 | ada, 24 | caret 25 | Depends: 26 | R (>= 3.1) 27 | Imports: 28 | Rcpp (>= 0.12.2), 29 | methods 30 | LinkingTo: Rcpp 31 | RoxygenNote: 6.0.1 -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(deepboost) 4 | export(deepboost.default) 5 | export(deepboost.evaluate) 6 | export(deepboost.formula) 7 | export(deepboost.gridSearch) 8 | export(deepboost.predict) 9 | export(deepboost.print) 10 | export(deepboost.train) 11 | exportMethods(predict) 12 | exportMethods(show) 13 | import(methods) 14 | importFrom(Rcpp,evalCpp) 15 | importFrom(stats,contrasts) 16 | importFrom(stats,model.matrix) 17 | importFrom(stats,model.response) 18 | useDynLib(deepboost) 19 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | Train_R <- function(data, tree_depth, num_iter, beta, lambda, loss_type, verbose) { 5 | .Call('_deepboost_Train_R', PACKAGE = 'deepboost', data, tree_depth, num_iter, beta, lambda, loss_type, verbose) 6 | } 7 | 8 | Predict_R <- function(newdata, model) { 9 | .Call('_deepboost_Predict_R', PACKAGE = 'deepboost', newdata, model) 10 | } 11 | 12 | PredictProbabilities_R <- function(newdata, model) { 13 | .Call('_deepboost_PredictProbabilities_R', PACKAGE = 'deepboost', newdata, model) 14 | } 15 | 16 | Evaluate_R <- function(data, model) { 17 | .Call('_deepboost_Evaluate_R', PACKAGE = 'deepboost', data, model) 18 | } 19 | 20 | -------------------------------------------------------------------------------- /R/deepboost-data.R: -------------------------------------------------------------------------------- 1 | #' Adult humans 2 | #' 3 | #' A dataset containing adult population personal details 4 | #' 5 | #' @format A data frame with 32560 rows and 15 variables: 6 | #' \describe{ 7 | #' \item{Adm.clerical}{unknown} 8 | #' \item{Bachelors}{person has a bachelor's degree} 9 | #' \item{Male}{gender} 10 | #' \item{Never.married}{person was never married} 11 | #' \item{Not.in.family}{is person a part of a family} 12 | #' \item{State.gov}{state} 13 | #' \item{United.States}{is from the United States} 14 | #' \item{White}{is white} 15 | #' \item{X..50K}{unknown} 16 | #' \item{X0}{unknown} 17 | #' \item{X13}{unknown} 18 | #' \item{X2174}{unknown} 19 | #' \item{X39}{unknown} 20 | #' \item{X40}{unknown} 21 | #' \item{X77516}{unknown} 22 | #' } 23 | #' @source \url{https://archive.ics.uci.edu/ml/datasets/Adult/} 24 | "adult" 25 | 26 | #' Australian 27 | "australian" 28 | 29 | #' banana 30 | "banana" 31 | 32 | #' bupa 33 | "bupa" 34 | 35 | #' coli2000 36 | "coli2000" 37 | 38 | #' haberman 39 | "haberman" 40 | 41 | #' heart 42 | "heart" 43 | 44 | #' magic 45 | "magic" 46 | 47 | #' pima 48 | "pima" 49 | 50 | #' sonar 51 | "sonar" 52 | -------------------------------------------------------------------------------- /R/deepboost-grid-search.R: -------------------------------------------------------------------------------- 1 | #' Returns optimised parameter list for
deepboost model on given data 2 | #' @param formula An R Formula object see : ?formula 3 | #' @param data input data.frame as training for model 4 | #' @param k number of folds (default = 10) for cross validation optimisation 5 | #' @param seed for random split to train / test (default 666) 6 | #' @param logging_level print extra data while training 0 - no data, 1 - gridSearch data (default), 2 - all data 7 | #' @details Finds optimised parameters for deepboost training 8 | #' using grid search techniques over: 9 | #' - predefined, battle-tested candidate parameter values 10 | #' - cross validation over k folds 11 | #' @return vector with average accuracy for chosen parameters, and a list of the best parameter combination: (accuracy, (num_iter, beta, lambda, loss_type)) 12 | #' @examples 13 | #' deepboost.gridSearch(y ~ ., 14 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), k=2) 15 | #' @export 16 | deepboost.gridSearch <- function(formula, data, k=10, seed=666, logging_level=1) { 17 | 18 | if (!(is.numeric(k)) || k <= 1 || !(k%%1==0)) 19 | { 20 | stop("ERROR_parameter_setting : k must be >= 2 and integer (Default : 10)" ) 21 | } 22 | 23 | if (!(is.numeric(logging_level)) || logging_level < 0 || logging_level > 2 || !(logging_level%%1==0)) 24 | { 25 | stop("ERROR_parameter_setting : logging_level must be integer (0 / 1 / 2) (Default : 1)" ) 26 | } 27 | 28 | verbose <- ifelse(logging_level>1,TRUE,FALSE) 29 | 30 | num_iter_vals = c(5,10,25,50) 31 | beta_vals = c(2^-0, 2^-1, 2^-2, 2^-3, 2^-4, 2^-5, 2^-6) 32 | lambda_vals = c(0.0001, 0.005, 0.01, 0.05, 0.1, 0.5) 33 | loss_type_vals = c("l","e") 34 | dpbGrid <- expand.grid(num_iter = num_iter_vals, 35 | beta = beta_vals, 36 | lambda = lambda_vals, 37 | loss_type = loss_type_vals) 38 | 39 | set.seed(seed) 40 | 41 | # Randomly shuffle the data 42 | data<-data[sample(nrow(data)),] 43 | 44 | folds <- cut(seq(1,nrow(data)),breaks=k,labels=FALSE) 45 | best_acc <- -Inf 46 | avg_acc <- 0 47 | 48 | for(combination in 1:nrow(dpbGrid)){ 49 | num_iter <- dpbGrid[combination,"num_iter"] 50 | beta <- dpbGrid[combination,"beta"] 51 | lambda <- dpbGrid[combination,"lambda"] 52 | loss_type <- as.character(dpbGrid[combination,"loss_type"]) 53 | acc <- 0 54 | 55 | for(fold in 1:k){ 56 | testIndexes <- which(folds==fold,arr.ind=TRUE) 57 | testData <- data[testIndexes, ] 58 | trainData <- data[-testIndexes, ] 59 | 60 | eval_model <- deepboost.formula(formula, trainData, num_iter = num_iter, beta = beta, lambda = lambda, loss_type = loss_type, verbose=verbose) 61 | acc <- acc + sum(predict(eval_model, testData) == testData[,length(testData)]) / nrow(testData) 62 | } 63 | acc <- acc / k 64 | if(acc > best_acc){ 65 | best_acc <- acc 66 | best_num_iter <- num_iter 67 | best_lambda <- lambda 68 | best_beta <- beta 69 | best_loss_type <- loss_type 70 | } 71 | avg_acc <- avg_acc + acc 72 | 73 | } 74 | avg_acc <- avg_acc / nrow(dpbGrid) 75 | 76 | if(logging_level > 0) 77 | { 78 | print(paste0("average accuracy : ", avg_acc)) 79 | print(paste0("accuracy: ", best_acc, ", num_iter: ", best_num_iter, ", beta: ", best_beta, ", lambda: ", best_lambda, ", loss_type: ", best_loss_type)) 80 | } 81 | 82 | RET <- 83 | c(avg_acc, 84 | list(best_num_iter, 85 | best_beta, 86 | best_lambda, 87 | best_loss_type)) 88 | 89 | return(RET) 90 | } 91 | -------------------------------------------------------------------------------- /R/deepboost.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib deepboost 2 | #'
@importFrom Rcpp evalCpp 3 | #' @importFrom stats contrasts model.matrix model.response 4 | #' @import methods 5 | NULL 6 | 7 | #' An S4 class to represent a deepboost model. 8 | #' 9 | #' @slot tree_depth maximum depth for a single decision tree in the model 10 | #' @slot num_iter number of iterations = number of trees in ensemble 11 | #' @slot beta regularisation for scores (L1) 12 | #' @slot lambda regularisation for tree depth 13 | #' @slot loss_type "l" logistic, "e" exponential 14 | #' @slot verbose print extra data while training TRUE / FALSE 15 | #' @slot examples data.frame with instances used for model training 16 | #' @slot model Deepboost model as used by C code serialised to R List 17 | #' @slot classes a vector of factors representing the classes used for classification with this model 18 | setClass("Deepboost", 19 | slots = list( 20 | tree_depth = "numeric", 21 | num_iter = "numeric", 22 | beta = "numeric", 23 | lambda= "numeric", 24 | loss_type = "character", 25 | verbose = "logical", 26 | examples = "data.frame", 27 | model = "list", 28 | classes = "character" 29 | )) 30 | 31 | #' Trains a deepboost model 32 | #' 33 | #' @param object A Deepboost S4 class object 34 | #' @param data input data.frame as training for model 35 | #' @param tree_depth maximum depth for a single decision tree in the model 36 | #' @param num_iter number of iterations = number of trees in ensemble 37 | #' @param beta regularisation for scores (L1) 38 | #' @param lambda regularisation for tree depth 39 | #' @param loss_type - "l" logistic, "e" exponential 40 | #' @param verbose - print extra data while training TRUE / FALSE 41 | #' @param classes a vector of factors representing the classes used for classification with this model 42 | #' @details (beta,lambda) = (0,0) - adaboost, (>0,0) - L1, (0,>0) deepboost, (>0, >0) deepboost+L1 43 | #' @return A trained Deepboost model 44 | #' @export 45 | deepboost.train <- function(object, data, 46 | tree_depth, 47 | num_iter, 48 | beta, 49 | lambda, 50 | loss_type, 51 | verbose, 52 | classes) { 53 | # set slots 54 | RET = new("Deepboost") 55 | 56 | # Check parameter validity 57 | if (!(is.numeric(tree_depth)) || tree_depth <= 0 || !(tree_depth%%1==0)) 58 | { 59 | stop("ERROR_parameter_setting : tree_depth must be >= 1 and integer (Default : 5)" ) 60 | } 61 | RET@tree_depth = as.integer(tree_depth) 62 | 63 | # Check parameter validity 64 | if (!(is.numeric(num_iter)) || num_iter <= 0 || !(num_iter%%1==0)) 65 | { 66 | stop("ERROR_parameter_setting : num_iter must be >= 1 and integer (Default : 1)" ) 67 | } 68 | RET@num_iter = as.integer(num_iter) 69 | 70 | # (beta, lambda) = 71 | # (0,0) - adaboost, (>0,0) - L1, (0,>0) deepboost, (>0, >0) deepboost+L1 72 | 73 | # Check parameter validity 74 | if (!(is.numeric(beta)) || beta < 0.0) 75 | { 76 | stop("ERROR_parameter_setting : beta must be >= 0 and double (Default : 0.0)" ) 77 | } 78 | RET@beta = as.double(beta) 79 | 80 | # Check parameter validity 81 | if (!(is.numeric(lambda)) || lambda < 0.0) 82 | { 83 | stop("ERROR_parameter_setting : lambda must be >= 0 and double (Default : 0.05)" ) 84 | } 85 | RET@lambda = as.double(lambda) 86 | 87 | # Check parameter validity 88 | if (!(is.character(loss_type)) || (loss_type != "l" && loss_type != "e")) 89 | { 90 | stop("ERROR_parameter_setting : loss_type must be \"l\" - logistic or \"e\" - exponential (Default : \"l\")" ) 91 | } 92 | RET@loss_type = as.character(loss_type) 93 | 94 | if (!(is.logical(verbose))) 95 | { 96 | stop("ERROR_parameter_setting : verbose must be boolean
(TRUE / FALSE) (Default : TRUE)" ) 97 | } 98 | RET@verbose = verbose 99 | 100 | RET@examples = data 101 | RET@classes = classes 102 | 103 | # call training 104 | model = Train_R(RET@examples, 105 | RET@tree_depth, RET@num_iter, RET@beta, RET@lambda, RET@loss_type, RET@verbose) 106 | 107 | RET@model = model 108 | 109 | return(RET) 110 | } 111 | 112 | #' Predicts instances responses based on a deepboost model 113 | #' 114 | #' @param object A Deepboost S4 class object 115 | #' @param newdata A data.frame to predict responses for 116 | #' @param type Type of prediction : "terms" - for class labels, "response" for probabilities 117 | #' @return A vector of responses 118 | #' @examples 119 | #' dpb <- deepboost(y ~ ., 120 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 121 | #' num_iter=2,tree_depth=2) 122 | #' deepboost.predict(dpb,data.frame(x1=rep(c(1,1,1,0),5),x2=rep(c(1,1,1,1),5))) 123 | #' @export 124 | deepboost.predict <- function(object, newdata, type="terms") { 125 | # Check parameter validity 126 | if (!(is.character(type)) || (type != "terms" && type != "response")) 127 | { 128 | stop("ERROR_deepboost.predict : type must be \"terms\" - labels or \"response\" - probabilities" ) 129 | } 130 | 131 | if (type == "terms") 132 | { 133 | labels <- 134 | Predict_R(newdata, 135 | object@model) 136 | 137 | labels <- unlist(labels) 138 | labels[labels==1] <- object@classes[1] 139 | labels[labels==-1] <- object@classes[2] 140 | results = labels 141 | } 142 | else if (type == "response") 143 | { 144 | probabilities <- 145 | PredictProbabilities_R(newdata, 146 | object@model) 147 | 148 | probabilities <- unlist(probabilities) 149 | probMat <- matrix(nrow=length(probabilities),ncol=2) 150 | probMat[,1] <- probabilities 151 | probMat[,2] <- 1.0-probabilities 152 | results = probMat 153 | } 154 | 155 | return (results) 156 | } 157 | 158 | #' Evaluates and prints statistics for a deepboost model on the train set 159 | #' 160 | #' @param object A Deepboost S4 class object 161 | #' @return A list with model statistics; prints the model evaluation strings to the console 162 | #' @examples 163 | #' dpb <- deepboost(y ~ ., 164 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 165 | #' num_iter=2,tree_depth=2) 166 | #' deepboost.print(dpb) 167 | #' @export 168 | deepboost.print <- function(object) { 169 | model_stats <- deepboost.evaluate(object, object@examples) 170 | print(paste("Model error:",model_stats[["error"]])) 171 | print(paste("Average tree size:",model_stats[["avg_tree_size"]])) 172 | print(paste("Number of trees:",model_stats[["num_trees"]])) 173 | return (model_stats) 174 | } 175 | 176 | #' Evaluates and prints statistics for a deepboost model 177 | #' 178 | #' @param object A Deepboost S4 class object 179 | #' @param data a \code{data.frame} object to evaluate with the model 180 | #' @return a list with model statistics - error, avg_tree_size, num_trees 181 | #' @examples 182 | #' dpb <- deepboost(y ~ ., 183 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 184 | #' num_iter=2,tree_depth=2) 185 | #' deepboost.evaluate(dpb,data.frame(x1=rep(c(1,1,1,0),2),x2=rep(c(1,1,1,1),2))) 186 | #' @export 187 | deepboost.evaluate <- function(object, data) { 188 | model_stats <- 189 | Evaluate_R(data, 190 | object@model) 191 | return (model_stats) 192 | } 193 | 194 | #' Empty Deepboost S4 class object with default settings 195 | Deepboost <- new("Deepboost", 196 | examples = data.frame(), 197 | model =
list() 198 | ) 199 | 200 | #' Main function for deepboost model creation 201 | #' 202 | #' @param x A data.frame of samples' values 203 | #' @param y A vector of samples' labels 204 | #' @param instance_weights The weight of each example 205 | #' @param tree_depth maximum depth for a single decision tree in the model 206 | #' @param num_iter number of iterations = number of trees in ensemble 207 | #' @param beta regularisation for scores (L1) 208 | #' @param lambda regularisation for tree depth 209 | #' @param loss_type - "l" logistic, "e" exponential 210 | #' @param verbose - print extra data while training TRUE / FALSE 211 | #' @return A trained Deepboost model 212 | #' @examples 213 | #' deepboost.default(data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2)), 214 | #' factor(rep(c(0,0,0,1),2)),num_iter=1) 215 | #' deepboost.default(data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2)), 216 | #' factor(rep(c(0,0,0,1),2)), 217 | #' num_iter=2, beta=0.1, lambda=0.00125) 218 | #' @export 219 | deepboost.default <- function(x, y, instance_weights = NULL, 220 | tree_depth = 5, 221 | num_iter = 1, 222 | beta = 0.0, 223 | lambda= 0.05, 224 | loss_type = "l", 225 | verbose = TRUE 226 | ) { 227 | # initialize weights 228 | n <- nrow(x) 229 | if(is.null(instance_weights)) 230 | { 231 | instance_weights <- rep(1/n, n) 232 | } 233 | # make response either 1 or -1 234 | y <- factor(y) 235 | if (length(levels(y))!=2) 236 | { 237 | stop("ERROR_data : response must be binary" ) 238 | } 239 | classes = levels(y) 240 | levels(y) <- c(1,-1) 241 | # create data 242 | data <- data.frame(x) 243 | data['label'] <- y 244 | data['weight'] <- instance_weights 245 | 246 | fit <- deepboost.train(Deepboost, data, 247 | tree_depth, 248 | num_iter, 249 | beta, 250 | lambda, 251 | loss_type, 252 | verbose, 253 | classes) 254 | 255 | return (fit) 256 | } 257 | 258 | #' Main function for deepboost model creation 259 | #' 260 | #' @param formula An R Formula object see : ?formula 261 | #' @param data A data.frame of samples to train on 262 | #' @param instance_weights The weight of each example 263 | #' @param tree_depth maximum depth for a single decision tree in the model 264 | #' @param num_iter number of iterations = number of trees in ensemble 265 | #' @param beta regularisation for scores (L1) 266 | #' @param lambda regularisation for tree depth 267 | #' @param loss_type - "l" logistic, "e" exponential 268 | #' @param verbose - print extra data while training TRUE / FALSE 269 | #' @return A trained Deepboost model 270 | #' @examples 271 | #' deepboost(y ~ ., 272 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 273 | #' num_iter=1) 274 | #' deepboost(y ~ ., 275 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 276 | #' num_iter=2, beta=0.1, lambda=0.00125) 277 | #' @export 278 | deepboost <- function(formula, data, 279 | instance_weights = NULL, 280 | tree_depth = 5, 281 | num_iter = 1, 282 | beta = 0.0, 283 | lambda= 0.05, 284 | loss_type = "l", 285 | verbose = TRUE) { 286 | deepboost.formula(formula, data, 287 | instance_weights, 288 | tree_depth, 289 | num_iter, 290 | beta, 291 | lambda, 292 | loss_type, 293 | verbose) 294 | } 295 | 296 | #' Main function for deepboost model creation, using a formula 297 | #' 298 | #' @param formula An R Formula object see : ?formula 299 | #' @param data A data.frame of samples to train on 300 | #' @param instance_weights The weight of each example 301 | #' @param tree_depth maximum depth for a single
decision tree in the model 302 | #' @param num_iter number of iterations = number of trees in ensemble 303 | #' @param beta regularisation for scores (L1) 304 | #' @param lambda regularisation for tree depth 305 | #' @param loss_type - "l" logistic, "e" exponential 306 | #' @param verbose - print extra data while training TRUE / FALSE 307 | #' @return A trained Deepboost model 308 | #' @examples 309 | #' deepboost.formula(y ~ ., 310 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 311 | #' num_iter=1) 312 | #' deepboost.formula(y ~ ., 313 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 314 | #' num_iter=2, beta=0.1, lambda=0.00125) 315 | #' @export 316 | deepboost.formula <- function(formula, data, instance_weights = NULL, 317 | tree_depth = 5, 318 | num_iter = 1, 319 | beta = 0.0, 320 | lambda= 0.05, 321 | loss_type = "l", 322 | verbose = TRUE) { 323 | # initialize weights 324 | n <- nrow(data) 325 | if(is.null(instance_weights)) 326 | { 327 | instance_weights <- rep(1/n, n) 328 | } 329 | # parse formula 330 | cl <- match.call() 331 | mf <- match.call(expand.dots = FALSE) 332 | m <- match(c("formula", "data"), names(mf), 0L) 333 | mf <- mf[c(1L, m)] 334 | mf$drop.unused.levels <- TRUE 335 | mf[[1L]] <- quote(stats::model.frame) 336 | mf <- eval(mf, parent.frame()) 337 | mt <- attr(mf, "terms") 338 | y <- factor(model.response(mf)) 339 | x <- model.matrix(mt, mf, contrasts) 340 | # make response either 1 or -1 341 | if (length(levels(y))!=2) 342 | { 343 | stop("ERROR_data : response must be binary" ) 344 | } 345 | classes = levels(y) 346 | levels(y) <- c(1,-1) 347 | # create data 348 | data <- data.frame(x[,-1]) 349 | data['label'] <- y 350 | data['weight'] <- instance_weights 351 | 352 | fit <- deepboost.train(Deepboost, data, 353 | tree_depth, 354 | num_iter, 355 | beta, 356 | lambda, 357 | loss_type, 358 | verbose, 359 | classes) 360 | 361 | return (fit) 362 | } 363 | 364 | #' Predict method for Deepboost model 365 | #' 366 | #' Predicted values based on deepboost model object. 367 | #' 368 | #' @param object Object of class "Deepboost" 369 | #' @param newdata takes \code{data.frame}. 370 | #' @param type Type of prediction 371 | #' 372 | #' @details 373 | #' The \code{type} option selects the prediction type : "terms" returns 374 | #' class labels, "response" returns a matrix of 375 | #' class probabilities. 376 | #' @examples 377 | #' dpb <- deepboost(y ~ ., 378 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 379 | #' num_iter=2,tree_depth=2) 380 | #' predict(dpb,data.frame(x1=rep(c(1,1,1,0),2),x2=rep(c(1,1,1,1),2))) 381 | #' @export 382 | setMethod("predict", signature = "Deepboost", 383 | definition = function(object, newdata, type="terms") { 384 | deepboost.predict(object, newdata, type) 385 | }) 386 | 387 | #' Print method for Deepboost model 388 | #' Evaluates a trained deepboost model object.
389 | #' 390 | #' @param object Object of class "Deepboost" 391 | #' 392 | #' @details 393 | #' Prints : 394 | #' Model error: X 395 | #' Average tree size: Y 396 | #' Number of trees: Z 397 | #' @examples 398 | #' dpb <- deepboost(y ~ ., 399 | #' data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 400 | #' num_iter=2,tree_depth=2) 401 | #' print(dpb) 402 | #' @export 403 | setMethod("show", signature = "Deepboost", 404 | definition = function(object) { 405 | deepboost.print(object) 406 | }) 407 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deepboost modeling 2 | 3 | [![Travis-CI Build Status](https://travis-ci.org/dmarcous/CRAN_deepboost.svg?branch=master)](https://travis-ci.org/dmarcous/CRAN_deepboost) 4 | [![rstudio mirror downloads](http://cranlogs.r-pkg.org/badges/grand-total/deepboost)](https://github.com/metacran/cranlogs.app) 5 | [![cran version](http://www.r-pkg.org/badges/version/deepboost)](https://CRAN.R-project.org/package=deepboost) 6 | [![codecov.io](https://codecov.io/github/dmarcous/CRAN_deepboost/coverage.svg?branch=master)](https://codecov.io/github/dmarcous/CRAN_deepboost?branch=master) 7 | 8 | Provides deepboost model training, evaluation, prediction and hyper-parameter optimisation using grid search and cross validation. 9 | 10 | ## Details 11 | 12 | Based on Google's Deep Boosting algorithm by Cortes et al. 13 | 14 | See [this paper](http://jmlr.org/proceedings/papers/v32/cortesb14.pdf) for details. 15 | 16 | Adapted from Google's C++ deepboost implementation : 17 | 18 | https://github.com/google/deepboost 19 | 20 | Another version of the package, using the original unmodified algorithm, exists at : 21 | 22 | 23 | 24 | ## Installation 25 | 26 | From CRAN : 27 | 28 | install.packages("deepboost") 29 | 30 | ## Examples 31 | 32 | Choosing parameters for a deepboost model : 33 | 34 | best_params <- deepboost.gridSearch(formula, data) 35 | 36 | Training a deepboost model : 37 | 38 | boost <- deepboost(formula, data, 39 | num_iter = best_params[2][[1]], 40 | beta = best_params[3][[1]], 41 | lambda = best_params[4][[1]], 42 | loss_type = best_params[5][[1]] 43 | ) 44 | 45 | Print trained model evaluation statistics : 46 | 47 | print(boost) 48 | 49 | Classifying using a trained deepboost model : 50 | 51 | labels <- predict(boost, newdata) 52 | 53 | See Help / demo directory for advanced usage.
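Predicting class probabilities instead of labels (a minimal sketch; `boost` and `newdata` are the objects from the examples above, and per `?deepboost.predict` the returned matrix holds the probability of the first class level in column 1 and its complement in column 2) :

    probs <- predict(boost, newdata, type = "response")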
54 | 55 | ## Credits 56 | 57 | R Package written and maintained by : 58 | 59 | Daniel Marcous 60 | 61 | Yotam Sandbank 62 | -------------------------------------------------------------------------------- /data/adult.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/adult.rda -------------------------------------------------------------------------------- /data/australian.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/australian.rda -------------------------------------------------------------------------------- /data/banana.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/banana.rda -------------------------------------------------------------------------------- /data/bupa.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/bupa.rda -------------------------------------------------------------------------------- /data/coli2000.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/coli2000.rda -------------------------------------------------------------------------------- /data/haberman.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/haberman.rda -------------------------------------------------------------------------------- /data/heart.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/heart.rda -------------------------------------------------------------------------------- /data/magic.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/magic.rda -------------------------------------------------------------------------------- /data/pima.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/pima.rda -------------------------------------------------------------------------------- /data/sonar.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmarcous/CRAN_deepboost/bfe7dbf824e4fbbe85a8036e04a8f7d689529044/data/sonar.rda -------------------------------------------------------------------------------- /deepboost.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | 
StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,collate,namespace,vignette 23 | -------------------------------------------------------------------------------- /demo/00Index: -------------------------------------------------------------------------------- 1 | experiments Comparing Deepboost to Adaboost over sample datasets 2 | deepboostGrid Finding optimised parameters for Deepboost using grid search and 10-fold CV 3 | -------------------------------------------------------------------------------- /demo/deepboostGrid.R: -------------------------------------------------------------------------------- 1 | library(deepboost) 2 | 3 | data("sonar") 4 | formula <- R ~ . 5 | best_params <- 6 | deepboost.gridSearch(formula, sonar) 7 | 8 | boost <- deepboost(formula, sonar, 9 | num_iter = best_params[2][[1]], 10 | beta = best_params[3][[1]], 11 | lambda = best_params[4][[1]], 12 | loss_type = best_params[5][[1]] 13 | ) 14 | 15 | print(boost) 16 | 17 | preds <- predict(boost, sonar) 18 | -------------------------------------------------------------------------------- /demo/experiments.R: -------------------------------------------------------------------------------- 1 | 2 | library(caret) 3 | library(ada) 4 | library(deepboost) 5 | 6 | # read datasets 7 | data("adult") 8 | data("australian") 9 | data("banana") 10 | data("bupa") 11 | data("coli2000") 12 | data("haberman") 13 | data("heart") 14 | data("magic") 15 | data("pima") 16 | data("sonar") 17 | 18 | # create lists of datasets and formulas 19 | datasets <- list(adult=adult, aust=australian, banana=banana, bupa=bupa, coli=coli2000, 20 | haber=haberman, heart=heart, magic=magic, pima=pima, sonar=sonar) 21 | formulas <- list(X..50K ~ X39 + X77516 + X13 + X2174 + X0 + X40, 22 | X0.3 ~ ., 23 | X.1.0 ~ ., 24 | X1 ~ ., 25 | X0.45 ~ ., 26 | negative ~ ., 27 | X2.2 ~ ., 28 | g ~ ., 29 | tested_positive ~ ., 30 | R ~ .)
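# Experimental protocol, summarising the loops below: for each ensemble size
# and each selected dataset, run 5 repetitions of 10-fold CV; within each
# repetition the neighbouring fold l = (k %% 10) + 1 is used to tune
# hyper-parameters (nu for adaboost, beta/lambda for deepboost), while fold k
# is held out for the timed accuracy measurement.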
31 | 32 | results <- data.frame(dataset = numeric(0), ensemble_size = numeric(0), ada_acc = numeric(0), ada_sd = numeric(0), 33 | ada_time = numeric(0), deep_acc = numeric(0), deep_sd = numeric(0), deep_time = numeric(0), 34 | t_test = numeric(0)) 35 | # for each number of iterations 36 | for(num_iter in c(5,10,20,50)){ 37 | # for each data set 38 | for(i in c(2,4,6,7,9,10)){ 39 | ds <- datasets[[i]] 40 | levels(ds[,length(ds)]) <- c(1,-1) 41 | formula <- formulas[[i]] 42 | ada_acc <- rep(0,5) 43 | deep_acc <- rep(0,5) 44 | ada_t <- 0 45 | deep_t <- 0 46 | # 5 different 10-fold splits 47 | for(j in 1:5){ 48 | flds <- createFolds(1:nrow(ds), k = 10) 49 | for(k in 1:10){ 50 | l <- (k%%10)+1 51 | eval_train <- ds[-flds[[l]],] 52 | eval_test <- ds[flds[[l]],] 53 | train <- ds[-flds[[k]],] 54 | test <- ds[flds[[k]],] 55 | 56 | beta_vals = c(2^-0, 2^-1, 2^-2, 2^-3, 2^-4, 2^-5, 2^-6) 57 | lambda_vals = c(0.0001, 0.005, 0.01, 0.05, 0.1, 0.5) 58 | dpbGrid <- expand.grid(beta = beta_vals, 59 | lambda = lambda_vals) 60 | 61 | # train ADABOOST 62 | best_acc = 0 63 | best_nu = 0 64 | for(nu in beta_vals){ 65 | eval_model <- ada(formula, eval_train, iter = num_iter, nu=nu) 66 | acc <- sum(predict(eval_model, eval_test) == eval_test[,length(eval_test)]) / nrow(eval_test) 67 | if(acc > best_acc){ 68 | best_acc <- acc 69 | best_nu <- nu 70 | } 71 | } 72 | 73 | t <- Sys.time() 74 | ab_model <- ada(formula, train, iter = num_iter, nu=best_nu) 75 | ada_acc[j] <- ada_acc[j] + sum(predict(ab_model, test) == test[,length(test)]) / nrow(test) 76 | ada_t <- ada_t + round(difftime(Sys.time(), t, units = "secs"), 2) 77 | 78 | 79 | # train DEEPBOOST 80 | best_acc = 0 81 | best_lambda = 0 82 | best_beta = 0 83 | for(grow in 1:nrow(dpbGrid)){ 84 | beta <- dpbGrid[grow,"beta"] 85 | lambda <- dpbGrid[grow,"lambda"] 86 | eval_model <- deepboost.formula(formula, eval_train, num_iter = num_iter, beta = beta, lambda = lambda, verbose = F) 87 | acc <- sum(predict(eval_model, eval_test) == eval_test[,length(eval_test)]) / nrow(eval_test) 88 | if(acc > best_acc){ 89 | best_acc <- acc 90 | best_lambda <- lambda 91 | best_beta <- beta 92 | } 93 | } 94 | 95 | t <- Sys.time() 96 | db_model <- deepboost.formula(formula, train, num_iter = num_iter, beta = best_beta, lambda = best_lambda, verbose = F) 97 | deep_acc[j] <- deep_acc[j] + sum(predict(db_model, test) == test[,length(test)]) / nrow(test) 98 | deep_t <- deep_t + round(difftime(Sys.time(), t, units = "secs"), 2) 99 | } 100 | ada_acc[j] <- ada_acc[j]/10.0 101 | deep_acc[j] <- deep_acc[j]/10.0 102 | } 103 | # calculate results 104 | ada_acc_mean <- round(mean(ada_acc), 4) 105 | #ada_auc_mean <- mean(ada_auc) 106 | deep_acc_mean <- round(mean(deep_acc), 4) 107 | #deep_auc_mean <- mean(deep_auc) 108 | ada_acc_sd <- round(sd(ada_acc), 6) 109 | #ada_auc_sd <- sd(ada_auc) 110 | deep_acc_sd <- round(sd(deep_acc), 6) 111 | #deep_auc_sd <- sd(deep_auc) 112 | acc_t_test <- t.test(ada_acc, deep_acc, paired=TRUE)$p.value < 0.05 113 | #auc_t_test <- t.test(ada_auc, deep_auc, paired=TRUE)$p.value < 0.05 114 | 115 | # print to file 116 | fname <- paste('./', names(datasets)[i], num_iter, ".res", sep='') 117 | res <- data.frame(dataset = names(datasets)[i], ensemble_size = num_iter, ada_acc = ada_acc_mean, 118 | ada_sd = ada_acc_sd, ada_time = ada_t, deep_acc = deep_acc_mean, 119 | deep_sd = deep_acc_sd, deep_time = deep_t, t_test = acc_t_test) 120 | write.csv(res, fname, row.names = FALSE) 121 | print(paste(ada_t+deep_t, 'seconds for dataset:', names(datasets)[i], ',ensemble size:', num_iter)) 122
| results <- rbind(results, res) 123 | } 124 | } 125 | write.csv(results, './results.txt', row.names = FALSE) 126 | -------------------------------------------------------------------------------- /man/Deepboost-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \docType{class} 4 | \name{Deepboost-class} 5 | \alias{Deepboost-class} 6 | \title{An S4 class to represent a deepboost model.} 7 | \description{ 8 | An S4 class to represent a deepboost model. 9 | } 10 | \section{Slots}{ 11 | 12 | \describe{ 13 | \item{\code{tree_depth}}{maximum depth for a single decision tree in the model} 14 | 15 | \item{\code{num_iter}}{number of iterations = number of trees in ensemble} 16 | 17 | \item{\code{beta}}{regularisation for scores (L1)} 18 | 19 | \item{\code{lambda}}{regularisation for tree depth} 20 | 21 | \item{\code{loss_type}}{"l" logistic, "e" exponential} 22 | 23 | \item{\code{verbose}}{print extra data while training TRUE / FALSE} 24 | 25 | \item{\code{examples}}{data.frame with instances used for model training} 26 | 27 | \item{\code{model}}{Deepboost model as used by C code serialised to R List} 28 | 29 | \item{\code{classes}}{a vector of factors representing the classes used for classification with this model} 30 | }} 31 | 32 | -------------------------------------------------------------------------------- /man/Deepboost.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost} 4 | \alias{deepboost} 5 | \title{Main function for deepboost model creation} 6 | \usage{ 7 | deepboost(formula, data, instance_weights = NULL, tree_depth = 5, 8 | num_iter = 1, beta = 0, lambda = 0.05, loss_type = "l", 9 | verbose = TRUE) 10 | } 11 | \arguments{ 12 | \item{formula}{An R Formula object see : ?formula} 13 | 14 | \item{data}{A data.frame of samples to train on} 15 | 16 | \item{instance_weights}{The weight of each example} 17 | 18 | \item{tree_depth}{maximum depth for a single decision tree in the model} 19 | 20 | \item{num_iter}{number of iterations = number of trees in ensemble} 21 | 22 | \item{beta}{regularisation for scores (L1)} 23 | 24 | \item{lambda}{regularisation for tree depth} 25 | 26 | \item{loss_type}{- "l" logistic, "e" exponential} 27 | 28 | \item{verbose}{- print extra data while training TRUE / FALSE} 29 | } 30 | \value{ 31 | A trained Deepboost model 32 | } 33 | \description{ 34 | Main function for deepboost model creation 35 | } 36 | \examples{ 37 | deepboost(y ~ ., 38 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 39 | num_iter=1) 40 | deepboost(y ~ ., 41 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 42 | num_iter=2, beta=0.1, lambda=0.00125) 43 | } 44 | -------------------------------------------------------------------------------- /man/adult.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{adult} 5 | \alias{adult} 6 | \title{Adult humans} 7 | \format{A data frame with 32560 rows and 15 variables: 8 | \describe{ 9 | \item{Adm.clerical}{unknown} 10 | \item{Bachelors}{person has a bachelor's degree} 11 | \item{Male}{gender} 12 | \item{Never.married}{person was never married} 13 |
\item{Not.in.family}{is person a part of a family} 14 | \item{State.gov}{state} 15 | \item{United.States}{is from the United States} 16 | \item{White}{is white} 17 | \item{X..50K}{unknown} 18 | \item{X0}{unknown} 19 | \item{X13}{unknown} 20 | \item{X2174}{unknown} 21 | \item{X39}{unknown} 22 | \item{X40}{unknown} 23 | \item{X77516}{unknown} 24 | }} 25 | \source{ 26 | \url{https://archive.ics.uci.edu/ml/datasets/Adult/} 27 | } 28 | \usage{ 29 | adult 30 | } 31 | \description{ 32 | A dataset containing adult population personal details 33 | } 34 | \keyword{datasets} 35 | -------------------------------------------------------------------------------- /man/australian.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{australian} 5 | \alias{australian} 6 | \title{Australian} 7 | \format{An object of class \code{data.frame} with 689 rows and 15 columns.} 8 | \usage{ 9 | australian 10 | } 11 | \description{ 12 | Australian 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/banana.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{banana} 5 | \alias{banana} 6 | \title{banana} 7 | \format{An object of class \code{data.frame} with 5299 rows and 3 columns.} 8 | \usage{ 9 | banana 10 | } 11 | \description{ 12 | banana 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/bupa.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{bupa} 5 | \alias{bupa} 6 | \title{bupa} 7 | \format{An object of class \code{data.frame} with 344 rows and 7 columns.} 8 | \usage{ 9 | bupa 10 | } 11 | \description{ 12 | bupa 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/coli2000.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{coli2000} 5 | \alias{coli2000} 6 | \title{coli2000} 7 | \format{An object of class \code{data.frame} with 9821 rows and 86 columns.} 8 | \usage{ 9 | coli2000 10 | } 11 | \description{ 12 | coli2000 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/deepboost.default.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.default} 4 | \alias{deepboost.default} 5 | \title{Main function for deepboost model creation} 6 | \usage{ 7 | deepboost.default(x, y, instance_weights = NULL, tree_depth = 5, 8 | num_iter = 1, beta = 0, lambda = 0.05, loss_type = "l", 9 | verbose = TRUE) 10 | } 11 | \arguments{ 12 | \item{x}{A data.frame of samples' values} 13 | 14 | \item{y}{A vector of samples' labels} 15 | 16 | \item{instance_weights}{The weight of each example} 17 | 18 | \item{tree_depth}{maximum depth for a single decision tree in
the model} 19 | 20 | \item{num_iter}{number of iterations = number of trees in ensemble} 21 | 22 | \item{beta}{regularisation for scores (L1)} 23 | 24 | \item{lambda}{regularisation for tree depth} 25 | 26 | \item{loss_type}{- "l" logistic, "e" exponential} 27 | 28 | \item{verbose}{- print extra data while training TRUE / FALSE} 29 | } 30 | \value{ 31 | A trained Deepboost model 32 | } 33 | \description{ 34 | Main function for deepboost model creation 35 | } 36 | \examples{ 37 | deepboost.default(data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2)), 38 | factor(rep(c(0,0,0,1),2)),num_iter=1) 39 | deepboost.default(data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2)), 40 | factor(rep(c(0,0,0,1),2)), 41 | num_iter=2, beta=0.1, lambda=0.00125) 42 | } 43 | -------------------------------------------------------------------------------- /man/deepboost.evaluate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.evaluate} 4 | \alias{deepboost.evaluate} 5 | \title{Evaluates and prints statistics for a deepboost model} 6 | \usage{ 7 | deepboost.evaluate(object, data) 8 | } 9 | \arguments{ 10 | \item{object}{A Deepboost S4 class object} 11 | 12 | \item{data}{a \code{data.frame} object to evaluate with the model} 13 | } 14 | \value{ 15 | a list with model statistics - error, avg_tree_size, num_trees 16 | } 17 | \description{ 18 | Evaluates and prints statistics for a deepboost model 19 | } 20 | \examples{ 21 | dpb <- deepboost(y ~ ., 22 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 23 | num_iter=2,tree_depth=2) 24 | deepboost.evaluate(dpb,data.frame(x1=rep(c(1,1,1,0),2),x2=rep(c(1,1,1,1),2))) 25 | } 26 | -------------------------------------------------------------------------------- /man/deepboost.formula.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.formula} 4 | \alias{deepboost.formula} 5 | \title{Main function for deepboost model creation, using a formula} 6 | \usage{ 7 | deepboost.formula(formula, data, instance_weights = NULL, tree_depth = 5, 8 | num_iter = 1, beta = 0, lambda = 0.05, loss_type = "l", 9 | verbose = TRUE) 10 | } 11 | \arguments{ 12 | \item{formula}{An R Formula object see : ?formula} 13 | 14 | \item{data}{A data.frame of samples to train on} 15 | 16 | \item{instance_weights}{The weight of each example} 17 | 18 | \item{tree_depth}{maximum depth for a single decision tree in the model} 19 | 20 | \item{num_iter}{number of iterations = number of trees in ensemble} 21 | 22 | \item{beta}{regularisation for scores (L1)} 23 | 24 | \item{lambda}{regularisation for tree depth} 25 | 26 | \item{loss_type}{- "l" logistic, "e" exponential} 27 | 28 | \item{verbose}{- print extra data while training TRUE / FALSE} 29 | } 30 | \value{ 31 | A trained Deepboost model 32 | } 33 | \description{ 34 | Main function for deepboost model creation, using a formula 35 | } 36 | \examples{ 37 | deepboost.formula(y ~ ., 38 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 39 | num_iter=1) 40 | deepboost.formula(y ~ ., 41 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 42 | num_iter=2, beta=0.1, lambda=0.00125) 43 | } 44 |
-------------------------------------------------------------------------------- /man/deepboost.gridSearch.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-grid-search.R 3 | \name{deepboost.gridSearch} 4 | \alias{deepboost.gridSearch} 5 | \title{Returns optimised parameter list for deepboost model on given data} 6 | \usage{ 7 | deepboost.gridSearch(formula, data, k = 10, seed = 666, logging_level = 1) 8 | } 9 | \arguments{ 10 | \item{formula}{An R Formula object see : ?formula} 11 | 12 | \item{data}{input data.frame as training for model} 13 | 14 | \item{k}{number of folds (default = 10) for cross validation optimisation} 15 | 16 | \item{seed}{for random split to train / test (default 666)} 17 | 18 | \item{logging_level}{print extra data while training 0 - no data, 1 - gridSearch data (default), 2 - all data} 19 | } 20 | \value{ 21 | vector with average accuracy for chosen parameters, and a list of the best parameter combination: (accuracy, (num_iter, beta, lambda, loss_type)) 22 | } 23 | \description{ 24 | Returns optimised parameter list for deepboost model on given data 25 | } 26 | \details{ 27 | Finds optimised parameters for deepboost training 28 | using grid search techniques over: 29 | - predefined, battle-tested candidate parameter values 30 | - cross validation over k folds 31 | } 32 | \examples{ 33 | deepboost.gridSearch(y ~ ., 34 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), k=2) 35 | } 36 | -------------------------------------------------------------------------------- /man/deepboost.predict.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.predict} 4 | \alias{deepboost.predict} 5 | \title{Predicts instances responses based on a deepboost model} 6 | \usage{ 7 | deepboost.predict(object, newdata, type = "terms") 8 | } 9 | \arguments{ 10 | \item{object}{A Deepboost S4 class object} 11 | 12 | \item{newdata}{A data.frame to predict responses for} 13 | 14 | \item{type}{Type of prediction : "terms" - for class labels, "response" for probabilities} 15 | } 16 | \value{ 17 | A vector of responses 18 | } 19 | \description{ 20 | Predicts instances responses based on a deepboost model 21 | } 22 | \examples{ 23 | dpb <- deepboost(y ~ ., 24 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 25 | num_iter=2,tree_depth=2) 26 | deepboost.predict(dpb,data.frame(x1=rep(c(1,1,1,0),5),x2=rep(c(1,1,1,1),5))) 27 | } 28 | -------------------------------------------------------------------------------- /man/deepboost.print.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.print} 4 | \alias{deepboost.print} 5 | \title{Evaluates and prints statistics for a deepboost model on the train set} 6 | \usage{ 7 | deepboost.print(object) 8 | } 9 | \arguments{ 10 | \item{object}{A Deepboost S4 class object} 11 | } 12 | \value{ 13 | A list with model statistics; prints the model evaluation strings to the console 14 | } 15 | \description{ 16 | Evaluates and prints statistics for a deepboost model on the train set 17 | } 18 | \examples{ 19 | dpb <- deepboost(y ~ ., 20 |
data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 21 | num_iter=2,tree_depth=2) 22 | deepboost.print(dpb) 23 | } 24 | -------------------------------------------------------------------------------- /man/deepboost.train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \name{deepboost.train} 4 | \alias{deepboost.train} 5 | \title{Trains a deepboost model} 6 | \usage{ 7 | deepboost.train(object, data, tree_depth, num_iter, beta, lambda, loss_type, 8 | verbose, classes) 9 | } 10 | \arguments{ 11 | \item{object}{A Deepboost S4 class object} 12 | 13 | \item{data}{input data.frame as training for model} 14 | 15 | \item{tree_depth}{maximum depth for a single decision tree in the model} 16 | 17 | \item{num_iter}{number of iterations = number of trees in ensemble} 18 | 19 | \item{beta}{regularisation for scores (L1)} 20 | 21 | \item{lambda}{regularisation for tree depth} 22 | 23 | \item{loss_type}{- "l" logistic, "e" exponential} 24 | 25 | \item{verbose}{- print extra data while training TRUE / FALSE} 26 | 27 | \item{classes}{a vector of factors representing the classes used for classification with this model} 28 | } 29 | \value{ 30 | A trained Deepboost model 31 | } 32 | \description{ 33 | Trains a deepboost model 34 | } 35 | \details{ 36 | (beta,lambda) = (0,0) - adaboost, (>0,0) - L1, (0,>0) deepboost, (>0, >0) deepboost+L1 37 | } 38 | -------------------------------------------------------------------------------- /man/haberman.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{haberman} 5 | \alias{haberman} 6 | \title{haberman} 7 | \format{An object of class \code{data.frame} with 305 rows and 4 columns.} 8 | \usage{ 9 | haberman 10 | } 11 | \description{ 12 | haberman 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/heart.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{heart} 5 | \alias{heart} 6 | \title{heart} 7 | \format{An object of class \code{data.frame} with 269 rows and 14 columns.} 8 | \usage{ 9 | heart 10 | } 11 | \description{ 12 | heart 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/magic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{magic} 5 | \alias{magic} 6 | \title{magic} 7 | \format{An object of class \code{data.frame} with 19019 rows and 11 columns.} 8 | \usage{ 9 | magic 10 | } 11 | \description{ 12 | magic 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/pima.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{pima} 5 | \alias{pima} 6 | \title{pima} 7 | \format{An object of class \code{data.frame} with 767 rows and 9 columns.} 8 |
\usage{ 9 | pima 10 | } 11 | \description{ 12 | pima 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /man/predict-Deepboost-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \docType{methods} 4 | \name{predict,Deepboost-method} 5 | \alias{predict,Deepboost-method} 6 | \title{Predict method for Deepboost model} 7 | \usage{ 8 | \S4method{predict}{Deepboost}(object, newdata, type = "terms") 9 | } 10 | \arguments{ 11 | \item{object}{Object of class "Deepboost"} 12 | 13 | \item{newdata}{takes \code{data.frame}.} 14 | 15 | \item{type}{Type of prediction} 16 | } 17 | \description{ 18 | Predicted values based on deepboost model object. 19 | } 20 | \details{ 21 | The \code{type} option selects the prediction type : "terms" returns 22 | class labels, "response" returns a matrix of 23 | class probabilities. 24 | } 25 | \examples{ 26 | dpb <- deepboost(y ~ ., 27 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 28 | num_iter=2,tree_depth=2) 29 | predict(dpb,data.frame(x1=rep(c(1,1,1,0),2),x2=rep(c(1,1,1,1),2))) 30 | } 31 | -------------------------------------------------------------------------------- /man/show-Deepboost-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost.R 3 | \docType{methods} 4 | \name{show,Deepboost-method} 5 | \alias{show,Deepboost-method} 6 | \title{Print method for Deepboost model 7 | Evaluates a trained deepboost model object.} 8 | \usage{ 9 | \S4method{show}{Deepboost}(object) 10 | } 11 | \arguments{ 12 | \item{object}{Object of class "Deepboost"} 13 | } 14 | \description{ 15 | Print method for Deepboost model 16 | Evaluates a trained deepboost model object.
17 | } 18 | \details{ 19 | Prints : 20 | Model error: X 21 | Average tree size: Y 22 | Number of trees: Z 23 | } 24 | \examples{ 25 | dpb <- deepboost(y ~ ., 26 | data.frame(x1=rep(c(0,0,1,1),2),x2=rep(c(0,1,0,1),2),y=factor(rep(c(0,0,0,1),2))), 27 | num_iter=2,tree_depth=2) 28 | print(dpb) 29 | } 30 | -------------------------------------------------------------------------------- /man/sonar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deepboost-data.R 3 | \docType{data} 4 | \name{sonar} 5 | \alias{sonar} 6 | \title{sonar} 7 | \format{An object of class \code{data.frame} with 207 rows and 61 columns.} 8 | \usage{ 9 | sonar 10 | } 11 | \description{ 12 | sonar 13 | } 14 | \keyword{datasets} 15 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | # require c++11 2 | CXX_STD = CXX11 3 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | # require c++11 2 | CXX_STD = CXX11 3 | 4 | OBJECTS = ./tree.o ./boost.o ./deepboost_C.o ./deepboost_converters.o ./deepboost_R.o ./RcppExports.o 5 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include <Rcpp.h> 5 | 6 | using namespace Rcpp; 7 | 8 | // Train_R 9 | Rcpp::List Train_R(DataFrame data, int tree_depth, int num_iter, double beta, double lambda, char loss_type, bool verbose); 10 | RcppExport SEXP _deepboost_Train_R(SEXP dataSEXP, SEXP tree_depthSEXP, SEXP num_iterSEXP, SEXP betaSEXP, SEXP lambdaSEXP, SEXP loss_typeSEXP, SEXP verboseSEXP) { 11 | BEGIN_RCPP 12 | Rcpp::RObject rcpp_result_gen; 13 | Rcpp::RNGScope rcpp_rngScope_gen; 14 | Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP); 15 | Rcpp::traits::input_parameter< int >::type tree_depth(tree_depthSEXP); 16 | Rcpp::traits::input_parameter< int >::type num_iter(num_iterSEXP); 17 | Rcpp::traits::input_parameter< double >::type beta(betaSEXP); 18 | Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); 19 | Rcpp::traits::input_parameter< char >::type loss_type(loss_typeSEXP); 20 | Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); 21 | rcpp_result_gen = Rcpp::wrap(Train_R(data, tree_depth, num_iter, beta, lambda, loss_type, verbose)); 22 | return rcpp_result_gen; 23 | END_RCPP 24 | } 25 | // Predict_R 26 | Rcpp::List Predict_R(DataFrame newdata, Rcpp::List model); 27 | RcppExport SEXP _deepboost_Predict_R(SEXP newdataSEXP, SEXP modelSEXP) { 28 | BEGIN_RCPP 29 | Rcpp::RObject rcpp_result_gen; 30 | Rcpp::RNGScope rcpp_rngScope_gen; 31 | Rcpp::traits::input_parameter< DataFrame >::type newdata(newdataSEXP); 32 | Rcpp::traits::input_parameter< Rcpp::List >::type model(modelSEXP); 33 | rcpp_result_gen = Rcpp::wrap(Predict_R(newdata, model)); 34 | return rcpp_result_gen; 35 | END_RCPP 36 | } 37 | // PredictProbabilities_R 38 | Rcpp::List PredictProbabilities_R(DataFrame newdata, Rcpp::List model); 39 | RcppExport SEXP _deepboost_PredictProbabilities_R(SEXP newdataSEXP, SEXP modelSEXP) { 40 | BEGIN_RCPP 41 |
Rcpp::RObject rcpp_result_gen; 42 | Rcpp::RNGScope rcpp_rngScope_gen; 43 | Rcpp::traits::input_parameter< DataFrame >::type newdata(newdataSEXP); 44 | Rcpp::traits::input_parameter< Rcpp::List >::type model(modelSEXP); 45 | rcpp_result_gen = Rcpp::wrap(PredictProbabilities_R(newdata, model)); 46 | return rcpp_result_gen; 47 | END_RCPP 48 | } 49 | // Evaluate_R 50 | Rcpp::List Evaluate_R(DataFrame data, Rcpp::List model); 51 | RcppExport SEXP _deepboost_Evaluate_R(SEXP dataSEXP, SEXP modelSEXP) { 52 | BEGIN_RCPP 53 | Rcpp::RObject rcpp_result_gen; 54 | Rcpp::RNGScope rcpp_rngScope_gen; 55 | Rcpp::traits::input_parameter< DataFrame >::type data(dataSEXP); 56 | Rcpp::traits::input_parameter< Rcpp::List >::type model(modelSEXP); 57 | rcpp_result_gen = Rcpp::wrap(Evaluate_R(data, model)); 58 | return rcpp_result_gen; 59 | END_RCPP 60 | } 61 | 62 | static const R_CallMethodDef CallEntries[] = { 63 | {"_deepboost_Train_R", (DL_FUNC) &_deepboost_Train_R, 7}, 64 | {"_deepboost_Predict_R", (DL_FUNC) &_deepboost_Predict_R, 2}, 65 | {"_deepboost_PredictProbabilities_R", (DL_FUNC) &_deepboost_PredictProbabilities_R, 2}, 66 | {"_deepboost_Evaluate_R", (DL_FUNC) &_deepboost_Evaluate_R, 2}, 67 | {NULL, NULL, 0} 68 | }; 69 | 70 | RcppExport void R_init_deepboost(DllInfo *dll) { 71 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 72 | R_useDynamicSymbols(dll, FALSE); 73 | } 74 | -------------------------------------------------------------------------------- /src/boost.cc: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | #include "boost.h" 18 | 19 | #include <cmath> 20 | //#include <iostream> 21 | #include <utility> 22 | 23 | #include "tree.h" 24 | 25 | 26 | float ComputeEta(float wgtd_error, float tree_size, float alpha, float beta, float lambda) { 27 | wgtd_error = std::fmax(wgtd_error, kTolerance); // Helps with division by zero. 28 | const float error_term = 29 | (1 - wgtd_error) * std::exp(alpha) - wgtd_error * std::exp(-alpha); 30 | const float complexity_penalty = ComplexityPenalty(tree_size, beta, lambda); 31 | const float ratio = complexity_penalty / wgtd_error; 32 | float eta; 33 | if (std::fabs(error_term) <= 2 * complexity_penalty) { 34 | eta = -alpha; 35 | } else if (error_term > 2 * complexity_penalty) { 36 | eta = std::log(-ratio + std::sqrt(ratio * ratio + (1 - wgtd_error)/wgtd_error)); 37 | } else { 38 | eta = std::log(ratio + std::sqrt(ratio * ratio + (1 - wgtd_error)/wgtd_error)); 39 | } 40 | return eta; 41 | } 42 | 43 | // TODO(usyed): examples is passed by non-const reference because the example 44 | // weights need to be changed. This is bad style. 
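// The per-iteration flow of the function below, step by step:
//   1. On the first call, initialize the loss-specific weight normalizer.
//   2. Scan the trees already in the model and keep the one whose objective
//      gradient has the largest magnitude (the most promising coordinate).
//   3. Greedily grow one new candidate tree; it replaces the incumbent if
//      its gradient magnitude is strictly larger (or the model is empty).
//   4. Step the winner's weight by ComputeEta(): a new tree is appended with
//      weight eta, an existing tree gets eta added to its current alpha.
//   5. Re-weight every example according to its margin under the updated
//      model, then renormalize so the weights again sum to one.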
45 | void AddTreeToModel(vector<Example>& examples, Model* model, char loss_type, float beta, float lambda, int tree_depth) { 46 | // Initialize normalizer 47 | static float normalizer; 48 | if (model->empty()) { 49 | if (loss_type == 'e') { 50 | normalizer = std::exp(1) * static_cast<float>(examples.size()); 51 | } else if (loss_type == 'l') { 52 | normalizer = 53 | static_cast<float>(examples.size()) / (std::log(2) * (1 + std::exp(-1))); 54 | } 55 | } 56 | InitializeTreeData(examples, normalizer); 57 | int best_old_tree_idx = -1; 58 | float wgtd_error, gradient, best_wgtd_error = 0, best_gradient = 0; 59 | 60 | // Find best old tree 61 | bool old_tree_is_best = false; 62 | for (int i = 0; i < model->size(); ++i) { 63 | const float alpha = (*model)[i].first; 64 | if (std::fabs(alpha) < kTolerance) continue; // Skip zeroed-out weights. 65 | const Tree& old_tree = (*model)[i].second; 66 | wgtd_error = EvaluateTreeWgtd(examples, old_tree); 67 | int sign_edge = (wgtd_error >= 0.5) ? 1 : -1; 68 | gradient = Gradient(wgtd_error, old_tree.size(), alpha, sign_edge, beta, lambda); 69 | if (std::fabs(gradient) >= std::fabs(best_gradient)) { 70 | best_gradient = gradient; 71 | best_wgtd_error = wgtd_error; 72 | best_old_tree_idx = i; 73 | old_tree_is_best = true; 74 | } 75 | } 76 | 77 | // Find best new tree 78 | Tree new_tree = TrainTree(examples, beta, lambda, tree_depth); 79 | wgtd_error = EvaluateTreeWgtd(examples, new_tree); 80 | gradient = Gradient(wgtd_error, new_tree.size(), 0, -1, beta, lambda); 81 | if (model->empty() || std::fabs(gradient) > std::fabs(best_gradient)) { 82 | best_gradient = gradient; 83 | best_wgtd_error = wgtd_error; 84 | old_tree_is_best = false; 85 | } 86 | 87 | // Update model weights 88 | float alpha = 0; 89 | const Tree* tree; 90 | if (old_tree_is_best) { 91 | alpha = (*model)[best_old_tree_idx].first; 92 | tree = &((*model)[best_old_tree_idx].second); 93 | } else { 94 | alpha = 0; 95 | tree = &(new_tree); 96 | } 97 | const float eta = ComputeEta(best_wgtd_error, tree->size(), alpha, beta, lambda); 98 | if (old_tree_is_best) { 99 | (*model)[best_old_tree_idx].first += eta; 100 | } else { 101 | model->push_back(make_pair(eta, new_tree)); 102 | } 103 | 104 | // Update example weights and compute normalizer 105 | const float old_normalizer = normalizer; 106 | normalizer = 0; 107 | for (Example& example : examples) { 108 | const float u = eta * example.label * ClassifyExample(example, *tree); 109 | if (loss_type == 'e') { 110 | example.weight = example.weight * std::exp(-u) * old_normalizer; 111 | } else if (loss_type == 'l') { 112 | const float z = (1 - std::log(2) * example.weight * old_normalizer) / 113 | (std::log(2) * example.weight * old_normalizer); 114 | example.weight = 1 / (std::log(2) * (1 + z * std::exp(u))); 115 | } 116 | normalizer += example.weight; 117 | } 118 | 119 | // Renormalize example weights 120 | // TODO(usyed): Two loops is inefficient. 
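// Dividing by the freshly accumulated normalizer restores the invariant that
// the example weights sum to one, so the next iteration's tree growing and
// weighted-error computations see a proper distribution.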
for (Example& example : examples) { 122 | example.weight /= normalizer; 123 | } 124 | } 125 | 126 | Probability ComputeExampleClassProbability(const Example& example, const Model& model) { 127 | float score = 0; 128 | float sumOfWeights = 0; 129 | float probability = 0; 130 | for (const pair<float, Tree>& wgtd_tree : model) { 131 | score += wgtd_tree.first * ClassifyExample(example, wgtd_tree.second); 132 | sumOfWeights += wgtd_tree.first; 133 | } 134 | probability = ((score/sumOfWeights) + 1) / 2.0; 135 | return probability; 136 | } 137 | 138 | Label ClassifyExample(const Example& example, const Model& model) { 139 | float score = 0; 140 | score = ComputeExampleClassProbability(example, model); 141 | if (score < 0.5) { 142 | return -1; 143 | } else { 144 | return 1; 145 | } 146 | } 147 | 148 | void EvaluateModel(const vector<Example>& examples, const Model& model, 149 | float* error, float* avg_tree_size, int* num_trees) { 150 | float incorrect = 0; 151 | for (const Example& example : examples) { 152 | if (example.label != ClassifyExample(example, model)) { 153 | ++incorrect; 154 | } 155 | } 156 | *num_trees = 0; 157 | int sum_tree_size = 0; 158 | for (const pair<float, Tree>& wgtd_tree : model) { 159 | if (std::fabs(wgtd_tree.first) >= kTolerance) { 160 | ++(*num_trees); 161 | sum_tree_size += wgtd_tree.second.size(); 162 | } 163 | } 164 | *error = (incorrect / examples.size()); 165 | *avg_tree_size = static_cast<float>(sum_tree_size) / *num_trees; 166 | } 167 | -------------------------------------------------------------------------------- /src/boost.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Google Inc. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | #ifndef BOOST_H_ 18 | #define BOOST_H_ 19 | 20 | #include "types.h" 21 | 22 | // Either add a new tree to model or update the weight of an existing tree in 23 | // model. The tree and weight are selected via approximate coordinate descent on 24 | // the objective, where the "approximate" indicates that we do not search all 25 | // trees but instead grow trees greedily. 26 | void AddTreeToModel(vector<Example>& examples, Model* model, char loss_type, float beta, float lambda, int tree_depth); 27 | 28 | // Compute example probability with model. 29 | Probability ComputeExampleClassProbability(const Example& example, const Model& model); 30 | 31 | // Classify example with model. 32 | Label ClassifyExample(const Example& example, const Model& model); 33 | 34 | // Compute the error of model on examples. Also compute the number of trees in 35 | // model and their average size. 36 | void EvaluateModel(const vector<Example>& examples, const Model& model, 37 | float* error, float* avg_tree_size, int* num_trees); 38 | 39 | // Return the optimal weight to add to a tree that will maximally decrease the 40 | // objective. 
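// wgtd_error is the tree's weighted training error, tree_size its number of
// nodes, alpha its current weight, and beta/lambda the complexity-penalty
// coefficients. When the potential improvement is within twice the
// complexity penalty, the returned step is -alpha, i.e. the tree's weight is
// reset to exactly zero (such trees are then skipped during evaluation).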
41 | float ComputeEta(float wgtd_error, float tree_size, float alpha, float beta, float lambda); 42 | 43 | #endif  // BOOST_H_ 44 | -------------------------------------------------------------------------------- /src/deepboost_C.cc: -------------------------------------------------------------------------------- 1 | /* 2 | Written by: 3 | Daniel Marcous, Yotam Sandbank 4 | */ 5 | 6 | #include "deepboost_C.h" 7 | #include "boost.h" 8 | #include "tree.h" 9 | 10 | #include <Rcpp.h> 11 | 12 | using namespace Rcpp; 13 | 14 | 15 | // Train a deepboost model on the given examples, using 16 | // num_iter iterations (which does not necessarily mean num_iter trees) 17 | void Train(vector<Example>* train_examples, Model* model, int tree_depth, 18 | int num_iter, float beta, float lambda, char loss_type, bool verbose) { 19 | 20 | 21 | // Train the model 22 | for (int iter = 1; iter <= num_iter; ++iter) { 23 | AddTreeToModel(*train_examples, model, loss_type, beta, lambda, tree_depth); 24 | if (verbose) { 25 | float error, avg_tree_size; 26 | int num_trees; 27 | EvaluateModel(*train_examples, *model, &error, &avg_tree_size, 28 | &num_trees); 29 | Rcpp::Rcout << "Iteration: " << iter 30 | << ", error: " << error 31 | << ", avg tree size: " << avg_tree_size 32 | << ", num trees: " << num_trees 33 | << std::endl; 34 | } 35 | } 36 | } 37 | 38 | 39 | // Classify examples using model 40 | vector