├── DESCRIPTION ├── NAMESPACE ├── R ├── calculateTunabilityMeasures.R ├── calculateTunabilityMeasuresPackageDefault.R ├── calculateTuningSpace.R ├── compareSurrogateModels.R ├── helpers.R └── makeSurrogateModels.R ├── README.md ├── main.R ├── results_accuracy.RData ├── results_auc.RData ├── results_brier.RData └── shiny ├── app.R ├── app_data.RData ├── helpers.R ├── preproc.R ├── results_all.RData └── rsconnect └── shinyapps.io └── philipppro └── tunability.dcf /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tunability 2 | Title: Calculate tunability measures and hyperparameter spaces for tuning 3 | Version: 0.0.0.1 4 | Authors@R: person("Philipp", "Probst", email = "philipp_probst@gmx.de", role = c("aut", "cre")) 5 | Description: Calculate tunability measures and hyperparameter spaces for tuning based on results of the OpenML bot. 6 | License: GPL-2 7 | Encoding: UTF-8 8 | LazyData: true 9 | Depends: 10 | R (>= 3.3.3), 11 | mlr (>= 2.10), 12 | ParamHelpers (>= 1.8), 13 | OpenML (>= 1.2), 14 | batchtools (>= 0.9.0), 15 | BBmisc (>= 1.10), 16 | dplyr (>= 0.5.0), 17 | checkmate (>= 1.8.0), 18 | tidyr (>= 0.6.1), 19 | stringi (>= 1.1.2) 20 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: fake comment so roxygen2 overwrites silently. 2 | exportPattern("^[^\\.]") 3 | -------------------------------------------------------------------------------- /R/calculateTunabilityMeasures.R: -------------------------------------------------------------------------------- 1 | #' Calculate default hyperparameter setting 2 | #' @param surrogates Surrogate models 3 | calculateDefault = function(surrogates, n.points = 100000, normalization = FALSE) { 4 | surr = surrogates$surrogates 5 | param.set = surrogates$param.set 6 | rnd.points = generateRandomDesign(n.points, param.set, trafo = TRUE) 7 | rnd.points = deleteNA(rnd.points) 8 | 9 | preds = matrix(NA, nrow(rnd.points), length(surr)) 10 | for(i in seq_along(surr)) { 11 | print(paste("surrogate predict: task", i, "of", length(surr))) 12 | preds[, i] = predict(surr[[i]], newdata = rnd.points)$data$response 13 | } 14 | # Best default in general 15 | if(normalization == FALSE) { 16 | average_preds = apply(preds, 1, mean) 17 | } else { 18 | new_preds = preds 19 | for(i in 1:ncol(preds)) { 20 | new_preds[, i] = scale(preds[, i]) 21 | } 22 | average_preds = apply(new_preds, 1, mean) 23 | } 24 | 25 | best = which(average_preds == max(average_preds))[1] 26 | default = rnd.points[best,, drop = FALSE] 27 | rownames(default) = NULL 28 | 29 | list(default = rnd.points[best,, drop = FALSE], result = preds[best, ]) 30 | 31 | # Default calculation with LOOCV 32 | #best_i = numeric(ncol(preds)) 33 | #preds_i_best = numeric(ncol(preds)) 34 | #default.loocv = list() 35 | # Best default with LOOCV 36 | #for(i in 1:ncol(preds)) { 37 | # preds_i = rowMeans(preds[, -i]) 38 | # best_i[i] = which(preds_i == max(preds_i))[1] 39 | # preds_i_best[i] = preds[best_i[i],i] 40 | # default.loocv[[i]] = rnd.points[best_i[i],, drop = FALSE] 41 | #} 42 | 43 | #list(default = rnd.points[best,, drop = FALSE], result = preds[best, ], 44 | # default.loocv = rnd.points[best_i,], result.loocv = preds_i_best) 45 | } 46 | 47 | #' Calculate performance of hyperparameter setting 48 | #' @param par.set Parameter setting 49 | calculatePerformance = function(surrogates, default) { 50 | surr = 
surrogates$surrogates 51 | preds = numeric(length(surr)) 52 | for(i in seq_along(surr)) { 53 | print(paste("surrogate predict: task", i, "of", length(surr))) 54 | preds[i] = predict(surr[[i]], newdata = default)$data$response 55 | } 56 | # Best default 57 | list(default = default, result = preds) 58 | } 59 | 60 | #' Calculate optimal hyperparameter values for an algorithm 61 | #' @param surrogate Surrogate models 62 | #' @param hyperpar Number of hyperparameters that should be evaluated at once; Possible options: one, two and all 63 | calculateDatasetOptimum = function(surrogates, default, hyperpar = "all", n.points = 10000) { 64 | surr = surrogates$surrogates 65 | param.set = surrogates$param.set 66 | if (hyperpar == "all") { 67 | rnd.points = generateRandomDesign(n.points, param.set, trafo = TRUE) 68 | rnd.points = deleteNA(rnd.points) 69 | 70 | preds = matrix(NA, nrow(rnd.points), length(surr)) 71 | for(i in seq_along(surr)) { 72 | print(paste("surrogate predict: task", i, "of", length(surr))) 73 | preds[, i] = predict(surr[[i]], newdata = rnd.points)$data$response 74 | } 75 | # Best Value 76 | rnd.points[apply(preds, 2, which.max),] 77 | return(list(optimum = diag(preds[apply(preds, 2, which.max), ]), par.sets = rnd.points[apply(preds, 2, which.max),, drop = FALSE])) 78 | } 79 | 80 | if (hyperpar == "one") { 81 | result = matrix(NA, length(surr), length(param.set$pars)) 82 | # only do this for parameters that makes sense changing them 83 | for(i in seq_along(param.set$pars)) { 84 | print(names(param.set$pars)[i]) 85 | rnd.points1 = generateRandomDesignWithDefaults(n.points, param.set, trafo = TRUE, default, subset = names(param.set$pars)[i]) 86 | # deleteNAs 87 | rnd.points1 = deleteNA(rnd.points1) 88 | 89 | # Prediction 90 | preds = matrix(NA, nrow(rnd.points1), length(surr)) 91 | 92 | for(j in seq_along(surr)) { 93 | preds[, j] = predict(surr[[j]], newdata = rnd.points1)$data$response 94 | } 95 | # Best Value 96 | # rnd.points1[apply(preds, 2, which.max),] 97 | result[, i] = diag(preds[apply(preds, 2, which.max), ]) 98 | } 99 | result = data.frame(result) 100 | colnames(result) = names(param.set$pars) 101 | return(list(optimum = result)) 102 | } 103 | if (hyperpar == "two") { 104 | result = array(NA, dim = c(length(surr), length(param.set$pars), length(param.set$pars))) 105 | 106 | for(i in seq_along(param.set$pars)[-length(param.set$pars)]) { 107 | for(j in seq_along(param.set$pars)[(i+1):length(param.set$pars)]) { 108 | print(c(names(param.set$pars)[i], names(param.set$pars)[j])) 109 | rnd.points1 = generateRandomDesignWithDefaults(n.points, param.set, trafo = TRUE, default, subset = names(param.set$pars)[c(i,j)]) 110 | rnd.points1 = deleteNA(rnd.points1) 111 | 112 | # Prediction 113 | preds = matrix(NA, nrow(rnd.points1), length(surr)) 114 | for(k in seq_along(surr)) { 115 | preds[, k] = predict(surr[[k]], newdata = rnd.points1)$data$response 116 | } 117 | # Best Value 118 | # rnd.points1[apply(preds, 2, which.max),] 119 | result[, i, j] = diag(preds[apply(preds, 2, which.max), ]) 120 | } 121 | } 122 | return(list(optimum = result)) 123 | } 124 | } 125 | 126 | #' Calculate tunability measures 127 | #' @param surrogate Surrogate models 128 | calculateTunability = function(default, optimumHyperpar, optimumTwoHyperpar = NULL) { 129 | optimumHyperpar$optimum - default$result 130 | } 131 | 132 | deleteNA = function(task.data) { 133 | for(i in 1:ncol(task.data)) { 134 | if(is.numeric(task.data[, i])) 135 | task.data[is.na(task.data[, i]), i] = -10 - 1 136 | if(is.factor(task.data[, i])) 
{ 137 | task.data[, i] = addNA(task.data[, i]) 138 | task.data[, i] = droplevels(task.data[, i]) 139 | } 140 | if(is.logical(task.data[, i])) 141 | task.data[, i] = as.factor(task.data[, i]) 142 | } 143 | task.data 144 | } 145 | 146 | generateRandomDesignWithDefaults = function(n.points, param.set, trafo, default, subset) { 147 | rnd.points.def = default$default[rep(1, n.points), , drop = FALSE] 148 | 149 | # Required Parameters and Values 150 | reqPar = as.character(sapply(sapply(param.set$pars, `[[`, 12), `[[`, 2)) 151 | reqValue = as.character(sapply(sapply(param.set$pars, `[[`, 12), `[[`, 3)) 152 | 153 | param.set1 = param.set 154 | # If there are dependent variables include them 155 | if(any(subset %in% reqPar)) { 156 | subset2 = unique(c(subset, names(param.set$pars)[reqPar %in% subset])) 157 | } else { 158 | subset2 = subset 159 | } 160 | 161 | param.set1$pars = param.set$pars[subset2] 162 | rnd.points1 = rnd.points.def 163 | 164 | # If one parameter is required by another set it to the specific value 165 | for(m in seq_along(subset)) { 166 | if(!is.null(param.set1$pars[[m]]$requires)) { 167 | reqParSubset = as.character(param.set1$pars[[m]]$requires[2]) 168 | reqValueSubset = as.character(param.set1$pars[[m]]$requires[3]) 169 | 170 | rnd.points1[, reqParSubset] = reqValueSubset 171 | 172 | for(l in seq_along(param.set$pars)) { 173 | if(reqPar[l] == reqParSubset & reqValue[l] != reqValueSubset) 174 | rnd.points1[, l] = -10 - 1 175 | } 176 | if (!(reqParSubset %in% subset)) 177 | param.set1$pars[[m]]$requires = NULL 178 | } 179 | } 180 | 181 | rnd.points = generateRandomDesign(n.points, param.set1, trafo = TRUE) 182 | rnd.points1[, subset2] = rnd.points 183 | 184 | # Set the dependent values back to default 185 | back_to_default = subset2[!(subset2 %in% subset)] 186 | for(q in back_to_default) { 187 | if (q == "degree") { # Spezialfall svm, da wir hier keinen sinnvollen Default haben und den Package default nehmen 188 | rnd.points1[!is.na(rnd.points1[,q]), q] = 3 189 | } else { 190 | rnd.points1[!is.na(rnd.points1[,q]), q] = default$default[,q] 191 | } 192 | } 193 | # Add the default 194 | rnd.points1 = rbind(default$default, rnd.points1) 195 | rnd.points1 196 | } 197 | 198 | 199 | -------------------------------------------------------------------------------- /R/calculateTunabilityMeasuresPackageDefault.R: -------------------------------------------------------------------------------- 1 | #' Calculate default hyperparameter setting 2 | #' @param surrogates Surrogate models 3 | #' @param def Package defaults 4 | calculatePackageDefaultPerformance = function(surrogates, def, tbl.metaFeatures, tbl.results) { 5 | surr = surrogates$surrogates 6 | preds = numeric(length(surr)) 7 | for(i in seq_along(surr)) { 8 | print(paste("surrogate predict: task", i, "of", length(surr))) 9 | default = convertPackageDefault(def, surr[[i]], tbl.metaFeatures, tbl.results) 10 | preds[i] = predict(surr[[i]], newdata = default)$data$response 11 | } 12 | # Best default 13 | 14 | list(default = default, result = preds) 15 | } 16 | 17 | convertPackageDefault = function(def, surr, tbl.metaFeatures, tbl.results) { 18 | data_idi = surr$task.desc$id 19 | 20 | matching_task_data = unique(tbl.results[, c("data_id")]) 21 | n_feats = filter(tbl.metaFeatures, quality == "NumberOfFeatures") %>% 22 | select(., -quality) #%>% 23 | #inner_join(., matching_task_data, by = "data_id") 24 | p = as.numeric(filter(n_feats, data_id == data_idi)$value) 25 | 26 | if ("mtry" %in% names(def)) { 27 | def$mtry = floor(sqrt(p))/p 28 | 
} 29 | if ("gamma" %in% names(def)) { 30 | def$gamma = 1/p 31 | } 32 | def 33 | } 34 | 35 | #' Calculate optimal hyperparameter values for an algorithm 36 | #' @param surrogate Surrogate models 37 | #' @param hyperpar Number of hyperparameters that should be evaluated at once; Possible options: one, two and all 38 | calculateDatasetOptimumPackageDefault = function(surrogates, default, hyperpar = "one", n.points = 10000, tbl.metaFeatures, tbl.results) { 39 | surr = surrogates$surrogates 40 | param.set = surrogates$param.set 41 | 42 | if (hyperpar == "one") { 43 | result = matrix(NA, length(surr), length(param.set$pars)) 44 | # only do this for parameters that makes sense changing them 45 | for(i in seq_along(param.set$pars)) { 46 | print(names(param.set$pars)[i]) 47 | rnd.points1 = generateRandomDesignWithDefaults(n.points, param.set, trafo = TRUE, default, subset = names(param.set$pars)[i]) 48 | # deleteNAs 49 | rnd.points1 = deleteNA(rnd.points1) 50 | 51 | # Prediction 52 | preds = matrix(NA, nrow(rnd.points1), length(surr)) 53 | 54 | for(j in seq_along(surr)) { 55 | if (!(names(param.set$pars)[i] %in% c("mtry", "gamma"))) { 56 | rnd.points1 = convertPackageDefault(rnd.points1, surr[[j]], tbl.metaFeatures, tbl.results) 57 | } 58 | preds[, j] = predict(surr[[j]], newdata = rnd.points1)$data$response 59 | } 60 | # Best default 61 | # rnd.points1[apply(preds, 2, which.max),] 62 | result[, i] = diag(preds[apply(preds, 2, which.max), ]) 63 | } 64 | result = data.frame(result) 65 | colnames(result) = names(param.set$pars) 66 | return(list(optimum = result)) 67 | } 68 | if (hyperpar == "two") { 69 | result = array(NA, dim = c(length(surr), length(param.set$pars), length(param.set$pars))) 70 | 71 | for(i in seq_along(param.set$pars)[-length(param.set$pars)]) { 72 | for(j in seq_along(param.set$pars)[(i+1):length(param.set$pars)]) { 73 | print(c(names(param.set$pars)[i], names(param.set$pars)[j])) 74 | rnd.points1 = generateRandomDesignWithDefaults(n.points, param.set, trafo = TRUE, default, subset = names(param.set$pars)[c(i,j)]) 75 | rnd.points1 = deleteNA(rnd.points1) 76 | 77 | # Prediction 78 | preds = matrix(NA, nrow(rnd.points1), length(surr)) 79 | for(k in seq_along(surr)) { 80 | if (!any(names(param.set$pars)[c(i,j)] %in% c("mtry", "gamma"))) { 81 | rnd.points1 = convertPackageDefault(rnd.points1, surr[[k]], tbl.metaFeatures, tbl.results) 82 | } 83 | preds[, k] = predict(surr[[k]], newdata = rnd.points1)$data$response 84 | } 85 | # Best default 86 | # rnd.points1[apply(preds, 2, which.max),] 87 | result[, i, j] = diag(preds[apply(preds, 2, which.max), ]) 88 | } 89 | } 90 | return(list(optimum = result)) 91 | } 92 | } -------------------------------------------------------------------------------- /R/calculateTuningSpace.R: -------------------------------------------------------------------------------- 1 | #' Calculate default hyperparameter space for tuning 2 | #' @param surrogate Surrogate models 3 | calculateTuningSpace = function(optimum, quant) { 4 | space = data.frame(row.names = c(quant, 1-quant)) 5 | space2 = list() 6 | par.sets = optimum$par.sets 7 | for(i in 1:ncol(par.sets)) { 8 | if(is.numeric(par.sets[,i])) { 9 | par.sets[par.sets[,i]==-11,i] = NA 10 | space = cbind(space, quantile(par.sets[,i], c(quant, 1-quant), na.rm = TRUE)) 11 | colnames(space)[ncol(space)] = names(par.sets)[i] 12 | } 13 | if(is.factor(par.sets[,i]) | is.logical(par.sets[,i])) { 14 | logic = table(par.sets[,i]) / length(par.sets[,i]) > quant 15 | space2 = c(space2, list(names(table(par.sets[,i]))[logic])) 
16 | names(space2)[length(space2)] = names(par.sets)[i] 17 | } 18 | } 19 | return(list(numerics = space, factors = space2)) 20 | } 21 | -------------------------------------------------------------------------------- /R/compareSurrogateModels.R: -------------------------------------------------------------------------------- 1 | #' Compare different surrogate models 2 | #' @param measure.name Name of the measure to optimize 3 | #' @param learner.name Name of learner 4 | #' @param data.ids [\code{numeric}] ids of the dataset 5 | #' @param lrn.par.set learner-parameter set which should include relevant bounds for flow 6 | #' @param tbl.results df with getMlrRandomBotResults() 7 | #' @param tbl.hypPars df with getMlrRandomBotHyperpars() 8 | #' @param tbl.metaFeatures df with getMlrRandomBotHyperpars() 9 | #' @param surrogate.mlr.lrns list of mlr learners that should be compared 10 | #' @param min.experiments minimum number of experiments that should be available for a dataset, otherwise the dataset is excluded 11 | #' @return surrogate model 12 | compareSurrogateModels = function(measure.name, learner.name, data.ids, tbl.results, 13 | tbl.metaFeatures, tbl.hypPars, lrn.par.set, surrogate.mlr.lrns) { 14 | 15 | param.set = lrn.par.set[[which(names(lrn.par.set) == paste0(substr(learner.name, 5, 100), ".set"))]]$param.set 16 | #train mlr model on full table for measure 17 | task.data = makeBotTable(measure.name, learner.name, tbl.results, tbl.metaFeatures, tbl.hypPars, param.set, data.ids) 18 | task.data = data.frame(task.data) 19 | task.data = deleteNA(task.data) 20 | 21 | # get specific data ids 22 | if(!is.null(data.ids)) { 23 | uni = unique(task.data$data_id) 24 | task.ids = uni[uni %in% data.ids] 25 | } else { 26 | task.ids = unique(task.data$data_id) 27 | } 28 | 29 | mlr.tasks = list() 30 | for(i in seq_along(data.ids)) { 31 | data.idi = data.ids[i] 32 | data = subset(task.data, data_id == data.ids[i], select = c("measure.value", names(param.set$pars))) 33 | # Rename column names because of weird "sample" behaviour of cubist 34 | colnames(data) = gsub("sample", "ampel", colnames(data)) 35 | mlr.tasks[[i]] = makeRegrTask(id = as.character(data.idi), data, target = "measure.value") 36 | } 37 | mlr.lrns = surrogate.mlr.lrns 38 | measures = list(mse, rsq, kendalltau, spearmanrho) 39 | rdesc = makeResampleDesc("RepCV", reps = 10, folds = 10) 40 | mlr.benchmark = benchmark(mlr.lrns, mlr.tasks, resamplings = rdesc, keep.pred = FALSE, models = FALSE, measures = measures) 41 | 42 | return(mlr.benchmark) 43 | } -------------------------------------------------------------------------------- /R/helpers.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | getSimpleLearners = function(){ 4 | # Simple learner param set 5 | simple.lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.glmnet", predict.type = "prob"), 6 | param.set = makeParamSet( 7 | makeNumericParam("alpha", lower = 0, upper = 1, default = 1), 8 | makeNumericVectorParam("lambda", len = 1L, lower = -10, upper = 10, default = 0 ,trafo = function(x) 2^x))) 9 | 10 | simple.lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.rpart", predict.type = "prob"), 11 | param.set = makeParamSet( 12 | makeNumericParam("cp", lower = 0, upper = 1, default = 0.01), 13 | makeIntegerParam("maxdepth", lower = 1, upper = 30, default = 30), 14 | makeIntegerParam("minbucket", lower = 1, upper = 60, default = 1), 15 | makeIntegerParam("minsplit", lower = 1, upper = 60, default = 20)), 16 | lrn.ps.sets = 
simple.lrn.par.set) 17 | 18 | return(simple.lrn.par.set) 19 | } 20 | 21 | getMultipleLearners = function(){ 22 | simple.lrn.par.set = getSimpleLearners() 23 | 24 | # increase to a general param set 25 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.kknn", predict.type = "prob"), 26 | param.set = makeParamSet( 27 | makeIntegerParam("k", lower = 1, upper = 30)), 28 | lrn.ps.sets = simple.lrn.par.set) 29 | 30 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.svm", predict.type = "prob"), 31 | param.set = makeParamSet( 32 | makeDiscreteParam("kernel", values = c("linear", "polynomial", "radial")), 33 | makeNumericParam("cost", lower = -10, upper = 10, trafo = function(x) 2^x), 34 | makeNumericParam("gamma", lower = -10, upper = 10, trafo = function(x) 2^x, requires = quote(kernel == "radial")), 35 | makeIntegerParam("degree", lower = 2, upper = 5, requires = quote(kernel == "polynomial"))), 36 | lrn.ps.sets = lrn.par.set) 37 | 38 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.ranger", predict.type = "prob"), 39 | param.set = makeParamSet( 40 | makeIntegerParam("num.trees", lower = 1, upper = 2000), 41 | makeLogicalParam("replace"), 42 | makeNumericParam("sample.fraction", lower = 0.1, upper = 1), 43 | makeNumericParam("mtry", lower = 0, upper = 1), 44 | makeLogicalParam(id = "respect.unordered.factors"), 45 | makeNumericParam("min.node.size", lower = 0, upper = 1)), 46 | lrn.ps.sets = lrn.par.set) 47 | 48 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.xgboost", predict.type = "prob"), 49 | param.set = makeParamSet( 50 | makeIntegerParam("nrounds", lower = 1, upper = 5000), 51 | makeNumericParam("eta", lower = -10, upper = 0, trafo = function(x) 2^x), 52 | makeNumericParam("subsample",lower = 0.1, upper = 1), 53 | makeDiscreteParam("booster", values = c("gbtree", "gblinear")), 54 | makeIntegerParam("max_depth", lower = 1, upper = 15, requires = quote(booster == "gbtree")), 55 | makeNumericParam("min_child_weight", lower = 0, upper = 7, requires = quote(booster == "gbtree"), trafo = function(x) 2^x), 56 | makeNumericParam("colsample_bytree", lower = 0, upper = 1, requires = quote(booster == "gbtree")), 57 | makeNumericParam("colsample_bylevel", lower = 0, upper = 1, requires = quote(booster == "gbtree")), 58 | makeNumericParam("lambda", lower = -10, upper = 10, trafo = function(x) 2^x), 59 | makeNumericParam("alpha", lower = -10, upper = 10, trafo = function(x) 2^x)), 60 | lrn.ps.sets = lrn.par.set) 61 | 62 | return(lrn.par.set) 63 | } 64 | 65 | makeLrnPsSets = function(learner, param.set, lrn.ps.sets = NULL, 66 | id = paste0(learner$id, ".set"), overwrite = FALSE) { 67 | 68 | assertClass(learner, "Learner") 69 | assertClass(param.set, "ParamSet") 70 | par.match = names(param.set$pars) %in% names(learner$par.set$pars) 71 | if(all(par.match)){ 72 | ls = list(learner = learner, param.set = param.set) 73 | } else { 74 | stop(paste("The following parameters in param.set are not included in learner:", 75 | paste(names(param.set$pars[par.match == FALSE]), collapse = ", "))) 76 | } 77 | 78 | if(is.null(lrn.ps.sets)){ 79 | lrn.ps.sets = list() 80 | lrn.ps.sets[[id]] = ls 81 | attr(lrn.ps.sets, "class") = "LrnPsSet" 82 | } else { 83 | assertClass(lrn.ps.sets, "LrnPsSet") 84 | 85 | if(id %in% names(lrn.ps.sets) & overwrite == FALSE){ 86 | stop("tune.pair already contains id: \"", id, "\". 
Please specify a new id or set overwrite = TRUE.") 87 | } else { 88 | lrn.ps.sets[[id]] = ls 89 | } 90 | } 91 | 92 | return(lrn.ps.sets) 93 | } 94 | 95 | calculateTunability = function(default, optimumHyperpar, optimumTwoHyperpar = NULL) { 96 | optimumHyperpar$optimum - default$result 97 | } 98 | 99 | lrn.par.set = getMultipleLearners() 100 | -------------------------------------------------------------------------------- /R/makeSurrogateModels.R: -------------------------------------------------------------------------------- 1 | #' Create surrogate models for different tasks 2 | #' @param measure.name Name of the measure to optimize 3 | #' @param learner.name Name of learner 4 | #' @param data.ids [\code{numeric}] ids of the dataset 5 | #' @param lrn.par.set learner-parameter set which should include relevant bounds for flow 6 | #' @param tbl.results df with getMlrRandomBotResults() 7 | #' @param tbl.hypPars df with getMlrRandomBotHyperpars() 8 | #' @param tbl.metaFeatures df with getMlrRandomBotHyperpars() 9 | #' @param min.experiments minimum number of experiments that should be available for a dataset, otherwise the dataset is excluded 10 | #' @return surrogate model 11 | makeSurrogateModels = function(measure.name, learner.name, data.ids, tbl.results, 12 | tbl.metaFeatures, tbl.hypPars, lrn.par.set, surrogate.mlr.lrn) { 13 | 14 | param.set = lrn.par.set[[which(names(lrn.par.set) == paste0(substr(learner.name, 5, 100), ".set"))]]$param.set 15 | #train mlr model on full table for measure 16 | task.data = makeBotTable(measure.name, learner.name, tbl.results, tbl.metaFeatures, tbl.hypPars, param.set, data.ids) 17 | task.data = data.frame(task.data) 18 | task.data = deleteNA(task.data) 19 | 20 | # get specific task ids 21 | if(!is.null(data.ids)) { 22 | uni = unique(task.data$data_id) 23 | data.ids = sort(uni[uni %in% data.ids]) 24 | } else { 25 | data.ids = sort(unique(task.data$data_id)) 26 | } 27 | 28 | mlr.mod.measure = list() 29 | for(i in seq_along(data.ids)) { 30 | print(paste("surrogate train: task", i, "of", length(data.ids))) 31 | data.idi = data.ids[i] 32 | 33 | mlr.task.measure = makeRegrTask(id = as.character(data.idi), subset(task.data, data_id == data.idi, select = c("measure.value", names(param.set$pars))), target = "measure.value") 34 | mlr.lrn = surrogate.mlr.lrn 35 | mlr.mod.measure[[i]] = train(mlr.lrn, mlr.task.measure) 36 | gc() 37 | } 38 | return(list(surrogates = mlr.mod.measure, param.set = param.set)) 39 | } 40 | 41 | 42 | #' Merge results, hyperpars and features tables and prepare for mlr.task input 43 | #' @param measure.name.filter What measure to analyse 44 | #' @param learner.name What learner to analyse 45 | #' @param tbl.results df with getMlrRandomBotResults() 46 | #' @param tbl.hypPars df with getMlrRandomBotHyperpars() 47 | #' @param tbl.metaFeatures df with getMlrRandomBotHyperpars() 48 | #' @return [\code{data.frame}] Complete table used for creating the surrogate model 49 | makeBotTable = function(measure.name, learner.name, tbl.results, tbl.metaFeatures, tbl.hypPars, param.set, data.ids) { 50 | 51 | tbl.hypPars.learner = tbl.hypPars[tbl.hypPars$fullName == learner.name, ] 52 | tbl.hypPars.learner = spread(tbl.hypPars.learner, name, value) 53 | tbl.hypPars.learner = data.frame(tbl.hypPars.learner) 54 | # Convert the columns to the specific classes 55 | params = getParamIds(param.set) 56 | param_types = getParamTypes(param.set) 57 | for(i in seq_along(params)) 58 | tbl.hypPars.learner[, params[i]] = conversion_function(tbl.hypPars.learner[, 
params[i]], param_types[i]) 59 | 60 | bot.table = inner_join(tbl.results, tbl.hypPars.learner, by = "setup") %>% 61 | select(., -run_id, -setup, -fullName) 62 | 63 | # Scale mtry and min.node.size in random forest 64 | if(learner.name == "mlr.classif.ranger"){ 65 | n_feats = filter(tbl.metaFeatures, quality == "NumberOfFeatures") %>% 66 | select(., -quality) 67 | n_feats$value = as.numeric(n_feats$value) 68 | bot.table = inner_join(bot.table, n_feats, by = "data_id") 69 | bot.table$mtry = bot.table$mtry/bot.table$value 70 | bot.table = bot.table %>% select(., -value) 71 | 72 | n_inst = filter(tbl.metaFeatures, quality == "NumberOfInstances") %>% 73 | select(., -quality) 74 | n_inst$value = as.numeric(n_inst$value) 75 | bot.table = inner_join(bot.table, n_inst, by = "data_id") 76 | bot.table$min.node.size = log(bot.table$min.node.size, 2) / log(bot.table$value, 2) 77 | bot.table = bot.table %>% select(., -value) 78 | } 79 | 80 | bot.table = bot.table %>% select(., -task_id) 81 | colnames(bot.table)[colnames(bot.table) == measure.name] = "measure.value" 82 | bot.table$measure.value = as.numeric(bot.table$measure.value) 83 | 84 | # select only runs on the specific data.ids 85 | bot.table = subset(bot.table, data_id %in% data.ids) 86 | 87 | return(bot.table) 88 | } 89 | 90 | 91 | conversion_function = function(x, param_type) { 92 | if(param_type %in% c("integer", "numeric", "numericvector")) 93 | x = as.numeric(x) 94 | if(param_type %in% c("character", "logical", "factor", "discrete")) 95 | x = as.factor(x) 96 | return(x) 97 | } 98 | 99 | #' Get relevant datasets 100 | #' 101 | #' @param tbl.results 102 | #' @param tbl.hypPars 103 | #' @param min.experiments 104 | calculateDataIds = function(tbl.results, tbl.hypPars, min.experiments = 200) { 105 | whole.table = inner_join(tbl.results, tbl.hypPars, by = "setup") %>% select(., data_id, fullName) 106 | cross.table = table(whole.table$data_id, whole.table$fullName) 107 | bigger = rowSums(cross.table > min.experiments) 108 | data.ids = names(bigger)[bigger == 5] 109 | return(data.ids) 110 | } 111 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tunability 2 | 3 | This repository was used for the calculation of tunability measures and tuning spaces as described in the paper: 4 | 5 | [Tunability: Importance of Hyperparameters of Machine Learning Algorithms](http://www.jmlr.org/papers/v20/18-444.html) 6 | 7 | For the calculation just run the [main.R](https://github.com/PhilippPro/tunability/blob/master/main.R) file. 8 | The calculation can take several days to finish. 
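For a quick orientation, the sketch below condenses the first steps of [main.R](https://github.com/PhilippPro/tunability/blob/master/main.R) into a minimal example. It is only an illustration: the measure `"auc"` and the learner `"mlr.classif.ranger"` are picked here purely as examples, the plain `makeLearner("regr.ranger")` surrogate mirrors the choice in `main.R`, and all function names come from the scripts in `R/`; `main.R` remains the authoritative script.

```r
library(devtools)
load_all()                            # load the package functions in R/
lrn.par.set = getMultipleLearners()   # learners and their hyperparameter spaces

# Bot results from figshare (provides tbl.results, tbl.hypPars, tbl.metaFeatures)
load(url("https://ndownloader.figshare.com/files/10811309"))
data.ids = calculateDataIds(tbl.results, tbl.hypPars, min.experiments = 200)

# Surrogate models for one measure/learner combination (here: AUC and ranger)
surrogates = makeSurrogateModels("auc", "mlr.classif.ranger", data.ids,
  tbl.results, tbl.metaFeatures, tbl.hypPars, lrn.par.set, makeLearner("regr.ranger"))

default = calculateDefault(surrogates)                                    # data-based defaults
optimum = calculateDatasetOptimum(surrogates, default, hyperpar = "all")  # per-dataset optima
mean(calculateTunability(default, optimum))                               # overall tunability
```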
9 | -------------------------------------------------------------------------------- /main.R: -------------------------------------------------------------------------------- 1 | library(checkpoint) 2 | checkpoint("2018-07-01") 3 | 4 | library(devtools) 5 | library(OpenML) 6 | library(batchtools) 7 | #OMLbots_path = "/home/probst/Paper/Exploration_of_Hyperparameters/OMLbots" 8 | #OMLbots_path = "C:/Promotion/Hyperparameters/OMLbots" 9 | #load_all(OMLbots_path) 10 | load_all() 11 | lrn.par.set = getMultipleLearners() 12 | 13 | # Get file from the figshare repository 14 | load(url("https://ndownloader.figshare.com/files/10811309")) 15 | 16 | # From wide format to long 17 | #a = read.csv(url("https://ndownloader.figshare.com/files/10462300")) 18 | 19 | #a = read.csv(url("https://ndownloader.figshare.com/files/10811312")) 20 | #library(xtable) 21 | #head(a) 22 | #table(a$data_id) 23 | 24 | ################################ Restrict data to 500000 results for each algorithm 25 | data.ids = calculateDataIds(tbl.results, tbl.hypPars, min.experiments = 200) 26 | # Only results for OpenML100 datasets 27 | #tasks = listOMLTasks(number.of.classes = 2L, tag = "OpenML100", estimation.procedure = "10-fold Crossvalidation", number.of.missing.values = 0) 28 | #data.ids = data.ids[data.ids %in% tasks$data.id] 29 | 30 | # Change the sign for the brier score to get the correct results 31 | tbl.results$brier = -tbl.results$brier 32 | 33 | library(stringi) 34 | learner.names = paste0("mlr.", names(lrn.par.set)) 35 | learner.names = stri_sub(learner.names, 1, -5) 36 | measures = c("auc", "accuracy", "brier") 37 | measure = c("auc") 38 | 39 | ################################ Compare different surrogate models (complete) 40 | 41 | # only models which do not have to be tuned! 
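# (The surrogate learners listed below are used with their package defaults and compared via
#  compareSurrogateModels(), which benchmarks them per dataset with 10 times repeated 10-fold CV
#  on mse, rsq, kendalltau and spearmanrho.)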
42 | surrogate.mlr.lrns = list( 43 | makeLearner("regr.lm"), 44 | makeLearner("regr.rpart"), 45 | makeLearner("regr.kknn"), 46 | makeLearner("regr.ranger"), 47 | # makeLearner("regr.ranger", par.vals = list(num.trees = 2000, respect.unordered.factors = "order")), 48 | makeLearner("regr.cubist") 49 | #makeLearner("regr.xgboost", par.vals = list(nrounds = 300, eta = 0.03, max_depth = 2, nthread = 1)), 50 | #makeLearner("regr.svm"), 51 | #makeLearner("regr.bartMachine"), 52 | #makeLearner("regr.glmnet"), 53 | #makeLearner("regr.brnn"), # too many errors 54 | #makeLearner("regr.km") 55 | ) 56 | 57 | k = 2 58 | i = 6 59 | bmr = list() 60 | 61 | load(paste0("bmr_", measures[k], ".RData")) 62 | 63 | for(k in 1:3) { 64 | configureMlr(show.info = TRUE, on.learner.error = "warn", on.learner.warning = "warn", on.error.dump = TRUE) 65 | library("parallelMap") 66 | parallelStartSocket(5) 67 | for (i in 1:6) { 68 | print(i) 69 | set.seed(521 + i) 70 | # task.id 146085, 14966 does not work for svm 71 | bmr[[i]] = compareSurrogateModels(measure.name = measures[k], learner.name = learner.names[i], 72 | data.ids = data.ids, tbl.results, tbl.metaFeatures, tbl.hypPars, lrn.par.set, surrogate.mlr.lrns) 73 | gc() 74 | save(bmr, file = paste0("bmr_", measures[k], ".RData")) 75 | } 76 | parallelStop() 77 | names(bmr) = learner.names 78 | 79 | for(i in seq_along(bmr)) { 80 | print(i) 81 | rmat = convertBMRToRankMatrix(bmr[[i]]) 82 | print(rmat) 83 | print(plotBMRSummary(bmr[[i]], measure = kendalltau)) 84 | print(plotBMRBoxplots(bmr[[i]], style = "violin")) 85 | print(plotBMRRanksAsBarChart(bmr[[i]], pos = "stack")) 86 | } 87 | bmr_surrogate = bmr 88 | 89 | # replace NA results of lm/kknn, rsq 90 | for(i in seq_along(data.ids)) { 91 | for(j in seq_along(learner.names)) { 92 | for(l in seq_along(surrogate.mlr.lrns)) { 93 | rsq = bmr_surrogate[[j]]$results[[i]][[l]]$measures.test$rsq 94 | bmr_surrogate[[j]]$results[[i]][[l]]$aggr[2] = mean(rsq[rsq>0], na.rm = T) 95 | bmr_surrogate$mlr.classif.kknn$results[[i]][[l]]$aggr[3] = 96 | mean(bmr_surrogate$mlr.classif.kknn$results[[i]][[l]]$measures.test$kendalltau, na.rm = T) 97 | bmr_surrogate$mlr.classif.kknn$results[[i]][[l]]$aggr[4] = 98 | mean(bmr_surrogate$mlr.classif.kknn$results[[i]][[l]]$measures.test$spearmanrho, na.rm = T) 99 | } 100 | } 101 | } 102 | 103 | # Save results 104 | save(bmr_surrogate, file = paste0("results_", measures[k], ".RData")) 105 | 106 | 107 | # Best model in general: ranger, cubist 108 | 109 | ################################# Calculate tunability measures 110 | surrogate.mlr.lrn = makeLearner("regr.ranger", par.vals = list(num.threads = 4)) 111 | #surrogate.mlr.lrn = makeLearner("regr.ranger", par.vals = list(num.trees = 2000, respect.unordered.factors = "order", num.threads = 4)) 112 | #surrogate.mlr.lrn = makeLearner("regr.cubist") 113 | 114 | results = list() 115 | 116 | for(i in seq_along(learner.names)) { 117 | print(i) 118 | set.seed(199 + i) 119 | # Surrogate model calculation 120 | surrogates = makeSurrogateModels(measure.name = measures[k], learner.name = learner.names[i], 121 | data.ids = data.ids, tbl.results, tbl.metaFeatures, tbl.hypPars, lrn.par.set, surrogate.mlr.lrn) 122 | save(surrogates, file = paste0("surrogates_", measures[k], "_", i, ".RData")) 123 | } 124 | 125 | for(i in seq_along(learner.names)) { 126 | print(i) 127 | set.seed(199 + i) 128 | load(paste0("surrogates_", measures[k], "_", i, ".RData")) 129 | # Default calculation 130 | default = calculateDefault(surrogates) 131 | # Tunability overall 132 | optimum = 
calculateDatasetOptimum(surrogates, default, hyperpar = "all", n.points = 100000) 133 | # Tunability hyperparameter specific 134 | optimumHyperpar = calculateDatasetOptimum(surrogates, default, hyperpar = "one", n.points = 100000) 135 | # Tunability for two hyperparameters 136 | optimumTwoHyperpar = calculateDatasetOptimum(surrogates, default, hyperpar = "two", n.points = 10000) 137 | # Tuning space 138 | tuningSpace = calculateTuningSpace(optimum, quant = 0.05) 139 | 140 | results[[i]] = list(default = default, optimum = optimum, optimumHyperpar = optimumHyperpar, 141 | optimumTwoHyperpar = optimumTwoHyperpar, tuningSpace = tuningSpace) 142 | gc() 143 | save(bmr_surrogate, results, file = paste0("results_", measures[k], ".RData")) 144 | } 145 | names(results) = learner.names 146 | 147 | # Calculations 148 | default = results$mlr.classif.xgboost$default 149 | optimum = results$mlr.classif.xgboost$optimum 150 | optimumHyperpar = results$mlr.classif.xgboost$optimumHyperpar 151 | overallTunability = calculateTunability(default, optimum) 152 | mean(overallTunability) 153 | tunability = calculateTunability(default, optimumHyperpar) 154 | data.frame(t(colMeans(tunability))) 155 | # scaled 156 | data.frame(t(colMeans(tunability/overallTunability, na.rm = T))) 157 | 158 | default$default[is.numeric(default$default)] = default$default[,is.numeric(default$default)] 159 | 160 | def = default$default 161 | 162 | for(i in 1:length(def)) { 163 | if(is.numeric(def[[i]])) 164 | def[[i]] = round(def[[i]], 3) 165 | } 166 | 167 | # Interaction 168 | # Bare values 169 | tab = colMeans(results$mlr.classif.xgboost$optimumTwoHyperpar$optimum, dims = 1, na.rm = TRUE) - 170 | mean(results$mlr.classif.xgboost$default$result) 171 | diag(tab) = colMeans(tunability) 172 | colnames(tab) = rownames(tab) = names(tunability) 173 | tab 174 | # Interaction 175 | colMeans(results$mlr.classif.xgboost$optimumTwoHyperpar$optimum, dims = 1, na.rm = TRUE) - 176 | mean(results$mlr.classif.xgboost$default$result) - 177 | outer(colMeans(tunability), colMeans(tunability), '+') 178 | # Performance gain 179 | colMeans(results$mlr.classif.xgboost$optimumTwoHyperpar$optimum, dims = 1, na.rm = TRUE) - 180 | mean(results$mlr.classif.xgboost$default$result) - 181 | outer(colMeans(tunability), colMeans(tunability), pmax) 182 | 183 | # Package defaults 184 | package.defaults = list( 185 | glmnet = data.frame(alpha = 1, lambda = 0), # no regularization 186 | rpart = data.frame(cp = 0.01, maxdepth = 30, minbucket = 7, minsplit = 20), 187 | kknn = data.frame(k = 7), 188 | svm = data.frame(kernel = "radial", cost = 1, gamma = 1, degree = 3), 189 | ranger = data.frame(num.trees = 500, replace = TRUE, sample.fraction = 1, mtry = 0.1, respect.unordered.factors = FALSE, min.node.size = 0), 190 | xgboost = data.frame(nrounds = 500, eta = 0.3, subsample = 1, booster = "gbtree", max_depth = 6, min_child_weight = 1, 191 | colsample_bytree = 1, colsample_bylevel = 1, lambda = 1, alpha = 1) 192 | ) 193 | 194 | # Parameters dependent on data characteristics: svm: gamma, ranger: mtry. 
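# (These are rescaled per dataset in convertPackageDefault(): with p the number of features,
#  gamma = 1/p and mtry = floor(sqrt(p))/p, since mtry is encoded as a fraction of the
#  features in makeBotTable().)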
195 | # Not Specified: glmnet: alpha, xgboost: nrounds 196 | resultsPackageDefaults = list() 197 | 198 | for(i in seq_along(learner.names)) { 199 | print(i) 200 | set.seed(199 + i) 201 | load(paste0("surrogates_", measures[k], "_", i, ".RData")) 202 | 203 | def = package.defaults[[i]] 204 | default = calculatePackageDefaultPerformance(surrogates, def, tbl.metaFeatures, tbl.results) 205 | optimumHyperpar = calculateDatasetOptimumPackageDefault(surrogates, default, hyperpar = "one", n.points = 100000, tbl.metaFeatures, tbl.results) 206 | optimumTwoHyperpar = calculateDatasetOptimumPackageDefault(surrogates, default, hyperpar = "two", n.points = 10000, tbl.metaFeatures, tbl.results) 207 | resultsPackageDefaults[[i]] = list(default = default, optimumHyperpar = optimumHyperpar, optimumTwoHyperpar = optimumTwoHyperpar) 208 | save(bmr_surrogate, results, resultsPackageDefaults, file = paste0("results_", measures[k], ".RData")) 209 | } 210 | names(resultsPackageDefaults) = learner.names 211 | 212 | resultsPackageDefaults$mlr.classif.svm$default$default$gamma = "1/p" 213 | resultsPackageDefaults$mlr.classif.ranger$default$default$mtry = "sqrt(p)" 214 | resultsPackageDefaults$mlr.classif.ranger$default$default$min.node.size = "1" 215 | 216 | save(bmr_surrogate, results, resultsPackageDefaults, file = paste0("results_", measures[k], ".RData")) 217 | 218 | # Calculations 219 | default = resultsPackageDefaults$mlr.classif.ranger$default 220 | optimum = results$mlr.classif.ranger$optimum 221 | optimumHyperpar = resultsPackageDefaults$mlr.classif.ranger$optimumHyperpar 222 | overallTunability = calculateTunability(default, optimum) 223 | mean(overallTunability) 224 | 225 | tunability = calculateTunability(default, optimumHyperpar) 226 | 227 | data.frame(t(colMeans(tunability))) 228 | # scaled 229 | data.frame(t(colMeans(tunability/overallTunability, na.rm = T))) 230 | 231 | # KI for tunability 232 | y = overallTunability 233 | hist(y) 234 | qqnorm(y) 235 | qqline(y) 236 | 237 | t_value = qt(0.975, length(y) - 1) 238 | mean(y) + c(-t_value, t_value) * sd(y) / sqrt(length(y)) 239 | 240 | # Tunability of the "algorithm"; overfitting problem! 241 | the_order = order(results[[5]]$default$result) 242 | plot(results$mlr.classif.glmnet$default$result[the_order], type = "l", ylab = "AUC") 243 | avg_results = numeric(6) 244 | best_results = best_results_default = numeric(length(results$mlr.classif.glmnet$default$result)) 245 | 246 | for(i in seq_along(learner.names)) { 247 | lines(results[[i]]$default$result[the_order], col = i) 248 | avg_results[i] = mean(results[[i]]$default$result) 249 | for(j in 1:length(results[[i]]$default$result)) { 250 | best_results_default[j] = ifelse(results[[i]]$default$result[j] > best_results[j], results[[i]]$default$result[j], best_results[j]) 251 | best_results[j] = ifelse(results[[i]]$optimum$optimum[j] > best_results[j], results[[i]]$optimum$optimum[j], best_results[j]) 252 | } 253 | } 254 | legend("topleft", legend = substr(learner.names, 13, 100), col = 1:6, lty = 1) 255 | 256 | round(best_results - (results[[5]]$default$result), 3) 257 | mean(best_results_default - (results[[5]]$default$result)) 258 | mean(best_results - (results[[5]]$default$result)) 259 | 260 | mean((results[[5]]$default$result) - (results[[6]]$default$result)) 261 | # maybe overfitting! 
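# (The cross-validation block below checks this: optimal defaults are recomputed on 4/5 of the
#  datasets and their performance and tunability are then evaluated on the held-out fifth, so the
#  defaults are not assessed on the datasets they were derived from.)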
262 | 263 | # Make Crossvalidation to test if there is overfitting 264 | results_cv = list() 265 | for(i in 1:6) { 266 | print(i) 267 | set.seed(3000 + i) 268 | load(paste0("surrogates_", measures[k], "_", i, ".RData")) 269 | 270 | # CV 271 | n_surr = length(surrogates$surrogates) 272 | shuffle = sample(n_surr) 273 | folds = cut(shuffle, breaks = 5, labels = FALSE) 274 | 275 | default = list() 276 | optimumHyperpar = list() 277 | optimumTwoHyperpar = list() 278 | 279 | for(j in 1:5) { 280 | print(paste(j,i)) 281 | testInd = which(folds == j, arr.ind = TRUE) 282 | trainInd = which(folds != j, arr.ind = TRUE) 283 | 284 | # Default calculation 285 | default1 = calculateDefault(surrogates = list(surrogates = surrogates$surrogates[trainInd], param.set = surrogates$param.set)) 286 | # Calculate performance of these defaults on test datasets 287 | default[[j]] = calculatePerformance(list(surrogates = surrogates$surrogates[testInd], param.set = surrogates$param.set), default1$default) 288 | # Tunability hyperparameter specific 289 | optimumHyperpar[[j]] = calculateDatasetOptimum(surrogates = list(surrogates = surrogates$surrogates[testInd], param.set = surrogates$param.set), default[[j]], hyperpar = "one", n.points = 100000) 290 | # Tunability for two hyperparameters 291 | optimumTwoHyperpar[[j]] = calculateDatasetOptimum(list(surrogates = surrogates$surrogates[testInd], param.set = surrogates$param.set), default[[j]], hyperpar = "two", n.points = 10000) 292 | 293 | results_cv[[i]] = list(default = default, optimumHyperpar = optimumHyperpar, optimumTwoHyperpar = optimumTwoHyperpar) 294 | gc() 295 | } 296 | save(bmr_surrogate, results, resultsPackageDefaults, results_cv, file = paste0("results_", measures[k], ".RData")) 297 | } 298 | names(results_cv) = learner.names 299 | save(bmr_surrogate, results, resultsPackageDefaults, results_cv, lrn.par.set, file = paste0("results_", measures[k], ".RData")) 300 | } 301 | 302 | # overall tunability, cross-validated 303 | for(i in seq_along(learner.names)){ 304 | print(learner.names[i]) 305 | print(mean(calculateTunability(results[[i]]$default, results[[i]]$optimum))) 306 | print(mean(results[[i]]$optimum$optimum - unlist(sapply(results_cv[[i]]$default, "[[", 2)))) 307 | } 308 | for(i in seq_along(learner.names)){ 309 | print(learner.names[i]) 310 | print(rbind(colMeans(calculateTunability(results[[i]]$default, results[[i]]$optimumHyperpar)), 311 | colMeans(do.call(rbind, unlist(results_cv[[i]]$optimumHyperpar, recursive=FALSE)) - unlist(sapply(results_cv[[i]]$default, "[[", 2))))) 312 | } 313 | 314 | # Save results for shiny 315 | 316 | results_auc = NULL 317 | names = load("results_auc.RData") 318 | for(i in seq_along(names)) 319 | results_auc[[i]] = get(names[i]) 320 | names(results_auc) = names 321 | results_accuracy = NULL 322 | names = load("results_accuracy.RData") 323 | for(i in seq_along(names)) 324 | results_accuracy[[i]] = get(names[i]) 325 | names(results_accuracy) = names 326 | results_brier = NULL 327 | names = load("results_brier.RData") 328 | for(i in seq_along(names)) 329 | results_brier[[i]] = get(names[i]) 330 | names(results_brier) = names 331 | 332 | results_all = list(auc = results_auc, accuracy = results_accuracy, brier = results_brier) 333 | save(results_all, file = "./shiny/results_all.RData") 334 | 335 | # Annex 336 | 337 | lrn.regr = makeLearner("regr.ksvm") 338 | fit.regr = train(lrn.regr, bh.task) 339 | fa = generateFunctionalANOVAData(fit.regr, bh.task, "lstat", depth = 1, fun = median) 340 | 341 | 342 | # Defaults normalized 
(check if results differ substantially) 343 | 344 | results_normalized = list() 345 | k = 1 346 | for(i in seq_along(learner.names)) { 347 | print(i) 348 | load(paste0("surrogates_", measures[k], "_", i, ".RData")) 349 | # Defaults with normalization 350 | default = calculateDefault(surrogates, normalization = TRUE) 351 | # Tunability hyperparameter specific 352 | optimumHyperpar = calculateDatasetOptimum(surrogates, default, hyperpar = "one", n.points = 100000) 353 | # Tunability for two hyperparameters 354 | #optimumTwoHyperpar = calculateDatasetOptimum(surrogates, default, hyperpar = "two", n.points = 10000) 355 | # Tuning space 356 | results_normalized[[i]] = list(default = default, optimumHyperpar = optimumHyperpar) 357 | gc() 358 | save(results_normalized, file = paste0("./results_normalized_", measures[k], ".RData")) 359 | } 360 | 361 | 362 | for(i in 1:6){ 363 | print(learner.names[i]) 364 | print(rbind(results_normalized[[i]]$default$default, results[[i]]$default$default)) 365 | print("Tunability") 366 | print(mean(calculateTunability(results_normalized[[i]]$default, results[[i]]$optimum))) 367 | print(mean(calculateTunability(results[[i]]$default, results[[i]]$optimum))) 368 | print("Tunability parameter") 369 | print(rbind(colMeans(calculateTunability(results_normalized[[i]]$default, results[[i]]$optimumHyperpar)), 370 | colMeans(calculateTunability(results[[i]]$default, results_normalized[[i]]$optimumHyperpar)))) 371 | print("--------------------------------------------------------------------------------------------------------") 372 | } 373 | 374 | # xtable version 375 | library(xtable) 376 | for(i in 1:6){ 377 | defs = rbind(results_normalized[[i]]$default$default, results[[i]]$default$default) 378 | rownames(defs) = c("norm", "mean") 379 | colnames(defs) = substr(colnames(defs), 1, 5) 380 | print(xtable(defs, caption = paste(learner.names[i], "defaults"), digits = 3)) 381 | tuna = rbind( 382 | c(mean(calculateTunability(results_normalized[[i]]$default, results[[i]]$optimum)), 383 | colMeans(calculateTunability(results_normalized[[i]]$default, results[[i]]$optimumHyperpar))), 384 | c(mean(calculateTunability(results[[i]]$default, results[[i]]$optimum)), 385 | colMeans(calculateTunability(results[[i]]$default, results[[i]]$optimumHyperpar))) 386 | ) 387 | colnames(tuna)[1] = "all" 388 | colnames(tuna) = substr(colnames(tuna), 1, 5) 389 | rownames(tuna) = c("norm", "mean") 390 | print(xtable(tuna, caption = paste(learner.names[i], "tunability"), digits = 3)) 391 | } 392 | 393 | # rpart looks very strange!
394 | -------------------------------------------------------------------------------- /results_accuracy.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PhilippPro/tunability/9fc4fa84f500b1eec446ad50860901c315680a76/results_accuracy.RData -------------------------------------------------------------------------------- /results_auc.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PhilippPro/tunability/9fc4fa84f500b1eec446ad50860901c315680a76/results_auc.RData -------------------------------------------------------------------------------- /results_brier.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PhilippPro/tunability/9fc4fa84f500b1eec446ad50860901c315680a76/results_brier.RData -------------------------------------------------------------------------------- /shiny/app.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(plotly) 3 | # library(tidyr) 4 | library(shiny) 5 | library(shinyjs) 6 | #library(shinydashboard) 7 | # library(shinyBS) 8 | library(data.table) 9 | library(DT) 10 | library(ParamHelpers) 11 | library(mlr) 12 | #library(devtools) 13 | library(checkmate) 14 | library(glmnet) 15 | library(kknn) 16 | library(rpart) 17 | library(e1071) 18 | library(ranger) 19 | library(xgboost) 20 | 21 | server = function(input, output) { 22 | 23 | load("app_data.RData") 24 | source("helpers.R") 25 | 26 | rv <- reactiveValues() 27 | rv$setupComplete <- FALSE 28 | 29 | ## simulate data load 30 | observe({ 31 | if(input$btn_data){ 32 | ## set my condition to TRUE 33 | rv$setupComplete <- TRUE 34 | } 35 | 36 | ## the conditional panel reads this output 37 | output$setupComplete <- reactive({ 38 | return(rv$setupComplete) 39 | }) 40 | outputOptions(output, 'setupComplete', suspendWhenHidden=FALSE) 41 | }) 42 | 43 | 44 | measure.names = names(app_data) 45 | learner.names = names(app_data$auc$results) 46 | 47 | output$measureAll = renderUI({ 48 | selectInput('meas', 'Performance measure', measure.names, selected = measure.names[1], multiple = FALSE) 49 | }) 50 | 51 | output$algorithm = renderUI({ 52 | selectInput('algo', 'Algorithm', learner.names, selected = learner.names[1], multiple = FALSE) 53 | }) 54 | 55 | output$defaultchoice <- renderUI({ 56 | selectInput('defaultchoice', 'Defaults', c("Optimal defaults", "Package defaults"), selected = "Optimal defaults", multiple = FALSE) 57 | }) 58 | 59 | bmrInput = reactive({ 60 | inputi = "glmnet" 61 | if(!is.null(input$algo)) 62 | inputi = input$algo 63 | measur = "auc" 64 | if(!is.null(input$meas)) 65 | measur = input$meas 66 | app_data[[which(measure.names == measur)]]$surrogate[[which(learner.names == inputi)]] 67 | }) 68 | 69 | bmrAggr = reactive({ 70 | perfs = data.table(bmrInput())[, -"task.id"] 71 | # delete datasets with missing results 72 | error.results = which(is.na(perfs$kendalltau.test.mean) | perfs$rsq.test.mean < 0) 73 | error.results = unlist(lapply(unique(floor(error.results/5 - 0.0001)), 74 | function(x) x + seq(0.2, 1, 0.2)))*5 75 | if(length(error.results)!=0) 76 | perfs = perfs[-error.results,] 77 | perfs = data.frame(perfs[, lapply(list(mse = mse.test.mean, rsq = rsq.test.mean, kendalltau = kendalltau.test.mean, 78 | spearmanrho = spearmanrho.test.mean),function(x) mean(x)), by = "surrogate"]) 79 | perfs 80 | }) 81 | 82 | output$logscale = 
renderUI({ 83 | selectInput('logscale', 'Logarithmic scale', c("No", "Yes"), selected = "No", multiple = FALSE) 84 | }) 85 | 86 | output$bmr_measure = renderUI({ 87 | measures = gsub("\\..*","",colnames(bmrInput())[-c(1,2)]) 88 | selectInput('bmr_measure', 'Performance measure', measures, selected = measures[2], multiple = FALSE) 89 | }) 90 | 91 | output$bmr_result = renderTable({ 92 | bmrAggr() 93 | }, digits = 5) 94 | 95 | #df_test = app_data$auc$surrogate$glmnet 96 | #plotBMRSummary 97 | #bmr_measure = gsub("\\..*","",colnames(app_data$auc$surrogate$glmnet)[-c(1,2)]) 98 | #learner_ids = gsub("\\..*","",learner.names) 99 | url <- a("Bot Paper", href="https://arxiv.org/pdf/1806.10961.pdf") 100 | 101 | output$tab <- renderUI({ 102 | tagList("In the plot below the performances of the surrogate models on the different datasets is depicted. 103 | The data_id's correspond to the dataset ids of OpenML, for details see also the", url, ".") 104 | }) 105 | 106 | output$plot1 = renderPlot({ 107 | sel_meas = paste0(input$bmr_measure, ".test.mean") 108 | p = ggplot(bmrInput(), aes_string(x = sel_meas, y = "task.id", col = "surrogate", shape = "surrogate")) 109 | p = p + geom_point(size = 4L, position = position_jitter(width = 0, height = 0.05)) 110 | p = p + scale_shape_manual(values = rep(19, length(learner.names))) 111 | p = p + ylab("Data_id") 112 | p = p + xlab(sel_meas) 113 | if (ifelse(!is.null(input$logscale), input$logscale == "Yes" , TRUE)) { 114 | #if (input$logscale == "Yes") { 115 | p + scale_x_log10() + ggtitle("Performance on datasets") 116 | } else { 117 | p + ggtitle("Performance on datasets") 118 | } 119 | }) 120 | 121 | output$task = renderUI({ 122 | selectInput('taski', 'Task', c("classification", "regression"), selected = "classification", multiple = FALSE) 123 | }) 124 | 125 | 126 | 127 | resultsInput = reactive({ 128 | if (input$defaultchoice == "Optimal defaults") { 129 | app_data[[input$meas]]$results[[input$algo]] 130 | } else { 131 | app_data[[input$meas]]$resultsPackageDefaults[[input$algo]] 132 | } 133 | }) 134 | 135 | output$defaults = renderTable({ 136 | resultsInput()$default$default 137 | }, digits = 3) 138 | 139 | overall = reactive({ 140 | calculateTunability(resultsInput()$default, app_data[[input$meas]]$results[[input$algo]]$optimum) 141 | }) 142 | 143 | tunabilityValues = reactive({ 144 | calculateTunability(resultsInput()$default, resultsInput()$optimumHyperpar) 145 | }) 146 | 147 | tunabilityValuesMean = reactive({ 148 | colMeans(calculateTunability(resultsInput()$default, resultsInput()$optimumHyperpar)) 149 | }) 150 | 151 | output$scaled = renderUI({ 152 | selectInput('scaled', 'Scaled (per Dataset)', c(TRUE, FALSE), selected = FALSE, multiple = FALSE) 153 | }) 154 | 155 | output$overallTunability = renderTable({ 156 | if (input$scaled) { 157 | mean(overall()/overall(), na.rm = TRUE) 158 | } else { 159 | mean(overall()) 160 | } 161 | }, colnames = FALSE, digits = 3) 162 | 163 | output$tunability = renderTable({ 164 | if (input$scaled) { 165 | data.frame(t(colMeans(tunabilityValues()/overall(), na.rm = T))) 166 | } else { 167 | data.frame(t(tunabilityValuesMean())) 168 | } 169 | }, digits = 3) 170 | 171 | output$plot3 = renderPlotly({ 172 | dataf = data.frame(overall(), tunabilityValues()) 173 | colnames(dataf)[1] = "overall" 174 | column.names = colnames(dataf) 175 | dataf = stack(dataf) 176 | dataf$ind = factor(dataf$ind, column.names) 177 | ggplot(dataf, aes(x = ind, y = values)) + geom_boxplot() + coord_cartesian(ylim = c(input$yrange[1],input$yrange[2])) + 
178 | ylab("tunability per dataset") + xlab("hyperparameter") # for the x axis label # + ggtitle(substring(learner.names[i], 13)) 179 | }) 180 | 181 | output$visual = renderUI({ 182 | selectInput('visual', 'Visualization of the tunability', c("Density", "Histogram"), selected = "Density", multiple = FALSE) 183 | }) 184 | 185 | output$visual3 = renderUI({ 186 | selectInput('visual3', 'Hyperparameter', c(names(tunabilityValuesMean())), selected = "All", multiple = FALSE) 187 | }) 188 | 189 | output$plot4 = renderPlotly({ 190 | dataf = data.frame(app_data[[input$meas]]$results[[input$algo]]$optimum$par.sets[,input$visual3]) 191 | name = input$visual3 192 | num = is.numeric(dataf[,1]) 193 | 194 | inputi = "glmnet" 195 | 196 | if(!is.null(input$algo)) 197 | inputi = input$algo 198 | 199 | if(num) { 200 | dataf = dataf[dataf[,1]!=-11, , drop = F] 201 | learner.i = which(learner.names == inputi) 202 | TRAFO = is.null(lrn.par.set[[learner.i]][[2]]$pars[[name]]$trafo) 203 | if(TRAFO) { 204 | ggplot(data=dataf, aes(dataf[,1])) + geom_histogram(aes(y=..density..), bins = input$nrbin, col = "black", fill = "white") + xlim(range(dataf[,1])) + xlab(name) 205 | } else { 206 | ggplot(data=dataf, aes(dataf[,1])) + geom_histogram(aes(y=..density..), bins = input$nrbin, col = "black", fill = "white") + xlim(range(dataf[,1])) + xlab(paste(name, "(log-scale)")) + scale_x_continuous(trans = "log10") 207 | } 208 | } else { 209 | ggplot(data=dataf, aes(dataf[,1])) + geom_bar(aes(y = (..count..)/sum(..count..)), col = "black", fill = "white") + 210 | xlab(name) + ylab("relative frequency") 211 | } 212 | }) 213 | 214 | output$quantile = renderUI({ 215 | numericInput('quantile', 'Quantile for tuning space calculation', 0.1, min = 0, max = 1) 216 | }) 217 | 218 | tuningSpace = reactive({ 219 | tab = calculateTuningSpace(app_data[[input$meas]]$results[[input$algo]]$optimum, quant = input$quantile) 220 | tab$numerics = cbind(Quantile = rownames(tab$numerics), tab$numerics) 221 | tab 222 | }) 223 | 224 | output$tuningSpaceNumerics = renderTable({ 225 | tuningSpace()$numerics 226 | }, rownames = FALSE, digits = 3) 227 | 228 | output$tuningSpaceFactors = renderTable({ 229 | tuningSpace()$factors 230 | }) 231 | 232 | output$combi = renderUI({ 233 | selectInput('combination', 'Measures', 234 | c("Tunability", "Joint gain", "Interaction effect"), 235 | selected = "Tunability", multiple = FALSE) 236 | }) 237 | 238 | output$combiTable <- renderTable({ 239 | tab = colMeans(resultsInput()$optimumTwoHyperpar$optimum, dims = 1, na.rm = TRUE) - mean(resultsInput()$default$result) 240 | if(input$combination == "Tunability") { 241 | diag(tab) = tunabilityValuesMean() 242 | } else { 243 | if(input$combination == "Interaction effect") { 244 | tab = tab - outer(tunabilityValuesMean(), tunabilityValuesMean(), '+') 245 | } else { 246 | tab = tab - outer(tunabilityValuesMean(), tunabilityValuesMean(), pmax) 247 | } 248 | } 249 | colnames(tab) = rownames(tab) = names(tunabilityValuesMean()) 250 | tab 251 | }, rownames = TRUE, digits = 4) 252 | 253 | 254 | output$par.set = renderUI({ 255 | tagList(makeLearnerParamUI(app_data[[input$meas]]$results[[input$algo]])) 256 | }) 257 | 258 | 259 | output$performanceHypParSetting = renderTable({ 260 | var_names = colnames(app_data[[input$meas]]$results[[input$algo]]$optimum$par.sets) 261 | par.set = numeric() 262 | for(i in 1:length(var_names)) { 263 | par.set[i] = input[[var_names[i]]] 264 | } 265 | par.set 266 | #calculatePerformance(surrogates_all[[input$algo]], par.set)$preds 267 | }) 268 | # 
performanceHypParSetting = reactive({ 269 | # calculatePerformance(surrogates_all[[input$algo]], par.set) 270 | # }) 271 | 272 | } 273 | 274 | makeLearnerParamUI = function(results_algo) { 275 | par.set = results_algo$optimum$par.sets 276 | inp = list() 277 | for(i in 1:ncol(par.set)) { 278 | par.type = class(par.set[,i]) 279 | par.id = names(par.set)[i] 280 | if (par.type == "numeric") 281 | inp[[i]] = numericInput(par.id, par.id, results_algo$default$default[i]) 282 | if (par.type == "factor") 283 | inp[[i]] = selectInput(par.id, par.id, choices = unique(par.set[,i]), selected = results_algo$default$default[i]) 284 | } 285 | inp 286 | } 287 | 288 | ui = fluidPage( 289 | conditionalPanel(condition = "!output.setupComplete", 290 | column(12, h2(p("Tunability Shiny App"))), 291 | column(12, h5(p("This app contains additional material for the paper 'Tunability: Importance of Hyperparameters of Machine Learning Algorithms'. 292 | For starting the app, just click the button:"))), 293 | column(12, align = "center", actionButton(inputId = "btn_data", label = "Start the shiny app!", width = '400px', 294 | style="color: #fff; background-color: #337ab7; border-color: #2e6da4")), 295 | br(), 296 | hr(), 297 | br(), 298 | hr(), 299 | br(), 300 | hr(), 301 | column(10, h3(p("Tunability: Importance of Hyperparameters of Machine Learning Algorithms"))), 302 | br(), 303 | br(), 304 | hr(), 305 | column(10, h5(p("Authors: Philipp Probst, Bernd Bischl, Anne-Laure Boulesteix"))), 306 | br(), 307 | hr(), 308 | column(10, h4(p("Paper Abstract"))), 309 | column(10, h5(p("Modern supervised machine learning algorithms involve hyperparameters that have to be set before running them. 310 | Options for setting hyperparameters are default values from the software package, manual configuration by the user or configuring them for optimal predictive performance by a tuning procedure. 311 | The goal of this paper is two-fold. 312 | Firstly, we formalize the problem of tuning from a statistical point of view, define data-based defaults and suggest general measures quantifying the tunability of hyperparameters of algorithms. 313 | Secondly, we conduct a large-scale benchmarking study based on 38 datasets from the OpenML platform and six common machine learning algorithms. 314 | We apply our measures to assess the tunability of their parameters. 315 | Our results yield default values for hyperparameters and enable users to decide whether it is worth conducting a possibly time consuming tuning strategy, to focus on the most important hyperparameters and to choose adequate hyperparameter spaces for tuning. "))) 316 | ), 317 | conditionalPanel(condition = "output.setupComplete", 318 | titlePanel("Tunability Shiny App"), 319 | hr(), 320 | wellPanel(fluidRow(column(12, h4("General settings"))), 321 | fluidRow(column(4,uiOutput("measureAll")),column(4,uiOutput("algorithm")),column(4,uiOutput("defaultchoice")))), 322 | hr(), 323 | fluidRow(column(12, h4(p("(around 10 seconds loading time for each panel)", style = "color:blue")))), 324 | tabsetPanel( 325 | tabPanel("Surrogate models comparison", 326 | fluidRow(column(12, h2("Comparison of the quality of surrogate models")), 327 | column(12, h5("The calculation of the tunability is based on the surrogate models. 328 | Hence, it is important to evaluate the performance of the surrogate model. 329 | In this panel five different surrogate models are compared. 
330 |             For the final calculation of the tunability measures the ranger surrogate model is chosen because it provides good and stable results.
331 |             See also section 5.1 in the paper."))),
332 |         hr(),
333 |         fluidRow(column(12, h4("Average performances of the surrogate models on the different datasets")),
334 |           column(12, tableOutput("bmr_result"))),
335 |         hr(),
336 |         fluidRow(column(12, h4("Distribution of the surrogate model performances on the different datasets")),
337 |           column(12, uiOutput("tab")),
338 |           column(6, uiOutput("logscale")), column(6, uiOutput("bmr_measure")),
339 |           plotOutput("plot1", width = "95%"))#,
340 |           #plotOutput("plot2", width = "95%")
341 |       ),
342 |
343 |       tabPanel("Defaults and tunability",
344 |         fluidRow(column(12, h2("Defaults and tunability")),
345 |           column(12, h5("In this panel the defaults and the corresponding tunabilities are depicted. For details see sections 3.2, 3.3, 3.4, 5.2 and 5.3 in the paper."))),
346 |         hr(),
347 |         fluidRow(column(12, h4("Defaults")),
348 |           column(12, h5("This table contains the default hyperparameter values that are used for the calculation of the tunability values.
349 |             The optimal defaults were calculated by choosing the hyperparameter setting with the best average performance over all datasets.
350 |             The package defaults are given by the corresponding R-packages.")),
351 |           column(12, tableOutput("defaults"))),
352 |         hr(),
353 |         fluidRow(
354 |           column(12, h4("Tunability")),
355 |           column(12, h5("The tunability values are calculated by taking the best performance of a hyperparameter setting on a
356 |             dataset (overall and for single hyperparameters) and subtracting the performance of the default hyperparameter setting.")),
357 |           column(12, h4("Mean tunability over the datasets")),
358 |           column(12, h5("For the following table the mean of the tunabilities over all the datasets is taken to provide one measure of tunability for each parameter.
359 |             The scaled version divides the tunability per hyperparameter by the overall tunability of the algorithm per dataset and takes the mean afterwards.")),
360 |           column(12, fluidRow(
361 |             column(1, h5("Overall mean tunability"), tableOutput("overallTunability")),
362 |             column(11, h5("Hyperparameters"), tableOutput("tunability"))
363 |           )),
364 |           column(12, uiOutput("scaled"))
365 |         ),
366 |         fluidRow(column(12, h4("Boxplot of tunability values per dataset")),
367 |           br(),
368 |           br(),
369 |           column(12, h4("Tunability values per dataset")),
370 |           plotlyOutput("plot3", width = "95%", inline = F),
371 |           sliderInput("yrange", "Y-axis limits:", min = 0, max = 0.5, value = c(0, 0.06), width = "800px")
372 |       )),
373 |       tabPanel("Combined tunability",
374 |         fluidRow(column(12, h2("Tunability of hyperparameter combinations and joint gain")),
375 |           column(12, h5("In this panel the tunabilities of hyperparameter combinations and joint gains can be seen. For details see sections 3.5 and 5.4 in the paper."))),
376 |         hr(),
377 |         fluidRow(column(12, uiOutput("combi")),
378 |           column(12, h4("Combined tunability and interaction effects")),
379 |           column(12, h5("The tunability values of the single hyperparameters are depicted on the diagonal, the combined tunabilities on the upper right of
380 |             the table.
For details of the calculation (also of the joint gain) see section 3.5 in the paper.")), 381 | column(12, tableOutput("combiTable"))) 382 | #) 383 | #) 384 | ), 385 | tabPanel("Tuning space", 386 | fluidRow(column(12, h2("Hyperparameter ranges for tuning and priors")), 387 | column(12, h5("In this panel the optimal hyperparameter ranges are depicted. For details see sections 3.6 and 5.5 in the paper."))), 388 | hr(), 389 | fluidRow(column(12, h4("Tuning Space"), 390 | column(12, h5("The tuning space is calculated by taking the best hyperparameters on each dataset and calculating the quantiles of these.")), 391 | column(12, uiOutput("quantile")), 392 | column(12, "Numerics", align="left", tableOutput("tuningSpaceNumerics")), 393 | column(12, "Factors", align="left", tableOutput("tuningSpaceFactors")) 394 | )), 395 | hr(), 396 | fluidRow(column(12, h4("Histogram of best hyperparameter on each of the datasets (possible prior for tuning)")), 397 | column(12, uiOutput("visual3"))), 398 | plotlyOutput("plot4", width = "95%", inline = F), 399 | sliderInput("nrbin", "Number of bins:", min = 0, max = 50, value = c(6), width = "800px") 400 | #fluidRow(column(6, uiOutput("visual")), 401 | # column(6, uiOutput("visual2"))) 402 | #plotlyOutput("plot5", inline = F), 403 | 404 | # conditionalPanel( 405 | # condition = "input.visual == 'Histogram'", 406 | # sliderInput("bins", "Number of bins:", min = 1, max = 50, value = 30) 407 | # )) 408 | ) 409 | ) 410 | ) 411 | ) 412 | 413 | 414 | 415 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /shiny/app_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PhilippPro/tunability/9fc4fa84f500b1eec446ad50860901c315680a76/shiny/app_data.RData -------------------------------------------------------------------------------- /shiny/helpers.R: -------------------------------------------------------------------------------- 1 | getSimpleLearners = function(){ 2 | # Simple learner param set 3 | simple.lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.glmnet", predict.type = "prob"), 4 | param.set = makeParamSet( 5 | makeNumericParam("alpha", lower = 0, upper = 1, default = 1), 6 | makeNumericVectorParam("lambda", len = 1L, lower = -10, upper = 10, default = 0 ,trafo = function(x) 2^x))) 7 | 8 | simple.lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.rpart", predict.type = "prob"), 9 | param.set = makeParamSet( 10 | makeNumericParam("cp", lower = 0, upper = 1, default = 0.01), 11 | makeIntegerParam("maxdepth", lower = 1, upper = 30, default = 30), 12 | makeIntegerParam("minbucket", lower = 1, upper = 60, default = 1), 13 | makeIntegerParam("minsplit", lower = 1, upper = 60, default = 20)), 14 | lrn.ps.sets = simple.lrn.par.set) 15 | 16 | return(simple.lrn.par.set) 17 | } 18 | 19 | getMultipleLearners = function(){ 20 | simple.lrn.par.set = getSimpleLearners() 21 | 22 | # increase to a general param set 23 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.kknn", predict.type = "prob"), 24 | param.set = makeParamSet( 25 | makeIntegerParam("k", lower = 1, upper = 30)), 26 | lrn.ps.sets = simple.lrn.par.set) 27 | 28 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.svm", predict.type = "prob"), 29 | param.set = makeParamSet( 30 | makeDiscreteParam("kernel", values = c("linear", "polynomial", "radial")), 31 | makeNumericParam("cost", lower = -10, upper = 10, trafo = function(x) 2^x), 32 | 
makeNumericParam("gamma", lower = -10, upper = 10, trafo = function(x) 2^x, requires = quote(kernel == "radial")), 33 | makeIntegerParam("degree", lower = 2, upper = 5, requires = quote(kernel == "polynomial"))), 34 | lrn.ps.sets = lrn.par.set) 35 | 36 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.ranger", predict.type = "prob"), 37 | param.set = makeParamSet( 38 | makeIntegerParam("num.trees", lower = 1, upper = 2000), 39 | makeLogicalParam("replace"), 40 | makeNumericParam("sample.fraction", lower = 0.1, upper = 1), 41 | makeNumericParam("mtry", lower = 0, upper = 1), 42 | makeLogicalParam(id = "respect.unordered.factors"), 43 | makeNumericParam("min.node.size", lower = 0, upper = 1)), 44 | lrn.ps.sets = lrn.par.set) 45 | 46 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.xgboost", predict.type = "prob"), 47 | param.set = makeParamSet( 48 | makeIntegerParam("nrounds", lower = 1, upper = 5000), 49 | makeNumericParam("eta", lower = -10, upper = 0, trafo = function(x) 2^x), 50 | makeNumericParam("subsample",lower = 0.1, upper = 1), 51 | makeDiscreteParam("booster", values = c("gbtree", "gblinear")), 52 | makeIntegerParam("max_depth", lower = 1, upper = 15, requires = quote(booster == "gbtree")), 53 | makeNumericParam("min_child_weight", lower = 0, upper = 7, requires = quote(booster == "gbtree"), trafo = function(x) 2^x), 54 | makeNumericParam("colsample_bytree", lower = 0, upper = 1, requires = quote(booster == "gbtree")), 55 | makeNumericParam("colsample_bylevel", lower = 0, upper = 1, requires = quote(booster == "gbtree")), 56 | makeNumericParam("lambda", lower = -10, upper = 10, trafo = function(x) 2^x), 57 | makeNumericParam("alpha", lower = -10, upper = 10, trafo = function(x) 2^x)), 58 | lrn.ps.sets = lrn.par.set) 59 | 60 | return(lrn.par.set) 61 | } 62 | 63 | makeLrnPsSets = function(learner, param.set, lrn.ps.sets = NULL, 64 | id = paste0(learner$id, ".set"), overwrite = FALSE) { 65 | 66 | assertClass(learner, "Learner") 67 | assertClass(param.set, "ParamSet") 68 | par.match = names(param.set$pars) %in% names(learner$par.set$pars) 69 | if(all(par.match)){ 70 | ls = list(learner = learner, param.set = param.set) 71 | } else { 72 | stop(paste("The following parameters in param.set are not included in learner:", 73 | paste(names(param.set$pars[par.match == FALSE]), collapse = ", "))) 74 | } 75 | 76 | if(is.null(lrn.ps.sets)){ 77 | lrn.ps.sets = list() 78 | lrn.ps.sets[[id]] = ls 79 | attr(lrn.ps.sets, "class") = "LrnPsSet" 80 | } else { 81 | assertClass(lrn.ps.sets, "LrnPsSet") 82 | 83 | if(id %in% names(lrn.ps.sets) & overwrite == FALSE){ 84 | stop("tune.pair already contains id: \"", id, "\". 
Please specify a new id or set overwrite = TRUE.")
85 |     } else {
86 |       lrn.ps.sets[[id]] = ls
87 |     }
88 | }
89 |
90 | return(lrn.ps.sets)
91 | }
92 |
93 | lrn.par.set = getMultipleLearners()
94 |
95 | calculateTunability = function(default, optimumHyperpar, optimumTwoHyperpar = NULL) {
96 |   optimumHyperpar$optimum - default$result
97 | }
98 |
99 | calculateTuningSpace = function(optimum, quant) {
100 |   space = data.frame(row.names = c(quant, 1-quant))
101 |   space2 = list()
102 |   par.sets = optimum$par.sets
103 |   for(i in 1:ncol(par.sets)) {
104 |     if(is.numeric(par.sets[,i])) {
105 |       par.sets[par.sets[,i]==-11,i] = NA
106 |       space = cbind(space, quantile(par.sets[,i], c(quant, 1-quant), na.rm = TRUE))
107 |       colnames(space)[ncol(space)] = names(par.sets)[i]
108 |     }
109 |     if(is.factor(par.sets[,i]) | is.logical(par.sets[,i])) {
110 |       logic = table(par.sets[,i]) / length(par.sets[,i]) > quant
111 |       space2 = c(space2, list(names(table(par.sets[,i]))[logic]))
112 |       names(space2)[length(space2)] = names(par.sets)[i]
113 |     }
114 |   }
115 |   return(list(numerics = space, factors = space2))
116 | }
117 |
--------------------------------------------------------------------------------
/shiny/preproc.R:
--------------------------------------------------------------------------------
1 | setwd("/nfsmb/koll/probst/Paper/Exploration_of_Hyperparameters/tunability/shiny")
2 | load("results_all.RData")
3 |
4 | # Extract only the absolutely necessary information
5 | app_data = list()
6 |
7 | measures = names(results_all)
8 | classifiers = names(results_all$auc$bmr_surrogate)
9 |
10 | for(i in measures) {
11 |   for(j in classifiers) {
12 |     app_data[[i]]$surrogate[[j]] = getBMRAggrPerformances(results_all[[i]]$bmr_surrogate[[j]], as.df = TRUE)
13 |     app_data[[i]]$results = results_all[[i]]$results
14 |     app_data[[i]]$resultsPackageDefaults = results_all[[i]]$resultsPackageDefaults
15 |     app_data[[i]]$results_cv = results_all[[i]]$results_cv
16 |     app_data[[i]]$lrn.par.set = results_all[[i]]$lrn.par.set
17 |   }
18 | }
19 |
20 | for(i in measures) { # drop the learner-name prefix (first 12 characters) from the stored names
21 |   names(app_data[[i]]$surrogate) = substring(names(app_data[[i]]$surrogate), 13)
22 |   names(app_data[[i]]$results) = substring(names(app_data[[i]]$results), 13)
23 |   names(app_data[[i]]$resultsPackageDefaults) = substring(names(app_data[[i]]$resultsPackageDefaults), 13)
24 |   names(app_data[[i]]$results_cv) = substring(names(app_data[[i]]$results_cv), 13)
25 |   #names(app_data[[1]]$lrn.par.set)
26 | }
27 |
28 | for(i in measures) { # rename the performance column and shorten the surrogate learner names (drop the first 5 characters)
29 |   for(j in classifiers) {
30 |     colnames(app_data[[i]]$surrogate[[j]])[2] = "surrogate"
31 |     levels(app_data[[i]]$surrogate[[j]]$surrogate) = substring(levels(app_data[[i]]$surrogate[[j]]$surrogate), 6)
32 |   }
33 | }
34 |
35 | save(app_data, file = "app_data.RData")
36 |
37 | # The auc and accuracy surrogate results do not differ!? -> recompute -> only the sixth one for accuracy?, also in the paper!
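# Quick check for the note above (an added sketch, not part of the original preprocessing):
# compare the extracted surrogate performance tables for auc and accuracy. It assumes that
# both "auc" and "accuracy" are element names of app_data; if they are not, nothing is printed.
if (all(c("auc", "accuracy") %in% names(app_data))) {
  for (j in names(app_data$auc$surrogate)) {
    cat(j, "- identical auc/accuracy surrogate table:",
        identical(app_data$auc$surrogate[[j]], app_data$accuracy$surrogate[[j]]), "\n")
  }
}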
38 | # resultsPackageDefaults is missing for the AUC
39 |
--------------------------------------------------------------------------------
/shiny/results_all.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PhilippPro/tunability/9fc4fa84f500b1eec446ad50860901c315680a76/shiny/results_all.RData
--------------------------------------------------------------------------------
/shiny/rsconnect/shinyapps.io/philipppro/tunability.dcf:
--------------------------------------------------------------------------------
1 | name: tunability
2 | title: tunability
3 | username:
4 | account: philipppro
5 | server: shinyapps.io
6 | hostUrl: https://api.shinyapps.io/v1
7 | appId: 299916
8 | bundleId: 1845757
9 | url: https://philipppro.shinyapps.io/tunability/
10 | when: 1548856147.23563
11 | asMultiple: FALSE
12 | asStatic: FALSE
13 | ignoredFiles: preproc.R|results_all.RData|old/app_old.R|old/old_app.R
14 |
--------------------------------------------------------------------------------
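For readers who want to recompute the numbers shown in the app outside of Shiny, the following is a minimal sketch of the calculation behind the "Combined tunability" table (see output$combiTable in shiny/app.R). It assumes that shiny/app_data.RData has been downloaded to the working directory and that "auc" and "glmnet" are valid element names (these are the defaults used in the app); the field names optimumHyperpar and optimumTwoHyperpar are inferred from the function signatures above and may differ in the actual RData file.

# Minimal sketch: recompute mean tunability, combined tunability, joint gain and
# interaction effects for one algorithm and one measure, mirroring output$combiTable.
load("app_data.RData")

# calculateTunability() as defined in shiny/helpers.R
calculateTunability = function(default, optimumHyperpar, optimumTwoHyperpar = NULL) {
  optimumHyperpar$optimum - default$result
}

res = app_data[["auc"]]$results[["glmnet"]]

# Mean tunability of each single hyperparameter, averaged over the datasets
tun = colMeans(calculateTunability(res$default, res$optimumHyperpar), na.rm = TRUE)

# Mean tunability of each hyperparameter pair (upper triangle of the table)
tab = colMeans(res$optimumTwoHyperpar$optimum, dims = 1, na.rm = TRUE) - mean(res$default$result)
colnames(tab) = rownames(tab) = names(tun)

joint.gain  = tab - outer(tun, tun, pmax)  # gain of the pair over the better single hyperparameter
interaction = tab - outer(tun, tun, "+")   # deviation from additivity of the single tunabilities

diag(tab) = tun                            # single-hyperparameter tunabilities on the diagonal
round(tab, 4)

The diagonal of tab then corresponds to the "Mean tunability over the datasets" table, while joint.gain and interaction correspond to the "Joint gain" and "Interaction effect" options of the measure selector in the "Combined tunability" panel.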