├── DESCRIPTION ├── NAMESPACE ├── R ├── calculateTunabilityMeasures.R ├── calculateTunabilityMeasuresPackageDefault.R ├── calculateTuningSpace.R ├── compareSurrogateModels.R ├── helpers.R └── makeSurrogateModels.R ├── README.md ├── main.R ├── results_accuracy.RData ├── results_auc.RData ├── results_brier.RData └── shiny ├── app.R ├── app_data.RData ├── helpers.R ├── preproc.R ├── results_all.RData └── rsconnect └── shinyapps.io └── philipppro └── tunability.dcf /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tunability 2 | Title: Calculate tunability measures and hyperparameter spaces for tuning 3 | Version: 0.0.0.1 4 | Authors@R: person("Philipp", "Probst", email = "philipp_probst@gmx.de", role = c("aut", "cre")) 5 | Description: Calculate tunability measures and hyperparameter spaces for tuning based on results of the OpenML bot. 6 | License: GPL-2 7 | Encoding: UTF-8 8 | LazyData: true 9 | Depends: 10 | R (>= 3.3.3), 11 | mlr (>= 2.10), 12 | ParamHelpers (>= 1.8), 13 | OpenML (>= 1.2), 14 | batchtools (>= 0.9.0), 15 | BBmisc (>= 1.10), 16 | dplyr (>= 0.5.0), 17 | checkmate (>= 1.8.0), 18 | tidyr (>= 0.6.1), 19 | stringi (>= 1.1.2) 20 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: fake comment so roxygen2 overwrites silently. 2 | exportPattern("^[^\\.]") 3 | -------------------------------------------------------------------------------- /R/calculateTunabilityMeasures.R: -------------------------------------------------------------------------------- 1 | #' Calculate default hyperparameter setting 2 | #' @param surrogates Surrogate models 3 | calculateDefault = function(surrogates, n.points = 100000, normalization = FALSE) { 4 | surr = surrogates$surrogates 5 | param.set = surrogates$param.set 6 | rnd.points = generateRandomDesign(n.points, param.set, trafo = TRUE) 7 | rnd.points = deleteNA(rnd.points) 8 | 9 | preds = matrix(NA, nrow(rnd.points), length(surr)) 10 | for(i in seq_along(surr)) { 11 | print(paste("surrogate predict: task", i, "of", length(surr))) 12 | preds[, i] = predict(surr[[i]], newdata = rnd.points)$data$response 13 | } 14 | # Best default in general 15 | if(normalization == FALSE) { 16 | average_preds = apply(preds, 1, mean) 17 | } else { 18 | new_preds = preds 19 | for(i in 1:ncol(preds)) { 20 | new_preds[, i] = scale(preds[, i]) 21 | } 22 | average_preds = apply(new_preds, 1, mean) 23 | } 24 | 25 | best = which(average_preds == max(average_preds))[1] 26 | default = rnd.points[best,, drop = FALSE] 27 | rownames(default) = NULL 28 | 29 | list(default = rnd.points[best,, drop = FALSE], result = preds[best, ]) 30 | 31 | # Default calculation with LOOCV 32 | #best_i = numeric(ncol(preds)) 33 | #preds_i_best = numeric(ncol(preds)) 34 | #default.loocv = list() 35 | # Best default with LOOCV 36 | #for(i in 1:ncol(preds)) { 37 | # preds_i = rowMeans(preds[, -i]) 38 | # best_i[i] = which(preds_i == max(preds_i))[1] 39 | # preds_i_best[i] = preds[best_i[i],i] 40 | # default.loocv[[i]] = rnd.points[best_i[i],, drop = FALSE] 41 | #} 42 | 43 | #list(default = rnd.points[best,, drop = FALSE], result = preds[best, ], 44 | # default.loocv = rnd.points[best_i,], result.loocv = preds_i_best) 45 | } 46 | 47 | #' Calculate performance of hyperparameter setting 48 | #' @param par.set Parameter setting 49 | calculatePerformance = function(surrogates, default) { 50 | surr = 
surrogates$surrogates 51 | preds = numeric(length(surr)) 52 | for(i in seq_along(surr)) { 53 | print(paste("surrogate predict: task", i, "of", length(surr))) 54 | preds[i] = predict(surr[[i]], newdata = default)$data$response 55 | } 56 | # Best default 57 | list(default = default, result = preds) 58 | } 59 | 60 | #' Calculate optimal hyperparameter values for an algorithm 61 | #' @param surrogate Surrogate models 62 | #' @param hyperpar Number of hyperparameters that should be evaluated at once; Possible options: one, two and all 63 | calculateDatasetOptimum = function(surrogates, default, hyperpar = "all", n.points = 10000) { 64 | surr = surrogates$surrogates 65 | param.set = surrogates$param.set 66 | if (hyperpar == "all") { 67 | rnd.points = generateRandomDesign(n.points, param.set, trafo = TRUE) 68 | rnd.points = deleteNA(rnd.points) 69 | 70 | preds = matrix(NA, nrow(rnd.points), length(surr)) 71 | for(i in seq_along(surr)) { 72 | print(paste("surrogate predict: task", i, "of", length(surr))) 73 | preds[, i] = predict(surr[[i]], newdata = rnd.points)$data$response 74 | } 75 | # Best Value 76 | rnd.points[apply(preds, 2, which.max),] 77 | return(list(optimum = diag(preds[apply(preds, 2, which.max), ]), par.sets = rnd.points[apply(preds, 2, which.max),, drop = FALSE])) 78 | } 79 | 80 | if (hyperpar == "one") { 81 | result = matrix(NA, length(surr), length(param.set$pars)) 82 | # only do this for parameters that makes sense changing them 83 | for(i in seq_along(param.set$pars)) { 84 | print(names(param.set$pars)[i]) 85 | rnd.points1 = generateRandomDesignWithDefaults(n.points, param.set, trafo = TRUE, default, subset = names(param.set$pars)[i]) 86 | # deleteNAs 87 | rnd.points1 = deleteNA(rnd.points1) 88 | 89 | # Prediction 90 | preds = matrix(NA, nrow(rnd.points1), length(surr)) 91 | 92 | for(j in seq_along(surr)) { 93 | preds[, j] = predict(surr[[j]], newdata = rnd.points1)$data$response 94 | } 95 | # Best Value 96 | # rnd.points1[apply(preds, 2, which.max),] 97 | result[, i] = diag(preds[apply(preds, 2, which.max), ]) 98 | } 99 | result = data.frame(result) 100 | colnames(result) = names(param.set$pars) 101 | return(list(optimum = result)) 102 | } 103 | if (hyperpar == "two") { 104 | result = array(NA, dim = c(length(surr), length(param.set$pars), length(param.set$pars))) 105 | 106 | for(i in seq_along(param.set$pars)[-length(param.set$pars)]) { 107 | for(j in seq_along(param.set$pars)[(i+1):length(param.set$pars)]) { 108 | print(c(names(param.set$pars)[i], names(param.set$pars)[j])) 109 | rnd.points1 = generateRandomDesignWithDefaults(n.points, param.set, trafo = TRUE, default, subset = names(param.set$pars)[c(i,j)]) 110 | rnd.points1 = deleteNA(rnd.points1) 111 | 112 | # Prediction 113 | preds = matrix(NA, nrow(rnd.points1), length(surr)) 114 | for(k in seq_along(surr)) { 115 | preds[, k] = predict(surr[[k]], newdata = rnd.points1)$data$response 116 | } 117 | # Best Value 118 | # rnd.points1[apply(preds, 2, which.max),] 119 | result[, i, j] = diag(preds[apply(preds, 2, which.max), ]) 120 | } 121 | } 122 | return(list(optimum = result)) 123 | } 124 | } 125 | 126 | #' Calculate tunability measures 127 | #' @param surrogate Surrogate models 128 | calculateTunability = function(default, optimumHyperpar, optimumTwoHyperpar = NULL) { 129 | optimumHyperpar$optimum - default$result 130 | } 131 | 132 | deleteNA = function(task.data) { 133 | for(i in 1:ncol(task.data)) { 134 | if(is.numeric(task.data[, i])) 135 | task.data[is.na(task.data[, i]), i] = -10 - 1 136 | if(is.factor(task.data[, i])) 
{ 137 | task.data[, i] = addNA(task.data[, i]) 138 | task.data[, i] = droplevels(task.data[, i]) 139 | } 140 | if(is.logical(task.data[, i])) 141 | task.data[, i] = as.factor(task.data[, i]) 142 | } 143 | task.data 144 | } 145 | 146 | generateRandomDesignWithDefaults = function(n.points, param.set, trafo, default, subset) { 147 | rnd.points.def = default$default[rep(1, n.points), , drop = FALSE] 148 | 149 | # Required Parameters and Values 150 | reqPar = as.character(sapply(sapply(param.set$pars, `[[`, 12), `[[`, 2)) 151 | reqValue = as.character(sapply(sapply(param.set$pars, `[[`, 12), `[[`, 3)) 152 | 153 | param.set1 = param.set 154 | # If there are dependent variables include them 155 | if(any(subset %in% reqPar)) { 156 | subset2 = unique(c(subset, names(param.set$pars)[reqPar %in% subset])) 157 | } else { 158 | subset2 = subset 159 | } 160 | 161 | param.set1$pars = param.set$pars[subset2] 162 | rnd.points1 = rnd.points.def 163 | 164 | # If one parameter is required by another set it to the specific value 165 | for(m in seq_along(subset)) { 166 | if(!is.null(param.set1$pars[[m]]$requires)) { 167 | reqParSubset = as.character(param.set1$pars[[m]]$requires[2]) 168 | reqValueSubset = as.character(param.set1$pars[[m]]$requires[3]) 169 | 170 | rnd.points1[, reqParSubset] = reqValueSubset 171 | 172 | for(l in seq_along(param.set$pars)) { 173 | if(reqPar[l] == reqParSubset & reqValue[l] != reqValueSubset) 174 | rnd.points1[, l] = -10 - 1 175 | } 176 | if (!(reqParSubset %in% subset)) 177 | param.set1$pars[[m]]$requires = NULL 178 | } 179 | } 180 | 181 | rnd.points = generateRandomDesign(n.points, param.set1, trafo = TRUE) 182 | rnd.points1[, subset2] = rnd.points 183 | 184 | # Set the dependent values back to default 185 | back_to_default = subset2[!(subset2 %in% subset)] 186 | for(q in back_to_default) { 187 | if (q == "degree") { # Spezialfall svm, da wir hier keinen sinnvollen Default haben und den Package default nehmen 188 | rnd.points1[!is.na(rnd.points1[,q]), q] = 3 189 | } else { 190 | rnd.points1[!is.na(rnd.points1[,q]), q] = default$default[,q] 191 | } 192 | } 193 | # Add the default 194 | rnd.points1 = rbind(default$default, rnd.points1) 195 | rnd.points1 196 | } 197 | 198 | 199 | -------------------------------------------------------------------------------- /R/calculateTunabilityMeasuresPackageDefault.R: -------------------------------------------------------------------------------- 1 | #' Calculate default hyperparameter setting 2 | #' @param surrogates Surrogate models 3 | #' @param def Package defaults 4 | calculatePackageDefaultPerformance = function(surrogates, def, tbl.metaFeatures, tbl.results) { 5 | surr = surrogates$surrogates 6 | preds = numeric(length(surr)) 7 | for(i in seq_along(surr)) { 8 | print(paste("surrogate predict: task", i, "of", length(surr))) 9 | default = convertPackageDefault(def, surr[[i]], tbl.metaFeatures, tbl.results) 10 | preds[i] = predict(surr[[i]], newdata = default)$data$response 11 | } 12 | # Best default 13 | 14 | list(default = default, result = preds) 15 | } 16 | 17 | convertPackageDefault = function(def, surr, tbl.metaFeatures, tbl.results) { 18 | data_idi = surr$task.desc$id 19 | 20 | matching_task_data = unique(tbl.results[, c("data_id")]) 21 | n_feats = filter(tbl.metaFeatures, quality == "NumberOfFeatures") %>% 22 | select(., -quality) #%>% 23 | #inner_join(., matching_task_data, by = "data_id") 24 | p = as.numeric(filter(n_feats, data_id == data_idi)$value) 25 | 26 | if ("mtry" %in% names(def)) { 27 | def$mtry = floor(sqrt(p))/p 28 | 
} 29 | if ("gamma" %in% names(def)) { 30 | def$gamma = 1/p 31 | } 32 | def 33 | } 34 | 35 | #' Calculate optimal hyperparameter values for an algorithm 36 | #' @param surrogate Surrogate models 37 | #' @param hyperpar Number of hyperparameters that should be evaluated at once; Possible options: one, two and all 38 | calculateDatasetOptimumPackageDefault = function(surrogates, default, hyperpar = "one", n.points = 10000, tbl.metaFeatures, tbl.results) { 39 | surr = surrogates$surrogates 40 | param.set = surrogates$param.set 41 | 42 | if (hyperpar == "one") { 43 | result = matrix(NA, length(surr), length(param.set$pars)) 44 | # only do this for parameters that makes sense changing them 45 | for(i in seq_along(param.set$pars)) { 46 | print(names(param.set$pars)[i]) 47 | rnd.points1 = generateRandomDesignWithDefaults(n.points, param.set, trafo = TRUE, default, subset = names(param.set$pars)[i]) 48 | # deleteNAs 49 | rnd.points1 = deleteNA(rnd.points1) 50 | 51 | # Prediction 52 | preds = matrix(NA, nrow(rnd.points1), length(surr)) 53 | 54 | for(j in seq_along(surr)) { 55 | if (!(names(param.set$pars)[i] %in% c("mtry", "gamma"))) { 56 | rnd.points1 = convertPackageDefault(rnd.points1, surr[[j]], tbl.metaFeatures, tbl.results) 57 | } 58 | preds[, j] = predict(surr[[j]], newdata = rnd.points1)$data$response 59 | } 60 | # Best default 61 | # rnd.points1[apply(preds, 2, which.max),] 62 | result[, i] = diag(preds[apply(preds, 2, which.max), ]) 63 | } 64 | result = data.frame(result) 65 | colnames(result) = names(param.set$pars) 66 | return(list(optimum = result)) 67 | } 68 | if (hyperpar == "two") { 69 | result = array(NA, dim = c(length(surr), length(param.set$pars), length(param.set$pars))) 70 | 71 | for(i in seq_along(param.set$pars)[-length(param.set$pars)]) { 72 | for(j in seq_along(param.set$pars)[(i+1):length(param.set$pars)]) { 73 | print(c(names(param.set$pars)[i], names(param.set$pars)[j])) 74 | rnd.points1 = generateRandomDesignWithDefaults(n.points, param.set, trafo = TRUE, default, subset = names(param.set$pars)[c(i,j)]) 75 | rnd.points1 = deleteNA(rnd.points1) 76 | 77 | # Prediction 78 | preds = matrix(NA, nrow(rnd.points1), length(surr)) 79 | for(k in seq_along(surr)) { 80 | if (!any(names(param.set$pars)[c(i,j)] %in% c("mtry", "gamma"))) { 81 | rnd.points1 = convertPackageDefault(rnd.points1, surr[[k]], tbl.metaFeatures, tbl.results) 82 | } 83 | preds[, k] = predict(surr[[k]], newdata = rnd.points1)$data$response 84 | } 85 | # Best default 86 | # rnd.points1[apply(preds, 2, which.max),] 87 | result[, i, j] = diag(preds[apply(preds, 2, which.max), ]) 88 | } 89 | } 90 | return(list(optimum = result)) 91 | } 92 | } -------------------------------------------------------------------------------- /R/calculateTuningSpace.R: -------------------------------------------------------------------------------- 1 | #' Calculate default hyperparameter space for tuning 2 | #' @param surrogate Surrogate models 3 | calculateTuningSpace = function(optimum, quant) { 4 | space = data.frame(row.names = c(quant, 1-quant)) 5 | space2 = list() 6 | par.sets = optimum$par.sets 7 | for(i in 1:ncol(par.sets)) { 8 | if(is.numeric(par.sets[,i])) { 9 | par.sets[par.sets[,i]==-11,i] = NA 10 | space = cbind(space, quantile(par.sets[,i], c(quant, 1-quant), na.rm = TRUE)) 11 | colnames(space)[ncol(space)] = names(par.sets)[i] 12 | } 13 | if(is.factor(par.sets[,i]) | is.logical(par.sets[,i])) { 14 | logic = table(par.sets[,i]) / length(par.sets[,i]) > quant 15 | space2 = c(space2, list(names(table(par.sets[,i]))[logic])) 
16 | names(space2)[length(space2)] = names(par.sets)[i] 17 | } 18 | } 19 | return(list(numerics = space, factors = space2)) 20 | } 21 | -------------------------------------------------------------------------------- /R/compareSurrogateModels.R: -------------------------------------------------------------------------------- 1 | #' Compare different surrogate models 2 | #' @param measure.name Name of the measure to optimize 3 | #' @param learner.name Name of learner 4 | #' @param data.ids [\code{numeric}] ids of the dataset 5 | #' @param lrn.par.set learner-parameter set which should include relevant bounds for flow 6 | #' @param tbl.results df with getMlrRandomBotResults() 7 | #' @param tbl.hypPars df with getMlrRandomBotHyperpars() 8 | #' @param tbl.metaFeatures df with getMlrRandomBotHyperpars() 9 | #' @param surrogate.mlr.lrns list of mlr learners that should be compared 10 | #' @param min.experiments minimum number of experiments that should be available for a dataset, otherwise the dataset is excluded 11 | #' @return surrogate model 12 | compareSurrogateModels = function(measure.name, learner.name, data.ids, tbl.results, 13 | tbl.metaFeatures, tbl.hypPars, lrn.par.set, surrogate.mlr.lrns) { 14 | 15 | param.set = lrn.par.set[[which(names(lrn.par.set) == paste0(substr(learner.name, 5, 100), ".set"))]]$param.set 16 | #train mlr model on full table for measure 17 | task.data = makeBotTable(measure.name, learner.name, tbl.results, tbl.metaFeatures, tbl.hypPars, param.set, data.ids) 18 | task.data = data.frame(task.data) 19 | task.data = deleteNA(task.data) 20 | 21 | # get specific data ids 22 | if(!is.null(data.ids)) { 23 | uni = unique(task.data$data_id) 24 | task.ids = uni[uni %in% data.ids] 25 | } else { 26 | task.ids = unique(task.data$data_id) 27 | } 28 | 29 | mlr.tasks = list() 30 | for(i in seq_along(data.ids)) { 31 | data.idi = data.ids[i] 32 | data = subset(task.data, data_id == data.ids[i], select = c("measure.value", names(param.set$pars))) 33 | # Rename column names because of weird "sample" behaviour of cubist 34 | colnames(data) = gsub("sample", "ampel", colnames(data)) 35 | mlr.tasks[[i]] = makeRegrTask(id = as.character(data.idi), data, target = "measure.value") 36 | } 37 | mlr.lrns = surrogate.mlr.lrns 38 | measures = list(mse, rsq, kendalltau, spearmanrho) 39 | rdesc = makeResampleDesc("RepCV", reps = 10, folds = 10) 40 | mlr.benchmark = benchmark(mlr.lrns, mlr.tasks, resamplings = rdesc, keep.pred = FALSE, models = FALSE, measures = measures) 41 | 42 | return(mlr.benchmark) 43 | } -------------------------------------------------------------------------------- /R/helpers.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | getSimpleLearners = function(){ 4 | # Simple learner param set 5 | simple.lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.glmnet", predict.type = "prob"), 6 | param.set = makeParamSet( 7 | makeNumericParam("alpha", lower = 0, upper = 1, default = 1), 8 | makeNumericVectorParam("lambda", len = 1L, lower = -10, upper = 10, default = 0 ,trafo = function(x) 2^x))) 9 | 10 | simple.lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.rpart", predict.type = "prob"), 11 | param.set = makeParamSet( 12 | makeNumericParam("cp", lower = 0, upper = 1, default = 0.01), 13 | makeIntegerParam("maxdepth", lower = 1, upper = 30, default = 30), 14 | makeIntegerParam("minbucket", lower = 1, upper = 60, default = 1), 15 | makeIntegerParam("minsplit", lower = 1, upper = 60, default = 20)), 16 | lrn.ps.sets = 
simple.lrn.par.set) 17 | 18 | return(simple.lrn.par.set) 19 | } 20 | 21 | getMultipleLearners = function(){ 22 | simple.lrn.par.set = getSimpleLearners() 23 | 24 | # increase to a general param set 25 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.kknn", predict.type = "prob"), 26 | param.set = makeParamSet( 27 | makeIntegerParam("k", lower = 1, upper = 30)), 28 | lrn.ps.sets = simple.lrn.par.set) 29 | 30 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.svm", predict.type = "prob"), 31 | param.set = makeParamSet( 32 | makeDiscreteParam("kernel", values = c("linear", "polynomial", "radial")), 33 | makeNumericParam("cost", lower = -10, upper = 10, trafo = function(x) 2^x), 34 | makeNumericParam("gamma", lower = -10, upper = 10, trafo = function(x) 2^x, requires = quote(kernel == "radial")), 35 | makeIntegerParam("degree", lower = 2, upper = 5, requires = quote(kernel == "polynomial"))), 36 | lrn.ps.sets = lrn.par.set) 37 | 38 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.ranger", predict.type = "prob"), 39 | param.set = makeParamSet( 40 | makeIntegerParam("num.trees", lower = 1, upper = 2000), 41 | makeLogicalParam("replace"), 42 | makeNumericParam("sample.fraction", lower = 0.1, upper = 1), 43 | makeNumericParam("mtry", lower = 0, upper = 1), 44 | makeLogicalParam(id = "respect.unordered.factors"), 45 | makeNumericParam("min.node.size", lower = 0, upper = 1)), 46 | lrn.ps.sets = lrn.par.set) 47 | 48 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.xgboost", predict.type = "prob"), 49 | param.set = makeParamSet( 50 | makeIntegerParam("nrounds", lower = 1, upper = 5000), 51 | makeNumericParam("eta", lower = -10, upper = 0, trafo = function(x) 2^x), 52 | makeNumericParam("subsample",lower = 0.1, upper = 1), 53 | makeDiscreteParam("booster", values = c("gbtree", "gblinear")), 54 | makeIntegerParam("max_depth", lower = 1, upper = 15, requires = quote(booster == "gbtree")), 55 | makeNumericParam("min_child_weight", lower = 0, upper = 7, requires = quote(booster == "gbtree"), trafo = function(x) 2^x), 56 | makeNumericParam("colsample_bytree", lower = 0, upper = 1, requires = quote(booster == "gbtree")), 57 | makeNumericParam("colsample_bylevel", lower = 0, upper = 1, requires = quote(booster == "gbtree")), 58 | makeNumericParam("lambda", lower = -10, upper = 10, trafo = function(x) 2^x), 59 | makeNumericParam("alpha", lower = -10, upper = 10, trafo = function(x) 2^x)), 60 | lrn.ps.sets = lrn.par.set) 61 | 62 | return(lrn.par.set) 63 | } 64 | 65 | makeLrnPsSets = function(learner, param.set, lrn.ps.sets = NULL, 66 | id = paste0(learner$id, ".set"), overwrite = FALSE) { 67 | 68 | assertClass(learner, "Learner") 69 | assertClass(param.set, "ParamSet") 70 | par.match = names(param.set$pars) %in% names(learner$par.set$pars) 71 | if(all(par.match)){ 72 | ls = list(learner = learner, param.set = param.set) 73 | } else { 74 | stop(paste("The following parameters in param.set are not included in learner:", 75 | paste(names(param.set$pars[par.match == FALSE]), collapse = ", "))) 76 | } 77 | 78 | if(is.null(lrn.ps.sets)){ 79 | lrn.ps.sets = list() 80 | lrn.ps.sets[[id]] = ls 81 | attr(lrn.ps.sets, "class") = "LrnPsSet" 82 | } else { 83 | assertClass(lrn.ps.sets, "LrnPsSet") 84 | 85 | if(id %in% names(lrn.ps.sets) & overwrite == FALSE){ 86 | stop("tune.pair already contains id: \"", id, "\". 
Please specify a new id or set overwrite = TRUE.") 87 | } else { 88 | lrn.ps.sets[[id]] = ls 89 | } 90 | } 91 | 92 | return(lrn.ps.sets) 93 | } 94 | 95 | calculateTunability = function(default, optimumHyperpar, optimumTwoHyperpar = NULL) { 96 | optimumHyperpar$optimum - default$result 97 | } 98 | 99 | lrn.par.set = getMultipleLearners() 100 | -------------------------------------------------------------------------------- /R/makeSurrogateModels.R: -------------------------------------------------------------------------------- 1 | #' Create surrogate models for different tasks 2 | #' @param measure.name Name of the measure to optimize 3 | #' @param learner.name Name of learner 4 | #' @param data.ids [\code{numeric}] ids of the dataset 5 | #' @param lrn.par.set learner-parameter set which should include relevant bounds for flow 6 | #' @param tbl.results df with getMlrRandomBotResults() 7 | #' @param tbl.hypPars df with getMlrRandomBotHyperpars() 8 | #' @param tbl.metaFeatures df with getMlrRandomBotHyperpars() 9 | #' @param min.experiments minimum number of experiments that should be available for a dataset, otherwise the dataset is excluded 10 | #' @return surrogate model 11 | makeSurrogateModels = function(measure.name, learner.name, data.ids, tbl.results, 12 | tbl.metaFeatures, tbl.hypPars, lrn.par.set, surrogate.mlr.lrn) { 13 | 14 | param.set = lrn.par.set[[which(names(lrn.par.set) == paste0(substr(learner.name, 5, 100), ".set"))]]$param.set 15 | #train mlr model on full table for measure 16 | task.data = makeBotTable(measure.name, learner.name, tbl.results, tbl.metaFeatures, tbl.hypPars, param.set, data.ids) 17 | task.data = data.frame(task.data) 18 | task.data = deleteNA(task.data) 19 | 20 | # get specific task ids 21 | if(!is.null(data.ids)) { 22 | uni = unique(task.data$data_id) 23 | data.ids = sort(uni[uni %in% data.ids]) 24 | } else { 25 | data.ids = sort(unique(task.data$data_id)) 26 | } 27 | 28 | mlr.mod.measure = list() 29 | for(i in seq_along(data.ids)) { 30 | print(paste("surrogate train: task", i, "of", length(data.ids))) 31 | data.idi = data.ids[i] 32 | 33 | mlr.task.measure = makeRegrTask(id = as.character(data.idi), subset(task.data, data_id == data.idi, select = c("measure.value", names(param.set$pars))), target = "measure.value") 34 | mlr.lrn = surrogate.mlr.lrn 35 | mlr.mod.measure[[i]] = train(mlr.lrn, mlr.task.measure) 36 | gc() 37 | } 38 | return(list(surrogates = mlr.mod.measure, param.set = param.set)) 39 | } 40 | 41 | 42 | #' Merge results, hyperpars and features tables and prepare for mlr.task input 43 | #' @param measure.name.filter What measure to analyse 44 | #' @param learner.name What learner to analyse 45 | #' @param tbl.results df with getMlrRandomBotResults() 46 | #' @param tbl.hypPars df with getMlrRandomBotHyperpars() 47 | #' @param tbl.metaFeatures df with getMlrRandomBotHyperpars() 48 | #' @return [\code{data.frame}] Complete table used for creating the surrogate model 49 | makeBotTable = function(measure.name, learner.name, tbl.results, tbl.metaFeatures, tbl.hypPars, param.set, data.ids) { 50 | 51 | tbl.hypPars.learner = tbl.hypPars[tbl.hypPars$fullName == learner.name, ] 52 | tbl.hypPars.learner = spread(tbl.hypPars.learner, name, value) 53 | tbl.hypPars.learner = data.frame(tbl.hypPars.learner) 54 | # Convert the columns to the specific classes 55 | params = getParamIds(param.set) 56 | param_types = getParamTypes(param.set) 57 | for(i in seq_along(params)) 58 | tbl.hypPars.learner[, params[i]] = conversion_function(tbl.hypPars.learner[, 
params[i]], param_types[i]) 59 | 60 | bot.table = inner_join(tbl.results, tbl.hypPars.learner, by = "setup") %>% 61 | select(., -run_id, -setup, -fullName) 62 | 63 | # Scale mtry and min.node.size in random forest 64 | if(learner.name == "mlr.classif.ranger"){ 65 | n_feats = filter(tbl.metaFeatures, quality == "NumberOfFeatures") %>% 66 | select(., -quality) 67 | n_feats$value = as.numeric(n_feats$value) 68 | bot.table = inner_join(bot.table, n_feats, by = "data_id") 69 | bot.table$mtry = bot.table$mtry/bot.table$value 70 | bot.table = bot.table %>% select(., -value) 71 | 72 | n_inst = filter(tbl.metaFeatures, quality == "NumberOfInstances") %>% 73 | select(., -quality) 74 | n_inst$value = as.numeric(n_inst$value) 75 | bot.table = inner_join(bot.table, n_inst, by = "data_id") 76 | bot.table$min.node.size = log(bot.table$min.node.size, 2) / log(bot.table$value, 2) 77 | bot.table = bot.table %>% select(., -value) 78 | } 79 | 80 | bot.table = bot.table %>% select(., -task_id) 81 | colnames(bot.table)[colnames(bot.table) == measure.name] = "measure.value" 82 | bot.table$measure.value = as.numeric(bot.table$measure.value) 83 | 84 | # select only runs on the specific data.ids 85 | bot.table = subset(bot.table, data_id %in% data.ids) 86 | 87 | return(bot.table) 88 | } 89 | 90 | 91 | conversion_function = function(x, param_type) { 92 | if(param_type %in% c("integer", "numeric", "numericvector")) 93 | x = as.numeric(x) 94 | if(param_type %in% c("character", "logical", "factor", "discrete")) 95 | x = as.factor(x) 96 | return(x) 97 | } 98 | 99 | #' Get relevant datasets 100 | #' 101 | #' @param tbl.results 102 | #' @param tbl.hypPars 103 | #' @param min.experiments 104 | calculateDataIds = function(tbl.results, tbl.hypPars, min.experiments = 200) { 105 | whole.table = inner_join(tbl.results, tbl.hypPars, by = "setup") %>% select(., data_id, fullName) 106 | cross.table = table(whole.table$data_id, whole.table$fullName) 107 | bigger = rowSums(cross.table > min.experiments) 108 | data.ids = names(bigger)[bigger == 5] 109 | return(data.ids) 110 | } 111 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tunability 2 | 3 | This repository was used for the calculation of tunability measures and tuning spaces as described in the paper: 4 | 5 | [Tunability: Importance of Hyperparameters of Machine Learning Algorithms](http://www.jmlr.org/papers/v20/18-444.html) 6 | 7 | For the calculation just run the [main.R](https://github.com/PhilippPro/tunability/blob/master/main.R) file. 8 | The calculation can take several days to finish. 
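For a quick orientation, the sketch below condenses the first steps of [main.R](https://github.com/PhilippPro/tunability/blob/master/main.R) into a minimal example. It is only an illustration: the measure `"auc"` and the learner `"mlr.classif.ranger"` are picked here purely as examples, the plain `makeLearner("regr.ranger")` surrogate mirrors the choice in `main.R`, and all function names come from the scripts in `R/`; `main.R` remains the authoritative script.

```r
library(devtools)
load_all()                            # load the package functions in R/
lrn.par.set = getMultipleLearners()   # learners and their hyperparameter spaces

# Bot results from figshare (provides tbl.results, tbl.hypPars, tbl.metaFeatures)
load(url("https://ndownloader.figshare.com/files/10811309"))
data.ids = calculateDataIds(tbl.results, tbl.hypPars, min.experiments = 200)

# Surrogate models for one measure/learner combination (here: AUC and ranger)
surrogates = makeSurrogateModels("auc", "mlr.classif.ranger", data.ids,
  tbl.results, tbl.metaFeatures, tbl.hypPars, lrn.par.set, makeLearner("regr.ranger"))

default = calculateDefault(surrogates)                                    # data-based defaults
optimum = calculateDatasetOptimum(surrogates, default, hyperpar = "all")  # per-dataset optima
mean(calculateTunability(default, optimum))                               # overall tunability
```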
9 | -------------------------------------------------------------------------------- /main.R: -------------------------------------------------------------------------------- 1 | library(checkpoint) 2 | checkpoint("2018-07-01") 3 | 4 | library(devtools) 5 | library(OpenML) 6 | library(batchtools) 7 | #OMLbots_path = "/home/probst/Paper/Exploration_of_Hyperparameters/OMLbots" 8 | #OMLbots_path = "C:/Promotion/Hyperparameters/OMLbots" 9 | #load_all(OMLbots_path) 10 | load_all() 11 | lrn.par.set = getMultipleLearners() 12 | 13 | # Get file from the figshare repository 14 | load(url("https://ndownloader.figshare.com/files/10811309")) 15 | 16 | # From wide format to long 17 | #a = read.csv(url("https://ndownloader.figshare.com/files/10462300")) 18 | 19 | #a = read.csv(url("https://ndownloader.figshare.com/files/10811312")) 20 | #library(xtable) 21 | #head(a) 22 | #table(a$data_id) 23 | 24 | ################################ Restrict data to 500000 results for each algorithm 25 | data.ids = calculateDataIds(tbl.results, tbl.hypPars, min.experiments = 200) 26 | # Only results for OpenML100 datasets 27 | #tasks = listOMLTasks(number.of.classes = 2L, tag = "OpenML100", estimation.procedure = "10-fold Crossvalidation", number.of.missing.values = 0) 28 | #data.ids = data.ids[data.ids %in% tasks$data.id] 29 | 30 | # Change the sign for the brier score to get the correct results 31 | tbl.results$brier = -tbl.results$brier 32 | 33 | library(stringi) 34 | learner.names = paste0("mlr.", names(lrn.par.set)) 35 | learner.names = stri_sub(learner.names, 1, -5) 36 | measures = c("auc", "accuracy", "brier") 37 | measure = c("auc") 38 | 39 | ################################ Compare different surrogate models (complete) 40 | 41 | # only models which do not have to be tuned! 
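# (The surrogate learners listed below are used with their package defaults and compared via
#  compareSurrogateModels(), which benchmarks them per dataset with 10 times repeated 10-fold CV
#  on mse, rsq, kendalltau and spearmanrho.)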
42 | surrogate.mlr.lrns = list( 43 | makeLearner("regr.lm"), 44 | makeLearner("regr.rpart"), 45 | makeLearner("regr.kknn"), 46 | makeLearner("regr.ranger"), 47 | # makeLearner("regr.ranger", par.vals = list(num.trees = 2000, respect.unordered.factors = "order")), 48 | makeLearner("regr.cubist") 49 | #makeLearner("regr.xgboost", par.vals = list(nrounds = 300, eta = 0.03, max_depth = 2, nthread = 1)), 50 | #makeLearner("regr.svm"), 51 | #makeLearner("regr.bartMachine"), 52 | #makeLearner("regr.glmnet"), 53 | #makeLearner("regr.brnn"), # too many errors 54 | #makeLearner("regr.km") 55 | ) 56 | 57 | k = 2 58 | i = 6 59 | bmr = list() 60 | 61 | load(paste0("bmr_", measures[k], ".RData")) 62 | 63 | for(k in 1:3) { 64 | configureMlr(show.info = TRUE, on.learner.error = "warn", on.learner.warning = "warn", on.error.dump = TRUE) 65 | library("parallelMap") 66 | parallelStartSocket(5) 67 | for (i in 1:6) { 68 | print(i) 69 | set.seed(521 + i) 70 | # task.id 146085, 14966 does not work for svm 71 | bmr[[i]] = compareSurrogateModels(measure.name = measures[k], learner.name = learner.names[i], 72 | data.ids = data.ids, tbl.results, tbl.metaFeatures, tbl.hypPars, lrn.par.set, surrogate.mlr.lrns) 73 | gc() 74 | save(bmr, file = paste0("bmr_", measures[k], ".RData")) 75 | } 76 | parallelStop() 77 | names(bmr) = learner.names 78 | 79 | for(i in seq_along(bmr)) { 80 | print(i) 81 | rmat = convertBMRToRankMatrix(bmr[[i]]) 82 | print(rmat) 83 | print(plotBMRSummary(bmr[[i]], measure = kendalltau)) 84 | print(plotBMRBoxplots(bmr[[i]], style = "violin")) 85 | print(plotBMRRanksAsBarChart(bmr[[i]], pos = "stack")) 86 | } 87 | bmr_surrogate = bmr 88 | 89 | # replace NA results of lm/kknn, rsq 90 | for(i in seq_along(data.ids)) { 91 | for(j in seq_along(learner.names)) { 92 | for(l in seq_along(surrogate.mlr.lrns)) { 93 | rsq = bmr_surrogate[[j]]$results[[i]][[l]]$measures.test$rsq 94 | bmr_surrogate[[j]]$results[[i]][[l]]$aggr[2] = mean(rsq[rsq>0], na.rm = T) 95 | bmr_surrogate$mlr.classif.kknn$results[[i]][[l]]$aggr[3] = 96 | mean(bmr_surrogate$mlr.classif.kknn$results[[i]][[l]]$measures.test$kendalltau, na.rm = T) 97 | bmr_surrogate$mlr.classif.kknn$results[[i]][[l]]$aggr[4] = 98 | mean(bmr_surrogate$mlr.classif.kknn$results[[i]][[l]]$measures.test$spearmanrho, na.rm = T) 99 | } 100 | } 101 | } 102 | 103 | # Save results 104 | save(bmr_surrogate, file = paste0("results_", measures[k], ".RData")) 105 | 106 | 107 | # Best model in general: ranger, cubist 108 | 109 | ################################# Calculate tunability measures 110 | surrogate.mlr.lrn = makeLearner("regr.ranger", par.vals = list(num.threads = 4)) 111 | #surrogate.mlr.lrn = makeLearner("regr.ranger", par.vals = list(num.trees = 2000, respect.unordered.factors = "order", num.threads = 4)) 112 | #surrogate.mlr.lrn = makeLearner("regr.cubist") 113 | 114 | results = list() 115 | 116 | for(i in seq_along(learner.names)) { 117 | print(i) 118 | set.seed(199 + i) 119 | # Surrogate model calculation 120 | surrogates = makeSurrogateModels(measure.name = measures[k], learner.name = learner.names[i], 121 | data.ids = data.ids, tbl.results, tbl.metaFeatures, tbl.hypPars, lrn.par.set, surrogate.mlr.lrn) 122 | save(surrogates, file = paste0("surrogates_", measures[k], "_", i, ".RData")) 123 | } 124 | 125 | for(i in seq_along(learner.names)) { 126 | print(i) 127 | set.seed(199 + i) 128 | load(paste0("surrogates_", measures[k], "_", i, ".RData")) 129 | # Default calculation 130 | default = calculateDefault(surrogates) 131 | # Tunability overall 132 | optimum = 
calculateDatasetOptimum(surrogates, default, hyperpar = "all", n.points = 100000) 133 | # Tunability hyperparameter specific 134 | optimumHyperpar = calculateDatasetOptimum(surrogates, default, hyperpar = "one", n.points = 100000) 135 | # Tunability for two hyperparameters 136 | optimumTwoHyperpar = calculateDatasetOptimum(surrogates, default, hyperpar = "two", n.points = 10000) 137 | # Tuning space 138 | tuningSpace = calculateTuningSpace(optimum, quant = 0.05) 139 | 140 | results[[i]] = list(default = default, optimum = optimum, optimumHyperpar = optimumHyperpar, 141 | optimumTwoHyperpar = optimumTwoHyperpar, tuningSpace = tuningSpace) 142 | gc() 143 | save(bmr_surrogate, results, file = paste0("results_", measures[k], ".RData")) 144 | } 145 | names(results) = learner.names 146 | 147 | # Calculations 148 | default = results$mlr.classif.xgboost$default 149 | optimum = results$mlr.classif.xgboost$optimum 150 | optimumHyperpar = results$mlr.classif.xgboost$optimumHyperpar 151 | overallTunability = calculateTunability(default, optimum) 152 | mean(overallTunability) 153 | tunability = calculateTunability(default, optimumHyperpar) 154 | data.frame(t(colMeans(tunability))) 155 | # scaled 156 | data.frame(t(colMeans(tunability/overallTunability, na.rm = T))) 157 | 158 | default$default[is.numeric(default$default)] = default$default[,is.numeric(default$default)] 159 | 160 | def = default$default 161 | 162 | for(i in 1:length(def)) { 163 | if(is.numeric(def[[i]])) 164 | def[[i]] = round(def[[i]], 3) 165 | } 166 | 167 | # Interaction 168 | # Bare values 169 | tab = colMeans(results$mlr.classif.xgboost$optimumTwoHyperpar$optimum, dims = 1, na.rm = TRUE) - 170 | mean(results$mlr.classif.xgboost$default$result) 171 | diag(tab) = colMeans(tunability) 172 | colnames(tab) = rownames(tab) = names(tunability) 173 | tab 174 | # Interaction 175 | colMeans(results$mlr.classif.xgboost$optimumTwoHyperpar$optimum, dims = 1, na.rm = TRUE) - 176 | mean(results$mlr.classif.xgboost$default$result) - 177 | outer(colMeans(tunability), colMeans(tunability), '+') 178 | # Performance gain 179 | colMeans(results$mlr.classif.xgboost$optimumTwoHyperpar$optimum, dims = 1, na.rm = TRUE) - 180 | mean(results$mlr.classif.xgboost$default$result) - 181 | outer(colMeans(tunability), colMeans(tunability), pmax) 182 | 183 | # Package defaults 184 | package.defaults = list( 185 | glmnet = data.frame(alpha = 1, lambda = 0), # no regularization 186 | rpart = data.frame(cp = 0.01, maxdepth = 30, minbucket = 7, minsplit = 20), 187 | kknn = data.frame(k = 7), 188 | svm = data.frame(kernel = "radial", cost = 1, gamma = 1, degree = 3), 189 | ranger = data.frame(num.trees = 500, replace = TRUE, sample.fraction = 1, mtry = 0.1, respect.unordered.factors = FALSE, min.node.size = 0), 190 | xgboost = data.frame(nrounds = 500, eta = 0.3, subsample = 1, booster = "gbtree", max_depth = 6, min_child_weight = 1, 191 | colsample_bytree = 1, colsample_bylevel = 1, lambda = 1, alpha = 1) 192 | ) 193 | 194 | # Parameters dependent on data characteristics: svm: gamma, ranger: mtry. 
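# (These are rescaled per dataset in convertPackageDefault(): with p the number of features,
#  gamma = 1/p and mtry = floor(sqrt(p))/p, since mtry is encoded as a fraction of the
#  features in makeBotTable().)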
195 | # Not Specified: glmnet: alpha, xgboost: nrounds 196 | resultsPackageDefaults = list() 197 | 198 | for(i in seq_along(learner.names)) { 199 | print(i) 200 | set.seed(199 + i) 201 | load(paste0("surrogates_", measures[k], "_", i, ".RData")) 202 | 203 | def = package.defaults[[i]] 204 | default = calculatePackageDefaultPerformance(surrogates, def, tbl.metaFeatures, tbl.results) 205 | optimumHyperpar = calculateDatasetOptimumPackageDefault(surrogates, default, hyperpar = "one", n.points = 100000, tbl.metaFeatures, tbl.results) 206 | optimumTwoHyperpar = calculateDatasetOptimumPackageDefault(surrogates, default, hyperpar = "two", n.points = 10000, tbl.metaFeatures, tbl.results) 207 | resultsPackageDefaults[[i]] = list(default = default, optimumHyperpar = optimumHyperpar, optimumTwoHyperpar = optimumTwoHyperpar) 208 | save(bmr_surrogate, results, resultsPackageDefaults, file = paste0("results_", measures[k], ".RData")) 209 | } 210 | names(resultsPackageDefaults) = learner.names 211 | 212 | resultsPackageDefaults$mlr.classif.svm$default$default$gamma = "1/p" 213 | resultsPackageDefaults$mlr.classif.ranger$default$default$mtry = "sqrt(p)" 214 | resultsPackageDefaults$mlr.classif.ranger$default$default$min.node.size = "1" 215 | 216 | save(bmr_surrogate, results, resultsPackageDefaults, file = paste0("results_", measures[k], ".RData")) 217 | 218 | # Calculations 219 | default = resultsPackageDefaults$mlr.classif.ranger$default 220 | optimum = results$mlr.classif.ranger$optimum 221 | optimumHyperpar = resultsPackageDefaults$mlr.classif.ranger$optimumHyperpar 222 | overallTunability = calculateTunability(default, optimum) 223 | mean(overallTunability) 224 | 225 | tunability = calculateTunability(default, optimumHyperpar) 226 | 227 | data.frame(t(colMeans(tunability))) 228 | # scaled 229 | data.frame(t(colMeans(tunability/overallTunability, na.rm = T))) 230 | 231 | # KI for tunability 232 | y = overallTunability 233 | hist(y) 234 | qqnorm(y) 235 | qqline(y) 236 | 237 | t_value = qt(0.975, length(y) - 1) 238 | mean(y) + c(-t_value, t_value) * sd(y) / sqrt(length(y)) 239 | 240 | # Tunability of the "algorithm"; overfitting problem! 241 | the_order = order(results[[5]]$default$result) 242 | plot(results$mlr.classif.glmnet$default$result[the_order], type = "l", ylab = "AUC") 243 | avg_results = numeric(6) 244 | best_results = best_results_default = numeric(length(results$mlr.classif.glmnet$default$result)) 245 | 246 | for(i in seq_along(learner.names)) { 247 | lines(results[[i]]$default$result[the_order], col = i) 248 | avg_results[i] = mean(results[[i]]$default$result) 249 | for(j in 1:length(results[[i]]$default$result)) { 250 | best_results_default[j] = ifelse(results[[i]]$default$result[j] > best_results[j], results[[i]]$default$result[j], best_results[j]) 251 | best_results[j] = ifelse(results[[i]]$optimum$optimum[j] > best_results[j], results[[i]]$optimum$optimum[j], best_results[j]) 252 | } 253 | } 254 | legend("topleft", legend = substr(learner.names, 13, 100), col = 1:6, lty = 1) 255 | 256 | round(best_results - (results[[5]]$default$result), 3) 257 | mean(best_results_default - (results[[5]]$default$result)) 258 | mean(best_results - (results[[5]]$default$result)) 259 | 260 | mean((results[[5]]$default$result) - (results[[6]]$default$result)) 261 | # maybe overfitting! 
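# (The cross-validation block below checks this: optimal defaults are recomputed on 4/5 of the
#  datasets and their performance and tunability are then evaluated on the held-out fifth, so the
#  defaults are not assessed on the datasets they were derived from.)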
262 | 263 | # Make Crossvalidation to test if there is overfitting 264 | results_cv = list() 265 | for(i in 1:6) { 266 | print(i) 267 | set.seed(3000 + i) 268 | load(paste0("surrogates_", measures[k], "_", i, ".RData")) 269 | 270 | # CV 271 | n_surr = length(surrogates$surrogates) 272 | shuffle = sample(n_surr) 273 | folds = cut(shuffle, breaks = 5, labels = FALSE) 274 | 275 | default = list() 276 | optimumHyperpar = list() 277 | optimumTwoHyperpar = list() 278 | 279 | for(j in 1:5) { 280 | print(paste(j,i)) 281 | testInd = which(folds == j, arr.ind = TRUE) 282 | trainInd = which(folds != j, arr.ind = TRUE) 283 | 284 | # Default calculation 285 | default1 = calculateDefault(surrogates = list(surrogates = surrogates$surrogates[trainInd], param.set = surrogates$param.set)) 286 | # Calculate performance of these defaults on test datasets 287 | default[[j]] = calculatePerformance(list(surrogates = surrogates$surrogates[testInd], param.set = surrogates$param.set), default1$default) 288 | # Tunability hyperparameter specific 289 | optimumHyperpar[[j]] = calculateDatasetOptimum(surrogates = list(surrogates = surrogates$surrogates[testInd], param.set = surrogates$param.set), default[[j]], hyperpar = "one", n.points = 100000) 290 | # Tunability for two hyperparameters 291 | optimumTwoHyperpar[[j]] = calculateDatasetOptimum(list(surrogates = surrogates$surrogates[testInd], param.set = surrogates$param.set), default[[j]], hyperpar = "two", n.points = 10000) 292 | 293 | results_cv[[i]] = list(default = default, optimumHyperpar = optimumHyperpar, optimumTwoHyperpar = optimumTwoHyperpar) 294 | gc() 295 | } 296 | save(bmr_surrogate, results, resultsPackageDefaults, results_cv, file = paste0("results_", measures[k], ".RData")) 297 | } 298 | names(results_cv) = learner.names 299 | save(bmr_surrogate, results, resultsPackageDefaults, results_cv, lrn.par.set, file = paste0("results_", measures[k], ".RData")) 300 | } 301 | 302 | # overall tunability, cross-validated 303 | for(i in seq_along(learner.names)){ 304 | print(learner.names[i]) 305 | print(mean(calculateTunability(results[[i]]$default, results[[i]]$optimum))) 306 | print(mean(results[[i]]$optimum$optimum - unlist(sapply(results_cv[[i]]$default, "[[", 2)))) 307 | } 308 | for(i in seq_along(learner.names)){ 309 | print(learner.names[i]) 310 | print(rbind(colMeans(calculateTunability(results[[i]]$default, results[[i]]$optimumHyperpar)), 311 | colMeans(do.call(rbind, unlist(results_cv[[i]]$optimumHyperpar, recursive=FALSE)) - unlist(sapply(results_cv[[i]]$default, "[[", 2))))) 312 | } 313 | 314 | # Save results for shiny 315 | 316 | results_auc = NULL 317 | names = load("results_auc.RData") 318 | for(i in seq_along(names)) 319 | results_auc[[i]] = get(names[i]) 320 | names(results_auc) = names 321 | results_accuracy = NULL 322 | names = load("results_accuracy.RData") 323 | for(i in seq_along(names)) 324 | results_accuracy[[i]] = get(names[i]) 325 | names(results_accuracy) = names 326 | results_brier = NULL 327 | names = load("results_brier.RData") 328 | for(i in seq_along(names)) 329 | results_brier[[i]] = get(names[i]) 330 | names(results_brier) = names 331 | 332 | results_all = list(auc = results_auc, accuracy = results_accuracy, brier = results_brier) 333 | save(results_all, file = "./shiny/results_all.RData") 334 | 335 | # Annex 336 | 337 | lrn.regr = makeLearner("regr.ksvm") 338 | fit.regr = train(lrn.regr, bh.task) 339 | fa = generateFunctionalANOVAData(fit.regr, bh.task, "lstat", depth = 1, fun = median) 340 | 341 | 342 | # Defaults normalized 
(check if results differ substantially) 343 | 344 | results_normalized = list() 345 | k = 1 346 | for(i in seq_along(learner.names)) { 347 | print(i) 348 | load(paste0("surrogates_", measures[k], "_", i, ".RData")) 349 | # Defaults with normalization 350 | default = calculateDefault(surrogates, normalization = TRUE) 351 | # Tunability hyperparameter specific 352 | optimumHyperpar = calculateDatasetOptimum(surrogates, default, hyperpar = "one", n.points = 100000) 353 | # Tunability for two hyperparameters 354 | #optimumTwoHyperpar = calculateDatasetOptimum(surrogates, default, hyperpar = "two", n.points = 10000) 355 | # Tuning space 356 | results_normalized[[i]] = list(default = default, optimumHyperpar = optimumHyperpar) 357 | gc() 358 | save(results_normalized, file = paste0("./results_normalized_", measures[k], ".RData")) 359 | } 360 | 361 | 362 | for(i in 1:6){ 363 | print(learner.names[i]) 364 | print(rbind(results_normalized[[i]]$default$default, results[[i]]$default$default)) 365 | print("Tunability") 366 | print(mean(calculateTunability(results_normalized[[i]]$default, results[[i]]$optimum))) 367 | print(mean(calculateTunability(results[[i]]$default, results[[i]]$optimum))) 368 | print("Tunability parameter") 369 | print(rbind(colMeans(calculateTunability(results_normalized[[i]]$default, results[[i]]$optimumHyperpar)), 370 | colMeans(calculateTunability(results[[i]]$default, results_normalized[[i]]$optimumHyperpar)))) 371 | print("--------------------------------------------------------------------------------------------------------") 372 | } 373 | 374 | # xtable version 375 | library(xtable) 376 | for(i in 1:6){ 377 | defs = rbind(results_normalized[[i]]$default$default, results[[i]]$default$default) 378 | rownames(defs) = c("norm", "mean") 379 | colnames(defs) = substr(colnames(defs), 1, 5) 380 | print(xtable(defs, caption = paste(learner.names[i], "defaults"), digits = 3)) 381 | tuna = rbind( 382 | c(mean(calculateTunability(results_normalized[[i]]$default, results[[i]]$optimum)), 383 | colMeans(calculateTunability(results_normalized[[i]]$default, results[[i]]$optimumHyperpar))), 384 | c(mean(calculateTunability(results[[i]]$default, results[[i]]$optimum)), 385 | colMeans(calculateTunability(results[[i]]$default, results[[i]]$optimumHyperpar))) 386 | ) 387 | colnames(tuna)[1] = "all" 388 | colnames(tuna) = substr(colnames(tuna), 1, 5) 389 | rownames(tuna) = c("norm", "mean") 390 | print(xtable(tuna, caption = paste(learner.names[i], "tunability"), digits = 3)) 391 | } 392 | 393 | # rpart looks very strange!
394 | -------------------------------------------------------------------------------- /results_accuracy.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PhilippPro/tunability/9fc4fa84f500b1eec446ad50860901c315680a76/results_accuracy.RData -------------------------------------------------------------------------------- /results_auc.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PhilippPro/tunability/9fc4fa84f500b1eec446ad50860901c315680a76/results_auc.RData -------------------------------------------------------------------------------- /results_brier.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PhilippPro/tunability/9fc4fa84f500b1eec446ad50860901c315680a76/results_brier.RData -------------------------------------------------------------------------------- /shiny/app.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(plotly) 3 | # library(tidyr) 4 | library(shiny) 5 | library(shinyjs) 6 | #library(shinydashboard) 7 | # library(shinyBS) 8 | library(data.table) 9 | library(DT) 10 | library(ParamHelpers) 11 | library(mlr) 12 | #library(devtools) 13 | library(checkmate) 14 | library(glmnet) 15 | library(kknn) 16 | library(rpart) 17 | library(e1071) 18 | library(ranger) 19 | library(xgboost) 20 | 21 | server = function(input, output) { 22 | 23 | load("app_data.RData") 24 | source("helpers.R") 25 | 26 | rv <- reactiveValues() 27 | rv$setupComplete <- FALSE 28 | 29 | ## simulate data load 30 | observe({ 31 | if(input$btn_data){ 32 | ## set my condition to TRUE 33 | rv$setupComplete <- TRUE 34 | } 35 | 36 | ## the conditional panel reads this output 37 | output$setupComplete <- reactive({ 38 | return(rv$setupComplete) 39 | }) 40 | outputOptions(output, 'setupComplete', suspendWhenHidden=FALSE) 41 | }) 42 | 43 | 44 | measure.names = names(app_data) 45 | learner.names = names(app_data$auc$results) 46 | 47 | output$measureAll = renderUI({ 48 | selectInput('meas', 'Performance measure', measure.names, selected = measure.names[1], multiple = FALSE) 49 | }) 50 | 51 | output$algorithm = renderUI({ 52 | selectInput('algo', 'Algorithm', learner.names, selected = learner.names[1], multiple = FALSE) 53 | }) 54 | 55 | output$defaultchoice <- renderUI({ 56 | selectInput('defaultchoice', 'Defaults', c("Optimal defaults", "Package defaults"), selected = "Optimal defaults", multiple = FALSE) 57 | }) 58 | 59 | bmrInput = reactive({ 60 | inputi = "glmnet" 61 | if(!is.null(input$algo)) 62 | inputi = input$algo 63 | measur = "auc" 64 | if(!is.null(input$meas)) 65 | measur = input$meas 66 | app_data[[which(measure.names == measur)]]$surrogate[[which(learner.names == inputi)]] 67 | }) 68 | 69 | bmrAggr = reactive({ 70 | perfs = data.table(bmrInput())[, -"task.id"] 71 | # delete datasets with missing results 72 | error.results = which(is.na(perfs$kendalltau.test.mean) | perfs$rsq.test.mean < 0) 73 | error.results = unlist(lapply(unique(floor(error.results/5 - 0.0001)), 74 | function(x) x + seq(0.2, 1, 0.2)))*5 75 | if(length(error.results)!=0) 76 | perfs = perfs[-error.results,] 77 | perfs = data.frame(perfs[, lapply(list(mse = mse.test.mean, rsq = rsq.test.mean, kendalltau = kendalltau.test.mean, 78 | spearmanrho = spearmanrho.test.mean),function(x) mean(x)), by = "surrogate"]) 79 | perfs 80 | }) 81 | 82 | output$logscale = 
renderUI({ 83 | selectInput('logscale', 'Logarithmic scale', c("No", "Yes"), selected = "No", multiple = FALSE) 84 | }) 85 | 86 | output$bmr_measure = renderUI({ 87 | measures = gsub("\\..*","",colnames(bmrInput())[-c(1,2)]) 88 | selectInput('bmr_measure', 'Performance measure', measures, selected = measures[2], multiple = FALSE) 89 | }) 90 | 91 | output$bmr_result = renderTable({ 92 | bmrAggr() 93 | }, digits = 5) 94 | 95 | #df_test = app_data$auc$surrogate$glmnet 96 | #plotBMRSummary 97 | #bmr_measure = gsub("\\..*","",colnames(app_data$auc$surrogate$glmnet)[-c(1,2)]) 98 | #learner_ids = gsub("\\..*","",learner.names) 99 | url <- a("Bot Paper", href="https://arxiv.org/pdf/1806.10961.pdf") 100 | 101 | output$tab <- renderUI({ 102 | tagList("In the plot below the performances of the surrogate models on the different datasets is depicted. 103 | The data_id's correspond to the dataset ids of OpenML, for details see also the", url, ".") 104 | }) 105 | 106 | output$plot1 = renderPlot({ 107 | sel_meas = paste0(input$bmr_measure, ".test.mean") 108 | p = ggplot(bmrInput(), aes_string(x = sel_meas, y = "task.id", col = "surrogate", shape = "surrogate")) 109 | p = p + geom_point(size = 4L, position = position_jitter(width = 0, height = 0.05)) 110 | p = p + scale_shape_manual(values = rep(19, length(learner.names))) 111 | p = p + ylab("Data_id") 112 | p = p + xlab(sel_meas) 113 | if (ifelse(!is.null(input$logscale), input$logscale == "Yes" , TRUE)) { 114 | #if (input$logscale == "Yes") { 115 | p + scale_x_log10() + ggtitle("Performance on datasets") 116 | } else { 117 | p + ggtitle("Performance on datasets") 118 | } 119 | }) 120 | 121 | output$task = renderUI({ 122 | selectInput('taski', 'Task', c("classification", "regression"), selected = "classification", multiple = FALSE) 123 | }) 124 | 125 | 126 | 127 | resultsInput = reactive({ 128 | if (input$defaultchoice == "Optimal defaults") { 129 | app_data[[input$meas]]$results[[input$algo]] 130 | } else { 131 | app_data[[input$meas]]$resultsPackageDefaults[[input$algo]] 132 | } 133 | }) 134 | 135 | output$defaults = renderTable({ 136 | resultsInput()$default$default 137 | }, digits = 3) 138 | 139 | overall = reactive({ 140 | calculateTunability(resultsInput()$default, app_data[[input$meas]]$results[[input$algo]]$optimum) 141 | }) 142 | 143 | tunabilityValues = reactive({ 144 | calculateTunability(resultsInput()$default, resultsInput()$optimumHyperpar) 145 | }) 146 | 147 | tunabilityValuesMean = reactive({ 148 | colMeans(calculateTunability(resultsInput()$default, resultsInput()$optimumHyperpar)) 149 | }) 150 | 151 | output$scaled = renderUI({ 152 | selectInput('scaled', 'Scaled (per Dataset)', c(TRUE, FALSE), selected = FALSE, multiple = FALSE) 153 | }) 154 | 155 | output$overallTunability = renderTable({ 156 | if (input$scaled) { 157 | mean(overall()/overall(), na.rm = TRUE) 158 | } else { 159 | mean(overall()) 160 | } 161 | }, colnames = FALSE, digits = 3) 162 | 163 | output$tunability = renderTable({ 164 | if (input$scaled) { 165 | data.frame(t(colMeans(tunabilityValues()/overall(), na.rm = T))) 166 | } else { 167 | data.frame(t(tunabilityValuesMean())) 168 | } 169 | }, digits = 3) 170 | 171 | output$plot3 = renderPlotly({ 172 | dataf = data.frame(overall(), tunabilityValues()) 173 | colnames(dataf)[1] = "overall" 174 | column.names = colnames(dataf) 175 | dataf = stack(dataf) 176 | dataf$ind = factor(dataf$ind, column.names) 177 | ggplot(dataf, aes(x = ind, y = values)) + geom_boxplot() + coord_cartesian(ylim = c(input$yrange[1],input$yrange[2])) + 
178 | ylab("tunability per dataset") + xlab("hyperparameter") # for the x axis label # + ggtitle(substring(learner.names[i], 13)) 179 | }) 180 | 181 | output$visual = renderUI({ 182 | selectInput('visual', 'Visualization of the tunability', c("Density", "Histogram"), selected = "Density", multiple = FALSE) 183 | }) 184 | 185 | output$visual3 = renderUI({ 186 | selectInput('visual3', 'Hyperparameter', c(names(tunabilityValuesMean())), selected = "All", multiple = FALSE) 187 | }) 188 | 189 | output$plot4 = renderPlotly({ 190 | dataf = data.frame(app_data[[input$meas]]$results[[input$algo]]$optimum$par.sets[,input$visual3]) 191 | name = input$visual3 192 | num = is.numeric(dataf[,1]) 193 | 194 | inputi = "glmnet" 195 | 196 | if(!is.null(input$algo)) 197 | inputi = input$algo 198 | 199 | if(num) { 200 | dataf = dataf[dataf[,1]!=-11, , drop = F] 201 | learner.i = which(learner.names == inputi) 202 | TRAFO = is.null(lrn.par.set[[learner.i]][[2]]$pars[[name]]$trafo) 203 | if(TRAFO) { 204 | ggplot(data=dataf, aes(dataf[,1])) + geom_histogram(aes(y=..density..), bins = input$nrbin, col = "black", fill = "white") + xlim(range(dataf[,1])) + xlab(name) 205 | } else { 206 | ggplot(data=dataf, aes(dataf[,1])) + geom_histogram(aes(y=..density..), bins = input$nrbin, col = "black", fill = "white") + xlim(range(dataf[,1])) + xlab(paste(name, "(log-scale)")) + scale_x_continuous(trans = "log10") 207 | } 208 | } else { 209 | ggplot(data=dataf, aes(dataf[,1])) + geom_bar(aes(y = (..count..)/sum(..count..)), col = "black", fill = "white") + 210 | xlab(name) + ylab("relative frequency") 211 | } 212 | }) 213 | 214 | output$quantile = renderUI({ 215 | numericInput('quantile', 'Quantile for tuning space calculation', 0.1, min = 0, max = 1) 216 | }) 217 | 218 | tuningSpace = reactive({ 219 | tab = calculateTuningSpace(app_data[[input$meas]]$results[[input$algo]]$optimum, quant = input$quantile) 220 | tab$numerics = cbind(Quantile = rownames(tab$numerics), tab$numerics) 221 | tab 222 | }) 223 | 224 | output$tuningSpaceNumerics = renderTable({ 225 | tuningSpace()$numerics 226 | }, rownames = FALSE, digits = 3) 227 | 228 | output$tuningSpaceFactors = renderTable({ 229 | tuningSpace()$factors 230 | }) 231 | 232 | output$combi = renderUI({ 233 | selectInput('combination', 'Measures', 234 | c("Tunability", "Joint gain", "Interaction effect"), 235 | selected = "Tunability", multiple = FALSE) 236 | }) 237 | 238 | output$combiTable <- renderTable({ 239 | tab = colMeans(resultsInput()$optimumTwoHyperpar$optimum, dims = 1, na.rm = TRUE) - mean(resultsInput()$default$result) 240 | if(input$combination == "Tunability") { 241 | diag(tab) = tunabilityValuesMean() 242 | } else { 243 | if(input$combination == "Interaction effect") { 244 | tab = tab - outer(tunabilityValuesMean(), tunabilityValuesMean(), '+') 245 | } else { 246 | tab = tab - outer(tunabilityValuesMean(), tunabilityValuesMean(), pmax) 247 | } 248 | } 249 | colnames(tab) = rownames(tab) = names(tunabilityValuesMean()) 250 | tab 251 | }, rownames = TRUE, digits = 4) 252 | 253 | 254 | output$par.set = renderUI({ 255 | tagList(makeLearnerParamUI(app_data[[input$meas]]$results[[input$algo]])) 256 | }) 257 | 258 | 259 | output$performanceHypParSetting = renderTable({ 260 | var_names = colnames(app_data[[input$meas]]$results[[input$algo]]$optimum$par.sets) 261 | par.set = numeric() 262 | for(i in 1:length(var_names)) { 263 | par.set[i] = input[[var_names[i]]] 264 | } 265 | par.set 266 | #calculatePerformance(surrogates_all[[input$algo]], par.set)$preds 267 | }) 268 | # 
performanceHypParSetting = reactive({ 269 | # calculatePerformance(surrogates_all[[input$algo]], par.set) 270 | # }) 271 | 272 | } 273 | 274 | makeLearnerParamUI = function(results_algo) { 275 | par.set = results_algo$optimum$par.sets 276 | inp = list() 277 | for(i in 1:ncol(par.set)) { 278 | par.type = class(par.set[,i]) 279 | par.id = names(par.set)[i] 280 | if (par.type == "numeric") 281 | inp[[i]] = numericInput(par.id, par.id, results_algo$default$default[i]) 282 | if (par.type == "factor") 283 | inp[[i]] = selectInput(par.id, par.id, choices = unique(par.set[,i]), selected = results_algo$default$default[i]) 284 | } 285 | inp 286 | } 287 | 288 | ui = fluidPage( 289 | conditionalPanel(condition = "!output.setupComplete", 290 | column(12, h2(p("Tunability Shiny App"))), 291 | column(12, h5(p("This app contains additional material for the paper 'Tunability: Importance of Hyperparameters of Machine Learning Algorithms'. 292 | For starting the app, just click the button:"))), 293 | column(12, align = "center", actionButton(inputId = "btn_data", label = "Start the shiny app!", width = '400px', 294 | style="color: #fff; background-color: #337ab7; border-color: #2e6da4")), 295 | br(), 296 | hr(), 297 | br(), 298 | hr(), 299 | br(), 300 | hr(), 301 | column(10, h3(p("Tunability: Importance of Hyperparameters of Machine Learning Algorithms"))), 302 | br(), 303 | br(), 304 | hr(), 305 | column(10, h5(p("Authors: Philipp Probst, Bernd Bischl, Anne-Laure Boulesteix"))), 306 | br(), 307 | hr(), 308 | column(10, h4(p("Paper Abstract"))), 309 | column(10, h5(p("Modern supervised machine learning algorithms involve hyperparameters that have to be set before running them. 310 | Options for setting hyperparameters are default values from the software package, manual configuration by the user or configuring them for optimal predictive performance by a tuning procedure. 311 | The goal of this paper is two-fold. 312 | Firstly, we formalize the problem of tuning from a statistical point of view, define data-based defaults and suggest general measures quantifying the tunability of hyperparameters of algorithms. 313 | Secondly, we conduct a large-scale benchmarking study based on 38 datasets from the OpenML platform and six common machine learning algorithms. 314 | We apply our measures to assess the tunability of their parameters. 315 | Our results yield default values for hyperparameters and enable users to decide whether it is worth conducting a possibly time consuming tuning strategy, to focus on the most important hyperparameters and to choose adequate hyperparameter spaces for tuning. "))) 316 | ), 317 | conditionalPanel(condition = "output.setupComplete", 318 | titlePanel("Tunability Shiny App"), 319 | hr(), 320 | wellPanel(fluidRow(column(12, h4("General settings"))), 321 | fluidRow(column(4,uiOutput("measureAll")),column(4,uiOutput("algorithm")),column(4,uiOutput("defaultchoice")))), 322 | hr(), 323 | fluidRow(column(12, h4(p("(around 10 seconds loading time for each panel)", style = "color:blue")))), 324 | tabsetPanel( 325 | tabPanel("Surrogate models comparison", 326 | fluidRow(column(12, h2("Comparison of the quality of surrogate models")), 327 | column(12, h5("The calculation of the tunability is based on the surrogate models. 328 | Hence, it is important to evaluate the performance of the surrogate model. 329 | In this panel five different surrogate models are compared. 
330 |             For the final calculation of the tunability measures the ranger surrogate model is chosen because it provides good and stable results.
331 |             See also section 5.1 in the paper."))),
332 |         hr(),
333 |         fluidRow(column(12, h4("Average performances of the surrogate models on the different datasets")),
334 |           column(12, tableOutput("bmr_result"))),
335 |         hr(),
336 |         fluidRow(column(12, h4("Distribution of the surrogate model performances on the different datasets")),
337 |           column(12, uiOutput("tab")),
338 |           column(6, uiOutput("logscale")), column(6, uiOutput("bmr_measure")),
339 |           plotOutput("plot1", width = "95%"))#,
340 |           #plotOutput("plot2", width = "95%")
341 |       ),
342 |
343 |       tabPanel("Defaults and tunability",
344 |         fluidRow(column(12, h2("Defaults and tunability")),
345 |           column(12, h5("In this panel the defaults and the corresponding tunabilities are depicted. For details see sections 3.2, 3.3, 3.4, 5.2 and 5.3 in the paper."))),
346 |         hr(),
347 |         fluidRow(column(12, h4("Defaults")),
348 |           column(12, h5("This table contains the default hyperparameter values that are used for the calculation of the tunability values.
349 |             The optimal defaults were calculated by choosing the hyperparameter setting with the best average performance over all datasets.
350 |             The package defaults are given by the corresponding R-packages.")),
351 |           column(12, tableOutput("defaults"))),
352 |         hr(),
353 |         fluidRow(
354 |           column(12, h4("Tunability")),
355 |           column(12, h5("The tunability values are calculated by taking the best performance of a hyperparameter setting on a
356 |             dataset (overall and for single hyperparameters) and subtracting the performance of the default hyperparameter setting.")),
357 |           column(12, h4("Mean tunability over the datasets")),
358 |           column(12, h5("For the following table the mean of the tunabilities over all the datasets is taken to provide one measure of tunability for each parameter.
359 |             The scaled version divides the tunability per hyperparameter by the overall tunability of the algorithm per dataset and takes the mean afterwards.")),
360 |           column(12, fluidRow(
361 |             column(1, h5("Overall mean tunability"), tableOutput("overallTunability")),
362 |             column(11, h5("Hyperparameters"), tableOutput("tunability"))
363 |           )),
364 |           column(12, uiOutput("scaled"))
365 |         ),
366 |         fluidRow(column(12, h4("Boxplot of tunability values per dataset")),
367 |           br(),
368 |           br(),
369 |           column(12, h4("Tunability values per dataset")),
370 |           plotlyOutput("plot3", width = "95%", inline = F),
371 |           sliderInput("yrange", "Y-axis limits:", min = 0, max = 0.5, value = c(0, 0.06), width = "800px")
372 |       )),
373 |       tabPanel("Combined tunability",
374 |         fluidRow(column(12, h2("Tunability of hyperparameter combinations and joint gain")),
375 |           column(12, h5("In this panel the tunabilities of hyperparameter combinations and joint gains can be seen. For details see sections 3.5 and 5.4 in the paper."))),
376 |         hr(),
377 |         fluidRow(column(12, uiOutput("combi")),
378 |           column(12, h4("Combined tunability and interaction effects")),
379 |           column(12, h5("The tunability values of the single hyperparameters are depicted on the diagonal, the combined tunabilities on the upper right of
380 |             the table.
For details of the calculation (also of the joint gain) see section 3.5 in the paper.")), 381 | column(12, tableOutput("combiTable"))) 382 | #) 383 | #) 384 | ), 385 | tabPanel("Tuning space", 386 | fluidRow(column(12, h2("Hyperparameter ranges for tuning and priors")), 387 | column(12, h5("In this panel the optimal hyperparameter ranges are depicted. For details see sections 3.6 and 5.5 in the paper."))), 388 | hr(), 389 | fluidRow(column(12, h4("Tuning Space"), 390 | column(12, h5("The tuning space is calculated by taking the best hyperparameters on each dataset and calculating the quantiles of these.")), 391 | column(12, uiOutput("quantile")), 392 | column(12, "Numerics", align="left", tableOutput("tuningSpaceNumerics")), 393 | column(12, "Factors", align="left", tableOutput("tuningSpaceFactors")) 394 | )), 395 | hr(), 396 | fluidRow(column(12, h4("Histogram of best hyperparameter on each of the datasets (possible prior for tuning)")), 397 | column(12, uiOutput("visual3"))), 398 | plotlyOutput("plot4", width = "95%", inline = F), 399 | sliderInput("nrbin", "Number of bins:", min = 0, max = 50, value = c(6), width = "800px") 400 | #fluidRow(column(6, uiOutput("visual")), 401 | # column(6, uiOutput("visual2"))) 402 | #plotlyOutput("plot5", inline = F), 403 | 404 | # conditionalPanel( 405 | # condition = "input.visual == 'Histogram'", 406 | # sliderInput("bins", "Number of bins:", min = 1, max = 50, value = 30) 407 | # )) 408 | ) 409 | ) 410 | ) 411 | ) 412 | 413 | 414 | 415 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /shiny/app_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PhilippPro/tunability/9fc4fa84f500b1eec446ad50860901c315680a76/shiny/app_data.RData -------------------------------------------------------------------------------- /shiny/helpers.R: -------------------------------------------------------------------------------- 1 | getSimpleLearners = function(){ 2 | # Simple learner param set 3 | simple.lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.glmnet", predict.type = "prob"), 4 | param.set = makeParamSet( 5 | makeNumericParam("alpha", lower = 0, upper = 1, default = 1), 6 | makeNumericVectorParam("lambda", len = 1L, lower = -10, upper = 10, default = 0 ,trafo = function(x) 2^x))) 7 | 8 | simple.lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.rpart", predict.type = "prob"), 9 | param.set = makeParamSet( 10 | makeNumericParam("cp", lower = 0, upper = 1, default = 0.01), 11 | makeIntegerParam("maxdepth", lower = 1, upper = 30, default = 30), 12 | makeIntegerParam("minbucket", lower = 1, upper = 60, default = 1), 13 | makeIntegerParam("minsplit", lower = 1, upper = 60, default = 20)), 14 | lrn.ps.sets = simple.lrn.par.set) 15 | 16 | return(simple.lrn.par.set) 17 | } 18 | 19 | getMultipleLearners = function(){ 20 | simple.lrn.par.set = getSimpleLearners() 21 | 22 | # increase to a general param set 23 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.kknn", predict.type = "prob"), 24 | param.set = makeParamSet( 25 | makeIntegerParam("k", lower = 1, upper = 30)), 26 | lrn.ps.sets = simple.lrn.par.set) 27 | 28 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.svm", predict.type = "prob"), 29 | param.set = makeParamSet( 30 | makeDiscreteParam("kernel", values = c("linear", "polynomial", "radial")), 31 | makeNumericParam("cost", lower = -10, upper = 10, trafo = function(x) 2^x), 32 | 
makeNumericParam("gamma", lower = -10, upper = 10, trafo = function(x) 2^x, requires = quote(kernel == "radial")), 33 | makeIntegerParam("degree", lower = 2, upper = 5, requires = quote(kernel == "polynomial"))), 34 | lrn.ps.sets = lrn.par.set) 35 | 36 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.ranger", predict.type = "prob"), 37 | param.set = makeParamSet( 38 | makeIntegerParam("num.trees", lower = 1, upper = 2000), 39 | makeLogicalParam("replace"), 40 | makeNumericParam("sample.fraction", lower = 0.1, upper = 1), 41 | makeNumericParam("mtry", lower = 0, upper = 1), 42 | makeLogicalParam(id = "respect.unordered.factors"), 43 | makeNumericParam("min.node.size", lower = 0, upper = 1)), 44 | lrn.ps.sets = lrn.par.set) 45 | 46 | lrn.par.set = makeLrnPsSets(learner = makeLearner("classif.xgboost", predict.type = "prob"), 47 | param.set = makeParamSet( 48 | makeIntegerParam("nrounds", lower = 1, upper = 5000), 49 | makeNumericParam("eta", lower = -10, upper = 0, trafo = function(x) 2^x), 50 | makeNumericParam("subsample",lower = 0.1, upper = 1), 51 | makeDiscreteParam("booster", values = c("gbtree", "gblinear")), 52 | makeIntegerParam("max_depth", lower = 1, upper = 15, requires = quote(booster == "gbtree")), 53 | makeNumericParam("min_child_weight", lower = 0, upper = 7, requires = quote(booster == "gbtree"), trafo = function(x) 2^x), 54 | makeNumericParam("colsample_bytree", lower = 0, upper = 1, requires = quote(booster == "gbtree")), 55 | makeNumericParam("colsample_bylevel", lower = 0, upper = 1, requires = quote(booster == "gbtree")), 56 | makeNumericParam("lambda", lower = -10, upper = 10, trafo = function(x) 2^x), 57 | makeNumericParam("alpha", lower = -10, upper = 10, trafo = function(x) 2^x)), 58 | lrn.ps.sets = lrn.par.set) 59 | 60 | return(lrn.par.set) 61 | } 62 | 63 | makeLrnPsSets = function(learner, param.set, lrn.ps.sets = NULL, 64 | id = paste0(learner$id, ".set"), overwrite = FALSE) { 65 | 66 | assertClass(learner, "Learner") 67 | assertClass(param.set, "ParamSet") 68 | par.match = names(param.set$pars) %in% names(learner$par.set$pars) 69 | if(all(par.match)){ 70 | ls = list(learner = learner, param.set = param.set) 71 | } else { 72 | stop(paste("The following parameters in param.set are not included in learner:", 73 | paste(names(param.set$pars[par.match == FALSE]), collapse = ", "))) 74 | } 75 | 76 | if(is.null(lrn.ps.sets)){ 77 | lrn.ps.sets = list() 78 | lrn.ps.sets[[id]] = ls 79 | attr(lrn.ps.sets, "class") = "LrnPsSet" 80 | } else { 81 | assertClass(lrn.ps.sets, "LrnPsSet") 82 | 83 | if(id %in% names(lrn.ps.sets) & overwrite == FALSE){ 84 | stop("tune.pair already contains id: \"", id, "\". 
Please specify a new id or set overwrite = TRUE.")
85 |     } else {
86 |       lrn.ps.sets[[id]] = ls
87 |     }
88 | }
89 |
90 | return(lrn.ps.sets)
91 | }
92 |
93 | lrn.par.set = getMultipleLearners()
94 |
95 | calculateTunability = function(default, optimumHyperpar, optimumTwoHyperpar = NULL) {
96 |   optimumHyperpar$optimum - default$result
97 | }
98 |
99 | calculateTuningSpace = function(optimum, quant) {
100 |   space = data.frame(row.names = c(quant, 1-quant))
101 |   space2 = list()
102 |   par.sets = optimum$par.sets
103 |   for(i in 1:ncol(par.sets)) {
104 |     if(is.numeric(par.sets[,i])) {
105 |       par.sets[par.sets[,i]==-11,i] = NA
106 |       space = cbind(space, quantile(par.sets[,i], c(quant, 1-quant), na.rm = TRUE))
107 |       colnames(space)[ncol(space)] = names(par.sets)[i]
108 |     }
109 |     if(is.factor(par.sets[,i]) | is.logical(par.sets[,i])) {
110 |       logic = table(par.sets[,i]) / length(par.sets[,i]) > quant
111 |       space2 = c(space2, list(names(table(par.sets[,i]))[logic]))
112 |       names(space2)[length(space2)] = names(par.sets)[i]
113 |     }
114 |   }
115 |   return(list(numerics = space, factors = space2))
116 | }
117 |
--------------------------------------------------------------------------------
/shiny/preproc.R:
--------------------------------------------------------------------------------
1 | setwd("/nfsmb/koll/probst/Paper/Exploration_of_Hyperparameters/tunability/shiny")
2 | load("results_all.RData")
3 |
4 | # Extract only the absolutely necessary information
5 | app_data = list()
6 |
7 | measures = names(results_all)
8 | classifiers = names(results_all$auc$bmr_surrogate)
9 |
10 | for(i in measures) {
11 |   for(j in classifiers) {
12 |     app_data[[i]]$surrogate[[j]] = getBMRAggrPerformances(results_all[[i]]$bmr_surrogate[[j]], as.df = TRUE)
13 |     app_data[[i]]$results = results_all[[i]]$results
14 |     app_data[[i]]$resultsPackageDefaults = results_all[[i]]$resultsPackageDefaults
15 |     app_data[[i]]$results_cv = results_all[[i]]$results_cv
16 |     app_data[[i]]$lrn.par.set = results_all[[i]]$lrn.par.set
17 |   }
18 | }
19 |
20 | for(i in measures) { # drop the learner-name prefix (first 12 characters) from the stored names
21 |   names(app_data[[i]]$surrogate) = substring(names(app_data[[i]]$surrogate), 13)
22 |   names(app_data[[i]]$results) = substring(names(app_data[[i]]$results), 13)
23 |   names(app_data[[i]]$resultsPackageDefaults) = substring(names(app_data[[i]]$resultsPackageDefaults), 13)
24 |   names(app_data[[i]]$results_cv) = substring(names(app_data[[i]]$results_cv), 13)
25 |   #names(app_data[[1]]$lrn.par.set)
26 | }
27 |
28 | for(i in measures) { # rename the performance column and shorten the surrogate learner names (drop the first 5 characters)
29 |   for(j in classifiers) {
30 |     colnames(app_data[[i]]$surrogate[[j]])[2] = "surrogate"
31 |     levels(app_data[[i]]$surrogate[[j]]$surrogate) = substring(levels(app_data[[i]]$surrogate[[j]]$surrogate), 6)
32 |   }
33 | }
34 |
35 | save(app_data, file = "app_data.RData")
36 |
37 | # The auc and accuracy surrogate results do not differ!? -> recompute -> only the sixth one for accuracy?, also in the paper!
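# Quick check for the note above (an added sketch, not part of the original preprocessing):
# compare the extracted surrogate performance tables for auc and accuracy. It assumes that
# both "auc" and "accuracy" are element names of app_data; if they are not, nothing is printed.
if (all(c("auc", "accuracy") %in% names(app_data))) {
  for (j in names(app_data$auc$surrogate)) {
    cat(j, "- identical auc/accuracy surrogate table:",
        identical(app_data$auc$surrogate[[j]], app_data$accuracy$surrogate[[j]]), "\n")
  }
}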
38 | # resultsPackageDefaults is missing for the AUC
39 |
--------------------------------------------------------------------------------
/shiny/results_all.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PhilippPro/tunability/9fc4fa84f500b1eec446ad50860901c315680a76/shiny/results_all.RData
--------------------------------------------------------------------------------
/shiny/rsconnect/shinyapps.io/philipppro/tunability.dcf:
--------------------------------------------------------------------------------
1 | name: tunability
2 | title: tunability
3 | username:
4 | account: philipppro
5 | server: shinyapps.io
6 | hostUrl: https://api.shinyapps.io/v1
7 | appId: 299916
8 | bundleId: 1845757
9 | url: https://philipppro.shinyapps.io/tunability/
10 | when: 1548856147.23563
11 | asMultiple: FALSE
12 | asStatic: FALSE
13 | ignoredFiles: preproc.R|results_all.RData|old/app_old.R|old/old_app.R
14 |
--------------------------------------------------------------------------------
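For readers who want to recompute the numbers shown in the app outside of Shiny, the following is a minimal sketch of the calculation behind the "Combined tunability" table (see output$combiTable in shiny/app.R). It assumes that shiny/app_data.RData has been downloaded to the working directory and that "auc" and "glmnet" are valid element names (these are the defaults used in the app); the field names optimumHyperpar and optimumTwoHyperpar are inferred from the function signatures above and may differ in the actual RData file.

# Minimal sketch: recompute mean tunability, combined tunability, joint gain and
# interaction effects for one algorithm and one measure, mirroring output$combiTable.
load("app_data.RData")

# calculateTunability() as defined in shiny/helpers.R
calculateTunability = function(default, optimumHyperpar, optimumTwoHyperpar = NULL) {
  optimumHyperpar$optimum - default$result
}

res = app_data[["auc"]]$results[["glmnet"]]

# Mean tunability of each single hyperparameter, averaged over the datasets
tun = colMeans(calculateTunability(res$default, res$optimumHyperpar), na.rm = TRUE)

# Mean tunability of each hyperparameter pair (upper triangle of the table)
tab = colMeans(res$optimumTwoHyperpar$optimum, dims = 1, na.rm = TRUE) - mean(res$default$result)
colnames(tab) = rownames(tab) = names(tun)

joint.gain  = tab - outer(tun, tun, pmax)  # gain of the pair over the better single hyperparameter
interaction = tab - outer(tun, tun, "+")   # deviation from additivity of the single tunabilities

diag(tab) = tun                            # single-hyperparameter tunabilities on the diagonal
round(tab, 4)

The diagonal of tab then corresponds to the "Mean tunability over the datasets" table, while joint.gain and interaction correspond to the "Joint gain" and "Interaction effect" options of the measure selector in the "Combined tunability" panel.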