├── LICENSE
├── README.md
├── caret-classification
│   ├── iris-classification-all-fast.tsv
│   ├── caret-all-binary-class-PimaIndiansDiabetes.R
│   ├── caret-all-binary-class-PimaIndiansDiabetes.tsv
│   ├── iris-classification-all-fast.R
│   └── iris-classification-caret-all.R
├── caret-cv
│   ├── HAR-all-CV-methods.R
│   ├── caret-all-cv-methods-lapply-sapply.R
│   ├── caret-all-cv-parallel-cubist.R
│   ├── caret-all-cv-parallel-qrf.R
│   └── caret-cv-simple.R
├── caret-datasets
│   ├── caret-MS-datasets.csv
│   └── view-caret-ML-datasets.R
├── caret-parallel
│   ├── caret-parallel-train-cubist.R
│   ├── caret-parallel-train-rf-deLuxe.R
│   ├── caret-parallel-train.R
│   ├── learning-curve-plots-caret-parallel.R
│   ├── run-multiple-caret-models-parallel-lapply.R
│   └── run-multiple-caret-models-parallel-sapply.R
├── caret-regression
│   ├── caret-all-regression-models.R
│   ├── caret-all-regressions-DT-cars.csv
│   ├── caret-all-regressions-DT-concrete.R
│   ├── caret-all-regressions-DT-concrete.csv
│   └── caret-regression-plotObsVsPred.R
├── caret-setup
│   ├── caret-get-all-models-automatically.R
│   ├── caret-model-list-v6058.csv
│   ├── caret-modelLookup-DT.R
│   ├── caret-setup-comfort.R
│   ├── caret-setup-deLuxe.R
│   └── caret-simple-setup.R
└── caret-tune
    └── caret-tune-evolutionial-algorithm-svmRadial.R
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Tobias Kind
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # caret-machine-learning examples
2 | This R repository collects practical working examples for many of the 200 classification and regression models in caret and is geared towards practitioners. Example contributions from different fields are highly welcome.
3 |
4 | The caret machine learning package [(WIKI)](http://topepo.github.io/caret/index.html) bundles around 200 classification and regression algorithms. Additional support for caret is available at the website [appliedpredictivemodeling.com](http://appliedpredictivemodeling.com/) and in the excellent book by Max Kuhn and Kjell Johnson [ISBN: 978-1-4614-6848-6](http://link.springer.com/book/10.1007/978-1-4614-6849-3).
5 |
6 | Please read more in this [**caret-machine-learning WIKI**](https://github.com/tobigithub/caret-machine-learning/wiki) or browse the example R code.
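A quick way to see which models the locally installed caret version actually provides is the same `modelLookup()` call used throughout these scripts; a minimal sketch (the exact count depends on the caret release):

```r
# list all model names known to the installed caret version
library(caret)
m <- unique(modelLookup()$model)   # modelLookup() has one row per tuning parameter
length(m)                          # around 200, depending on the caret release
head(m)
```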
7 | 8 | --- 9 | -------------------------------------------------------------------------------- /caret-classification/ iris-classification-all-fast.tsv: -------------------------------------------------------------------------------- 1 | Num Name Accuracy Kappa time [s] Model name 2 | 2 avNNet 0.969 0.953 3.400 Model Averaged Neural Network 3 | 97 vglmCumulative 0.965 0.947 2.840 Cumulative Probability Model for Ordinal Data 4 | 96 vglmContRatio 0.963 0.944 2.950 Continuation Ratio Model for Ordinal Data 5 | 79 sda 0.961 0.941 0.880 Shrinkage Discriminant Analysis 6 | 1 amdai 0.961 0.941 10.850 Adaptive Mixture Discriminant Analysis 7 | 31 hdrda 0.961 0.941 1.590 High-Dimensional Regularized Discriminant Analysis 8 | 37 lda 0.961 0.941 0.580 Linear Discriminant Analysis 9 | 38 lda2 0.961 0.941 0.590 Linear Discriminant Analysis 10 | 60 pda 0.961 0.941 0.730 Penalized Discriminant Analysis 11 | 69 rda 0.961 0.941 1.890 Regularized Discriminant Analysis 12 | 27 glmnet 0.960 0.940 1.570 glmnet 13 | 72 rlda 0.960 0.940 0.870 Regularized Linear Discriminant Analysis 14 | 59 pcaNNet 0.959 0.939 1.430 Neural Networks with Feature Extraction 15 | 95 vglmAdjCat 0.958 0.937 2.060 Adjacent Categories Probability Model for Ordinal Data 16 | 52 nnet 0.957 0.936 1.360 Neural Network 17 | 56 parRF 0.956 0.933 0.880 Parallel Random Forest 18 | 41 loclda 0.956 0.933 2.340 Localized Linear Discriminant Analysis 19 | 53 oblique.tree 0.955 0.932 3.830 Oblique Trees 20 | 20 extraTrees 0.955 0.932 2.940 Random Forest by Randomization 21 | 25 gbm 0.955 0.932 1.170 Stochastic Gradient Boosting 22 | 13 CSimca 0.955 0.932 1.340 SIMCA 23 | 6 Boruta 0.954 0.931 5.610 Random Forest with Additional Feature Selection 24 | 45 mda 0.954 0.931 1.130 Mixture Discriminant Analysis 25 | 70 rf 0.953 0.930 1.400 Random Forest 26 | 94 treebag 0.953 0.929 1.330 Bagged CART 27 | 68 rbfDDA 0.953 0.929 2.790 Radial Basis Function Network 28 | 67 ranger 0.952 0.928 0.980 Random Forest 29 | 42 LogitBoost 0.951 0.926 0.850 Boosted Logistic Regression 30 | 87 svmLinear 0.950 0.924 0.670 Support Vector Machines with Linear Kernel 31 | 88 svmLinear2 0.949 0.923 0.730 Support Vector Machines with Linear Kernel 32 | 30 hdda 0.949 0.923 0.860 High Dimensional Discriminant Analysis 33 | 39 Linda 0.949 0.923 0.830 Robust Linear Discriminant Analysis 34 | 36 knn 0.948 0.922 0.610 k-Nearest Neighbors 35 | 40 LMT 0.946 0.918 2.760 Logistic Model Trees 36 | 86 stepQDA 0.945 0.918 2.830 Quadratic Discriminant Analysis with Stepwise Feature Selection 37 | 65 qda 0.945 0.917 0.610 Quadratic Discriminant Analysis 38 | 78 RSimca 0.945 0.917 0.940 Robust SIMCA 39 | 21 fda 0.945 0.916 0.830 Flexible Discriminant Analysis 40 | 3 bagFDAGCV 0.944 0.916 4.650 Bagged FDA using gCV Pruning 41 | 9 C5.0 0.944 0.915 1.150 C5.0 42 | 85 stepLDA 0.943 0.915 2.910 Linear Discriminant Analysis with Stepwise Feature Selection 43 | 99 wsrf 0.943 0.914 1.110 Weighted Subspace Random Forest 44 | 18 earth 0.943 0.914 1.000 Multivariate Adaptive Regression Spline 45 | 44 lvq 0.942 0.913 0.870 Learning Vector Quantization 46 | 100 xyf 0.942 0.913 2.320 Self-Organizing Maps 47 | 29 hda 0.942 0.912 3.870 Heteroscedastic Discriminant Analysis 48 | 76 rpart2 0.940 0.910 0.640 CART 49 | 54 OneR 0.940 0.909 0.830 Single Rule Classification 50 | 12 cforest 0.938 0.906 2.930 Conditional Inference Random Forest 51 | 74 rpart 0.938 0.906 0.700 CART 52 | 75 rpart1SE 0.938 0.906 0.660 CART 53 | 77 rpartScore 0.938 0.906 2.390 CART or Ordinal Responses 54 | 89 svmPoly 0.938 0.906 2.890 
Support Vector Machines with Polynomial Kernel 55 | 14 ctree 0.937 0.906 0.860 Conditional Inference Tree 56 | 15 ctree2 0.937 0.906 1.200 Conditional Inference Tree 57 | 26 gcvEarth 0.937 0.904 0.810 Multivariate Adaptive Regression Splines 58 | 33 JRip 0.936 0.904 2.300 Rule-Based Classifier 59 | 58 partDSA 0.936 0.903 4.950 partDSA 60 | 57 PART 0.934 0.901 0.840 Rule-Based Classifier 61 | 49 mlpWeightDecay 0.933 0.899 7.470 Multi-Layer Perceptron 62 | 50 mlpWeightDecayML 0.933 0.899 7.770 Multi-Layer Perceptron, multiple layers 63 | 11 C5.0Tree 0.933 0.899 0.550 Single C5.0 Tree 64 | 10 C5.0Rules 0.932 0.898 0.530 Single C5.0 Ruleset 65 | 32 J48 0.932 0.897 2.940 C4.5-like Trees 66 | 47 mlp 0.931 0.896 3.270 Multi-Layer Perceptron 67 | 48 mlpML 0.931 0.896 3.090 Multi-Layer Perceptron, with multiple layers 68 | 66 QdaCov 0.927 0.891 0.750 Robust Quadratic Discriminant Analysis 69 | 91 svmRadialCost 0.926 0.889 0.840 Support Vector Machines with Radial Basis Function Kernel 70 | 5 bdk 0.926 0.888 1.900 Self-Organizing Map 71 | 64 protoclass 0.925 0.887 1.190 Greedy Prototype Selection 72 | 90 svmRadial 0.923 0.884 0.890 Support Vector Machines with Radial Basis Function Kernel 73 | 92 svmRadialSigma 0.923 0.884 1.360 Support Vector Machines with Radial Basis Function Kernel 74 | 93 svmRadialWeights 0.923 0.884 1.060 Support Vector Machines with Class Weights 75 | 43 lssvmRadial 0.914 0.871 3.590 Least Squares Support Vector Machine with Radial Basis Function Kernel 76 | 35 kknn 0.912 0.867 1.150 k-Nearest Neighbors 77 | 24 gaussprRadial 0.909 0.864 2.310 Gaussian Process with Radial Basis Function Kernel 78 | 61 PenalizedLDA 0.902 0.853 0.880 Penalized Linear Discriminant Analysis 79 | 55 pam 0.900 0.850 0.870 Nearest Shrunken Centroids 80 | 51 nb 0.899 0.849 1.010 Naive Bayes 81 | 16 dda 0.891 0.837 2.260 Diagonal Discriminant Analysis 82 | 19 elm 0.875 0.812 1.020 Extreme Learning Machine 83 | 82 slda 0.836 0.753 0.970 Stabilized Linear Discriminant Analysis 84 | 84 spls 0.796 0.693 1.360 Sparse Partial Least Squares 85 | 34 kernelpls 0.794 0.692 0.630 Partial Least Squares 86 | 63 pls 0.794 0.692 0.630 Partial Least Squares 87 | 81 simpls 0.794 0.692 0.660 Partial Least Squares 88 | 98 widekernelpls 0.794 0.692 0.730 Partial Least Squares 89 | 71 RFlda 0.758 0.571 0.770 Factor-Based Linear Discriminant Analysis 90 | 46 Mlda 0.757 0.569 0.720 Maximum Uncertainty Linear Discriminant Analysis 91 | 83 sparseLDA 0.666 0.499 1.330 Sparse Linear Discriminant Analysis 92 | 4 bayesglm 0.665 0.495 0.850 Bayesian Generalized Linear Model 93 | 8 bstSm 0.665 0.495 2.780 Boosted Smoothing Spline 94 | 22 gam 0.665 0.495 2.540 Generalized Additive Model using Splines 95 | 23 gamLoess 0.665 0.495 1.130 Generalized Additive Model using LOESS 96 | 62 plr 0.665 0.495 0.860 Penalized Logistic Regression 97 | 73 rocc 0.665 0.495 0.890 ROC-Based Classifier 98 | 80 sdwd 0.665 0.495 1.570 Sparse Distance Weighted Discrimination 99 | 7 BstLm 0.588 0.380 2.330 Boosted Linear Model 100 | 17 dnn 0.320 0.000 2.620 Stacked AutoEncoder Deep Neural Network 101 | 28 gpls 1.000 2.340 Generalized Partial Least Squares 102 | -------------------------------------------------------------------------------- /caret-classification/caret-all-binary-class-PimaIndiansDiabetes.R: -------------------------------------------------------------------------------- 1 | # Use of all 160 caret models for binary classification and diabetes set 2 | # The output from fast (working) binary classification models is 3 | # exported to a 
sortable table in a web browser using the DT library 4 | # https://github.com/tobigithub/caret-machine-learning 5 | # R3.3.1 and caret_6.0-70 6 | # Tobias Kind (2016) 7 | 8 | # use mlbench, caret and DT library 9 | require(mlbench) 10 | require(caret) 11 | require(DT) 12 | 13 | # load diabetes set 768 x 9 14 | data(PimaIndiansDiabetes) 15 | dim(PimaIndiansDiabetes) 16 | 17 | # get all model names for classification 18 | m <- unique(modelLookup()[modelLookup()$forClass,c(1)]) 19 | length(m); m; 20 | 21 | # slow classification models ("rbf" crashes; "dwdLinear", "ownn", "snn" have issues) 22 | # all others may have just failed and are not listed here 23 | # 24 | removeModels <- c("AdaBag", "AdaBoost.M1", "FH.GBML", "pda2", "PenalizedLDA", 25 | "GFS.GCCL", "rbf", "RFlda", "nodeHarvest", "ORFsvm", "dwdLinear", "dwdPoly", "gam", 26 | "gaussprLinear", "ownn", "sddaLDA", "sddaQDA", "SLAVE", "smda", "snn", "rmda", 27 | "rFerns", "wsrf","ordinalNet","awnb", "awtan","manb","nbDiscrete","nbSearch","tan", 28 | "tanSearch","bartMachine","randomGLM", "Rborist", "adaboost") 29 | 30 | #remove all slow and failed models from model list 31 | m <- m[!m %in% removeModels] 32 | 33 | #m <- c("glm","gbm", "adaboost" ,"rf") 34 | 35 | # pre-load all packages (does not really work due to other dependencies) 36 | suppressPackageStartupMessages(ll <-lapply(m, require, character.only = TRUE)) 37 | 38 | # show which libraries were loaded 39 | sessionInfo() 40 | 41 | # load X and Y (this will be transferred to to train function) 42 | #X = PimaIndiansDiabetes[1:60,1:8] 43 | #Y = PimaIndiansDiabetes$diabetes[1:60] 44 | X = PimaIndiansDiabetes[,1:8] 45 | Y = PimaIndiansDiabetes$diabetes 46 | 47 | # register parallel front-end 48 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl) 49 | 50 | # this is required otherwise the first method is benchmarked wrong 51 | warmup <-train(y=Y, x=X, "rf", trControl = trainControl(method = "boot632")) 52 | 53 | # this setup actually calls the caret::train function, in order to provide 54 | # minimal error handling this type of construct is needed. 55 | trainCall <- function(i) 56 | { 57 | cat("----------------------------------------------------","\n"); 58 | set.seed(123); cat(i," <- loaded\n"); 59 | return(tryCatch( 60 | t2 <- train(y=Y, x=X, (i), trControl = trainControl(method = "boot632")), 61 | error=function(e) NULL)) 62 | } 63 | 64 | # use lapply/loop to run everything, required for try/catch error function to work 65 | t2 <- lapply(m, trainCall) 66 | 67 | #remove NULL values, we only allow succesful methods, provenance is deleted. 
68 | t2 <- t2[!sapply(t2, is.null)] 69 | 70 | # this setup extracts the results with minimal error handling 71 | # TrainKappa can be sometimes zero, but Accuracy SD can be still available 72 | # see Kappa value http://epiville.ccnmtl.columbia.edu/popup/how_to_calculate_kappa.html 73 | printCall <- function(i) 74 | { 75 | return(tryCatch( 76 | { 77 | cat(sprintf("%-22s",(m[i]))) 78 | cat(round(getTrainPerf(t2[[i]])$TrainAccuracy,4),"\t") 79 | cat(round(getTrainPerf(t2[[i]])$TrainKappa,4),"\t") 80 | cat(t2[[i]]$times$everything[3],"\n")}, 81 | error=function(e) NULL)) 82 | } 83 | 84 | r2 <- lapply(1:length(t2), printCall) 85 | 86 | # stop cluster and register sequntial front end 87 | stopCluster(cl); registerDoSEQ(); 88 | 89 | # preallocate data types 90 | i = 1; MAX = length(t2); 91 | x1 <- character() # Name 92 | x2 <- numeric() # R2 93 | x3 <- numeric() # RMSE 94 | x4 <- numeric() # time [s] 95 | x5 <- character() # long model name 96 | 97 | # fill data and check indexes and NA with loop/lapply 98 | for (i in 1:length(t2)) { 99 | x1[i] <- t2[[i]]$method 100 | x2[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainAccuracy,4)) 101 | x3[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainKappa,4)) 102 | x4[i] <- as.numeric(t2[[i]]$times$everything[3]) 103 | x5[i] <- t2[[i]]$modelInfo$label 104 | } 105 | 106 | # coerce to data frame 107 | df1 <- data.frame(x1,x2,x3,x4,x5, stringsAsFactors=FALSE) 108 | 109 | # print all results to R-GUI 110 | df1 111 | 112 | # plot models, just as example 113 | # ggplot(t2[[1]]) 114 | # ggplot(t2[[1]]) 115 | 116 | # call web output with correct column names 117 | datatable(df1, options = list( 118 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5))), 119 | pageLength = MAX, 120 | order = list(list(2, 'desc'))), 121 | colnames = c('Num', 'Name', 'Accuracy', 'Kappa', 'time [s]', 'Model name'), 122 | caption = paste('Classification results from caret models',Sys.time()), 123 | class = 'cell-border stripe') %>% 124 | formatRound('x2', 3) %>% 125 | formatRound('x3', 3) %>% 126 | formatRound('x4', 3) %>% 127 | formatStyle(2, 128 | background = styleColorBar(x2, 'steelblue'), 129 | backgroundSize = '100% 90%', 130 | backgroundRepeat = 'no-repeat', 131 | backgroundPosition = 'center' 132 | ) 133 | 134 | 135 | ### END 136 | 137 | -------------------------------------------------------------------------------- /caret-classification/caret-all-binary-class-PimaIndiansDiabetes.tsv: -------------------------------------------------------------------------------- 1 | Num Name Accuracy Kappa time [s] Model name 2 | 73 ORFlog 0.844 0.656 682 Oblique Random Forest 3 | 89 rf 0.849 0.656 4.24 Random Forest 4 | 75 ORFridge 0.846 0.655 1494.08 Oblique Random Forest 5 | 74 ORFpls 0.846 0.654 367.87 Oblique Random Forest 6 | 77 parRF 0.847 0.652 9.52 Parallel Random Forest 7 | 11 Boruta 0.847 0.652 132.71 Random Forest with Additional Feature Selection 8 | 86 ranger 0.845 0.651 23.89 Random Forest 9 | 30 extraTrees 0.846 0.648 20.36 Random Forest by Randomization 10 | 101 RRFglobal 0.842 0.643 5.21 Regularized Random Forest 11 | 120 treebag 0.836 0.63 2 Bagged CART 12 | 125 xgbLinear 0.834 0.628 43.35 eXtreme Gradient Boosting 13 | 126 xgbTree 0.831 0.623 13.64 eXtreme Gradient Boosting 14 | 100 RRF 0.822 0.602 11.14 Regularized Random Forest 15 | 24 deepboost 0.822 0.601 64.7 DeepBoost 16 | 87 rbfDDA 0.823 0.599 15.19 Radial Basis Function Network 17 | 94 rotationForestCp 0.804 0.555 9.82 Rotation Forest 18 | 84 protoclass 0.796 0.553 4.7 Greedy Prototype 
Selection 19 | 39 gbm 0.796 0.538 1.4 Stochastic Gradient Boosting 20 | 71 oblique.tree 0.79 0.534 91.98 Oblique Trees 21 | 93 rotationForest 0.793 0.527 5.33 Rotation Forest 22 | 16 C5.0Cost 0.789 0.524 10.3 Cost-Sensitive C5.0 23 | 15 C5.0 0.789 0.524 5.09 C5.0 24 | 118 svmRadialSigma 0.786 0.51 3.24 Support Vector Machines with Radial Basis Function Kernel 25 | 19 cforest 0.786 0.51 32.27 Conditional Inference Random Forest 26 | 1 ada 0.782 0.509 34.96 Boosted Classification Trees 27 | 37 gaussprPoly 0.785 0.506 638.57 Gaussian Process with Polynomial Kernel 28 | 80 pcaNNet 0.775 0.503 3.37 Neural Networks with Feature Extraction 29 | 35 gamLoess 0.779 0.5 1.39 Generalized Additive Model using LOESS 30 | 52 kknn 0.776 0.498 2.48 k-Nearest Neighbors 31 | 38 gaussprRadial 0.78 0.496 55.13 Gaussian Process with Radial Basis Function Kernel 32 | 31 fda 0.778 0.493 0.95 Flexible Discriminant Analysis 33 | 5 bagEarthGCV 0.777 0.492 10.6 Bagged MARS using gCV Pruning 34 | 27 earth 0.777 0.491 1.19 Multivariate Adaptive Regression Spline 35 | 40 gcvEarth 0.777 0.491 0.86 Multivariate Adaptive Regression Splines 36 | 4 bagEarth 0.777 0.491 23.94 Bagged MARS 37 | 119 svmRadialWeights 0.779 0.49 3.11 Support Vector Machines with Class Weights 38 | 116 svmRadial 0.779 0.49 1.49 Support Vector Machines with Radial Basis Function Kernel 39 | 117 svmRadialCost 0.778 0.488 1.52 Support Vector Machines with Radial Basis Function Kernel 40 | 96 rpart1SE 0.77 0.487 0.75 CART 41 | 14 bstTree 0.779 0.486 20.59 Boosted Tree 42 | 7 bagFDAGCV 0.774 0.483 8.12 Bagged FDA using gCV Pruning 43 | 34 gamboost 0.775 0.48 3.4 Boosted Generalized Additive Model 44 | 6 bagFDA 0.773 0.479 19.83 Bagged Flexible Discriminant Analysis 45 | 21 ctree 0.767 0.479 0.98 Conditional Inference Tree 46 | 115 svmPoly 0.778 0.477 6.96 Support Vector Machines with Polynomial Kernel 47 | 32 FRBCS.CHI 0.762 0.476 1254.79 Fuzzy Rules Using Chi's Method 48 | 58 loclda 0.774 0.476 5.1 Localized Linear Discriminant Analysis 49 | 43 glmnet 0.773 0.476 1.07 glmnet 50 | 88 rda 0.773 0.475 3.93 Regularized Discriminant Analysis 51 | 55 lda2 0.773 0.475 0.63 Linear Discriminant Analysis 52 | 2 amdai 0.773 0.475 0.62 Adaptive Mixture Discriminant Analysis 53 | 54 lda 0.773 0.475 0.62 Linear Discriminant Analysis 54 | 48 hdrda 0.773 0.475 8.02 High-Dimensional Regularized Discriminant Analysis 55 | 81 pda 0.773 0.475 0.83 Penalized Discriminant Analysis 56 | 8 bayesglm 0.773 0.475 0.75 Bayesian Generalized Linear Model 57 | 121 vglmAdjCat 0.773 0.474 2.15 Adjacent Categories Probability Model for Ordinal Data 58 | 36 gamSpline 0.773 0.474 1.94 Generalized Additive Model using Splines 59 | 68 multinom 0.773 0.474 0.85 Penalized Multinomial Regression 60 | 41 glm 0.773 0.474 0.66 Generalized Linear Model 61 | 82 plr 0.773 0.474 10.91 Penalized Logistic Regression 62 | 44 glmStepAIC 0.772 0.474 1.19 Generalized Linear Model with Stepwise Feature Selection 63 | 122 vglmContRatio 0.773 0.473 4.06 Continuation Ratio Model for Ordinal Data 64 | 123 vglmCumulative 0.773 0.473 2.68 Cumulative Probability Model for Ordinal Data 65 | 62 mda 0.773 0.473 1.31 Mixture Discriminant Analysis 66 | 109 spls 0.773 0.471 4.13 Sparse Partial Least Squares 67 | 91 rlda 0.74 0.471 1.67 Regularized Linear Discriminant Analysis 68 | 42 glmboost 0.771 0.469 1.11 Boosted Generalized Linear Model 69 | 102 rrlda 0.752 0.467 24.74 Robust Regularized Linear Discriminant Analysis 70 | 69 nb 0.764 0.466 3.65 Naive Bayes 71 | 114 svmLinearWeights 0.77 0.466 2.36 Linear Support 
Vector Machines with Class Weights 72 | 113 svmLinear2 0.77 0.466 1.25 Support Vector Machines with Linear Kernel 73 | 90 rfRules 0.769 0.466 475.51 Random Forest Rule-Based Model 74 | 97 rpart2 0.766 0.465 0.81 CART 75 | 112 svmLinear 0.769 0.464 0.92 Support Vector Machines with Linear Kernel 76 | 33 FRBCS.W 0.776 0.463 1218.4 Fuzzy Rules with Weight Factor 77 | 60 lssvmRadial 0.767 0.462 8.93 Least Squares Support Vector Machine with Radial Basis Function Kernel 78 | 105 sdwd 0.771 0.462 1.34 Sparse Distance Weighted Discrimination 79 | 63 Mlda 0.768 0.459 0.81 Maximum Uncertainty Linear Discriminant Analysis 80 | 18 C5.0Tree 0.754 0.455 0.7 Single C5.0 Tree 81 | 49 J48 0.756 0.455 2.93 C4.5-like Trees 82 | 104 sda 0.764 0.454 1.11 Shrinkage Discriminant Analysis 83 | 10 blackboost 0.763 0.448 4.98 Boosted Tree 84 | 23 dda 0.737 0.446 3.2 Diagonal Discriminant Analysis 85 | 50 JRip 0.753 0.442 3.62 Rule-Based Classifier 86 | 17 C5.0Rules 0.749 0.439 0.75 Single C5.0 Ruleset 87 | 78 PART 0.746 0.439 15.49 Rule-Based Classifier 88 | 45 gpls 0.757 0.434 15.14 Generalized Partial Least Squares 89 | 83 pls 0.758 0.434 0.81 Partial Least Squares 90 | 124 widekernelpls 0.758 0.434 0.78 Partial Least Squares 91 | 106 simpls 0.758 0.434 0.76 Partial Least Squares 92 | 51 kernelpls 0.758 0.434 0.73 Partial Least Squares 93 | 29 evtree 0.754 0.433 119.09 Tree Models from Genetic Algorithms 94 | 59 LogitBoost 0.747 0.43 1.04 Boosted Logistic Regression 95 | 46 hda 0.751 0.423 14.34 Heteroscedastic Discriminant Analysis 96 | 99 rpartScore 0.748 0.423 14.29 CART or Ordinal Responses 97 | 95 rpart 0.748 0.423 0.81 CART 98 | 13 bstSm 0.758 0.421 13.74 Boosted Smoothing Spline 99 | 85 qda 0.745 0.42 0.65 Quadratic Discriminant Analysis 100 | 57 LMT 0.74 0.419 5.76 Logistic Model Trees 101 | 56 Linda 0.737 0.418 1.91 Robust Linear Discriminant Analysis 102 | 47 hdda 0.74 0.416 0.86 High Dimensional Discriminant Analysis 103 | 92 rocc 0.751 0.412 1.53 ROC-Based Classifier 104 | 53 knn 0.739 0.41 0.77 k-Nearest Neighbors 105 | 22 ctree2 0.746 0.409 1.53 Conditional Inference Tree 106 | 3 avNNet 0.746 0.394 12.67 Model Averaged Neural Network 107 | 110 stepLDA 0.744 0.391 7.36 Linear Discriminant Analysis with Stepwise Feature Selection 108 | 20 CSimca 0.692 0.389 1.22 SIMCA 109 | 111 stepQDA 0.744 0.387 7.4 Quadratic Discriminant Analysis with Stepwise Feature Selection 110 | 79 partDSA 0.725 0.377 12.29 partDSA 111 | 26 dwdRadial 0.776 0.368 35 Distance Weighted Discrimination with Radial Basis Function Kernel 112 | 103 RSimca 0.67 0.367 2.17 Robust SIMCA 113 | 127 xyf 0.715 0.339 4.74 Self-Organizing Maps 114 | 9 bdk 0.713 0.323 5.17 Self-Organizing Map 115 | 76 pam 0.729 0.321 7 Nearest Shrunken Centroids 116 | 98 rpartCost 0.729 0.316 0.87 Cost-Sensitive CART 117 | 72 OneR 0.706 0.313 0.99 Single Rule Classification 118 | 61 lvq 0.705 0.296 1.35 Learning Vector Quantization 119 | 70 nnet 0.704 0.295 2.8 Neural Network 120 | 28 elm 0.669 0.171 1.17 Extreme Learning Machine 121 | 107 slda 0.671 0.143 1.39 Stabilized Linear Discriminant Analysis 122 | 12 BstLm 0.644 0.026 2.84 Boosted Linear Model 123 | 67 mlpWeightDecayML 0.648 0 34.63 Multi-Layer Perceptron multiple layers 124 | 66 mlpWeightDecay 0.648 0 34.26 Multi-Layer Perceptron 125 | 25 dnn 0.648 0 5.22 Stacked AutoEncoder Deep Neural Network 126 | 108 sparseLDA 0.354 -0.001 1.66 Sparse Linear Discriminant Analysis 127 | 64 mlp 0.647 -0.001 12.56 Multi-Layer Perceptron 128 | 65 mlpML 0.647 -0.001 12.02 Multi-Layer Perceptron with multiple layers 
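The table above (caret-all-binary-class-PimaIndiansDiabetes.tsv) is plain tab-separated text, so the benchmark can be pulled back into R and re-ranked without rerunning any of the models. A minimal sketch, assuming the file sits in the current working directory:

# read the benchmark table; check.names = FALSE keeps "time [s]" and "Model name" as column names
res <- read.delim("caret-all-binary-class-PimaIndiansDiabetes.tsv", check.names = FALSE, stringsAsFactors = FALSE)
# re-rank the models by Kappa and show the ten best
head(res[order(-res$Kappa), c("Name", "Accuracy", "Kappa", "time [s]")], 10)
# relate accuracy to training time (log scale, since times range from under a second to ~1500 s)
plot(res$`time [s]`, res$Accuracy, log = "x", xlab = "training time [s]", ylab = "Accuracy")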
129 | 130 | -------------------------------------------------------------------------------- /caret-classification/iris-classification-all-fast.R: -------------------------------------------------------------------------------- 1 | # A selection of "fast" of all 160 caret models for multi-class classification and iris set 2 | # A number of slow and broken models are excluded, this may change with each release 3 | # The output from fast (working) binary classification models is 4 | # exported to a sortable table in a web browser using the DT library 5 | # Total runtime 145 seconds on 16 core (3.1GHz) (all single ML methods <4 seconds) 6 | # https://github.com/tobigithub/caret-machine-learning 7 | # 8 | # Warning: requires DeLuxe installation of all caret dependencies. 9 | # Warning: invokes DLL hell under Windows "maximal number of DLLs reached..." 10 | # https://github.com/tobigithub/caret-machine-learning/tree/master/caret-setup 11 | # 12 | # R3.3.1 and caret 6.0-70 13 | # Tobias Kind (2016) 14 | 15 | # use mlbench, caret and DT library, please make sure they are already installed 16 | require(mlbench) 17 | require(caret) 18 | require(DT) 19 | 20 | # load iris set 21 | data(iris) 22 | dim(iris) 23 | 24 | # get all model names for classification 25 | m <- unique(modelLookup()[modelLookup()$forClass,c(1)]) 26 | length(m); m; 27 | 28 | # slow classification models ("rbf" crashes; "dwdLinear", "ownn", "snn" have issues) 29 | # all others may have just failed and are not listed here, models may be very accurate 30 | removeModels <- c("AdaBag","AdaBoost.M1","pda2","dwdRadial","rbf","dwdLinear", "dwdPoly", 31 | "gaussprLinear","gaussprPoly","rFerns","sddaLDA", "smda", "sddaQDA", "xgbLinear","xgbTree", 32 | "AdaBag","FH.GBML","ORFsvm","ownn","vbmpRadial","SLAVE","ORFlog","GFS.GCCL","ORFpls", 33 | "snn", "bagEarth","ORFridge","rmda","awnb", "awtan", "manb", "nbDiscrete", "nbSearch", 34 | "ordinalNet", "blackboost","tan","tanSearch","randomGLM","Rborist", 35 | "FRBCS.W", "FRBCS.CHI","evtree","bstTree","bagEarthGCV","bagFDA","rrlda") 36 | 37 | #remove all slow and failed models from model list 38 | m <- m[!m %in% removeModels] 39 | 40 | # not multiclass 41 | # Something is wrong; all the Accuracy metric values are missing: 42 | removeModels <- c("ada","adaboost","bag","bartMachine","binda","C5.0Cost","chaid", 43 | "deepboost","gamboost","glm","glmboost","glmStepAIC") 44 | 45 | #remove multiclass fails from model list 46 | m <- m[!m %in% removeModels] 47 | 48 | # pre-load all packages (does not really work due to other dependencies) 49 | suppressPackageStartupMessages(ll <-lapply(m, require, character.only = TRUE)) 50 | 51 | # show which libraries were loaded 52 | sessionInfo() 53 | 54 | # load X and Y (this will be transferred to to train function) 55 | X = iris[,1:3] 56 | Y = iris$Species 57 | 58 | # register parallel front-end 59 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl) 60 | 61 | # this is required otherwise the first method is benchmarked wrong 62 | warmup <-train(y=Y, x=X, "rf", trControl = trainControl(method = "boot632")) 63 | 64 | # this setup actually calls the caret::train function, in order to provide 65 | # minimal error handling this type of construct is needed. 
66 | trainCall <- function(i) 67 | { 68 | cat("----------------------------------------------------","\n"); 69 | set.seed(123); cat(i," <- loaded\n"); 70 | return(tryCatch( 71 | t2 <- train(y=Y, x=X, (i), trControl = trainControl(method = "boot632")), 72 | error=function(e) NULL)) 73 | } 74 | 75 | # use lapply/loop to run everything, required for try/catch error function to work 76 | t2 <- lapply(m, trainCall) 77 | 78 | #remove NULL values, we only allow succesful methods, provenance is deleted. 79 | t2 <- t2[!sapply(t2, is.null)] 80 | 81 | # this setup extracts the results with minimal error handling 82 | # TrainKappa can be sometimes zero, but Accuracy SD can be still available 83 | # see Kappa value http://epiville.ccnmtl.columbia.edu/popup/how_to_calculate_kappa.html 84 | printCall <- function(i) 85 | { 86 | return(tryCatch( 87 | { 88 | cat(sprintf("%-22s",(m[i]))) 89 | cat(round(getTrainPerf(t2[[i]])$TrainAccuracy,4),"\t") 90 | cat(round(getTrainPerf(t2[[i]])$TrainKappa,4),"\t") 91 | cat(t2[[i]]$times$everything[3],"\n")}, 92 | error=function(e) NULL)) 93 | } 94 | 95 | r2 <- lapply(1:length(t2), printCall) 96 | 97 | # stop cluster and register sequntial front end 98 | stopCluster(cl); registerDoSEQ(); 99 | 100 | # preallocate data types 101 | i = 1; MAX = length(t2); 102 | x1 <- character() # Name 103 | x2 <- numeric() # R2 104 | x3 <- numeric() # RMSE 105 | x4 <- numeric() # time [s] 106 | x5 <- character() # long model name 107 | 108 | # fill data and check indexes and NA with loop/lapply 109 | for (i in 1:length(t2)) { 110 | x1[i] <- t2[[i]]$method 111 | x2[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainAccuracy,4)) 112 | x3[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainKappa,4)) 113 | x4[i] <- as.numeric(t2[[i]]$times$everything[3]) 114 | x5[i] <- t2[[i]]$modelInfo$label 115 | } 116 | 117 | # coerce to data frame 118 | df1 <- data.frame(x1,x2,x3,x4,x5, stringsAsFactors=FALSE) 119 | 120 | # print all results to R-GUI 121 | df1 122 | 123 | # plot models, just as example 124 | # ggplot(t2[[1]]) 125 | # ggplot(t2[[1]]) 126 | 127 | # call web output with correct column names 128 | DT::datatable(df1, options = list( 129 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5))), 130 | pageLength = MAX, 131 | order = list(list(3, 'desc'))), # sort according to kappa value 132 | colnames = c('Num', 'Name', 'Accuracy', 'Kappa', 'time [s]', 'Model name'), 133 | caption = paste('Classification results from caret models',Sys.time()), 134 | class = 'cell-border stripe') %>% 135 | formatRound('x2', 3) %>% 136 | formatRound('x3', 3) %>% 137 | formatRound('x4', 3) %>% 138 | formatStyle(2, 139 | background = styleColorBar(x2, 'steelblue'), 140 | backgroundSize = '100% 90%', 141 | backgroundRepeat = 'no-repeat', 142 | backgroundPosition = 'center' 143 | ) 144 | 145 | ### END 146 | -------------------------------------------------------------------------------- /caret-classification/iris-classification-caret-all.R: -------------------------------------------------------------------------------- 1 | # Use of all 160 caret models for multi-class classification and iris set 2 | # The output from fast (working) binary classification models is 3 | # exported to a sortable table in a web browser using the DT library 4 | # https://github.com/tobigithub/caret-machine-learning 5 | # Tobias Kind (2015) 6 | 7 | # use mlbench, caret and DT library 8 | require(mlbench) 9 | require(caret) 10 | require(DT) 11 | 12 | # load iris set 13 | data(iris) 14 | dim(iris) 15 | 16 | # get all model 
names for classification 17 | m <- unique(modelLookup()[modelLookup()$forClass,c(1)]) 18 | length(m); m; 19 | 20 | # slow classification models ("rbf" crashes; "dwdLinear", "ownn", "snn" have issues) 21 | # all others may have just failed and are not listed here 22 | removeModels <- c("AdaBoost.M1","pda2","dwdRadial","rbf","dwdLinear", "dwdPoly", 23 | "gaussprLinear","gaussprPoly","rFerns","sddaLDA", "smda", "sddaQDA", "xgbLinear") 24 | 25 | #remove all slow and failed models from model list 26 | m <- m[!m %in% removeModels] 27 | 28 | # pre-load all packages (does not really work due to other dependencies) 29 | suppressPackageStartupMessages(ll <-lapply(m, require, character.only = TRUE)) 30 | 31 | # show which libraries were loaded 32 | sessionInfo() 33 | 34 | # load X and Y (this will be transferred to to train function) 35 | X = iris[,1:3] 36 | Y = iris$Species 37 | 38 | # register parallel front-end 39 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl) 40 | 41 | # this setup actually calls the caret::train function, in order to provide 42 | # minimal error handling this type of construct is needed. 43 | trainCall <- function(i) 44 | { 45 | cat("----------------------------------------------------","\n"); 46 | set.seed(123); cat(i," <- loaded\n"); 47 | return(tryCatch( 48 | t2 <- train(y=Y, x=X, (i), trControl = trainControl(method = "boot632")), 49 | error=function(e) NULL)) 50 | } 51 | 52 | # use lapply/loop to run everything, required for try/catch error function to work 53 | t2 <- lapply(m, trainCall) 54 | 55 | #remove NULL values, we only allow succesful methods, provenance is deleted. 56 | t2 <- t2[!sapply(t2, is.null)] 57 | 58 | # this setup extracts the results with minimal error handling 59 | # TrainKappa can be sometimes zero, but Accuracy SD can be still available 60 | # see Kappa value http://epiville.ccnmtl.columbia.edu/popup/how_to_calculate_kappa.html 61 | printCall <- function(i) 62 | { 63 | return(tryCatch( 64 | { 65 | cat(sprintf("%-22s",(m[i]))) 66 | cat(round(getTrainPerf(t2[[i]])$TrainAccuracy,4),"\t") 67 | cat(round(getTrainPerf(t2[[i]])$TrainKappa,4),"\t") 68 | cat(t2[[i]]$times$everything[3],"\n")}, 69 | error=function(e) NULL)) 70 | } 71 | 72 | r2 <- lapply(1:length(t2), printCall) 73 | 74 | # stop cluster and register sequntial front end 75 | stopCluster(cl); registerDoSEQ(); 76 | 77 | # preallocate data types 78 | i = 1; MAX = length(t2); 79 | x1 <- character() # Name 80 | x2 <- numeric() # R2 81 | x3 <- numeric() # RMSE 82 | x4 <- numeric() # time [s] 83 | x5 <- character() # long model name 84 | 85 | # fill data and check indexes and NA with loop/lapply 86 | for (i in 1:length(t2)) { 87 | x1[i] <- t2[[i]]$method 88 | x2[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainAccuracy,4)) 89 | x3[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainKappa,4)) 90 | x4[i] <- as.numeric(t2[[i]]$times$everything[3]) 91 | x5[i] <- t2[[i]]$modelInfo$label 92 | } 93 | 94 | # coerce to data frame 95 | df1 <- data.frame(x1,x2,x3,x4,x5, stringsAsFactors=FALSE) 96 | 97 | # print all results to R-GUI 98 | df1 99 | 100 | # plot models, just as example 101 | # ggplot(t2[[1]]) 102 | # ggplot(t2[[1]]) 103 | 104 | # call web output with correct column names 105 | datatable(df1, options = list( 106 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5))), 107 | pageLength = MAX, 108 | order = list(list(2, 'desc'))), 109 | colnames = c('Num', 'Name', 'Accuracy', 'Kappa', 'time [s]', 'Model name'), 110 | caption = paste('Classification 
results from caret models',Sys.time()), 111 | class = 'cell-border stripe') %>% 112 | formatRound('x2', 3) %>% 113 | formatRound('x3', 3) %>% 114 | formatRound('x4', 3) %>% 115 | formatStyle(2, 116 | background = styleColorBar(x2, 'steelblue'), 117 | backgroundSize = '100% 90%', 118 | backgroundRepeat = 'no-repeat', 119 | backgroundPosition = 'center' 120 | ) 121 | 122 | # print confusion matrix example 123 | caret::confusionMatrix(t2[[1]]) 124 | 125 | 126 | ### END 127 | -------------------------------------------------------------------------------- /caret-cv/HAR-all-CV-methods.R: -------------------------------------------------------------------------------- 1 | 2 | # Parallel Random Forest and knn with multiple CV methods 3 | # Data: http://groupware.les.inf.puc-rio.br/har 4 | # Sources used: 5 | # http://bigcomputing.blogspot.com/2014/10/an-example-of-using-random-forest-in.html 6 | # https://www.coursera.org/specializations/jhudatascience?utm_medium=courseDescripTop 7 | # https://rstudio-pubs-static.s3.amazonaws.com/89748_264cbfde747d4d779d7bd6b9b3f31f45.html 8 | # Google -> "InTrain<-createDataPartition(" predict(rf_model,test) "pml-training.csv" 9 | # Google -> "Using devices such as Jawbone Up, Nike FuelBand, and Fitbit" caret 10 | # Google -> https://www.google.com/?gws_rd=ssl#q=%22B+A+B+A+A+E+D+B+A+A+B+C+B+A+E+E+A+B+B+B%22 11 | # https://yoke2.github.io/PMLCourseProject/pmlreport.html 12 | # https://github.com/tobigithub/caret-machine-learning 13 | # Tobias Kind (2015) 14 | 15 | 16 | library(caret) 17 | require(ggplot2) 18 | require(randomForest) 19 | 20 | library(doSNOW) 21 | library(parallel) 22 | 23 | training_URL<-"http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv" 24 | test_URL<-"http://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv" 25 | training<-read.csv(training_URL,na.strings=c("NA","")) 26 | test<-read.csv(test_URL,na.strings=c("NA","")) 27 | 28 | training<-training[,7:160] 29 | test<-test[,7:160] 30 | 31 | mostly_data<-apply(!is.na(training),2,sum)>19621 32 | training<-training[,mostly_data] 33 | test<-test[,mostly_data] 34 | dim(training) 35 | 36 | 37 | #plot feature wise 38 | #https://rpubs.com/davizuku/practical_ml 39 | selCols <- grep("^accel_", names(training)); 40 | featurePlot(x = training[,selCols], 41 | y = training$classe, 42 | plot = "boxplot"); 43 | 44 | selCols <- grep("^magnet_", names(training)); 45 | featurePlot(x = training[,selCols], 46 | y = training$classe, 47 | plot = "boxplot"); 48 | 49 | selCols <- grep("^gyros_", names(training)); 50 | featurePlot(x = training[,selCols], 51 | y = training$classe, 52 | plot = "boxplot"); 53 | 54 | 55 | #plot training data 56 | featurePlot(x=training[,c(1:12)], y=training$classe, plot = 'box') 57 | 58 | InTrain<-createDataPartition(y=training$classe,p=0.3,list=FALSE) 59 | training1<-training[InTrain,] 60 | 61 | # detect true cores requires parallel() 62 | nCores <- detectCores(logical = FALSE) 63 | # detect threads 64 | nThreads <- detectCores(logical = TRUE) 65 | 66 | cl <- makeCluster(nThreads, type="SOCK") 67 | registerDoSNOW(cl); cl; 68 | getDoParWorkers() 69 | getDoParName() 70 | 71 | #------------------------------------------------------------ 72 | # rf usually works 73 | ptm <- proc.time() 74 | rf_model<-train(classe~.,data=training1,method="rf") 75 | # method="repeatedcv", number=10, repeats=3 ## repeated k-fold Cross Validation 76 | # method="cv",number=5 ## k-fold Cross Validation 77 | # method="LOOCV" ## Leave One Out Cross Validation 78 | # method="boot", number=100 
## Bootstrap 79 | # method = "boot632" ## The .632+ Bootstrap 80 | # trControl=trainControl(method="boot632"), 81 | # prox=TRUE,allowParallel=TRUE) 82 | proc.time() - ptm 83 | 84 | #------------------------------------------------------------ 85 | # knn 86 | ptm <- proc.time() 87 | model1<-train(classe~.,data=training1,method="knn") 88 | proc.time() - ptm 89 | 90 | #------------------------------------------------------------ 91 | # "repeatedcv" ## repeated k-fold Cross Validation 92 | ptm <- proc.time() 93 | model2<-train(classe~.,data=training1,method="knn", 94 | trControl=trainControl(method="repeatedcv", number=3, repeats=3)) ## repeated k-fold Cross Validation 95 | proc.time() - ptm 96 | #------------------------------------------------------------ 97 | # "cv" ## k-fold Cross Validation 98 | ptm <- proc.time() 99 | model3<-train(classe~.,data=training1,method="knn", 100 | trControl=trainControl(method="cv",number=3)) ## k-fold Cross Validation 101 | proc.time() - ptm 102 | #------------------------------------------------------------ 103 | # "LOOCV" ## Leave One Out Cross Validation 104 | ptm <- proc.time() 105 | model4<-train(classe~.,data=training1,method="knn", 106 | trControl=trainControl(method="LOOCV", repeats=1)) ## Leave One Out Cross Validation 107 | proc.time() - ptm 108 | #------------------------------------------------------------ 109 | # "boot" ## Bootstrap 110 | ptm <- proc.time() 111 | model5<-train(classe~.,data=training1,method="knn", 112 | trControl=trainControl(method="boot", number=10)) ## Bootstrap 113 | proc.time() - ptm 114 | #------------------------------------------------------------ 115 | # "boot632" ## The .632+ Bootstrap 116 | ptm <- proc.time() 117 | model6<-train(classe~.,data=training1,method="knn", 118 | trControl=trainControl(method="boot632")) ## The .632+ Bootstrap 119 | proc.time() - ptm 120 | #------------------------------------------------------------ 121 | ## Times for splits and trains 122 | ## user system elapsed 123 | ## rf............ 17.78 0.74 126.16 124 | ## knn........... 0.87 0.80 19.76 125 | ## knn-repeatedcv 0.75 0.56 3.12 126 | ## knn-cv........ 0.69 0.45 1.81 127 | ## knn-LOOCV..... 77.60 34.54 120.77 128 | ## knn-boot...... 
0.69 0.67 6.35 129 | ## knn-boot632 0.99 0.80 23.12 130 | #------------------------------------------------------------ 131 | rf_model 132 | model1 133 | model2 134 | model3 135 | model4 136 | model5 137 | model6 138 | #------------------------------------------------------------ 139 | 140 | print(rf_model) 141 | print(rf_model$finalModel) 142 | plot(rf_model$finalModel) 143 | rf_model$results 144 | 145 | # number of variables per level (mtry) 146 | confusionMatrix(rf_model) 147 | plot(rf_model) 148 | 149 | #QPLOT 150 | qplot(roll_belt, magnet_dumbbell_y, colour=classe, data=training) 151 | 152 | rf_test <- predict(rf_model,test) 153 | rf_test 154 | # Correct solution 155 | #B A B A A E D B A A B C B A E E A B B B 156 | 157 | stopCluster(cl) 158 | 159 | #--- register foreach for sequential mode 160 | registerDoSEQ() 161 | 162 | ### END 163 | -------------------------------------------------------------------------------- /caret-cv/caret-all-cv-methods-lapply-sapply.R: -------------------------------------------------------------------------------- 1 | # Run simple cross-validation method with caret and knn 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # All caret cross-validation methods applied using lapply (list result) 6 | # regression example using knn (very fast); "none" is not allowed for lapply 7 | 8 | require(caret); data(BloodBrain); 9 | cvMethods <- c("boot632","LGOCV","LOOCV","cv","repeatedcv", "boot"); 10 | all <- lapply(cvMethods ,function(x) {set.seed(123); print(x); tc <- trainControl(method=(x)) 11 | fit1 <- train(bbbDescr, logBBB, trControl=tc, method="knn") }) 12 | all 13 | 14 | # just to show the structure of output 15 | # sapply(all,getTrainPerf) 16 | # lapply(all,getTrainPerf) 17 | 18 | # extract the used cvMethods (redundant because already incvMethods) 19 | myNames <- lapply(1:6, function(x) all[[x]]$control$method) 20 | # save results 21 | results <- sapply(all,getTrainPerf) 22 | # change column Names to cv methods 23 | colnames(results) <- myNames; 24 | # get the results 25 | results 26 | 27 | # boot632 LGOCV LOOCV cv repeatedcv boot 28 | # TrainRMSE 0.619778 0.6275048 0.6309407 0.6192086 0.6192086 0.66943 29 | # TrainRsquared 0.4009745 0.3554037 0.3429081 0.3831812 0.3831812 0.3140373 30 | # method "knn" "knn" "knn" "knn" "knn" "knn" 31 | 32 | #--------------------------------------------------------------------------- 33 | 34 | # All cross-validation methods applied using sapply (matrix result) 35 | # regression example using knn (very fast); "none" is not allowed for lapply 36 | 37 | require(caret); data(BloodBrain); 38 | cvMethods <- c("boot632","LGOCV","LOOCV","cv","repeatedcv", "boot" ); 39 | all <- sapply(cvMethods ,function(x) {set.seed(123); print(x); tc <- trainControl(method=(x)) 40 | fit1 <- train(bbbDescr, logBBB, trControl=tc, method="knn") }); all 41 | all[4, ] 42 | 43 | # boot632 LGOCV LOOCV cv repeatedcv boot 44 | # method "knn" "knn" "knn" "knn" "knn" "knn" 45 | # modelInfo List,13 List,13 List,13 List,13 List,13 List,13 46 | # modelType "Regression" "Regression" "Regression" "Regression" "Regression" "Regression" 47 | # results List,7 List,5 List,3 List,5 List,5 List,5 48 | # pred NULL NULL List,4 NULL NULL NULL 49 | # bestTune List,1 List,1 List,1 List,1 List,1 List,1 50 | # call Expression Expression Expression Expression Expression Expression 51 | # dots List,0 List,0 List,0 List,0 List,0 List,0 52 | # metric "RMSE" "RMSE" "RMSE" "RMSE" "RMSE" "RMSE" 53 | # control List,26 List,26 List,26 List,26 
List,26 List,26
54 | # finalModel List,7 List,7 List,7 List,7 List,7 List,7
55 | # preProcess NULL NULL NULL NULL NULL NULL
56 | # trainingData List,135 List,135 List,135 List,135 List,135 List,135
57 | # resample List,3 List,3 NULL List,3 List,3 List,3
58 | # resampledCM NULL NULL NULL NULL NULL NULL
59 | # perfNames Character,2 Character,2 Character,2 Character,2 Character,2 Character,2
60 | # maximize FALSE FALSE FALSE FALSE FALSE FALSE
61 | # yLimits Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2
62 | # times List,3 List,3 List,3 List,3 List,3 List,3
63 |
64 |
65 | ### END
66 |
-------------------------------------------------------------------------------- /caret-cv/caret-all-cv-parallel-cubist.R: --------------------------------------------------------------------------------
1 | # Run all cross-validation methods with cubist
2 | # Read: http://rulequest.com/cubist-examples.html
3 | # Read: https://cran.r-project.org/web/packages/Cubist/vignettes/cubist.pdf
4 | # Read: http://www.r-bloggers.com/ensemble-learning-with-cubist-model/
5 | #
6 | # https://github.com/tobigithub/caret-machine-learning
7 | # Tobias Kind (2015)
8 |
9 |
10 | # load libs
11 | require(caret); data(BloodBrain);
12 |
13 | # register parallel client
14 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl)
15 |
16 | # define all cross-validation methods
17 | cvMethods <- c("boot632","LGOCV","LOOCV","cv","repeatedcv", "boot");
18 |
19 | # use R lapply function to loop through all CV methods with cubist
20 | all <- lapply(cvMethods ,function(x) {set.seed(123); print(x); tc <- trainControl(method=(x))
21 | fit1 <- train(bbbDescr, logBBB, trControl=tc, method="cubist") }); all;
22 |
23 | # extract the used cvMethods (redundant because already in cvMethods)
24 | myNames <- lapply(1:6, function(x) all[[x]]$control$method)
25 |
26 | # save results
27 | results <- sapply(all,getTrainPerf)
28 |
29 | # change column names to CV methods
30 | colnames(results) <- myNames;
31 |
32 | # get the results
33 | results
34 |
35 | # stop cluster
36 | stopCluster(cl); registerDoSEQ();
37 |
38 | # boot632 LGOCV LOOCV cv repeatedcv boot
39 | # TrainRMSE 0.3794002 0.4959378 0.4997026 0.4933169 0.4930747 0.5617455
40 | # TrainRsquared 0.6743715 0.6067721 0.5875271 0.603017 0.6032699 0.4883528
41 | # method "cubist" "cubist" "cubist" "cubist" "cubist" "cubist"
42 |
43 |
44 | ### END
45 |
-------------------------------------------------------------------------------- /caret-cv/caret-all-cv-parallel-qrf.R: --------------------------------------------------------------------------------
1 | # Run all cross-validation methods with qrf (Quantile Random Forest)
2 | # https://github.com/tobigithub/caret-machine-learning
3 | # Tobias Kind (2015)
4 |
5 |
6 | # load libs
7 | require(caret); data(BloodBrain);
8 |
9 | # register parallel client
10 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl)
11 |
12 | # define all cross-validation methods
13 | cvMethods <- c("boot632","LGOCV","LOOCV","cv","repeatedcv", "boot");
14 |
15 | # use R lapply function to loop through all CV methods with qrf
16 | all <- lapply(cvMethods ,function(x) {set.seed(123); print(x); tc <- trainControl(method=(x))
17 | fit1 <- train(bbbDescr, logBBB, trControl=tc, method="qrf") }); all;
18 |
19 | # extract the used cvMethods (redundant because already in cvMethods)
20 | myNames <- lapply(1:6, function(x) all[[x]]$control$method)
21 |
22 | # save results
23 | results <- sapply(all,getTrainPerf)
24 |
25 | #
change column Names to cv methods 26 | colnames(results) <- myNames; 27 | 28 | # get the results 29 | results 30 | 31 | # stop cluster 32 | stopCluster(cl); registerDoSEQ(); 33 | 34 | # boot632 LGOCV LOOCV cv repeatedcv boot 35 | # TrainRMSE 0.4199394 0.5450903 0.5264716 0.5210002 0.5127061 0.5539934 36 | # TrainRsquared 0.6829296 0.5193978 0.5474211 0.561647 0.5776622 0.5350395 37 | # method "qrf" "qrf" "qrf" "qrf" "qrf" "qrf" 38 | 39 | ### END 40 | -------------------------------------------------------------------------------- /caret-cv/caret-cv-simple.R: -------------------------------------------------------------------------------- 1 | # Run simple cross-validation method with caret and knn 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # Single example, no cross-validation 6 | require(caret); data(BloodBrain); set.seed(123); 7 | fit1 <- train(bbbDescr, logBBB, "knn"); fit1 8 | 9 | # cross-validation example with method boot 10 | require(caret); data(BloodBrain); set.seed(123); 11 | tc <- trainControl(method="boot") 12 | fit1 <- train(bbbDescr, logBBB, trControl=tc, method="knn"); fit1 13 | 14 | 15 | ### END 16 | -------------------------------------------------------------------------------- /caret-datasets/caret-MS-datasets.csv: -------------------------------------------------------------------------------- 1 | Num,Data set,library,Reg/Class,Y-target,X-input,Dimension,Design 2 | 1,data(iris),R,Class,Species,remaining data,150 x 5,multi class 3 | 2,data(trees),R,NA,NA,remaining data,31x 3,multi class 4 | 3,data(Glass),R,Class,Type,remaining data,214 x 10,multi class 5 | 4,data(cox2),caret,Class,cox2Class ,"cox2Descr,cox2IC50",462x 255,Two class 6 | 5,data(oil),caret,Class,oilType,fattyAcids,96 x 7,multi class 7 | 6,data(dhfr),caret,Reg,Y,remaining data,325 x 229,multi param 8 | 7,data(GermanCredit),caret,Reg/Class,Class,remaining data,1000 x 62,multi param 9 | 8,data(BostonHousing),mlbench,Reg,medv,remaining data,506x14,multi param 10 | 9,data(BloodBrain),caret,Reg,logBBB ,bbbDescr ,208 x 134,multi param 11 | 10,data(mdrr),caret,Class,mdrrClass,mdrrDescr,528 x 342,Two class 12 | 11,data(Satellite) ,mlbench,Class,classes,remaining data,6435 x 37,multi class 13 | 12,data(cars),caret,Reg/Class,any,any,804x15,mixed 14 | 13,data(dhfr) ,caret,Class,Y,remaining data,325x229,Two class 15 | 14,data(pottery) ,caret,Class,potteryClass ,pottery ,NA,Two class 16 | 15,data(segmentationData) ,caret,Class,Class,remaining data,2019 x 61,Two class 17 | 16,data(tecator) ,caret,Reg,endpoints ,absorp ,215x 100,multi param 18 | 17,data(abalone) ,APM,Class,Type,remaining data,4177 x 9,multi class 19 | 18,data(AlzheimerDisease) ,APM,Class,diagnosis ,predictors ,333 x 130,Two class 20 | 19,data(ChemicalManufacturingProcess) ,APM,Reg,Yield,remaining data,176 x 58,multi param 21 | 20,data(concrete) ,APM,Reg,CompressiveStrength ,remaining data,1030 x 9,multi param 22 | 21,data(FuelEconomy) ,APM,Reg/Class,,,3 sets,multi param 23 | 22,data(hepatic) ,APM,Class,injury ,remaining data,2 sets,multi param 24 | 23,data(solubility) ,APM,Reg,solTestY ,,1267 x 228,multi param 25 | 24,data(permeability) ,APM,Reg,permeability ,fingerprints ,165 x 1107,multi param 26 | 25,data(schedulingData) ,APM,Class,Class,remaining data,4331x8,multi class 27 | 26,data(segmentationOriginal) ,APM,Class,Class,remaining data,2019 x 119, 28 | 27,data(twoClassData) ,APM,Class,classes ,predictors ,208 x 2,Two class 29 | 28,data(BreastCancer) ,mlbench,Class,Class ,remaining data,699 x 11,Two class 
30 | 29,data(PimaIndiansDiabetes) ,mlbench,Class,diabetes ,remaining data,768 x 9,Two class 31 | 30,data(Sonar) ,mlbench,Class,Class ,remaining data,208x61,Two class 32 | 31,Human Activity Recognition (HAR) ,puc-rio.br,Class,“class”,remaining data,165634 x 21 ,multi class 33 | -------------------------------------------------------------------------------- /caret-datasets/view-caret-ML-datasets.R: -------------------------------------------------------------------------------- 1 | # View and load ML datasets for working with caret 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # show all available data sets installed 6 | library(caret); library(datasets); library(AppliedPredictiveModeling); library(mlbench); data(); 7 | 8 | # load the dataset 9 | data(iris) 10 | # get the dimension of the dataset 11 | dim(iris) 12 | ## [1] 150 5 13 | length(iris) 14 | ## [1] 5 15 | # get the class name (here data frame) to choose correct operators 16 | class(iris) 17 | ## [1] "data.frame" 18 | 19 | # invoke simple data viewer 20 | View(iris) 21 | # invoke the useless editor 22 | edit(iris) 23 | # get the data structure 24 | str(iris) 25 | > str(iris) 26 | ##'data.frame': 150 obs. of 5 variables: 27 | ## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 28 | ## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 29 | ## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 30 | ## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 31 | ## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 32 | 33 | -------------------------------------------------------------------------------- /caret-parallel/caret-parallel-train-cubist.R: -------------------------------------------------------------------------------- 1 | # Run caret models "cubist" in parallel 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | library(doParallel); cl <- makeCluster(16); registerDoParallel(cl) 6 | require(caret); data(BloodBrain); 7 | fit1 <- train(bbbDescr, logBBB, "cubist"); 8 | fit1; fit1$times$everything 9 | stopCluster(cl); registerDoSEQ(); 10 | 11 | # /user time/ is the actual caret training time 12 | # /system time/ is operating system overhead 13 | # /elapse time/ is total run-time 14 | 15 | # for parallel 2x total speed-up 16 | # but parallel 45x training speed-up (!) 
17 | # parallel and caret overhead are 42 sec
18 | # Hence overhead for short methods is quite large
19 | # Overhead for longer train methods will be small
20 |
21 | # Cubist with one CPU [s]
22 | # user system elapsed
23 | # 91.20 0.04 91.27
24 |
25 | # Cubist with 16 CPUs [s]
26 | # user system elapsed
27 | # 2.00 0.03 44.68
28 |
-------------------------------------------------------------------------------- /caret-parallel/caret-parallel-train-rf-deLuxe.R: --------------------------------------------------------------------------------
1 | # Run random forest in parallel with CPU core and thread info
2 | # https://github.com/tobigithub/caret-machine-learning
3 | # Tobias Kind (2015)
4 |
5 | require(caret)
6 | data(BloodBrain)
7 | set.seed(123)
8 |
9 | # Library parallel is a native R library, no CRAN install required
10 | library(parallel)
11 | nCores <- detectCores(logical = FALSE)
12 | nThreads <- detectCores(logical = TRUE)
13 | cat("CPU with",nCores,"cores and",nThreads,"threads detected.\n")
14 |
15 | # load the doParallel library for caret cluster use
16 | library(doParallel)
17 | cl <- makeCluster(nThreads)
18 | registerDoParallel(cl)
19 |
20 | # random forest regression
21 | fit1 <- train(bbbDescr, logBBB, "rf")
22 | fit1;
23 |
24 |
25 | stopCluster(cl)
26 | registerDoSEQ()
27 | ### END
28 |
29 | # 208 samples
30 | # 134 predictors
31 | #
32 | # No pre-processing
33 | # Resampling: Bootstrapped (25 reps)
34 | # Summary of sample sizes: 208, 208, 208, 208, 208, 208, ...
35 | # Resampling results across tuning parameters:
36 | #
37 | # mtry RMSE Rsquared
38 | # 2 0.5443770 0.5725600
39 | # 68 0.5408819 0.5568365
40 | # 134 0.5490382 0.5413179
41 | #
42 | # RMSE was used to select the optimal model using the smallest value.
43 | # The final value used for the model was mtry = 68.
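# --- added sketch, not part of the original script ----------------------------
# The speed-up numbers quoted for cubist above can be reproduced for any model
# by timing the same train() call once with the sequential backend and once
# with a parallel backend; fit$times$everything holds caret's own timings.
library(doParallel); require(caret); data(BloodBrain)

registerDoSEQ()                                            # sequential backend
set.seed(123)
t_seq <- system.time(fit_seq <- train(bbbDescr, logBBB, "rf"))

cl <- makeCluster(detectCores()); registerDoParallel(cl)   # parallel backend
set.seed(123)
t_par <- system.time(fit_par <- train(bbbDescr, logBBB, "rf"))
stopCluster(cl); registerDoSEQ()

# compare wall-clock time and caret's internal bookkeeping
rbind(sequential = t_seq["elapsed"], parallel = t_par["elapsed"])
fit_par$times$everything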
44 | -------------------------------------------------------------------------------- /caret-parallel/caret-parallel-train.R: -------------------------------------------------------------------------------- 1 | # Run multiple caret models in parallel using lapply 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # ------------------------------------------------------------------------- 6 | # FIRST sequential code (not parallel one CPU core): 7 | # ------------------------------------------------------------------------- 8 | 9 | require(caret); data(BloodBrain); set.seed(123) 10 | fit1 <- train(bbbDescr, logBBB, "knn"); fit1 11 | 12 | # ------------------------------------------------------------------------- 13 | # SECOND parallel register 4 cores (no worries if you only have 2) 14 | # train the caret model in parallel 15 | # ------------------------------------------------------------------------- 16 | 17 | library(doParallel); cl <- makeCluster(4); registerDoParallel(cl) 18 | require(caret); data(BloodBrain); set.seed(123) 19 | fit1 <- train(bbbDescr, logBBB, "knn"); fit1 20 | stopCluster(cl); registerDoSEQ(); 21 | 22 | ### END 23 | -------------------------------------------------------------------------------- /caret-parallel/learning-curve-plots-caret-parallel.R: -------------------------------------------------------------------------------- 1 | # Learning curve plots for R caret classifications and regressions in parallel 2 | # (ROC vs training size, RMSE vs training size) 3 | # Source: Max Kuhn (topepo); https://github.com/topepo/caret/issues/278 4 | # https://github.com/tobigithub/caret-machine-learning 5 | # Tobias Kind (2015) 6 | 7 | #---------------------------------------------------------------------- 8 | # Library parallel() is a native R library, no CRAN required 9 | library(parallel) 10 | nCores <- detectCores(logical = FALSE) 11 | nThreads <- detectCores(logical = TRUE) 12 | cat("CPU with",nCores,"cores and",nThreads,"threads detected.\n") 13 | 14 | # load the doParallel/doSNOW library for caret cluster use 15 | library(doParallel) 16 | cl <- makeCluster(nThreads) 17 | registerDoParallel(cl) 18 | 19 | #---------------------------------------------------------------------- 20 | ## function: learning_curve_dat plots training-size vs RMSE or ROC 21 | ## dat: entire data set used for modling 22 | ## y: character stirng for the outcome column name 23 | ## proportion: proportion of data used to train the model 24 | ## test_prop: proportion of data used initially set aside for testing 25 | ## verbose: write out a log of training milestones 26 | ## ...: arguments to pass to `train` 27 | #---------------------------------------------------------------------- 28 | learning_curve_dat <- function(dat, 29 | outcome = colnames(dat)[1], 30 | proportion = (1:10)/10, test_prop = 0, 31 | verbose = TRUE, ...) 
{ 32 | 33 | proportion <- sort(unique(proportion)) 34 | n_size <- length(proportion) 35 | 36 | if(test_prop > 0) { 37 | for_model <- createDataPartition(dat[, outcome], p = 1 - test_prop, list = FALSE) 38 | } else for_model <- 1:nrow(dat) 39 | 40 | n <- length(for_model) 41 | 42 | resampled <- vector(mode = "list", length = n_size) 43 | tested <- if(test_prop > 0) resampled else NULL 44 | apparent <- resampled 45 | for(i in seq(along = proportion)) { 46 | if(verbose) cat("Training for ", round(proportion[i]*100, 1), 47 | "% (n = ", floor(n*proportion[i]), ")\n", sep = "") 48 | in_mod <- if(proportion[i] < 1) sample(for_model, size = floor(n*proportion[i])) else for_model 49 | mod <- train(x = dat[in_mod, colnames(dat) != outcome, drop = FALSE], 50 | y = dat[in_mod, outcome], 51 | ...) 52 | if(i == 1) perf_names <- mod$perfNames 53 | resampled[[i]] <- merge(mod$resample, mod$bestTune) 54 | resampled[[i]]$Training_Size <- length(in_mod) 55 | 56 | if(test_prop > 0) { 57 | if(!mod$control$classProbs) { 58 | test_preds <- extractPrediction(list(model = mod), 59 | testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE], 60 | testY = dat[-for_model, outcome]) 61 | } else { 62 | test_preds <- extractProb(list(model = mod), 63 | testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE], 64 | testY = dat[-for_model, outcome]) 65 | } 66 | test_perf <- mod$control$summaryFunction(test_preds, lev = mod$finalModel$obsLevels) 67 | test_perf <- as.data.frame(t(test_perf)) 68 | test_perf$Training_Size <- length(in_mod) 69 | tested[[i]] <- test_perf 70 | try(rm(test_preds, test_perf), silent = TRUE) 71 | } 72 | 73 | if(!mod$control$classProbs) { 74 | app_preds <- extractPrediction(list(model = mod), 75 | testX = dat[in_mod, colnames(dat) != outcome, drop = FALSE], 76 | testY = dat[in_mod, outcome]) 77 | } else { 78 | app_preds <- extractProb(list(model = mod), 79 | testX = dat[in_mod, colnames(dat) != outcome, drop = FALSE], 80 | testY = dat[in_mod, outcome]) 81 | } 82 | app_perf <- mod$control$summaryFunction(app_preds, lev = mod$finalModel$obsLevels) 83 | app_perf <- as.data.frame(t(app_perf)) 84 | app_perf$Training_Size <- length(in_mod) 85 | apparent[[i]] <- app_perf 86 | 87 | try(rm(mod, in_mod, app_preds, app_perf), silent = TRUE) 88 | } 89 | 90 | resampled <- do.call("rbind", resampled) 91 | resampled <- resampled[, c(perf_names, "Training_Size")] 92 | resampled$Data <- "Resampling" 93 | apparent <- do.call("rbind", apparent) 94 | apparent <- apparent[, c(perf_names, "Training_Size")] 95 | apparent$Data <- "Training" 96 | out <- rbind(resampled, apparent) 97 | if(test_prop > 0) { 98 | tested <- do.call("rbind", tested) 99 | tested <- tested[, c(perf_names, "Training_Size")] 100 | tested$Data <- "Testing" 101 | out <- rbind(out, tested) 102 | } 103 | out 104 | } 105 | 106 | #---------------------------------------------------------------------- 107 | # multiplot for plotting multiple ggplots 108 | # Example: multiplot(p1,p2,p3,p4,p5,p6,cols=3) 109 | # Source: http://www.peterhaschke.com/r/2013/04/24/MultiPlot.html 110 | #---------------------------------------------------------------------- 111 | 112 | multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) { 113 | require(grid) 114 | 115 | plots <- c(list(...), plotlist) 116 | 117 | numPlots = length(plots) 118 | 119 | if (is.null(layout)) { 120 | layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), 121 | ncol = cols, nrow = ceiling(numPlots/cols)) 122 | } 123 | 124 | if (numPlots == 1) { 125 | print(plots[[1]]) 
)
126 | 127 | } else { 128 | grid.newpage() 129 | pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) 130 | 131 | for (i in 1:numPlots) { 132 | matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) 133 | 134 | print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, 135 | layout.pos.col = matchidx$col)) 136 | } 137 | } 138 | } 139 | 140 | 141 | #---------------------------------------------------------------------- 142 | ## Classification example 143 | #---------------------------------------------------------------------- 144 | library(caret) 145 | library(xgboost) 146 | 147 | # set plot to 2x3 148 | par(mfrow=c(2,3)) 149 | 150 | set.seed(1412) 151 | class_dat <- twoClassSim(2000) 152 | 153 | set.seed(29510) 154 | lda_data <- learning_curve_dat(dat = class_dat, outcome = "Class", 155 | test_prop = 1/4, 156 | ## `train` arguments 157 | method = "lda", 158 | metric = "ROC", 159 | trControl = trainControl(classProbs = TRUE, 160 | method = "boot632", 161 | summaryFunction = twoClassSummary)) 162 | 163 | p1 <- ggplot(lda_data, aes(x = Training_Size, y = ROC, color = Data)) + 164 | geom_smooth(method = loess, span = .8) + 165 | ggtitle("LDA classification with boot632 CV") + 166 | theme_bw() 167 | p1 168 | #---------------------------------------------------------------------- 169 | set.seed(29510) 170 | rf_data <- learning_curve_dat(dat = class_dat, outcome = "Class", 171 | test_prop = 1/4, 172 | ## `train` arguments 173 | method = "rf", 174 | metric = "ROC", 175 | tuneLength = 4, 176 | trControl = trainControl(classProbs = TRUE, 177 | method = "boot632", 178 | summaryFunction = twoClassSummary)) 179 | 180 | p2 <- ggplot(rf_data, aes(x = Training_Size, y = ROC, color = Data)) + 181 | geom_smooth(method = loess, span = .8) + 182 | ggtitle("rf classification with boot632 CV") + 183 | theme_bw() 184 | p2 185 | #---------------------------------------------------------------------- 186 | set.seed(29510) 187 | rf_data <- learning_curve_dat(dat = class_dat, outcome = "Class", 188 | test_prop = 1/4, 189 | ## `train` arguments 190 | method = "parRF", 191 | metric = "ROC", 192 | tuneLength = 4, 193 | trControl = trainControl(classProbs = TRUE, 194 | method = "boot632", 195 | summaryFunction = twoClassSummary)) 196 | 197 | p3 <- ggplot(rf_data, aes(x = Training_Size, y = ROC, color = Data)) + 198 | geom_smooth(method = loess, span = .8) + 199 | ggtitle("parRF classification with boot632 CV") + 200 | theme_bw() 201 | p3 202 | #---------------------------------------------------------------------- 203 | ## Regression example 204 | #---------------------------------------------------------------------- 205 | 206 | set.seed(19135) 207 | reg_dat <- SLC14_1(2000) 208 | 209 | set.seed(31535) 210 | bag_data <- learning_curve_dat(dat = reg_dat, outcome = "y", 211 | test_prop = 1/4, 212 | ## `train` arguments 213 | method = "treebag", 214 | trControl = trainControl(method = "boot632"), 215 | ## `bagging` arguments 216 | nbagg = 100) 217 | 218 | p4 <- ggplot(bag_data, aes(x = Training_Size, y = RMSE, color = Data)) + 219 | geom_smooth(method = loess, span = .8) + 220 | ggtitle("treebag regression with boot632 CV") + 221 | theme_bw() 222 | p4 223 | 224 | 225 | #---------------------------------------------------------------------- 226 | set.seed(31535) 227 | svm_data <- learning_curve_dat(dat = reg_dat, outcome = "y", 228 | test_prop = 0, 229 | ## `train` arguments 230 | method = "svmRadial", 231 | preProc = c("center", "scale"), 232 | tuneGrid = data.frame(sigma = 0.03, C = 2^10), 
233 | trControl = trainControl(method = "boot632")) 234 | 235 | p5 <- ggplot(svm_data, aes(x = Training_Size, y = RMSE, color = Data)) + 236 | geom_smooth(method = loess, span = .8) + 237 | ggtitle("svmRadial regression with boot632 CV") + 238 | theme_bw() 239 | p5 240 | 241 | #---------------------------------------------------------------------- 242 | set.seed(31535) 243 | svm_no_test <- learning_curve_dat(dat = reg_dat, outcome = "y", 244 | test_prop = 1/4, 245 | ## `train` arguments 246 | method = "svmRadial", 247 | preProc = c("center", "scale"), 248 | tuneGrid = data.frame(sigma = 0.03, C = 2^10), 249 | trControl = trainControl(method = "boot632")) 250 | 251 | p6 <- ggplot(svm_no_test, aes(x = Training_Size, y = RMSE, color = Data)) + 252 | geom_smooth(method = loess, span = .8) + 253 | ggtitle("svmRadial regression with boot632 CV") + 254 | theme_bw() 255 | p6 256 | 257 | 258 | multiplot(p1,p2,p3,p4,p5,p6,cols=3) 259 | 260 | stopCluster(cl) 261 | registerDoSEQ() 262 | ### END 263 | -------------------------------------------------------------------------------- /caret-parallel/run-multiple-caret-models-parallel-lapply.R: -------------------------------------------------------------------------------- 1 | # Run multiple caret models in parallel using lapply 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | 6 | require(caret); data(BloodBrain); m <- c("qrf","xgbTree","rknn","knn","rf"); 7 | library(doParallel); cl <- makeCluster(8); registerDoParallel(cl) 8 | t2 <- lapply(m,function(x) {set.seed(123); seeds <- vector(mode = "list", length = nrow(bbbDescr) + 1); seeds <- lapply(seeds, function(x) 1:20); t1 <- train(bbbDescr, logBBB, (x),trControl = trainControl(method = "cv",seeds=seeds))}) 9 | r2 <- lapply(1:length(t2), function(x) {cat(sprintf("%-10s",(m[x])));cat(t2[[x]]$results$Rsquared[which.min(t2[[x]]$results$RMSE)],"\t"); cat(t2[[x]]$results$RMSE[which.min(t2[[x]]$results$RMSE)],"\n")}) 10 | stopCluster(cl); registerDoSEQ(); 11 | 12 | #model R^2 RMSE 13 | #qrf 0.5861108 0.5120318 14 | #xgbTree 0.6129255 0.4858211 15 | #rknn 0.4351047 0.5941893 16 | #knn 0.3736528 0.6185242 17 | #rf 0.6037442 0.493395 18 | -------------------------------------------------------------------------------- /caret-parallel/run-multiple-caret-models-parallel-sapply.R: -------------------------------------------------------------------------------- 1 | # Run multiple caret models in parallel using sapply 2 | # See: http://stackoverflow.com/questions/3505701/r-grouping-functions-sapply-vs-lapply-vs-apply-vs-tapply-vs-by-vs-aggrega 3 | # https://github.com/tobigithub/caret-machine-learning 4 | # Tobias Kind (2015) 5 | 6 | require(caret); data(BloodBrain); m <- c("qrf","xgbTree","knn") 7 | library(doParallel); cl <- makeCluster(12); registerDoParallel(cl) 8 | sapply(m,function(x) {t1 <- train(bbbDescr, logBBB, (x))} ,USE.NAMES = TRUE) 9 | class(t2); t2; t2[4,]; stopCluster(cl); registerDoSEQ(); 10 | 11 | # qrf xgbTree knn 12 | #method "qrf" "xgbTree" "knn" 13 | #modelInfo List,11 List,14 List,13 14 | #modelType "Regression" "Regression" "Regression" 15 | #results List,5 List,7 List,5 16 | #pred NULL NULL NULL 17 | #bestTune List,1 List,3 List,1 18 | #call Expression Expression Expression 19 | #dots List,0 List,0 List,0 20 | #metric "RMSE" "RMSE" "RMSE" 21 | #control List,26 List,26 List,26 22 | #finalModel List,23 List,6 List,7 23 | #preProcess NULL NULL NULL 24 | #trainingData List,135 List,135 List,135 25 | #resample List,3 List,3 List,3 26 | #resampledCM NULL 
NULL NULL 27 | #perfNames Character,2 Character,2 Character,2 28 | #maximize FALSE FALSE FALSE 29 | #yLimits Numeric,2 Numeric,2 Numeric,2 30 | #times List,3 List,3 List,3 31 | 32 | -------------------------------------------------------------------------------- /caret-regression/caret-all-regression-models.R: -------------------------------------------------------------------------------- 1 | # Run all caret regression models in parallel and compare R^2 and RMSE 2 | # Example data is the very small "cars" dataset. Replace with your own set. 3 | # The regression output from 85 fast (working) regression models is 4 | # exported to a sortable table in a web browser using the DT library 5 | # https://github.com/tobigithub/caret-machine-learning 6 | # Tobias Kind (2016) 7 | # Works for caret_6.0-70 and R version 3.3.1 8 | 9 | # load caret and DT the cars data set 10 | require(caret); require(DT); data(cars); 11 | 12 | # get all model names just as example 13 | m <- unique(modelLookup()[modelLookup()$forReg,c(1)]) 14 | 15 | # fill variable m with the fast working models 16 | m <- c("avNNet", "bagEarth", "bagEarthGCV", 17 | "bayesglm", "bdk", "blackboost", "Boruta", "brnn", "BstLm" , 18 | "bstTree", "cforest", "ctree", "ctree2", "cubist", "DENFIS", 19 | "dnn", "earth", "elm", "enet", "evtree", 20 | "extraTrees", "gamLoess", "gaussprLinear", "gaussprPoly", "gaussprRadial", 21 | "gcvEarth","glm", "glmboost", "glmnet", "icr", "kernelpls", 22 | "kknn", "knn", "krlsRadial", "lars" , "lasso", 23 | "leapBackward", "leapForward", "leapSeq", "lm", "M5", "M5Rules", 24 | "mlpWeightDecay", "neuralnet" , "partDSA", 25 | "pcaNNet", "pcr", "penalized", "pls", "plsRglm", "ppr", 26 | "qrf" , "ranger", "rf", "rfRules", "rbfDDA", 27 | "ridge", "rlm", "rpart", "rpart2", "rqlasso", 28 | "rqnc", "RRF", "RRFglobal", "rvmPoly", "rvmRadial", 29 | "SBC", "simpls", "spls", "superpc" , 30 | "svmLinear", "svmLinear2", "svmPoly", "svmRadial", "svmRadialCost", 31 | "treebag", "widekernelpls", "WM", "xgbLinear", 32 | "xgbTree", "xyf") 33 | 34 | 35 | # load all packages (does not really work due to other dependencies) 36 | suppressPackageStartupMessages(ll <-lapply(m, require, character.only = TRUE)) 37 | 38 | # define x and y for regression 39 | y <- mtcars$mpg; x <- mtcars[, -mtcars$mpg]; 40 | 41 | # load all libraries 42 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl) 43 | 44 | # use lapply/loop to run everything 45 | t2 <- lapply(m,function(i) 46 | {cat("----------------------------------------------------","\n"); 47 | set.seed(123); cat(i," <- loaded\n"); 48 | t2 <- train(y=y, x=x, (i), trControl = trainControl(method = "boot632")) 49 | } 50 | ) 51 | 52 | # use lapply to print the results 53 | r2 <- lapply(1:length(t2), function(i) 54 | {cat(sprintf("%-20s",(m[i]))); 55 | cat(round(t2[[i]]$results$Rsquared[which.min(t2[[i]]$results$RMSE)],4),"\t"); 56 | cat(round(t2[[i]]$results$RMSE[which.min(t2[[i]]$results$RMSE)],4),"\t") 57 | cat(t2[[i]]$times$everything[3],"\n") 58 | } 59 | ) 60 | 61 | # stop the parallel processing and register sequential front-end 62 | stopCluster(cl); registerDoSEQ(); 63 | 64 | # preallocate data types 65 | i = 1; MAX = length(t2); 66 | x1 <- character() # Name 67 | x2 <- numeric() # R2 68 | x3 <- numeric() # RMSE 69 | x4 <- numeric() # time [s] 70 | x5 <- character() # long model name 71 | 72 | # fill data and check indexes and NA 73 | for (i in 1:length(t2)) { 74 | x1[i] <- t2[[i]]$method 75 | x2[i] <- 
as.numeric(t2[[i]]$results$Rsquared[which.min(t2[[i]]$results$RMSE)]) 76 | x3[i] <- as.numeric(t2[[i]]$results$RMSE[which.min(t2[[i]]$results$RMSE)]) 77 | x4[i] <- as.numeric(t2[[i]]$times$everything[3]) 78 | x5[i] <- t2[[i]]$modelInfo$label 79 | } 80 | 81 | # coerce to data frame 82 | df1 <- data.frame(x1,x2,x3,x4,x5, stringsAsFactors=FALSE) 83 | 84 | # print all results to R-GUI 85 | df1 86 | 87 | # call web browser output with sortable column names 88 | datatable(df1, options = list( 89 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5))), 90 | pageLength = MAX, 91 | order = list(list(2, 'desc'))), 92 | colnames = c('Num', 'Name', 'R^2', 'RMSE', 'time [s]', 'Model name'), 93 | caption = paste('Regression results from caret models',Sys.time()), 94 | class = 'cell-border stripe') %>% 95 | formatRound('x2', 3) %>% 96 | formatRound('x3', 3) %>% 97 | formatRound('x4', 3) %>% 98 | formatStyle(2, 99 | background = styleColorBar(x2, 'steelblue'), 100 | backgroundSize = '100% 90%', 101 | backgroundRepeat = 'no-repeat', 102 | backgroundPosition = 'center' 103 | ) 104 | 105 | ### END 106 | 107 | #----------------------------------------------------------------------------- 108 | # Num Name R^2 RMSE time[s] Model name 109 | # #1 avNNet 20.269 4.98 Model Averaged Neural Network 110 | # 2 bagEarth 1 0 3.8 Bagged MARS 111 | # 3 bagEarthGCV 1 0 2.22 Bagged MARS using gCV Pruning 112 | # 4 bayesglm 1 0 1.11 Bayesian Generalized Linear Model 113 | # 5 bdk 0.81 2.602 1.49 Self-Organizing Map 114 | # 6 blackboost 0.878 2.37 3.9 Boosted Tree 115 | # 7 Boruta 0.965 1.317 25.79 Random Forest with Additional Feature Selection 116 | # 8 brnn 0.999 0.215 0.95 Bayesian Regularized Neural Networks 117 | # 9 BstLm 0.826 2.661 2.89 Boosted Linear Model 118 | # 10 bstTree 0.912 1.766 17.98 Boosted Tree 119 | ... 
120 | # 83 xgbTree 0.983 0.679 3.970 eXtreme Gradient Boosting 121 | # 84 xyf 0.834 2.609 1.560 Self-Organizing Maps 122 | #----------------------------------------------------------------------------- 123 | 124 | ### total time 385.14 [s] or 6.4 min with 4c/16t@4.2 GHz 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /caret-regression/caret-all-regressions-DT-cars.csv: -------------------------------------------------------------------------------- 1 | Num,Name,R^2,RMSE,time [s],Model name 2 | 2,bagEarth,1,0,2.37,Bagged MARS 3 | 3,bagEarthGCV,1,0,1.58,Bagged MARS using gCV Pruning 4 | 4,bayesglm,1,0,1.29,Bayesian Generalized Linear Model 5 | 17,earth,1,0,0.79,Multivariate Adaptive Regression Spline 6 | 19,enet,1,0,1.06,Elasticnet 7 | 23,gamLoess,1,0,1.48,Generalized Additive Model using LOESS 8 | 27,gcvEarth,1,0,0.8,Multivariate Adaptive Regression Splines 9 | 28,glm,1,0,1.04,Generalized Linear Model 10 | 29,glmboost,1,0,0.86,Boosted Generalized Linear Model 11 | 30,glmnet,1,0.176,1.06,glmnet 12 | 36,lars,1,0,0.81,Least Angle Regression 13 | 37,lasso,1,0.604,0.77,The lasso 14 | 38,leapBackward,1,0,0.9,Linear Regression with Backwards Selection 15 | 39,leapForward,1,0,0.78,Linear Regression with Forward Selection 16 | 40,leapSeq,1,0,0.81,Linear Regression with Stepwise Selection 17 | 41,lm,1,0,0.77,Linear Regression 18 | 42,M5,1,0,3.16,Model Tree 19 | 43,M5Rules,1,0,1.53,Model Rules 20 | 58,ridge,1,0,0.89,Ridge Regression 21 | 61,rlm,1,0,0.78,Robust Linear Model 22 | 64,rqlasso,1,0,1.65,Quantile Regression with LASSO penalty 23 | 65,rqnc,1,0,0.97,Non-Convex Penalized Quantile Regression 24 | 14,cubist,1,0,1.06,Cubist 25 | 20,enpls,1,0,18.19,Ensemble Partial Least Squares Regression 26 | 49,penalized,1,0.025,1.53,Penalized Linear Regression 27 | 68,rvmPoly,1,0.025,1.3,Relevance Vector Machines with Polynomial Kernel 28 | 32,kernelpls,0.999,0.156,0.89,Partial Least Squares 29 | 50,pls,0.999,0.156,0.78,Partial Least Squares 30 | 80,widekernelpls,0.999,0.156,0.84,Partial Least Squares 31 | 71,simpls,0.999,0.156,0.83,Partial Least Squares 32 | 48,pcr,0.999,0.177,0.81,Principal Component Analysis 33 | 8,brnn,0.999,0.237,0.95,Bayesian Regularized Neural Networks 34 | 72,spls,0.998,0.185,1.23,Sparse Partial Least Squares 35 | 73,superpc,0.998,19.931,0.97,Supervised Principal Component Analysis 36 | 52,ppr,0.998,0.189,0.81,Projection Pursuit Regression 37 | 51,plsRglm,0.995,0.352,3.4,Partial Least Squares Generalized Linear Models 38 | 74,svmLinear,0.993,0.522,0.82,Support Vector Machines with Linear Kernel 39 | 75,svmLinear2,0.993,0.522,0.85,Support Vector Machines with Linear Kernel 40 | 24,gaussprLinear,0.993,0.523,2.08,Gaussian Process 41 | 22,extraTrees,0.977,0.944,2.6,Random Forest by Randomization 42 | 76,svmPoly,0.976,0.901,1.9,Support Vector Machines with Polynomial Kernel 43 | 18,elm,0.973,0.914,1.16,Extreme Learning Machine 44 | 54,ranger,0.971,1.202,1.13,Random Forest 45 | 35,krlsRadial,0.971,1.102,3.07,Radial Basis Function Kernel Regularized Least Squares 46 | 7,Boruta,0.963,1.225,10.79,Random Forest with Additional Feature Selection 47 | 55,rf,0.961,1.246,0.92,Random Forest 48 | 66,RRF,0.96,1.254,1.17,Regularized Random Forest 49 | 67,RRFglobal,0.959,1.263,1.23,Regularized Random Forest 50 | 25,gaussprPoly,0.958,1.172,1.06,Gaussian Process with Polynomial Kernel 51 | 53,qrf,0.932,1.667,1.11,Quantile Random Forest 52 | 77,svmRadial,0.929,1.834,0.87,Support Vector Machines with Radial Basis Function Kernel 53 | 
78,svmRadialCost,0.924,1.837,1,Support Vector Machines with Radial Basis Function Kernel 54 | 10,bstTree,0.911,1.748,7.24,Boosted Tree 55 | 60,rknnBel,0.91,2.016,15.44,Random k-Nearest Neighbors with Feature Selection 56 | 59,rknn,0.909,2.263,4.62,Random k-Nearest Neighbors 57 | 70,SBC,0.905,1.609,1.61,Subtractive Clustering and Fuzzy c-Means Rules 58 | 31,icr,0.902,1.994,1.09,Independent Component Regression 59 | 26,gaussprRadial,0.896,2.411,0.84,Gaussian Process with Radial Basis Function Kernel 60 | 6,blackboost,0.886,2.193,4.37,Boosted Tree 61 | 33,kknn,0.883,1.819,1.05,k-Nearest Neighbors 62 | 81,WM,0.869,2.134,3.1,Wang and Mendel Fuzzy Rules 63 | 79,treebag,0.862,2.613,1.58,Bagged CART 64 | 84,xyf,0.844,2.434,1.84,Self-Organizing Maps 65 | 11,cforest,0.843,2.691,0.98,Conditional Inference Random Forest 66 | 5,bdk,0.837,2.414,1.69,Self-Organizing Map 67 | 9,BstLm,0.826,2.651,1.86,Boosted Linear Model 68 | 34,knn,0.812,2.795,0.78,k-Nearest Neighbors 69 | 21,evtree,0.789,2.796,2.98,Tree Models from Genetic Algorithms 70 | 12,ctree,0.77,2.982,0.83,Conditional Inference Tree 71 | 13,ctree2,0.77,2.982,0.83,Conditional Inference Tree 72 | 46,partDSA,0.749,3.089,3.19,partDSA 73 | 62,rpart,0.744,3.029,1.22,CART 74 | 63,rpart2,0.744,3.029,0.84,CART 75 | 56,rfRules,0.512,4.943,32.53,Random Forest Rule-Based Model 76 | 69,rvmRadial,0.396,12.565,0.9,Relevance Vector Machines with Radial Basis Function Kernel 77 | 44,mlpWeightDecay,0.338,6.218,2.53,Multi-Layer Perceptron 78 | 15,DENFIS,0.318,9.409,10.34,Dynamic Evolving Neural-Fuzzy Inference System 79 | 57,rbfDDA,0.212,20.857,1.81,Radial Basis Function Network 80 | 1,avNNet,,19.972,7.31,Model Averaged Neural Network 81 | 16,dnn,,6.172,1.64,Stacked AutoEncoder Deep Neural Network 82 | 45,neuralnet,,5.993,20.98,Neural Network 83 | 47,pcaNNet,,19.972,1.17,Neural Networks with Feature Extraction 84 | 82,xgbLinear,,6.045,790.15,eXtreme Gradient Boosting 85 | 83,xgbTree,,6.045,49.58,eXtreme Gradient Boosting 86 | -------------------------------------------------------------------------------- /caret-regression/caret-all-regressions-DT-concrete.R: -------------------------------------------------------------------------------- 1 | # All working and fast caret regression models applied to data(concrete) 2 | # The regression output from fast (working) regression models is 3 | # exported to a sortable table in a web browser using the DT library 4 | # https://github.com/tobigithub/caret-machine-learning 5 | # Tobias Kind (2015) 6 | 7 | require(caret); require(DT); require(AppliedPredictiveModeling); 8 | data(concrete); 9 | 10 | m <- c( "avNNet" , "bagEarth", "bagEarthGCV", 11 | "bayesglm", "bdk", "blackboost", "Boruta", "brnn", "BstLm" , 12 | "bstTree", "cforest", "ctree", "ctree2", "cubist" , 13 | "dnn", "earth", "elm", "enet", "enpls", 14 | "gamLoess", "gaussprLinear", "gaussprPoly", "gaussprRadial", 15 | "gcvEarth","glm", "glmboost", "glmnet", "icr", "kernelpls", 16 | "kknn", "knn", "krlsRadial", "lars" , "lasso", 17 | "leapBackward", "leapForward", "leapSeq", "lm", "M5", "M5Rules", 18 | "mlpWeightDecay", "neuralnet" , "partDSA", 19 | "pcaNNet", "pcr", "penalized", "pls", "plsRglm", "ppr", 20 | "qrf" , "ranger", "rf" , "rbfDDA", 21 | "ridge", "rknn", "rlm", "rpart", "rpart2", "rqlasso", 22 | "rqnc", "RRF", "RRFglobal", "rvmPoly", "rvmRadial", 23 | "SBC", "simpls", "spls", "superpc" , 24 | "svmLinear", "svmLinear2", "svmPoly", "svmRadial", "svmRadialCost", 25 | "treebag", "widekernelpls", "xgbLinear", 26 | "xgbTree", "xyf") 27 | 28 | 29 | # load all 
packages (does not really work due to other dependencies) 30 | suppressPackageStartupMessages(ll <-lapply(m, require, character.only = TRUE)) 31 | 32 | # define x and y for regression 33 | y <- concrete$CompressiveStrength; x <- concrete[, 1:8]; 34 | 35 | # register parallel front-end 36 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl) 37 | 38 | # use lapply/loop to run everything 39 | t2 <- lapply(m,function(i) 40 | {cat("----------------------------------------------------","\n"); 41 | set.seed(123); cat(i," <- loaded\n"); 42 | t2 <- train(y=y, x=x, (i), trControl = trainControl(method = "boot632")) 43 | } 44 | ) 45 | 46 | 47 | r2 <- lapply(1:length(t2), function(i) 48 | {cat(sprintf("%-20s",(m[i]))); 49 | cat(round(t2[[i]]$results$Rsquared[which.min(t2[[i]]$results$RMSE)],4),"\t"); 50 | cat(round(t2[[i]]$results$RMSE[which.min(t2[[i]]$results$RMSE)],4),"\t") 51 | cat(t2[[i]]$times$everything[3],"\n") 52 | } 53 | ) 54 | 55 | # stop cluster and register sequntial front end 56 | stopCluster(cl); registerDoSEQ(); 57 | 58 | # preallocate data types 59 | i = 1; MAX = length(t2); 60 | x1 <- character() # Name 61 | x2 <- numeric() # R2 62 | x3 <- numeric() # RMSE 63 | x4 <- numeric() # time [s] 64 | x5 <- character() # long model name 65 | 66 | # fill data and check indexes and NA 67 | for (i in 1:length(t2)) { 68 | x1[i] <- t2[[i]]$method 69 | x2[i] <- as.numeric(t2[[i]]$results$Rsquared[which.min(t2[[i]]$results$RMSE)]) 70 | x3[i] <- as.numeric(t2[[i]]$results$RMSE[which.min(t2[[i]]$results$RMSE)]) 71 | x4[i] <- as.numeric(t2[[i]]$times$everything[3]) 72 | x5[i] <- t2[[i]]$modelInfo$label 73 | } 74 | 75 | # coerce to data frame 76 | df1 <- data.frame(x1,x2,x3,x4,x5, stringsAsFactors=FALSE) 77 | 78 | # print all results to R-GUI 79 | df1 80 | 81 | # plot RMSE vs boosting iterations for xgbLinear and xgbTree 82 | # next 2 lines this is static code, index extraction may fail 83 | ggplot(t2[[76]]) 84 | ggplot(t2[[77]]) 85 | 86 | # call web output with correct column names 87 | datatable(df1, options = list( 88 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5))), 89 | pageLength = MAX, 90 | order = list(list(2, 'desc'))), 91 | colnames = c('Num', 'Name', 'R^2', 'RMSE', 'time [s]', 'Model name'), 92 | caption = paste('Regression results from caret models',Sys.time()), 93 | class = 'cell-border stripe') %>% 94 | formatRound('x2', 3) %>% 95 | formatRound('x3', 3) %>% 96 | formatRound('x4', 3) %>% 97 | formatStyle(2, 98 | background = styleColorBar(x2, 'steelblue'), 99 | backgroundSize = '100% 90%', 100 | backgroundRepeat = 'no-repeat', 101 | backgroundPosition = 'center' 102 | ) 103 | 104 | 105 | ### END 106 | -------------------------------------------------------------------------------- /caret-regression/caret-all-regressions-DT-concrete.csv: -------------------------------------------------------------------------------- 1 | Num,Name,R^2,RMSE,time [s],Model name 2 | 14,cubist,0.935,4.122,7.41,Cubist 3 | 52,rf,0.932,4.177,11.67,Random Forest 4 | 62,RRFglobal,0.932,4.178,13.29,Regularized Random Forest 5 | 7,Boruta,0.932,4.183,83.31,Random Forest with Additional Feature Selection 6 | 61,RRF,0.932,4.185,22.87,Regularized Random Forest 7 | 51,ranger,0.93,4.223,6.16,Random Forest 8 | 50,qrf,0.92,4.493,3.18,Quantile Random Forest 9 | 10,bstTree,0.923,4.582,102.66,Boosted Tree 10 | 39,M5,0.892,5.469,34.68,Model Tree 11 | 11,cforest,0.89,5.565,23.07,Conditional Inference Random Forest 12 | 32,krlsRadial,0.869,5.622,320.78,Radial Basis Function Kernel 
Regularized Least Squares 13 | 3,bagEarthGCV,0.868,6.074,6.79,Bagged MARS using gCV Pruning 14 | 6,blackboost,0.871,6.11,5.1,Boosted Tree 15 | 2,bagEarth,0.866,6.125,21.35,Bagged MARS 16 | 24,gcvEarth,0.859,6.264,1.44,Multivariate Adaptive Regression Splines 17 | 22,gaussprPoly,0.857,6.308,75.74,Gaussian Process with Polynomial Kernel 18 | 16,earth,0.857,6.314,2.1,Multivariate Adaptive Regression Spline 19 | 40,M5Rules,0.855,6.319,20.14,Model Rules 20 | 71,svmPoly,0.856,6.33,108.18,Support Vector Machines with Polynomial Kernel 21 | 73,svmRadialCost,0.856,6.335,9.02,Support Vector Machines with Radial Basis Function Kernel 22 | 8,brnn,0.855,6.359,11.03,Bayesian Regularized Neural Networks 23 | 72,svmRadial,0.854,6.384,5.72,Support Vector Machines with Radial Basis Function Kernel 24 | 20,gamLoess,0.85,6.48,1.9,Generalized Additive Model using LOESS 25 | 76,xgbLinear,0.855,6.487,659.2,eXtreme Gradient Boosting 26 | 23,gaussprRadial,0.839,6.775,6.07,Gaussian Process with Radial Basis Function Kernel 27 | 12,ctree,0.824,6.965,1.72,Conditional Inference Tree 28 | 77,xgbTree,0.826,7.116,44.19,eXtreme Gradient Boosting 29 | 30,kknn,0.779,7.526,1.47,k-Nearest Neighbors 30 | 74,treebag,0.803,7.532,5.86,Bagged CART 31 | 49,ppr,0.788,7.609,1.26,Projection Pursuit Regression 32 | 55,rknn,0.774,8.377,21.79,Random k-Nearest Neighbors 33 | 31,knn,0.721,8.794,1.08,k-Nearest Neighbors 34 | 65,SBC,0.709,8.945,462.52,Subtractive Clustering and Fuzzy c-Means Rules 35 | 19,enpls,0.61,10.442,343.36,Ensemble Partial Least Squares Regression 36 | 21,gaussprLinear,0.61,10.442,4.46,Gaussian Process 37 | 54,ridge,0.61,10.443,1.7,Ridge Regression 38 | 18,enet,0.61,10.443,2.56,Elasticnet 39 | 46,penalized,0.61,10.443,14.72,Penalized Linear Regression 40 | 25,glm,0.61,10.443,1.18,Generalized Linear Model 41 | 38,lm,0.61,10.443,1.19,Linear Regression 42 | 33,lars,0.61,10.443,1.34,Least Angle Regression 43 | 4,bayesglm,0.61,10.443,1.79,Bayesian Generalized Linear Model 44 | 27,glmnet,0.61,10.443,1.49,glmnet 45 | 34,lasso,0.61,10.449,1.28,The lasso 46 | 56,rlm,0.606,10.572,1.18,Robust Linear Model 47 | 58,rpart2,0.594,10.635,1.33,CART 48 | 26,glmboost,0.595,10.694,1.58,Boosted Generalized Linear Model 49 | 67,spls,0.586,10.758,4.98,Sparse Partial Least Squares 50 | 48,plsRglm,0.586,10.764,46.07,Partial Least Squares Generalized Linear Models 51 | 47,pls,0.579,10.873,1.12,Partial Least Squares 52 | 66,simpls,0.579,10.873,1.2,Partial Least Squares 53 | 29,kernelpls,0.579,10.873,1.25,Partial Least Squares 54 | 75,widekernelpls,0.579,10.873,20.11,Partial Least Squares 55 | 70,svmLinear2,0.594,10.889,1.93,Support Vector Machines with Linear Kernel 56 | 69,svmLinear,0.594,10.909,1.54,Support Vector Machines with Linear Kernel 57 | 59,rqlasso,0.591,10.982,2.54,Quantile Regression with LASSO penalty 58 | 60,rqnc,0.591,10.983,5.73,Non-Convex Penalized Quantile Regression 59 | 35,leapBackward,0.55,11.218,1.22,Linear Regression with Backwards Selection 60 | 36,leapForward,0.548,11.249,1.25,Linear Regression with Forward Selection 61 | 13,ctree2,0.536,11.375,1.4,Conditional Inference Tree 62 | 37,leapSeq,0.532,11.431,1.13,Linear Regression with Stepwise Selection 63 | 63,rvmPoly,0.509,11.61,140.07,Relevance Vector Machines with Polynomial Kernel 64 | 17,elm,0.45,12.308,3.51,Extreme Learning Machine 65 | 57,rpart,0.439,12.514,1.53,CART 66 | 43,partDSA,0.397,12.975,6.5,partDSA 67 | 9,BstLm,0.433,13.468,2.37,Boosted Linear Model 68 | 78,xyf,0.334,13.678,2.97,Self-Organizing Maps 69 | 28,icr,0.329,13.702,3.62,Independent Component 
Regression 70 | 5,bdk,0.419,14.261,3.03,Self-Organizing Map 71 | 45,pcr,0.262,14.368,1.09,Principal Component Analysis 72 | 42,neuralnet,,16.701,6.63,Neural Network 73 | 15,dnn,,16.786,39.46,Stacked AutoEncoder Deep Neural Network 74 | 41,mlpWeightDecay,,18.03,17.8,Multi-Layer Perceptron 75 | 64,rvmRadial,0.301,30.056,225.6,Relevance Vector Machines with Radial Basis Function Kernel 76 | 44,pcaNNet,,38.657,9.33,Neural Networks with Feature Extraction 77 | 1,avNNet,,38.657,54.51,Model Averaged Neural Network 78 | 68,superpc,0.262,38.798,3.78,Supervised Principal Component Analysis 79 | 53,rbfDDA,0.006,39.561,390.54,Radial Basis Function Network 80 | -------------------------------------------------------------------------------- /caret-regression/caret-regression-plotObsVsPred.R: -------------------------------------------------------------------------------- 1 | # Regression analysis and visualization 2 | # Plot observed vs predicted values for training and test set from CART and PLS 3 | # Source: http://www.inside-r.org/packages/cran/caret/docs/plotObsVsPred 4 | # Author: Max Kuhn 5 | # 6 | # https://github.com/tobigithub/caret-machine-learning 7 | # Tobias Kind (2015) 8 | 9 | # load libraries and models 10 | require(caret) 11 | require(mlbench) 12 | data(BostonHousing) 13 | 14 | # perform CART (Classification And Regression Tree) analysis 15 | set.seed(123) 16 | rpartFit <- train(BostonHousing[1:100, -c(4, 14)], 17 | BostonHousing$medv[1:100], 18 | "rpart", tuneLength = 9) 19 | 20 | # perform PLS (Partial Least Squares) analysis 21 | set.seed(123) 22 | plsFit <- train(BostonHousing[1:100, -c(4, 14)], 23 | BostonHousing$medv[1:100], 24 | "pls") 25 | 26 | # extract optimal tuning values for further use 27 | predVals <- extractPrediction(list(rpartFit, plsFit), 28 | testX = BostonHousing[101:200, -c(4, 14)], 29 | testY = BostonHousing$medv[101:200], 30 | unkX = BostonHousing[201:300, -c(4, 14)]) 31 | 32 | # plot CART and PLS observed vs predicted values for training and test set 33 | plotObsVsPred(predVals) 34 | 35 | ### END 36 | -------------------------------------------------------------------------------- /caret-setup/caret-get-all-models-automatically.R: -------------------------------------------------------------------------------- 1 | # Get all caret models for regression and classification 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # ----------------------------------------------------------- 6 | # get all caret models for regression 7 | 8 | require(caret) 9 | modNames <- unique(modelLookup()[modelLookup()$forReg,c(1)]) 10 | length(modNames); modNames; 11 | 12 | # ----------------------------------------------------------- 13 | # get all caret models for classification 14 | 15 | require(caret) 16 | modNames <- unique(modelLookup()[modelLookup()$forClass,c(1)]) 17 | length(modNames); modNames; 18 | -------------------------------------------------------------------------------- /caret-setup/caret-model-list-v6058.csv: -------------------------------------------------------------------------------- 1 | Num,Model,method Argument Value,Type,Packages,Tuning Parameters 2 | 1,Boosted Classification Trees,ada,Classification,"ada, plyr","iter, maxdepth, nu" 3 | 2,Bagged AdaBoost,AdaBag,Classification,"adabag, plyr","mfinal, maxdepth" 4 | 3,AdaBoost.M1,AdaBoost.M1,Classification,"adabag, plyr","mfinal, maxdepth, coeflearn" 5 | 4,Adaptive Mixture Discriminant Analysis,amdai,Classification,adaptDA,model 6 | 5,Adaptive-Network-Based Fuzzy 
Inference System,ANFIS,Regression,frbs,"num.labels, max.iter" 7 | 6,Model Averaged Neural Network,avNNet,Dual Use,nnet,"size, decay, bag" 8 | 7,Naive Bayes Classifier with Attribute Weighting,awnb,Classification,bnclassify,smooth 9 | 8,Tree Augmented Naive Bayes Classifier with Attribute Weighting,awtan,Classification,bnclassify,"score, smooth" 10 | 9,Bagged Model,bag,Dual Use,caret,vars 11 | 10,Bagged MARS,bagEarth,Dual Use,earth,"nprune, degree" 12 | 11,Bagged MARS using gCV Pruning,bagEarthGCV,Dual Use,earth,degree 13 | 12,Bagged Flexible Discriminant Analysis,bagFDA,Classification,"earth, mda","degree, nprune" 14 | 13,Bagged FDA using gCV Pruning,bagFDAGCV,Classification,earth,degree 15 | 14,Bayesian Additive Regression Trees,bartMachine,Dual Use,bartMachine,"num_trees, k, alpha, beta, nu" 16 | 15,Bayesian Generalized Linear Model,bayesglm,Dual Use,arm,None 17 | 16,Self-Organizing Map,bdk,Dual Use,kohonen,"xdim, ydim, xweight, topo" 18 | 17,Binary Discriminant Analysis,binda,Classification,binda,lambda.freqs 19 | 18,Boosted Tree,blackboost,Dual Use,"party, mboost, plyr","mstop, maxdepth" 20 | 19,Random Forest with Additional Feature Selection,Boruta,Dual Use,"Boruta, randomForest",mtry 21 | 20,Bayesian Regularized Neural Networks,brnn,Regression,brnn,neurons 22 | 21,Boosted Linear Model,BstLm,Dual Use,"bst, plyr","mstop, nu" 23 | 22,Boosted Smoothing Spline,bstSm,Dual Use,"bst, plyr","mstop, nu" 24 | 23,Boosted Tree,bstTree,Dual Use,"bst, plyr","mstop, maxdepth, nu" 25 | 24,C5.0,C5.0,Classification,"C50, plyr","trials, model, winnow" 26 | 25,Cost-Sensitive C5.0,C5.0Cost,Classification,"C50, plyr","trials, model, winnow, cost" 27 | 26,Single C5.0 Ruleset,C5.0Rules,Classification,C50,None 28 | 27,Single C5.0 Tree,C5.0Tree,Classification,C50,None 29 | 28,Conditional Inference Random Forest,cforest,Dual Use,party,mtry 30 | 29,CHi-squared Automated Interaction Detection,chaid,Classification,CHAID,"alpha2, alpha3, alpha4" 31 | 30,SIMCA,CSimca,Classification,rrcovHD,None 32 | 31,Conditional Inference Tree,ctree,Dual Use,party,mincriterion 33 | 32,Conditional Inference Tree,ctree2,Dual Use,party,maxdepth 34 | 33,Cubist,cubist,Regression,Cubist,"committees, neighbors" 35 | 34,Dynamic Evolving Neural-Fuzzy Inference System,DENFIS,Regression,frbs,"Dthr, max.iter" 36 | 35,Stacked AutoEncoder Deep Neural Network,dnn,Dual Use,deepnet,"layer1, layer2, layer3, hidden_dropout, visible_dropout" 37 | 36,Linear Distance Weighted Discrimination,dwdLinear,Classification,kerndwd,"lambda, qval" 38 | 37,Distance Weighted Discrimination with Polynomial Kernel,dwdPoly,Classification,kerndwd,"lambda, qval, degree, scale" 39 | 38,Distance Weighted Discrimination with Radial Basis Function Kernel,dwdRadial,Classification,"kernlab, kerndwd","lambda, qval, sigma" 40 | 39,Multivariate Adaptive Regression Spline,earth,Dual Use,earth,"nprune, degree" 41 | 40,Extreme Learning Machine,elm,Dual Use,elmNN,"nhid, actfun" 42 | 41,Elasticnet,enet,Regression,elasticnet,"fraction, lambda" 43 | 42,Ensemble Partial Least Squares Regression,enpls,Regression,enpls,maxcomp 44 | 43,Ensemble Partial Least Squares Regression with Feature Selection,enpls.fs,Regression,enpls,"maxcomp, threshold" 45 | 44,Tree Models from Genetic Algorithms,evtree,Dual Use,evtree,alpha 46 | 45,Random Forest by Randomization,extraTrees,Dual Use,extraTrees,"mtry, numRandomCuts" 47 | 46,Flexible Discriminant Analysis,fda,Classification,"earth, mda","degree, nprune" 48 | 47,Fuzzy Rules Using Genetic Cooperative-Competitive Learning and 
Pittsburgh,FH.GBML,Classification,frbs,"max.num.rule, popu.size, max.gen" 49 | 48,Fuzzy Inference Rules by Descent Method,FIR.DM,Regression,frbs,"num.labels, max.iter" 50 | 49,Ridge Regression with Variable Selection,foba,Regression,foba,"k, lambda" 51 | 50,Fuzzy Rules Using Chi's Method,FRBCS.CHI,Classification,frbs,"num.labels, type.mf" 52 | 51,Fuzzy Rules with Weight Factor,FRBCS.W,Classification,frbs,"num.labels, type.mf" 53 | 52,Simplified TSK Fuzzy Rules,FS.HGD,Regression,frbs,"num.labels, max.iter" 54 | 53,Generalized Additive Model using Splines,gam,Dual Use,mgcv,"select, method" 55 | 54,Boosted Generalized Additive Model,gamboost,Dual Use,mboost,"mstop, prune" 56 | 55,Generalized Additive Model using LOESS,gamLoess,Dual Use,gam,"span, degree" 57 | 56,Generalized Additive Model using Splines,gamSpline,Dual Use,gam,df 58 | 57,Gaussian Process,gaussprLinear,Dual Use,kernlab,None 59 | 58,Gaussian Process with Polynomial Kernel,gaussprPoly,Dual Use,kernlab,"degree, scale" 60 | 59,Gaussian Process with Radial Basis Function Kernel,gaussprRadial,Dual Use,kernlab,sigma 61 | 60,Stochastic Gradient Boosting,gbm,Dual Use,"gbm, plyr","n.trees, interaction.depth, shrinkage, n.minobsinnode" 62 | 61,Multivariate Adaptive Regression Splines,gcvEarth,Dual Use,earth,degree 63 | 62,Fuzzy Rules via MOGUL,GFS.FR.MOGUL,Regression,frbs,"max.gen, max.iter, max.tune" 64 | 63,Fuzzy Rules Using Genetic Cooperative-Competitive Learning,GFS.GCCL,Classification,frbs,"num.labels, popu.size, max.gen" 65 | 64,Genetic Lateral Tuning and Rule Selection of Linguistic Fuzzy Systems,GFS.LT.RS,Regression,frbs,"popu.size, num.labels, max.gen" 66 | 65,Fuzzy Rules via Thrift,GFS.THRIFT,Regression,frbs,"popu.size, num.labels, max.gen" 67 | 66,Generalized Linear Model,glm,Dual Use,,None 68 | 67,Boosted Generalized Linear Model,glmboost,Dual Use,mboost,"mstop, prune" 69 | 68,glmnet,glmnet,Dual Use,glmnet,"alpha, lambda" 70 | 69,Generalized Linear Model with Stepwise Feature Selection,glmStepAIC,Dual Use,MASS,None 71 | 70,Generalized Partial Least Squares,gpls,Classification,gpls,K.prov 72 | 71,Heteroscedastic Discriminant Analysis,hda,Classification,hda,"gamma, lambda, newdim" 73 | 72,High Dimensional Discriminant Analysis,hdda,Classification,HDclassif,"threshold, model" 74 | 73,Hybrid Neural Fuzzy Inference System,HYFIS,Regression,frbs,"num.labels, max.iter" 75 | 74,Independent Component Regression,icr,Regression,fastICA,n.comp 76 | 75,C4.5-like Trees,J48,Classification,RWeka,C 77 | 76,Rule-Based Classifier,JRip,Classification,RWeka,NumOpt 78 | 77,Partial Least Squares,kernelpls,Dual Use,pls,ncomp 79 | 78,k-Nearest Neighbors,kknn,Dual Use,kknn,"kmax, distance, kernel" 80 | 79,k-Nearest Neighbors,knn,Dual Use,,k 81 | 80,Polynomial Kernel Regularized Least Squares,krlsPoly,Regression,KRLS,"lambda, degree" 82 | 81,Radial Basis Function Kernel Regularized Least Squares,krlsRadial,Regression,"KRLS, kernlab","lambda, sigma" 83 | 82,Least Angle Regression,lars,Regression,lars,fraction 84 | 83,Least Angle Regression,lars2,Regression,lars,step 85 | 84,The lasso,lasso,Regression,elasticnet,fraction 86 | 85,Linear Discriminant Analysis,lda,Classification,MASS,None 87 | 86,Linear Discriminant Analysis,lda2,Classification,MASS,dimen 88 | 87,Linear Regression with Backwards Selection,leapBackward,Regression,leaps,nvmax 89 | 88,Linear Regression with Forward Selection,leapForward,Regression,leaps,nvmax 90 | 89,Linear Regression with Stepwise Selection,leapSeq,Regression,leaps,nvmax 91 | 90,Robust Linear Discriminant 
Analysis,Linda,Classification,rrcov,None 92 | 91,Linear Regression,lm,Regression,,None 93 | 92,Linear Regression with Stepwise Selection,lmStepAIC,Regression,MASS,None 94 | 93,Logistic Model Trees,LMT,Classification,RWeka,iter 95 | 94,Localized Linear Discriminant Analysis,loclda,Classification,klaR,k 96 | 95,Bagged Logic Regression,logicBag,Dual Use,logicFS,"nleaves, ntrees" 97 | 96,Boosted Logistic Regression,LogitBoost,Classification,caTools,nIter 98 | 97,Logic Regression,logreg,Dual Use,LogicReg,"treesize, ntrees" 99 | 98,Least Squares Support Vector Machine,lssvmLinear,Classification,kernlab,None 100 | 99,Least Squares Support Vector Machine with Polynomial Kernel,lssvmPoly,Classification,kernlab,"degree, scale" 101 | 100,Least Squares Support Vector Machine with Radial Basis Function Kernel,lssvmRadial,Classification,kernlab,sigma 102 | 101,Learning Vector Quantization,lvq,Classification,class,"size, k" 103 | 102,Model Tree,M5,Regression,RWeka,"pruned, smoothed, rules" 104 | 103,Model Rules,M5Rules,Regression,RWeka,"pruned, smoothed" 105 | 104,Mixture Discriminant Analysis,mda,Classification,mda,subclasses 106 | 105,Maximum Uncertainty Linear Discriminant Analysis,Mlda,Classification,HiDimDA,None 107 | 106,Multi-Layer Perceptron,mlp,Dual Use,RSNNS,size 108 | 107,Multi-Layer Perceptron,mlpWeightDecay,Dual Use,RSNNS,"size, decay" 109 | 108,Penalized Multinomial Regression,multinom,Classification,nnet,decay 110 | 109,Naive Bayes,nb,Classification,klaR,"fL, usekernel" 111 | 110,Naive Bayes Classifier,nbDiscrete,Classification,bnclassify,smooth 112 | 111,Semi-Naive Structure Learner Wrapper,nbSearch,Classification,bnclassify,"k, epsilon, smooth, final_smooth, direction" 113 | 112,Neural Network,neuralnet,Regression,neuralnet,"layer1, layer2, layer3" 114 | 113,Neural Network,nnet,Dual Use,nnet,"size, decay" 115 | 114,Non-Negative Least Squares,nnls,Regression,nnls,None 116 | 115,Tree-Based Ensembles,nodeHarvest,Dual Use,nodeHarvest,"maxinter, mode" 117 | 116,Oblique Trees,oblique.tree,Classification,oblique.tree,"oblique.splits, variable.selection" 118 | 117,Single Rule Classification,OneR,Classification,RWeka,None 119 | 118,Oblique Random Forest,ORFlog,Classification,obliqueRF,mtry 120 | 119,Oblique Random Forest,ORFpls,Classification,obliqueRF,mtry 121 | 120,Oblique Random Forest,ORFridge,Classification,obliqueRF,mtry 122 | 121,Oblique Random Forest,ORFsvm,Classification,obliqueRF,mtry 123 | 122,Optimal Weighted Nearest Neighbor Classifier,ownn,Classification,snn,K 124 | 123,Nearest Shrunken Centroids,pam,Classification,pamr,threshold 125 | 124,Parallel Random Forest,parRF,Dual Use,"e1071, randomForest",mtry 126 | 125,Rule-Based Classifier,PART,Classification,RWeka,"threshold, pruned" 127 | 126,partDSA,partDSA,Dual Use,partDSA,"cut.off.growth, MPD" 128 | 127,Neural Networks with Feature Extraction,pcaNNet,Dual Use,nnet,"size, decay" 129 | 128,Principal Component Analysis,pcr,Regression,pls,ncomp 130 | 129,Penalized Discriminant Analysis,pda,Classification,mda,lambda 131 | 130,Penalized Discriminant Analysis,pda2,Classification,mda,df 132 | 131,Penalized Linear Regression,penalized,Regression,penalized,"lambda1, lambda2" 133 | 132,Penalized Linear Discriminant Analysis,PenalizedLDA,Classification,"penalizedLDA, plyr","lambda, K" 134 | 133,Penalized Logistic Regression,plr,Classification,stepPlr,"lambda, cp" 135 | 134,Partial Least Squares,pls,Dual Use,pls,ncomp 136 | 135,Partial Least Squares Generalized Linear Models,plsRglm,Dual Use,plsRglm,"nt, alpha.pvals.expli" 137 | 136,Ordered 
Logistic or Probit Regression,polr,Classification,MASS,None 138 | 137,Projection Pursuit Regression,ppr,Regression,,nterms 139 | 138,Greedy Prototype Selection,protoclass,Classification,"proxy, protoclass","eps, Minkowski" 140 | 139,Knn regression via sklearn.neighbors.KNeighborsRegressor,pythonKnnReg,Regression,rPython,"n_neighbors, weights, algorithm, leaf_size, metric, p" 141 | 140,Quadratic Discriminant Analysis,qda,Classification,MASS,None 142 | 141,Robust Quadratic Discriminant Analysis,QdaCov,Classification,rrcov,None 143 | 142,Quantile Random Forest,qrf,Regression,quantregForest,mtry 144 | 143,Quantile Regression Neural Network,qrnn,Regression,qrnn,"n.hidden, penalty, bag" 145 | 144,Random Forest,ranger,Dual Use,"e1071, ranger",mtry 146 | 145,Radial Basis Function Network,rbf,Dual Use,RSNNS,size 147 | 146,Radial Basis Function Network,rbfDDA,Dual Use,RSNNS,negativeThreshold 148 | 147,Regularized Discriminant Analysis,rda,Classification,klaR,"gamma, lambda" 149 | 148,Relaxed Lasso,relaxo,Regression,"relaxo, plyr","lambda, phi" 150 | 149,Random Forest,rf,Dual Use,randomForest,mtry 151 | 150,Random Ferns,rFerns,Classification,rFerns,depth 152 | 151,Factor-Based Linear Discriminant Analysis,RFlda,Classification,HiDimDA,q 153 | 152,Random Forest Rule-Based Model,rfRules,Dual Use,"randomForest, inTrees, plyr","mtry, maxdepth" 154 | 153,Ridge Regression,ridge,Regression,elasticnet,lambda 155 | 154,Random k-Nearest Neighbors,rknn,Dual Use,rknn,"k, mtry" 156 | 155,Random k-Nearest Neighbors with Feature Selection,rknnBel,Dual Use,"rknn, plyr","k, mtry, d" 157 | 156,Robust Linear Model,rlm,Regression,MASS,None 158 | 157,Robust Mixture Discriminant Analysis,rmda,Classification,robustDA,"K, model" 159 | 158,ROC-Based Classifier,rocc,Classification,rocc,xgenes 160 | 159,Rotation Forest,rotationForest,Classification,rotationForest,"K, L" 161 | 160,Rotation Forest,rotationForestCp,Classification,"rpart, plyr, rotationForest","K, L, cp" 162 | 161,CART,rpart,Dual Use,rpart,cp 163 | 162,CART,rpart2,Dual Use,rpart,maxdepth 164 | 163,Cost-Sensitive CART,rpartCost,Classification,rpart,"cp, Cost" 165 | 164,Quantile Regression with LASSO penalty,rqlasso,Regression,rqPen,lambda 166 | 165,Non-Convex Penalized Quantile Regression,rqnc,Regression,rqPen,"lambda, penalty" 167 | 166,Regularized Random Forest,RRF,Dual Use,"randomForest, RRF","mtry, coefReg, coefImp" 168 | 167,Regularized Random Forest,RRFglobal,Dual Use,RRF,"mtry, coefReg" 169 | 168,Robust Regularized Linear Discriminant Analysis,rrlda,Classification,rrlda,"lambda, hp, penalty" 170 | 169,Robust SIMCA,RSimca,Classification,rrcovHD,None 171 | 170,Relevance Vector Machines with Linear Kernel,rvmLinear,Regression,kernlab,None 172 | 171,Relevance Vector Machines with Polynomial Kernel,rvmPoly,Regression,kernlab,"scale, degree" 173 | 172,Relevance Vector Machines with Radial Basis Function Kernel,rvmRadial,Regression,kernlab,sigma 174 | 173,Subtractive Clustering and Fuzzy c-Means Rules,SBC,Regression,frbs,"r.a, eps.high, eps.low" 175 | 174,Shrinkage Discriminant Analysis,sda,Classification,sda,"diagonal, lambda" 176 | 175,Stepwise Diagonal Linear Discriminant Analysis,sddaLDA,Classification,SDDA,None 177 | 176,Stepwise Diagonal Quadratic Discriminant Analysis,sddaQDA,Classification,SDDA,None 178 | 177,Sparse Distance Weighted Discrimination,sdwd,Classification,sdwd,"lambda, lambda2" 179 | 178,Partial Least Squares,simpls,Dual Use,pls,ncomp 180 | 179,Fuzzy Rules Using the Structural Learning Algorithm on Vague 
Environment,SLAVE,Classification,frbs,"num.labels, max.iter, max.gen" 181 | 180,Stabilized Linear Discriminant Analysis,slda,Classification,ipred,None 182 | 181,Sparse Mixture Discriminant Analysis,smda,Classification,sparseLDA,"NumVars, lambda, R" 183 | 182,Stabilized Nearest Neighbor Classifier,snn,Classification,snn,lambda 184 | 183,Sparse Linear Discriminant Analysis,sparseLDA,Classification,sparseLDA,"NumVars, lambda" 185 | 184,Sparse Partial Least Squares,spls,Dual Use,spls,"K, eta, kappa" 186 | 185,Linear Discriminant Analysis with Stepwise Feature Selection,stepLDA,Classification,"klaR, MASS","maxvar, direction" 187 | 186,Quadratic Discriminant Analysis with Stepwise Feature Selection,stepQDA,Classification,"klaR, MASS","maxvar, direction" 188 | 187,Supervised Principal Component Analysis,superpc,Regression,superpc,"threshold, n.components" 189 | 188,Support Vector Machines with Boundrange String Kernel,svmBoundrangeString,Dual Use,kernlab,"length, C" 190 | 189,Support Vector Machines with Exponential String Kernel,svmExpoString,Dual Use,kernlab,"lambda, C" 191 | 190,Support Vector Machines with Linear Kernel,svmLinear,Dual Use,kernlab,C 192 | 191,Support Vector Machines with Linear Kernel,svmLinear2,Dual Use,e1071,cost 193 | 192,Support Vector Machines with Polynomial Kernel,svmPoly,Dual Use,kernlab,"degree, scale, C" 194 | 193,Support Vector Machines with Radial Basis Function Kernel,svmRadial,Dual Use,kernlab,"sigma, C" 195 | 194,Support Vector Machines with Radial Basis Function Kernel,svmRadialCost,Dual Use,kernlab,C 196 | 195,Support Vector Machines with Class Weights,svmRadialWeights,Classification,kernlab,"sigma, C, Weight" 197 | 196,Support Vector Machines with Spectrum String Kernel,svmSpectrumString,Dual Use,kernlab,"length, C" 198 | 197,Tree Augmented Naive Bayes Classifier,tan,Classification,bnclassify,"score, smooth" 199 | 198,Tree Augmented Naive Bayes Classifier Structure Learner Wrapper,tanSearch,Classification,bnclassify,"k, epsilon, smooth, final_smooth, sp" 200 | 199,Bagged CART,treebag,Dual Use,"ipred, plyr, e1071",None 201 | 200,Variational Bayesian Multinomial Probit Regression,vbmpRadial,Classification,vbmp,estimateTheta 202 | 201,Partial Least Squares,widekernelpls,Dual Use,pls,ncomp 203 | 202,Wang and Mendel Fuzzy Rules,WM,Regression,frbs,"num.labels, type.mf" 204 | 203,Weighted Subspace Random Forest,wsrf,Classification,wsrf,mtry 205 | 204,eXtreme Gradient Boosting,xgbLinear,Dual Use,xgboost,"nrounds, lambda, alpha" 206 | 205,eXtreme Gradient Boosting,xgbTree,Dual Use,"xgboost, plyr","nrounds, max_depth, eta" 207 | 206,Self-Organizing Maps,xyf,Dual Use,kohonen,"xdim, ydim, xweight, topo" 208 | -------------------------------------------------------------------------------- /caret-setup/caret-modelLookup-DT.R: -------------------------------------------------------------------------------- 1 | # Get all caret models for regression and classification and output in web browser 2 | # The package DT is required 3 | # See: https://github.com/topepo/caret/blob/master/pkg/caret/R/modelLookup.R 4 | # 5 | # https://github.com/tobigithub/caret-machine-learning 6 | # Tobias Kind (2015) 7 | 8 | require(caret) 9 | # install.packages("DT") 10 | require(DT) 11 | 12 | # this caret function returns the models and their availability 13 | # to perform regression and classification 14 | modelLookup() 15 | #---------------------------------------------- 16 | # modelLookup() 17 | # 'data.frame': 372 obs. of 6 variables: 18 | # $ model : chr "ada" "ada" "ada" "AdaBag" ... 
19 | # $ parameter: Factor w/ 144 levels "iter","maxdepth",..: 1 2 3 4 2 4 2 5 6 8 ... 20 | # $ label : Factor w/ 155 levels "#Trees","Learning Rate",..: 1 3 2 1 3 1 3 4 5 6 ... 21 | # $ forReg : logi FALSE FALSE FALSE FALSE FALSE FALSE ... 22 | # $ forClass : logi TRUE TRUE TRUE TRUE TRUE TRUE ... 23 | # $ probModel: 24 | #---------------------------------------------- 25 | 26 | # number of models returned by modelLookup() 27 | MAX = dim(modelLookup())[1]; 28 | # perform the model lookup 29 | caretModels <- modelLookup() 30 | # coerce into a data frame for web output 31 | caretModels <- as.data.frame(caretModels) 32 | class(caretModels) 33 | 34 | # call web output with correct column names 35 | datatable(caretModels, options = list( 36 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5,6))), 37 | pageLength = MAX, 38 | order = list(list(0, 'asc'))), 39 | colnames = c('Num','model',' parameter', 'label', 'forReg', 'forClass',' probModel'), 40 | caption = paste('Caret models for regression and classification',Sys.time()), 41 | class = 'cell-border stripe') %>% 42 | formatStyle(2, 43 | background = styleColorBar(1, 'steelblue'), 44 | backgroundSize = '100% 90%', 45 | backgroundRepeat = 'no-repeat', 46 | backgroundPosition = 'center' 47 | ) 48 | 49 | ### END 50 | 51 | # Output will be a sortable table in the web browser and the file index.html 52 | # The table can easily be copied/pasted or saved as CSV or XLS 53 | 54 | # Num model parameter label forReg forClass probModel 55 | #1 ada iter #Trees false true true 56 | #2 ada maxdepth Max Tree Depth false true true 57 | #3 ada nu Learning Rate false true true 58 | -------------------------------------------------------------------------------- /caret-setup/caret-setup-comfort.R: -------------------------------------------------------------------------------- 1 | # Installation of the caret package with ~340 commonly used dependencies 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # installs most of the 340 caret dependencies, the caret book package (AppliedPredictiveModeling) and seven other commonly used packages, but not all of them 6 | mostCommon <- c("caret", "AppliedPredictiveModeling", "ggplot2", "data.table", "plyr", "knitr", "shiny", "xts", "lattice") 7 | install.packages(mostCommon, dependencies = c("Imports", "Depends", "Suggests")) 8 | require(caret); sessionInfo() 9 | 10 | ### END 11 | -------------------------------------------------------------------------------- /caret-setup/caret-setup-deLuxe.R: -------------------------------------------------------------------------------- 1 | # Installation of the caret package with almost 400 required caret dependencies 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # 1) load a few caret packages from Bioconductor; this creates the most trouble 6 | # this is a static solution (not ideal), check the URL below for more info 7 | # https://github.com/topepo/caret/blob/master/release_process/update_pkgs.R 8 | # Answer "n" when asked for updates 9 | source("http://bioconductor.org/biocLite.R") 10 | biocLite() 11 | biocLite(c("arm", "gpls", "logicFS", "vbmp")) 12 | 13 | # 2) install most of the 340 caret dependencies plus seven commonly used packages, but not all of them 14 | # Make sure to allow firewall access for doMPI if needed 15 | mostCommon <- c("caret", "AppliedPredictiveModeling", "ggplot2", "data.table", "plyr", "knitr", "shiny", "xts", "lattice") 16 | install.packages(mostCommon, dependencies = c("Imports", "Depends", "Suggests")) 17 | 18 | # 3) then load caret and check which additional
libraries covering 200 models need to be installed 19 | # warnings will still exist; because caret already loaded a few dependencies, they cannot be updated 20 | # at runtime, which may create errors 21 | require(caret); sessionInfo(); 22 | caretLibs <- unique(unlist(lapply(getModelInfo(), function(x) x$library))) 23 | detach("package:caret", unload=TRUE) 24 | install.packages(caretLibs, dependencies = c("Imports", "Depends", "Suggests")) 25 | 26 | # 4) load packages from R-Forge; "rPython" may be on CRAN and on R-Forge 27 | install.packages(c("CHAID"), repos="http://R-Forge.R-project.org") 28 | 29 | # 5) Restart R, clean up the mess, and say 'y' when asked 30 | # All packages that are not on CRAN, such as SDDA, need to be installed by hand 31 | source("http://bioconductor.org/biocLite.R") 32 | biocLite() 33 | biocLite(c("gpls", "logicFS", "vbmp")) 34 | 35 | # "Warning: cannot remove prior installation of package" 36 | # in case of final installation issues, check packages plyr, MASS and ggplot2 37 | # the library directories may have to be removed manually with Administrator access. 38 | # get the library location with .libPaths() 39 | # R has to be closed and restarted and the two lines below have to be executed 40 | # (additional issues may occur under WIN with doMPI and msmpi.dll) 41 | 42 | ## rP <- c("plyr","ggplot2","MASS") 43 | ## install.packages(rP, dependencies = c("Imports", "Depends", "Suggests")) 44 | 45 | # the final straw: after everything is messed up, restart R and do it again 46 | # install.packages("caret", dependencies = c("Imports", "Depends", "Suggests")) 47 | ### END 48 | 49 | -------------------------------------------------------------------------------- /caret-setup/caret-simple-setup.R: -------------------------------------------------------------------------------- 1 | # Installation of the caret package with most dependencies 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # installs most of the 300 caret dependencies but not all of them 6 | install.packages("caret", dependencies = c("Imports", "Depends", "Suggests")) 7 | 8 | ### END 9 | 10 | -------------------------------------------------------------------------------- /caret-tune/caret-tune-evolutionial-algorithm-svmRadial.R: -------------------------------------------------------------------------------- 1 | # Tune "svmRadial" in caret with an evolutionary algorithm using DEoptim.
2 | # Author: Rafael Ladeira https://github.com/rladeira 3 | # Source: https://github.com/topepo/caret/issues/321 4 | # Only runs on Unix-like systems (Linux, macOS/El Capitan) due to library(doMC) 5 | # https://github.com/tobigithub/caret-machine-learning 6 | # Tobias Kind (2015) 7 | 8 | library(caret) 9 | library(parallel) 10 | library(doMC) 11 | 12 | set.seed(17516) 13 | training_data <- SLC14_1(500) 14 | testing_data <- SLC14_1(10^5) 15 | 16 | registerDoMC(cores = detectCores()) 17 | 18 | svm_fit <- function(x) { 19 | mod <- train(y ~ ., data = training_data, 20 | method = "svmRadial", 21 | preProc = c("center", "scale"), 22 | trControl = trainControl(method = "cv"), 23 | tuneGrid = data.frame(C = 2^x[1], sigma = exp(x[2]))) 24 | getTrainPerf(mod)[, "TrainRMSE"] 25 | } 26 | 27 | library(DEoptim) 28 | library(kernlab) 29 | 30 | ## converged after 31 iterations 31 | svm_de_obj <- DEoptim(fn = svm_fit, 32 | ## test cost values between 2^-5 and 2^10, 33 | ## test sigma values between exp(-5) and 1 34 | lower = c(-5, -5), 35 | upper = c(10, 0), 36 | control = DEoptim.control(reltol = 1e-3, 37 | steptol = 10, 38 | itermax = 100)) 39 | 40 | 41 | fitted_params <- svm_de_obj$optim$bestmem 42 | 43 | svm_model <- train(y ~ ., data = training_data, 44 | method = "svmRadial", 45 | preProc = c("center", "scale"), 46 | trControl = trainControl(method = "cv", number = 10), 47 | tuneGrid = data.frame(C = 2^fitted_params[1], 48 | sigma = exp(fitted_params[2]))) 49 | 50 | predictions <- predict(svm_model, testing_data) 51 | 52 | cat("Train RMSE:", getTrainPerf(svm_model)[, "TrainRMSE"], "\n") 53 | cat("Test RMSE:", RMSE(predictions, testing_data$y)) 54 | 55 | ## Train RMSE: 5.733844 56 | ## Test RMSE: 5.758587 57 | --------------------------------------------------------------------------------
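# Note: hedged sketch, not part of the original repository. The DEoptim example above registers
# doMC, which is only available on Unix-like systems. On Windows the same tuning should work by
# registering a doParallel backend instead, as the other scripts in this repository do
# (the cluster object name cl is illustrative):
# library(caret); library(DEoptim); library(doParallel)
# cl <- makeCluster(detectCores()); registerDoParallel(cl)
# svm_de_obj <- DEoptim(fn = svm_fit, lower = c(-5, -5), upper = c(10, 0),
#                       control = DEoptim.control(reltol = 1e-3, steptol = 10, itermax = 100))
# stopCluster(cl); registerDoSEQ()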