├── LICENSE
├── README.md
├── caret-classification
│   ├── iris-classification-all-fast.tsv
│   ├── caret-all-binary-class-PimaIndiansDiabetes.R
│   ├── caret-all-binary-class-PimaIndiansDiabetes.tsv
│   ├── iris-classification-all-fast.R
│   └── iris-classification-caret-all.R
├── caret-cv
│   ├── HAR-all-CV-methods.R
│   ├── caret-all-cv-methods-lapply-sapply.R
│   ├── caret-all-cv-parallel-cubist.R
│   ├── caret-all-cv-parallel-qrf.R
│   └── caret-cv-simple.R
├── caret-datasets
│   ├── caret-MS-datasets.csv
│   └── view-caret-ML-datasets.R
├── caret-parallel
│   ├── caret-parallel-train-cubist.R
│   ├── caret-parallel-train-rf-deLuxe.R
│   ├── caret-parallel-train.R
│   ├── learning-curve-plots-caret-parallel.R
│   ├── run-multiple-caret-models-parallel-lapply.R
│   └── run-multiple-caret-models-parallel-sapply.R
├── caret-regression
│   ├── caret-all-regression-models.R
│   ├── caret-all-regressions-DT-cars.csv
│   ├── caret-all-regressions-DT-concrete.R
│   ├── caret-all-regressions-DT-concrete.csv
│   └── caret-regression-plotObsVsPred.R
├── caret-setup
│   ├── caret-get-all-models-automatically.R
│   ├── caret-model-list-v6058.csv
│   ├── caret-modelLookup-DT.R
│   ├── caret-setup-comfort.R
│   ├── caret-setup-deLuxe.R
│   └── caret-simple-setup.R
└── caret-tune
    └── caret-tune-evolutionial-algorithm-svmRadial.R
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Tobias Kind
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # caret-machine-learning examples
2 | This R repository collects practical working examples for many of the 200 classification and regression models in caret and is geared towards practitioners. Example contributions from different fields are highly welcome.
3 |
4 | The caret machine learning package [(WIKI)](http://topepo.github.io/caret/index.html) bundles around 200 classification and regression algorithms. Additional support for caret is available at the website [appliedpredictivemodeling.com](http://appliedpredictivemodeling.com/) and in the excellent book by Max Kuhn and Kjell Johnson [ISBN: 978-1-4614-6848-6](http://link.springer.com/book/10.1007/978-1-4614-6849-3).
5 |
6 | Please read more in this [**caret-machine-learning WIKI**](https://github.com/tobigithub/caret-machine-learning/wiki) or browse the example R code.
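A quick way to see which models the locally installed caret version actually provides is the same `modelLookup()` call used throughout these scripts; a minimal sketch (the exact count depends on the caret release):

```r
# list all model names known to the installed caret version
library(caret)
m <- unique(modelLookup()$model)   # modelLookup() has one row per tuning parameter
length(m)                          # around 200, depending on the caret release
head(m)
```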
7 | 8 | --- 9 | -------------------------------------------------------------------------------- /caret-classification/ iris-classification-all-fast.tsv: -------------------------------------------------------------------------------- 1 | Num Name Accuracy Kappa time [s] Model name 2 | 2 avNNet 0.969 0.953 3.400 Model Averaged Neural Network 3 | 97 vglmCumulative 0.965 0.947 2.840 Cumulative Probability Model for Ordinal Data 4 | 96 vglmContRatio 0.963 0.944 2.950 Continuation Ratio Model for Ordinal Data 5 | 79 sda 0.961 0.941 0.880 Shrinkage Discriminant Analysis 6 | 1 amdai 0.961 0.941 10.850 Adaptive Mixture Discriminant Analysis 7 | 31 hdrda 0.961 0.941 1.590 High-Dimensional Regularized Discriminant Analysis 8 | 37 lda 0.961 0.941 0.580 Linear Discriminant Analysis 9 | 38 lda2 0.961 0.941 0.590 Linear Discriminant Analysis 10 | 60 pda 0.961 0.941 0.730 Penalized Discriminant Analysis 11 | 69 rda 0.961 0.941 1.890 Regularized Discriminant Analysis 12 | 27 glmnet 0.960 0.940 1.570 glmnet 13 | 72 rlda 0.960 0.940 0.870 Regularized Linear Discriminant Analysis 14 | 59 pcaNNet 0.959 0.939 1.430 Neural Networks with Feature Extraction 15 | 95 vglmAdjCat 0.958 0.937 2.060 Adjacent Categories Probability Model for Ordinal Data 16 | 52 nnet 0.957 0.936 1.360 Neural Network 17 | 56 parRF 0.956 0.933 0.880 Parallel Random Forest 18 | 41 loclda 0.956 0.933 2.340 Localized Linear Discriminant Analysis 19 | 53 oblique.tree 0.955 0.932 3.830 Oblique Trees 20 | 20 extraTrees 0.955 0.932 2.940 Random Forest by Randomization 21 | 25 gbm 0.955 0.932 1.170 Stochastic Gradient Boosting 22 | 13 CSimca 0.955 0.932 1.340 SIMCA 23 | 6 Boruta 0.954 0.931 5.610 Random Forest with Additional Feature Selection 24 | 45 mda 0.954 0.931 1.130 Mixture Discriminant Analysis 25 | 70 rf 0.953 0.930 1.400 Random Forest 26 | 94 treebag 0.953 0.929 1.330 Bagged CART 27 | 68 rbfDDA 0.953 0.929 2.790 Radial Basis Function Network 28 | 67 ranger 0.952 0.928 0.980 Random Forest 29 | 42 LogitBoost 0.951 0.926 0.850 Boosted Logistic Regression 30 | 87 svmLinear 0.950 0.924 0.670 Support Vector Machines with Linear Kernel 31 | 88 svmLinear2 0.949 0.923 0.730 Support Vector Machines with Linear Kernel 32 | 30 hdda 0.949 0.923 0.860 High Dimensional Discriminant Analysis 33 | 39 Linda 0.949 0.923 0.830 Robust Linear Discriminant Analysis 34 | 36 knn 0.948 0.922 0.610 k-Nearest Neighbors 35 | 40 LMT 0.946 0.918 2.760 Logistic Model Trees 36 | 86 stepQDA 0.945 0.918 2.830 Quadratic Discriminant Analysis with Stepwise Feature Selection 37 | 65 qda 0.945 0.917 0.610 Quadratic Discriminant Analysis 38 | 78 RSimca 0.945 0.917 0.940 Robust SIMCA 39 | 21 fda 0.945 0.916 0.830 Flexible Discriminant Analysis 40 | 3 bagFDAGCV 0.944 0.916 4.650 Bagged FDA using gCV Pruning 41 | 9 C5.0 0.944 0.915 1.150 C5.0 42 | 85 stepLDA 0.943 0.915 2.910 Linear Discriminant Analysis with Stepwise Feature Selection 43 | 99 wsrf 0.943 0.914 1.110 Weighted Subspace Random Forest 44 | 18 earth 0.943 0.914 1.000 Multivariate Adaptive Regression Spline 45 | 44 lvq 0.942 0.913 0.870 Learning Vector Quantization 46 | 100 xyf 0.942 0.913 2.320 Self-Organizing Maps 47 | 29 hda 0.942 0.912 3.870 Heteroscedastic Discriminant Analysis 48 | 76 rpart2 0.940 0.910 0.640 CART 49 | 54 OneR 0.940 0.909 0.830 Single Rule Classification 50 | 12 cforest 0.938 0.906 2.930 Conditional Inference Random Forest 51 | 74 rpart 0.938 0.906 0.700 CART 52 | 75 rpart1SE 0.938 0.906 0.660 CART 53 | 77 rpartScore 0.938 0.906 2.390 CART or Ordinal Responses 54 | 89 svmPoly 0.938 0.906 2.890 
Support Vector Machines with Polynomial Kernel 55 | 14 ctree 0.937 0.906 0.860 Conditional Inference Tree 56 | 15 ctree2 0.937 0.906 1.200 Conditional Inference Tree 57 | 26 gcvEarth 0.937 0.904 0.810 Multivariate Adaptive Regression Splines 58 | 33 JRip 0.936 0.904 2.300 Rule-Based Classifier 59 | 58 partDSA 0.936 0.903 4.950 partDSA 60 | 57 PART 0.934 0.901 0.840 Rule-Based Classifier 61 | 49 mlpWeightDecay 0.933 0.899 7.470 Multi-Layer Perceptron 62 | 50 mlpWeightDecayML 0.933 0.899 7.770 Multi-Layer Perceptron, multiple layers 63 | 11 C5.0Tree 0.933 0.899 0.550 Single C5.0 Tree 64 | 10 C5.0Rules 0.932 0.898 0.530 Single C5.0 Ruleset 65 | 32 J48 0.932 0.897 2.940 C4.5-like Trees 66 | 47 mlp 0.931 0.896 3.270 Multi-Layer Perceptron 67 | 48 mlpML 0.931 0.896 3.090 Multi-Layer Perceptron, with multiple layers 68 | 66 QdaCov 0.927 0.891 0.750 Robust Quadratic Discriminant Analysis 69 | 91 svmRadialCost 0.926 0.889 0.840 Support Vector Machines with Radial Basis Function Kernel 70 | 5 bdk 0.926 0.888 1.900 Self-Organizing Map 71 | 64 protoclass 0.925 0.887 1.190 Greedy Prototype Selection 72 | 90 svmRadial 0.923 0.884 0.890 Support Vector Machines with Radial Basis Function Kernel 73 | 92 svmRadialSigma 0.923 0.884 1.360 Support Vector Machines with Radial Basis Function Kernel 74 | 93 svmRadialWeights 0.923 0.884 1.060 Support Vector Machines with Class Weights 75 | 43 lssvmRadial 0.914 0.871 3.590 Least Squares Support Vector Machine with Radial Basis Function Kernel 76 | 35 kknn 0.912 0.867 1.150 k-Nearest Neighbors 77 | 24 gaussprRadial 0.909 0.864 2.310 Gaussian Process with Radial Basis Function Kernel 78 | 61 PenalizedLDA 0.902 0.853 0.880 Penalized Linear Discriminant Analysis 79 | 55 pam 0.900 0.850 0.870 Nearest Shrunken Centroids 80 | 51 nb 0.899 0.849 1.010 Naive Bayes 81 | 16 dda 0.891 0.837 2.260 Diagonal Discriminant Analysis 82 | 19 elm 0.875 0.812 1.020 Extreme Learning Machine 83 | 82 slda 0.836 0.753 0.970 Stabilized Linear Discriminant Analysis 84 | 84 spls 0.796 0.693 1.360 Sparse Partial Least Squares 85 | 34 kernelpls 0.794 0.692 0.630 Partial Least Squares 86 | 63 pls 0.794 0.692 0.630 Partial Least Squares 87 | 81 simpls 0.794 0.692 0.660 Partial Least Squares 88 | 98 widekernelpls 0.794 0.692 0.730 Partial Least Squares 89 | 71 RFlda 0.758 0.571 0.770 Factor-Based Linear Discriminant Analysis 90 | 46 Mlda 0.757 0.569 0.720 Maximum Uncertainty Linear Discriminant Analysis 91 | 83 sparseLDA 0.666 0.499 1.330 Sparse Linear Discriminant Analysis 92 | 4 bayesglm 0.665 0.495 0.850 Bayesian Generalized Linear Model 93 | 8 bstSm 0.665 0.495 2.780 Boosted Smoothing Spline 94 | 22 gam 0.665 0.495 2.540 Generalized Additive Model using Splines 95 | 23 gamLoess 0.665 0.495 1.130 Generalized Additive Model using LOESS 96 | 62 plr 0.665 0.495 0.860 Penalized Logistic Regression 97 | 73 rocc 0.665 0.495 0.890 ROC-Based Classifier 98 | 80 sdwd 0.665 0.495 1.570 Sparse Distance Weighted Discrimination 99 | 7 BstLm 0.588 0.380 2.330 Boosted Linear Model 100 | 17 dnn 0.320 0.000 2.620 Stacked AutoEncoder Deep Neural Network 101 | 28 gpls 1.000 2.340 Generalized Partial Least Squares 102 | -------------------------------------------------------------------------------- /caret-classification/caret-all-binary-class-PimaIndiansDiabetes.R: -------------------------------------------------------------------------------- 1 | # Use of all 160 caret models for binary classification and diabetes set 2 | # The output from fast (working) binary classification models is 3 | # exported to a 
sortable table in a web browser using the DT library 4 | # https://github.com/tobigithub/caret-machine-learning 5 | # R3.3.1 and caret_6.0-70 6 | # Tobias Kind (2016) 7 | 8 | # use mlbench, caret and DT library 9 | require(mlbench) 10 | require(caret) 11 | require(DT) 12 | 13 | # load diabetes set 768 x 9 14 | data(PimaIndiansDiabetes) 15 | dim(PimaIndiansDiabetes) 16 | 17 | # get all model names for classification 18 | m <- unique(modelLookup()[modelLookup()$forClass,c(1)]) 19 | length(m); m; 20 | 21 | # slow classification models ("rbf" crashes; "dwdLinear", "ownn", "snn" have issues) 22 | # all others may have just failed and are not listed here 23 | # 24 | removeModels <- c("AdaBag", "AdaBoost.M1", "FH.GBML", "pda2", "PenalizedLDA", 25 | "GFS.GCCL", "rbf", "RFlda", "nodeHarvest", "ORFsvm", "dwdLinear", "dwdPoly", "gam", 26 | "gaussprLinear", "ownn", "sddaLDA", "sddaQDA", "SLAVE", "smda", "snn", "rmda", 27 | "rFerns", "wsrf","ordinalNet","awnb", "awtan","manb","nbDiscrete","nbSearch","tan", 28 | "tanSearch","bartMachine","randomGLM", "Rborist", "adaboost") 29 | 30 | #remove all slow and failed models from model list 31 | m <- m[!m %in% removeModels] 32 | 33 | #m <- c("glm","gbm", "adaboost" ,"rf") 34 | 35 | # pre-load all packages (does not really work due to other dependencies) 36 | suppressPackageStartupMessages(ll <-lapply(m, require, character.only = TRUE)) 37 | 38 | # show which libraries were loaded 39 | sessionInfo() 40 | 41 | # load X and Y (this will be transferred to to train function) 42 | #X = PimaIndiansDiabetes[1:60,1:8] 43 | #Y = PimaIndiansDiabetes$diabetes[1:60] 44 | X = PimaIndiansDiabetes[,1:8] 45 | Y = PimaIndiansDiabetes$diabetes 46 | 47 | # register parallel front-end 48 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl) 49 | 50 | # this is required otherwise the first method is benchmarked wrong 51 | warmup <-train(y=Y, x=X, "rf", trControl = trainControl(method = "boot632")) 52 | 53 | # this setup actually calls the caret::train function, in order to provide 54 | # minimal error handling this type of construct is needed. 55 | trainCall <- function(i) 56 | { 57 | cat("----------------------------------------------------","\n"); 58 | set.seed(123); cat(i," <- loaded\n"); 59 | return(tryCatch( 60 | t2 <- train(y=Y, x=X, (i), trControl = trainControl(method = "boot632")), 61 | error=function(e) NULL)) 62 | } 63 | 64 | # use lapply/loop to run everything, required for try/catch error function to work 65 | t2 <- lapply(m, trainCall) 66 | 67 | #remove NULL values, we only allow succesful methods, provenance is deleted. 
68 | t2 <- t2[!sapply(t2, is.null)] 69 | 70 | # this setup extracts the results with minimal error handling 71 | # TrainKappa can be sometimes zero, but Accuracy SD can be still available 72 | # see Kappa value http://epiville.ccnmtl.columbia.edu/popup/how_to_calculate_kappa.html 73 | printCall <- function(i) 74 | { 75 | return(tryCatch( 76 | { 77 | cat(sprintf("%-22s",(m[i]))) 78 | cat(round(getTrainPerf(t2[[i]])$TrainAccuracy,4),"\t") 79 | cat(round(getTrainPerf(t2[[i]])$TrainKappa,4),"\t") 80 | cat(t2[[i]]$times$everything[3],"\n")}, 81 | error=function(e) NULL)) 82 | } 83 | 84 | r2 <- lapply(1:length(t2), printCall) 85 | 86 | # stop cluster and register sequntial front end 87 | stopCluster(cl); registerDoSEQ(); 88 | 89 | # preallocate data types 90 | i = 1; MAX = length(t2); 91 | x1 <- character() # Name 92 | x2 <- numeric() # R2 93 | x3 <- numeric() # RMSE 94 | x4 <- numeric() # time [s] 95 | x5 <- character() # long model name 96 | 97 | # fill data and check indexes and NA with loop/lapply 98 | for (i in 1:length(t2)) { 99 | x1[i] <- t2[[i]]$method 100 | x2[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainAccuracy,4)) 101 | x3[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainKappa,4)) 102 | x4[i] <- as.numeric(t2[[i]]$times$everything[3]) 103 | x5[i] <- t2[[i]]$modelInfo$label 104 | } 105 | 106 | # coerce to data frame 107 | df1 <- data.frame(x1,x2,x3,x4,x5, stringsAsFactors=FALSE) 108 | 109 | # print all results to R-GUI 110 | df1 111 | 112 | # plot models, just as example 113 | # ggplot(t2[[1]]) 114 | # ggplot(t2[[1]]) 115 | 116 | # call web output with correct column names 117 | datatable(df1, options = list( 118 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5))), 119 | pageLength = MAX, 120 | order = list(list(2, 'desc'))), 121 | colnames = c('Num', 'Name', 'Accuracy', 'Kappa', 'time [s]', 'Model name'), 122 | caption = paste('Classification results from caret models',Sys.time()), 123 | class = 'cell-border stripe') %>% 124 | formatRound('x2', 3) %>% 125 | formatRound('x3', 3) %>% 126 | formatRound('x4', 3) %>% 127 | formatStyle(2, 128 | background = styleColorBar(x2, 'steelblue'), 129 | backgroundSize = '100% 90%', 130 | backgroundRepeat = 'no-repeat', 131 | backgroundPosition = 'center' 132 | ) 133 | 134 | 135 | ### END 136 | 137 | -------------------------------------------------------------------------------- /caret-classification/caret-all-binary-class-PimaIndiansDiabetes.tsv: -------------------------------------------------------------------------------- 1 | Num Name Accuracy Kappa time [s] Model name 2 | 73 ORFlog 0.844 0.656 682 Oblique Random Forest 3 | 89 rf 0.849 0.656 4.24 Random Forest 4 | 75 ORFridge 0.846 0.655 1494.08 Oblique Random Forest 5 | 74 ORFpls 0.846 0.654 367.87 Oblique Random Forest 6 | 77 parRF 0.847 0.652 9.52 Parallel Random Forest 7 | 11 Boruta 0.847 0.652 132.71 Random Forest with Additional Feature Selection 8 | 86 ranger 0.845 0.651 23.89 Random Forest 9 | 30 extraTrees 0.846 0.648 20.36 Random Forest by Randomization 10 | 101 RRFglobal 0.842 0.643 5.21 Regularized Random Forest 11 | 120 treebag 0.836 0.63 2 Bagged CART 12 | 125 xgbLinear 0.834 0.628 43.35 eXtreme Gradient Boosting 13 | 126 xgbTree 0.831 0.623 13.64 eXtreme Gradient Boosting 14 | 100 RRF 0.822 0.602 11.14 Regularized Random Forest 15 | 24 deepboost 0.822 0.601 64.7 DeepBoost 16 | 87 rbfDDA 0.823 0.599 15.19 Radial Basis Function Network 17 | 94 rotationForestCp 0.804 0.555 9.82 Rotation Forest 18 | 84 protoclass 0.796 0.553 4.7 Greedy Prototype 
Selection 19 | 39 gbm 0.796 0.538 1.4 Stochastic Gradient Boosting 20 | 71 oblique.tree 0.79 0.534 91.98 Oblique Trees 21 | 93 rotationForest 0.793 0.527 5.33 Rotation Forest 22 | 16 C5.0Cost 0.789 0.524 10.3 Cost-Sensitive C5.0 23 | 15 C5.0 0.789 0.524 5.09 C5.0 24 | 118 svmRadialSigma 0.786 0.51 3.24 Support Vector Machines with Radial Basis Function Kernel 25 | 19 cforest 0.786 0.51 32.27 Conditional Inference Random Forest 26 | 1 ada 0.782 0.509 34.96 Boosted Classification Trees 27 | 37 gaussprPoly 0.785 0.506 638.57 Gaussian Process with Polynomial Kernel 28 | 80 pcaNNet 0.775 0.503 3.37 Neural Networks with Feature Extraction 29 | 35 gamLoess 0.779 0.5 1.39 Generalized Additive Model using LOESS 30 | 52 kknn 0.776 0.498 2.48 k-Nearest Neighbors 31 | 38 gaussprRadial 0.78 0.496 55.13 Gaussian Process with Radial Basis Function Kernel 32 | 31 fda 0.778 0.493 0.95 Flexible Discriminant Analysis 33 | 5 bagEarthGCV 0.777 0.492 10.6 Bagged MARS using gCV Pruning 34 | 27 earth 0.777 0.491 1.19 Multivariate Adaptive Regression Spline 35 | 40 gcvEarth 0.777 0.491 0.86 Multivariate Adaptive Regression Splines 36 | 4 bagEarth 0.777 0.491 23.94 Bagged MARS 37 | 119 svmRadialWeights 0.779 0.49 3.11 Support Vector Machines with Class Weights 38 | 116 svmRadial 0.779 0.49 1.49 Support Vector Machines with Radial Basis Function Kernel 39 | 117 svmRadialCost 0.778 0.488 1.52 Support Vector Machines with Radial Basis Function Kernel 40 | 96 rpart1SE 0.77 0.487 0.75 CART 41 | 14 bstTree 0.779 0.486 20.59 Boosted Tree 42 | 7 bagFDAGCV 0.774 0.483 8.12 Bagged FDA using gCV Pruning 43 | 34 gamboost 0.775 0.48 3.4 Boosted Generalized Additive Model 44 | 6 bagFDA 0.773 0.479 19.83 Bagged Flexible Discriminant Analysis 45 | 21 ctree 0.767 0.479 0.98 Conditional Inference Tree 46 | 115 svmPoly 0.778 0.477 6.96 Support Vector Machines with Polynomial Kernel 47 | 32 FRBCS.CHI 0.762 0.476 1254.79 Fuzzy Rules Using Chi's Method 48 | 58 loclda 0.774 0.476 5.1 Localized Linear Discriminant Analysis 49 | 43 glmnet 0.773 0.476 1.07 glmnet 50 | 88 rda 0.773 0.475 3.93 Regularized Discriminant Analysis 51 | 55 lda2 0.773 0.475 0.63 Linear Discriminant Analysis 52 | 2 amdai 0.773 0.475 0.62 Adaptive Mixture Discriminant Analysis 53 | 54 lda 0.773 0.475 0.62 Linear Discriminant Analysis 54 | 48 hdrda 0.773 0.475 8.02 High-Dimensional Regularized Discriminant Analysis 55 | 81 pda 0.773 0.475 0.83 Penalized Discriminant Analysis 56 | 8 bayesglm 0.773 0.475 0.75 Bayesian Generalized Linear Model 57 | 121 vglmAdjCat 0.773 0.474 2.15 Adjacent Categories Probability Model for Ordinal Data 58 | 36 gamSpline 0.773 0.474 1.94 Generalized Additive Model using Splines 59 | 68 multinom 0.773 0.474 0.85 Penalized Multinomial Regression 60 | 41 glm 0.773 0.474 0.66 Generalized Linear Model 61 | 82 plr 0.773 0.474 10.91 Penalized Logistic Regression 62 | 44 glmStepAIC 0.772 0.474 1.19 Generalized Linear Model with Stepwise Feature Selection 63 | 122 vglmContRatio 0.773 0.473 4.06 Continuation Ratio Model for Ordinal Data 64 | 123 vglmCumulative 0.773 0.473 2.68 Cumulative Probability Model for Ordinal Data 65 | 62 mda 0.773 0.473 1.31 Mixture Discriminant Analysis 66 | 109 spls 0.773 0.471 4.13 Sparse Partial Least Squares 67 | 91 rlda 0.74 0.471 1.67 Regularized Linear Discriminant Analysis 68 | 42 glmboost 0.771 0.469 1.11 Boosted Generalized Linear Model 69 | 102 rrlda 0.752 0.467 24.74 Robust Regularized Linear Discriminant Analysis 70 | 69 nb 0.764 0.466 3.65 Naive Bayes 71 | 114 svmLinearWeights 0.77 0.466 2.36 Linear Support 
Vector Machines with Class Weights 72 | 113 svmLinear2 0.77 0.466 1.25 Support Vector Machines with Linear Kernel 73 | 90 rfRules 0.769 0.466 475.51 Random Forest Rule-Based Model 74 | 97 rpart2 0.766 0.465 0.81 CART 75 | 112 svmLinear 0.769 0.464 0.92 Support Vector Machines with Linear Kernel 76 | 33 FRBCS.W 0.776 0.463 1218.4 Fuzzy Rules with Weight Factor 77 | 60 lssvmRadial 0.767 0.462 8.93 Least Squares Support Vector Machine with Radial Basis Function Kernel 78 | 105 sdwd 0.771 0.462 1.34 Sparse Distance Weighted Discrimination 79 | 63 Mlda 0.768 0.459 0.81 Maximum Uncertainty Linear Discriminant Analysis 80 | 18 C5.0Tree 0.754 0.455 0.7 Single C5.0 Tree 81 | 49 J48 0.756 0.455 2.93 C4.5-like Trees 82 | 104 sda 0.764 0.454 1.11 Shrinkage Discriminant Analysis 83 | 10 blackboost 0.763 0.448 4.98 Boosted Tree 84 | 23 dda 0.737 0.446 3.2 Diagonal Discriminant Analysis 85 | 50 JRip 0.753 0.442 3.62 Rule-Based Classifier 86 | 17 C5.0Rules 0.749 0.439 0.75 Single C5.0 Ruleset 87 | 78 PART 0.746 0.439 15.49 Rule-Based Classifier 88 | 45 gpls 0.757 0.434 15.14 Generalized Partial Least Squares 89 | 83 pls 0.758 0.434 0.81 Partial Least Squares 90 | 124 widekernelpls 0.758 0.434 0.78 Partial Least Squares 91 | 106 simpls 0.758 0.434 0.76 Partial Least Squares 92 | 51 kernelpls 0.758 0.434 0.73 Partial Least Squares 93 | 29 evtree 0.754 0.433 119.09 Tree Models from Genetic Algorithms 94 | 59 LogitBoost 0.747 0.43 1.04 Boosted Logistic Regression 95 | 46 hda 0.751 0.423 14.34 Heteroscedastic Discriminant Analysis 96 | 99 rpartScore 0.748 0.423 14.29 CART or Ordinal Responses 97 | 95 rpart 0.748 0.423 0.81 CART 98 | 13 bstSm 0.758 0.421 13.74 Boosted Smoothing Spline 99 | 85 qda 0.745 0.42 0.65 Quadratic Discriminant Analysis 100 | 57 LMT 0.74 0.419 5.76 Logistic Model Trees 101 | 56 Linda 0.737 0.418 1.91 Robust Linear Discriminant Analysis 102 | 47 hdda 0.74 0.416 0.86 High Dimensional Discriminant Analysis 103 | 92 rocc 0.751 0.412 1.53 ROC-Based Classifier 104 | 53 knn 0.739 0.41 0.77 k-Nearest Neighbors 105 | 22 ctree2 0.746 0.409 1.53 Conditional Inference Tree 106 | 3 avNNet 0.746 0.394 12.67 Model Averaged Neural Network 107 | 110 stepLDA 0.744 0.391 7.36 Linear Discriminant Analysis with Stepwise Feature Selection 108 | 20 CSimca 0.692 0.389 1.22 SIMCA 109 | 111 stepQDA 0.744 0.387 7.4 Quadratic Discriminant Analysis with Stepwise Feature Selection 110 | 79 partDSA 0.725 0.377 12.29 partDSA 111 | 26 dwdRadial 0.776 0.368 35 Distance Weighted Discrimination with Radial Basis Function Kernel 112 | 103 RSimca 0.67 0.367 2.17 Robust SIMCA 113 | 127 xyf 0.715 0.339 4.74 Self-Organizing Maps 114 | 9 bdk 0.713 0.323 5.17 Self-Organizing Map 115 | 76 pam 0.729 0.321 7 Nearest Shrunken Centroids 116 | 98 rpartCost 0.729 0.316 0.87 Cost-Sensitive CART 117 | 72 OneR 0.706 0.313 0.99 Single Rule Classification 118 | 61 lvq 0.705 0.296 1.35 Learning Vector Quantization 119 | 70 nnet 0.704 0.295 2.8 Neural Network 120 | 28 elm 0.669 0.171 1.17 Extreme Learning Machine 121 | 107 slda 0.671 0.143 1.39 Stabilized Linear Discriminant Analysis 122 | 12 BstLm 0.644 0.026 2.84 Boosted Linear Model 123 | 67 mlpWeightDecayML 0.648 0 34.63 Multi-Layer Perceptron multiple layers 124 | 66 mlpWeightDecay 0.648 0 34.26 Multi-Layer Perceptron 125 | 25 dnn 0.648 0 5.22 Stacked AutoEncoder Deep Neural Network 126 | 108 sparseLDA 0.354 -0.001 1.66 Sparse Linear Discriminant Analysis 127 | 64 mlp 0.647 -0.001 12.56 Multi-Layer Perceptron 128 | 65 mlpML 0.647 -0.001 12.02 Multi-Layer Perceptron with multiple layers 
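The table above (caret-all-binary-class-PimaIndiansDiabetes.tsv) is plain tab-separated text, so the benchmark can be pulled back into R and re-ranked without rerunning any of the models. A minimal sketch, assuming the file sits in the current working directory:

# read the benchmark table; check.names = FALSE keeps "time [s]" and "Model name" as column names
res <- read.delim("caret-all-binary-class-PimaIndiansDiabetes.tsv", check.names = FALSE, stringsAsFactors = FALSE)
# re-rank the models by Kappa and show the ten best
head(res[order(-res$Kappa), c("Name", "Accuracy", "Kappa", "time [s]")], 10)
# relate accuracy to training time (log scale, since times range from under a second to ~1500 s)
plot(res$`time [s]`, res$Accuracy, log = "x", xlab = "training time [s]", ylab = "Accuracy")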
129 | 130 | -------------------------------------------------------------------------------- /caret-classification/iris-classification-all-fast.R: -------------------------------------------------------------------------------- 1 | # A selection of "fast" of all 160 caret models for multi-class classification and iris set 2 | # A number of slow and broken models are excluded, this may change with each release 3 | # The output from fast (working) binary classification models is 4 | # exported to a sortable table in a web browser using the DT library 5 | # Total runtime 145 seconds on 16 core (3.1GHz) (all single ML methods <4 seconds) 6 | # https://github.com/tobigithub/caret-machine-learning 7 | # 8 | # Warning: requires DeLuxe installation of all caret dependencies. 9 | # Warning: invokes DLL hell under Windows "maximal number of DLLs reached..." 10 | # https://github.com/tobigithub/caret-machine-learning/tree/master/caret-setup 11 | # 12 | # R3.3.1 and caret 6.0-70 13 | # Tobias Kind (2016) 14 | 15 | # use mlbench, caret and DT library, please make sure they are already installed 16 | require(mlbench) 17 | require(caret) 18 | require(DT) 19 | 20 | # load iris set 21 | data(iris) 22 | dim(iris) 23 | 24 | # get all model names for classification 25 | m <- unique(modelLookup()[modelLookup()$forClass,c(1)]) 26 | length(m); m; 27 | 28 | # slow classification models ("rbf" crashes; "dwdLinear", "ownn", "snn" have issues) 29 | # all others may have just failed and are not listed here, models may be very accurate 30 | removeModels <- c("AdaBag","AdaBoost.M1","pda2","dwdRadial","rbf","dwdLinear", "dwdPoly", 31 | "gaussprLinear","gaussprPoly","rFerns","sddaLDA", "smda", "sddaQDA", "xgbLinear","xgbTree", 32 | "AdaBag","FH.GBML","ORFsvm","ownn","vbmpRadial","SLAVE","ORFlog","GFS.GCCL","ORFpls", 33 | "snn", "bagEarth","ORFridge","rmda","awnb", "awtan", "manb", "nbDiscrete", "nbSearch", 34 | "ordinalNet", "blackboost","tan","tanSearch","randomGLM","Rborist", 35 | "FRBCS.W", "FRBCS.CHI","evtree","bstTree","bagEarthGCV","bagFDA","rrlda") 36 | 37 | #remove all slow and failed models from model list 38 | m <- m[!m %in% removeModels] 39 | 40 | # not multiclass 41 | # Something is wrong; all the Accuracy metric values are missing: 42 | removeModels <- c("ada","adaboost","bag","bartMachine","binda","C5.0Cost","chaid", 43 | "deepboost","gamboost","glm","glmboost","glmStepAIC") 44 | 45 | #remove multiclass fails from model list 46 | m <- m[!m %in% removeModels] 47 | 48 | # pre-load all packages (does not really work due to other dependencies) 49 | suppressPackageStartupMessages(ll <-lapply(m, require, character.only = TRUE)) 50 | 51 | # show which libraries were loaded 52 | sessionInfo() 53 | 54 | # load X and Y (this will be transferred to to train function) 55 | X = iris[,1:3] 56 | Y = iris$Species 57 | 58 | # register parallel front-end 59 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl) 60 | 61 | # this is required otherwise the first method is benchmarked wrong 62 | warmup <-train(y=Y, x=X, "rf", trControl = trainControl(method = "boot632")) 63 | 64 | # this setup actually calls the caret::train function, in order to provide 65 | # minimal error handling this type of construct is needed. 
66 | trainCall <- function(i) 67 | { 68 | cat("----------------------------------------------------","\n"); 69 | set.seed(123); cat(i," <- loaded\n"); 70 | return(tryCatch( 71 | t2 <- train(y=Y, x=X, (i), trControl = trainControl(method = "boot632")), 72 | error=function(e) NULL)) 73 | } 74 | 75 | # use lapply/loop to run everything, required for try/catch error function to work 76 | t2 <- lapply(m, trainCall) 77 | 78 | #remove NULL values, we only allow succesful methods, provenance is deleted. 79 | t2 <- t2[!sapply(t2, is.null)] 80 | 81 | # this setup extracts the results with minimal error handling 82 | # TrainKappa can be sometimes zero, but Accuracy SD can be still available 83 | # see Kappa value http://epiville.ccnmtl.columbia.edu/popup/how_to_calculate_kappa.html 84 | printCall <- function(i) 85 | { 86 | return(tryCatch( 87 | { 88 | cat(sprintf("%-22s",(m[i]))) 89 | cat(round(getTrainPerf(t2[[i]])$TrainAccuracy,4),"\t") 90 | cat(round(getTrainPerf(t2[[i]])$TrainKappa,4),"\t") 91 | cat(t2[[i]]$times$everything[3],"\n")}, 92 | error=function(e) NULL)) 93 | } 94 | 95 | r2 <- lapply(1:length(t2), printCall) 96 | 97 | # stop cluster and register sequntial front end 98 | stopCluster(cl); registerDoSEQ(); 99 | 100 | # preallocate data types 101 | i = 1; MAX = length(t2); 102 | x1 <- character() # Name 103 | x2 <- numeric() # R2 104 | x3 <- numeric() # RMSE 105 | x4 <- numeric() # time [s] 106 | x5 <- character() # long model name 107 | 108 | # fill data and check indexes and NA with loop/lapply 109 | for (i in 1:length(t2)) { 110 | x1[i] <- t2[[i]]$method 111 | x2[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainAccuracy,4)) 112 | x3[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainKappa,4)) 113 | x4[i] <- as.numeric(t2[[i]]$times$everything[3]) 114 | x5[i] <- t2[[i]]$modelInfo$label 115 | } 116 | 117 | # coerce to data frame 118 | df1 <- data.frame(x1,x2,x3,x4,x5, stringsAsFactors=FALSE) 119 | 120 | # print all results to R-GUI 121 | df1 122 | 123 | # plot models, just as example 124 | # ggplot(t2[[1]]) 125 | # ggplot(t2[[1]]) 126 | 127 | # call web output with correct column names 128 | DT::datatable(df1, options = list( 129 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5))), 130 | pageLength = MAX, 131 | order = list(list(3, 'desc'))), # sort according to kappa value 132 | colnames = c('Num', 'Name', 'Accuracy', 'Kappa', 'time [s]', 'Model name'), 133 | caption = paste('Classification results from caret models',Sys.time()), 134 | class = 'cell-border stripe') %>% 135 | formatRound('x2', 3) %>% 136 | formatRound('x3', 3) %>% 137 | formatRound('x4', 3) %>% 138 | formatStyle(2, 139 | background = styleColorBar(x2, 'steelblue'), 140 | backgroundSize = '100% 90%', 141 | backgroundRepeat = 'no-repeat', 142 | backgroundPosition = 'center' 143 | ) 144 | 145 | ### END 146 | -------------------------------------------------------------------------------- /caret-classification/iris-classification-caret-all.R: -------------------------------------------------------------------------------- 1 | # Use of all 160 caret models for multi-class classification and iris set 2 | # The output from fast (working) binary classification models is 3 | # exported to a sortable table in a web browser using the DT library 4 | # https://github.com/tobigithub/caret-machine-learning 5 | # Tobias Kind (2015) 6 | 7 | # use mlbench, caret and DT library 8 | require(mlbench) 9 | require(caret) 10 | require(DT) 11 | 12 | # load iris set 13 | data(iris) 14 | dim(iris) 15 | 16 | # get all model 
names for classification 17 | m <- unique(modelLookup()[modelLookup()$forClass,c(1)]) 18 | length(m); m; 19 | 20 | # slow classification models ("rbf" crashes; "dwdLinear", "ownn", "snn" have issues) 21 | # all others may have just failed and are not listed here 22 | removeModels <- c("AdaBoost.M1","pda2","dwdRadial","rbf","dwdLinear", "dwdPoly", 23 | "gaussprLinear","gaussprPoly","rFerns","sddaLDA", "smda", "sddaQDA", "xgbLinear") 24 | 25 | #remove all slow and failed models from model list 26 | m <- m[!m %in% removeModels] 27 | 28 | # pre-load all packages (does not really work due to other dependencies) 29 | suppressPackageStartupMessages(ll <-lapply(m, require, character.only = TRUE)) 30 | 31 | # show which libraries were loaded 32 | sessionInfo() 33 | 34 | # load X and Y (this will be transferred to to train function) 35 | X = iris[,1:3] 36 | Y = iris$Species 37 | 38 | # register parallel front-end 39 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl) 40 | 41 | # this setup actually calls the caret::train function, in order to provide 42 | # minimal error handling this type of construct is needed. 43 | trainCall <- function(i) 44 | { 45 | cat("----------------------------------------------------","\n"); 46 | set.seed(123); cat(i," <- loaded\n"); 47 | return(tryCatch( 48 | t2 <- train(y=Y, x=X, (i), trControl = trainControl(method = "boot632")), 49 | error=function(e) NULL)) 50 | } 51 | 52 | # use lapply/loop to run everything, required for try/catch error function to work 53 | t2 <- lapply(m, trainCall) 54 | 55 | #remove NULL values, we only allow succesful methods, provenance is deleted. 56 | t2 <- t2[!sapply(t2, is.null)] 57 | 58 | # this setup extracts the results with minimal error handling 59 | # TrainKappa can be sometimes zero, but Accuracy SD can be still available 60 | # see Kappa value http://epiville.ccnmtl.columbia.edu/popup/how_to_calculate_kappa.html 61 | printCall <- function(i) 62 | { 63 | return(tryCatch( 64 | { 65 | cat(sprintf("%-22s",(m[i]))) 66 | cat(round(getTrainPerf(t2[[i]])$TrainAccuracy,4),"\t") 67 | cat(round(getTrainPerf(t2[[i]])$TrainKappa,4),"\t") 68 | cat(t2[[i]]$times$everything[3],"\n")}, 69 | error=function(e) NULL)) 70 | } 71 | 72 | r2 <- lapply(1:length(t2), printCall) 73 | 74 | # stop cluster and register sequntial front end 75 | stopCluster(cl); registerDoSEQ(); 76 | 77 | # preallocate data types 78 | i = 1; MAX = length(t2); 79 | x1 <- character() # Name 80 | x2 <- numeric() # R2 81 | x3 <- numeric() # RMSE 82 | x4 <- numeric() # time [s] 83 | x5 <- character() # long model name 84 | 85 | # fill data and check indexes and NA with loop/lapply 86 | for (i in 1:length(t2)) { 87 | x1[i] <- t2[[i]]$method 88 | x2[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainAccuracy,4)) 89 | x3[i] <- as.numeric(round(getTrainPerf(t2[[i]])$TrainKappa,4)) 90 | x4[i] <- as.numeric(t2[[i]]$times$everything[3]) 91 | x5[i] <- t2[[i]]$modelInfo$label 92 | } 93 | 94 | # coerce to data frame 95 | df1 <- data.frame(x1,x2,x3,x4,x5, stringsAsFactors=FALSE) 96 | 97 | # print all results to R-GUI 98 | df1 99 | 100 | # plot models, just as example 101 | # ggplot(t2[[1]]) 102 | # ggplot(t2[[1]]) 103 | 104 | # call web output with correct column names 105 | datatable(df1, options = list( 106 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5))), 107 | pageLength = MAX, 108 | order = list(list(2, 'desc'))), 109 | colnames = c('Num', 'Name', 'Accuracy', 'Kappa', 'time [s]', 'Model name'), 110 | caption = paste('Classification 
results from caret models',Sys.time()), 111 | class = 'cell-border stripe') %>% 112 | formatRound('x2', 3) %>% 113 | formatRound('x3', 3) %>% 114 | formatRound('x4', 3) %>% 115 | formatStyle(2, 116 | background = styleColorBar(x2, 'steelblue'), 117 | backgroundSize = '100% 90%', 118 | backgroundRepeat = 'no-repeat', 119 | backgroundPosition = 'center' 120 | ) 121 | 122 | # print confusion matrix example 123 | caret::confusionMatrix(t2[[1]]) 124 | 125 | 126 | ### END 127 | -------------------------------------------------------------------------------- /caret-cv/HAR-all-CV-methods.R: -------------------------------------------------------------------------------- 1 | 2 | # Parallel Random Forest and knn with multiple CV methods 3 | # Data: http://groupware.les.inf.puc-rio.br/har 4 | # Sources used: 5 | # http://bigcomputing.blogspot.com/2014/10/an-example-of-using-random-forest-in.html 6 | # https://www.coursera.org/specializations/jhudatascience?utm_medium=courseDescripTop 7 | # https://rstudio-pubs-static.s3.amazonaws.com/89748_264cbfde747d4d779d7bd6b9b3f31f45.html 8 | # Google -> "InTrain<-createDataPartition(" predict(rf_model,test) "pml-training.csv" 9 | # Google -> "Using devices such as Jawbone Up, Nike FuelBand, and Fitbit" caret 10 | # Google -> https://www.google.com/?gws_rd=ssl#q=%22B+A+B+A+A+E+D+B+A+A+B+C+B+A+E+E+A+B+B+B%22 11 | # https://yoke2.github.io/PMLCourseProject/pmlreport.html 12 | # https://github.com/tobigithub/caret-machine-learning 13 | # Tobias Kind (2015) 14 | 15 | 16 | library(caret) 17 | require(ggplot2) 18 | require(randomForest) 19 | 20 | library(doSNOW) 21 | library(parallel) 22 | 23 | training_URL<-"http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv" 24 | test_URL<-"http://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv" 25 | training<-read.csv(training_URL,na.strings=c("NA","")) 26 | test<-read.csv(test_URL,na.strings=c("NA","")) 27 | 28 | training<-training[,7:160] 29 | test<-test[,7:160] 30 | 31 | mostly_data<-apply(!is.na(training),2,sum)>19621 32 | training<-training[,mostly_data] 33 | test<-test[,mostly_data] 34 | dim(training) 35 | 36 | 37 | #plot feature wise 38 | #https://rpubs.com/davizuku/practical_ml 39 | selCols <- grep("^accel_", names(training)); 40 | featurePlot(x = training[,selCols], 41 | y = training$classe, 42 | plot = "boxplot"); 43 | 44 | selCols <- grep("^magnet_", names(training)); 45 | featurePlot(x = training[,selCols], 46 | y = training$classe, 47 | plot = "boxplot"); 48 | 49 | selCols <- grep("^gyros_", names(training)); 50 | featurePlot(x = training[,selCols], 51 | y = training$classe, 52 | plot = "boxplot"); 53 | 54 | 55 | #plot training data 56 | featurePlot(x=training[,c(1:12)], y=training$classe, plot = 'box') 57 | 58 | InTrain<-createDataPartition(y=training$classe,p=0.3,list=FALSE) 59 | training1<-training[InTrain,] 60 | 61 | # detect true cores requires parallel() 62 | nCores <- detectCores(logical = FALSE) 63 | # detect threads 64 | nThreads <- detectCores(logical = TRUE) 65 | 66 | cl <- makeCluster(nThreads, type="SOCK") 67 | registerDoSNOW(cl); cl; 68 | getDoParWorkers() 69 | getDoParName() 70 | 71 | #------------------------------------------------------------ 72 | # rf usually works 73 | ptm <- proc.time() 74 | rf_model<-train(classe~.,data=training1,method="rf") 75 | # method="repeatedcv", number=10, repeats=3 ## repeated k-fold Cross Validation 76 | # method="cv",number=5 ## k-fold Cross Validation 77 | # method="LOOCV" ## Leave One Out Cross Validation 78 | # method="boot", number=100 
## Bootstrap 79 | # method = "boot632" ## The .632+ Bootstrap 80 | # trControl=trainControl(method="boot632"), 81 | # prox=TRUE,allowParallel=TRUE) 82 | proc.time() - ptm 83 | 84 | #------------------------------------------------------------ 85 | # knn 86 | ptm <- proc.time() 87 | model1<-train(classe~.,data=training1,method="knn") 88 | proc.time() - ptm 89 | 90 | #------------------------------------------------------------ 91 | # "repeatedcv" ## repeated k-fold Cross Validation 92 | ptm <- proc.time() 93 | model2<-train(classe~.,data=training1,method="knn", 94 | trControl=trainControl(method="repeatedcv", number=3, repeats=3)) ## repeated k-fold Cross Validation 95 | proc.time() - ptm 96 | #------------------------------------------------------------ 97 | # "cv" ## k-fold Cross Validation 98 | ptm <- proc.time() 99 | model3<-train(classe~.,data=training1,method="knn", 100 | trControl=trainControl(method="cv",number=3)) ## k-fold Cross Validation 101 | proc.time() - ptm 102 | #------------------------------------------------------------ 103 | # "LOOCV" ## Leave One Out Cross Validation 104 | ptm <- proc.time() 105 | model4<-train(classe~.,data=training1,method="knn", 106 | trControl=trainControl(method="LOOCV", repeats=1)) ## Leave One Out Cross Validation 107 | proc.time() - ptm 108 | #------------------------------------------------------------ 109 | # "boot" ## Bootstrap 110 | ptm <- proc.time() 111 | model5<-train(classe~.,data=training1,method="knn", 112 | trControl=trainControl(method="boot", number=10)) ## Bootstrap 113 | proc.time() - ptm 114 | #------------------------------------------------------------ 115 | # "boot632" ## The .632+ Bootstrap 116 | ptm <- proc.time() 117 | model6<-train(classe~.,data=training1,method="knn", 118 | trControl=trainControl(method="boot632")) ## The .632+ Bootstrap 119 | proc.time() - ptm 120 | #------------------------------------------------------------ 121 | ## Times for splits and trains 122 | ## user system elapsed 123 | ## rf............ 17.78 0.74 126.16 124 | ## knn........... 0.87 0.80 19.76 125 | ## knn-repeatedcv 0.75 0.56 3.12 126 | ## knn-cv........ 0.69 0.45 1.81 127 | ## knn-LOOCV..... 77.60 34.54 120.77 128 | ## knn-boot...... 
0.69 0.67 6.35 129 | ## knn-boot632 0.99 0.80 23.12 130 | #------------------------------------------------------------ 131 | rf_model 132 | model1 133 | model2 134 | model3 135 | model4 136 | model5 137 | model6 138 | #------------------------------------------------------------ 139 | 140 | print(rf_model) 141 | print(rf_model$finalModel) 142 | plot(rf_model$finalModel) 143 | rf_model$results 144 | 145 | # number of variables per level (mtry) 146 | confusionMatrix(rf_model) 147 | plot(rf_model) 148 | 149 | #QPLOT 150 | qplot(roll_belt, magnet_dumbbell_y, colour=classe, data=training) 151 | 152 | rf_test <- predict(rf_model,test) 153 | rf_test 154 | # Correct solution 155 | #B A B A A E D B A A B C B A E E A B B B 156 | 157 | stopCluster(cl) 158 | 159 | #--- register foreach for sequential mode 160 | registerDoSEQ() 161 | 162 | ### END 163 | -------------------------------------------------------------------------------- /caret-cv/caret-all-cv-methods-lapply-sapply.R: -------------------------------------------------------------------------------- 1 | # Run simple cross-validation method with caret and knn 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # All caret cross-validation methods applied using lapply (list result) 6 | # regression example using knn (very fast); "none" is not allowed for lapply 7 | 8 | require(caret); data(BloodBrain); 9 | cvMethods <- c("boot632","LGOCV","LOOCV","cv","repeatedcv", "boot"); 10 | all <- lapply(cvMethods ,function(x) {set.seed(123); print(x); tc <- trainControl(method=(x)) 11 | fit1 <- train(bbbDescr, logBBB, trControl=tc, method="knn") }) 12 | all 13 | 14 | # just to show the structure of output 15 | # sapply(all,getTrainPerf) 16 | # lapply(all,getTrainPerf) 17 | 18 | # extract the used cvMethods (redundant because already incvMethods) 19 | myNames <- lapply(1:6, function(x) all[[x]]$control$method) 20 | # save results 21 | results <- sapply(all,getTrainPerf) 22 | # change column Names to cv methods 23 | colnames(results) <- myNames; 24 | # get the results 25 | results 26 | 27 | # boot632 LGOCV LOOCV cv repeatedcv boot 28 | # TrainRMSE 0.619778 0.6275048 0.6309407 0.6192086 0.6192086 0.66943 29 | # TrainRsquared 0.4009745 0.3554037 0.3429081 0.3831812 0.3831812 0.3140373 30 | # method "knn" "knn" "knn" "knn" "knn" "knn" 31 | 32 | #--------------------------------------------------------------------------- 33 | 34 | # All cross-validation methods applied using sapply (matrix result) 35 | # regression example using knn (very fast); "none" is not allowed for lapply 36 | 37 | require(caret); data(BloodBrain); 38 | cvMethods <- c("boot632","LGOCV","LOOCV","cv","repeatedcv", "boot" ); 39 | all <- sapply(cvMethods ,function(x) {set.seed(123); print(x); tc <- trainControl(method=(x)) 40 | fit1 <- train(bbbDescr, logBBB, trControl=tc, method="knn") }); all 41 | all[4, ] 42 | 43 | # boot632 LGOCV LOOCV cv repeatedcv boot 44 | # method "knn" "knn" "knn" "knn" "knn" "knn" 45 | # modelInfo List,13 List,13 List,13 List,13 List,13 List,13 46 | # modelType "Regression" "Regression" "Regression" "Regression" "Regression" "Regression" 47 | # results List,7 List,5 List,3 List,5 List,5 List,5 48 | # pred NULL NULL List,4 NULL NULL NULL 49 | # bestTune List,1 List,1 List,1 List,1 List,1 List,1 50 | # call Expression Expression Expression Expression Expression Expression 51 | # dots List,0 List,0 List,0 List,0 List,0 List,0 52 | # metric "RMSE" "RMSE" "RMSE" "RMSE" "RMSE" "RMSE" 53 | # control List,26 List,26 List,26 List,26 
List,26 List,26
54 | # finalModel List,7 List,7 List,7 List,7 List,7 List,7
55 | # preProcess NULL NULL NULL NULL NULL NULL
56 | # trainingData List,135 List,135 List,135 List,135 List,135 List,135
57 | # resample List,3 List,3 NULL List,3 List,3 List,3
58 | # resampledCM NULL NULL NULL NULL NULL NULL
59 | # perfNames Character,2 Character,2 Character,2 Character,2 Character,2 Character,2
60 | # maximize FALSE FALSE FALSE FALSE FALSE FALSE
61 | # yLimits Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2
62 | # times List,3 List,3 List,3 List,3 List,3 List,3
63 |
64 |
65 | ### END
66 |
-------------------------------------------------------------------------------- /caret-cv/caret-all-cv-parallel-cubist.R: --------------------------------------------------------------------------------
1 | # Run all cross-validation methods with cubist
2 | # Read: http://rulequest.com/cubist-examples.html
3 | # Read: https://cran.r-project.org/web/packages/Cubist/vignettes/cubist.pdf
4 | # Read: http://www.r-bloggers.com/ensemble-learning-with-cubist-model/
5 | #
6 | # https://github.com/tobigithub/caret-machine-learning
7 | # Tobias Kind (2015)
8 |
9 |
10 | # load libs
11 | require(caret); data(BloodBrain);
12 |
13 | # register parallel client
14 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl)
15 |
16 | # define all cross-validation methods
17 | cvMethods <- c("boot632","LGOCV","LOOCV","cv","repeatedcv", "boot");
18 |
19 | # use R lapply function to loop through all CV methods with cubist
20 | all <- lapply(cvMethods ,function(x) {set.seed(123); print(x); tc <- trainControl(method=(x))
21 | fit1 <- train(bbbDescr, logBBB, trControl=tc, method="cubist") }); all;
22 |
23 | # extract the used cvMethods (redundant because already in cvMethods)
24 | myNames <- lapply(1:6, function(x) all[[x]]$control$method)
25 |
26 | # save results
27 | results <- sapply(all,getTrainPerf)
28 |
29 | # change column names to CV methods
30 | colnames(results) <- myNames;
31 |
32 | # get the results
33 | results
34 |
35 | # stop cluster
36 | stopCluster(cl); registerDoSEQ();
37 |
38 | # boot632 LGOCV LOOCV cv repeatedcv boot
39 | # TrainRMSE 0.3794002 0.4959378 0.4997026 0.4933169 0.4930747 0.5617455
40 | # TrainRsquared 0.6743715 0.6067721 0.5875271 0.603017 0.6032699 0.4883528
41 | # method "cubist" "cubist" "cubist" "cubist" "cubist" "cubist"
42 |
43 |
44 | ### END
45 |
-------------------------------------------------------------------------------- /caret-cv/caret-all-cv-parallel-qrf.R: --------------------------------------------------------------------------------
1 | # Run all cross-validation methods with qrf (Quantile Random Forest)
2 | # https://github.com/tobigithub/caret-machine-learning
3 | # Tobias Kind (2015)
4 |
5 |
6 | # load libs
7 | require(caret); data(BloodBrain);
8 |
9 | # register parallel client
10 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl)
11 |
12 | # define all cross-validation methods
13 | cvMethods <- c("boot632","LGOCV","LOOCV","cv","repeatedcv", "boot");
14 |
15 | # use R lapply function to loop through all CV methods with qrf
16 | all <- lapply(cvMethods ,function(x) {set.seed(123); print(x); tc <- trainControl(method=(x))
17 | fit1 <- train(bbbDescr, logBBB, trControl=tc, method="qrf") }); all;
18 |
19 | # extract the used cvMethods (redundant because already in cvMethods)
20 | myNames <- lapply(1:6, function(x) all[[x]]$control$method)
21 |
22 | # save results
23 | results <- sapply(all,getTrainPerf)
24 |
25 | #
change column Names to cv methods 26 | colnames(results) <- myNames; 27 | 28 | # get the results 29 | results 30 | 31 | # stop cluster 32 | stopCluster(cl); registerDoSEQ(); 33 | 34 | # boot632 LGOCV LOOCV cv repeatedcv boot 35 | # TrainRMSE 0.4199394 0.5450903 0.5264716 0.5210002 0.5127061 0.5539934 36 | # TrainRsquared 0.6829296 0.5193978 0.5474211 0.561647 0.5776622 0.5350395 37 | # method "qrf" "qrf" "qrf" "qrf" "qrf" "qrf" 38 | 39 | ### END 40 | -------------------------------------------------------------------------------- /caret-cv/caret-cv-simple.R: -------------------------------------------------------------------------------- 1 | # Run simple cross-validation method with caret and knn 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # Single example, no cross-validation 6 | require(caret); data(BloodBrain); set.seed(123); 7 | fit1 <- train(bbbDescr, logBBB, "knn"); fit1 8 | 9 | # cross-validation example with method boot 10 | require(caret); data(BloodBrain); set.seed(123); 11 | tc <- trainControl(method="boot") 12 | fit1 <- train(bbbDescr, logBBB, trControl=tc, method="knn"); fit1 13 | 14 | 15 | ### END 16 | -------------------------------------------------------------------------------- /caret-datasets/caret-MS-datasets.csv: -------------------------------------------------------------------------------- 1 | Num,Data set,library,Reg/Class,Y-target,X-input,Dimension,Design 2 | 1,data(iris),R,Class,Species,remaining data,150 x 5,multi class 3 | 2,data(trees),R,NA,NA,remaining data,31x 3,multi class 4 | 3,data(Glass),R,Class,Type,remaining data,214 x 10,multi class 5 | 4,data(cox2),caret,Class,cox2Class ,"cox2Descr,cox2IC50",462x 255,Two class 6 | 5,data(oil),caret,Class,oilType,fattyAcids,96 x 7,multi class 7 | 6,data(dhfr),caret,Reg,Y,remaining data,325 x 229,multi param 8 | 7,data(GermanCredit),caret,Reg/Class,Class,remaining data,1000 x 62,multi param 9 | 8,data(BostonHousing),mlbench,Reg,medv,remaining data,506x14,multi param 10 | 9,data(BloodBrain),caret,Reg,logBBB ,bbbDescr ,208 x 134,multi param 11 | 10,data(mdrr),caret,Class,mdrrClass,mdrrDescr,528 x 342,Two class 12 | 11,data(Satellite) ,mlbench,Class,classes,remaining data,6435 x 37,multi class 13 | 12,data(cars),caret,Reg/Class,any,any,804x15,mixed 14 | 13,data(dhfr) ,caret,Class,Y,remaining data,325x229,Two class 15 | 14,data(pottery) ,caret,Class,potteryClass ,pottery ,NA,Two class 16 | 15,data(segmentationData) ,caret,Class,Class,remaining data,2019 x 61,Two class 17 | 16,data(tecator) ,caret,Reg,endpoints ,absorp ,215x 100,multi param 18 | 17,data(abalone) ,APM,Class,Type,remaining data,4177 x 9,multi class 19 | 18,data(AlzheimerDisease) ,APM,Class,diagnosis ,predictors ,333 x 130,Two class 20 | 19,data(ChemicalManufacturingProcess) ,APM,Reg,Yield,remaining data,176 x 58,multi param 21 | 20,data(concrete) ,APM,Reg,CompressiveStrength ,remaining data,1030 x 9,multi param 22 | 21,data(FuelEconomy) ,APM,Reg/Class,,,3 sets,multi param 23 | 22,data(hepatic) ,APM,Class,injury ,remaining data,2 sets,multi param 24 | 23,data(solubility) ,APM,Reg,solTestY ,,1267 x 228,multi param 25 | 24,data(permeability) ,APM,Reg,permeability ,fingerprints ,165 x 1107,multi param 26 | 25,data(schedulingData) ,APM,Class,Class,remaining data,4331x8,multi class 27 | 26,data(segmentationOriginal) ,APM,Class,Class,remaining data,2019 x 119, 28 | 27,data(twoClassData) ,APM,Class,classes ,predictors ,208 x 2,Two class 29 | 28,data(BreastCancer) ,mlbench,Class,Class ,remaining data,699 x 11,Two class 
30 | 29,data(PimaIndiansDiabetes) ,mlbench,Class,diabetes ,remaining data,768 x 9,Two class 31 | 30,data(Sonar) ,mlbench,Class,Class ,remaining data,208x61,Two class 32 | 31,Human Activity Recognition (HAR) ,puc-rio.br,Class,“class”,remaining data,165634 x 21 ,multi class 33 | -------------------------------------------------------------------------------- /caret-datasets/view-caret-ML-datasets.R: -------------------------------------------------------------------------------- 1 | # View and load ML datasets for working with caret 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # show all available data sets installed 6 | library(caret); library(datasets); library(AppliedPredictiveModeling); library(mlbench); data(); 7 | 8 | # load the dataset 9 | data(iris) 10 | # get the dimension of the dataset 11 | dim(iris) 12 | ## [1] 150 5 13 | length(iris) 14 | ## [1] 5 15 | # get the class name (here data frame) to choose correct operators 16 | class(iris) 17 | ## [1] "data.frame" 18 | 19 | # invoke simple data viewer 20 | View(iris) 21 | # invoke the useless editor 22 | edit(iris) 23 | # get the data structure 24 | str(iris) 25 | > str(iris) 26 | ##'data.frame': 150 obs. of 5 variables: 27 | ## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 28 | ## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 29 | ## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 30 | ## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 31 | ## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 32 | 33 | -------------------------------------------------------------------------------- /caret-parallel/caret-parallel-train-cubist.R: -------------------------------------------------------------------------------- 1 | # Run caret models "cubist" in parallel 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | library(doParallel); cl <- makeCluster(16); registerDoParallel(cl) 6 | require(caret); data(BloodBrain); 7 | fit1 <- train(bbbDescr, logBBB, "cubist"); 8 | fit1; fit1$times$everything 9 | stopCluster(cl); registerDoSEQ(); 10 | 11 | # /user time/ is the actual caret training time 12 | # /system time/ is operating system overhead 13 | # /elapse time/ is total run-time 14 | 15 | # for parallel 2x total speed-up 16 | # but parallel 45x training speed-up (!) 
17 | # parallel and caret overhead are 42 sec
18 | # Hence overhead for short methods is quite large
19 | # Overhead for longer train methods will be small
20 |
21 | # Cubist with one CPU [s]
22 | # user system elapsed
23 | # 91.20 0.04 91.27
24 |
25 | # Cubist with 16 CPUs [s]
26 | # user system elapsed
27 | # 2.00 0.03 44.68
28 |
-------------------------------------------------------------------------------- /caret-parallel/caret-parallel-train-rf-deLuxe.R: --------------------------------------------------------------------------------
1 | # Run random forest in parallel with CPU core and thread info
2 | # https://github.com/tobigithub/caret-machine-learning
3 | # Tobias Kind (2015)
4 |
5 | require(caret)
6 | data(BloodBrain)
7 | set.seed(123)
8 |
9 | # Library parallel is a native R library, no CRAN install required
10 | library(parallel)
11 | nCores <- detectCores(logical = FALSE)
12 | nThreads <- detectCores(logical = TRUE)
13 | cat("CPU with",nCores,"cores and",nThreads,"threads detected.\n")
14 |
15 | # load the doParallel library for caret cluster use
16 | library(doParallel)
17 | cl <- makeCluster(nThreads)
18 | registerDoParallel(cl)
19 |
20 | # random forest regression
21 | fit1 <- train(bbbDescr, logBBB, "rf")
22 | fit1;
23 |
24 |
25 | stopCluster(cl)
26 | registerDoSEQ()
27 | ### END
28 |
29 | # 208 samples
30 | # 134 predictors
31 | #
32 | # No pre-processing
33 | # Resampling: Bootstrapped (25 reps)
34 | # Summary of sample sizes: 208, 208, 208, 208, 208, 208, ...
35 | # Resampling results across tuning parameters:
36 | #
37 | # mtry RMSE Rsquared
38 | # 2 0.5443770 0.5725600
39 | # 68 0.5408819 0.5568365
40 | # 134 0.5490382 0.5413179
41 | #
42 | # RMSE was used to select the optimal model using the smallest value.
43 | # The final value used for the model was mtry = 68.
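# --- added sketch, not part of the original script ----------------------------
# The speed-up numbers quoted for cubist above can be reproduced for any model
# by timing the same train() call once with the sequential backend and once
# with a parallel backend; fit$times$everything holds caret's own timings.
library(doParallel); require(caret); data(BloodBrain)

registerDoSEQ()                                            # sequential backend
set.seed(123)
t_seq <- system.time(fit_seq <- train(bbbDescr, logBBB, "rf"))

cl <- makeCluster(detectCores()); registerDoParallel(cl)   # parallel backend
set.seed(123)
t_par <- system.time(fit_par <- train(bbbDescr, logBBB, "rf"))
stopCluster(cl); registerDoSEQ()

# compare wall-clock time and caret's internal bookkeeping
rbind(sequential = t_seq["elapsed"], parallel = t_par["elapsed"])
fit_par$times$everything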
44 | -------------------------------------------------------------------------------- /caret-parallel/caret-parallel-train.R: -------------------------------------------------------------------------------- 1 | # Run multiple caret models in parallel using lapply 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # ------------------------------------------------------------------------- 6 | # FIRST sequential code (not parallel one CPU core): 7 | # ------------------------------------------------------------------------- 8 | 9 | require(caret); data(BloodBrain); set.seed(123) 10 | fit1 <- train(bbbDescr, logBBB, "knn"); fit1 11 | 12 | # ------------------------------------------------------------------------- 13 | # SECOND parallel register 4 cores (no worries if you only have 2) 14 | # train the caret model in parallel 15 | # ------------------------------------------------------------------------- 16 | 17 | library(doParallel); cl <- makeCluster(4); registerDoParallel(cl) 18 | require(caret); data(BloodBrain); set.seed(123) 19 | fit1 <- train(bbbDescr, logBBB, "knn"); fit1 20 | stopCluster(cl); registerDoSEQ(); 21 | 22 | ### END 23 | -------------------------------------------------------------------------------- /caret-parallel/learning-curve-plots-caret-parallel.R: -------------------------------------------------------------------------------- 1 | # Learning curve plots for R caret classifications and regressions in parallel 2 | # (ROC vs training size, RMSE vs training size) 3 | # Source: Max Kuhn (topepo); https://github.com/topepo/caret/issues/278 4 | # https://github.com/tobigithub/caret-machine-learning 5 | # Tobias Kind (2015) 6 | 7 | #---------------------------------------------------------------------- 8 | # Library parallel() is a native R library, no CRAN required 9 | library(parallel) 10 | nCores <- detectCores(logical = FALSE) 11 | nThreads <- detectCores(logical = TRUE) 12 | cat("CPU with",nCores,"cores and",nThreads,"threads detected.\n") 13 | 14 | # load the doParallel/doSNOW library for caret cluster use 15 | library(doParallel) 16 | cl <- makeCluster(nThreads) 17 | registerDoParallel(cl) 18 | 19 | #---------------------------------------------------------------------- 20 | ## function: learning_curve_dat plots training-size vs RMSE or ROC 21 | ## dat: entire data set used for modling 22 | ## y: character stirng for the outcome column name 23 | ## proportion: proportion of data used to train the model 24 | ## test_prop: proportion of data used initially set aside for testing 25 | ## verbose: write out a log of training milestones 26 | ## ...: arguments to pass to `train` 27 | #---------------------------------------------------------------------- 28 | learning_curve_dat <- function(dat, 29 | outcome = colnames(dat)[1], 30 | proportion = (1:10)/10, test_prop = 0, 31 | verbose = TRUE, ...) 
{ 32 | 33 | proportion <- sort(unique(proportion)) 34 | n_size <- length(proportion) 35 | 36 | if(test_prop > 0) { 37 | for_model <- createDataPartition(dat[, outcome], p = 1 - test_prop, list = FALSE) 38 | } else for_model <- 1:nrow(dat) 39 | 40 | n <- length(for_model) 41 | 42 | resampled <- vector(mode = "list", length = n_size) 43 | tested <- if(test_prop > 0) resampled else NULL 44 | apparent <- resampled 45 | for(i in seq(along = proportion)) { 46 | if(verbose) cat("Training for ", round(proportion[i]*100, 1), 47 | "% (n = ", floor(n*proportion[i]), ")\n", sep = "") 48 | in_mod <- if(proportion[i] < 1) sample(for_model, size = floor(n*proportion[i])) else for_model 49 | mod <- train(x = dat[in_mod, colnames(dat) != outcome, drop = FALSE], 50 | y = dat[in_mod, outcome], 51 | ...) 52 | if(i == 1) perf_names <- mod$perfNames 53 | resampled[[i]] <- merge(mod$resample, mod$bestTune) 54 | resampled[[i]]$Training_Size <- length(in_mod) 55 | 56 | if(test_prop > 0) { 57 | if(!mod$control$classProbs) { 58 | test_preds <- extractPrediction(list(model = mod), 59 | testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE], 60 | testY = dat[-for_model, outcome]) 61 | } else { 62 | test_preds <- extractProb(list(model = mod), 63 | testX = dat[-for_model, colnames(dat) != outcome, drop = FALSE], 64 | testY = dat[-for_model, outcome]) 65 | } 66 | test_perf <- mod$control$summaryFunction(test_preds, lev = mod$finalModel$obsLevels) 67 | test_perf <- as.data.frame(t(test_perf)) 68 | test_perf$Training_Size <- length(in_mod) 69 | tested[[i]] <- test_perf 70 | try(rm(test_preds, test_perf), silent = TRUE) 71 | } 72 | 73 | if(!mod$control$classProbs) { 74 | app_preds <- extractPrediction(list(model = mod), 75 | testX = dat[in_mod, colnames(dat) != outcome, drop = FALSE], 76 | testY = dat[in_mod, outcome]) 77 | } else { 78 | app_preds <- extractProb(list(model = mod), 79 | testX = dat[in_mod, colnames(dat) != outcome, drop = FALSE], 80 | testY = dat[in_mod, outcome]) 81 | } 82 | app_perf <- mod$control$summaryFunction(app_preds, lev = mod$finalModel$obsLevels) 83 | app_perf <- as.data.frame(t(app_perf)) 84 | app_perf$Training_Size <- length(in_mod) 85 | apparent[[i]] <- app_perf 86 | 87 | try(rm(mod, in_mod, app_preds, app_perf), silent = TRUE) 88 | } 89 | 90 | resampled <- do.call("rbind", resampled) 91 | resampled <- resampled[, c(perf_names, "Training_Size")] 92 | resampled$Data <- "Resampling" 93 | apparent <- do.call("rbind", apparent) 94 | apparent <- apparent[, c(perf_names, "Training_Size")] 95 | apparent$Data <- "Training" 96 | out <- rbind(resampled, apparent) 97 | if(test_prop > 0) { 98 | tested <- do.call("rbind", tested) 99 | tested <- tested[, c(perf_names, "Training_Size")] 100 | tested$Data <- "Testing" 101 | out <- rbind(out, tested) 102 | } 103 | out 104 | } 105 | 106 | #---------------------------------------------------------------------- 107 | # multiplot for plotting multiple ggplots 108 | # Example: multiplot(p1,p2,p3,p4,p5,p6,cols=3) 109 | # Source: http://www.peterhaschke.com/r/2013/04/24/MultiPlot.html 110 | #---------------------------------------------------------------------- 111 | 112 | multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) { 113 | require(grid) 114 | 115 | plots <- c(list(...), plotlist) 116 | 117 | numPlots = length(plots) 118 | 119 | if (is.null(layout)) { 120 | layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), 121 | ncol = cols, nrow = ceiling(numPlots/cols)) 122 | } 123 | 124 | if (numPlots == 1) { 125 | print(plots[[1]]) 
)
126 | 127 | } else { 128 | grid.newpage() 129 | pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) 130 | 131 | for (i in 1:numPlots) { 132 | matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) 133 | 134 | print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, 135 | layout.pos.col = matchidx$col)) 136 | } 137 | } 138 | } 139 | 140 | 141 | #---------------------------------------------------------------------- 142 | ## Classification example 143 | #---------------------------------------------------------------------- 144 | library(caret) 145 | library(xgboost) 146 | 147 | # set plot to 2x3 148 | par(mfrow=c(2,3)) 149 | 150 | set.seed(1412) 151 | class_dat <- twoClassSim(2000) 152 | 153 | set.seed(29510) 154 | lda_data <- learning_curve_dat(dat = class_dat, outcome = "Class", 155 | test_prop = 1/4, 156 | ## `train` arguments 157 | method = "lda", 158 | metric = "ROC", 159 | trControl = trainControl(classProbs = TRUE, 160 | method = "boot632", 161 | summaryFunction = twoClassSummary)) 162 | 163 | p1 <- ggplot(lda_data, aes(x = Training_Size, y = ROC, color = Data)) + 164 | geom_smooth(method = loess, span = .8) + 165 | ggtitle("LDA classification with boot632 CV") + 166 | theme_bw() 167 | p1 168 | #---------------------------------------------------------------------- 169 | set.seed(29510) 170 | rf_data <- learning_curve_dat(dat = class_dat, outcome = "Class", 171 | test_prop = 1/4, 172 | ## `train` arguments 173 | method = "rf", 174 | metric = "ROC", 175 | tuneLength = 4, 176 | trControl = trainControl(classProbs = TRUE, 177 | method = "boot632", 178 | summaryFunction = twoClassSummary)) 179 | 180 | p2 <- ggplot(rf_data, aes(x = Training_Size, y = ROC, color = Data)) + 181 | geom_smooth(method = loess, span = .8) + 182 | ggtitle("rf classification with boot632 CV") + 183 | theme_bw() 184 | p2 185 | #---------------------------------------------------------------------- 186 | set.seed(29510) 187 | rf_data <- learning_curve_dat(dat = class_dat, outcome = "Class", 188 | test_prop = 1/4, 189 | ## `train` arguments 190 | method = "parRF", 191 | metric = "ROC", 192 | tuneLength = 4, 193 | trControl = trainControl(classProbs = TRUE, 194 | method = "boot632", 195 | summaryFunction = twoClassSummary)) 196 | 197 | p3 <- ggplot(rf_data, aes(x = Training_Size, y = ROC, color = Data)) + 198 | geom_smooth(method = loess, span = .8) + 199 | ggtitle("parRF classification with boot632 CV") + 200 | theme_bw() 201 | p3 202 | #---------------------------------------------------------------------- 203 | ## Regression example 204 | #---------------------------------------------------------------------- 205 | 206 | set.seed(19135) 207 | reg_dat <- SLC14_1(2000) 208 | 209 | set.seed(31535) 210 | bag_data <- learning_curve_dat(dat = reg_dat, outcome = "y", 211 | test_prop = 1/4, 212 | ## `train` arguments 213 | method = "treebag", 214 | trControl = trainControl(method = "boot632"), 215 | ## `bagging` arguments 216 | nbagg = 100) 217 | 218 | p4 <- ggplot(bag_data, aes(x = Training_Size, y = RMSE, color = Data)) + 219 | geom_smooth(method = loess, span = .8) + 220 | ggtitle("treebag regression with boot632 CV") + 221 | theme_bw() 222 | p4 223 | 224 | 225 | #---------------------------------------------------------------------- 226 | set.seed(31535) 227 | svm_data <- learning_curve_dat(dat = reg_dat, outcome = "y", 228 | test_prop = 0, 229 | ## `train` arguments 230 | method = "svmRadial", 231 | preProc = c("center", "scale"), 232 | tuneGrid = data.frame(sigma = 0.03, C = 2^10), 
233 | trControl = trainControl(method = "boot632")) 234 | 235 | p5 <- ggplot(svm_data, aes(x = Training_Size, y = RMSE, color = Data)) + 236 | geom_smooth(method = loess, span = .8) + 237 | ggtitle("svmRadial regression with boot632 CV") + 238 | theme_bw() 239 | p5 240 | 241 | #---------------------------------------------------------------------- 242 | set.seed(31535) 243 | svm_no_test <- learning_curve_dat(dat = reg_dat, outcome = "y", 244 | test_prop = 1/4, 245 | ## `train` arguments 246 | method = "svmRadial", 247 | preProc = c("center", "scale"), 248 | tuneGrid = data.frame(sigma = 0.03, C = 2^10), 249 | trControl = trainControl(method = "boot632")) 250 | 251 | p6 <- ggplot(svm_no_test, aes(x = Training_Size, y = RMSE, color = Data)) + 252 | geom_smooth(method = loess, span = .8) + 253 | ggtitle("svmRadial regression with boot632 CV") + 254 | theme_bw() 255 | p6 256 | 257 | 258 | multiplot(p1,p2,p3,p4,p5,p6,cols=3) 259 | 260 | stopCluster(cl) 261 | registerDoSEQ() 262 | ### END 263 | -------------------------------------------------------------------------------- /caret-parallel/run-multiple-caret-models-parallel-lapply.R: -------------------------------------------------------------------------------- 1 | # Run multiple caret models in parallel using lapply 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | 6 | require(caret); data(BloodBrain); m <- c("qrf","xgbTree","rknn","knn","rf"); 7 | library(doParallel); cl <- makeCluster(8); registerDoParallel(cl) 8 | t2 <- lapply(m,function(x) {set.seed(123); seeds <- vector(mode = "list", length = nrow(bbbDescr) + 1); seeds <- lapply(seeds, function(x) 1:20); t1 <- train(bbbDescr, logBBB, (x),trControl = trainControl(method = "cv",seeds=seeds))}) 9 | r2 <- lapply(1:length(t2), function(x) {cat(sprintf("%-10s",(m[x])));cat(t2[[x]]$results$Rsquared[which.min(t2[[x]]$results$RMSE)],"\t"); cat(t2[[x]]$results$RMSE[which.min(t2[[x]]$results$RMSE)],"\n")}) 10 | stopCluster(cl); registerDoSEQ(); 11 | 12 | #model R^2 RMSE 13 | #qrf 0.5861108 0.5120318 14 | #xgbTree 0.6129255 0.4858211 15 | #rknn 0.4351047 0.5941893 16 | #knn 0.3736528 0.6185242 17 | #rf 0.6037442 0.493395 18 | -------------------------------------------------------------------------------- /caret-parallel/run-multiple-caret-models-parallel-sapply.R: -------------------------------------------------------------------------------- 1 | # Run multiple caret models in parallel using sapply 2 | # See: http://stackoverflow.com/questions/3505701/r-grouping-functions-sapply-vs-lapply-vs-apply-vs-tapply-vs-by-vs-aggrega 3 | # https://github.com/tobigithub/caret-machine-learning 4 | # Tobias Kind (2015) 5 | 6 | require(caret); data(BloodBrain); m <- c("qrf","xgbTree","knn") 7 | library(doParallel); cl <- makeCluster(12); registerDoParallel(cl) 8 | sapply(m,function(x) {t1 <- train(bbbDescr, logBBB, (x))} ,USE.NAMES = TRUE) 9 | class(t2); t2; t2[4,]; stopCluster(cl); registerDoSEQ(); 10 | 11 | # qrf xgbTree knn 12 | #method "qrf" "xgbTree" "knn" 13 | #modelInfo List,11 List,14 List,13 14 | #modelType "Regression" "Regression" "Regression" 15 | #results List,5 List,7 List,5 16 | #pred NULL NULL NULL 17 | #bestTune List,1 List,3 List,1 18 | #call Expression Expression Expression 19 | #dots List,0 List,0 List,0 20 | #metric "RMSE" "RMSE" "RMSE" 21 | #control List,26 List,26 List,26 22 | #finalModel List,23 List,6 List,7 23 | #preProcess NULL NULL NULL 24 | #trainingData List,135 List,135 List,135 25 | #resample List,3 List,3 List,3 26 | #resampledCM NULL 
NULL NULL 27 | #perfNames Character,2 Character,2 Character,2 28 | #maximize FALSE FALSE FALSE 29 | #yLimits Numeric,2 Numeric,2 Numeric,2 30 | #times List,3 List,3 List,3 31 | 32 | -------------------------------------------------------------------------------- /caret-regression/caret-all-regression-models.R: -------------------------------------------------------------------------------- 1 | # Run all caret regression models in parallel and compare R^2 and RMSE 2 | # Example data is the very small "cars" dataset. Replace with your own set. 3 | # The regression output from 85 fast (working) regression models is 4 | # exported to a sortable table in a web browser using the DT library 5 | # https://github.com/tobigithub/caret-machine-learning 6 | # Tobias Kind (2016) 7 | # Works for caret_6.0-70 and R version 3.3.1 8 | 9 | # load caret and DT the cars data set 10 | require(caret); require(DT); data(cars); 11 | 12 | # get all model names just as example 13 | m <- unique(modelLookup()[modelLookup()$forReg,c(1)]) 14 | 15 | # fill variable m with the fast working models 16 | m <- c("avNNet", "bagEarth", "bagEarthGCV", 17 | "bayesglm", "bdk", "blackboost", "Boruta", "brnn", "BstLm" , 18 | "bstTree", "cforest", "ctree", "ctree2", "cubist", "DENFIS", 19 | "dnn", "earth", "elm", "enet", "evtree", 20 | "extraTrees", "gamLoess", "gaussprLinear", "gaussprPoly", "gaussprRadial", 21 | "gcvEarth","glm", "glmboost", "glmnet", "icr", "kernelpls", 22 | "kknn", "knn", "krlsRadial", "lars" , "lasso", 23 | "leapBackward", "leapForward", "leapSeq", "lm", "M5", "M5Rules", 24 | "mlpWeightDecay", "neuralnet" , "partDSA", 25 | "pcaNNet", "pcr", "penalized", "pls", "plsRglm", "ppr", 26 | "qrf" , "ranger", "rf", "rfRules", "rbfDDA", 27 | "ridge", "rlm", "rpart", "rpart2", "rqlasso", 28 | "rqnc", "RRF", "RRFglobal", "rvmPoly", "rvmRadial", 29 | "SBC", "simpls", "spls", "superpc" , 30 | "svmLinear", "svmLinear2", "svmPoly", "svmRadial", "svmRadialCost", 31 | "treebag", "widekernelpls", "WM", "xgbLinear", 32 | "xgbTree", "xyf") 33 | 34 | 35 | # load all packages (does not really work due to other dependencies) 36 | suppressPackageStartupMessages(ll <-lapply(m, require, character.only = TRUE)) 37 | 38 | # define x and y for regression 39 | y <- mtcars$mpg; x <- mtcars[, -mtcars$mpg]; 40 | 41 | # load all libraries 42 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl) 43 | 44 | # use lapply/loop to run everything 45 | t2 <- lapply(m,function(i) 46 | {cat("----------------------------------------------------","\n"); 47 | set.seed(123); cat(i," <- loaded\n"); 48 | t2 <- train(y=y, x=x, (i), trControl = trainControl(method = "boot632")) 49 | } 50 | ) 51 | 52 | # use lapply to print the results 53 | r2 <- lapply(1:length(t2), function(i) 54 | {cat(sprintf("%-20s",(m[i]))); 55 | cat(round(t2[[i]]$results$Rsquared[which.min(t2[[i]]$results$RMSE)],4),"\t"); 56 | cat(round(t2[[i]]$results$RMSE[which.min(t2[[i]]$results$RMSE)],4),"\t") 57 | cat(t2[[i]]$times$everything[3],"\n") 58 | } 59 | ) 60 | 61 | # stop the parallel processing and register sequential front-end 62 | stopCluster(cl); registerDoSEQ(); 63 | 64 | # preallocate data types 65 | i = 1; MAX = length(t2); 66 | x1 <- character() # Name 67 | x2 <- numeric() # R2 68 | x3 <- numeric() # RMSE 69 | x4 <- numeric() # time [s] 70 | x5 <- character() # long model name 71 | 72 | # fill data and check indexes and NA 73 | for (i in 1:length(t2)) { 74 | x1[i] <- t2[[i]]$method 75 | x2[i] <- 
as.numeric(t2[[i]]$results$Rsquared[which.min(t2[[i]]$results$RMSE)]) 76 | x3[i] <- as.numeric(t2[[i]]$results$RMSE[which.min(t2[[i]]$results$RMSE)]) 77 | x4[i] <- as.numeric(t2[[i]]$times$everything[3]) 78 | x5[i] <- t2[[i]]$modelInfo$label 79 | } 80 | 81 | # coerce to data frame 82 | df1 <- data.frame(x1,x2,x3,x4,x5, stringsAsFactors=FALSE) 83 | 84 | # print all results to R-GUI 85 | df1 86 | 87 | # call web browser output with sortable column names 88 | datatable(df1, options = list( 89 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5))), 90 | pageLength = MAX, 91 | order = list(list(2, 'desc'))), 92 | colnames = c('Num', 'Name', 'R^2', 'RMSE', 'time [s]', 'Model name'), 93 | caption = paste('Regression results from caret models',Sys.time()), 94 | class = 'cell-border stripe') %>% 95 | formatRound('x2', 3) %>% 96 | formatRound('x3', 3) %>% 97 | formatRound('x4', 3) %>% 98 | formatStyle(2, 99 | background = styleColorBar(x2, 'steelblue'), 100 | backgroundSize = '100% 90%', 101 | backgroundRepeat = 'no-repeat', 102 | backgroundPosition = 'center' 103 | ) 104 | 105 | ### END 106 | 107 | #----------------------------------------------------------------------------- 108 | # Num Name R^2 RMSE time[s] Model name 109 | # #1 avNNet 20.269 4.98 Model Averaged Neural Network 110 | # 2 bagEarth 1 0 3.8 Bagged MARS 111 | # 3 bagEarthGCV 1 0 2.22 Bagged MARS using gCV Pruning 112 | # 4 bayesglm 1 0 1.11 Bayesian Generalized Linear Model 113 | # 5 bdk 0.81 2.602 1.49 Self-Organizing Map 114 | # 6 blackboost 0.878 2.37 3.9 Boosted Tree 115 | # 7 Boruta 0.965 1.317 25.79 Random Forest with Additional Feature Selection 116 | # 8 brnn 0.999 0.215 0.95 Bayesian Regularized Neural Networks 117 | # 9 BstLm 0.826 2.661 2.89 Boosted Linear Model 118 | # 10 bstTree 0.912 1.766 17.98 Boosted Tree 119 | ... 
120 | # 83 xgbTree 0.983 0.679 3.970 eXtreme Gradient Boosting 121 | # 84 xyf 0.834 2.609 1.560 Self-Organizing Maps 122 | #----------------------------------------------------------------------------- 123 | 124 | ### total time 385.14 [s] or 6.4 min with 4c/16t@4.2 GHz 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /caret-regression/caret-all-regressions-DT-cars.csv: -------------------------------------------------------------------------------- 1 | Num,Name,R^2,RMSE,time [s],Model name 2 | 2,bagEarth,1,0,2.37,Bagged MARS 3 | 3,bagEarthGCV,1,0,1.58,Bagged MARS using gCV Pruning 4 | 4,bayesglm,1,0,1.29,Bayesian Generalized Linear Model 5 | 17,earth,1,0,0.79,Multivariate Adaptive Regression Spline 6 | 19,enet,1,0,1.06,Elasticnet 7 | 23,gamLoess,1,0,1.48,Generalized Additive Model using LOESS 8 | 27,gcvEarth,1,0,0.8,Multivariate Adaptive Regression Splines 9 | 28,glm,1,0,1.04,Generalized Linear Model 10 | 29,glmboost,1,0,0.86,Boosted Generalized Linear Model 11 | 30,glmnet,1,0.176,1.06,glmnet 12 | 36,lars,1,0,0.81,Least Angle Regression 13 | 37,lasso,1,0.604,0.77,The lasso 14 | 38,leapBackward,1,0,0.9,Linear Regression with Backwards Selection 15 | 39,leapForward,1,0,0.78,Linear Regression with Forward Selection 16 | 40,leapSeq,1,0,0.81,Linear Regression with Stepwise Selection 17 | 41,lm,1,0,0.77,Linear Regression 18 | 42,M5,1,0,3.16,Model Tree 19 | 43,M5Rules,1,0,1.53,Model Rules 20 | 58,ridge,1,0,0.89,Ridge Regression 21 | 61,rlm,1,0,0.78,Robust Linear Model 22 | 64,rqlasso,1,0,1.65,Quantile Regression with LASSO penalty 23 | 65,rqnc,1,0,0.97,Non-Convex Penalized Quantile Regression 24 | 14,cubist,1,0,1.06,Cubist 25 | 20,enpls,1,0,18.19,Ensemble Partial Least Squares Regression 26 | 49,penalized,1,0.025,1.53,Penalized Linear Regression 27 | 68,rvmPoly,1,0.025,1.3,Relevance Vector Machines with Polynomial Kernel 28 | 32,kernelpls,0.999,0.156,0.89,Partial Least Squares 29 | 50,pls,0.999,0.156,0.78,Partial Least Squares 30 | 80,widekernelpls,0.999,0.156,0.84,Partial Least Squares 31 | 71,simpls,0.999,0.156,0.83,Partial Least Squares 32 | 48,pcr,0.999,0.177,0.81,Principal Component Analysis 33 | 8,brnn,0.999,0.237,0.95,Bayesian Regularized Neural Networks 34 | 72,spls,0.998,0.185,1.23,Sparse Partial Least Squares 35 | 73,superpc,0.998,19.931,0.97,Supervised Principal Component Analysis 36 | 52,ppr,0.998,0.189,0.81,Projection Pursuit Regression 37 | 51,plsRglm,0.995,0.352,3.4,Partial Least Squares Generalized Linear Models 38 | 74,svmLinear,0.993,0.522,0.82,Support Vector Machines with Linear Kernel 39 | 75,svmLinear2,0.993,0.522,0.85,Support Vector Machines with Linear Kernel 40 | 24,gaussprLinear,0.993,0.523,2.08,Gaussian Process 41 | 22,extraTrees,0.977,0.944,2.6,Random Forest by Randomization 42 | 76,svmPoly,0.976,0.901,1.9,Support Vector Machines with Polynomial Kernel 43 | 18,elm,0.973,0.914,1.16,Extreme Learning Machine 44 | 54,ranger,0.971,1.202,1.13,Random Forest 45 | 35,krlsRadial,0.971,1.102,3.07,Radial Basis Function Kernel Regularized Least Squares 46 | 7,Boruta,0.963,1.225,10.79,Random Forest with Additional Feature Selection 47 | 55,rf,0.961,1.246,0.92,Random Forest 48 | 66,RRF,0.96,1.254,1.17,Regularized Random Forest 49 | 67,RRFglobal,0.959,1.263,1.23,Regularized Random Forest 50 | 25,gaussprPoly,0.958,1.172,1.06,Gaussian Process with Polynomial Kernel 51 | 53,qrf,0.932,1.667,1.11,Quantile Random Forest 52 | 77,svmRadial,0.929,1.834,0.87,Support Vector Machines with Radial Basis Function Kernel 53 | 
78,svmRadialCost,0.924,1.837,1,Support Vector Machines with Radial Basis Function Kernel 54 | 10,bstTree,0.911,1.748,7.24,Boosted Tree 55 | 60,rknnBel,0.91,2.016,15.44,Random k-Nearest Neighbors with Feature Selection 56 | 59,rknn,0.909,2.263,4.62,Random k-Nearest Neighbors 57 | 70,SBC,0.905,1.609,1.61,Subtractive Clustering and Fuzzy c-Means Rules 58 | 31,icr,0.902,1.994,1.09,Independent Component Regression 59 | 26,gaussprRadial,0.896,2.411,0.84,Gaussian Process with Radial Basis Function Kernel 60 | 6,blackboost,0.886,2.193,4.37,Boosted Tree 61 | 33,kknn,0.883,1.819,1.05,k-Nearest Neighbors 62 | 81,WM,0.869,2.134,3.1,Wang and Mendel Fuzzy Rules 63 | 79,treebag,0.862,2.613,1.58,Bagged CART 64 | 84,xyf,0.844,2.434,1.84,Self-Organizing Maps 65 | 11,cforest,0.843,2.691,0.98,Conditional Inference Random Forest 66 | 5,bdk,0.837,2.414,1.69,Self-Organizing Map 67 | 9,BstLm,0.826,2.651,1.86,Boosted Linear Model 68 | 34,knn,0.812,2.795,0.78,k-Nearest Neighbors 69 | 21,evtree,0.789,2.796,2.98,Tree Models from Genetic Algorithms 70 | 12,ctree,0.77,2.982,0.83,Conditional Inference Tree 71 | 13,ctree2,0.77,2.982,0.83,Conditional Inference Tree 72 | 46,partDSA,0.749,3.089,3.19,partDSA 73 | 62,rpart,0.744,3.029,1.22,CART 74 | 63,rpart2,0.744,3.029,0.84,CART 75 | 56,rfRules,0.512,4.943,32.53,Random Forest Rule-Based Model 76 | 69,rvmRadial,0.396,12.565,0.9,Relevance Vector Machines with Radial Basis Function Kernel 77 | 44,mlpWeightDecay,0.338,6.218,2.53,Multi-Layer Perceptron 78 | 15,DENFIS,0.318,9.409,10.34,Dynamic Evolving Neural-Fuzzy Inference System 79 | 57,rbfDDA,0.212,20.857,1.81,Radial Basis Function Network 80 | 1,avNNet,,19.972,7.31,Model Averaged Neural Network 81 | 16,dnn,,6.172,1.64,Stacked AutoEncoder Deep Neural Network 82 | 45,neuralnet,,5.993,20.98,Neural Network 83 | 47,pcaNNet,,19.972,1.17,Neural Networks with Feature Extraction 84 | 82,xgbLinear,,6.045,790.15,eXtreme Gradient Boosting 85 | 83,xgbTree,,6.045,49.58,eXtreme Gradient Boosting 86 | -------------------------------------------------------------------------------- /caret-regression/caret-all-regressions-DT-concrete.R: -------------------------------------------------------------------------------- 1 | # All working and fast caret regression models applied to data(concrete) 2 | # The regression output from fast (working) regression models is 3 | # exported to a sortable table in a web browser using the DT library 4 | # https://github.com/tobigithub/caret-machine-learning 5 | # Tobias Kind (2015) 6 | 7 | require(caret); require(DT); require(AppliedPredictiveModeling); 8 | data(concrete); 9 | 10 | m <- c( "avNNet" , "bagEarth", "bagEarthGCV", 11 | "bayesglm", "bdk", "blackboost", "Boruta", "brnn", "BstLm" , 12 | "bstTree", "cforest", "ctree", "ctree2", "cubist" , 13 | "dnn", "earth", "elm", "enet", "enpls", 14 | "gamLoess", "gaussprLinear", "gaussprPoly", "gaussprRadial", 15 | "gcvEarth","glm", "glmboost", "glmnet", "icr", "kernelpls", 16 | "kknn", "knn", "krlsRadial", "lars" , "lasso", 17 | "leapBackward", "leapForward", "leapSeq", "lm", "M5", "M5Rules", 18 | "mlpWeightDecay", "neuralnet" , "partDSA", 19 | "pcaNNet", "pcr", "penalized", "pls", "plsRglm", "ppr", 20 | "qrf" , "ranger", "rf" , "rbfDDA", 21 | "ridge", "rknn", "rlm", "rpart", "rpart2", "rqlasso", 22 | "rqnc", "RRF", "RRFglobal", "rvmPoly", "rvmRadial", 23 | "SBC", "simpls", "spls", "superpc" , 24 | "svmLinear", "svmLinear2", "svmPoly", "svmRadial", "svmRadialCost", 25 | "treebag", "widekernelpls", "xgbLinear", 26 | "xgbTree", "xyf") 27 | 28 | 29 | # load all 
packages (does not really work due to other dependencies) 30 | suppressPackageStartupMessages(ll <-lapply(m, require, character.only = TRUE)) 31 | 32 | # define x and y for regression 33 | y <- concrete$CompressiveStrength; x <- concrete[, 1:8]; 34 | 35 | # register parallel front-end 36 | library(doParallel); cl <- makeCluster(detectCores()); registerDoParallel(cl) 37 | 38 | # use lapply/loop to run everything 39 | t2 <- lapply(m,function(i) 40 | {cat("----------------------------------------------------","\n"); 41 | set.seed(123); cat(i," <- loaded\n"); 42 | t2 <- train(y=y, x=x, (i), trControl = trainControl(method = "boot632")) 43 | } 44 | ) 45 | 46 | 47 | r2 <- lapply(1:length(t2), function(i) 48 | {cat(sprintf("%-20s",(m[i]))); 49 | cat(round(t2[[i]]$results$Rsquared[which.min(t2[[i]]$results$RMSE)],4),"\t"); 50 | cat(round(t2[[i]]$results$RMSE[which.min(t2[[i]]$results$RMSE)],4),"\t") 51 | cat(t2[[i]]$times$everything[3],"\n") 52 | } 53 | ) 54 | 55 | # stop cluster and register sequntial front end 56 | stopCluster(cl); registerDoSEQ(); 57 | 58 | # preallocate data types 59 | i = 1; MAX = length(t2); 60 | x1 <- character() # Name 61 | x2 <- numeric() # R2 62 | x3 <- numeric() # RMSE 63 | x4 <- numeric() # time [s] 64 | x5 <- character() # long model name 65 | 66 | # fill data and check indexes and NA 67 | for (i in 1:length(t2)) { 68 | x1[i] <- t2[[i]]$method 69 | x2[i] <- as.numeric(t2[[i]]$results$Rsquared[which.min(t2[[i]]$results$RMSE)]) 70 | x3[i] <- as.numeric(t2[[i]]$results$RMSE[which.min(t2[[i]]$results$RMSE)]) 71 | x4[i] <- as.numeric(t2[[i]]$times$everything[3]) 72 | x5[i] <- t2[[i]]$modelInfo$label 73 | } 74 | 75 | # coerce to data frame 76 | df1 <- data.frame(x1,x2,x3,x4,x5, stringsAsFactors=FALSE) 77 | 78 | # print all results to R-GUI 79 | df1 80 | 81 | # plot RMSE vs boosting iterations for xgbLinear and xgbTree 82 | # next 2 lines this is static code, index extraction may fail 83 | ggplot(t2[[76]]) 84 | ggplot(t2[[77]]) 85 | 86 | # call web output with correct column names 87 | datatable(df1, options = list( 88 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5))), 89 | pageLength = MAX, 90 | order = list(list(2, 'desc'))), 91 | colnames = c('Num', 'Name', 'R^2', 'RMSE', 'time [s]', 'Model name'), 92 | caption = paste('Regression results from caret models',Sys.time()), 93 | class = 'cell-border stripe') %>% 94 | formatRound('x2', 3) %>% 95 | formatRound('x3', 3) %>% 96 | formatRound('x4', 3) %>% 97 | formatStyle(2, 98 | background = styleColorBar(x2, 'steelblue'), 99 | backgroundSize = '100% 90%', 100 | backgroundRepeat = 'no-repeat', 101 | backgroundPosition = 'center' 102 | ) 103 | 104 | 105 | ### END 106 | -------------------------------------------------------------------------------- /caret-regression/caret-all-regressions-DT-concrete.csv: -------------------------------------------------------------------------------- 1 | Num,Name,R^2,RMSE,time [s],Model name 2 | 14,cubist,0.935,4.122,7.41,Cubist 3 | 52,rf,0.932,4.177,11.67,Random Forest 4 | 62,RRFglobal,0.932,4.178,13.29,Regularized Random Forest 5 | 7,Boruta,0.932,4.183,83.31,Random Forest with Additional Feature Selection 6 | 61,RRF,0.932,4.185,22.87,Regularized Random Forest 7 | 51,ranger,0.93,4.223,6.16,Random Forest 8 | 50,qrf,0.92,4.493,3.18,Quantile Random Forest 9 | 10,bstTree,0.923,4.582,102.66,Boosted Tree 10 | 39,M5,0.892,5.469,34.68,Model Tree 11 | 11,cforest,0.89,5.565,23.07,Conditional Inference Random Forest 12 | 32,krlsRadial,0.869,5.622,320.78,Radial Basis Function Kernel 
Regularized Least Squares 13 | 3,bagEarthGCV,0.868,6.074,6.79,Bagged MARS using gCV Pruning 14 | 6,blackboost,0.871,6.11,5.1,Boosted Tree 15 | 2,bagEarth,0.866,6.125,21.35,Bagged MARS 16 | 24,gcvEarth,0.859,6.264,1.44,Multivariate Adaptive Regression Splines 17 | 22,gaussprPoly,0.857,6.308,75.74,Gaussian Process with Polynomial Kernel 18 | 16,earth,0.857,6.314,2.1,Multivariate Adaptive Regression Spline 19 | 40,M5Rules,0.855,6.319,20.14,Model Rules 20 | 71,svmPoly,0.856,6.33,108.18,Support Vector Machines with Polynomial Kernel 21 | 73,svmRadialCost,0.856,6.335,9.02,Support Vector Machines with Radial Basis Function Kernel 22 | 8,brnn,0.855,6.359,11.03,Bayesian Regularized Neural Networks 23 | 72,svmRadial,0.854,6.384,5.72,Support Vector Machines with Radial Basis Function Kernel 24 | 20,gamLoess,0.85,6.48,1.9,Generalized Additive Model using LOESS 25 | 76,xgbLinear,0.855,6.487,659.2,eXtreme Gradient Boosting 26 | 23,gaussprRadial,0.839,6.775,6.07,Gaussian Process with Radial Basis Function Kernel 27 | 12,ctree,0.824,6.965,1.72,Conditional Inference Tree 28 | 77,xgbTree,0.826,7.116,44.19,eXtreme Gradient Boosting 29 | 30,kknn,0.779,7.526,1.47,k-Nearest Neighbors 30 | 74,treebag,0.803,7.532,5.86,Bagged CART 31 | 49,ppr,0.788,7.609,1.26,Projection Pursuit Regression 32 | 55,rknn,0.774,8.377,21.79,Random k-Nearest Neighbors 33 | 31,knn,0.721,8.794,1.08,k-Nearest Neighbors 34 | 65,SBC,0.709,8.945,462.52,Subtractive Clustering and Fuzzy c-Means Rules 35 | 19,enpls,0.61,10.442,343.36,Ensemble Partial Least Squares Regression 36 | 21,gaussprLinear,0.61,10.442,4.46,Gaussian Process 37 | 54,ridge,0.61,10.443,1.7,Ridge Regression 38 | 18,enet,0.61,10.443,2.56,Elasticnet 39 | 46,penalized,0.61,10.443,14.72,Penalized Linear Regression 40 | 25,glm,0.61,10.443,1.18,Generalized Linear Model 41 | 38,lm,0.61,10.443,1.19,Linear Regression 42 | 33,lars,0.61,10.443,1.34,Least Angle Regression 43 | 4,bayesglm,0.61,10.443,1.79,Bayesian Generalized Linear Model 44 | 27,glmnet,0.61,10.443,1.49,glmnet 45 | 34,lasso,0.61,10.449,1.28,The lasso 46 | 56,rlm,0.606,10.572,1.18,Robust Linear Model 47 | 58,rpart2,0.594,10.635,1.33,CART 48 | 26,glmboost,0.595,10.694,1.58,Boosted Generalized Linear Model 49 | 67,spls,0.586,10.758,4.98,Sparse Partial Least Squares 50 | 48,plsRglm,0.586,10.764,46.07,Partial Least Squares Generalized Linear Models 51 | 47,pls,0.579,10.873,1.12,Partial Least Squares 52 | 66,simpls,0.579,10.873,1.2,Partial Least Squares 53 | 29,kernelpls,0.579,10.873,1.25,Partial Least Squares 54 | 75,widekernelpls,0.579,10.873,20.11,Partial Least Squares 55 | 70,svmLinear2,0.594,10.889,1.93,Support Vector Machines with Linear Kernel 56 | 69,svmLinear,0.594,10.909,1.54,Support Vector Machines with Linear Kernel 57 | 59,rqlasso,0.591,10.982,2.54,Quantile Regression with LASSO penalty 58 | 60,rqnc,0.591,10.983,5.73,Non-Convex Penalized Quantile Regression 59 | 35,leapBackward,0.55,11.218,1.22,Linear Regression with Backwards Selection 60 | 36,leapForward,0.548,11.249,1.25,Linear Regression with Forward Selection 61 | 13,ctree2,0.536,11.375,1.4,Conditional Inference Tree 62 | 37,leapSeq,0.532,11.431,1.13,Linear Regression with Stepwise Selection 63 | 63,rvmPoly,0.509,11.61,140.07,Relevance Vector Machines with Polynomial Kernel 64 | 17,elm,0.45,12.308,3.51,Extreme Learning Machine 65 | 57,rpart,0.439,12.514,1.53,CART 66 | 43,partDSA,0.397,12.975,6.5,partDSA 67 | 9,BstLm,0.433,13.468,2.37,Boosted Linear Model 68 | 78,xyf,0.334,13.678,2.97,Self-Organizing Maps 69 | 28,icr,0.329,13.702,3.62,Independent Component 
Regression 70 | 5,bdk,0.419,14.261,3.03,Self-Organizing Map 71 | 45,pcr,0.262,14.368,1.09,Principal Component Analysis 72 | 42,neuralnet,,16.701,6.63,Neural Network 73 | 15,dnn,,16.786,39.46,Stacked AutoEncoder Deep Neural Network 74 | 41,mlpWeightDecay,,18.03,17.8,Multi-Layer Perceptron 75 | 64,rvmRadial,0.301,30.056,225.6,Relevance Vector Machines with Radial Basis Function Kernel 76 | 44,pcaNNet,,38.657,9.33,Neural Networks with Feature Extraction 77 | 1,avNNet,,38.657,54.51,Model Averaged Neural Network 78 | 68,superpc,0.262,38.798,3.78,Supervised Principal Component Analysis 79 | 53,rbfDDA,0.006,39.561,390.54,Radial Basis Function Network 80 | -------------------------------------------------------------------------------- /caret-regression/caret-regression-plotObsVsPred.R: -------------------------------------------------------------------------------- 1 | # Regression analysis and visualization 2 | # Plot observed vs predicted values for training and test set from CART and PLS 3 | # Source: http://www.inside-r.org/packages/cran/caret/docs/plotObsVsPred 4 | # Author: Max Kuhn 5 | # 6 | # https://github.com/tobigithub/caret-machine-learning 7 | # Tobias Kind (2015) 8 | 9 | # load libraries and models 10 | require(caret) 11 | require(mlbench) 12 | data(BostonHousing) 13 | 14 | # perform CART (Classification And Regression Tree) analysis 15 | set.seed(123) 16 | rpartFit <- train(BostonHousing[1:100, -c(4, 14)], 17 | BostonHousing$medv[1:100], 18 | "rpart", tuneLength = 9) 19 | 20 | # perform PLS (Partial Least Squares) analysis 21 | set.seed(123) 22 | plsFit <- train(BostonHousing[1:100, -c(4, 14)], 23 | BostonHousing$medv[1:100], 24 | "pls") 25 | 26 | # extract optimal tuning values for further use 27 | predVals <- extractPrediction(list(rpartFit, plsFit), 28 | testX = BostonHousing[101:200, -c(4, 14)], 29 | testY = BostonHousing$medv[101:200], 30 | unkX = BostonHousing[201:300, -c(4, 14)]) 31 | 32 | # plot CART and PLS observed vs predicted values for training and test set 33 | plotObsVsPred(predVals) 34 | 35 | ### END 36 | -------------------------------------------------------------------------------- /caret-setup/caret-get-all-models-automatically.R: -------------------------------------------------------------------------------- 1 | # Get all caret models for regression and classification 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # ----------------------------------------------------------- 6 | # get all caret models for regression 7 | 8 | require(caret) 9 | modNames <- unique(modelLookup()[modelLookup()$forReg,c(1)]) 10 | length(modNames); modNames; 11 | 12 | # ----------------------------------------------------------- 13 | # get all caret models for classification 14 | 15 | require(caret) 16 | modNames <- unique(modelLookup()[modelLookup()$forClass,c(1)]) 17 | length(modNames); modNames; 18 | -------------------------------------------------------------------------------- /caret-setup/caret-model-list-v6058.csv: -------------------------------------------------------------------------------- 1 | Num,Model,method Argument Value,Type,Packages,Tuning Parameters 2 | 1,Boosted Classification Trees,ada,Classification,"ada, plyr","iter, maxdepth, nu" 3 | 2,Bagged AdaBoost,AdaBag,Classification,"adabag, plyr","mfinal, maxdepth" 4 | 3,AdaBoost.M1,AdaBoost.M1,Classification,"adabag, plyr","mfinal, maxdepth, coeflearn" 5 | 4,Adaptive Mixture Discriminant Analysis,amdai,Classification,adaptDA,model 6 | 5,Adaptive-Network-Based Fuzzy 
Inference System,ANFIS,Regression,frbs,"num.labels, max.iter" 7 | 6,Model Averaged Neural Network,avNNet,Dual Use,nnet,"size, decay, bag" 8 | 7,Naive Bayes Classifier with Attribute Weighting,awnb,Classification,bnclassify,smooth 9 | 8,Tree Augmented Naive Bayes Classifier with Attribute Weighting,awtan,Classification,bnclassify,"score, smooth" 10 | 9,Bagged Model,bag,Dual Use,caret,vars 11 | 10,Bagged MARS,bagEarth,Dual Use,earth,"nprune, degree" 12 | 11,Bagged MARS using gCV Pruning,bagEarthGCV,Dual Use,earth,degree 13 | 12,Bagged Flexible Discriminant Analysis,bagFDA,Classification,"earth, mda","degree, nprune" 14 | 13,Bagged FDA using gCV Pruning,bagFDAGCV,Classification,earth,degree 15 | 14,Bayesian Additive Regression Trees,bartMachine,Dual Use,bartMachine,"num_trees, k, alpha, beta, nu" 16 | 15,Bayesian Generalized Linear Model,bayesglm,Dual Use,arm,None 17 | 16,Self-Organizing Map,bdk,Dual Use,kohonen,"xdim, ydim, xweight, topo" 18 | 17,Binary Discriminant Analysis,binda,Classification,binda,lambda.freqs 19 | 18,Boosted Tree,blackboost,Dual Use,"party, mboost, plyr","mstop, maxdepth" 20 | 19,Random Forest with Additional Feature Selection,Boruta,Dual Use,"Boruta, randomForest",mtry 21 | 20,Bayesian Regularized Neural Networks,brnn,Regression,brnn,neurons 22 | 21,Boosted Linear Model,BstLm,Dual Use,"bst, plyr","mstop, nu" 23 | 22,Boosted Smoothing Spline,bstSm,Dual Use,"bst, plyr","mstop, nu" 24 | 23,Boosted Tree,bstTree,Dual Use,"bst, plyr","mstop, maxdepth, nu" 25 | 24,C5.0,C5.0,Classification,"C50, plyr","trials, model, winnow" 26 | 25,Cost-Sensitive C5.0,C5.0Cost,Classification,"C50, plyr","trials, model, winnow, cost" 27 | 26,Single C5.0 Ruleset,C5.0Rules,Classification,C50,None 28 | 27,Single C5.0 Tree,C5.0Tree,Classification,C50,None 29 | 28,Conditional Inference Random Forest,cforest,Dual Use,party,mtry 30 | 29,CHi-squared Automated Interaction Detection,chaid,Classification,CHAID,"alpha2, alpha3, alpha4" 31 | 30,SIMCA,CSimca,Classification,rrcovHD,None 32 | 31,Conditional Inference Tree,ctree,Dual Use,party,mincriterion 33 | 32,Conditional Inference Tree,ctree2,Dual Use,party,maxdepth 34 | 33,Cubist,cubist,Regression,Cubist,"committees, neighbors" 35 | 34,Dynamic Evolving Neural-Fuzzy Inference System,DENFIS,Regression,frbs,"Dthr, max.iter" 36 | 35,Stacked AutoEncoder Deep Neural Network,dnn,Dual Use,deepnet,"layer1, layer2, layer3, hidden_dropout, visible_dropout" 37 | 36,Linear Distance Weighted Discrimination,dwdLinear,Classification,kerndwd,"lambda, qval" 38 | 37,Distance Weighted Discrimination with Polynomial Kernel,dwdPoly,Classification,kerndwd,"lambda, qval, degree, scale" 39 | 38,Distance Weighted Discrimination with Radial Basis Function Kernel,dwdRadial,Classification,"kernlab, kerndwd","lambda, qval, sigma" 40 | 39,Multivariate Adaptive Regression Spline,earth,Dual Use,earth,"nprune, degree" 41 | 40,Extreme Learning Machine,elm,Dual Use,elmNN,"nhid, actfun" 42 | 41,Elasticnet,enet,Regression,elasticnet,"fraction, lambda" 43 | 42,Ensemble Partial Least Squares Regression,enpls,Regression,enpls,maxcomp 44 | 43,Ensemble Partial Least Squares Regression with Feature Selection,enpls.fs,Regression,enpls,"maxcomp, threshold" 45 | 44,Tree Models from Genetic Algorithms,evtree,Dual Use,evtree,alpha 46 | 45,Random Forest by Randomization,extraTrees,Dual Use,extraTrees,"mtry, numRandomCuts" 47 | 46,Flexible Discriminant Analysis,fda,Classification,"earth, mda","degree, nprune" 48 | 47,Fuzzy Rules Using Genetic Cooperative-Competitive Learning and 
Pittsburgh,FH.GBML,Classification,frbs,"max.num.rule, popu.size, max.gen" 49 | 48,Fuzzy Inference Rules by Descent Method,FIR.DM,Regression,frbs,"num.labels, max.iter" 50 | 49,Ridge Regression with Variable Selection,foba,Regression,foba,"k, lambda" 51 | 50,Fuzzy Rules Using Chi's Method,FRBCS.CHI,Classification,frbs,"num.labels, type.mf" 52 | 51,Fuzzy Rules with Weight Factor,FRBCS.W,Classification,frbs,"num.labels, type.mf" 53 | 52,Simplified TSK Fuzzy Rules,FS.HGD,Regression,frbs,"num.labels, max.iter" 54 | 53,Generalized Additive Model using Splines,gam,Dual Use,mgcv,"select, method" 55 | 54,Boosted Generalized Additive Model,gamboost,Dual Use,mboost,"mstop, prune" 56 | 55,Generalized Additive Model using LOESS,gamLoess,Dual Use,gam,"span, degree" 57 | 56,Generalized Additive Model using Splines,gamSpline,Dual Use,gam,df 58 | 57,Gaussian Process,gaussprLinear,Dual Use,kernlab,None 59 | 58,Gaussian Process with Polynomial Kernel,gaussprPoly,Dual Use,kernlab,"degree, scale" 60 | 59,Gaussian Process with Radial Basis Function Kernel,gaussprRadial,Dual Use,kernlab,sigma 61 | 60,Stochastic Gradient Boosting,gbm,Dual Use,"gbm, plyr","n.trees, interaction.depth, shrinkage, n.minobsinnode" 62 | 61,Multivariate Adaptive Regression Splines,gcvEarth,Dual Use,earth,degree 63 | 62,Fuzzy Rules via MOGUL,GFS.FR.MOGUL,Regression,frbs,"max.gen, max.iter, max.tune" 64 | 63,Fuzzy Rules Using Genetic Cooperative-Competitive Learning,GFS.GCCL,Classification,frbs,"num.labels, popu.size, max.gen" 65 | 64,Genetic Lateral Tuning and Rule Selection of Linguistic Fuzzy Systems,GFS.LT.RS,Regression,frbs,"popu.size, num.labels, max.gen" 66 | 65,Fuzzy Rules via Thrift,GFS.THRIFT,Regression,frbs,"popu.size, num.labels, max.gen" 67 | 66,Generalized Linear Model,glm,Dual Use,,None 68 | 67,Boosted Generalized Linear Model,glmboost,Dual Use,mboost,"mstop, prune" 69 | 68,glmnet,glmnet,Dual Use,glmnet,"alpha, lambda" 70 | 69,Generalized Linear Model with Stepwise Feature Selection,glmStepAIC,Dual Use,MASS,None 71 | 70,Generalized Partial Least Squares,gpls,Classification,gpls,K.prov 72 | 71,Heteroscedastic Discriminant Analysis,hda,Classification,hda,"gamma, lambda, newdim" 73 | 72,High Dimensional Discriminant Analysis,hdda,Classification,HDclassif,"threshold, model" 74 | 73,Hybrid Neural Fuzzy Inference System,HYFIS,Regression,frbs,"num.labels, max.iter" 75 | 74,Independent Component Regression,icr,Regression,fastICA,n.comp 76 | 75,C4.5-like Trees,J48,Classification,RWeka,C 77 | 76,Rule-Based Classifier,JRip,Classification,RWeka,NumOpt 78 | 77,Partial Least Squares,kernelpls,Dual Use,pls,ncomp 79 | 78,k-Nearest Neighbors,kknn,Dual Use,kknn,"kmax, distance, kernel" 80 | 79,k-Nearest Neighbors,knn,Dual Use,,k 81 | 80,Polynomial Kernel Regularized Least Squares,krlsPoly,Regression,KRLS,"lambda, degree" 82 | 81,Radial Basis Function Kernel Regularized Least Squares,krlsRadial,Regression,"KRLS, kernlab","lambda, sigma" 83 | 82,Least Angle Regression,lars,Regression,lars,fraction 84 | 83,Least Angle Regression,lars2,Regression,lars,step 85 | 84,The lasso,lasso,Regression,elasticnet,fraction 86 | 85,Linear Discriminant Analysis,lda,Classification,MASS,None 87 | 86,Linear Discriminant Analysis,lda2,Classification,MASS,dimen 88 | 87,Linear Regression with Backwards Selection,leapBackward,Regression,leaps,nvmax 89 | 88,Linear Regression with Forward Selection,leapForward,Regression,leaps,nvmax 90 | 89,Linear Regression with Stepwise Selection,leapSeq,Regression,leaps,nvmax 91 | 90,Robust Linear Discriminant 
Analysis,Linda,Classification,rrcov,None 92 | 91,Linear Regression,lm,Regression,,None 93 | 92,Linear Regression with Stepwise Selection,lmStepAIC,Regression,MASS,None 94 | 93,Logistic Model Trees,LMT,Classification,RWeka,iter 95 | 94,Localized Linear Discriminant Analysis,loclda,Classification,klaR,k 96 | 95,Bagged Logic Regression,logicBag,Dual Use,logicFS,"nleaves, ntrees" 97 | 96,Boosted Logistic Regression,LogitBoost,Classification,caTools,nIter 98 | 97,Logic Regression,logreg,Dual Use,LogicReg,"treesize, ntrees" 99 | 98,Least Squares Support Vector Machine,lssvmLinear,Classification,kernlab,None 100 | 99,Least Squares Support Vector Machine with Polynomial Kernel,lssvmPoly,Classification,kernlab,"degree, scale" 101 | 100,Least Squares Support Vector Machine with Radial Basis Function Kernel,lssvmRadial,Classification,kernlab,sigma 102 | 101,Learning Vector Quantization,lvq,Classification,class,"size, k" 103 | 102,Model Tree,M5,Regression,RWeka,"pruned, smoothed, rules" 104 | 103,Model Rules,M5Rules,Regression,RWeka,"pruned, smoothed" 105 | 104,Mixture Discriminant Analysis,mda,Classification,mda,subclasses 106 | 105,Maximum Uncertainty Linear Discriminant Analysis,Mlda,Classification,HiDimDA,None 107 | 106,Multi-Layer Perceptron,mlp,Dual Use,RSNNS,size 108 | 107,Multi-Layer Perceptron,mlpWeightDecay,Dual Use,RSNNS,"size, decay" 109 | 108,Penalized Multinomial Regression,multinom,Classification,nnet,decay 110 | 109,Naive Bayes,nb,Classification,klaR,"fL, usekernel" 111 | 110,Naive Bayes Classifier,nbDiscrete,Classification,bnclassify,smooth 112 | 111,Semi-Naive Structure Learner Wrapper,nbSearch,Classification,bnclassify,"k, epsilon, smooth, final_smooth, direction" 113 | 112,Neural Network,neuralnet,Regression,neuralnet,"layer1, layer2, layer3" 114 | 113,Neural Network,nnet,Dual Use,nnet,"size, decay" 115 | 114,Non-Negative Least Squares,nnls,Regression,nnls,None 116 | 115,Tree-Based Ensembles,nodeHarvest,Dual Use,nodeHarvest,"maxinter, mode" 117 | 116,Oblique Trees,oblique.tree,Classification,oblique.tree,"oblique.splits, variable.selection" 118 | 117,Single Rule Classification,OneR,Classification,RWeka,None 119 | 118,Oblique Random Forest,ORFlog,Classification,obliqueRF,mtry 120 | 119,Oblique Random Forest,ORFpls,Classification,obliqueRF,mtry 121 | 120,Oblique Random Forest,ORFridge,Classification,obliqueRF,mtry 122 | 121,Oblique Random Forest,ORFsvm,Classification,obliqueRF,mtry 123 | 122,Optimal Weighted Nearest Neighbor Classifier,ownn,Classification,snn,K 124 | 123,Nearest Shrunken Centroids,pam,Classification,pamr,threshold 125 | 124,Parallel Random Forest,parRF,Dual Use,"e1071, randomForest",mtry 126 | 125,Rule-Based Classifier,PART,Classification,RWeka,"threshold, pruned" 127 | 126,partDSA,partDSA,Dual Use,partDSA,"cut.off.growth, MPD" 128 | 127,Neural Networks with Feature Extraction,pcaNNet,Dual Use,nnet,"size, decay" 129 | 128,Principal Component Analysis,pcr,Regression,pls,ncomp 130 | 129,Penalized Discriminant Analysis,pda,Classification,mda,lambda 131 | 130,Penalized Discriminant Analysis,pda2,Classification,mda,df 132 | 131,Penalized Linear Regression,penalized,Regression,penalized,"lambda1, lambda2" 133 | 132,Penalized Linear Discriminant Analysis,PenalizedLDA,Classification,"penalizedLDA, plyr","lambda, K" 134 | 133,Penalized Logistic Regression,plr,Classification,stepPlr,"lambda, cp" 135 | 134,Partial Least Squares,pls,Dual Use,pls,ncomp 136 | 135,Partial Least Squares Generalized Linear Models,plsRglm,Dual Use,plsRglm,"nt, alpha.pvals.expli" 137 | 136,Ordered 
Logistic or Probit Regression,polr,Classification,MASS,None 138 | 137,Projection Pursuit Regression,ppr,Regression,,nterms 139 | 138,Greedy Prototype Selection,protoclass,Classification,"proxy, protoclass","eps, Minkowski" 140 | 139,Knn regression via sklearn.neighbors.KNeighborsRegressor,pythonKnnReg,Regression,rPython,"n_neighbors, weights, algorithm, leaf_size, metric, p" 141 | 140,Quadratic Discriminant Analysis,qda,Classification,MASS,None 142 | 141,Robust Quadratic Discriminant Analysis,QdaCov,Classification,rrcov,None 143 | 142,Quantile Random Forest,qrf,Regression,quantregForest,mtry 144 | 143,Quantile Regression Neural Network,qrnn,Regression,qrnn,"n.hidden, penalty, bag" 145 | 144,Random Forest,ranger,Dual Use,"e1071, ranger",mtry 146 | 145,Radial Basis Function Network,rbf,Dual Use,RSNNS,size 147 | 146,Radial Basis Function Network,rbfDDA,Dual Use,RSNNS,negativeThreshold 148 | 147,Regularized Discriminant Analysis,rda,Classification,klaR,"gamma, lambda" 149 | 148,Relaxed Lasso,relaxo,Regression,"relaxo, plyr","lambda, phi" 150 | 149,Random Forest,rf,Dual Use,randomForest,mtry 151 | 150,Random Ferns,rFerns,Classification,rFerns,depth 152 | 151,Factor-Based Linear Discriminant Analysis,RFlda,Classification,HiDimDA,q 153 | 152,Random Forest Rule-Based Model,rfRules,Dual Use,"randomForest, inTrees, plyr","mtry, maxdepth" 154 | 153,Ridge Regression,ridge,Regression,elasticnet,lambda 155 | 154,Random k-Nearest Neighbors,rknn,Dual Use,rknn,"k, mtry" 156 | 155,Random k-Nearest Neighbors with Feature Selection,rknnBel,Dual Use,"rknn, plyr","k, mtry, d" 157 | 156,Robust Linear Model,rlm,Regression,MASS,None 158 | 157,Robust Mixture Discriminant Analysis,rmda,Classification,robustDA,"K, model" 159 | 158,ROC-Based Classifier,rocc,Classification,rocc,xgenes 160 | 159,Rotation Forest,rotationForest,Classification,rotationForest,"K, L" 161 | 160,Rotation Forest,rotationForestCp,Classification,"rpart, plyr, rotationForest","K, L, cp" 162 | 161,CART,rpart,Dual Use,rpart,cp 163 | 162,CART,rpart2,Dual Use,rpart,maxdepth 164 | 163,Cost-Sensitive CART,rpartCost,Classification,rpart,"cp, Cost" 165 | 164,Quantile Regression with LASSO penalty,rqlasso,Regression,rqPen,lambda 166 | 165,Non-Convex Penalized Quantile Regression,rqnc,Regression,rqPen,"lambda, penalty" 167 | 166,Regularized Random Forest,RRF,Dual Use,"randomForest, RRF","mtry, coefReg, coefImp" 168 | 167,Regularized Random Forest,RRFglobal,Dual Use,RRF,"mtry, coefReg" 169 | 168,Robust Regularized Linear Discriminant Analysis,rrlda,Classification,rrlda,"lambda, hp, penalty" 170 | 169,Robust SIMCA,RSimca,Classification,rrcovHD,None 171 | 170,Relevance Vector Machines with Linear Kernel,rvmLinear,Regression,kernlab,None 172 | 171,Relevance Vector Machines with Polynomial Kernel,rvmPoly,Regression,kernlab,"scale, degree" 173 | 172,Relevance Vector Machines with Radial Basis Function Kernel,rvmRadial,Regression,kernlab,sigma 174 | 173,Subtractive Clustering and Fuzzy c-Means Rules,SBC,Regression,frbs,"r.a, eps.high, eps.low" 175 | 174,Shrinkage Discriminant Analysis,sda,Classification,sda,"diagonal, lambda" 176 | 175,Stepwise Diagonal Linear Discriminant Analysis,sddaLDA,Classification,SDDA,None 177 | 176,Stepwise Diagonal Quadratic Discriminant Analysis,sddaQDA,Classification,SDDA,None 178 | 177,Sparse Distance Weighted Discrimination,sdwd,Classification,sdwd,"lambda, lambda2" 179 | 178,Partial Least Squares,simpls,Dual Use,pls,ncomp 180 | 179,Fuzzy Rules Using the Structural Learning Algorithm on Vague 
Environment,SLAVE,Classification,frbs,"num.labels, max.iter, max.gen" 181 | 180,Stabilized Linear Discriminant Analysis,slda,Classification,ipred,None 182 | 181,Sparse Mixture Discriminant Analysis,smda,Classification,sparseLDA,"NumVars, lambda, R" 183 | 182,Stabilized Nearest Neighbor Classifier,snn,Classification,snn,lambda 184 | 183,Sparse Linear Discriminant Analysis,sparseLDA,Classification,sparseLDA,"NumVars, lambda" 185 | 184,Sparse Partial Least Squares,spls,Dual Use,spls,"K, eta, kappa" 186 | 185,Linear Discriminant Analysis with Stepwise Feature Selection,stepLDA,Classification,"klaR, MASS","maxvar, direction" 187 | 186,Quadratic Discriminant Analysis with Stepwise Feature Selection,stepQDA,Classification,"klaR, MASS","maxvar, direction" 188 | 187,Supervised Principal Component Analysis,superpc,Regression,superpc,"threshold, n.components" 189 | 188,Support Vector Machines with Boundrange String Kernel,svmBoundrangeString,Dual Use,kernlab,"length, C" 190 | 189,Support Vector Machines with Exponential String Kernel,svmExpoString,Dual Use,kernlab,"lambda, C" 191 | 190,Support Vector Machines with Linear Kernel,svmLinear,Dual Use,kernlab,C 192 | 191,Support Vector Machines with Linear Kernel,svmLinear2,Dual Use,e1071,cost 193 | 192,Support Vector Machines with Polynomial Kernel,svmPoly,Dual Use,kernlab,"degree, scale, C" 194 | 193,Support Vector Machines with Radial Basis Function Kernel,svmRadial,Dual Use,kernlab,"sigma, C" 195 | 194,Support Vector Machines with Radial Basis Function Kernel,svmRadialCost,Dual Use,kernlab,C 196 | 195,Support Vector Machines with Class Weights,svmRadialWeights,Classification,kernlab,"sigma, C, Weight" 197 | 196,Support Vector Machines with Spectrum String Kernel,svmSpectrumString,Dual Use,kernlab,"length, C" 198 | 197,Tree Augmented Naive Bayes Classifier,tan,Classification,bnclassify,"score, smooth" 199 | 198,Tree Augmented Naive Bayes Classifier Structure Learner Wrapper,tanSearch,Classification,bnclassify,"k, epsilon, smooth, final_smooth, sp" 200 | 199,Bagged CART,treebag,Dual Use,"ipred, plyr, e1071",None 201 | 200,Variational Bayesian Multinomial Probit Regression,vbmpRadial,Classification,vbmp,estimateTheta 202 | 201,Partial Least Squares,widekernelpls,Dual Use,pls,ncomp 203 | 202,Wang and Mendel Fuzzy Rules,WM,Regression,frbs,"num.labels, type.mf" 204 | 203,Weighted Subspace Random Forest,wsrf,Classification,wsrf,mtry 205 | 204,eXtreme Gradient Boosting,xgbLinear,Dual Use,xgboost,"nrounds, lambda, alpha" 206 | 205,eXtreme Gradient Boosting,xgbTree,Dual Use,"xgboost, plyr","nrounds, max_depth, eta" 207 | 206,Self-Organizing Maps,xyf,Dual Use,kohonen,"xdim, ydim, xweight, topo" 208 | -------------------------------------------------------------------------------- /caret-setup/caret-modelLookup-DT.R: -------------------------------------------------------------------------------- 1 | # Get all caret models for regression and classification and output in web browser 2 | # The package DT is required 3 | # See: https://github.com/topepo/caret/blob/master/pkg/caret/R/modelLookup.R 4 | # 5 | # https://github.com/tobigithub/caret-machine-learning 6 | # Tobias Kind (2015) 7 | 8 | require(caret) 9 | # install.packages("DT") 10 | require(DT) 11 | 12 | # this caret function returns the models and their availability 13 | # to perform regression and classification 14 | modelLookup() 15 | #---------------------------------------------- 16 | # modelLookup() 17 | # 'data.frame': 372 obs. of 6 variables: 18 | # $ model : chr "ada" "ada" "ada" "AdaBag" ... 
19 | # $ parameter: Factor w/ 144 levels "iter","maxdepth",..: 1 2 3 4 2 4 2 5 6 8 ... 20 | # $ label : Factor w/ 155 levels "#Trees","Learning Rate",..: 1 3 2 1 3 1 3 4 5 6 ... 21 | # $ forReg : logi FALSE FALSE FALSE FALSE FALSE FALSE ... 22 | # $ forClass : logi TRUE TRUE TRUE TRUE TRUE TRUE ... 23 | # $ probModel: 24 | #---------------------------------------------- 25 | 26 | # number of models returned by modelLookup() 27 | MAX = dim(modelLookup())[1]; 28 | # perform the model lookup 29 | caretModels <- modelLookup() 30 | # coerce into a data frame for web output 31 | caretModels <- as.data.frame(caretModels) 32 | class(caretModels) 33 | 34 | # call web output with correct column names 35 | datatable(caretModels, options = list( 36 | columnDefs = list(list(className = 'dt-left', targets = c(0,1,2,3,4,5,6))), 37 | pageLength = MAX, 38 | order = list(list(0, 'asc'))), 39 | colnames = c('Num','model',' parameter', 'label', 'forReg', 'forClass',' probModel'), 40 | caption = paste('Caret models for regression and classification',Sys.time()), 41 | class = 'cell-border stripe') %>% 42 | formatStyle(2, 43 | background = styleColorBar(1, 'steelblue'), 44 | backgroundSize = '100% 90%', 45 | backgroundRepeat = 'no-repeat', 46 | backgroundPosition = 'center' 47 | ) 48 | 49 | ### END 50 | 51 | # Output will be a sortable table in the web browser and the file index.html 52 | # The table can easily be copied/pasted or saved as CSV or XLS 53 | 54 | # Num model parameter label forReg forClass probModel 55 | #1 ada iter #Trees false true true 56 | #2 ada maxdepth Max Tree Depth false true true 57 | #3 ada nu Learning Rate false true true 58 | -------------------------------------------------------------------------------- /caret-setup/caret-setup-comfort.R: -------------------------------------------------------------------------------- 1 | # Installation of the caret package with ~340 commonly used dependencies 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # installs most of the 340 caret dependencies, the caret book package (AppliedPredictiveModeling) and seven other commonly used packages, but not all of them 6 | mostCommon <- c("caret", "AppliedPredictiveModeling", "ggplot2", "data.table", "plyr", "knitr", "shiny", "xts", "lattice") 7 | install.packages(mostCommon, dependencies = c("Imports", "Depends", "Suggests")) 8 | require(caret); sessionInfo() 9 | 10 | ### END 11 | -------------------------------------------------------------------------------- /caret-setup/caret-setup-deLuxe.R: -------------------------------------------------------------------------------- 1 | # Installation of the caret package with almost 400 required caret dependencies 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # 1) load a few caret packages from Bioconductor; this creates the most trouble 6 | # this is a static solution (not ideal), check the URL below for more info 7 | # https://github.com/topepo/caret/blob/master/release_process/update_pkgs.R 8 | # Answer "n" when asked for updates 9 | source("http://bioconductor.org/biocLite.R") 10 | biocLite() 11 | biocLite(c("arm", "gpls", "logicFS", "vbmp")) 12 | 13 | # 2) install most of the 340 caret dependencies plus seven commonly used packages, but not all of them 14 | # Make sure to allow firewall access for doMPI if needed 15 | mostCommon <- c("caret", "AppliedPredictiveModeling", "ggplot2", "data.table", "plyr", "knitr", "shiny", "xts", "lattice") 16 | install.packages(mostCommon, dependencies = c("Imports", "Depends", "Suggests")) 17 | 18 | # 3) then load caret and check which additional
libraries covering 200 models need to be installed 19 | # warnings will still exist; because caret already loaded a few dependencies, they cannot be updated 20 | # at runtime, which may create errors 21 | require(caret); sessionInfo(); 22 | caretLibs <- unique(unlist(lapply(getModelInfo(), function(x) x$library))) 23 | detach("package:caret", unload=TRUE) 24 | install.packages(caretLibs, dependencies = c("Imports", "Depends", "Suggests")) 25 | 26 | # 4) load packages from R-Forge; "rPython" may be on CRAN and on R-Forge 27 | install.packages(c("CHAID"), repos="http://R-Forge.R-project.org") 28 | 29 | # 5) Restart R, clean up the mess, and say 'y' when asked 30 | # All packages that are not on CRAN, such as SDDA, need to be installed by hand 31 | source("http://bioconductor.org/biocLite.R") 32 | biocLite() 33 | biocLite(c("gpls", "logicFS", "vbmp")) 34 | 35 | # "Warning: cannot remove prior installation of package" 36 | # in case of final installation issues, check packages plyr, MASS and ggplot2 37 | # the library directories may have to be removed manually with Administrator access. 38 | # get the library location with .libPaths() 39 | # R has to be closed and restarted and the two lines below have to be executed 40 | # (additional issues may occur under WIN with doMPI and msmpi.dll) 41 | 42 | ## rP <- c("plyr","ggplot2","MASS") 43 | ## install.packages(rP, dependencies = c("Imports", "Depends", "Suggests")) 44 | 45 | # the final straw: after everything is messed up, restart R and do it again 46 | # install.packages("caret", dependencies = c("Imports", "Depends", "Suggests")) 47 | ### END 48 | 49 | -------------------------------------------------------------------------------- /caret-setup/caret-simple-setup.R: -------------------------------------------------------------------------------- 1 | # Installation of the caret package with most dependencies 2 | # https://github.com/tobigithub/caret-machine-learning 3 | # Tobias Kind (2015) 4 | 5 | # installs most of the 300 caret dependencies but not all of them 6 | install.packages("caret", dependencies = c("Imports", "Depends", "Suggests")) 7 | 8 | ### END 9 | 10 | -------------------------------------------------------------------------------- /caret-tune/caret-tune-evolutionial-algorithm-svmRadial.R: -------------------------------------------------------------------------------- 1 | # Tune "svmRadial" in caret with an evolutionary algorithm using DEoptim.
2 | # Author: Rafael Ladeira https://github.com/rladeira 3 | # Source: https://github.com/topepo/caret/issues/321 4 | # Only runs on Unix-like systems (Linux, macOS/El Capitan) due to library(doMC) 5 | # https://github.com/tobigithub/caret-machine-learning 6 | # Tobias Kind (2015) 7 | 8 | library(caret) 9 | library(parallel) 10 | library(doMC) 11 | 12 | set.seed(17516) 13 | training_data <- SLC14_1(500) 14 | testing_data <- SLC14_1(10^5) 15 | 16 | registerDoMC(cores = detectCores()) 17 | 18 | svm_fit <- function(x) { 19 | mod <- train(y ~ ., data = training_data, 20 | method = "svmRadial", 21 | preProc = c("center", "scale"), 22 | trControl = trainControl(method = "cv"), 23 | tuneGrid = data.frame(C = 2^x[1], sigma = exp(x[2]))) 24 | getTrainPerf(mod)[, "TrainRMSE"] 25 | } 26 | 27 | library(DEoptim) 28 | library(kernlab) 29 | 30 | ## converged after 31 iterations 31 | svm_de_obj <- DEoptim(fn = svm_fit, 32 | ## test cost values between 2^-5 and 2^10, 33 | ## test sigma values between exp(-5) and 1 34 | lower = c(-5, -5), 35 | upper = c(10, 0), 36 | control = DEoptim.control(reltol = 1e-3, 37 | steptol = 10, 38 | itermax = 100)) 39 | 40 | 41 | fitted_params <- svm_de_obj$optim$bestmem 42 | 43 | svm_model <- train(y ~ ., data = training_data, 44 | method = "svmRadial", 45 | preProc = c("center", "scale"), 46 | trControl = trainControl(method = "cv", number = 10), 47 | tuneGrid = data.frame(C = 2^fitted_params[1], 48 | sigma = exp(fitted_params[2]))) 49 | 50 | predictions <- predict(svm_model, testing_data) 51 | 52 | cat("Train RMSE:", getTrainPerf(svm_model)[, "TrainRMSE"], "\n") 53 | cat("Test RMSE:", RMSE(predictions, testing_data$y)) 54 | 55 | ## Train RMSE: 5.733844 56 | ## Test RMSE: 5.758587 57 | --------------------------------------------------------------------------------
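# Note: hedged sketch, not part of the original repository. The DEoptim example above registers
# doMC, which is only available on Unix-like systems. On Windows the same tuning should work by
# registering a doParallel backend instead, as the other scripts in this repository do
# (the cluster object name cl is illustrative):
# library(caret); library(DEoptim); library(doParallel)
# cl <- makeCluster(detectCores()); registerDoParallel(cl)
# svm_de_obj <- DEoptim(fn = svm_fit, lower = c(-5, -5), upper = c(10, 0),
#                       control = DEoptim.control(reltol = 1e-3, steptol = 10, itermax = 100))
# stopCluster(cl); registerDoSEQ()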