├── README.md
└── xgboost_Classification.R

/README.md:
--------------------------------------------------------------------------------
# DataAnalysis
Data Analysis examples

--------------------------------------------------------------------------------
/xgboost_Classification.R:
--------------------------------------------------------------------------------
# Date: Jan 2016
# xgboost_Classification.R
# This is an example of an xgboost model using the iris data available in base R.
# To run this code, you need to have the xgboost package installed. You do not have to read in any data files.

# Predict the Species from the 4 features of the iris data.
# The data contains numeric predictors. Our target column is Species, with 3 classes.

# Note: This uses a two-step process.
# Step 1 performs cross-validation to find the number of iterations needed to reach the minimum loss.
# Step 2 creates the final model using the nrounds identified in Step 1, and makes the prediction.
#
# Also note that I have skipped a few pre-modeling steps:
# data exploration, handling outliers, and handling/imputing null predictors.

# Load the required libraries.
library(xgboost)
library(caret)   # for confusionMatrix

# Check the data structure.
data(iris)
str(iris)

# Split the iris data into training (70%) and testing (30%) sets.
set.seed(100)
ind      = sample(nrow(iris), nrow(iris) * 0.7)
training = iris[ind, ]
testing  = iris[-ind, ]

# Set the parameters for cross-validation and xgboost.
# Note: This is a multi-class classification problem, and the evaluation metric is "mlogloss".
#       The same parameters are used by Step 1 and Step 2.
#       You can try different values for nthread, max_depth, eta, gamma, etc., and see if you get a lower prediction error.

param = list("objective"        = "multi:softmax", # multi-class classification
             "num_class"        = 3,               # number of classes in the dependent variable
             "eval_metric"      = "mlogloss",      # evaluation metric
             "nthread"          = 8,               # number of threads to be used
             "max_depth"        = 16,              # maximum depth of a tree
             "eta"              = 0.3,             # step size shrinkage
             "gamma"            = 0,               # minimum loss reduction
             "subsample"        = 0.7,             # fraction of data instances used to grow each tree
             "colsample_bytree" = 1,               # subsample ratio of columns when constructing each tree
             "min_child_weight" = 12               # minimum sum of instance weight needed in a child
)

# Identify the predictors and the dependent variable, aka the label.
predictors = colnames(training[-ncol(training)])

# xgboost works only if the labels are numeric. Hence, convert the labels (Species) to numeric.
label = as.numeric(training[, ncol(training)])
print(table(label))

# Alas, xgboost also requires the numeric labels to start from 0. Hence, subtract 1 from the label.
label = as.numeric(training[, ncol(training)]) - 1
print(table(label))

#########################################################################################################
# Step 1: Run a cross-validation to identify the round with the minimum loss or error.
# Note: xgboost expects the data in the form of a numeric matrix.

set.seed(100)

cv.nround = 200   # Number of rounds. This can be set to a lower or higher value if you wish, e.g. 150, 250 or 300.
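
# Optional sketch (not part of the original script): newer xgboost releases encourage bundling
# the feature matrix and the label into a single xgb.DMatrix object. It is not used by the steps
# below, which keep the original matrix-plus-label form, but the same object could be passed as
# data = dtrain to xgb.cv() and xgboost() instead.
dtrain = xgb.DMatrix(data = as.matrix(training[, predictors]), label = label)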
bst.cv = xgb.cv(
        param      = param,
        data       = as.matrix(training[, predictors]),
        label      = label,
        nfold      = 3,
        nrounds    = cv.nround,
        prediction = TRUE)

# Find the round where the minimum logloss occurred.
# Note: the xgboost version current in early 2016 returns the CV history in bst.cv$dt;
#       newer releases return it in bst.cv$evaluation_log, with the column named test_mlogloss_mean.
min.loss.idx = which.min(bst.cv$dt[, test.mlogloss.mean])
cat("Minimum logloss occurred in round: ", min.loss.idx, "\n")

# Minimum logloss.
print(bst.cv$dt[min.loss.idx, ])

##############################################################################################################################
# Step 2: Train the xgboost model using the min.loss.idx found above.
# Note: we have to stop at the round where we get the minimum error.
set.seed(100)

bst = xgboost(
        param   = param,
        data    = as.matrix(training[, predictors]),
        label   = label,
        nrounds = min.loss.idx)

# Make predictions on the testing data.
testing$prediction = predict(bst, as.matrix(testing[, predictors]))

# Translate the predictions back to the original classes, i.e. Species.
testing$prediction = ifelse(testing$prediction == 0, "setosa",
                     ifelse(testing$prediction == 1, "versicolor", "virginica"))

# Compute the accuracy of the predictions.
# Note: recent versions of caret require both arguments to be factors with the same levels.
confusionMatrix(factor(testing$prediction, levels = levels(testing$Species)), testing$Species)

#################################################################################################################################
# Extra: Use another model for the same prediction
# (randomForest with cross-validation, using the caret package; requires the randomForest package to be installed).

set.seed(100)
train_control = trainControl(method = "cv", number = 10)
model.rf      = train(Species ~ ., data = training, trControl = train_control, method = "rf")

testing$prediction.rf = predict(model.rf, testing[, predictors])

# Compute the accuracy of the predictions.
confusionMatrix(testing$prediction.rf, testing$Species)
################################################################################################################################
####################################
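
# Optional sketch (not part of the original script): the same xgboost model with objective
# "multi:softprob", which returns per-class probabilities instead of hard class labels.
# It simply reuses the parameters and the min.loss.idx found above; treat it as an
# illustration, not a tuned model.

param.prob           = param
param.prob$objective = "multi:softprob"

set.seed(100)
bst.prob = xgboost(
        params  = param.prob,
        data    = as.matrix(training[, predictors]),
        label   = label,
        nrounds = min.loss.idx)

# predict() returns a vector of length nrow(testing) * num_class; reshape it into a
# probability matrix with one row per observation and one column per class.
prob = matrix(predict(bst.prob, as.matrix(testing[, predictors])), ncol = 3, byrow = TRUE)
colnames(prob) = levels(iris$Species)
head(prob)

# The hard class prediction is the column with the highest probability in each row.
pred.class = levels(iris$Species)[max.col(prob)]
head(pred.class)
--------------------------------------------------------------------------------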