├── README.md
└── xgboost_Classification.R

/README.md:
--------------------------------------------------------------------------------
# DataAnalysis
Data Analysis examples

--------------------------------------------------------------------------------
/xgboost_Classification.R:
--------------------------------------------------------------------------------
# Date: Jan 2016
# xgboost_Classification.R
# This is an example of an xgboost model using the iris data available in base R.
# To run this code, you need to have the xgboost package installed. You do not have to read in any data files.

# Predict the Species from the 4 features of the iris data.
# The data contains numeric predictors. Our target column is Species, with 3 classes.

# Note: This uses a two-step process.
# Step 1 performs cross-validation to find the number of iterations needed to reach the minimum loss.
# Step 2 creates the final model using the nrounds identified in Step 1, and makes the prediction.
#
# Also note that I have skipped a few pre-modeling steps:
# data exploration, handling outliers, and handling/imputing null predictors.

# Load the required libraries.
library(xgboost)
library(caret)   # for confusionMatrix

# Check the data structure.
data(iris)
str(iris)

# Split the iris data into training (70%) and testing (30%) sets.
set.seed(100)
ind      = sample(nrow(iris), nrow(iris) * 0.7)
training = iris[ind, ]
testing  = iris[-ind, ]

# Set the parameters for cross-validation and xgboost.
# Note: This is a multi-class classification problem, and the evaluation metric is "mlogloss".
#       The same parameters are used by Step 1 and Step 2.
#       You can try different values for nthread, max_depth, eta, gamma, etc., and see if you get a lower prediction error.

param = list("objective"        = "multi:softmax", # multi-class classification
             "num_class"        = 3,               # number of classes in the dependent variable
             "eval_metric"      = "mlogloss",      # evaluation metric
             "nthread"          = 8,               # number of threads to be used
             "max_depth"        = 16,              # maximum depth of a tree
             "eta"              = 0.3,             # step size shrinkage
             "gamma"            = 0,               # minimum loss reduction
             "subsample"        = 0.7,             # fraction of data instances used to grow each tree
             "colsample_bytree" = 1,               # subsample ratio of columns when constructing each tree
             "min_child_weight" = 12               # minimum sum of instance weight needed in a child
)

# Identify the predictors and the dependent variable, aka the label.
predictors = colnames(training[-ncol(training)])

# xgboost works only if the labels are numeric. Hence, convert the labels (Species) to numeric.
label = as.numeric(training[, ncol(training)])
print(table(label))

# Alas, xgboost also requires the numeric labels to start from 0. Hence, subtract 1 from the label.
label = as.numeric(training[, ncol(training)]) - 1
print(table(label))

#########################################################################################################
# Step 1: Run a cross-validation to identify the round with the minimum loss or error.
# Note: xgboost expects the data in the form of a numeric matrix.

set.seed(100)

cv.nround = 200   # Number of rounds. This can be set to a lower or higher value if you wish, e.g. 150, 250 or 300.
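
# Optional sketch (not part of the original script): newer xgboost releases encourage bundling
# the feature matrix and the label into a single xgb.DMatrix object. It is not used by the steps
# below, which keep the original matrix-plus-label form, but the same object could be passed as
# data = dtrain to xgb.cv() and xgboost() instead.
dtrain = xgb.DMatrix(data = as.matrix(training[, predictors]), label = label)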
bst.cv = xgb.cv(
        param      = param,
        data       = as.matrix(training[, predictors]),
        label      = label,
        nfold      = 3,
        nrounds    = cv.nround,
        prediction = TRUE)

# Find the round where the minimum logloss occurred.
# Note: the xgboost version current in early 2016 returns the CV history in bst.cv$dt;
#       newer releases return it in bst.cv$evaluation_log, with the column named test_mlogloss_mean.
min.loss.idx = which.min(bst.cv$dt[, test.mlogloss.mean])
cat("Minimum logloss occurred in round: ", min.loss.idx, "\n")

# Minimum logloss.
print(bst.cv$dt[min.loss.idx, ])

##############################################################################################################################
# Step 2: Train the xgboost model using the min.loss.idx found above.
# Note: we have to stop at the round where we get the minimum error.
set.seed(100)

bst = xgboost(
        param   = param,
        data    = as.matrix(training[, predictors]),
        label   = label,
        nrounds = min.loss.idx)

# Make predictions on the testing data.
testing$prediction = predict(bst, as.matrix(testing[, predictors]))

# Translate the predictions back to the original classes, i.e. Species.
testing$prediction = ifelse(testing$prediction == 0, "setosa",
                     ifelse(testing$prediction == 1, "versicolor", "virginica"))

# Compute the accuracy of the predictions.
# Note: recent versions of caret require both arguments to be factors with the same levels.
confusionMatrix(factor(testing$prediction, levels = levels(testing$Species)), testing$Species)

#################################################################################################################################
# Extra: Use another model for the same prediction
# (randomForest with cross-validation, using the caret package; requires the randomForest package to be installed).

set.seed(100)
train_control = trainControl(method = "cv", number = 10)
model.rf      = train(Species ~ ., data = training, trControl = train_control, method = "rf")

testing$prediction.rf = predict(model.rf, testing[, predictors])

# Compute the accuracy of the predictions.
confusionMatrix(testing$prediction.rf, testing$Species)
################################################################################################################################
####################################
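
# Optional sketch (not part of the original script): the same xgboost model with objective
# "multi:softprob", which returns per-class probabilities instead of hard class labels.
# It simply reuses the parameters and the min.loss.idx found above; treat it as an
# illustration, not a tuned model.

param.prob           = param
param.prob$objective = "multi:softprob"

set.seed(100)
bst.prob = xgboost(
        params  = param.prob,
        data    = as.matrix(training[, predictors]),
        label   = label,
        nrounds = min.loss.idx)

# predict() returns a vector of length nrow(testing) * num_class; reshape it into a
# probability matrix with one row per observation and one column per class.
prob = matrix(predict(bst.prob, as.matrix(testing[, predictors])), ncol = 3, byrow = TRUE)
colnames(prob) = levels(iris$Species)
head(prob)

# The hard class prediction is the column with the highest probability in each row.
pred.class = levels(iris$Species)[max.col(prob)]
head(pred.class)
--------------------------------------------------------------------------------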