├── LICENSE
├── README.md
├── geneticXGboost.py
└── xgboost_genetic.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Mohit Jain

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Hyperparameter-tuning-in-XGBoost-using-genetic-algorithm

We will use a genetic algorithm for hyperparameter tuning in XGBoost. The dataset is from
https://archive.ics.uci.edu/ml/machine-learning-databases/musk/.
It contains a set of 102 molecules, of which 39 were identified by human experts as having an
odor that can be used in perfumery and 63 as not having the desired odor.
The dataset contains 6,598 low-energy conformations of these molecules, each described by 166 features.
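
The driver script `xgboost_genetic.py` expects `clean2.data` (the Musk "Version 2" file from the
link above) to be present in the repository root, and reads it roughly as follows (the first two
columns are the molecule and conformation names; the last column is the class label):

```python
import pandas as pd

dataset = pd.read_csv("clean2.data", header=None)
X = dataset.iloc[:, 2:168].values  # 166 numeric conformation features
y = dataset.iloc[:, 168].values    # 1 => desired (musk-like) odor, 0 => undesired odor
```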
--------------------------------------------------------------------------------
/geneticXGboost.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: mohit jain
"""

from sklearn.metrics import f1_score
import numpy as np
import random
import xgboost as xgb
import matplotlib.pyplot as plt


random.seed(723)
np.random.seed(723)


def initilialize_poplulation(numberOfParents):
    learningRate = np.empty([numberOfParents, 1])
    nEstimators = np.empty([numberOfParents, 1], dtype=int)
    maxDepth = np.empty([numberOfParents, 1], dtype=int)
    minChildWeight = np.empty([numberOfParents, 1])
    gammaValue = np.empty([numberOfParents, 1])
    subSample = np.empty([numberOfParents, 1])
    colSampleByTree = np.empty([numberOfParents, 1])

    for i in range(numberOfParents):
        learningRate[i] = round(random.uniform(0.01, 1), 2)
        nEstimators[i] = random.randrange(10, 1500, step=25)
        maxDepth[i] = int(random.randrange(1, 10, step=1))
        minChildWeight[i] = round(random.uniform(0.01, 10.0), 2)
        gammaValue[i] = round(random.uniform(0.01, 10.0), 2)
        subSample[i] = round(random.uniform(0.01, 1.0), 2)
        colSampleByTree[i] = round(random.uniform(0.01, 1.0), 2)

    population = np.concatenate((learningRate, nEstimators, maxDepth, minChildWeight,
                                 gammaValue, subSample, colSampleByTree), axis=1)
    return population


# fitness function: the weighted F1-score of the predictions
def fitness_f1score(y_true, y_pred):
    fitness = round(f1_score(y_true, y_pred, average='weighted'), 4)
    return fitness


# train the data and find the fitness score of each population member
def train_population(population, dMatrixTrain, dMatrixtest, y_test):
    fScore = []
    for i in range(population.shape[0]):
        param = {'objective': 'binary:logistic',
                 'learning_rate': population[i][0],
                 'n_estimators': population[i][1],  # note: xgb.train does not use this parameter; num_round below controls the boosting rounds
                 'max_depth': int(population[i][2]),
                 'min_child_weight': population[i][3],
                 'gamma': population[i][4],
                 'subsample': population[i][5],
                 'colsample_bytree': population[i][6],
                 'seed': 24}
        num_round = 100
        xgbT = xgb.train(param, dMatrixTrain, num_round)
        preds = xgbT.predict(dMatrixtest)
        preds = preds > 0.5
        fScore.append(fitness_f1score(y_test, preds))
    return fScore


# select parents for mating
def new_parents_selection(population, fitness, numParents):
    selectedParents = np.empty((numParents, population.shape[1]))  # array to store the fittest parents

    # find the top performing parents
    for parentId in range(numParents):
        bestFitnessId = np.where(fitness == np.max(fitness))
        bestFitnessId = bestFitnessId[0][0]
        selectedParents[parentId, :] = population[bestFitnessId, :]
        fitness[bestFitnessId] = -1  # set this value to negative (F1-scores are non-negative) so this parent is not selected again
    return selectedParents

'''
Mate these parents to create children having parameters from both parents (we are using the uniform crossover method).
'''
def crossover_uniform(parents, childrenSize):

    crossoverPointIndex = np.arange(0, np.uint8(childrenSize[1]), 1, dtype=np.uint8)  # get all the indexes
    crossoverPointIndex1 = np.random.randint(0, np.uint8(childrenSize[1]),
                                             np.uint8(childrenSize[1] / 2))  # select half of the indexes randomly
    crossoverPointIndex2 = np.array(list(set(crossoverPointIndex) - set(crossoverPointIndex1)))  # select the leftover indexes

    children = np.empty(childrenSize)

    '''
    Create each child by choosing parameters from the two parents selected by the new_parents_selection function.
    The parameter values are picked from the index sets that were randomly drawn above.
    '''
    for i in range(childrenSize[0]):

        # find parent 1 index
        parent1_index = i % parents.shape[0]
        # find parent 2 index
        parent2_index = (i + 1) % parents.shape[0]
        # insert parameters from parent 1 at the randomly selected indexes
        children[i, crossoverPointIndex1] = parents[parent1_index, crossoverPointIndex1]
        # insert parameters from parent 2 at the remaining indexes
        children[i, crossoverPointIndex2] = parents[parent2_index, crossoverPointIndex2]
    return children

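
# A worked illustration of the uniform crossover above (hypothetical numbers):
# if   parent 1 = [0.30, 500, 6, 2.0, 1.5, 0.8, 0.9]
# and  parent 2 = [0.10, 300, 3, 4.0, 0.5, 0.6, 0.7]
# and the randomly drawn index set for parent 1 is {1, 4, 6}, the child takes positions
# 1, 4 and 6 from parent 1 and the remaining positions from parent 2:
#      child    = [0.10, 500, 3, 4.0, 1.5, 0.6, 0.9]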
'''
Introduce some mutation in the children. For XGBoost we introduce mutation randomly on one parameter
at a time, based on which parameter is selected at random. First we define the maximum/minimum value
allowed for each parameter, to prevent out-of-range errors at runtime. Then we generate a mutation
value, add it to the selected parameter, and return the mutated offspring.
'''

def mutation(crossover, numberOfParameters):
    # Define the minimum and maximum values allowed for each parameter
    minMaxValue = np.zeros((numberOfParameters, 2))

    minMaxValue[0, :] = [0.01, 1.0]    # min/max learning_rate
    minMaxValue[1, :] = [10, 2000]     # min/max n_estimators
    minMaxValue[2, :] = [1, 15]        # min/max max_depth
    minMaxValue[3, :] = [0, 10.0]      # min/max min_child_weight
    minMaxValue[4, :] = [0.01, 10.0]   # min/max gamma
    minMaxValue[5, :] = [0.01, 1.0]    # min/max subsample
    minMaxValue[6, :] = [0.01, 1.0]    # min/max colsample_bytree

    # Mutation changes a single gene (parameter) in each offspring randomly.
    mutationValue = 0
    parameterSelect = np.random.randint(0, 7)
    if parameterSelect == 0:    # learning_rate
        mutationValue = round(np.random.uniform(-0.5, 0.5), 2)
    if parameterSelect == 1:    # n_estimators
        mutationValue = np.random.randint(-200, 200)
    if parameterSelect == 2:    # max_depth
        mutationValue = np.random.randint(-5, 5)
    if parameterSelect == 3:    # min_child_weight
        mutationValue = round(np.random.uniform(-5, 5), 2)
    if parameterSelect == 4:    # gamma
        mutationValue = round(np.random.uniform(-2, 2), 2)
    if parameterSelect == 5:    # subsample
        mutationValue = round(np.random.uniform(-0.5, 0.5), 2)
    if parameterSelect == 6:    # colsample_bytree
        mutationValue = round(np.random.uniform(-0.5, 0.5), 2)

    # introduce the mutation by changing one parameter, clamping to the min/max if it goes out of range
    for idx in range(crossover.shape[0]):
        crossover[idx, parameterSelect] = crossover[idx, parameterSelect] + mutationValue
        if crossover[idx, parameterSelect] > minMaxValue[parameterSelect, 1]:
            crossover[idx, parameterSelect] = minMaxValue[parameterSelect, 1]
        if crossover[idx, parameterSelect] < minMaxValue[parameterSelect, 0]:
            crossover[idx, parameterSelect] = minMaxValue[parameterSelect, 0]
    return crossover


'''
This function generates a heatmap of a parameter (or the fitness) across parents and generations,
to visualize how each parameter and the fitness change with each generation.
'''

def plot_parameters(numberOfGenerations, numberOfParents, parameter, parameterName):
    # inspired by https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html
    generationList = ["Gen {}".format(i) for i in range(numberOfGenerations + 1)]
    populationList = ["Parent {}".format(i) for i in range(numberOfParents)]

    fig, ax = plt.subplots()
    im = ax.imshow(parameter, cmap=plt.get_cmap('YlOrBr'))

    # show ticks
    ax.set_xticks(np.arange(len(populationList)))
    ax.set_yticks(np.arange(len(generationList)))

    # show labels
    ax.set_xticklabels(populationList)
    ax.set_yticklabels(generationList)

    # rotate the x tick labels 45 degrees around their anchor
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # write the value of the parameter in each cell
    for i in range(len(generationList)):
        for j in range(len(populationList)):
            text = ax.text(j, i, parameter[i, j],
                           ha="center", va="center", color="k")

    ax.set_title("Change in the value of " + parameterName)
    fig.tight_layout()
    plt.show()
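

# --- Illustrative smoke test (an addition for demonstration; not part of the original pipeline). ---
# Runs the GA operators above on a tiny toy population with made-up fitness scores, so the module
# can be sanity-checked without the Musk dataset or any trained model.
if __name__ == "__main__":
    demoPopulation = initilialize_poplulation(4)        # 4 random parameter sets
    demoFitness = [0.60, 0.90, 0.70, 0.50]              # placeholder fitness values
    demoParents = new_parents_selection(demoPopulation, demoFitness, numParents=2)
    demoChildren = crossover_uniform(demoParents, childrenSize=(2, 7))
    demoChildren = mutation(demoChildren, numberOfParameters=7)
    print("selected parents:\n", demoParents)
    print("mutated children:\n", demoChildren)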
--------------------------------------------------------------------------------
/xgboost_genetic.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: mohit jain
"""

'''
We will use a genetic algorithm to optimize hyperparameters for XGBoost.
'''

# Importing the libraries
import numpy as np
import pandas as pd
import geneticXGboost  # genetic algorithm module
import xgboost as xgb


np.random.seed(723)
'''
The dataset is from https://archive.ics.uci.edu/ml/machine-learning-databases/musk/
It contains a set of 102 molecules, of which 39 were identified by human experts as having an
odor that can be used in perfumery and 63 as not having the desired odor.
The dataset contains 6,598 low-energy conformations of these molecules, each described by 166 features.
'''

# Importing the dataset
dataset = pd.read_csv('clean2.data', header=None)

X = dataset.iloc[:, 2:168].values  # discard the first two columns (molecule name and conformation name)

y = dataset.iloc[:, 168].values  # extract the last column as the class (1 => desired odor, 0 => undesired odor)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=97)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# XGBoost classifier: use the native xgboost API
xgDMatrix = xgb.DMatrix(X_train, y_train)  # create the training DMatrix
xgbDMatrixTest = xgb.DMatrix(X_test, y_test)  # create the test DMatrix


'''
Let's find optimized parameters using the genetic algorithm
'''

numberOfParents = 8          # number of parents to start with
numberOfParentsMating = 4    # number of parents that will mate
numberOfParameters = 7       # number of parameters that will be optimized
numberOfGenerations = 4      # number of generations that will be created

# define the population size
populationSize = (numberOfParents, numberOfParameters)

# initialize the population with randomly generated parameters
population = geneticXGboost.initilialize_poplulation(numberOfParents)

# define an array to store the fitness history
fitnessHistory = np.empty([numberOfGenerations + 1, numberOfParents])

# define an array to store the value of each parameter for each parent and generation
populationHistory = np.empty([(numberOfGenerations + 1) * numberOfParents, numberOfParameters])

# insert the values of the initial parameters into the history
populationHistory[0:numberOfParents, :] = population

for generation in range(numberOfGenerations):
    print("This is generation number %s" % (generation))

    # train the population and obtain its fitness
    fitnessValue = geneticXGboost.train_population(population=population, dMatrixTrain=xgDMatrix,
                                                   dMatrixtest=xgbDMatrixTest, y_test=y_test)
    fitnessHistory[generation, :] = fitnessValue

    # best score in the current iteration
    print('Best F1 score in this iteration = {}'.format(np.max(fitnessHistory[generation, :])))

    # survival of the fittest - take the top parents, based on the fitness value and the number of parents to be selected
    parents = geneticXGboost.new_parents_selection(population=population, fitness=fitnessValue,
                                                   numParents=numberOfParentsMating)

    # mate these parents to create children having parameters from these parents (we are using uniform crossover)
    children = geneticXGboost.crossover_uniform(parents=parents,
                                                childrenSize=(populationSize[0] - parents.shape[0], numberOfParameters))

    # add mutation to create genetic diversity
    children_mutated = geneticXGboost.mutation(children, numberOfParameters)

    '''
    Create the new population: it contains the parents that were selected previously based on their
    fitness score, and the rest of the slots are filled with the children.
    '''
    population[0:parents.shape[0], :] = parents  # fittest parents
    population[parents.shape[0]:, :] = children_mutated  # children

    populationHistory[(generation + 1) * numberOfParents: (generation + 1) * numberOfParents + numberOfParents, :] = population  # store parent information


# Best solution from the final iteration

fitness = geneticXGboost.train_population(population=population, dMatrixTrain=xgDMatrix,
                                          dMatrixtest=xgbDMatrixTest, y_test=y_test)
fitnessHistory[generation + 1, :] = fitness

# index of the best solution
bestFitnessIndex = np.where(fitness == np.max(fitness))[0][0]

# Best fitness
print("Best fitness is =", fitness[bestFitnessIndex])

# Best parameters
print("Best parameters are:")
print('learning_rate', population[bestFitnessIndex][0])
print('n_estimators', population[bestFitnessIndex][1])
print('max_depth', int(population[bestFitnessIndex][2]))
print('min_child_weight', population[bestFitnessIndex][3])
print('gamma', population[bestFitnessIndex][4])
print('subsample', population[bestFitnessIndex][5])
print('colsample_bytree', population[bestFitnessIndex][6])


# visualize how the fitness changes across generations and parents
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, fitnessHistory, "fitness (F1-score)")

# Look at how the individual parameters change with each generation
# Create an array for each parameter's history (generations x parents)
learnigRateHistory = populationHistory[:, 0].reshape([numberOfGenerations + 1, numberOfParents])
nEstimatorHistory = populationHistory[:, 1].reshape([numberOfGenerations + 1, numberOfParents])
maxdepthHistory = populationHistory[:, 2].reshape([numberOfGenerations + 1, numberOfParents])
minChildWeightHistory = populationHistory[:, 3].reshape([numberOfGenerations + 1, numberOfParents])
gammaHistory = populationHistory[:, 4].reshape([numberOfGenerations + 1, numberOfParents])
subsampleHistory = populationHistory[:, 5].reshape([numberOfGenerations + 1, numberOfParents])
colsampleByTreeHistory = populationHistory[:, 6].reshape([numberOfGenerations + 1, numberOfParents])

# generate a heatmap for each parameter
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, learnigRateHistory, "learning rate")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, nEstimatorHistory, "n_estimators")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, maxdepthHistory, "maximum depth")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, minChildWeightHistory, "minimum child weight")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, gammaHistory, "gamma")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, subsampleHistory, "subsample")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, colsampleByTreeHistory, "colsample_bytree")
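

# --- Optional follow-up (a sketch added for illustration; not in the original script). ---
# Refit a single booster with the best parameter set found above and report its test F1-score,
# mirroring what geneticXGboost.train_population does for each population member.
bestParam = {'objective': 'binary:logistic',
             'learning_rate': population[bestFitnessIndex][0],
             'n_estimators': population[bestFitnessIndex][1],
             'max_depth': int(population[bestFitnessIndex][2]),
             'min_child_weight': population[bestFitnessIndex][3],
             'gamma': population[bestFitnessIndex][4],
             'subsample': population[bestFitnessIndex][5],
             'colsample_bytree': population[bestFitnessIndex][6],
             'seed': 24}
finalBooster = xgb.train(bestParam, xgDMatrix, num_boost_round=100)
finalPreds = finalBooster.predict(xgbDMatrixTest) > 0.5
print("F1-score of the refit booster =", geneticXGboost.fitness_f1score(y_test, finalPreds))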
--------------------------------------------------------------------------------