├── LICENSE
├── README.md
├── geneticXGboost.py
└── xgboost_genetic.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Mohit Jain

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Hyperparameter-tuning-in-XGBoost-using-genetic-algorithm

We will use a genetic algorithm for hyperparameter tuning in XGBoost. The dataset is from
https://archive.ics.uci.edu/ml/machine-learning-databases/musk/.
It contains a set of 102 molecules, of which 39 were identified by human experts as having an
odor that can be used in perfumery and 63 as not having the desired odor.
The dataset contains 6,598 low-energy conformations of these molecules, each described by 166 features.
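
The driver script `xgboost_genetic.py` expects `clean2.data` (the Musk "Version 2" file from the
link above) to be present in the repository root, and reads it roughly as follows (the first two
columns are the molecule and conformation names; the last column is the class label):

```python
import pandas as pd

dataset = pd.read_csv("clean2.data", header=None)
X = dataset.iloc[:, 2:168].values  # 166 numeric conformation features
y = dataset.iloc[:, 168].values    # 1 => desired (musk-like) odor, 0 => undesired odor
```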
--------------------------------------------------------------------------------
/geneticXGboost.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: mohit jain
"""

from sklearn.metrics import f1_score
import numpy as np
import random
import xgboost as xgb
import matplotlib.pyplot as plt


random.seed(723)
np.random.seed(723)


def initilialize_poplulation(numberOfParents):
    learningRate = np.empty([numberOfParents, 1])
    nEstimators = np.empty([numberOfParents, 1], dtype=int)
    maxDepth = np.empty([numberOfParents, 1], dtype=int)
    minChildWeight = np.empty([numberOfParents, 1])
    gammaValue = np.empty([numberOfParents, 1])
    subSample = np.empty([numberOfParents, 1])
    colSampleByTree = np.empty([numberOfParents, 1])

    for i in range(numberOfParents):
        learningRate[i] = round(random.uniform(0.01, 1), 2)
        nEstimators[i] = random.randrange(10, 1500, step=25)
        maxDepth[i] = int(random.randrange(1, 10, step=1))
        minChildWeight[i] = round(random.uniform(0.01, 10.0), 2)
        gammaValue[i] = round(random.uniform(0.01, 10.0), 2)
        subSample[i] = round(random.uniform(0.01, 1.0), 2)
        colSampleByTree[i] = round(random.uniform(0.01, 1.0), 2)

    population = np.concatenate((learningRate, nEstimators, maxDepth, minChildWeight,
                                 gammaValue, subSample, colSampleByTree), axis=1)
    return population


# fitness function: the weighted F1-score of the predictions
def fitness_f1score(y_true, y_pred):
    fitness = round(f1_score(y_true, y_pred, average='weighted'), 4)
    return fitness


# train the data and find the fitness score of each population member
def train_population(population, dMatrixTrain, dMatrixtest, y_test):
    fScore = []
    for i in range(population.shape[0]):
        param = {'objective': 'binary:logistic',
                 'learning_rate': population[i][0],
                 'n_estimators': population[i][1],  # note: xgb.train does not use this parameter; num_round below controls the boosting rounds
                 'max_depth': int(population[i][2]),
                 'min_child_weight': population[i][3],
                 'gamma': population[i][4],
                 'subsample': population[i][5],
                 'colsample_bytree': population[i][6],
                 'seed': 24}
        num_round = 100
        xgbT = xgb.train(param, dMatrixTrain, num_round)
        preds = xgbT.predict(dMatrixtest)
        preds = preds > 0.5
        fScore.append(fitness_f1score(y_test, preds))
    return fScore


# select parents for mating
def new_parents_selection(population, fitness, numParents):
    selectedParents = np.empty((numParents, population.shape[1]))  # array to store the fittest parents

    # find the top performing parents
    for parentId in range(numParents):
        bestFitnessId = np.where(fitness == np.max(fitness))
        bestFitnessId = bestFitnessId[0][0]
        selectedParents[parentId, :] = population[bestFitnessId, :]
        fitness[bestFitnessId] = -1  # set this value to negative (F1-scores are non-negative) so this parent is not selected again
    return selectedParents

'''
Mate these parents to create children having parameters from both parents (we are using the uniform crossover method).
'''
def crossover_uniform(parents, childrenSize):

    crossoverPointIndex = np.arange(0, np.uint8(childrenSize[1]), 1, dtype=np.uint8)  # get all the indexes
    crossoverPointIndex1 = np.random.randint(0, np.uint8(childrenSize[1]),
                                             np.uint8(childrenSize[1] / 2))  # select half of the indexes randomly
    crossoverPointIndex2 = np.array(list(set(crossoverPointIndex) - set(crossoverPointIndex1)))  # select the leftover indexes

    children = np.empty(childrenSize)

    '''
    Create each child by choosing parameters from the two parents selected by the new_parents_selection function.
    The parameter values are picked from the index sets that were randomly drawn above.
    '''
    for i in range(childrenSize[0]):

        # find parent 1 index
        parent1_index = i % parents.shape[0]
        # find parent 2 index
        parent2_index = (i + 1) % parents.shape[0]
        # insert parameters from parent 1 at the randomly selected indexes
        children[i, crossoverPointIndex1] = parents[parent1_index, crossoverPointIndex1]
        # insert parameters from parent 2 at the remaining indexes
        children[i, crossoverPointIndex2] = parents[parent2_index, crossoverPointIndex2]
    return children

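
# A worked illustration of the uniform crossover above (hypothetical numbers):
# if   parent 1 = [0.30, 500, 6, 2.0, 1.5, 0.8, 0.9]
# and  parent 2 = [0.10, 300, 3, 4.0, 0.5, 0.6, 0.7]
# and the randomly drawn index set for parent 1 is {1, 4, 6}, the child takes positions
# 1, 4 and 6 from parent 1 and the remaining positions from parent 2:
#      child    = [0.10, 500, 3, 4.0, 1.5, 0.6, 0.9]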
'''
Introduce some mutation in the children. For XGBoost we introduce mutation randomly on one parameter
at a time, based on which parameter is selected at random. First we define the maximum/minimum value
allowed for each parameter, to prevent out-of-range errors at runtime. Then we generate a mutation
value, add it to the selected parameter, and return the mutated offspring.
'''

def mutation(crossover, numberOfParameters):
    # Define the minimum and maximum values allowed for each parameter
    minMaxValue = np.zeros((numberOfParameters, 2))

    minMaxValue[0, :] = [0.01, 1.0]    # min/max learning_rate
    minMaxValue[1, :] = [10, 2000]     # min/max n_estimators
    minMaxValue[2, :] = [1, 15]        # min/max max_depth
    minMaxValue[3, :] = [0, 10.0]      # min/max min_child_weight
    minMaxValue[4, :] = [0.01, 10.0]   # min/max gamma
    minMaxValue[5, :] = [0.01, 1.0]    # min/max subsample
    minMaxValue[6, :] = [0.01, 1.0]    # min/max colsample_bytree

    # Mutation changes a single gene (parameter) in each offspring randomly.
    mutationValue = 0
    parameterSelect = np.random.randint(0, 7)
    if parameterSelect == 0:    # learning_rate
        mutationValue = round(np.random.uniform(-0.5, 0.5), 2)
    if parameterSelect == 1:    # n_estimators
        mutationValue = np.random.randint(-200, 200)
    if parameterSelect == 2:    # max_depth
        mutationValue = np.random.randint(-5, 5)
    if parameterSelect == 3:    # min_child_weight
        mutationValue = round(np.random.uniform(-5, 5), 2)
    if parameterSelect == 4:    # gamma
        mutationValue = round(np.random.uniform(-2, 2), 2)
    if parameterSelect == 5:    # subsample
        mutationValue = round(np.random.uniform(-0.5, 0.5), 2)
    if parameterSelect == 6:    # colsample_bytree
        mutationValue = round(np.random.uniform(-0.5, 0.5), 2)

    # introduce the mutation by changing one parameter, clamping to the min/max if it goes out of range
    for idx in range(crossover.shape[0]):
        crossover[idx, parameterSelect] = crossover[idx, parameterSelect] + mutationValue
        if crossover[idx, parameterSelect] > minMaxValue[parameterSelect, 1]:
            crossover[idx, parameterSelect] = minMaxValue[parameterSelect, 1]
        if crossover[idx, parameterSelect] < minMaxValue[parameterSelect, 0]:
            crossover[idx, parameterSelect] = minMaxValue[parameterSelect, 0]
    return crossover


'''
This function generates a heatmap of a parameter (or the fitness) across parents and generations,
to visualize how each parameter and the fitness change with each generation.
'''

def plot_parameters(numberOfGenerations, numberOfParents, parameter, parameterName):
    # inspired by https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html
    generationList = ["Gen {}".format(i) for i in range(numberOfGenerations + 1)]
    populationList = ["Parent {}".format(i) for i in range(numberOfParents)]

    fig, ax = plt.subplots()
    im = ax.imshow(parameter, cmap=plt.get_cmap('YlOrBr'))

    # show ticks
    ax.set_xticks(np.arange(len(populationList)))
    ax.set_yticks(np.arange(len(generationList)))

    # show labels
    ax.set_xticklabels(populationList)
    ax.set_yticklabels(generationList)

    # rotate the x tick labels 45 degrees around their anchor
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # write the value of the parameter in each cell
    for i in range(len(generationList)):
        for j in range(len(populationList)):
            text = ax.text(j, i, parameter[i, j],
                           ha="center", va="center", color="k")

    ax.set_title("Change in the value of " + parameterName)
    fig.tight_layout()
    plt.show()
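

# --- Illustrative smoke test (an addition for demonstration; not part of the original pipeline). ---
# Runs the GA operators above on a tiny toy population with made-up fitness scores, so the module
# can be sanity-checked without the Musk dataset or any trained model.
if __name__ == "__main__":
    demoPopulation = initilialize_poplulation(4)        # 4 random parameter sets
    demoFitness = [0.60, 0.90, 0.70, 0.50]              # placeholder fitness values
    demoParents = new_parents_selection(demoPopulation, demoFitness, numParents=2)
    demoChildren = crossover_uniform(demoParents, childrenSize=(2, 7))
    demoChildren = mutation(demoChildren, numberOfParameters=7)
    print("selected parents:\n", demoParents)
    print("mutated children:\n", demoChildren)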
--------------------------------------------------------------------------------
/xgboost_genetic.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: mohit jain
"""

'''
We will use a genetic algorithm to optimize hyperparameters for XGBoost.
'''

# Importing the libraries
import numpy as np
import pandas as pd
import geneticXGboost  # genetic algorithm module
import xgboost as xgb


np.random.seed(723)
'''
The dataset is from https://archive.ics.uci.edu/ml/machine-learning-databases/musk/
It contains a set of 102 molecules, of which 39 were identified by human experts as having an
odor that can be used in perfumery and 63 as not having the desired odor.
The dataset contains 6,598 low-energy conformations of these molecules, each described by 166 features.
'''

# Importing the dataset
dataset = pd.read_csv('clean2.data', header=None)

X = dataset.iloc[:, 2:168].values  # discard the first two columns (molecule name and conformation name)

y = dataset.iloc[:, 168].values  # extract the last column as the class (1 => desired odor, 0 => undesired odor)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=97)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# XGBoost classifier: use the native xgboost API
xgDMatrix = xgb.DMatrix(X_train, y_train)  # create the training DMatrix
xgbDMatrixTest = xgb.DMatrix(X_test, y_test)  # create the test DMatrix


'''
Let's find optimized parameters using the genetic algorithm
'''

numberOfParents = 8          # number of parents to start with
numberOfParentsMating = 4    # number of parents that will mate
numberOfParameters = 7       # number of parameters that will be optimized
numberOfGenerations = 4      # number of generations that will be created

# define the population size
populationSize = (numberOfParents, numberOfParameters)

# initialize the population with randomly generated parameters
population = geneticXGboost.initilialize_poplulation(numberOfParents)

# define an array to store the fitness history
fitnessHistory = np.empty([numberOfGenerations + 1, numberOfParents])

# define an array to store the value of each parameter for each parent and generation
populationHistory = np.empty([(numberOfGenerations + 1) * numberOfParents, numberOfParameters])

# insert the values of the initial parameters into the history
populationHistory[0:numberOfParents, :] = population

for generation in range(numberOfGenerations):
    print("This is generation number %s" % (generation))

    # train the population and obtain its fitness
    fitnessValue = geneticXGboost.train_population(population=population, dMatrixTrain=xgDMatrix,
                                                   dMatrixtest=xgbDMatrixTest, y_test=y_test)
    fitnessHistory[generation, :] = fitnessValue

    # best score in the current iteration
    print('Best F1 score in this iteration = {}'.format(np.max(fitnessHistory[generation, :])))

    # survival of the fittest - take the top parents, based on the fitness value and the number of parents to be selected
    parents = geneticXGboost.new_parents_selection(population=population, fitness=fitnessValue,
                                                   numParents=numberOfParentsMating)

    # mate these parents to create children having parameters from these parents (we are using uniform crossover)
    children = geneticXGboost.crossover_uniform(parents=parents,
                                                childrenSize=(populationSize[0] - parents.shape[0], numberOfParameters))

    # add mutation to create genetic diversity
    children_mutated = geneticXGboost.mutation(children, numberOfParameters)

    '''
    Create the new population: it contains the parents that were selected previously based on their
    fitness score, and the rest of the slots are filled with the children.
    '''
    population[0:parents.shape[0], :] = parents  # fittest parents
    population[parents.shape[0]:, :] = children_mutated  # children

    populationHistory[(generation + 1) * numberOfParents: (generation + 1) * numberOfParents + numberOfParents, :] = population  # store parent information


# Best solution from the final iteration

fitness = geneticXGboost.train_population(population=population, dMatrixTrain=xgDMatrix,
                                          dMatrixtest=xgbDMatrixTest, y_test=y_test)
fitnessHistory[generation + 1, :] = fitness

# index of the best solution
bestFitnessIndex = np.where(fitness == np.max(fitness))[0][0]

# Best fitness
print("Best fitness is =", fitness[bestFitnessIndex])

# Best parameters
print("Best parameters are:")
print('learning_rate', population[bestFitnessIndex][0])
print('n_estimators', population[bestFitnessIndex][1])
print('max_depth', int(population[bestFitnessIndex][2]))
print('min_child_weight', population[bestFitnessIndex][3])
print('gamma', population[bestFitnessIndex][4])
print('subsample', population[bestFitnessIndex][5])
print('colsample_bytree', population[bestFitnessIndex][6])


# visualize how the fitness changes across generations and parents
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, fitnessHistory, "fitness (F1-score)")

# Look at how the individual parameters change with each generation
# Create an array for each parameter's history (generations x parents)
learnigRateHistory = populationHistory[:, 0].reshape([numberOfGenerations + 1, numberOfParents])
nEstimatorHistory = populationHistory[:, 1].reshape([numberOfGenerations + 1, numberOfParents])
maxdepthHistory = populationHistory[:, 2].reshape([numberOfGenerations + 1, numberOfParents])
minChildWeightHistory = populationHistory[:, 3].reshape([numberOfGenerations + 1, numberOfParents])
gammaHistory = populationHistory[:, 4].reshape([numberOfGenerations + 1, numberOfParents])
subsampleHistory = populationHistory[:, 5].reshape([numberOfGenerations + 1, numberOfParents])
colsampleByTreeHistory = populationHistory[:, 6].reshape([numberOfGenerations + 1, numberOfParents])

# generate a heatmap for each parameter
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, learnigRateHistory, "learning rate")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, nEstimatorHistory, "n_estimators")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, maxdepthHistory, "maximum depth")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, minChildWeightHistory, "minimum child weight")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, gammaHistory, "gamma")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, subsampleHistory, "subsample")
geneticXGboost.plot_parameters(numberOfGenerations, numberOfParents, colsampleByTreeHistory, "colsample_bytree")
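

# --- Optional follow-up (a sketch added for illustration; not in the original script). ---
# Refit a single booster with the best parameter set found above and report its test F1-score,
# mirroring what geneticXGboost.train_population does for each population member.
bestParam = {'objective': 'binary:logistic',
             'learning_rate': population[bestFitnessIndex][0],
             'n_estimators': population[bestFitnessIndex][1],
             'max_depth': int(population[bestFitnessIndex][2]),
             'min_child_weight': population[bestFitnessIndex][3],
             'gamma': population[bestFitnessIndex][4],
             'subsample': population[bestFitnessIndex][5],
             'colsample_bytree': population[bestFitnessIndex][6],
             'seed': 24}
finalBooster = xgb.train(bestParam, xgDMatrix, num_boost_round=100)
finalPreds = finalBooster.predict(xgbDMatrixTest) > 0.5
print("F1-score of the refit booster =", geneticXGboost.fitness_f1score(y_test, finalPreds))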
--------------------------------------------------------------------------------