├── README.md
├── bank-additional-full.csv
├── gaFeatureSelectionExample.py
├── testSetClassificationAccuracyVsContinuum.png
├── validationSetClassificationAccuracyVsContinuum.png
└── validationSetClassificationAccuracyVsContinuumWithCubicSplineInterpolation.png

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

This script demonstrates how Genetic Algorithms can be used to find "optimal" feature subsets for machine learning problems.

## Links
A YouTube video in which I briefly review this project can be found [here](https://www.youtube.com/watch?v=COLO7cGP2sA).

## Inspiration
While working with a data set that I had created, I found myself wondering which measurements should be included as features. I didn't want to leave anything out, but I also didn't want to include so many features that the performance of some machine learning algorithms would suffer. While reading an academic paper, I came across the concept of using Genetic Algorithms to find optimal feature subsets, so I decided to implement an example of the technique.

## The Future
In the future, I may write a class specifically to facilitate the feature selection process. This would allow the Genetic Algorithm method of feature selection to be applied "out of the box" to machine learning problems.
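As a rough illustration, such a class might expose a scikit-learn-style `fit`/`transform` interface wrapping the same DEAP machinery used in `gaFeatureSelectionExample.py`. The sketch below is hypothetical: the `GeneticFeatureSelector` name, its parameters, and the 3-fold cross-validated fitness are my own choices rather than anything in this repository, and it assumes a numeric NumPy feature matrix.

```python
import random
import numpy as np
from deap import creator, base, tools, algorithms
from sklearn.base import clone
from sklearn.model_selection import cross_val_score

class GeneticFeatureSelector:
    """Pick a feature subset for an estimator with a simple genetic algorithm."""

    def __init__(self, estimator, n_population=50, n_generations=10, cxpb=0.5, mutpb=0.2):
        self.estimator = estimator
        self.n_population = n_population
        self.n_generations = n_generations
        self.cxpb = cxpb
        self.mutpb = mutpb

    def fit(self, X, y):
        n_features = X.shape[1]

        # Fitness of a 0/1 mask: cross-validated accuracy of the estimator
        # trained on only the selected columns
        def evaluate(individual):
            mask = np.asarray(individual, dtype=bool)
            if not mask.any():
                return (0.0,)  # an empty subset is the worst possible choice
            return (cross_val_score(clone(self.estimator), X[:, mask], y, cv=3).mean(),)

        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)
        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat, creator.Individual,
                         toolbox.attr_bool, n_features)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        toolbox.register("evaluate", evaluate)
        toolbox.register("mate", tools.cxOnePoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
        toolbox.register("select", tools.selTournament, tournsize=3)

        # Run the GA and keep the single best individual seen
        hof = tools.HallOfFame(1)
        algorithms.eaSimple(toolbox.population(n=self.n_population), toolbox,
                            cxpb=self.cxpb, mutpb=self.mutpb,
                            ngen=self.n_generations, halloffame=hof, verbose=False)
        self.support_ = np.asarray(hof[0], dtype=bool)
        return self

    def transform(self, X):
        return X[:, self.support_]
```

With an interface like that, feature selection could be dropped in much like any other transformer, e.g. `GeneticFeatureSelector(LogisticRegression()).fit(X, y).transform(X)`.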
## Other Notes
To plot a curve over the noisy data, I used Cubic-Spline Interpolation. This was my first time using the method, and I suspect there are better ways to plot such a curve. In my limited experience, Cubic-Spline Interpolation can produce curves with unnecessary "bends" in them.
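As a toy illustration (not code from this repository), the smoothing factor `s` accepted by scipy's `splrep` controls how closely the spline tracks the noisy points; raising it (the main script currently uses `s=5.0`) is one way to trade fidelity for a smoother curve with fewer spurious bends.

```python
import numpy as np
from scipy import interpolate
import matplotlib.pyplot as plt

# Noisy samples of a smooth underlying curve
x = np.linspace(0, 1, 50)
y = np.sin(2 * np.pi * x) + np.random.normal(scale=0.2, size=x.size)

plt.scatter(x, y, color='gray', label='noisy data')
for s in (0.5, 2.0, 8.0):
    tck = interpolate.splrep(x, y, s=s)  # larger s => smoother spline
    plt.plot(x, interpolate.splev(x, tck), label='s = ' + str(s))
plt.legend()
plt.show()
```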
--------------------------------------------------------------------------------
/gaFeatureSelectionExample.py:
--------------------------------------------------------------------------------

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from deap import creator, base, tools, algorithms
from scoop import futures
import random
import numpy
from scipy import interpolate
import matplotlib.pyplot as plt

# Read in data from CSV
# Data set from https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
dfData = pd.read_csv('bank-additional-full.csv', sep=';')

# Encode the classification labels to numbers
# Get classes and one hot encoded feature vectors
le = LabelEncoder()
le.fit(dfData['y'])
allClasses = le.transform(dfData['y'])
allFeatures = dfData.drop(['y'], axis=1)

# Form training, test, and validation sets
X_trainAndTest, X_validation, y_trainAndTest, y_validation = train_test_split(allFeatures, allClasses, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_trainAndTest, y_trainAndTest, test_size=0.20, random_state=42)

# Feature subset fitness function
def getFitness(individual, X_train, X_test, y_train, y_test):

    # Parse out the feature columns that this individual does not use
    # Apply one hot encoding to the remaining features
    cols = [index for index in range(len(individual)) if individual[index] == 0]
    X_trainParsed = X_train.drop(X_train.columns[cols], axis=1)
    X_trainOhFeatures = pd.get_dummies(X_trainParsed)
    X_testParsed = X_test.drop(X_test.columns[cols], axis=1)
    X_testOhFeatures = pd.get_dummies(X_testParsed)

    # Remove any columns that aren't in both the training and test sets
    sharedFeatures = set(X_trainOhFeatures.columns) & set(X_testOhFeatures.columns)
    removeFromTrain = set(X_trainOhFeatures.columns) - sharedFeatures
    removeFromTest = set(X_testOhFeatures.columns) - sharedFeatures
    X_trainOhFeatures = X_trainOhFeatures.drop(list(removeFromTrain), axis=1)
    X_testOhFeatures = X_testOhFeatures.drop(list(removeFromTest), axis=1)

    # Apply logistic regression to the data, and calculate accuracy
    clf = LogisticRegression()
    clf.fit(X_trainOhFeatures, y_train)
    predictions = clf.predict(X_testOhFeatures)
    accuracy = accuracy_score(y_test, predictions)

    # Return calculated accuracy as fitness
    return (accuracy,)

#========DEAP GLOBAL VARIABLES (viewable by SCOOP)========

# Create Individual
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Create Toolbox
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, len(dfData.columns) - 1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Continue filling toolbox...
toolbox.register("evaluate", getFitness, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

#========

def getHof():

    # Initialize variables to use eaSimple
    numPop = 100
    numGen = 10
    pop = toolbox.population(n=numPop)
    hof = tools.HallOfFame(numPop * numGen)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", numpy.mean)
    stats.register("std", numpy.std)
    stats.register("min", numpy.min)
    stats.register("max", numpy.max)

    # Launch genetic algorithm
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=numGen, stats=stats, halloffame=hof, verbose=True)

    # Return the hall of fame
    return hof

def getMetrics(hof):

    # Get list of percentiles in the hall of fame
    percentileList = [i / (len(hof) - 1) for i in range(len(hof))]

    # Gather fitness data from each percentile
    testAccuracyList = []
    validationAccuracyList = []
    individualList = []
    for individual in hof:
        testAccuracy = individual.fitness.values
        validationAccuracy = getFitness(individual, X_trainAndTest, X_validation, y_trainAndTest, y_validation)
        testAccuracyList.append(testAccuracy[0])
        validationAccuracyList.append(validationAccuracy[0])
        individualList.append(individual)

    # The hall of fame is ordered best-first, so reverse all three lists together
    # to go from worst to best while keeping individuals aligned with their accuracies
    testAccuracyList.reverse()
    validationAccuracyList.reverse()
    individualList.reverse()
    return testAccuracyList, validationAccuracyList, individualList, percentileList
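
# Note: `futures` is imported from SCOOP above but is not wired into the toolbox,
# so fitness evaluations run serially. If parallel evaluation is wanted, DEAP's
# usual pattern is to register SCOOP's map function, e.g.
#
#     toolbox.register("map", futures.map)
#
# and then launch the script with `python -m scoop gaFeatureSelectionExample.py`.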

if __name__ == '__main__':

    '''
    First, we will apply logistic regression using all the features to acquire a baseline accuracy.
    '''
    individual = [1 for i in range(len(allFeatures.columns))]
    testAccuracy = getFitness(individual, X_train, X_test, y_train, y_test)
    validationAccuracy = getFitness(individual, X_trainAndTest, X_validation, y_trainAndTest, y_validation)
    print('\nTest accuracy with all features: \t' + str(testAccuracy[0]))
    print('Validation accuracy with all features: \t' + str(validationAccuracy[0]) + '\n')

    '''
    Now, we will apply a genetic algorithm to choose a subset of features that gives better accuracy than the baseline.
    '''
    hof = getHof()
    testAccuracyList, validationAccuracyList, individualList, percentileList = getMetrics(hof)

    # Get a list of subsets that performed best on validation data
    maxValAccSubsetIndices = [index for index in range(len(validationAccuracyList)) if validationAccuracyList[index] == max(validationAccuracyList)]
    maxValIndividuals = [individualList[index] for index in maxValAccSubsetIndices]
    maxValSubsets = [[list(allFeatures)[index] for index in range(len(individual)) if individual[index] == 1] for individual in maxValIndividuals]

    print('\n---Optimal Feature Subset(s)---\n')
    for index in range(len(maxValAccSubsetIndices)):
        print('Percentile: \t\t\t' + str(percentileList[maxValAccSubsetIndices[index]]))
        print('Validation Accuracy: \t\t' + str(validationAccuracyList[maxValAccSubsetIndices[index]]))
        print('Individual: \t' + str(maxValIndividuals[index]))
        print('Number Features In Subset: \t' + str(len(maxValSubsets[index])))
        print('Feature Subset: ' + str(maxValSubsets[index]))

    '''
    Now, we plot the test and validation classification accuracy to see how these numbers change as we move
    from our worst feature subsets to the best feature subsets found by the genetic algorithm.
    '''
    # Calculate a smoothed best-fit curve for validation classification accuracy (non-linear)
    tck = interpolate.splrep(percentileList, validationAccuracyList, s=5.0)
    ynew = interpolate.splev(percentileList, tck)

    e = plt.figure(1)
    plt.plot(percentileList, validationAccuracyList, marker='o', color='r')
    plt.plot(percentileList, ynew, color='b')
    plt.title('Validation Set Classification Accuracy vs. \n Continuum with Cubic-Spline Interpolation')
    plt.xlabel('Population Ordered By Increasing Test Set Accuracy')
    plt.ylabel('Validation Set Accuracy')
    e.show()

    f = plt.figure(2)
    plt.scatter(percentileList, validationAccuracyList)
    plt.title('Validation Set Classification Accuracy vs. Continuum')
    plt.xlabel('Population Ordered By Increasing Test Set Accuracy')
    plt.ylabel('Validation Set Accuracy')
    f.show()

    g = plt.figure(3)
    plt.scatter(percentileList, testAccuracyList)
    plt.title('Test Set Classification Accuracy vs. Continuum')
    plt.xlabel('Population Ordered By Increasing Test Set Accuracy')
    plt.ylabel('Test Set Accuracy')
    g.show()

    # Keep the figures open until the user presses Enter
    input()

--------------------------------------------------------------------------------
/testSetClassificationAccuracyVsContinuum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scoliann/GeneticAlgorithmFeatureSelection/87e8fb427fb4257ad6cb206c559e166e71202ac4/testSetClassificationAccuracyVsContinuum.png
--------------------------------------------------------------------------------
/validationSetClassificationAccuracyVsContinuum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scoliann/GeneticAlgorithmFeatureSelection/87e8fb427fb4257ad6cb206c559e166e71202ac4/validationSetClassificationAccuracyVsContinuum.png
--------------------------------------------------------------------------------
/validationSetClassificationAccuracyVsContinuumWithCubicSplineInterpolation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scoliann/GeneticAlgorithmFeatureSelection/87e8fb427fb4257ad6cb206c559e166e71202ac4/validationSetClassificationAccuracyVsContinuumWithCubicSplineInterpolation.png
--------------------------------------------------------------------------------