├── README.md
├── bank-additional-full.csv
├── gaFeatureSelectionExample.py
├── testSetClassificationAccuracyVsContinuum.png
├── validationSetClassificationAccuracyVsContinuum.png
└── validationSetClassificationAccuracyVsContinuumWithCubicSplineInterpolation.png

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

This script demonstrates how Genetic Algorithms can be used to find "optimal" feature subsets for machine learning problems.

## Links
A YouTube video in which I briefly review this project can be found [here](https://www.youtube.com/watch?v=COLO7cGP2sA).

## Inspiration
While working with a data set that I had created, I found myself wondering which measurements should be included as features. I didn't want to leave anything out, but I also didn't want to include so many features that the performance of some machine learning algorithms would suffer. While reading an academic paper, I came across the concept of using Genetic Algorithms to find optimal feature subsets, so I decided to implement an example of the technique.

## The Future
In the future, I may write a class specifically to facilitate the feature selection process. This would allow the Genetic Algorithm method of feature selection to be applied "out of the box" to machine learning problems.
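As a rough illustration, such a class might expose a scikit-learn-style `fit`/`transform` interface wrapping the same DEAP machinery used in `gaFeatureSelectionExample.py`. The sketch below is hypothetical: the `GeneticFeatureSelector` name, its parameters, and the 3-fold cross-validated fitness are my own choices rather than anything in this repository, and it assumes a numeric NumPy feature matrix.

```python
import random
import numpy as np
from deap import creator, base, tools, algorithms
from sklearn.base import clone
from sklearn.model_selection import cross_val_score

class GeneticFeatureSelector:
    """Pick a feature subset for an estimator with a simple genetic algorithm."""

    def __init__(self, estimator, n_population=50, n_generations=10, cxpb=0.5, mutpb=0.2):
        self.estimator = estimator
        self.n_population = n_population
        self.n_generations = n_generations
        self.cxpb = cxpb
        self.mutpb = mutpb

    def fit(self, X, y):
        n_features = X.shape[1]

        # Fitness of a 0/1 mask: cross-validated accuracy of the estimator
        # trained on only the selected columns
        def evaluate(individual):
            mask = np.asarray(individual, dtype=bool)
            if not mask.any():
                return (0.0,)  # an empty subset is the worst possible choice
            return (cross_val_score(clone(self.estimator), X[:, mask], y, cv=3).mean(),)

        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)
        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat, creator.Individual,
                         toolbox.attr_bool, n_features)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        toolbox.register("evaluate", evaluate)
        toolbox.register("mate", tools.cxOnePoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
        toolbox.register("select", tools.selTournament, tournsize=3)

        # Run the GA and keep the single best individual seen
        hof = tools.HallOfFame(1)
        algorithms.eaSimple(toolbox.population(n=self.n_population), toolbox,
                            cxpb=self.cxpb, mutpb=self.mutpb,
                            ngen=self.n_generations, halloffame=hof, verbose=False)
        self.support_ = np.asarray(hof[0], dtype=bool)
        return self

    def transform(self, X):
        return X[:, self.support_]
```

With an interface like that, feature selection could be dropped in much like any other transformer, e.g. `GeneticFeatureSelector(LogisticRegression()).fit(X, y).transform(X)`.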
## Other Notes
To plot a curve over the noisy data, I used Cubic-Spline Interpolation. This was my first time using the method, and I suspect there are better ways to plot such a curve. In my limited experience, Cubic-Spline Interpolation can produce curves with unnecessary "bends" in them.
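As a toy illustration (not code from this repository), the smoothing factor `s` accepted by scipy's `splrep` controls how closely the spline tracks the noisy points; raising it (the main script currently uses `s=5.0`) is one way to trade fidelity for a smoother curve with fewer spurious bends.

```python
import numpy as np
from scipy import interpolate
import matplotlib.pyplot as plt

# Noisy samples of a smooth underlying curve
x = np.linspace(0, 1, 50)
y = np.sin(2 * np.pi * x) + np.random.normal(scale=0.2, size=x.size)

plt.scatter(x, y, color='gray', label='noisy data')
for s in (0.5, 2.0, 8.0):
    tck = interpolate.splrep(x, y, s=s)  # larger s => smoother spline
    plt.plot(x, interpolate.splev(x, tck), label='s = ' + str(s))
plt.legend()
plt.show()
```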
--------------------------------------------------------------------------------
/gaFeatureSelectionExample.py:
--------------------------------------------------------------------------------

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from deap import creator, base, tools, algorithms
from scoop import futures
import random
import numpy
from scipy import interpolate
import matplotlib.pyplot as plt

# Read in data from CSV
# Data set from https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
dfData = pd.read_csv('bank-additional-full.csv', sep=';')

# Encode the classification labels to numbers
# Get classes and one hot encoded feature vectors
le = LabelEncoder()
le.fit(dfData['y'])
allClasses = le.transform(dfData['y'])
allFeatures = dfData.drop(['y'], axis=1)

# Form training, test, and validation sets
X_trainAndTest, X_validation, y_trainAndTest, y_validation = train_test_split(allFeatures, allClasses, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_trainAndTest, y_trainAndTest, test_size=0.20, random_state=42)

# Feature subset fitness function
def getFitness(individual, X_train, X_test, y_train, y_test):

    # Parse out the feature columns that this individual does not use
    # Apply one hot encoding to the remaining features
    cols = [index for index in range(len(individual)) if individual[index] == 0]
    X_trainParsed = X_train.drop(X_train.columns[cols], axis=1)
    X_trainOhFeatures = pd.get_dummies(X_trainParsed)
    X_testParsed = X_test.drop(X_test.columns[cols], axis=1)
    X_testOhFeatures = pd.get_dummies(X_testParsed)

    # Remove any columns that aren't in both the training and test sets
    sharedFeatures = set(X_trainOhFeatures.columns) & set(X_testOhFeatures.columns)
    removeFromTrain = set(X_trainOhFeatures.columns) - sharedFeatures
    removeFromTest = set(X_testOhFeatures.columns) - sharedFeatures
    X_trainOhFeatures = X_trainOhFeatures.drop(list(removeFromTrain), axis=1)
    X_testOhFeatures = X_testOhFeatures.drop(list(removeFromTest), axis=1)

    # Apply logistic regression to the data, and calculate accuracy
    clf = LogisticRegression()
    clf.fit(X_trainOhFeatures, y_train)
    predictions = clf.predict(X_testOhFeatures)
    accuracy = accuracy_score(y_test, predictions)

    # Return calculated accuracy as fitness
    return (accuracy,)

#========DEAP GLOBAL VARIABLES (viewable by SCOOP)========

# Create Individual
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Create Toolbox
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, len(dfData.columns) - 1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Continue filling toolbox...
toolbox.register("evaluate", getFitness, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

#========

def getHof():

    # Initialize variables to use eaSimple
    numPop = 100
    numGen = 10
    pop = toolbox.population(n=numPop)
    hof = tools.HallOfFame(numPop * numGen)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", numpy.mean)
    stats.register("std", numpy.std)
    stats.register("min", numpy.min)
    stats.register("max", numpy.max)

    # Launch genetic algorithm
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=numGen, stats=stats, halloffame=hof, verbose=True)

    # Return the hall of fame
    return hof

def getMetrics(hof):

    # Get list of percentiles in the hall of fame
    percentileList = [i / (len(hof) - 1) for i in range(len(hof))]

    # Gather fitness data from each percentile
    testAccuracyList = []
    validationAccuracyList = []
    individualList = []
    for individual in hof:
        testAccuracy = individual.fitness.values
        validationAccuracy = getFitness(individual, X_trainAndTest, X_validation, y_trainAndTest, y_validation)
        testAccuracyList.append(testAccuracy[0])
        validationAccuracyList.append(validationAccuracy[0])
        individualList.append(individual)

    # The hall of fame is ordered best-first, so reverse all three lists together
    # to go from worst to best while keeping individuals aligned with their accuracies
    testAccuracyList.reverse()
    validationAccuracyList.reverse()
    individualList.reverse()
    return testAccuracyList, validationAccuracyList, individualList, percentileList
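
# Note: `futures` is imported from SCOOP above but is not wired into the toolbox,
# so fitness evaluations run serially. If parallel evaluation is wanted, DEAP's
# usual pattern is to register SCOOP's map function, e.g.
#
#     toolbox.register("map", futures.map)
#
# and then launch the script with `python -m scoop gaFeatureSelectionExample.py`.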

if __name__ == '__main__':

    '''
    First, we will apply logistic regression using all the features to acquire a baseline accuracy.
    '''
    individual = [1 for i in range(len(allFeatures.columns))]
    testAccuracy = getFitness(individual, X_train, X_test, y_train, y_test)
    validationAccuracy = getFitness(individual, X_trainAndTest, X_validation, y_trainAndTest, y_validation)
    print('\nTest accuracy with all features: \t' + str(testAccuracy[0]))
    print('Validation accuracy with all features: \t' + str(validationAccuracy[0]) + '\n')

    '''
    Now, we will apply a genetic algorithm to choose a subset of features that gives better accuracy than the baseline.
    '''
    hof = getHof()
    testAccuracyList, validationAccuracyList, individualList, percentileList = getMetrics(hof)

    # Get a list of subsets that performed best on validation data
    maxValAccSubsetIndices = [index for index in range(len(validationAccuracyList)) if validationAccuracyList[index] == max(validationAccuracyList)]
    maxValIndividuals = [individualList[index] for index in maxValAccSubsetIndices]
    maxValSubsets = [[list(allFeatures)[index] for index in range(len(individual)) if individual[index] == 1] for individual in maxValIndividuals]

    print('\n---Optimal Feature Subset(s)---\n')
    for index in range(len(maxValAccSubsetIndices)):
        print('Percentile: \t\t\t' + str(percentileList[maxValAccSubsetIndices[index]]))
        print('Validation Accuracy: \t\t' + str(validationAccuracyList[maxValAccSubsetIndices[index]]))
        print('Individual: \t' + str(maxValIndividuals[index]))
        print('Number Features In Subset: \t' + str(len(maxValSubsets[index])))
        print('Feature Subset: ' + str(maxValSubsets[index]))

    '''
    Now, we plot the test and validation classification accuracy to see how these numbers change as we move
    from our worst feature subsets to the best feature subsets found by the genetic algorithm.
    '''
    # Calculate a smoothed best-fit curve for validation classification accuracy (non-linear)
    tck = interpolate.splrep(percentileList, validationAccuracyList, s=5.0)
    ynew = interpolate.splev(percentileList, tck)

    e = plt.figure(1)
    plt.plot(percentileList, validationAccuracyList, marker='o', color='r')
    plt.plot(percentileList, ynew, color='b')
    plt.title('Validation Set Classification Accuracy vs. \n Continuum with Cubic-Spline Interpolation')
    plt.xlabel('Population Ordered By Increasing Test Set Accuracy')
    plt.ylabel('Validation Set Accuracy')
    e.show()

    f = plt.figure(2)
    plt.scatter(percentileList, validationAccuracyList)
    plt.title('Validation Set Classification Accuracy vs. Continuum')
    plt.xlabel('Population Ordered By Increasing Test Set Accuracy')
    plt.ylabel('Validation Set Accuracy')
    f.show()

    g = plt.figure(3)
    plt.scatter(percentileList, testAccuracyList)
    plt.title('Test Set Classification Accuracy vs. Continuum')
    plt.xlabel('Population Ordered By Increasing Test Set Accuracy')
    plt.ylabel('Test Set Accuracy')
    g.show()

    # Keep the figures open until the user presses Enter
    input()

--------------------------------------------------------------------------------
/testSetClassificationAccuracyVsContinuum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scoliann/GeneticAlgorithmFeatureSelection/87e8fb427fb4257ad6cb206c559e166e71202ac4/testSetClassificationAccuracyVsContinuum.png
--------------------------------------------------------------------------------
/validationSetClassificationAccuracyVsContinuum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scoliann/GeneticAlgorithmFeatureSelection/87e8fb427fb4257ad6cb206c559e166e71202ac4/validationSetClassificationAccuracyVsContinuum.png
--------------------------------------------------------------------------------
/validationSetClassificationAccuracyVsContinuumWithCubicSplineInterpolation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scoliann/GeneticAlgorithmFeatureSelection/87e8fb427fb4257ad6cb206c559e166e71202ac4/validationSetClassificationAccuracyVsContinuumWithCubicSplineInterpolation.png
--------------------------------------------------------------------------------