"""Driver script: select a feature subset with a genetic algorithm.

Loads the pickled feature matrix and class labels, evolves a population of
binary feature masks with the operators in ``ga``, then reports the best mask
of the final generation and plots the best fitness seen in each generation.
"""
import pickle

import matplotlib.pyplot
import numpy

import ga

# NOTE: pickle can execute arbitrary code while loading; only run this script
# against .pkl files that come from a trusted source.
# ``with`` guarantees the handles are closed even if unpickling fails.
with open("dataset_features.pkl", "rb") as f:
    data_inputs = pickle.load(f)

with open("outputs.pkl", "rb") as f:
    data_outputs = pickle.load(f)

num_samples = data_inputs.shape[0]
num_feature_elements = data_inputs.shape[1]

# Every 4th sample starting at index 1 is used for training and every 4th
# sample starting at index 0 for testing; the two index sets are disjoint.
train_indices = numpy.arange(1, num_samples, 4)
test_indices = numpy.arange(0, num_samples, 4)
print("Number of training samples: ", train_indices.shape[0])
print("Number of test samples: ", test_indices.shape[0])

# Genetic algorithm parameters:
#     Population size
#     Mating pool size
#     Number of mutations
sol_per_pop = 8  # Population size.
num_parents_mating = 4  # Number of parents inside the mating pool.
num_mutations = 3  # Number of elements to mutate.

# Defining the population shape: one row per solution, one gene per feature.
pop_shape = (sol_per_pop, num_feature_elements)

# Creating the initial population of random 0/1 feature masks.
new_population = numpy.random.randint(low=0, high=2, size=pop_shape)
print(new_population.shape)

best_outputs = []
num_generations = 100
for generation in range(num_generations):
    print("Generation : ", generation)
    # Measuring the fitness of each chromosome in the population.
    fitness = ga.cal_pop_fitness(new_population, data_inputs, data_outputs, train_indices, test_indices)

    best_outputs.append(numpy.max(fitness))
    # The best result in the current iteration.
    print("Best result : ", best_outputs[-1])

    # Selecting the best parents in the population for mating.
    parents = ga.select_mating_pool(new_population, fitness, num_parents_mating)

    # Generating next generation using crossover.
    offspring_crossover = ga.crossover(parents, offspring_size=(pop_shape[0]-parents.shape[0], num_feature_elements))

    # Adding some variations to the offspring using mutation.
    offspring_mutation = ga.mutation(offspring_crossover, num_mutations=num_mutations)

    # Creating the new population based on the parents and offspring.
    new_population[0:parents.shape[0], :] = parents
    new_population[parents.shape[0]:, :] = offspring_mutation

# Getting the best solution after finishing all generations.
# At first, the fitness is calculated for each solution in the final generation.
fitness = ga.cal_pop_fitness(new_population, data_inputs, data_outputs, train_indices, test_indices)
# Then return the index of that solution corresponding to the best fitness
# (the first one when several solutions tie).
best_match_idx = numpy.where(fitness == numpy.max(fitness))[0]
best_match_idx = best_match_idx[0]

best_solution = new_population[best_match_idx, :]
best_solution_indices = numpy.where(best_solution == 1)[0]
best_solution_num_elements = best_solution_indices.shape[0]
best_solution_fitness = fitness[best_match_idx]

print("best_match_idx : ", best_match_idx)
print("best_solution : ", best_solution)
print("Selected indices : ", best_solution_indices)
print("Number of selected elements : ", best_solution_num_elements)
print("Best solution fitness : ", best_solution_fitness)

matplotlib.pyplot.plot(best_outputs)
matplotlib.pyplot.xlabel("Iteration")
matplotlib.pyplot.ylabel("Fitness")
matplotlib.pyplot.show()
It is available here: https://www.linkedin.com/pulse/artificial-neural-network-implementation-using-numpy-fruits360-gad. Its GitHub project is available here: https://github.com/ahmedfgad/NumPyANN. 7 | 8 | The second tutorial is titled "**Artificial Neural Networks Optimization using Genetic Algorithm**". It builds and uses the GA for optimizing the ANN parameters in order to increase the classification accuracy. It is available here: https://www.linkedin.com/pulse/artificial-neural-networks-optimization-using-genetic-ahmed-gad. Its GitHub project is also available here: https://github.com/ahmedfgad/NeuralGenetic. 9 | 10 | This project discusses how to use the genetic algorithm (GA) for reducing the feature vector extracted from the Fruits360 dataset of length 360. This tutorial starts by discussing the steps to be followed. After that, the steps are implemented in Python. 11 | 12 | This project modifies my previous project that implements the GA which is available in my GitHub account here: https://github.com/ahmedfgad/GeneticAlgorithmPython. For more information about how GA is implemented, you can read my tutorial titled "**Genetic Algorithm Implementation in Python**" which is available in my LinkedIn profile here: https://www.linkedin.com/pulse/genetic-algorithm-implementation-python-ahmed-gad. 13 | 14 | ## Project Structure 15 | The GA implementation is organized in 2 Python files which are: 16 | 1. Example_GeneticAlgorithm.py 17 | 2. ga.py 18 | 19 | The first file is the **main file** that initializes all parameters of the GA and goes through the generations in which the subsets of features are returned by evolving the solutions. There are 2 supplementary files which are: 20 | 1. dataset_features.pkl 21 | 2. outputs.pkl 22 | 23 | The first file holds the extracted features for the used samples which are to be reduced. The second file holds their class labels. 24 | You can find more details about how the project works in the tutorial that documents it. 
25 | 26 | ## More Information about GA 27 | For more information about GA and its implementation, you can read my 2018 book cited as "**Ahmed Fawzy Gad 'Practical Computer Vision Applications Using Deep Learning with CNNs'. Dec. 2018, Apress, 978-1-4842-4167-7**". The book is available at Springer at this link: https://springer.com/us/book/9781484241660. You can also read my tutorials listed below: 28 | * Introduction to Optimization with Genetic Algorithm 29 | https://www.linkedin.com/pulse/introduction-optimization-genetic-algorithm-ahmed-gad 30 | https://www.kdnuggets.com/2018/03/introduction-optimization-with-genetic-algorithm.html 31 | https://towardsdatascience.com/introduction-to-optimization-with-genetic-algorithm-2f5001d9964b 32 | * Genetic Algorithm (GA) Optimization - Step-by-Step Example 33 | https://www.slideshare.net/AhmedGadFCIT/genetic-algorithm-ga-optimization-stepbystep-example 34 | * Genetic Algorithm Implementation in Python 35 | https://www.linkedin.com/pulse/genetic-algorithm-implementation-python-ahmed-gad 36 | https://www.kdnuggets.com/2018/07/genetic-algorithm-implementation-python.html 37 | https://towardsdatascience.com/genetic-algorithm-implementation-in-python-5ab67bb124a6 38 | https://github.com/ahmedfgad/GeneticAlgorithmPython 39 | 40 | ## For Contacting the Author 41 | * E-mail: ahmed.f.gad@gmail.com 42 | * LinkedIn: https://linkedin.com/in/ahmedfgad/ 43 | * KDnuggets: https://kdnuggets.com/author/ahmed-gad 44 | * YouTube: https://youtube.com/AhmedGadFCIT 45 | * TowardsDataScience: https://towardsdatascience.com/@ahmedfgad 46 | * GitHub: https://github.com/ahmedfgad 47 | 48 | 49 | -------------------------------------------------------------------------------- /dataset_features.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedfgad/FeatureReductionGenetic/7ce3f8cf434b2017163c9634f157380f8cf2b17e/dataset_features.pkl 
"""Genetic-algorithm operators for binary feature selection.

A chromosome is a 1-D 0/1 mask with one gene per feature column; a gene equal
to 1 keeps the corresponding feature. Fitness is the test-set accuracy of an
SVM trained on the kept features only.
"""
import numpy


def reduce_features(solution, features):
    """Return the columns of ``features`` whose gene in ``solution`` equals 1.

    Parameters
    ----------
    solution : 1-D array
        Chromosome; positions holding the value 1 are kept.
    features : 2-D array
        Feature matrix, one row per sample and one column per gene.

    Returns
    -------
    2-D array with every row of ``features`` and only the selected columns.
    """
    selected_elements_indices = numpy.where(solution == 1)[0]
    return features[:, selected_elements_indices]


def classification_accuracy(labels, predictions):
    """Return the fraction of ``predictions`` that equal ``labels``.

    Both arguments are NumPy arrays of the same shape. The match count is
    divided by ``labels.shape[0]``, so an empty ``labels`` array raises
    ``ZeroDivisionError``.
    """
    num_correct = numpy.count_nonzero(labels == predictions)
    return num_correct / labels.shape[0]


def cal_pop_fitness(pop, features, labels, train_indices, test_indices):
    """Score every chromosome in ``pop`` by SVM test accuracy.

    For each chromosome the feature matrix is reduced to the selected columns,
    an ``sklearn.svm.SVC(gamma='scale')`` is fitted on the ``train_indices``
    rows and its accuracy on the ``test_indices`` rows becomes the fitness.

    Parameters
    ----------
    pop : 2-D array
        Population, one chromosome per row.
    features : 2-D array
        Full feature matrix (samples x features).
    labels : 1-D array
        Class label of every sample.
    train_indices, test_indices : 1-D integer arrays
        Row indices of the training and test samples.

    Returns
    -------
    1-D float array with one accuracy per chromosome. A chromosome that
    selects no feature at all gets fitness 0.0, because a classifier cannot
    be fitted on zero columns.
    """
    # Imported here so the pure-NumPy operators below stay importable on
    # systems where scikit-learn is not installed.
    import sklearn.svm

    accuracies = numpy.zeros(pop.shape[0])
    train_labels = labels[train_indices]
    test_labels = labels[test_indices]

    for idx, curr_solution in enumerate(pop):
        reduced_features = reduce_features(curr_solution, features)
        if reduced_features.shape[1] == 0:
            # Empty feature subset: keep the pre-filled 0.0 instead of
            # letting SVC.fit fail on a matrix without columns.
            continue

        train_data = reduced_features[train_indices, :]
        test_data = reduced_features[test_indices, :]

        SV_classifier = sklearn.svm.SVC(gamma='scale')
        SV_classifier.fit(X=train_data, y=train_labels)

        predictions = SV_classifier.predict(test_data)
        accuracies[idx] = classification_accuracy(test_labels, predictions)
    return accuracies


def select_mating_pool(pop, fitness, num_parents):
    """Return the ``num_parents`` fittest chromosomes, best first.

    Ties are resolved in favour of the chromosome with the lower row index.
    ``fitness`` is only read; it is not modified.

    Parameters
    ----------
    pop : 2-D array
        Population, one chromosome per row.
    fitness : 1-D array
        Fitness of each chromosome, aligned with the rows of ``pop``.
    num_parents : int
        Number of chromosomes to select.

    Returns
    -------
    Float array of shape ``(num_parents, pop.shape[1])``.

    Raises
    ------
    ValueError
        If ``num_parents`` is larger than the population size.
    """
    pop = numpy.asarray(pop)
    if num_parents > pop.shape[0]:
        raise ValueError(
            "num_parents ({}) exceeds the population size ({})".format(
                num_parents, pop.shape[0]))

    # A stable sort on the negated scores ranks from best to worst while
    # keeping equally fit chromosomes in their original order.
    scores = numpy.asarray(fitness, dtype=float)
    ranking = numpy.argsort(-scores, kind="stable")
    # Parents are returned as floats, matching the arrays built by crossover.
    return pop[ranking[:num_parents], :].astype(float)


def crossover(parents, offspring_size):
    """Create offspring by single-point crossover at the chromosome midpoint.

    Offspring ``k`` takes its first half from parent ``k % n`` and its second
    half from parent ``(k + 1) % n``, where ``n`` is the number of parents.

    Parameters
    ----------
    parents : 2-D array
        Mating pool, one parent per row.
    offspring_size : tuple (num_offspring, num_genes)
        Shape of the offspring array to build.

    Returns
    -------
    Float array of shape ``offspring_size``.
    """
    num_offspring, num_genes = offspring_size
    offspring = numpy.empty(offspring_size)
    # Plain integer division: a fixed-width uint8 would wrap around for
    # chromosomes with more than 511 genes.
    crossover_point = num_genes // 2

    # Index of the first and second parent of every offspring.
    parent1_idx = numpy.arange(num_offspring) % parents.shape[0]
    parent2_idx = (parent1_idx + 1) % parents.shape[0]

    # First half of the genes comes from the first parent ...
    offspring[:, :crossover_point] = parents[parent1_idx, :crossover_point]
    # ... and the second half from the second parent.
    offspring[:, crossover_point:] = parents[parent2_idx, crossover_point:]
    return offspring


def mutation(offspring_crossover, num_mutations=2):
    """Flip randomly chosen genes of the offspring in place.

    ``num_mutations`` gene positions are drawn once (with replacement, so
    fewer distinct positions may result) and the SAME positions are flipped
    (``gene = 1 - gene``) in every offspring.

    Parameters
    ----------
    offspring_crossover : 2-D array
        Offspring to mutate; modified in place.
    num_mutations : int
        Number of gene positions to draw.

    Returns
    -------
    The same ``offspring_crossover`` array, after mutation.
    """
    mutation_idx = numpy.random.randint(low=0, high=offspring_crossover.shape[1], size=num_mutations)
    # Flipping whole columns applies the change to every offspring at once.
    offspring_crossover[:, mutation_idx] = 1 - offspring_crossover[:, mutation_idx]
    return offspring_crossover