├── LICENSE ├── README.md ├── fastgp ├── __init__.py ├── algorithms │ ├── __init__.py │ ├── afpo.py │ ├── evolutionary_feature_synthesis.py │ ├── fast_evaluate.py │ └── truncation_with_elite.py ├── logging │ ├── __init__.py │ ├── archive.py │ └── reports.py ├── parametrized │ ├── __init__.py │ ├── mutation.py │ └── simple_parametrized_terminals.py └── utilities │ ├── __init__.py │ ├── benchmark_problems.py │ ├── metrics.py │ ├── operators.py │ ├── subset_selection.py │ └── symbreg.py ├── requirements.txt ├── setup.py └── tests └── fastgp └── parametrized └── test_simple_parametrzied_terminal.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [year] [fullname] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast Genetic Programming 2 | fastgp is a numpy implementation of [genetic programming](https://en.wikipedia.org/wiki/Genetic_programming) built on top of [deap](https://github.com/DEAP/deap). It is the core library for [fastsr](https://github.com/cfusting/fast-symbolic-regression), a symbolic regression package for Python. 3 | It's primary contribution is an implementation of AFPO\[1\] which is compatible with any deap toolbox. 4 | 5 | fastgp was designed and developed by the [Morphology, Evolution & Cognition Laboratory](http://www.meclab.org/) at the University of Vermont. It extends research code which can be found [here](https://github.com/mszubert/gecco_2016). 6 | 7 | Installing 8 | ---------- 9 | fastgp is compatible with Python 2.7+. 10 | ```bash 11 | pip install fastgp 12 | ``` 13 | 14 | Example Usage 15 | ------------- 16 | fastgp is a core library and as such there are no examples in this repository. 17 | Check out [fastsr](https://github.com/cfusting/fast-symbolic-regression) for an example of fastgp's use in Symbolic Regression. 18 | 19 | Literature Cited 20 | ---------------- 21 | 1. Michael Schmidt and Hod Lipson. 2011. Age-fitness pareto optimization. In Genetic Programming Theory and Practice VIII. Springer, 129–146. 22 | -------------------------------------------------------------------------------- /fastgp/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mszubert' 2 | -------------------------------------------------------------------------------- /fastgp/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mszubert' 2 | -------------------------------------------------------------------------------- /fastgp/algorithms/afpo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import time 4 | 5 | from deap import tools 6 | 7 | from fastgp.utilities import symbreg 8 | 9 | 10 | def breed(parents, toolbox, xover_prob, mut_prob): 11 | offspring = [toolbox.clone(ind) for ind in parents] 12 | 13 | for i in range(1, len(offspring), 2): 14 | if random.random() < xover_prob: 15 | offspring[i - 1], offspring[i] = toolbox.mate(offspring[i - 1], offspring[i]) 16 | max_age = max(offspring[i - 1].age, offspring[i].age) 17 | offspring[i].age = offspring[i - 1].age = max_age 18 | del offspring[i - 1].fitness.values, offspring[i].fitness.values 19 | 20 | for i in range(len(offspring)): 21 | if random.random() < mut_prob: 22 | offspring[i], = toolbox.mutate(offspring[i]) 23 | del offspring[i].fitness.values 24 | 25 | return offspring 26 | 27 | 28 | def find_pareto_front(population): 29 | """Finds a subset of nondominated individuals in a given list 30 | 31 | :param population: a list of individuals 32 | :return: a set of indices corresponding to nondominated individuals 33 | """ 34 | 35 | pareto_front = set(range(len(population))) 36 | 37 | for i in range(len(population)): 38 | if i not in pareto_front: 39 | continue 40 | 41 | ind1 = population[i] 42 | for j in range(i + 1, len(population)): 43 | ind2 = population[j] 44 | 45 | # if individuals are equal on all objectives, mark one of them (the first encountered one) as dominated 46 | # to prevent excessive growth of the Pareto front 47 | if ind2.fitness.dominates(ind1.fitness) or ind1.fitness == ind2.fitness: 48 | pareto_front.discard(i) 49 | 50 | if ind1.fitness.dominates(ind2.fitness): 51 | pareto_front.discard(j) 52 | 53 | return pareto_front 54 | 55 | 56 | def reduce_population(population, tournament_size, target_popsize, nondominated_size): 57 | num_iterations = 0 58 | new_population_indices = list(range(len(population))) 59 | stop_cond = False 60 | while len(new_population_indices) > target_popsize and len(new_population_indices) > nondominated_size: 61 | if num_iterations > 10e6: 62 | print("Pareto front size may be exceeding the size of population. Stopping the execution. Try making" 63 | "the population size larger or the number of generations smaller.") 64 | # random.sample(new_population_indices, len(new_population_indices) - target_popsize) 65 | stop_cond = True 66 | num_iterations += 1 67 | tournament_indices = random.sample(new_population_indices, tournament_size) 68 | tournament = [population[index] for index in tournament_indices] 69 | nondominated_tournament = find_pareto_front(tournament) 70 | for i in range(len(tournament)): 71 | if i not in nondominated_tournament: 72 | new_population_indices.remove(tournament_indices[i]) 73 | population[:] = [population[i] for i in new_population_indices] 74 | return stop_cond 75 | 76 | 77 | def pareto_optimization(population, toolbox, xover_prob, mut_prob, ngen, 78 | tournament_size, num_randoms=1, archive=None, 79 | stats=None, calc_pareto_front=True, verbose=False, 80 | reevaluate_population=False, history=None, 81 | stop_time=None): 82 | start = time.time() 83 | if history is not None: 84 | history.update(population) 85 | logbook = tools.Logbook() 86 | logbook.header = ['gen', 'nevals', 'cpu_time'] + (stats.fields if stats else []) 87 | 88 | target_popsize = len(population) 89 | 90 | # calculating errors may be expensive, so we will cache the error value as an individual's attribute 91 | for ind in population: 92 | ind.error = toolbox.evaluate_error(ind)[0] 93 | toolbox.assign_fitness(population) 94 | for ind in population: 95 | history.genealogy_history[ind.history_index].error = ind.error 96 | 97 | record = stats.compile(population) if stats else {} 98 | cpu_time = time.time() - start 99 | logbook.record(gen=0, nevals=len(population), cpu_time=cpu_time, **record) 100 | if archive is not None: 101 | archive.update(population) 102 | if verbose: 103 | print(logbook.stream) 104 | 105 | gen = 0 106 | while(gen < (ngen + 1)): 107 | # do we want to enforce re-evaluating the whole population instead of using cached erro r values 108 | if reevaluate_population: 109 | for ind in population: 110 | ind.error = toolbox.evaluate_error(ind)[0] 111 | parents = toolbox.select(population, len(population) - num_randoms) 112 | offspring = breed(parents, toolbox, xover_prob, mut_prob) 113 | offspring += toolbox.generate_randoms() 114 | 115 | # evaluate newly generated individuals which do not have cached values (or have inherited them from parents) 116 | for ind in offspring: 117 | ind.error = toolbox.evaluate_error(ind)[0] 118 | 119 | # extend the population by adding offspring - the size of population is now 2*target_popsize 120 | population.extend(offspring) 121 | toolbox.assign_fitness(population) 122 | 123 | for ind in population: 124 | history.genealogy_history[ind.history_index].error = ind.error 125 | 126 | # we may take 2 strategies of evaluating pareto-front: 127 | # - pessimistic: Pareto front may be larger than target_popsize and we want to detect it early because 128 | # if that's the case we won't be able to reduce the size of population to target_popsize 129 | # - optimistic: in practice, the above case happen extremely rarely but calculating global front is expensive 130 | # so let's assume that Pareto front is small enough try to reduce the population 131 | if calc_pareto_front: 132 | pareto_front_size = len(find_pareto_front(population)) 133 | logging.debug("Generation: %5d - Pareto Front Size: %5d", gen, pareto_front_size) 134 | if pareto_front_size > target_popsize: 135 | logging.info("Pareto front size exceeds the size of population. Try Making the population size larger" 136 | "or reducing the number of generations.") 137 | break 138 | else: 139 | pareto_front_size = 0 140 | 141 | # perform Pareto tournament selection until the size of the population is reduced to target_popsize 142 | stop_cond = reduce_population(population, tournament_size, target_popsize, pareto_front_size) 143 | 144 | record = stats.compile(population) if stats else {} 145 | cpu_time = time.time() - start 146 | 147 | print(gen, cpu_time, stop_time) 148 | 149 | logbook.record(gen=gen, nevals=len(population), cpu_time=cpu_time, **record) 150 | if archive is not None: 151 | archive.update(population) 152 | if verbose: 153 | print(logbook.stream) 154 | 155 | for ind in population: 156 | ind.age += 1 157 | 158 | if stop_cond: 159 | print('Stop condition reached at generation %i.' % gen) 160 | gen = ngen + 1 161 | elif stop_time is not None and cpu_time > stop_time: 162 | print('Stop time reached at generation %i.' % gen) 163 | gen = ngen + 1 164 | else: 165 | gen = gen + 1 166 | 167 | return population, logbook, history 168 | 169 | 170 | def evaluate_age_fitness(ind, error_func): 171 | ind.error = error_func(ind)[0] 172 | return ind.error, ind.age 173 | 174 | 175 | def evaluate_age_fitness_size(ind, error_func): 176 | ind.size = len(ind) 177 | return evaluate_age_fitness(ind, error_func) + (ind.size,) 178 | 179 | 180 | def evaluate_fitness_size(ind, error_func): 181 | ind.error = error_func(ind)[0] 182 | ind.size = len(ind) 183 | return ind.error, ind.size 184 | 185 | 186 | def evaluate_fitness_size_complexity(ind, error_func): 187 | ind.error = error_func(ind)[0] 188 | ind.size = len(ind) 189 | ind.complexity = symbreg.calculate_order(ind) 190 | return ind.error, ind.size, ind.complexity 191 | 192 | 193 | def assign_random_fitness(population, random_range): 194 | for ind in population: 195 | ind.fitness.values = (ind.error, random.randrange(random_range)) 196 | 197 | 198 | def assign_pure_fitness(population): 199 | for ind in population: 200 | ind.fitness.values = (ind.error,) 201 | 202 | 203 | def assign_age_fitness(population): 204 | for ind in population: 205 | ind.fitness.values = (ind.error, ind.age) 206 | 207 | 208 | def assign_age_fitness_size(population): 209 | for ind in population: 210 | ind.fitness.values = (ind.error, ind.age, len(ind)) 211 | 212 | 213 | def assign_age_fitness_complexity(population): 214 | for ind in population: 215 | ind.fitness.values = (ind.error, ind.age, symbreg.calculate_order(ind)) 216 | 217 | 218 | def assign_age_fitness_size_complexity(population): 219 | for ind in population: 220 | ind.fitness.values = (ind.error, ind.age, len(ind), symbreg.calculate_order(ind)) 221 | 222 | 223 | def assign_size_fitness(population): 224 | for ind in population: 225 | ind.fitness.values = (ind.error, len(ind)) 226 | 227 | -------------------------------------------------------------------------------- /fastgp/algorithms/evolutionary_feature_synthesis.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import random 3 | from copy import deepcopy 4 | import math 5 | 6 | import numpy as np 7 | 8 | from sklearn.linear_model import ElasticNetCV 9 | from sklearn.model_selection import TimeSeriesSplit, KFold 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | from scipy.stats import pearsonr 13 | from scipy.stats import skew 14 | 15 | from fastgp.utilities.metrics import mean_squared_error 16 | from fastgp.utilities.symbreg import numpy_protected_div_dividend, numpy_protected_sqrt, numpy_protected_log_one 17 | 18 | 19 | class Statistics: 20 | 21 | def __init__(self): 22 | self.scores = [] 23 | self.generations = [] 24 | self.num_features = [] 25 | self.index = 0 26 | 27 | def add(self, gen, score, num_features): 28 | self.generations.append(gen) 29 | self.scores.append(score) 30 | self.num_features.append(num_features) 31 | 32 | def __iter__(self): 33 | return self 34 | 35 | def next(self): 36 | self.index += 1 37 | if self.index > len(self.num_features): 38 | raise StopIteration 39 | return self.generations[self.index], self.scores[self.index], self.num_features[self.index] 40 | 41 | 42 | class Feature: 43 | 44 | def __init__(self, value, string, infix_string, size=0, fitness=1, original_variable=False): 45 | self.value = value 46 | self.fitness = fitness 47 | self.string = string 48 | self.infix_string = infix_string 49 | self.size = size 50 | self.original_variable = original_variable 51 | 52 | def __str__(self): 53 | return self.string 54 | 55 | 56 | class Operator: 57 | 58 | def __init__(self, operation, parity, string, infix, infix_name): 59 | self.operation = operation 60 | self.parity = parity 61 | self.string = string 62 | self.infix = infix 63 | self.infix_name = infix_name 64 | 65 | 66 | def square(x): 67 | return np.power(x, 2) 68 | 69 | 70 | def cube(x): 71 | return np.power(x, 3) 72 | 73 | 74 | def is_huge(x): 75 | return x > np.finfo(np.float64).max / 100000 76 | 77 | 78 | def numpy_safe_exp(x): 79 | with np.errstate(invalid='ignore'): 80 | result = np.exp(x) 81 | if isinstance(result, np.ndarray): 82 | result[np.isnan(x)] = 1 83 | result[np.isinf(x)] = 1 84 | result[is_huge(x)] = 1 85 | elif np.isinf(result): 86 | result = 1 87 | elif np.isnan(x): 88 | result = 1 89 | elif is_huge(x): 90 | result = 1 91 | return result 92 | 93 | 94 | def generate_operator_map(ops): 95 | opmap = {} 96 | for o in ops: 97 | opmap[o.infix_name] = o 98 | return opmap 99 | 100 | 101 | operators = [ 102 | Operator(np.add, 2, '({0} + {1})', 'add({0},{1})', 'add'), 103 | Operator(np.subtract, 2, '({0} - {1})', 'sub({0},{1})', 'sub'), 104 | Operator(np.multiply, 2, '({0} * {1})', 'mul({0},{1})', 'mul'), 105 | Operator(numpy_protected_div_dividend, 2, '({0} / {1})', 'div({0},{1})', 'div'), 106 | # Operator(numpy_safe_exp, 1, 'exp({0})'), 107 | Operator(numpy_protected_log_one, 1, 'log({0})', 'log({0})', 'log'), 108 | Operator(square, 1, 'sqr({0})', 'sqr({0})', 'sqr'), 109 | Operator(numpy_protected_sqrt, 1, 'sqt({0})', 'sqt({0})', 'sqt'), 110 | Operator(cube, 1, 'cbe({0})', 'cbe({0})', 'cbe'), 111 | Operator(np.cbrt, 1, 'cbt({0})', 'cbt({0})', 'cbt'), 112 | Operator(None, None, None, None, 'mutate'), 113 | Operator(None, None, None, None, 'transition') 114 | 115 | ] 116 | operators_map = generate_operator_map(operators) 117 | 118 | 119 | def init(num_additions, feature_names, predictors, seed): 120 | random.seed(seed) 121 | np.random.seed(seed) 122 | if num_additions is None: 123 | num_additions = math.ceil(predictors.shape[1] / 3) 124 | if feature_names is None: 125 | feature_names = ['x' + str(x) for x in range(len(predictors))] 126 | return num_additions, feature_names 127 | 128 | 129 | def init_features(feature_names, predictors, preserve_originals, range_operations, variable_type_indices): 130 | features = [] 131 | for i, name in enumerate(feature_names): 132 | features.append(Feature(predictors[:, i], name, name, original_variable=preserve_originals)) 133 | for _ in range(range_operations): 134 | features.append(RangeOperation(variable_type_indices, feature_names, predictors)) 135 | return features 136 | 137 | 138 | def get_basis(features): 139 | basis = np.zeros((features[0].value.shape[0], len(features))) 140 | for i, f in enumerate(features): 141 | basis[:, i] = features[i].value 142 | basis = np.nan_to_num(basis) 143 | scaler = StandardScaler() 144 | basis = scaler.fit_transform(basis) 145 | return basis, scaler 146 | 147 | 148 | def get_model(basis, response, time_series_cv, splits): 149 | if time_series_cv: 150 | cv = TimeSeriesSplit(n_splits=splits) 151 | else: 152 | cv = KFold(n_splits=splits) 153 | model = ElasticNetCV(l1_ratio=1, selection='random', cv=cv) 154 | with warnings.catch_warnings(): 155 | warnings.simplefilter('ignore') 156 | model.fit(basis, response) 157 | _, coefs, _ = model.path(basis, response, l1_ration=model.l1_ratio_, alphas=model.alphas_) 158 | return model, coefs, model.mse_path_ 159 | 160 | 161 | def get_selected_features(num_additions, features, tournament_probability): 162 | selected_features = [] 163 | for _ in range(num_additions): 164 | feature = tournament_selection(features, tournament_probability) 165 | selected_features.append(feature) 166 | return selected_features 167 | 168 | 169 | def get_coefficient_fitness(coefs, mse_path, threshold, response_variance): 170 | mse = np.mean(mse_path, axis=1) 171 | r_squared = 1 - (mse / response_variance) 172 | binary_coefs = coefs > threshold 173 | return binary_coefs.dot(r_squared) 174 | 175 | 176 | def rank_by_coefficient(features, coefs, mse_path, num_additions, threshold, response_variance, 177 | verbose): 178 | fitness = get_coefficient_fitness(coefs, mse_path, threshold, response_variance) 179 | for i, f in enumerate(features): 180 | f.fitness = fitness[i] 181 | new_features = list(filter(lambda x: x.original_variable is True, features)) 182 | possible_features = list(filter(lambda x: x.original_variable is False, features)) 183 | possible_features.sort(key=lambda x: x.fitness, reverse=True) 184 | new_features.extend(possible_features[0:num_additions + 1]) 185 | new_features.sort(key=lambda x: x.fitness, reverse=True) 186 | print('Top performing features:') 187 | for i in range(10): 188 | print(new_features[i].string + ' - ' + str(new_features[i].fitness)) 189 | return new_features 190 | 191 | 192 | def remove_zeroed_features(model, features, threshold, verbose): 193 | remove_features = [] 194 | for i, coef in enumerate(model.coef_): 195 | features[i].fitness = math.fabs(coef) 196 | if features[i].fitness <= threshold and not features[i].original_variable: 197 | remove_features.append(features[i]) 198 | for f in remove_features: 199 | features.remove(f) 200 | print('Removed ' + str(len(remove_features)) + ' features from population.') 201 | if verbose and remove_features: 202 | print(get_model_string(remove_features)) 203 | return features 204 | 205 | 206 | def update_fitness(features, response, threshold, fitness_algorithm, response_variance, num_additions, 207 | time_series_cv, splits, verbose): 208 | basis, _ = get_basis(features) 209 | model, coefs, mse_path = get_model(basis, response, time_series_cv, splits) 210 | if fitness_algorithm == 'zero_out': 211 | features = remove_zeroed_features(model, features, threshold, verbose) 212 | elif fitness_algorithm == 'coefficient_rank': 213 | features = rank_by_coefficient(features, coefs, mse_path, num_additions, threshold, response_variance, 214 | verbose) 215 | return features 216 | 217 | 218 | def uncorrelated(parents, new_feature, correlation_threshold): 219 | uncorr = True 220 | if type(parents) == list: 221 | for p in parents: 222 | r, _ = pearsonr(new_feature.value, p.value) 223 | if r > correlation_threshold: 224 | uncorr = False 225 | else: 226 | r, _ = pearsonr(new_feature.value, parents.value) 227 | if r > correlation_threshold: 228 | uncorr = False 229 | return uncorr 230 | 231 | 232 | def tournament_selection(population, probability): 233 | individuals = random.choices(population, k=2) 234 | individuals.sort(reverse=True, key=lambda x: x.fitness) 235 | if random.random() < probability: 236 | return individuals[0] 237 | else: 238 | return individuals[1] 239 | 240 | 241 | def compose_features(num_additions, features, tournament_probability, correlation_threshold, 242 | range_operators, verbose): 243 | new_feature_list = [] 244 | for _ in range(num_additions): 245 | operator = random.choice(operators) 246 | if operator.parity == 1: 247 | parent = tournament_selection(features, tournament_probability) 248 | new_feature_string = operator.string.format(parent.string) 249 | new_infix_string = operator.infix.format(parent.infix_string) 250 | new_feature_value = operator.operation(parent.value) 251 | new_feature = Feature(new_feature_value, new_feature_string, new_infix_string, 252 | size=parent.size + 1) 253 | if uncorrelated(parent, new_feature, correlation_threshold): 254 | new_feature_list.append(new_feature) 255 | elif operator.parity == 2: 256 | parent1 = tournament_selection(features, tournament_probability) 257 | parent2 = tournament_selection(features, tournament_probability) 258 | new_feature_string = operator.string.format(parent1.string, parent2.string) 259 | new_infix_string = operator.infix.format(parent1.infix_string, parent2.infix_string) 260 | new_feature_value = operator.operation(parent1.value, parent2.value) 261 | new_feature = Feature(new_feature_value, new_feature_string, new_infix_string, 262 | size=parent1.size + parent2.size + 1) 263 | if uncorrelated([parent1, parent2], new_feature, correlation_threshold): 264 | new_feature_list.append(new_feature) 265 | if range_operators: 266 | protected_range_operators = list(filter(lambda x: type(x) == RangeOperation and x.original_variable, 267 | features)) 268 | transitional_range_operators = list(filter(lambda x: type(x) == RangeOperation and not x.original_variable, 269 | features)) 270 | if operator.infix_name == 'transition' and protected_range_operators: 271 | parent = random.choice(protected_range_operators) 272 | new_feature = deepcopy(parent) 273 | new_feature.original_variable = False 274 | new_feature_list.append(new_feature) 275 | elif operator.infix_name == 'mutate' and transitional_range_operators: 276 | parent = random.choice(transitional_range_operators) 277 | new_feature = deepcopy(parent) 278 | new_feature.mutate_parameters() 279 | new_feature_list.append(new_feature) 280 | filtered_feature_list = list(filter(lambda x: x.size < 5, new_feature_list)) 281 | features.extend(filtered_feature_list) 282 | print('Adding ' + str(len(filtered_feature_list)) + ' features to population.') 283 | if verbose: 284 | print(get_model_string(new_feature_list)) 285 | return features 286 | 287 | 288 | def score_model(features, response, time_series_cv, splits): 289 | print('Scoring model with ' + str(len(features)) + ' features.') 290 | basis, scaler = get_basis(features) 291 | model, _, _ = get_model(basis, response, time_series_cv, splits) 292 | score = mean_squared_error(model.predict(basis), response)[0] 293 | return score, model, scaler 294 | 295 | 296 | def get_model_string(features): 297 | feature_strings = [] 298 | for f in features: 299 | feature_strings.append(f.string) 300 | return '[' + '] + ['.join(feature_strings) + ']' 301 | 302 | 303 | def compute_operation(num_variables, predictors, stack, feature_names): 304 | variables = [] 305 | for _ in range(num_variables): 306 | variable_name = stack.pop() 307 | variable_index = feature_names.index(variable_name) 308 | variables.append(predictors[:, variable_index]) 309 | operator = stack.pop() 310 | result = operator.operation(*variables) 311 | return result 312 | 313 | 314 | def build_operation_stack(string): 315 | stack = [] 316 | start = 0 317 | for i, s in enumerate(string): 318 | if s == '(': 319 | substring = string[start:i] 320 | start = i + 1 321 | stack.append(substring) 322 | elif s == ',': 323 | if i != start: 324 | substring = string[start:i] 325 | stack.append(substring) 326 | start = i + 1 327 | elif s == ')': 328 | if i != start: 329 | substring = string[start:i] 330 | stack.append(substring) 331 | start = i + 1 332 | return stack 333 | 334 | 335 | def get_feature_value(stack, feature_names, predictors, variable_type_indices): 336 | variables_stack = [] 337 | while len(stack) > 0: 338 | current = stack.pop() 339 | if variable_type_indices and current.startswith('RangeOperation'): 340 | range_operation = RangeOperation(variable_type_indices, feature_names, predictors, string=current) 341 | variables_stack.append(np.squeeze(range_operation.value)) 342 | elif current in feature_names: 343 | variable_index = feature_names.index(current) 344 | variables_stack.append(predictors[:, variable_index]) 345 | elif current in operators_map: 346 | operator = operators_map[current] 347 | variables = [] 348 | for _ in range(operator.parity): 349 | variables.append(variables_stack.pop()) 350 | result = operator.operation(*variables) 351 | variables_stack.append(result) 352 | return variables_stack.pop() 353 | 354 | 355 | def build_basis_from_features(infix_features, feature_names, predictors, variable_type_indices): 356 | basis = np.zeros((predictors.shape[0], len(infix_features))) 357 | for j, f in enumerate(infix_features): 358 | if variable_type_indices and f.startswith('RangeOperation'): 359 | range_operation = RangeOperation(variable_type_indices, feature_names, predictors, string=f) 360 | basis[:, j] = np.squeeze(range_operation.value) 361 | elif f in feature_names: 362 | variable_index = feature_names.index(f) 363 | basis[:, j] = predictors[:, variable_index] 364 | else: 365 | operation_stack = build_operation_stack(f) 366 | basis[:, j] = get_feature_value(operation_stack, feature_names, predictors, variable_type_indices) 367 | return basis 368 | 369 | 370 | def get_basis_from_infix_features(infix_features, feature_names, predictors, scaler=None, 371 | variable_type_indices=None): 372 | basis = build_basis_from_features(infix_features, feature_names, predictors, variable_type_indices) 373 | basis = np.nan_to_num(basis) 374 | if scaler: 375 | basis = scaler.transform(basis) 376 | return basis 377 | 378 | 379 | def optimize(predictors, response, seed, fitness_algorithm, max_gens=100, num_additions=None, preserve_originals=True, 380 | tournament_probability=.9, max_useless_steps=10, fitness_threshold=.01, correlation_threshold=0.95, 381 | reinit_range_operators=3, splits=3, time_series_cv=False, feature_names=None, range_operators=0, 382 | variable_type_indices=None, verbose=False): 383 | assert predictors.shape[1] == len(feature_names) 384 | num_additions, feature_names = init(num_additions, feature_names, predictors, seed) 385 | features = init_features(feature_names, predictors, preserve_originals, range_operators, 386 | variable_type_indices) 387 | best_models = [] 388 | best_features = [] 389 | best_scalers = [] 390 | best_validation_scores = [] 391 | statistics = Statistics() 392 | best_score = np.Inf 393 | steps_without_new_model = 0 394 | response_variance = np.var(response) 395 | gen = 1 396 | while gen <= max_gens and steps_without_new_model <= max_useless_steps: 397 | print('Generation: ' + str(gen)) 398 | score, model, scaler = score_model(features, response, time_series_cv, splits) 399 | statistics.add(gen, score, len(features)) 400 | if verbose: 401 | print(get_model_string(features)) 402 | print('Score: ' + str(score)) 403 | if score < best_score: 404 | best_validation_scores.append(score) 405 | steps_without_new_model = 0 406 | best_score = score 407 | print('New best model score: ' + str(best_score)) 408 | best_models.append(model) 409 | temp_features = deepcopy(features) 410 | for f in temp_features: 411 | f.value = None 412 | best_features.append(temp_features) 413 | best_scalers.append(scaler) 414 | else: 415 | steps_without_new_model += 1 416 | print('-------------------------------------------------------') 417 | if gen < max_gens and steps_without_new_model <= max_useless_steps: 418 | features = compose_features(num_additions, features, tournament_probability, correlation_threshold, 419 | range_operators, verbose) 420 | features = update_fitness(features, response, fitness_threshold, fitness_algorithm, 421 | response_variance, num_additions, time_series_cv, splits, 422 | verbose) 423 | if gen % reinit_range_operators == 0: 424 | features = swap_range_operators(features, range_operators, variable_type_indices, feature_names, 425 | predictors) 426 | gen += 1 427 | return statistics, best_models, best_features, best_scalers, best_validation_scores 428 | 429 | 430 | def swap_range_operators(features, range_operations, variable_type_indices, feature_names, predictors): 431 | for f in features: 432 | if type(f) == RangeOperation and f.original_variable: 433 | features.remove(f) 434 | for _ in range(range_operations): 435 | features.append(RangeOperation(variable_type_indices, feature_names, predictors)) 436 | return features 437 | 438 | 439 | def name_operation(operation, name): 440 | operation.__name__ = name 441 | return operation 442 | 443 | 444 | class RangeOperation(Feature): 445 | 446 | def __init__(self, variable_type_indices, names, predictors, operation=None, begin_range_name=None, 447 | end_range_name=None, original_variable=True, string=None): 448 | Feature.__init__(self, None, 'RangeOperation', 'RangeOperation', original_variable=original_variable) 449 | self.predictors = predictors 450 | self.begin_range = None 451 | self.end_range = None 452 | self.operation = None 453 | self.names = None 454 | self.lower_bound = None 455 | self.upper_bound = None 456 | self.variable_type_indices = variable_type_indices 457 | self.operations = { 458 | 'sum': name_operation(np.sum, 'sum'), 459 | 'min': name_operation(np.min, 'min'), 460 | 'max': name_operation(np.max, 'max'), 461 | 'mean': name_operation(np.mean, 'mean'), 462 | 'vari': name_operation(np.var, 'vari'), 463 | 'skew': name_operation(skew, 'skew') 464 | } 465 | if string: 466 | parts = string.split('_') 467 | self.initialize_parameters(variable_type_indices, names, parts[1], parts[2], parts[3]) 468 | else: 469 | self.initialize_parameters(variable_type_indices, names, operation, begin_range_name, end_range_name) 470 | self.value = self.create_input_vector() 471 | self.string = self.format() 472 | self.infix_string = self.format() 473 | 474 | def __deepcopy__(self, memo): 475 | new = self.__class__(self.variable_type_indices, self.names, self.predictors) 476 | new.__dict__.update(deepcopy(self.__dict__, memo)) 477 | new.predictors = self.predictors 478 | new.value = self.value 479 | return new 480 | 481 | def initialize_parameters(self, variable_type_indices, names, operation=None, begin_range_name=None, 482 | end_range_name=None): 483 | """ 484 | :param variable_type_indices: A sequence of variable type indices where each entry defines the 485 | index of a variable type in the design matrix. For example a design matrix with two variable types will have 486 | indices [j,n] where variable type A spans 0 to j and variable type B spans j + 1 to n. 487 | :param names: 488 | :param operation 489 | :param begin_range_name 490 | :param end_range_name 491 | :return: 492 | """ 493 | self.names = names 494 | for r in variable_type_indices: 495 | if r[1] - r[0] < 2: 496 | raise ValueError('Invalid variable type indices: ' + str(r)) 497 | rng = random.choice(variable_type_indices) 498 | self.lower_bound = rng[0] 499 | self.upper_bound = rng[1] 500 | if operation is not None and begin_range_name is not None and end_range_name is not None: 501 | if self.operations.get(operation) is None: 502 | raise ValueError('Invalid operation provided to Range Terminal: ' + operation) 503 | if begin_range_name not in self.names: 504 | raise ValueError('Invalid range name provided to Range Termnial: ' + str(begin_range_name)) 505 | if end_range_name not in self.names: 506 | raise ValueError('Invalid range name provided to Range Terminal: ' + str(end_range_name)) 507 | begin_range = self.names.index(begin_range_name) 508 | end_range = self.names.index(end_range_name) + 1 509 | valid = False 510 | for r in variable_type_indices: 511 | if r[0] <= begin_range < end_range <= r[1]: 512 | valid = True 513 | if not valid: 514 | raise ValueError('Invalid range provided to Range Terminal: (' + str(begin_range) + ',' + 515 | str(end_range) + ')') 516 | self.operation = self.operations[operation] 517 | self.begin_range = begin_range 518 | self.end_range = end_range 519 | else: 520 | self.operation = random.choice(list(self.operations.values())) 521 | self.begin_range = np.random.randint(self.lower_bound, self.upper_bound - 1) 522 | self.end_range = np.random.randint(self.begin_range + 1, self.upper_bound) 523 | 524 | def mutate_parameters(self): 525 | old = self.format() 526 | mutation = random.choice(['low', 'high']) 527 | span = self.end_range - self.begin_range 528 | if span == 0: 529 | span = 1 530 | value = random.gauss(0, math.sqrt(span)) 531 | amount = int(math.ceil(abs(value))) 532 | if value < 0: 533 | amount *= -1 534 | if mutation == 'low': 535 | location = amount + self.begin_range 536 | if location < self.lower_bound: 537 | self.begin_range = self.lower_bound 538 | elif location > self.end_range - 2: 539 | self.begin_range = self.end_range - 2 540 | elif location > self.upper_bound - 2: 541 | self.begin_range = self.upper_bound - 2 542 | else: 543 | self.begin_range = location 544 | elif mutation == 'high': 545 | location = amount + self.end_range 546 | if location > self.upper_bound: 547 | self.end_range = self.upper_bound 548 | elif location < self.begin_range + 2: 549 | self.end_range = self.begin_range + 2 550 | elif location < self.lower_bound + 2: 551 | self.end_range = self.lower_bound + 2 552 | else: 553 | self.end_range = location 554 | self.value = self.create_input_vector() 555 | self.infix_string = self.format() 556 | self.string = self.format() 557 | # print('Mutated ' + old + ' to ' + self.format()) 558 | 559 | def create_input_vector(self): 560 | array = self.predictors[:, self.begin_range:self.end_range] 561 | if array.shape[1] == 0: 562 | return np.zeros((array.shape[0], 1)) 563 | else: 564 | return self.operation(array, axis=1) 565 | 566 | def format(self): 567 | return "RangeOperation_{}_{}_{}".format(self.operation.__name__, self.names[self.begin_range], 568 | self.names[self.end_range - 1]) 569 | 570 | -------------------------------------------------------------------------------- /fastgp/algorithms/fast_evaluate.py: -------------------------------------------------------------------------------- 1 | import cachetools 2 | import numpy 3 | 4 | 5 | def fast_numpy_evaluate(ind, context, predictors, get_node_semantics, error_function=None, expression_dict=None): 6 | semantics_stack = [] 7 | expressions_stack = [] 8 | 9 | if expression_dict is None: 10 | expression_dict = cachetools.LRUCache(maxsize=100) 11 | 12 | for node in reversed(ind): 13 | expression = node.format(*[expressions_stack.pop() for _ in range(node.arity)]) 14 | subtree_semantics = [semantics_stack.pop() for _ in range(node.arity)] 15 | 16 | if expression in expression_dict: 17 | vector = expression_dict[expression] 18 | else: 19 | vector = get_node_semantics(node, subtree_semantics, predictors, context) 20 | expression_dict[expression] = vector 21 | 22 | expressions_stack.append(expression) 23 | semantics_stack.append(vector) 24 | 25 | if error_function is None: 26 | return semantics_stack.pop() 27 | else: 28 | return error_function(semantics_stack.pop()) 29 | 30 | 31 | def fast_numpy_evaluate_population(pop, context, predictors, error_func, expression_dict=None, arg_prefix="ARG"): 32 | if expression_dict is None: 33 | expression_dict = cachetools.LRUCache(maxsize=2000) 34 | 35 | results = numpy.empty(shape=(len(pop), len(predictors))) 36 | for row, ind in enumerate(pop): 37 | results[row] = fast_numpy_evaluate(ind, context, predictors, expression_dict, arg_prefix) 38 | 39 | errors = error_func(results) 40 | for ind, error in zip(pop, errors): 41 | ind.fitness.values = error, 42 | -------------------------------------------------------------------------------- /fastgp/algorithms/truncation_with_elite.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | import random 4 | 5 | from deap import tools 6 | 7 | 8 | def generate_next_population(individuals, toolbox): 9 | """ 10 | Perform truncated selection with elitism. 11 | :param individuals: 12 | :param toolbox: 13 | :return: 14 | """ 15 | individuals = [toolbox.clone(ind) for ind in individuals] 16 | individuals.sort(key=lambda x: x.error) 17 | 18 | offspring = [] 19 | pop_size = len(individuals) 20 | num_top = math.floor(pop_size / 2) 21 | parents = individuals[0:num_top + 1] 22 | for _ in range(pop_size - 1): 23 | off = toolbox.clone(random.choice(parents)) 24 | off = toolbox.mutate(off)[0] 25 | offspring.append(off) 26 | offspring.append(individuals[0]) 27 | return offspring 28 | 29 | 30 | def render_fitness(population, toolbox, history): 31 | for ind in population: 32 | ind.error = toolbox.evaluate_error(ind)[0] 33 | ind.fitness.values = ind.error, 34 | if history is not None: 35 | history.genealogy_history[ind.history_index].error = ind.error 36 | 37 | 38 | def record_information(population, stats, start, archive, logbook, verbose): 39 | record = stats.compile(population) if stats else {} 40 | logbook.record(gen=0, nevals=len(population), cpu_time=time.time() - start, **record) 41 | if archive is not None: 42 | archive.update(population) 43 | if verbose: 44 | print(logbook.stream) 45 | 46 | 47 | def optimize(population, toolbox, ngen, archive=None, stats=None, verbose=False, history=None): 48 | """ 49 | Optimize a population of individuals. 50 | :param population: 51 | :param toolbox: 52 | :param mut_prob: 53 | :param ngen: 54 | :param archive: 55 | :param stats: 56 | :param verbose: 57 | :param history: 58 | :return: 59 | """ 60 | start = time.time() 61 | if history is not None: 62 | history.update(population) 63 | logbook = tools.Logbook() 64 | logbook.header = ['gen', 'nevals', 'cpu_time'] + (stats.fields if stats else []) 65 | render_fitness(population, toolbox, history) 66 | record_information(population, stats, start, archive, logbook, verbose) 67 | for gen in range(1, ngen + 1): 68 | offspring = generate_next_population(population, toolbox) 69 | render_fitness(offspring, toolbox, history) 70 | population = offspring 71 | record_information(population, stats, start, archive, logbook, verbose) 72 | return population, logbook, history 73 | 74 | -------------------------------------------------------------------------------- /fastgp/logging/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cfusting/fastgp/1951d0d992119ec86dc60b6fde636f903c638428/fastgp/logging/__init__.py -------------------------------------------------------------------------------- /fastgp/logging/archive.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import operator 3 | from collections import defaultdict 4 | from copy import deepcopy 5 | 6 | import numpy 7 | 8 | from fastgp.algorithms import afpo 9 | from fastgp.utilities import symbreg 10 | 11 | 12 | class FitnessDistributionArchive(object): 13 | def __init__(self, frequency): 14 | self.fitness = [] 15 | self.generations = [] 16 | self.frequency = frequency 17 | self.generation_counter = 0 18 | 19 | def update(self, population): 20 | if self.generation_counter % self.frequency == 0: 21 | fitnesses = [ind.fitness.values for ind in population] 22 | self.fitness.append(fitnesses) 23 | self.generations.append(self.generation_counter) 24 | self.generation_counter += 1 25 | 26 | def save(self, log_file): 27 | fitness_distribution_file = "fitness_" + log_file 28 | with open(fitness_distribution_file, 'wb') as f: 29 | writer = csv.writer(f) 30 | for gen, ages in zip(self.generations, self.fitness): 31 | writer.writerow([gen, ages]) 32 | 33 | 34 | def pick_fitness_size_from_fitness_age_size(ind): 35 | ind.fitness.values = (ind.error, 0, len(ind)) 36 | 37 | 38 | def pick_fitness_complexity_from_fitness_age_complexity(ind): 39 | ind.fitness.values = (ind.error, 0, symbreg.calculate_order(ind)) 40 | 41 | 42 | def pick_fitness_size_complexity_from_fitness_age_size_complexity(ind): 43 | ind.fitness.values = (ind.error, 0, len(ind), symbreg.calculate_order(ind)) 44 | 45 | 46 | def pick_fitness_size_from_fitness_age(ind): 47 | ind.fitness.values = (ind.error, len(ind)) 48 | 49 | 50 | class MultiArchive(object): 51 | def __init__(self, archives): 52 | self.archives = archives 53 | 54 | def update(self, population): 55 | for archive in self.archives: 56 | archive.update(population) 57 | 58 | def save(self, log_file): 59 | for archive in self.archives: 60 | archive.save(log_file) 61 | 62 | 63 | class ParetoFrontSavingArchive(object): 64 | def __init__(self, frequency, criteria_chooser=None, simplifier=None): 65 | self.fronts = [] 66 | self.frequency = frequency 67 | self.generation_counter = 0 68 | self.criteria_chooser = criteria_chooser 69 | self.simplifier = simplifier 70 | 71 | def update(self, population): 72 | if self.generation_counter % self.frequency == 0: 73 | pop_copy = [deepcopy(ind) for ind in population] 74 | if self.simplifier is not None: 75 | self.simplifier(pop_copy) 76 | if self.criteria_chooser is not None: 77 | map(self.criteria_chooser, pop_copy) 78 | 79 | non_dominated = afpo.find_pareto_front(pop_copy) 80 | front = [pop_copy[index] for index in non_dominated] 81 | front.sort(key=operator.attrgetter("fitness.values")) 82 | self.fronts.append(front) 83 | self.generation_counter += 1 84 | 85 | def save(self, log_file): 86 | pareto_front_file = "pareto_" + log_file 87 | with open(pareto_front_file, 'w') as f: 88 | writer = csv.writer(f) 89 | generation = 0 90 | for front in self.fronts: 91 | inds = [(ind.fitness.values, str(ind)) for ind in front] 92 | writer.writerow([generation, len(inds)] + inds) 93 | generation += self.frequency 94 | 95 | 96 | class MutationStatsArchive(object): 97 | def __init__(self, evaluate_function): 98 | self.stats = defaultdict(list) 99 | self.neutral_mutations = defaultdict(int) 100 | self.detrimental_mutations = defaultdict(int) 101 | self.beneficial_mutations = defaultdict(int) 102 | self.evaluate_function = evaluate_function 103 | self.generation = -1 104 | 105 | def update(self, population): 106 | self.generation += 1 107 | 108 | def submit(self, old_ind, new_ind): 109 | old_error = self.evaluate_function(old_ind)[0] 110 | new_error = self.evaluate_function(new_ind)[0] 111 | delta_error = new_error - old_error 112 | delta_size = len(new_ind) - len(old_ind) 113 | if delta_size == 0 and numpy.isclose([delta_error], [0.0])[0]: 114 | self.neutral_mutations[self.generation] += 1 115 | if delta_error > 0: 116 | self.detrimental_mutations[self.generation] += 1 117 | elif delta_error < 0: 118 | self.beneficial_mutations[self.generation] += 1 119 | self.stats[self.generation].append((delta_error, delta_size)) 120 | 121 | def save(self, log_file): 122 | mutation_statistics_file = "mutation_stats_" + log_file 123 | fieldnames = ['generation', 'neutral_mutations', 'beneficial_mutations', 'detrimental_mutations', 'deltas'] 124 | with open(mutation_statistics_file, 'wb') as f: 125 | writer = csv.writer(f) 126 | writer.writerow(fieldnames) 127 | for gen in self.stats.keys(): 128 | writer.writerow([gen, self.neutral_mutations[gen], self.beneficial_mutations[gen], 129 | self.detrimental_mutations[gen]] + self.stats[gen]) 130 | -------------------------------------------------------------------------------- /fastgp/logging/reports.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import csv 3 | from deap import tools 4 | import numpy 5 | import operator 6 | import fastgp.parametrized.simple_parametrized_terminals as sp 7 | 8 | 9 | def get_fitness(ind): 10 | return ind.fitness.values[0] 11 | 12 | 13 | def get_mean(values): 14 | return numpy.mean(list(filter(numpy.isfinite, values))) 15 | 16 | 17 | def get_std(values): 18 | return numpy.std(list(filter(numpy.isfinite, values))) 19 | 20 | 21 | def get_min(values): 22 | return numpy.min(list(filter(numpy.isfinite, values))) 23 | 24 | 25 | def get_max(values): 26 | return numpy.max(list(filter(numpy.isfinite, values))) 27 | 28 | 29 | def get_size_min(values): 30 | return min(values)[1] 31 | 32 | 33 | def get_size_max(values): 34 | return max(values)[1] 35 | 36 | 37 | def get_fitness_size(ind): 38 | return ind.fitness.values[0], len(ind) 39 | 40 | 41 | def configure_inf_protected_stats(): 42 | stats_fit = tools.Statistics(get_fitness) 43 | stats_size = tools.Statistics(len) 44 | stats_height = tools.Statistics(operator.attrgetter("height")) 45 | mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size, height=stats_height) 46 | mstats.register("avg", get_mean) 47 | mstats.register("std", get_std) 48 | mstats.register("min", get_min) 49 | mstats.register("max", get_max) 50 | 51 | stats_best_ind = tools.Statistics(get_fitness_size) 52 | stats_best_ind.register("size_min", get_size_min) 53 | stats_best_ind.register("size_max", get_size_max) 54 | mstats["best_tree"] = stats_best_ind 55 | return mstats 56 | 57 | 58 | def is_parametrized_terminal(node): 59 | return isinstance(node, sp.SimpleParametrizedTerminal) 60 | 61 | 62 | def get_param_ratio(ind): 63 | parametrized = len(list(filter(is_parametrized_terminal, ind))) 64 | total = len(ind) 65 | return parametrized / total 66 | 67 | 68 | def configure_parametrized_inf_protected_stats(): 69 | stats_fit = tools.Statistics(get_fitness) 70 | stats_size = tools.Statistics(len) 71 | stats_height = tools.Statistics(operator.attrgetter("height")) 72 | 73 | stats_parametrized = tools.Statistics(get_param_ratio) 74 | mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size, height=stats_height, 75 | parametrized=stats_parametrized) 76 | mstats.register("avg", get_mean) 77 | mstats.register("std", get_std) 78 | mstats.register("min", get_min) 79 | mstats.register("max", get_max) 80 | stats_best_ind = tools.Statistics(get_fitness_size) 81 | stats_best_ind.register("size_min", get_size_min) 82 | stats_best_ind.register("size_max", get_size_max) 83 | mstats["best_tree"] = stats_best_ind 84 | return mstats 85 | 86 | 87 | def get_age(ind): 88 | return ind.age 89 | 90 | 91 | def add_age_to_stats(mstats): 92 | stats_age = tools.Statistics(get_age) 93 | stats_age.register("avg", numpy.mean) 94 | stats_age.register("std", numpy.std) 95 | stats_age.register("max", numpy.max) 96 | mstats["age"] = stats_age 97 | return mstats 98 | 99 | 100 | def save_log_to_csv(log, file_path): 101 | columns = [log.select("cpu_time")] 102 | columns_names = ["cpu_time"] 103 | for chapter_name, chapter in log.chapters.items(): 104 | for column in chapter[0].keys(): 105 | columns_names.append(str(column) + "_" + str(chapter_name)) 106 | columns.append(chapter.select(column)) 107 | 108 | rows = zip(*columns) 109 | with open(file_path + '.csv', 'w') as f: 110 | writer = csv.writer(f) 111 | writer.writerow(columns_names) 112 | for row in rows: 113 | writer.writerow(row) 114 | 115 | 116 | def save_hof(hof, test_toolbox=None): 117 | def decorator(func): 118 | def wrapper(pop, log, file_name): 119 | func(pop, log, file_name) 120 | hof_file_name = "trees_" + file_name 121 | with open(hof_file_name, 'wb') as f: 122 | writer = csv.writer(f) 123 | writer.writerow(["gen", "fitness", "tree"]) 124 | for gen, ind in enumerate(hof.historical_trees): 125 | if test_toolbox is not None: 126 | test_error = test_toolbox.test_evaluate(ind)[0] 127 | writer.writerow([gen, ind.fitness, str(ind), test_error]) 128 | else: 129 | writer.writerow([gen, ind.fitness, str(ind)]) 130 | return wrapper 131 | return decorator 132 | 133 | 134 | def save_archive(archive): 135 | def decorator(func): 136 | def wrapper(pop, log, file_name): 137 | func(pop, log, file_name) 138 | archive.save(file_name) 139 | return wrapper 140 | return decorator 141 | -------------------------------------------------------------------------------- /fastgp/parametrized/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'mszubert' 2 | -------------------------------------------------------------------------------- /fastgp/parametrized/mutation.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def multi_mutation(ind, mutations, probs): 5 | for mutation, probability in zip(mutations, probs): 6 | if random.random() < probability: 7 | ind = mutation(ind), 8 | return ind, 9 | 10 | 11 | def multi_mutation_exclusive(ind, mutations, probs): 12 | if len(mutations) != len(probs): 13 | raise ValueError("Must have the same number of mutations as probabilities.") 14 | if sum(probs) > 1: 15 | raise ValueError("Probabilities must sum to 1.") 16 | prob_range = [0] + probs 17 | value = random.random() 18 | i = 1 19 | while i < len(prob_range): 20 | prob_range[i] += prob_range[i - 1] 21 | if prob_range[i - 1] <= value < prob_range[i]: 22 | mutations[i - 1](ind) 23 | return ind, 24 | i += 1 25 | return ind, 26 | -------------------------------------------------------------------------------- /fastgp/parametrized/simple_parametrized_terminals.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import itertools 4 | from functools import partial 5 | import math 6 | import re 7 | 8 | import cachetools 9 | import numpy as np 10 | from scipy.stats import skew, moment 11 | from copy import deepcopy 12 | 13 | from deap import gp 14 | 15 | 16 | class SimpleParametrizedPrimitiveSet(gp.PrimitiveSet): 17 | def __init__(self, name, arity, variable_type_indices, variable_names, prefix="ARG"): 18 | gp.PrimitiveSet.__init__(self, name, arity, prefix) 19 | self.variable_type_indices = variable_type_indices 20 | self.variable_names = variable_names 21 | 22 | def add_parametrized_terminal(self, parametrized_terminal_class): 23 | self._add(parametrized_terminal_class) 24 | self.context[parametrized_terminal_class.__name__] = parametrized_terminal_class.call 25 | 26 | 27 | class SimpleParametrizedPrimitiveTree(gp.PrimitiveTree): 28 | def __init__(self, content): 29 | gp.PrimitiveTree.__init__(self, content) 30 | 31 | def __deepcopy__(self, memo): 32 | new = self.__class__(self) 33 | for i, node in enumerate(self): 34 | if isinstance(node, SimpleParametrizedTerminal): 35 | new[i] = deepcopy(node) 36 | new.__dict__.update(deepcopy(self.__dict__, memo)) 37 | return new 38 | 39 | @classmethod 40 | def from_string(cls, string, pset): 41 | """Try to convert a string expression into a PrimitiveTree given a 42 | PrimitiveSet *pset*. The primitive set needs to contain every primitive 43 | present in the expression. 44 | 45 | :param string: String representation of a Python expression. 46 | :param pset: Primitive set from which primitives are selected. 47 | :returns: PrimitiveTree populated with the deserialized primitives. 48 | """ 49 | tokens = re.split("[ \t\n\r\f\v(),]", string) 50 | expr = [] 51 | 52 | def get_parts(token_string): 53 | parts = tokens[i].split('_') 54 | return parts[1], parts[2], parts[3] 55 | i = 0 56 | while i < len(tokens): 57 | if tokens[i] == '': 58 | i += 1 59 | continue 60 | if tokens[i] in pset.mapping: 61 | primitive = pset.mapping[tokens[i]] 62 | expr.append(primitive) 63 | elif RangeOperationTerminal.NAME in tokens[i]: 64 | operation, begin_range_name, end_range_name = get_parts(tokens[i]) 65 | range_operation_terminal = RangeOperationTerminal() 66 | range_operation_terminal.initialize_parameters(pset.variable_type_indices, pset.variable_names, 67 | operation, begin_range_name, end_range_name) 68 | expr.append(range_operation_terminal) 69 | elif MomentFindingTerminal.NAME in tokens[i]: 70 | operation, begin_range_name, end_range_name = get_parts(tokens[i]) 71 | moment_operation_terminal = MomentFindingTerminal() 72 | moment_operation_terminal.initialize_parameters(pset.variable_type_indices, pset.variable_names, 73 | operation, begin_range_name, end_range_name) 74 | expr.append(moment_operation_terminal) 75 | else: 76 | try: 77 | token = eval(tokens[i]) 78 | except NameError: 79 | raise TypeError("Unable to evaluate terminal: {}.".format(tokens[i])) 80 | expr.append(gp.Terminal(token, False, gp.__type__)) 81 | i += 1 82 | return cls(expr) 83 | 84 | 85 | class SimpleParametrizedTerminal(gp.Terminal): 86 | ret = object 87 | 88 | def __init__(self, name="SimpleParametrizedTerminal", ret_type=object): 89 | gp.Terminal.__init__(self, name, True, ret_type) 90 | 91 | def __deepcopy__(self, memo): 92 | new = self.__class__() 93 | new.__dict__.update(deepcopy(self.__dict__, memo)) 94 | return new 95 | 96 | def initialize_parameters(self, variable_type_indices, names): 97 | raise NotImplementedError 98 | 99 | def create_input_vector(self, predictors): 100 | raise NotImplementedError 101 | 102 | def call(*parameters): 103 | pass # implement this method to make the class work with standard gp.compile 104 | 105 | 106 | def name_operation(operation, name): 107 | operation.__name__ = name 108 | return operation 109 | 110 | 111 | class RangeOperationTerminal(SimpleParametrizedTerminal): 112 | NAME = 'RangeOperation' 113 | 114 | def __init__(self): 115 | SimpleParametrizedTerminal.__init__(self, RangeOperationTerminal.__name__) 116 | self.begin_range = None 117 | self.end_range = None 118 | self.operation = None 119 | self.names = None 120 | self.lower_bound = None 121 | self.upper_bound = None 122 | self.operations = { 123 | 'sum': name_operation(np.sum, 'sum'), 124 | 'min': name_operation(np.min, 'min'), 125 | 'max': name_operation(np.max, 'max') 126 | } 127 | 128 | def initialize_parameters(self, variable_type_indices, names, operation=None, begin_range_name=None, 129 | end_range_name=None, *args): 130 | """ 131 | :param variable_type_indices: A sequence of variable type indices where each entry defines the 132 | index of a variable type in the design matrix. For example a design matrix with two variable types will have 133 | indices [j,n] where variable type A spans 0 to j and variable type B spans j + 1 to n. 134 | :param names: 135 | :param args: 136 | :param operation 137 | :param begin_range_name 138 | :param end_range_name 139 | :return: 140 | """ 141 | self.names = names 142 | for r in variable_type_indices: 143 | if r[1] - r[0] < 2: 144 | raise ValueError('Invalid range provided to Range Terminal: ' + str(r)) 145 | rng = random.choice(variable_type_indices) 146 | self.lower_bound = rng[0] 147 | self.upper_bound = rng[1] 148 | if operation is not None and begin_range_name is not None and end_range_name is not None: 149 | if self.operations.get(operation) is None: 150 | raise ValueError('Invalid operation provided to Range Terminal: ' + operation) 151 | if begin_range_name not in self.names: 152 | raise ValueError('Invalid range name provided to Range Termnial: ' + str(begin_range_name)) 153 | if end_range_name not in names: 154 | raise ValueError('Invalid range name provided to Range Termnial: ' + str(end_range_name)) 155 | begin_range = self.names.index(begin_range_name) 156 | end_range = self.names.index(end_range_name) 157 | valid = False 158 | for r in variable_type_indices: 159 | if r[0] <= begin_range < end_range <= r[1]: 160 | valid = True 161 | if not valid: 162 | raise ValueError('Invalid range provided to Range Terminal: (' + str(begin_range) + ',' + 163 | str(end_range) + ')') 164 | self.operation = self.operations[operation] 165 | self.begin_range = begin_range 166 | self.end_range = end_range 167 | else: 168 | self.operation = random.choice(list(self.operations.values())) 169 | self.begin_range = np.random.randint(self.lower_bound, self.upper_bound - 1) 170 | self.end_range = np.random.randint(self.begin_range + 1, self.upper_bound) 171 | 172 | def mutate_parameters(self, stdev_calc): 173 | mutation = random.choice(['low', 'high']) 174 | span = self.end_range - self.begin_range 175 | if span == 0: 176 | span = 1 177 | value = random.gauss(0, stdev_calc(span)) 178 | amount = int(math.ceil(abs(value))) 179 | if value < 0: 180 | amount *= -1 181 | if mutation == 'low': 182 | location = amount + self.begin_range 183 | if location < self.lower_bound: 184 | self.begin_range = self.lower_bound 185 | elif location > self.end_range - 2: 186 | self.begin_range = self.end_range - 2 187 | elif location > self.upper_bound - 2: 188 | self.begin_range = self.upper_bound - 2 189 | else: 190 | self.begin_range = location 191 | elif mutation == 'high': 192 | location = amount + self.end_range 193 | if location > self.upper_bound: 194 | self.end_range = self.upper_bound 195 | elif location < self.begin_range + 2: 196 | self.end_range = self.begin_range + 2 197 | elif location < self.lower_bound + 2: 198 | self.end_range = self.lower_bound + 2 199 | else: 200 | self.end_range = location 201 | 202 | def create_input_vector(self, predictors): 203 | array = predictors[:, self.begin_range:self.end_range] 204 | if array.shape[1] == 0: 205 | return np.zeros((array.shape[0], 1)) 206 | else: 207 | return self.operation(array, axis=1) 208 | 209 | def format(self): 210 | return "RangeOperation_{}_{}_{}".format(self.operation.__name__, self.names[self.begin_range], 211 | self.names[self.end_range - 1]) 212 | 213 | 214 | class MomentFindingTerminal(RangeOperationTerminal): 215 | NAME = 'MomentOperation' 216 | 217 | def __init__(self): 218 | super(MomentFindingTerminal, self).__init__() 219 | self.operations = { 220 | 'mean': name_operation(np.mean, 'mean'), 221 | 'vari': name_operation(np.var, 'vari'), 222 | 'skew': name_operation(skew, 'skew') 223 | } 224 | 225 | def initialize_parameters(self, variable_type_indices, names, operation=None, begin_range_name=None, 226 | end_range_name=None, *args): 227 | if operation is None: 228 | super(MomentFindingTerminal, self).initialize_parameters(variable_type_indices, names) 229 | self.operation = random.choice(list(self.operations.values())) 230 | else: 231 | super(MomentFindingTerminal, self).initialize_parameters(variable_type_indices, names, operation, 232 | begin_range_name, end_range_name, *args) 233 | 234 | def format(self): 235 | return "MomentOperation_{}_{}_{}".format(self.operation.__name__, self.names[self.begin_range], 236 | self.names[self.end_range - 1]) 237 | 238 | 239 | class PolynomialFindingTerminal(RangeOperationTerminal): 240 | NAME = 'PolynomialOperation' 241 | 242 | def __init__(self): 243 | super(PolynomialFindingTerminal, self).__init__() 244 | self.operations = { 245 | 'first': self.first, 246 | 'second': self.second, 247 | 'third': self.third 248 | } 249 | 250 | def first(self, X, axis=1): 251 | return self.polynomial(X, 1) 252 | 253 | def second(self, X, axis=1): 254 | return self.polynomial(X, 2) 255 | 256 | def third(self, X, axis=1): 257 | return self.polynomial(X, 3) 258 | 259 | def polynomial(self, X, order, interactions=False): 260 | start = time.time() 261 | orders = [] 262 | for o in range(1, order + 1): 263 | orders.append(np.apply_along_axis(lambda x: np.power(x, o), 1, X)) 264 | matrix = np.concatenate(orders, axis=1) 265 | rows = matrix.shape[0] 266 | cols = matrix.shape[1] 267 | result = np.zeros(rows) 268 | if interactions: 269 | indices = [x for x in range(cols)] 270 | for c in range(1, cols): 271 | for comb in itertools.combinations(indices, c): 272 | M = np.ones(rows) 273 | for j in comb: 274 | M *= matrix[:, j].reshape(rows) 275 | result += M 276 | else: 277 | result = np.sum(matrix, axis=1) 278 | return result 279 | 280 | def initialize_parameters(self, variable_type_indices, names, operation=None, begin_range_name=None, 281 | end_range_name=None, *args): 282 | if operation is None: 283 | super(PolynomialFindingTerminal, self).initialize_parameters(variable_type_indices, names) 284 | self.operation = random.choice(list(self.operations.values())) 285 | else: 286 | super(PolynomialFindingTerminal, self).initialize_parameters(variable_type_indices, names, operation, 287 | begin_range_name, end_range_name, *args) 288 | 289 | def format(self): 290 | return "PolynomialOperation{}_{}_{}".format(self.operation.__name__, self.names[self.begin_range], 291 | self.names[self.end_range - 1]) 292 | 293 | 294 | def named_moment(number): 295 | def f(vector, axis=0): 296 | return moment(vector, moment=number, axis=axis) 297 | f.__name__ = "moment_" + str(number) 298 | return f 299 | 300 | 301 | def generate_parametrized_expression(generate_expression, variable_type_indices, names): 302 | expr = generate_expression() 303 | for node in expr: 304 | if isinstance(node, SimpleParametrizedTerminal): 305 | node.initialize_parameters(variable_type_indices, names) 306 | return expr 307 | 308 | 309 | def evolve_parametrized_expression(stdev_calc): 310 | def decorator(func): 311 | def wrapper(*args, **kargs): 312 | offspring = list(func(*args, **kargs)) 313 | for ind in offspring: 314 | for node in ind: 315 | if isinstance(node, SimpleParametrizedTerminal): 316 | node.mutate_parameters(stdev_calc) 317 | return offspring 318 | return wrapper 319 | return decorator 320 | 321 | 322 | def get_parametrized_nodes(ind): 323 | return list(filter(lambda node: isinstance(node, SimpleParametrizedTerminal), ind)) 324 | 325 | 326 | def mutate_parametrized_nodes(ind, stdev_calc): 327 | param_nodes = get_parametrized_nodes(ind) 328 | map(lambda node: node.mutate_parameters(stdev_calc), param_nodes) 329 | return ind, 330 | 331 | 332 | def mutate_single_parametrized_node(ind, stdev_calc): 333 | param_nodes = get_parametrized_nodes(ind) 334 | if len(param_nodes) != 0: 335 | random.choice(param_nodes).mutate_parameters(stdev_calc) 336 | return ind, 337 | 338 | 339 | def search_entire_space(node, evaluate_function): 340 | fitness = [] 341 | parameters = [] 342 | begin = node.lower_bound 343 | while begin <= node.upper_bound: 344 | end = begin + 1 345 | while end <= node.upper_bound: 346 | node.begin_range = begin 347 | node.end_range = end 348 | fitness.append(evaluate_function()) 349 | parameters.append((begin, end)) 350 | end += 1 351 | begin += 1 352 | return parameters, fitness 353 | 354 | 355 | def optimize_node(node, evaluate_function, optimization_objective_function): 356 | parameters, fitness = search_entire_space(node, evaluate_function) 357 | best_value = optimization_objective_function(fitness) 358 | optimal_index = fitness.index(best_value) 359 | begin, end = parameters[optimal_index] 360 | node.begin_range = begin 361 | node.end_range = end 362 | return parameters, fitness 363 | 364 | 365 | def mutate_single_parametrized_node_optimal(ind, evaluate_function, optimization_objective_function): 366 | param_nodes = get_parametrized_nodes(ind) 367 | if len(param_nodes) != 0: 368 | node = random.choice(param_nodes) 369 | optimize_node(node, partial(evaluate_function, ind=ind), optimization_objective_function) 370 | return ind, 371 | 372 | 373 | def simple_parametrized_evaluate(ind, context, predictors, error_function=None, expression_dict=None): 374 | semantics_stack = [] 375 | expressions_stack = [] 376 | 377 | if expression_dict is None: 378 | expression_dict = cachetools.LRUCache(maxsize=100) 379 | 380 | for node in reversed(ind): 381 | expression = node.format(*[expressions_stack.pop() for _ in range(node.arity)]) 382 | subtree_semantics = [semantics_stack.pop() for _ in range(node.arity)] 383 | 384 | if expression in expression_dict: 385 | vector = expression_dict[expression] 386 | else: 387 | vector = get_node_semantics(node, subtree_semantics, predictors, context) 388 | expression_dict[expression] = vector 389 | 390 | expressions_stack.append(expression) 391 | semantics_stack.append(vector) 392 | 393 | if error_function is None: 394 | return semantics_stack.pop() 395 | else: 396 | return error_function(semantics_stack.pop()) 397 | 398 | 399 | def get_terminal_semantics(node, context, predictors): 400 | if isinstance(node, gp.Ephemeral) or isinstance(node.value, float) or isinstance(node.value, int): 401 | return np.ones(len(predictors)) * node.value 402 | 403 | if node.value in context: 404 | return np.ones(len(predictors)) * context[node.value] 405 | 406 | arg_index = re.findall('\d+', node.name) 407 | return predictors[:, int(arg_index[0])] 408 | 409 | 410 | def get_node_semantics(node, subtree_semantics, predictors, context): 411 | if isinstance(node, SimpleParametrizedTerminal): 412 | vector = node.create_input_vector(predictors) 413 | elif isinstance(node, gp.Terminal): 414 | vector = get_terminal_semantics(node, context, predictors) 415 | else: 416 | with np.errstate(over='ignore', divide='ignore', invalid='ignore'): 417 | vector = context[node.name](*list(map(lambda x: x.astype(float) if type(x) != float else x, 418 | subtree_semantics))) 419 | return vector 420 | 421 | 422 | def graph(expr): 423 | nodes = range(len(expr)) 424 | edges = list() 425 | labels = dict() 426 | 427 | stack = [] 428 | for i, node in enumerate(expr): 429 | if stack: 430 | edges.append((stack[-1][0], i)) 431 | stack[-1][1] -= 1 432 | if isinstance(node, gp.Primitive): 433 | labels[i] = node.name 434 | elif isinstance(node, SimpleParametrizedTerminal): 435 | labels[i] = node.format() 436 | else: 437 | labels[i] = node.value 438 | stack.append([i, node.arity]) 439 | while stack and stack[-1][1] == 0: 440 | stack.pop() 441 | return nodes, edges, labels 442 | -------------------------------------------------------------------------------- /fastgp/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cfusting/fastgp/1951d0d992119ec86dc60b6fde636f903c638428/fastgp/utilities/__init__.py -------------------------------------------------------------------------------- /fastgp/utilities/benchmark_problems.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | # 4 * x^4 + 3 * x^3 + 2 * x^2 + x 5 | def mod_quartic(x): 6 | return x * (1 + x * (2 + x * (3 + x * 4))) 7 | 8 | 9 | # Koza-1: x^4 + x^3 + x^2 + x 10 | def quartic(x): 11 | return x * (1 + x * (1 + x * (1 + x))) 12 | 13 | 14 | # Koza-2: x^5 - 2x^3 + x 15 | def quintic(x): 16 | return x * (1 - x * x * (2 - x * x)) 17 | 18 | 19 | # Koza-3: x^6 - 2x^4 + x^2 20 | def sextic(x): 21 | return x * x * (1 - x * x * (2 - x * x)) 22 | 23 | 24 | # x^7 - 2x^6 + x^5 - x^4 + x^3 - 2x^2 + x 25 | def septic(x): 26 | return x * (1 - x * (2 - x * (1 - x * (1 - x * (1 - x * (2 - x)))))) 27 | 28 | 29 | # sum_{1}^9{x^i} 30 | def nonic(x): 31 | return x * (1 + x * (1 + x * (1 + x * (1 + x * (1 + x * (1 + x * (1 + x * (1 + x)))))))) 32 | 33 | 34 | # x^3 + x^2 + x 35 | def nguyen1(x): 36 | return x * (1 + x * (1 + x)) 37 | 38 | 39 | # x^5 + x^4 + x^3 + x^2 + x 40 | def nguyen3(x): 41 | return x * (1 + x * (1 + x * (1 + x * (1 + x)))) 42 | 43 | 44 | # x^6 + x^5 + x^4 + x^3 + x^2 + x 45 | def nguyen4(x): 46 | return x * (1 + x * (1 + x * (1 + x * (1 + x * (1 + x))))) 47 | 48 | 49 | def nguyen5(x): 50 | return math.sin(x * x) * math.cos(x) - 1 51 | 52 | 53 | def nguyen6(x): 54 | return math.sin(x) + math.sin(x * (1 + x)) 55 | 56 | 57 | def nguyen7(x): 58 | return math.log(x + 1) + math.log(x * x + 1) 59 | 60 | 61 | def nguyen9(x, y): 62 | return math.sin(x) + math.sin(y * y) 63 | 64 | 65 | def nguyen10(x, y): 66 | return 2 * math.sin(x) * math.cos(y) 67 | 68 | 69 | def nguyen12(x, y): 70 | return x ** 4 - x ** 3 + (y ** 2 / 2.0) - y 71 | 72 | 73 | def keijzer1(x): 74 | return 0.3 * x * math.sin(2 * math.pi * x) 75 | 76 | 77 | def keijzer4(x): 78 | return x ** 3 * math.exp(-x) * math.cos(x) * math.sin(x) * (math.sin(x) ** 2 * math.cos(x) - 1) 79 | 80 | 81 | def keijzer11(x, y): 82 | return (x * y) + math.sin((x - 1) * (y - 1)) 83 | 84 | 85 | def keijzer12(x, y): 86 | return x ** 4 - x ** 3 + (y ** 2 / 2.0) - y 87 | 88 | 89 | def keijzer13(x, y): 90 | return 6 * math.sin(x) * math.cos(y) 91 | 92 | 93 | def keijzer14(x, y): 94 | return 8.0 / (2 + x ** 2 + y ** 2) 95 | 96 | 97 | def keijzer15(x, y): 98 | return (x ** 3 / 5.0) + (y ** 3 / 2.0) - x - y 99 | 100 | 101 | def r1(x): 102 | return ((x + 1) ** 3) / (x ** 2 - x + 1) 103 | 104 | 105 | def r2(x): 106 | return (x ** 5 - (3 * (x ** 3)) + 1) / (x ** 2 + 1) 107 | 108 | 109 | def r3(x): 110 | return (x ** 6 + x ** 5) / (x ** 4 + x ** 3 + x ** 2 + x + 1) 111 | 112 | 113 | def pagie1(x, y): 114 | return (1 / (1 + x ** -4)) + (1 / (1 + y ** -4)) 115 | -------------------------------------------------------------------------------- /fastgp/utilities/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from scipy.stats import pearsonr, spearmanr 3 | 4 | from fastgp.utilities.symbreg import numpy_protected_div_dividend 5 | 6 | 7 | def mean_absolute_error(vector, response): 8 | errors = numpy.abs(vector - response) 9 | mean_error = numpy.mean(errors) 10 | if not numpy.isfinite(mean_error): 11 | return numpy.inf, 12 | return mean_error.item(), 13 | 14 | 15 | def euclidean_error(vector, response): 16 | with numpy.errstate(over='ignore', divide='ignore', invalid='ignore'): 17 | squared_errors = numpy.square(vector - response) 18 | sum_squared_errors = numpy.sum(squared_errors) 19 | if not numpy.isfinite(sum_squared_errors): 20 | return numpy.inf, 21 | distance = numpy.sqrt(sum_squared_errors) 22 | return distance.item(), 23 | 24 | 25 | def root_mean_square_error(vector, response): 26 | with numpy.errstate(over='ignore', divide='ignore', invalid='ignore'): 27 | squared_errors = numpy.square(vector - response) 28 | mse = numpy.mean(squared_errors) 29 | if not numpy.isfinite(mse): 30 | return numpy.inf, 31 | rmse = numpy.sqrt(mse) 32 | return rmse.item(), 33 | 34 | 35 | def mean_squared_error(vector, response): 36 | squared_errors = numpy.square(vector - response) 37 | mse = float(numpy.mean(squared_errors)) 38 | if not numpy.isfinite(mse): 39 | return numpy.inf, 40 | return mse, 41 | 42 | 43 | def pearson_correlation(vector, response): 44 | return pearsonr(vector, response) 45 | 46 | 47 | def spearman_correlation(vector, response): 48 | return spearmanr(vector, response) 49 | 50 | 51 | def normalized_cumulative_absolute_error(vector, response, threshold=0.0): 52 | errors = numpy.abs(vector - response) 53 | raw_sum = numpy.sum(errors) 54 | if not numpy.isfinite(raw_sum): 55 | return 0.0, 56 | 57 | errors[errors < threshold] = 0 58 | cumulative_error = numpy.sum(errors).item() 59 | return 1 / (1 + cumulative_error), 60 | 61 | 62 | def mean_absolute_percentage_error(vector, response): 63 | with numpy.errstate(over='ignore', divide='ignore', invalid='ignore'): 64 | errors = numpy_protected_div_dividend((vector - response), response) 65 | errors = numpy_protected_div_dividend(errors, float(len(response))) 66 | mean_error = numpy.sum(numpy.abs(errors)) 67 | if numpy.isnan(mean_error) or not numpy.isfinite(mean_error): 68 | return numpy.inf, 69 | return mean_error, 70 | 71 | 72 | def percentage_error(vector, response, threshold=0.0): 73 | errors = numpy.abs(vector - response) 74 | raw_sum = numpy.sum(errors) 75 | if not numpy.isfinite(raw_sum): 76 | return 0.0, 77 | 78 | errors[errors < threshold] = 0 79 | cumulative_error = numpy.sum(errors).item() 80 | cumulative_response = numpy.sum(response).item() 81 | return numpy_protected_div_dividend(cumulative_error, cumulative_response), 82 | 83 | 84 | def cumulative_absolute_error(vector, response): 85 | errors = numpy.abs(vector - response) 86 | cumulative_error = numpy.sum(errors) 87 | if not numpy.isfinite(cumulative_error): 88 | return numpy.inf, 89 | return cumulative_error.item(), 90 | 91 | 92 | def normalized_mean_squared_error(vector, response): 93 | squared_errors = numpy.square(vector - response) 94 | mse = numpy.mean(squared_errors) 95 | if not numpy.isfinite(mse): 96 | return numpy.inf, 97 | normalized_mse = mse / numpy.var(response) 98 | return normalized_mse.item(), 99 | 100 | -------------------------------------------------------------------------------- /fastgp/utilities/operators.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from functools import wraps 3 | import random 4 | 5 | 6 | def static_limit(key, max_value): 7 | def decorator(func): 8 | @wraps(func) 9 | def wrapper(*args, **kwargs): 10 | keep_inds = [copy.deepcopy(ind) for ind in args] 11 | new_inds = list(func(*args, **kwargs)) 12 | for i, ind in enumerate(new_inds): 13 | if key(ind) > max_value: 14 | new_inds[i] = copy.deepcopy(random.choice(keep_inds)) 15 | return new_inds 16 | return wrapper 17 | return decorator 18 | 19 | 20 | def stats_collector(archive): 21 | def decorator(func): 22 | @wraps(func) 23 | def wrapper(*args, **kwargs): 24 | keep_inds = [copy.deepcopy(ind) for ind in args] 25 | new_inds = list(func(*args, **kwargs)) 26 | for old_ind, new_ind in zip(keep_inds, new_inds): 27 | archive.submit(old_ind, new_ind) 28 | return new_inds 29 | return wrapper 30 | return decorator 31 | 32 | 33 | def internally_biased_node_selector(individual, bias): 34 | internal_nodes = [] 35 | leaves = [] 36 | 37 | for index, node in enumerate(individual): 38 | if node.arity == 0: 39 | leaves.append(index) 40 | else: 41 | internal_nodes.append(index) 42 | 43 | if internal_nodes and random.random() < bias: 44 | return random.choice(internal_nodes) 45 | else: 46 | return random.choice(leaves) 47 | 48 | 49 | def get_node_indices_at_depth(individual, level): 50 | stack = [0] 51 | nodes_at_depth = [] 52 | for index, node in enumerate(individual): 53 | current_depth = stack.pop() 54 | if current_depth == level: 55 | nodes_at_depth.append(index) 56 | stack.extend([current_depth + 1] * node.arity) 57 | 58 | return nodes_at_depth 59 | 60 | 61 | def uniform_depth_node_selector(individual): 62 | depth = random.randint(0, individual.height) 63 | nodes_at_depth = get_node_indices_at_depth(individual, depth) 64 | return random.choice(nodes_at_depth) 65 | 66 | 67 | def uniform_depth_mutation(individual, expr, pset): 68 | node_index = uniform_depth_node_selector(individual) 69 | slice_ = individual.searchSubtree(node_index) 70 | type_ = individual[node_index].ret 71 | individual[slice_] = expr(pset=pset, type_=type_) 72 | return individual, 73 | 74 | 75 | def multi_mutation(ind, mutations, probs): 76 | for mutation, probability in zip(mutations, probs): 77 | if random.random() < probability: 78 | ind, = mutation(ind) 79 | return ind, 80 | 81 | 82 | def one_point_xover_biased(ind1, ind2, node_selector): 83 | if len(ind1) < 2 or len(ind2) < 2: 84 | return ind1, ind2 85 | 86 | index1 = node_selector(ind1) 87 | index2 = node_selector(ind2) 88 | slice1 = ind1.searchSubtree(index1) 89 | slice2 = ind2.searchSubtree(index2) 90 | ind1[slice1], ind2[slice2] = ind2[slice2], ind1[slice1] 91 | 92 | return ind1, ind2 93 | 94 | 95 | def mutation_biased(ind, expr, node_selector): 96 | index = node_selector(ind) 97 | slice1 = ind.searchSubtree(index) 98 | ind[slice1] = expr() 99 | return ind, 100 | 101 | 102 | def static_limit_retries(key, max_value, num_retries): 103 | def decorator(func): 104 | @wraps(func) 105 | def wrapper(*args, **kwargs): 106 | keep_inds = [copy.deepcopy(ind) for ind in args] 107 | 108 | for _ in range(num_retries): 109 | new_inds = list(func(*args, **kwargs)) 110 | all_within_limit = True 111 | for i, ind in enumerate(new_inds): 112 | if key(ind) > max_value: 113 | all_within_limit = False 114 | break 115 | if all_within_limit: 116 | return new_inds 117 | 118 | new_inds = list(func(*args, **kwargs)) 119 | for i, ind in enumerate(new_inds): 120 | if key(ind) > max_value: 121 | new_inds[i] = random.choice(keep_inds) 122 | return new_inds 123 | return wrapper 124 | return decorator 125 | -------------------------------------------------------------------------------- /fastgp/utilities/subset_selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from fastgp.algorithms import fast_evaluate 4 | 5 | 6 | class SubsetSelectionArchive(object): 7 | def __init__(self, frequency, predictors, response, subset_size, expression_dict): 8 | self.expression_dict = expression_dict 9 | self.frequency = frequency 10 | self.predictors = predictors 11 | self.response = response 12 | self.subset_size = subset_size 13 | self.num_obs = len(predictors) 14 | 15 | selected_indices = np.random.choice(self.num_obs, self.subset_size, replace=False) 16 | self.training_subset = np.zeros(self.num_obs, np.bool) 17 | self.training_subset[selected_indices] = 1 18 | self.subset_predictors = self.predictors[self.training_subset, :] 19 | self.subset_response = self.response[self.training_subset] 20 | self.generation_counter = 0 21 | 22 | def update(self, population): 23 | raise NotImplementedError 24 | 25 | def set_difficulty(self, errors): 26 | pass 27 | 28 | def get_data_subset(self): 29 | return self.subset_predictors, self.subset_response 30 | 31 | def get_indices(self): 32 | return np.arange(self.num_obs)[self.training_subset] 33 | 34 | def save(self, log_file): 35 | pass 36 | 37 | 38 | class RandomSubsetSelectionArchive(SubsetSelectionArchive): 39 | def __init__(self, frequency, predictors, response, subset_size, expression_dict): 40 | SubsetSelectionArchive.__init__(self, frequency, predictors, response, subset_size, expression_dict) 41 | 42 | def update(self, population): 43 | if self.generation_counter % self.frequency == 0: 44 | selected_indices = np.random.choice(self.num_obs, self.subset_size, replace=False) 45 | self.training_subset = np.zeros(self.num_obs, np.bool) 46 | self.training_subset[selected_indices] = 1 47 | self.subset_predictors = self.predictors[self.training_subset, :] 48 | self.subset_response = self.response[self.training_subset] 49 | self.expression_dict.clear() 50 | self.generation_counter += 1 51 | 52 | 53 | def fast_numpy_evaluate_subset(ind, context, subset_selection_archive, get_node_semantics, 54 | inner_evaluate_function=fast_evaluate.fast_numpy_evaluate, 55 | error_function=None, expression_dict=None): 56 | predictors, response = subset_selection_archive.get_data_subset() 57 | root_semantics = inner_evaluate_function(ind, context, predictors, get_node_semantics, error_function=None, 58 | expression_dict=expression_dict) 59 | return error_function(root_semantics, response) 60 | -------------------------------------------------------------------------------- /fastgp/utilities/symbreg.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from deap import gp 4 | import numpy 5 | 6 | 7 | def protected_div_one(left, right): 8 | try: 9 | return left / right 10 | except ZeroDivisionError: 11 | return 1 12 | 13 | 14 | def protected_div_zero(left, right): 15 | try: 16 | return left / right 17 | except ZeroDivisionError: 18 | return 0 19 | 20 | 21 | def protected_div_dividend(left, right): 22 | if right != 0: 23 | return left / right 24 | else: 25 | return left 26 | 27 | 28 | def aq(left, right): 29 | with numpy.errstate(divide='ignore', invalid='ignore'): 30 | x = numpy.divide(left, numpy.sqrt(1 + numpy.square(right))) 31 | if isinstance(x, numpy.ndarray): 32 | x[numpy.isinf(x)] = left[numpy.isinf(x)] 33 | x[numpy.isnan(x)] = left[numpy.isnan(x)] 34 | elif numpy.isinf(x) or numpy.isnan(x): 35 | x = left 36 | return x 37 | 38 | 39 | def numpy_protected_div_dividend(left, right): 40 | with numpy.errstate(divide='ignore', invalid='ignore'): 41 | x = numpy.divide(left, right) 42 | if isinstance(x, numpy.ndarray): 43 | x[numpy.isinf(x)] = left[numpy.isinf(x)] 44 | x[numpy.isnan(x)] = left[numpy.isnan(x)] 45 | elif numpy.isinf(x) or numpy.isnan(x): 46 | x = left 47 | return x 48 | 49 | 50 | def numpy_protected_div_zero(left, right): 51 | with numpy.errstate(divide='ignore', invalid='ignore'): 52 | x = numpy.divide(left, right) 53 | if isinstance(x, numpy.ndarray): 54 | x[numpy.isinf(x)] = 0.0 55 | x[numpy.isnan(x)] = 0.0 56 | elif numpy.isinf(x) or numpy.isnan(x): 57 | x = 0.0 58 | return x 59 | 60 | 61 | def numpy_protected_div_one(left, right): 62 | with numpy.errstate(divide='ignore', invalid='ignore'): 63 | x = numpy.divide(left, right) 64 | if isinstance(x, numpy.ndarray): 65 | x[numpy.isinf(x)] = 1.0 66 | x[numpy.isnan(x)] = 1.0 67 | elif numpy.isinf(x) or numpy.isnan(x): 68 | x = 1.0 69 | return x 70 | 71 | 72 | def numpy_protected_sqrt(x): 73 | with numpy.errstate(invalid='ignore'): 74 | x = numpy.sqrt(x) 75 | if isinstance(x, numpy.ndarray): 76 | x[numpy.isnan(x)] = 0 77 | elif numpy.isnan(x): 78 | x = 0 79 | return x 80 | 81 | 82 | def protected_log_one(x): 83 | if x > 0: 84 | return math.log(x) 85 | else: 86 | return 1 87 | 88 | 89 | def protected_log_abs(x): 90 | if x != 0: 91 | return math.log(abs(x)) 92 | else: 93 | return 0 94 | 95 | 96 | def cube(x): 97 | return numpy.power(x, 3.0) 98 | 99 | 100 | def numpy_protected_log_abs(x): 101 | with numpy.errstate(divide='ignore', invalid='ignore'): 102 | abs_val = numpy.abs(x) 103 | x = numpy.log(abs_val.astype(float)) 104 | if isinstance(x, numpy.ndarray): 105 | x[numpy.isinf(x)] = -1e300 106 | x[numpy.isnan(x)] = 0 107 | elif numpy.isinf(x): 108 | x = -1e300 109 | elif numpy.isnan(x): 110 | x = 0 111 | return x 112 | 113 | 114 | def numpy_protected_log_one(x): 115 | with numpy.errstate(divide='ignore', invalid='ignore'): 116 | x = numpy.log(numpy.abs(x)) 117 | if isinstance(x, numpy.ndarray): 118 | x[numpy.isinf(x)] = 1.0 119 | x[numpy.isnan(x)] = 1.0 120 | elif numpy.isinf(x): 121 | x = 1.0 122 | elif numpy.isnan(x): 123 | x = 1.0 124 | return x 125 | 126 | 127 | def get_terminal_order(node, context=None): 128 | if isinstance(node, gp.Ephemeral) or isinstance(node.value, float) \ 129 | or isinstance(node.value, int) or context is not None and node.value in context: 130 | return 0 131 | return 1 132 | 133 | 134 | def calculate_order(ind, context=None): 135 | order_stack = [] 136 | for node in reversed(ind): 137 | if isinstance(node, gp.Terminal): 138 | terminal_order = get_terminal_order(node, context) 139 | order_stack.append(terminal_order) 140 | elif node.arity == 1: 141 | arg_order = order_stack.pop() 142 | if node.name == numpy_protected_log_abs.__name__: 143 | order_stack.append(3 * arg_order) 144 | elif node.name == numpy.exp.__name__: 145 | order_stack.append(4 * arg_order) 146 | else: # cube or square 147 | order_stack.append(1.5 * arg_order) 148 | else: # node.arity == 2: 149 | args_order = [order_stack.pop() for _ in range(node.arity)] 150 | if node.name == numpy.add.__name__ or node.name == numpy.subtract.__name__: 151 | order_stack.append(max(args_order)) 152 | else: 153 | order_stack.append(sum(args_order)) 154 | return order_stack.pop() 155 | 156 | 157 | def get_numpy_infix_symbol_map(): 158 | symbol_map = {numpy.add.__name__: "({0} + {1})", 159 | numpy.subtract.__name__: "({0} - {1})", 160 | numpy.multiply.__name__: "({0} * {1})", 161 | numpy_protected_div_dividend.__name__: "({0} / {1})", 162 | numpy_protected_log_abs.__name__: "log({0})", 163 | numpy.abs.__name__: "abs({0})", 164 | numpy.sin.__name__: "sin({0})", 165 | numpy.cos.__name__: "cos({0})", 166 | numpy.exp.__name__: "exp({0})", 167 | numpy.square.__name__: "(({0}) ^ 2)", 168 | cube.__name__: "(({0}) ^ 3)", 169 | numpy.sqrt.__name__: "sqrt({0})", 170 | numpy.reciprocal.__name__: "(1 / {0})", 171 | aq.__name__: "({0} // {1})", 172 | numpy.power.__name__: "(({0}) ^ {1})"} 173 | return symbol_map 174 | 175 | 176 | def get_numpy_prefix_symbol_map(): 177 | symbol_map = [("+", numpy.add.__name__,), 178 | ("-", numpy.subtract.__name__), 179 | ("**", numpy.power.__name__), 180 | ("^", numpy.power.__name__), 181 | ("*", numpy.multiply.__name__), 182 | ("/", numpy_protected_div_dividend.__name__), 183 | ('abs', numpy.abs.__name__), 184 | ("log", numpy_protected_log_abs.__name__), 185 | ("sin", numpy.sin.__name__), 186 | ("cos", numpy.cos.__name__), 187 | ("exp", numpy.exp.__name__)] 188 | return symbol_map 189 | 190 | 191 | def get_numpy_commutative_set(): 192 | return {numpy.add.__name__, numpy.multiply.__name__} 193 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy 2 | deap 3 | pytest 4 | cachetools 5 | numpy 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='fastgp', 5 | version='0.1.0', 6 | description='Fast genetic programming.', 7 | author='Chris Fusting', 8 | author_email='cfusting@gmail.com', 9 | license='GNU GPLv3', 10 | classifiers=[ 11 | 'Development Status :: 3 - Alpha', 12 | 'Intended Audience :: Science/Research', 13 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 14 | 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', 15 | 'Programming Language :: Python :: 3' 16 | ], 17 | keywords='evolution machine learning artificial intelligence', 18 | install_requires=[ 19 | 'scipy', 20 | 'deap', 21 | 'cachetools', 22 | 'numpy', 23 | ], 24 | python_requires='>=2.7', 25 | packages=find_packages(exclude=['contrib', 'docs', 'tests']), 26 | url='https://github.com/cfusting/fastgp' 27 | ) 28 | -------------------------------------------------------------------------------- /tests/fastgp/parametrized/test_simple_parametrzied_terminal.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import fastgp.parametrized.simple_parametrized_terminals as sp 4 | 5 | 6 | class TestRangeOperationTerminal: 7 | 8 | def test_initialize(self): 9 | term = sp.RangeOperationTerminal() 10 | variable_type_indices = [(1, 2)] 11 | names = ['xtdlta' + str(x) for x in range(2)] 12 | with pytest.raises(ValueError): 13 | term.initialize_parameters(variable_type_indices, names) 14 | variable_type_indices = [(2, 0)] 15 | names = ['xtdlta' + str(x) for x in range(2)] 16 | with pytest.raises(ValueError): 17 | term.initialize_parameters(variable_type_indices, names) 18 | variable_type_indices = [(1, 3)] 19 | names = ['xtdlta' + str(x) for x in range(2)] 20 | term.initialize_parameters(variable_type_indices, names) 21 | variable_type_indices = [(1, 3), (7, 30)] 22 | names = ['xtdlta' + str(x) for x in range(2)] + ['ytdlta' + str(x) for x in range(7, 30)] 23 | term.initialize_parameters(variable_type_indices, names) 24 | 25 | def test_initialize_manually(self): 26 | term = sp.RangeOperationTerminal() 27 | variable_type_indices = [(0, 2)] 28 | names = ['xtdlta' + str(x) for x in range(2)] 29 | with pytest.raises(ValueError): 30 | term.initialize_parameters(variable_type_indices, names, operation='cat', begin_range_name='xtdlta0', 31 | end_range_name='xtdlta1') 32 | with pytest.raises(ValueError): 33 | term.initialize_parameters(variable_type_indices, names, operation='cat', begin_range_name='xtlta0', 34 | end_range_name='xtdlta1') 35 | with pytest.raises(ValueError): 36 | term.initialize_parameters(variable_type_indices, names, operation='cat', begin_range_name='xtdlta0', 37 | end_range_name='xtdlt1') 38 | variable_type_indices = [(0, 2)] 39 | names = ['xtdlta' + str(x) for x in range(2)] 40 | term.initialize_parameters(variable_type_indices, names, operation='max', begin_range_name='xtdlta0', 41 | end_range_name='xtdlta1') 42 | variable_type_indices = [(0, 7)] 43 | names = ['xtdlta' + str(x) for x in range(7)] 44 | term.initialize_parameters(variable_type_indices, names, operation='max', begin_range_name='xtdlta3', 45 | end_range_name='xtdlta4') 46 | --------------------------------------------------------------------------------