├── LICENSE
├── README.md
├── fastgp
├── __init__.py
├── algorithms
│ ├── __init__.py
│ ├── afpo.py
│ ├── evolutionary_feature_synthesis.py
│ ├── fast_evaluate.py
│ └── truncation_with_elite.py
├── logging
│ ├── __init__.py
│ ├── archive.py
│ └── reports.py
├── parametrized
│ ├── __init__.py
│ ├── mutation.py
│ └── simple_parametrized_terminals.py
└── utilities
│ ├── __init__.py
│ ├── benchmark_problems.py
│ ├── metrics.py
│ ├── operators.py
│ ├── subset_selection.py
│ └── symbreg.py
├── requirements.txt
├── setup.py
└── tests
└── fastgp
└── parametrized
└── test_simple_parametrzied_terminal.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) [year] [fullname]
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fast Genetic Programming
2 | fastgp is a numpy implementation of [genetic programming](https://en.wikipedia.org/wiki/Genetic_programming) built on top of [deap](https://github.com/DEAP/deap). It is the core library for [fastsr](https://github.com/cfusting/fast-symbolic-regression), a symbolic regression package for Python.
3 | Its primary contribution is an implementation of AFPO\[1\] which is compatible with any deap toolbox.
4 |
5 | fastgp was designed and developed by the [Morphology, Evolution & Cognition Laboratory](http://www.meclab.org/) at the University of Vermont. It extends research code which can be found [here](https://github.com/mszubert/gecco_2016).
6 |
7 | Installing
8 | ----------
9 | fastgp is compatible with Python 2.7+.
10 | ```bash
11 | pip install fastgp
12 | ```
13 |
14 | Example Usage
15 | -------------
16 | fastgp is a core library and as such there are no examples in this repository.
17 | Check out [fastsr](https://github.com/cfusting/fast-symbolic-regression) for an example of fastgp's use in Symbolic Regression.
18 |
19 | Literature Cited
20 | ----------------
21 | 1. Michael Schmidt and Hod Lipson. 2011. Age-fitness pareto optimization. In Genetic Programming Theory and Practice VIII. Springer, 129–146.
22 |
--------------------------------------------------------------------------------
/fastgp/__init__.py:
--------------------------------------------------------------------------------
# Package author metadata.
__author__ = 'mszubert'
2 |
--------------------------------------------------------------------------------
/fastgp/algorithms/__init__.py:
--------------------------------------------------------------------------------
# Package author metadata.
__author__ = 'mszubert'
2 |
--------------------------------------------------------------------------------
/fastgp/algorithms/afpo.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import random
3 | import time
4 |
5 | from deap import tools
6 |
7 | from fastgp.utilities import symbreg
8 |
9 |
def breed(parents, toolbox, xover_prob, mut_prob):
    """Produce offspring from clones of the given parents.

    :param parents: individuals to clone; the originals are not modified here
    :param toolbox: object providing ``clone``, ``mate`` and ``mutate``
    :param xover_prob: probability of mating each consecutive pair of clones
    :param mut_prob: probability of mutating each clone
    :return: list of offspring, same length as ``parents``
    """
    children = list(map(toolbox.clone, parents))

    # Crossover works on consecutive pairs (0, 1), (2, 3), ...; with an odd
    # number of clones the last one is never mated.
    for right in range(1, len(children), 2):
        left = right - 1
        if random.random() >= xover_prob:
            continue
        first, second = toolbox.mate(children[left], children[right])
        children[left], children[right] = first, second
        # Both results of a crossover take the age of the older participant.
        shared_age = max(first.age, second.age)
        second.age = first.age = shared_age
        # The cached fitness no longer describes the recombined individuals.
        del first.fitness.values, second.fitness.values

    for position, child in enumerate(children):
        if random.random() < mut_prob:
            mutant, = toolbox.mutate(child)
            children[position] = mutant
            del mutant.fitness.values

    return children
26 |
27 |
def find_pareto_front(population):
    """Find the nondominated individuals of a list.

    Individuals are compared through ``fitness.dominates`` and fitness
    equality. When two individuals have equal fitness, the one that appears
    first is treated as dominated so that duplicates do not inflate the front.

    :param population: a list of individuals
    :return: a set of indices corresponding to nondominated individuals
    """
    size = len(population)
    survivors = set(range(size))

    for first in range(size):
        # An individual already known to be dominated needs no further checks.
        if first not in survivors:
            continue

        candidate = population[first].fitness
        for second in range(first + 1, size):
            rival = population[second].fitness

            if rival.dominates(candidate) or candidate == rival:
                survivors.discard(first)

            if candidate.dominates(rival):
                survivors.discard(second)

    return survivors
54 |
55 |
def reduce_population(population, tournament_size, target_popsize, nondominated_size, max_iterations=10e6):
    """Shrink ``population`` in place with Pareto tournaments.

    Random tournaments are drawn from the remaining individuals and every
    member that is dominated within its tournament is removed. This repeats
    until the population is no larger than ``target_popsize`` or
    ``nondominated_size``, whichever is greater.

    :param population: list of individuals; replaced in place by the survivors
    :param tournament_size: number of individuals drawn per tournament;
        ``random.sample`` raises ``ValueError`` if it exceeds the number of
        remaining individuals
    :param target_popsize: desired population size
    :param nondominated_size: size of the global Pareto front (0 if unknown)
    :param max_iterations: number of tournaments after which the reduction
        gives up
    :return: True if the reduction gave up because ``max_iterations`` was
        exceeded, False otherwise
    """
    num_iterations = 0
    new_population_indices = list(range(len(population)))
    stop_cond = False
    size_floor = max(target_popsize, nondominated_size)
    while len(new_population_indices) > size_floor:
        if num_iterations > max_iterations:
            print("Pareto front size may be exceeding the size of population. Stopping the execution. Try making "
                  "the population size larger or the number of generations smaller.")
            stop_cond = True
            # Without leaving the loop here the guard would only print and the
            # reduction could spin forever.
            break
        num_iterations += 1
        tournament_indices = random.sample(new_population_indices, tournament_size)
        tournament = [population[index] for index in tournament_indices]
        nondominated_tournament = find_pareto_front(tournament)
        for i, population_index in enumerate(tournament_indices):
            if i not in nondominated_tournament:
                new_population_indices.remove(population_index)
    population[:] = [population[i] for i in new_population_indices]
    return stop_cond
75 |
76 |
def _record_errors_in_history(population, history):
    """Copy each individual's cached error onto its genealogy entry; no-op when there is no history."""
    if history is None:
        return
    for ind in population:
        history.genealogy_history[ind.history_index].error = ind.error


def pareto_optimization(population, toolbox, xover_prob, mut_prob, ngen,
                        tournament_size, num_randoms=1, archive=None,
                        stats=None, calc_pareto_front=True, verbose=False,
                        reevaluate_population=False, history=None,
                        stop_time=None):
    """Evolve ``population`` with Pareto tournament selection.

    Each generation breeds offspring from selected parents, adds freshly
    generated random individuals, merges them into the population and then
    reduces the population back to its initial size with ``reduce_population``.

    :param population: list of individuals; modified in place
    :param toolbox: object providing ``evaluate_error``, ``assign_fitness``,
        ``select`` and ``generate_randoms`` (plus what ``breed`` requires)
    :param xover_prob: crossover probability passed to ``breed``
    :param mut_prob: mutation probability passed to ``breed``
    :param ngen: the loop runs while ``gen < ngen + 1``, starting from 0
    :param tournament_size: tournament size used when reducing the population
    :param num_randoms: number of parent slots left free for random individuals
    :param archive: optional object whose ``update(population)`` is called
        after every generation
    :param stats: optional statistics object providing ``fields`` and ``compile``
    :param calc_pareto_front: compute the global Pareto front every generation
        and stop early if it is larger than the target population size
    :param verbose: print the logbook stream after every generation
    :param reevaluate_population: re-evaluate the error of the whole population
        every generation instead of relying on cached values
    :param history: optional genealogy history; when given, the cached errors
        are copied onto its entries
    :param stop_time: optional limit, in seconds since the call started, after
        which the run stops at the end of the current generation
    :return: tuple ``(population, logbook, history)``
    """
    start = time.time()
    if history is not None:
        history.update(population)
    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals', 'cpu_time'] + (stats.fields if stats else [])

    target_popsize = len(population)

    # calculating errors may be expensive, so we will cache the error value as an individual's attribute
    for ind in population:
        ind.error = toolbox.evaluate_error(ind)[0]
    toolbox.assign_fitness(population)
    _record_errors_in_history(population, history)

    record = stats.compile(population) if stats else {}
    cpu_time = time.time() - start
    logbook.record(gen=0, nevals=len(population), cpu_time=cpu_time, **record)
    if archive is not None:
        archive.update(population)
    if verbose:
        print(logbook.stream)

    # The loop also starts counting at 0, so the logbook holds a gen=0 record
    # for the initial population and another for the first bred generation.
    gen = 0
    while gen < (ngen + 1):
        # do we want to enforce re-evaluating the whole population instead of using cached error values
        if reevaluate_population:
            for ind in population:
                ind.error = toolbox.evaluate_error(ind)[0]
        parents = toolbox.select(population, len(population) - num_randoms)
        offspring = breed(parents, toolbox, xover_prob, mut_prob)
        offspring += toolbox.generate_randoms()

        # evaluate newly generated individuals which do not have cached values (or have inherited them from parents)
        for ind in offspring:
            ind.error = toolbox.evaluate_error(ind)[0]

        # extend the population by adding offspring - the size of population is now 2*target_popsize
        population.extend(offspring)
        toolbox.assign_fitness(population)
        _record_errors_in_history(population, history)

        # we may take 2 strategies of evaluating pareto-front:
        # - pessimistic: Pareto front may be larger than target_popsize and we want to detect it early because
        #                if that's the case we won't be able to reduce the size of population to target_popsize
        # - optimistic: in practice, the above case happens extremely rarely but calculating global front is expensive
        #               so let's assume that Pareto front is small enough and try to reduce the population
        if calc_pareto_front:
            pareto_front_size = len(find_pareto_front(population))
            logging.debug("Generation: %5d - Pareto Front Size: %5d", gen, pareto_front_size)
            if pareto_front_size > target_popsize:
                logging.info("Pareto front size exceeds the size of population. Try Making the population size larger "
                             "or reducing the number of generations.")
                break
        else:
            pareto_front_size = 0

        # perform Pareto tournament selection until the size of the population is reduced to target_popsize
        stop_cond = reduce_population(population, tournament_size, target_popsize, pareto_front_size)

        record = stats.compile(population) if stats else {}
        cpu_time = time.time() - start

        # Progress goes to the debug log rather than being printed unconditionally.
        logging.debug("Generation: %5d - CPU time: %s - Stop time: %s", gen, cpu_time, stop_time)

        logbook.record(gen=gen, nevals=len(population), cpu_time=cpu_time, **record)
        if archive is not None:
            archive.update(population)
        if verbose:
            print(logbook.stream)

        for ind in population:
            ind.age += 1

        if stop_cond:
            print('Stop condition reached at generation %i.' % gen)
            gen = ngen + 1
        elif stop_time is not None and cpu_time > stop_time:
            print('Stop time reached at generation %i.' % gen)
            gen = ngen + 1
        else:
            gen = gen + 1

    return population, logbook, history
168 |
169 |
def evaluate_age_fitness(ind, error_func):
    """Cache the individual's error and return its (error, age) objectives.

    :param ind: individual exposing an ``age`` attribute
    :param error_func: callable returning a sequence whose first item is the error
    :return: tuple ``(error, age)``
    """
    error = error_func(ind)[0]
    ind.error = error
    return error, ind.age
173 |
174 |
def evaluate_age_fitness_size(ind, error_func):
    """Cache size and error on the individual and return (error, age, size).

    :param ind: sized individual exposing an ``age`` attribute
    :param error_func: callable returning a sequence whose first item is the error
    :return: tuple ``(error, age, size)``
    """
    ind.size = len(ind)
    error, age = evaluate_age_fitness(ind, error_func)
    return error, age, ind.size
178 |
179 |
def evaluate_fitness_size(ind, error_func):
    """Cache error and size on the individual and return them as objectives.

    :param ind: sized individual
    :param error_func: callable returning a sequence whose first item is the error
    :return: tuple ``(error, size)``
    """
    error, node_count = error_func(ind)[0], len(ind)
    ind.error = error
    ind.size = node_count
    return error, node_count
184 |
185 |
def evaluate_fitness_size_complexity(ind, error_func):
    """Cache error, size and complexity on the individual and return them.

    The complexity is whatever ``symbreg.calculate_order`` reports for the
    individual.

    :param ind: sized individual
    :param error_func: callable returning a sequence whose first item is the error
    :return: tuple ``(error, size, complexity)``
    """
    error = error_func(ind)[0]
    ind.error = error
    node_count = len(ind)
    ind.size = node_count
    order = symbreg.calculate_order(ind)
    ind.complexity = order
    return error, node_count, order
191 |
192 |
def assign_random_fitness(population, random_range):
    """Set each fitness to (error, random integer below ``random_range``).

    :param population: individuals with a cached ``error`` attribute
    :param random_range: exclusive upper bound passed to ``random.randrange``
    """
    for member in population:
        error = member.error
        noise = random.randrange(random_range)
        member.fitness.values = (error, noise)
196 |
197 |
def assign_pure_fitness(population):
    """Set each fitness to the single-objective tuple (error,).

    :param population: individuals with a cached ``error`` attribute
    """
    for member in population:
        objectives = (member.error,)
        member.fitness.values = objectives
201 |
202 |
def assign_age_fitness(population):
    """Set each fitness to (error, age).

    :param population: individuals with cached ``error`` and ``age`` attributes
    """
    for member in population:
        objectives = (member.error, member.age)
        member.fitness.values = objectives
206 |
207 |
def assign_age_fitness_size(population):
    """Set each fitness to (error, age, size), where size is ``len(ind)``.

    :param population: sized individuals with cached ``error`` and ``age``
    """
    for member in population:
        objectives = (member.error, member.age, len(member))
        member.fitness.values = objectives
211 |
212 |
def assign_age_fitness_complexity(population):
    """Set each fitness to (error, age, complexity).

    The complexity is the value of ``symbreg.calculate_order`` for the individual.

    :param population: individuals with cached ``error`` and ``age``
    """
    for member in population:
        order = symbreg.calculate_order(member)
        member.fitness.values = (member.error, member.age, order)
216 |
217 |
def assign_age_fitness_size_complexity(population):
    """Set each fitness to (error, age, size, complexity).

    Size is ``len(ind)`` and complexity is the value of
    ``symbreg.calculate_order`` for the individual.

    :param population: sized individuals with cached ``error`` and ``age``
    """
    for member in population:
        node_count = len(member)
        order = symbreg.calculate_order(member)
        member.fitness.values = (member.error, member.age, node_count, order)
221 |
222 |
def assign_size_fitness(population):
    """Set each fitness to (error, size), where size is ``len(ind)``.

    :param population: sized individuals with a cached ``error`` attribute
    """
    for member in population:
        objectives = (member.error, len(member))
        member.fitness.values = objectives
226 |
227 |
--------------------------------------------------------------------------------
/fastgp/algorithms/evolutionary_feature_synthesis.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | import random
3 | from copy import deepcopy
4 | import math
5 |
6 | import numpy as np
7 |
8 | from sklearn.linear_model import ElasticNetCV
9 | from sklearn.model_selection import TimeSeriesSplit, KFold
10 | from sklearn.preprocessing import StandardScaler
11 |
12 | from scipy.stats import pearsonr
13 | from scipy.stats import skew
14 |
15 | from fastgp.utilities.metrics import mean_squared_error
16 | from fastgp.utilities.symbreg import numpy_protected_div_dividend, numpy_protected_sqrt, numpy_protected_log_one
17 |
18 |
class Statistics:
    """Per-generation record of score and feature count.

    The object is its own iterator: iterating yields one
    ``(generation, score, num_features)`` tuple per recorded entry, in the
    order they were added. ``index`` is the position of the next entry to
    yield and is not reset, so a full pass can be made only once.
    """

    def __init__(self):
        self.scores = []
        self.generations = []
        self.num_features = []
        self.index = 0

    def add(self, gen, score, num_features):
        """Append one record.

        :param gen: generation number
        :param score: model score for that generation
        :param num_features: number of features in that generation
        """
        self.generations.append(gen)
        self.scores.append(score)
        self.num_features.append(num_features)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(generation, score, num_features)`` record.

        :raises StopIteration: when every record has been returned
        """
        if self.index >= len(self.num_features):
            raise StopIteration
        # Read the current position before advancing so that the first record
        # is not skipped and the last access stays in range.
        position = self.index
        self.index += 1
        return self.generations[position], self.scores[position], self.num_features[position]

    # Python 2 spelling of the iterator protocol, kept for existing callers.
    next = __next__
40 |
41 |
class Feature:
    """A candidate model feature: its values plus two textual forms.

    ``string`` is what ``str()`` returns; ``infix_string`` is kept alongside
    it. ``size``, ``fitness`` and ``original_variable`` are plain attributes
    read and updated by the functions in this module.
    """

    def __init__(self, value, string, infix_string, size=0, fitness=1, original_variable=False):
        self.value, self.fitness = value, fitness
        self.string, self.infix_string = string, infix_string
        self.size, self.original_variable = size, original_variable

    def __str__(self):
        return self.string
54 |
55 |
class Operator:
    """Description of one composition operator.

    ``operation`` is the callable applied to feature values, ``parity`` the
    number of operands it takes, ``string`` and ``infix`` are format templates
    for the two textual forms of a composed feature, and ``infix_name`` is the
    key under which the operator is looked up.
    """

    def __init__(self, operation, parity, string, infix, infix_name):
        self.operation, self.parity = operation, parity
        self.string, self.infix = string, infix
        self.infix_name = infix_name
64 |
65 |
def square(x):
    """Return ``x`` raised to the second power, computed with ``np.power``."""
    exponent = 2
    return np.power(x, exponent)
68 |
69 |
def cube(x):
    """Return ``x`` raised to the third power, computed with ``np.power``."""
    exponent = 3
    return np.power(x, exponent)
72 |
73 |
def is_huge(x):
    """Tell whether ``x`` exceeds the largest float64 divided by 100000.

    Works element-wise on arrays because it is a plain ``>`` comparison.
    """
    limit = np.finfo(np.float64).max / 100000
    return x > limit
76 |
77 |
def numpy_safe_exp(x):
    """Exponential that substitutes 1 for problematic inputs.

    For array results, positions where ``x`` is NaN, infinite or huge (see
    ``is_huge``) are set to 1. For scalar results, 1 is returned when the
    result is infinite or when ``x`` is NaN or huge.

    :param x: scalar or array exponent
    :return: ``np.exp(x)`` with the substitutions described above
    """
    with np.errstate(invalid='ignore'):
        result = np.exp(x)
        if isinstance(result, np.ndarray):
            for mask in (np.isnan(x), np.isinf(x), is_huge(x)):
                result[mask] = 1
            return result
        if np.isinf(result) or np.isnan(x) or is_huge(x):
            return 1
        return result
92 |
93 |
def generate_operator_map(ops):
    """Index operators by their ``infix_name``.

    Later entries win when two operators share a name.

    :param ops: iterable of objects exposing ``infix_name``
    :return: dict mapping ``infix_name`` to the operator
    """
    return {op.infix_name: op for op in ops}
99 |
100 |
# Table of composition operators. Each entry is
# Operator(operation, parity, string, infix, infix_name).
# compose_features draws from this list with random.choice, so the order of the
# entries affects which operator a given seed selects.
operators = [
    Operator(np.add, 2, '({0} + {1})', 'add({0},{1})', 'add'),
    Operator(np.subtract, 2, '({0} - {1})', 'sub({0},{1})', 'sub'),
    Operator(np.multiply, 2, '({0} * {1})', 'mul({0},{1})', 'mul'),
    Operator(numpy_protected_div_dividend, 2, '({0} / {1})', 'div({0},{1})', 'div'),
    # Operator(numpy_safe_exp, 1, 'exp({0})'),
    Operator(numpy_protected_log_one, 1, 'log({0})', 'log({0})', 'log'),
    Operator(square, 1, 'sqr({0})', 'sqr({0})', 'sqr'),
    Operator(numpy_protected_sqrt, 1, 'sqt({0})', 'sqt({0})', 'sqt'),
    Operator(cube, 1, 'cbe({0})', 'cbe({0})', 'cbe'),
    Operator(np.cbrt, 1, 'cbt({0})', 'cbt({0})', 'cbt'),
    # Entries without an operation: compose_features matches them by infix_name
    # to copy ('transition') or mutate ('mutate') RangeOperation features.
    Operator(None, None, None, None, 'mutate'),
    Operator(None, None, None, None, 'transition')

]
# Lookup from infix_name to Operator, used when evaluating infix strings.
operators_map = generate_operator_map(operators)
117 |
118 |
def init(num_additions, feature_names, predictors, seed):
    """Seed the random generators and fill in defaults.

    :param num_additions: number of features to add per generation, or None to
        use one third of the number of predictor columns, rounded up
    :param feature_names: names of the predictor columns, or None to generate
        ``x0``, ``x1``, ... with one name per column
    :param predictors: two-dimensional array with one column per variable
    :param seed: seed for both ``random`` and ``np.random``
    :return: tuple ``(num_additions, feature_names)``
    """
    random.seed(seed)
    np.random.seed(seed)
    num_columns = predictors.shape[1]
    if num_additions is None:
        num_additions = int(math.ceil(num_columns / 3))
    if feature_names is None:
        # One name per column: names are later used to index predictors[:, i].
        feature_names = ['x' + str(i) for i in range(num_columns)]
    return num_additions, feature_names
127 |
128 |
def init_features(feature_names, predictors, preserve_originals, range_operations, variable_type_indices):
    """Build the initial feature population.

    One ``Feature`` is created per name from the matching predictor column,
    followed by ``range_operations`` freshly constructed ``RangeOperation``
    features.

    :param feature_names: names of the predictor columns
    :param predictors: two-dimensional array with one column per name
    :param preserve_originals: stored as ``original_variable`` on each column feature
    :param range_operations: number of ``RangeOperation`` features to append
    :param variable_type_indices: passed through to ``RangeOperation``
    :return: list of features
    """
    features = [
        Feature(predictors[:, column], name, name, original_variable=preserve_originals)
        for column, name in enumerate(feature_names)
    ]
    features.extend(
        RangeOperation(variable_type_indices, feature_names, predictors)
        for _ in range(range_operations)
    )
    return features
136 |
137 |
def get_basis(features):
    """Stack feature values into a standardized design matrix.

    NaN and infinite entries are replaced with ``np.nan_to_num`` before a new
    ``StandardScaler`` is fitted to the matrix.

    :param features: non-empty sequence of features whose ``value`` arrays
        share the same length
    :return: tuple ``(scaled basis, fitted scaler)``
    """
    num_rows = features[0].value.shape[0]
    basis = np.zeros((num_rows, len(features)))
    for column, feature in enumerate(features):
        basis[:, column] = feature.value
    scaler = StandardScaler()
    return scaler.fit_transform(np.nan_to_num(basis)), scaler
146 |
147 |
def get_model(basis, response, time_series_cv, splits):
    """Fit a cross-validated lasso model and compute its coefficient path.

    :param basis: design matrix
    :param response: target values
    :param time_series_cv: use ``TimeSeriesSplit`` when true, ``KFold`` otherwise
    :param splits: number of cross-validation splits
    :return: tuple ``(fitted model, path coefficients, model.mse_path_)``
    """
    if time_series_cv:
        cv = TimeSeriesSplit(n_splits=splits)
    else:
        cv = KFold(n_splits=splits)
    model = ElasticNetCV(l1_ratio=1, selection='random', cv=cv)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        model.fit(basis, response)
        # The keyword must be spelled l1_ratio, otherwise the fitted ratio is
        # not applied to the path computation.
        _, coefs, _ = model.path(basis, response, l1_ratio=model.l1_ratio_, alphas=model.alphas_)
    return model, coefs, model.mse_path_
159 |
160 |
def get_selected_features(num_additions, features, tournament_probability):
    """Pick ``num_additions`` features by repeated tournament selection.

    The same feature may be picked more than once.

    :param num_additions: number of tournaments to run
    :param features: population to select from
    :param tournament_probability: passed to ``tournament_selection``
    :return: list of selected features
    """
    return [
        tournament_selection(features, tournament_probability)
        for _ in range(num_additions)
    ]
167 |
168 |
def get_coefficient_fitness(coefs, mse_path, threshold, response_variance):
    """Score features by how well the models that use them perform.

    For each point on the regularization path an R^2 value is derived from the
    mean cross-validation MSE. A feature's fitness is the sum of the R^2 values
    of the path points at which the magnitude of its coefficient exceeds
    ``threshold``.

    :param coefs: array whose rows are features and whose columns are path
        points; ``coefs.shape[1]`` must equal ``mse_path.shape[0]``
    :param mse_path: array of MSE values, averaged over axis 1
    :param threshold: minimum coefficient magnitude for a feature to count as used
    :param response_variance: variance of the response, used to turn MSE into R^2
    :return: one fitness value per feature
    """
    mse = np.mean(mse_path, axis=1)
    r_squared = 1 - (mse / response_variance)
    # Use the magnitude: a feature with a negative coefficient is in use too.
    binary_coefs = np.abs(coefs) > threshold
    return binary_coefs.dot(r_squared)
174 |
175 |
def rank_by_coefficient(features, coefs, mse_path, num_additions, threshold, response_variance,
                        verbose):
    """Assign coefficient-based fitness and keep the best derived features.

    All features whose ``original_variable`` is True are kept. Of the features
    whose ``original_variable`` is False, the ``num_additions + 1`` fittest are
    kept. The result is sorted by descending fitness and the ten fittest
    features (or fewer, if fewer remain) are printed.

    :param features: current population; each feature's ``fitness`` is updated
    :param coefs: path coefficients, see ``get_coefficient_fitness``
    :param mse_path: cross-validation MSE path, see ``get_coefficient_fitness``
    :param num_additions: controls how many derived features survive
    :param threshold: coefficient threshold, see ``get_coefficient_fitness``
    :param response_variance: variance of the response
    :param verbose: currently unused; the summary is always printed
    :return: new list of surviving features, fittest first
    """
    fitness = get_coefficient_fitness(coefs, mse_path, threshold, response_variance)
    for i, f in enumerate(features):
        f.fitness = fitness[i]
    new_features = [f for f in features if f.original_variable is True]
    possible_features = [f for f in features if f.original_variable is False]
    possible_features.sort(key=lambda x: x.fitness, reverse=True)
    new_features.extend(possible_features[0:num_additions + 1])
    new_features.sort(key=lambda x: x.fitness, reverse=True)
    print('Top performing features:')
    # Slice instead of indexing 0..9 so that small populations do not fail.
    for f in new_features[:10]:
        print(f.string + ' - ' + str(f.fitness))
    return new_features
190 |
191 |
def remove_zeroed_features(model, features, threshold, verbose):
    """Drop derived features whose model coefficient is (nearly) zero.

    Each feature's fitness is set to the absolute value of its coefficient.
    Features with fitness at or below ``threshold`` are removed from
    ``features`` in place unless they are marked as original variables.

    :param model: fitted model exposing ``coef_``, aligned with ``features``
    :param features: list of features; modified in place
    :param threshold: fitness at or below which a feature is removed
    :param verbose: also print the removed features
    :return: the same ``features`` list
    """
    discarded = []
    for position, coefficient in enumerate(model.coef_):
        feature = features[position]
        feature.fitness = math.fabs(coefficient)
        if not feature.original_variable and feature.fitness <= threshold:
            discarded.append(feature)
    for feature in discarded:
        features.remove(feature)
    print('Removed ' + str(len(discarded)) + ' features from population.')
    if discarded and verbose:
        print(get_model_string(discarded))
    return features
204 |
205 |
def update_fitness(features, response, threshold, fitness_algorithm, response_variance, num_additions,
                   time_series_cv, splits, verbose):
    """Refit the model on the current features and prune the population.

    :param features: current population
    :param response: target values
    :param threshold: coefficient threshold used by the chosen algorithm
    :param fitness_algorithm: ``'zero_out'`` or ``'coefficient_rank'``; any
        other value leaves the population untouched
    :param response_variance: variance of the response
    :param num_additions: passed to ``rank_by_coefficient``
    :param time_series_cv: passed to ``get_model``
    :param splits: passed to ``get_model``
    :param verbose: passed to the chosen algorithm
    :return: the pruned population
    """
    basis, _ = get_basis(features)
    model, coefs, mse_path = get_model(basis, response, time_series_cv, splits)
    if fitness_algorithm == 'zero_out':
        return remove_zeroed_features(model, features, threshold, verbose)
    if fitness_algorithm == 'coefficient_rank':
        return rank_by_coefficient(features, coefs, mse_path, num_additions, threshold,
                                   response_variance, verbose)
    return features
216 |
217 |
def uncorrelated(parents, new_feature, correlation_threshold):
    """Check that a new feature is not too correlated with its parents.

    The Pearson correlation between the new feature and every parent is
    computed; the feature passes only if none of them is greater than
    ``correlation_threshold``. The comparison is signed.

    :param parents: a single parent feature or a list of parent features
    :param new_feature: candidate feature
    :param correlation_threshold: largest accepted correlation
    :return: True if no parent correlates above the threshold
    """
    candidates = parents if type(parents) is list else [parents]
    # Evaluate every correlation up front, one pearsonr call per parent.
    correlations = [pearsonr(new_feature.value, parent.value)[0] for parent in candidates]
    return not any(r > correlation_threshold for r in correlations)
230 |
231 |
def tournament_selection(population, probability):
    """Run a two-way tournament and return one contender.

    Two individuals are drawn with replacement. The fitter one is returned
    with the given probability, the other one otherwise.

    :param population: sequence of individuals exposing ``fitness``
    :param probability: chance of returning the fitter contender
    :return: the selected individual
    """
    contenders = sorted(random.choices(population, k=2), key=lambda x: x.fitness, reverse=True)
    stronger, weaker = contenders
    return stronger if random.random() < probability else weaker
239 |
240 |
def compose_features(num_additions, features, tournament_probability, correlation_threshold,
                     range_operators, verbose):
    """Grow the population with features composed from existing ones.

    ``num_additions`` operators are drawn at random. Unary and binary
    operators are applied to parents chosen by tournament selection, and the
    result is kept only if it is not too correlated with its parents. When
    ``range_operators`` is truthy, the ``'transition'`` operator copies a
    random protected ``RangeOperation`` into an unprotected one and the
    ``'mutate'`` operator copies and mutates a random unprotected one.
    Candidates with ``size`` of 5 or more are discarded.

    :param num_additions: number of operators to draw
    :param features: current population; extended in place
    :param tournament_probability: passed to ``tournament_selection``
    :param correlation_threshold: passed to ``uncorrelated``
    :param range_operators: enables the range-operator branches when truthy
    :param verbose: also print the features that were added
    :return: the same ``features`` list
    """
    # The population is only extended after the loop, so the range-operator
    # partitions can be computed once instead of once per drawn operator.
    protected_range_operators = []
    transitional_range_operators = []
    if range_operators:
        for f in features:
            if isinstance(f, RangeOperation):
                if f.original_variable:
                    protected_range_operators.append(f)
                else:
                    transitional_range_operators.append(f)

    new_feature_list = []
    for _ in range(num_additions):
        operator = random.choice(operators)
        if operator.parity in (1, 2):
            parents = [tournament_selection(features, tournament_probability)
                       for _ in range(operator.parity)]
            new_feature = Feature(
                operator.operation(*[p.value for p in parents]),
                operator.string.format(*[p.string for p in parents]),
                operator.infix.format(*[p.infix_string for p in parents]),
                size=sum(p.size for p in parents) + 1)
            if uncorrelated(parents, new_feature, correlation_threshold):
                new_feature_list.append(new_feature)
        elif operator.infix_name == 'transition' and protected_range_operators:
            new_feature = deepcopy(random.choice(protected_range_operators))
            new_feature.original_variable = False
            new_feature_list.append(new_feature)
        elif operator.infix_name == 'mutate' and transitional_range_operators:
            new_feature = deepcopy(random.choice(transitional_range_operators))
            new_feature.mutate_parameters()
            new_feature_list.append(new_feature)

    filtered_feature_list = [f for f in new_feature_list if f.size < 5]
    features.extend(filtered_feature_list)
    print('Adding ' + str(len(filtered_feature_list)) + ' features to population.')
    if verbose:
        # List the features that were actually added, matching the count above.
        print(get_model_string(filtered_feature_list))
    return features
286 |
287 |
def score_model(features, response, time_series_cv, splits):
    """Fit a model on the given features and score it on the same data.

    :param features: features forming the design matrix
    :param response: target values
    :param time_series_cv: passed to ``get_model``
    :param splits: passed to ``get_model``
    :return: tuple ``(score, fitted model, fitted scaler)`` where the score is
        the first item returned by ``mean_squared_error``
    """
    print('Scoring model with ' + str(len(features)) + ' features.')
    basis, scaler = get_basis(features)
    model = get_model(basis, response, time_series_cv, splits)[0]
    predictions = model.predict(basis)
    return mean_squared_error(predictions, response)[0], model, scaler
294 |
295 |
def get_model_string(features):
    """Render features as ``[a] + [b] + ...`` using each feature's ``string``.

    :param features: iterable of features
    :return: the rendered model; ``'[]'`` for no features
    """
    joined = '] + ['.join(f.string for f in features)
    return '[' + joined + ']'
301 |
302 |
def compute_operation(num_variables, predictors, stack, feature_names):
    """Pop operands and an operator off ``stack`` and apply the operator.

    ``num_variables`` names are popped first and resolved to predictor
    columns, in pop order; the next popped item must expose ``operation``.

    :param num_variables: number of operand names to pop
    :param predictors: two-dimensional array with one column per name
    :param stack: list consumed from its end; modified in place
    :param feature_names: names of the predictor columns
    :return: result of applying the operator to the operands
    """
    operands = [
        predictors[:, feature_names.index(stack.pop())]
        for _ in range(num_variables)
    ]
    operator = stack.pop()
    return operator.operation(*operands)
312 |
313 |
def build_operation_stack(string):
    """Split an infix string such as ``add(x0,sqr(x1))`` into its tokens.

    Tokens are returned in order of appearance. Text that is not followed by
    one of ``(``, ``,`` or ``)`` is not emitted, so a bare name yields an
    empty list.

    :param string: infix representation of a feature
    :return: list of operator and operand names
    """
    tokens = []
    token_start = 0
    for position, char in enumerate(string):
        if char not in ('(', ',', ')'):
            continue
        # An opening parenthesis always emits the text before it; separators
        # and closing parentheses emit only non-empty text.
        if char == '(' or position != token_start:
            tokens.append(string[token_start:position])
        token_start = position + 1
    return tokens
333 |
334 |
def get_feature_value(stack, feature_names, predictors, variable_type_indices):
    """Evaluate a token stack produced by ``build_operation_stack``.

    Tokens are consumed from the end of ``stack``. Range operations and
    feature names push their values; operator names pop as many values as
    their parity and push the result. Tokens matching none of these are
    ignored.

    :param stack: list of tokens; emptied by the call
    :param feature_names: names of the predictor columns
    :param predictors: two-dimensional array with one column per name
    :param variable_type_indices: enables ``RangeOperation`` tokens when truthy
    :return: the value left on top of the evaluation stack
    """
    operands = []
    while stack:
        token = stack.pop()
        if variable_type_indices and token.startswith('RangeOperation'):
            range_operation = RangeOperation(variable_type_indices, feature_names, predictors, string=token)
            operands.append(np.squeeze(range_operation.value))
        elif token in feature_names:
            operands.append(predictors[:, feature_names.index(token)])
        elif token in operators_map:
            operator = operators_map[token]
            arguments = [operands.pop() for _ in range(operator.parity)]
            operands.append(operator.operation(*arguments))
    return operands.pop()
353 |
354 |
def build_basis_from_features(infix_features, feature_names, predictors, variable_type_indices):
    """Evaluate infix feature strings into the columns of a design matrix.

    :param infix_features: infix strings, one per output column
    :param feature_names: names of the predictor columns
    :param predictors: two-dimensional array with one column per name
    :param variable_type_indices: enables ``RangeOperation`` strings when truthy
    :return: array with one row per predictor row and one column per feature
    """
    basis = np.zeros((predictors.shape[0], len(infix_features)))
    for column, expression in enumerate(infix_features):
        if variable_type_indices and expression.startswith('RangeOperation'):
            range_operation = RangeOperation(variable_type_indices, feature_names, predictors,
                                             string=expression)
            column_values = np.squeeze(range_operation.value)
        elif expression in feature_names:
            column_values = predictors[:, feature_names.index(expression)]
        else:
            column_values = get_feature_value(build_operation_stack(expression), feature_names,
                                              predictors, variable_type_indices)
        basis[:, column] = column_values
    return basis
368 |
369 |
def get_basis_from_infix_features(infix_features, feature_names, predictors, scaler=None,
                                  variable_type_indices=None):
    """Build a design matrix from infix strings, cleaned and optionally scaled.

    :param infix_features: infix strings, one per output column
    :param feature_names: names of the predictor columns
    :param predictors: two-dimensional array with one column per name
    :param scaler: optional fitted scaler whose ``transform`` is applied
    :param variable_type_indices: passed to ``build_basis_from_features``
    :return: the design matrix after ``np.nan_to_num`` and optional scaling
    """
    raw_basis = build_basis_from_features(infix_features, feature_names, predictors, variable_type_indices)
    basis = np.nan_to_num(raw_basis)
    return scaler.transform(basis) if scaler else basis
377 |
378 |
def optimize(predictors, response, seed, fitness_algorithm, max_gens=100, num_additions=None, preserve_originals=True,
             tournament_probability=.9, max_useless_steps=10, fitness_threshold=.01, correlation_threshold=0.95,
             reinit_range_operators=3, splits=3, time_series_cv=False, feature_names=None, range_operators=0,
             variable_type_indices=None, verbose=False):
    """Evolve a feature set and collect every model that improved the score.

    Each generation scores a model on the current features, records it if it
    beats the best score so far, and then composes new features and prunes the
    population.

    :param predictors: two-dimensional array with one column per variable
    :param response: target values
    :param seed: seed for the random generators
    :param fitness_algorithm: pruning strategy, see ``update_fitness``
    :param max_gens: maximum number of generations
    :param num_additions: features to compose per generation; defaulted by ``init``
    :param preserve_originals: mark the input variables as protected from pruning
    :param tournament_probability: passed to ``compose_features``
    :param max_useless_steps: generations without improvement tolerated before stopping
    :param fitness_threshold: coefficient threshold used when pruning
    :param correlation_threshold: passed to ``compose_features``
    :param reinit_range_operators: protected range operators are replaced every
        this many generations
    :param splits: number of cross-validation splits
    :param time_series_cv: use time-series cross-validation
    :param feature_names: names of the predictor columns; defaulted by ``init``
    :param range_operators: number of range operators in the population
    :param variable_type_indices: passed to ``RangeOperation``
    :param verbose: print the model at every generation
    :return: tuple ``(statistics, best_models, best_features, best_scalers,
        best_validation_scores)``
    :raises ValueError: if the number of names differs from the number of columns
    """
    # Resolve the defaults first: feature_names may legitimately be None here.
    num_additions, feature_names = init(num_additions, feature_names, predictors, seed)
    if predictors.shape[1] != len(feature_names):
        raise ValueError('Expected ' + str(predictors.shape[1]) + ' feature names but got ' +
                         str(len(feature_names)) + '.')
    features = init_features(feature_names, predictors, preserve_originals, range_operators,
                             variable_type_indices)
    best_models = []
    best_features = []
    best_scalers = []
    best_validation_scores = []
    statistics = Statistics()
    best_score = np.inf
    steps_without_new_model = 0
    response_variance = np.var(response)
    gen = 1
    while gen <= max_gens and steps_without_new_model <= max_useless_steps:
        print('Generation: ' + str(gen))
        score, model, scaler = score_model(features, response, time_series_cv, splits)
        statistics.add(gen, score, len(features))
        if verbose:
            print(get_model_string(features))
        print('Score: ' + str(score))
        if score < best_score:
            best_validation_scores.append(score)
            steps_without_new_model = 0
            best_score = score
            print('New best model score: ' + str(best_score))
            best_models.append(model)
            # Keep a snapshot of the features without their value arrays.
            temp_features = deepcopy(features)
            for f in temp_features:
                f.value = None
            best_features.append(temp_features)
            best_scalers.append(scaler)
        else:
            steps_without_new_model += 1
        print('-------------------------------------------------------')
        # Skip the evolution step when the loop is about to end anyway.
        if gen < max_gens and steps_without_new_model <= max_useless_steps:
            features = compose_features(num_additions, features, tournament_probability, correlation_threshold,
                                        range_operators, verbose)
            features = update_fitness(features, response, fitness_threshold, fitness_algorithm,
                                      response_variance, num_additions, time_series_cv, splits,
                                      verbose)
            if gen % reinit_range_operators == 0:
                features = swap_range_operators(features, range_operators, variable_type_indices, feature_names,
                                                predictors)
        gen += 1
    return statistics, best_models, best_features, best_scalers, best_validation_scores
428 |
429 |
def swap_range_operators(features, range_operations, variable_type_indices, feature_names, predictors):
    """Replace every protected range operator with a freshly generated one.

    All ``RangeOperation`` features marked as original variables are removed
    from ``features`` in place, then ``range_operations`` new ones are appended.

    :param features: list of features; modified in place
    :param range_operations: number of new ``RangeOperation`` features to append
    :param variable_type_indices: passed to ``RangeOperation``
    :param feature_names: passed to ``RangeOperation``
    :param predictors: passed to ``RangeOperation``
    :return: the same ``features`` list
    """
    # Rebuild the list rather than removing while iterating, which would skip
    # the element that follows each removal.
    features[:] = [f for f in features
                   if not (isinstance(f, RangeOperation) and f.original_variable)]
    features.extend(RangeOperation(variable_type_indices, feature_names, predictors)
                    for _ in range(range_operations))
    return features
437 |
438 |
def name_operation(operation, name):
    """Overwrite ``operation.__name__`` and return the same object.

    The callable is modified in place; no wrapper is created.

    :param operation: callable whose ``__name__`` can be assigned
    :param name: new value for ``__name__``
    :return: ``operation``
    """
    setattr(operation, '__name__', name)
    return operation
442 |
443 |
444 | class RangeOperation(Feature):
445 |
446 | def __init__(self, variable_type_indices, names, predictors, operation=None, begin_range_name=None,
447 | end_range_name=None, original_variable=True, string=None):
448 | Feature.__init__(self, None, 'RangeOperation', 'RangeOperation', original_variable=original_variable)
449 | self.predictors = predictors
450 | self.begin_range = None
451 | self.end_range = None
452 | self.operation = None
453 | self.names = None
454 | self.lower_bound = None
455 | self.upper_bound = None
456 | self.variable_type_indices = variable_type_indices
457 | self.operations = {
458 | 'sum': name_operation(np.sum, 'sum'),
459 | 'min': name_operation(np.min, 'min'),
460 | 'max': name_operation(np.max, 'max'),
461 | 'mean': name_operation(np.mean, 'mean'),
462 | 'vari': name_operation(np.var, 'vari'),
463 | 'skew': name_operation(skew, 'skew')
464 | }
465 | if string:
466 | parts = string.split('_')
467 | self.initialize_parameters(variable_type_indices, names, parts[1], parts[2], parts[3])
468 | else:
469 | self.initialize_parameters(variable_type_indices, names, operation, begin_range_name, end_range_name)
470 | self.value = self.create_input_vector()
471 | self.string = self.format()
472 | self.infix_string = self.format()
473 |
474 | def __deepcopy__(self, memo):
475 | new = self.__class__(self.variable_type_indices, self.names, self.predictors)
476 | new.__dict__.update(deepcopy(self.__dict__, memo))
477 | new.predictors = self.predictors
478 | new.value = self.value
479 | return new
480 |
def initialize_parameters(self, variable_type_indices, names, operation=None, begin_range_name=None,
                          end_range_name=None):
    """
    Set the operation and the column range of this terminal, either explicitly or at random.

    When ``operation``, ``begin_range_name`` and ``end_range_name`` are all given they are validated
    and used; ``end_range_name`` names the last column included in the range. Otherwise an
    operation and a range inside a randomly chosen variable type are drawn.
    :param variable_type_indices: A sequence of variable type indices where each entry defines the
    index of a variable type in the design matrix. For example a design matrix with two variable types will have
    indices [j,n] where variable type A spans 0 to j and variable type B spans j + 1 to n.
    :param names: Sequence of column names, indexed like the columns of the design matrix.
    :param operation: Key of ``self.operations`` selecting the operation.
    :param begin_range_name: Name of the first column of the range.
    :param end_range_name: Name of the last column of the range (inclusive).
    :return: None
    :raises ValueError: If a variable type spans fewer than two columns, or the operation, a range
    name, or the resulting range is invalid.
    """
    self.names = names
    for r in variable_type_indices:
        if r[1] - r[0] < 2:
            raise ValueError('Invalid variable type indices: ' + str(r))
    rng = random.choice(variable_type_indices)
    self.lower_bound = rng[0]
    self.upper_bound = rng[1]
    if operation is not None and begin_range_name is not None and end_range_name is not None:
        if self.operations.get(operation) is None:
            raise ValueError('Invalid operation provided to Range Terminal: ' + operation)
        if begin_range_name not in self.names:
            raise ValueError('Invalid range name provided to Range Termnial: ' + str(begin_range_name))
        if end_range_name not in self.names:
            raise ValueError('Invalid range name provided to Range Terminal: ' + str(end_range_name))
        begin_range = self.names.index(begin_range_name)
        # end_range is exclusive, the supplied name is the last column included.
        end_range = self.names.index(end_range_name) + 1
        containing = None
        for r in variable_type_indices:
            if r[0] <= begin_range < end_range <= r[1]:
                containing = r
                break
        if containing is None:
            raise ValueError('Invalid range provided to Range Terminal: (' + str(begin_range) + ',' +
                             str(end_range) + ')')
        # The bounds must describe the variable type that actually holds the range; a randomly
        # chosen variable type would let later mutations clamp the range into the wrong columns.
        self.lower_bound = containing[0]
        self.upper_bound = containing[1]
        self.operation = self.operations[operation]
        self.begin_range = begin_range
        self.end_range = end_range
    else:
        self.operation = random.choice(list(self.operations.values()))
        self.begin_range = np.random.randint(self.lower_bound, self.upper_bound - 1)
        self.end_range = np.random.randint(self.begin_range + 1, self.upper_bound)
523 |
def mutate_parameters(self):
    """
    Shift one end of the range by a Gaussian amount and refresh the cached value and strings.

    Either the lower or the upper end is picked at random. The shift is drawn from a zero-mean
    Gaussian whose standard deviation is the square root of the current span, rounded away from
    zero. The moved end is clamped so that the range stays inside
    ``[lower_bound, upper_bound]`` and, where the bounds allow it, spans at least two columns.
    :return: None
    """
    mutation = random.choice(['low', 'high'])
    span = self.end_range - self.begin_range
    if span == 0:
        span = 1
    value = random.gauss(0, math.sqrt(span))
    amount = int(math.ceil(abs(value)))
    if value < 0:
        amount *= -1
    if mutation == 'low':
        location = amount + self.begin_range
        # Highest admissible start: two columns before the end, but never below the lower bound
        # (the unclamped "end_range - 2" could fall below lower_bound for a one-column range).
        highest = max(self.lower_bound, self.end_range - 2)
        self.begin_range = min(max(location, self.lower_bound), highest)
    elif mutation == 'high':
        location = amount + self.end_range
        # Lowest admissible end: two columns after the start, but never above the upper bound.
        lowest = min(self.upper_bound, self.begin_range + 2)
        self.end_range = max(min(location, self.upper_bound), lowest)
    self.value = self.create_input_vector()
    self.infix_string = self.format()
    self.string = self.format()
558 |
def create_input_vector(self):
    """
    Apply the selected operation across the columns ``begin_range:end_range`` of the predictors.

    :return: The result of ``self.operation(..., axis=1)`` on the column slice, or a column of
    zeros with one row per predictor row when the slice contains no columns.
    """
    window = self.predictors[:, self.begin_range:self.end_range]
    if window.shape[1] != 0:
        return self.operation(window, axis=1)
    # Empty slice: fall back to zeros.
    return np.zeros((window.shape[0], 1))
565 |
def format(self):
    """
    Build the string form of this terminal.

    :return: ``RangeOperation_<operation name>_<first column name>_<last column name>``, where the
    last column is the one at index ``end_range - 1``.
    """
    operation_name = self.operation.__name__
    first_column = self.names[self.begin_range]
    last_column = self.names[self.end_range - 1]
    return "RangeOperation_{}_{}_{}".format(operation_name, first_column, last_column)
569 |
570 |
--------------------------------------------------------------------------------
/fastgp/algorithms/fast_evaluate.py:
--------------------------------------------------------------------------------
1 | import cachetools
2 | import numpy
3 |
4 |
def fast_numpy_evaluate(ind, context, predictors, get_node_semantics, error_function=None, expression_dict=None):
    """
    Evaluate an individual bottom-up, caching the semantics of every sub-expression.

    The nodes of ``ind`` are visited in reverse order. For each node its formatted expression
    (built from the expressions of its children) is used as the cache key; on a miss the
    semantics are obtained from ``get_node_semantics`` and stored.
    :param ind: Sequence of nodes; each node exposes ``arity`` and ``format(*child_expressions)``.
    :param context: Passed through to ``get_node_semantics``.
    :param predictors: Passed through to ``get_node_semantics``.
    :param get_node_semantics: Callable ``(node, subtree_semantics, predictors, context)``.
    :param error_function: Optional callable applied to the root semantics.
    :param expression_dict: Optional mapping used as cache; an LRU cache of size 100 is created
    when omitted.
    :return: The root semantics, or ``error_function`` applied to them.
    """
    cache = cachetools.LRUCache(maxsize=100) if expression_dict is None else expression_dict
    pending_expressions = []
    pending_semantics = []

    for node in reversed(ind):
        child_expressions = [pending_expressions.pop() for _ in range(node.arity)]
        key = node.format(*child_expressions)
        child_semantics = [pending_semantics.pop() for _ in range(node.arity)]

        if key not in cache:
            vector = get_node_semantics(node, child_semantics, predictors, context)
            cache[key] = vector
        else:
            vector = cache[key]

        pending_expressions.append(key)
        pending_semantics.append(vector)

    root = pending_semantics.pop()
    return root if error_function is None else error_function(root)
29 |
30 |
def fast_numpy_evaluate_population(pop, context, predictors, error_func, expression_dict=None, arg_prefix="ARG",
                                   get_node_semantics=None):
    """
    Evaluate every individual of a population and assign the resulting errors as fitness.

    :param pop: Sequence of individuals; each exposes ``fitness.values``.
    :param context: Passed through to ``fast_numpy_evaluate``.
    :param predictors: Design matrix; ``len(predictors)`` gives the width of a semantics row.
    :param error_func: Callable mapping the (individuals x rows) semantics matrix to one error per
    individual.
    :param expression_dict: Optional shared cache; an LRU cache of size 2000 is created when omitted.
    :param arg_prefix: Unused; kept so existing callers keep working.
    :param get_node_semantics: Callable ``(node, subtree_semantics, predictors, context)`` forwarded
    to ``fast_numpy_evaluate``. Required whenever ``pop`` is not empty.
    :return: None
    :raises ValueError: If individuals have to be evaluated but ``get_node_semantics`` is missing.
    """
    if get_node_semantics is None and len(pop) > 0:
        # Previously the cache was passed in the get_node_semantics slot, which could never work.
        raise ValueError("get_node_semantics is required to evaluate a population.")
    if expression_dict is None:
        expression_dict = cachetools.LRUCache(maxsize=2000)

    results = numpy.empty(shape=(len(pop), len(predictors)))
    for row, ind in enumerate(pop):
        # Keyword arguments keep the cache and the semantics callable in their proper slots.
        results[row] = fast_numpy_evaluate(ind, context, predictors, get_node_semantics,
                                           expression_dict=expression_dict)

    errors = error_func(results)
    for ind, error in zip(pop, errors):
        ind.fitness.values = error,
42 |
--------------------------------------------------------------------------------
/fastgp/algorithms/truncation_with_elite.py:
--------------------------------------------------------------------------------
1 | import time
2 | import math
3 | import random
4 |
5 | from deap import tools
6 |
7 |
def generate_next_population(individuals, toolbox):
    """
    Perform truncated selection with elitism.

    The population is cloned and ranked by ascending ``error``. The best ``floor(n / 2) + 1``
    clones act as parents; ``n - 1`` offspring are produced by cloning a random parent and
    mutating it, and the best clone is appended unchanged as the last element.
    :param individuals: Current population; every individual exposes ``error``.
    :param toolbox: Provides ``clone`` and ``mutate`` (``mutate`` returns a sequence whose first
    element is the mutated individual).
    :return: List with the next population, the same size as ``individuals``.
    """
    ranked = sorted((toolbox.clone(ind) for ind in individuals), key=lambda candidate: candidate.error)
    pop_size = len(ranked)
    parents = ranked[:math.floor(pop_size / 2) + 1]

    next_population = [
        toolbox.mutate(toolbox.clone(random.choice(parents)))[0]
        for _ in range(pop_size - 1)
    ]
    # Elitism: the best individual survives untouched.
    next_population.append(ranked[0])
    return next_population
28 |
29 |
def render_fitness(population, toolbox, history):
    """
    Evaluate every individual and store its error as ``error`` and as single-objective fitness.

    :param population: Individuals to evaluate.
    :param toolbox: Provides ``evaluate_error``; the first element of its result is the error.
    :param history: Optional history object; when given, the error is also written to the entry
    ``genealogy_history[ind.history_index]``.
    :return: None
    """
    track_history = history is not None
    for ind in population:
        ind.error = toolbox.evaluate_error(ind)[0]
        ind.fitness.values = (ind.error,)
        if not track_history:
            continue
        history.genealogy_history[ind.history_index].error = ind.error
36 |
37 |
def record_information(population, stats, start, archive, logbook, verbose, gen=0):
    """
    Append one generation's statistics to the logbook and update the archive.

    :param population: Population to summarise.
    :param stats: Optional statistics object with ``compile``; skipped when falsy.
    :param start: Start time (as returned by ``time.time()``) used to compute ``cpu_time``.
    :param archive: Optional archive whose ``update`` is called with the population.
    :param logbook: Logbook receiving the record.
    :param verbose: When true, ``logbook.stream`` is printed.
    :param gen: Generation number stored in the record; defaults to 0 for existing callers.
    :return: None
    """
    record = stats.compile(population) if stats else {}
    logbook.record(gen=gen, nevals=len(population), cpu_time=time.time() - start, **record)
    if archive is not None:
        archive.update(population)
    if verbose:
        print(logbook.stream)


def optimize(population, toolbox, ngen, archive=None, stats=None, verbose=False, history=None):
    """
    Optimize a population of individuals.
    :param population: Initial population.
    :param toolbox: Toolbox used for evaluation, cloning and mutation.
    :param ngen: Number of generations to run after the initial evaluation.
    :param archive: Optional archive updated once per recorded generation.
    :param stats: Optional statistics object compiled for every generation.
    :param verbose: When true, the logbook stream is printed after every generation.
    :param history: Optional history object updated with the initial population.
    :return: Tuple ``(population, logbook, history)``.
    """
    start = time.time()
    if history is not None:
        history.update(population)
    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals', 'cpu_time'] + (stats.fields if stats else [])
    render_fitness(population, toolbox, history)
    record_information(population, stats, start, archive, logbook, verbose, gen=0)
    for gen in range(1, ngen + 1):
        offspring = generate_next_population(population, toolbox)
        render_fitness(offspring, toolbox, history)
        population = offspring
        # Record the real generation number; it used to be logged as 0 for every generation.
        record_information(population, stats, start, archive, logbook, verbose, gen=gen)
    return population, logbook, history
73 |
74 |
--------------------------------------------------------------------------------
/fastgp/logging/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cfusting/fastgp/1951d0d992119ec86dc60b6fde636f903c638428/fastgp/logging/__init__.py
--------------------------------------------------------------------------------
/fastgp/logging/archive.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import operator
3 | from collections import defaultdict
4 | from copy import deepcopy
5 |
6 | import numpy
7 |
8 | from fastgp.algorithms import afpo
9 | from fastgp.utilities import symbreg
10 |
11 |
class FitnessDistributionArchive(object):
    """Archive that stores the fitness values of the whole population every ``frequency`` updates."""

    def __init__(self, frequency):
        """
        :param frequency: Store a snapshot on every ``frequency``-th call to ``update``.
        """
        self.fitness = []
        self.generations = []
        self.frequency = frequency
        self.generation_counter = 0

    def update(self, population):
        """
        Count one generation and, when it is due, store the fitness values of the population.
        :param population: Individuals exposing ``fitness.values``.
        """
        if self.generation_counter % self.frequency == 0:
            fitnesses = [ind.fitness.values for ind in population]
            self.fitness.append(fitnesses)
            self.generations.append(self.generation_counter)
        self.generation_counter += 1

    def save(self, log_file):
        """
        Write one CSV row ``generation, fitness list`` per stored snapshot.
        :param log_file: File name; the output is written to ``"fitness_" + log_file``.
        """
        fitness_distribution_file = "fitness_" + log_file
        # Text mode: csv.writer emits str, which a file opened with 'wb' rejects on Python 3.
        with open(fitness_distribution_file, 'w') as f:
            writer = csv.writer(f)
            for gen, fitnesses in zip(self.generations, self.fitness):
                writer.writerow([gen, fitnesses])
32 |
33 |
def pick_fitness_size_from_fitness_age_size(ind):
    """
    Overwrite the fitness of *ind* with ``(error, 0, size)``; the middle entry is fixed to 0.
    :param ind: Individual exposing ``error``, ``fitness.values`` and ``len``.
    """
    error, size = ind.error, len(ind)
    ind.fitness.values = (error, 0, size)
36 |
37 |
def pick_fitness_complexity_from_fitness_age_complexity(ind):
    """
    Overwrite the fitness of *ind* with ``(error, 0, order)``; the middle entry is fixed to 0.
    :param ind: Individual exposing ``error`` and ``fitness.values``; its order is obtained from
    ``symbreg.calculate_order``.
    """
    error = ind.error
    order = symbreg.calculate_order(ind)
    ind.fitness.values = (error, 0, order)
40 |
41 |
def pick_fitness_size_complexity_from_fitness_age_size_complexity(ind):
    """
    Overwrite the fitness of *ind* with ``(error, 0, size, order)``; the second entry is fixed to 0.
    :param ind: Individual exposing ``error``, ``fitness.values`` and ``len``; its order is
    obtained from ``symbreg.calculate_order``.
    """
    error = ind.error
    size = len(ind)
    order = symbreg.calculate_order(ind)
    ind.fitness.values = (error, 0, size, order)
44 |
45 |
def pick_fitness_size_from_fitness_age(ind):
    """
    Overwrite the fitness of *ind* with ``(error, size)``.
    :param ind: Individual exposing ``error``, ``fitness.values`` and ``len``.
    """
    error, size = ind.error, len(ind)
    ind.fitness.values = (error, size)
48 |
49 |
class MultiArchive(object):
    """Composite archive that forwards ``update`` and ``save`` to each wrapped archive in order."""

    def __init__(self, archives):
        """
        :param archives: Iterable of archives exposing ``update`` and ``save``.
        """
        self.archives = archives

    def update(self, population):
        """Forward *population* to the ``update`` method of every wrapped archive."""
        for member in self.archives:
            member.update(population)

    def save(self, log_file):
        """Forward *log_file* to the ``save`` method of every wrapped archive."""
        for member in self.archives:
            member.save(log_file)
61 |
62 |
class ParetoFrontSavingArchive(object):
    """Archive that stores the Pareto front of the population every ``frequency`` updates."""

    def __init__(self, frequency, criteria_chooser=None, simplifier=None):
        """
        :param frequency: Store a front on every ``frequency``-th call to ``update``.
        :param criteria_chooser: Optional callable applied to every copied individual before the
        front is computed.
        :param simplifier: Optional callable applied to the list of copied individuals.
        """
        self.fronts = []
        self.frequency = frequency
        self.generation_counter = 0
        self.criteria_chooser = criteria_chooser
        self.simplifier = simplifier

    def update(self, population):
        """
        Count one generation and, when it is due, store the sorted Pareto front of a deep copy of
        the population.
        :param population: Individuals exposing ``fitness.values``.
        """
        if self.generation_counter % self.frequency == 0:
            pop_copy = [deepcopy(ind) for ind in population]
            if self.simplifier is not None:
                self.simplifier(pop_copy)
            if self.criteria_chooser is not None:
                # Explicit loop: a bare map() is lazy on Python 3 and would never call the chooser.
                for ind in pop_copy:
                    self.criteria_chooser(ind)

            non_dominated = afpo.find_pareto_front(pop_copy)
            front = [pop_copy[index] for index in non_dominated]
            front.sort(key=operator.attrgetter("fitness.values"))
            self.fronts.append(front)
        self.generation_counter += 1

    def save(self, log_file):
        """
        Write one CSV row per stored front: generation, front size, then one
        ``(fitness values, str(individual))`` entry per individual.
        :param log_file: File name; the output is written to ``"pareto_" + log_file``.
        """
        pareto_front_file = "pareto_" + log_file
        with open(pareto_front_file, 'w') as f:
            writer = csv.writer(f)
            generation = 0
            for front in self.fronts:
                inds = [(ind.fitness.values, str(ind)) for ind in front]
                writer.writerow([generation, len(inds)] + inds)
                generation += self.frequency
94 |
95 |
class MutationStatsArchive(object):
    """Archive that counts neutral, beneficial and detrimental mutations per generation."""

    def __init__(self, evaluate_function):
        """
        :param evaluate_function: Callable returning a sequence whose first element is the error of
        an individual.
        """
        self.stats = defaultdict(list)
        self.neutral_mutations = defaultdict(int)
        self.detrimental_mutations = defaultdict(int)
        self.beneficial_mutations = defaultdict(int)
        self.evaluate_function = evaluate_function
        self.generation = -1

    def update(self, population):
        """Advance the generation counter; *population* itself is not inspected."""
        self.generation += 1

    def submit(self, old_ind, new_ind):
        """
        Record the effect of one mutation in the current generation.
        :param old_ind: Individual before the mutation.
        :param new_ind: Individual after the mutation.
        """
        old_error = self.evaluate_function(old_ind)[0]
        new_error = self.evaluate_function(new_ind)[0]
        delta_error = new_error - old_error
        delta_size = len(new_ind) - len(old_ind)
        if delta_size == 0 and numpy.isclose([delta_error], [0.0])[0]:
            self.neutral_mutations[self.generation] += 1
        # NOTE(review): this is a separate ``if``, so a mutation with a tiny non-zero error change
        # is counted as neutral and as beneficial/detrimental; confirm whether that is intended.
        if delta_error > 0:
            self.detrimental_mutations[self.generation] += 1
        elif delta_error < 0:
            self.beneficial_mutations[self.generation] += 1
        self.stats[self.generation].append((delta_error, delta_size))

    def save(self, log_file):
        """
        Write the per-generation counters followed by the recorded ``(delta_error, delta_size)``
        pairs as CSV.
        :param log_file: File name; the output is written to ``"mutation_stats_" + log_file``.
        """
        mutation_statistics_file = "mutation_stats_" + log_file
        fieldnames = ['generation', 'neutral_mutations', 'beneficial_mutations', 'detrimental_mutations', 'deltas']
        # Text mode: csv.writer emits str, which a file opened with 'wb' rejects on Python 3.
        with open(mutation_statistics_file, 'w') as f:
            writer = csv.writer(f)
            writer.writerow(fieldnames)
            for gen in self.stats:
                writer.writerow([gen, self.neutral_mutations[gen], self.beneficial_mutations[gen],
                                 self.detrimental_mutations[gen]] + self.stats[gen])
130 |
--------------------------------------------------------------------------------
/fastgp/logging/reports.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import csv
3 | from deap import tools
4 | import numpy
5 | import operator
6 | import fastgp.parametrized.simple_parametrized_terminals as sp
7 |
8 |
def get_fitness(ind):
    """Return the first fitness value of *ind*."""
    values = ind.fitness.values
    return values[0]
11 |
12 |
def get_mean(values):
    """Return the mean of the finite entries of *values*."""
    finite = [value for value in values if numpy.isfinite(value)]
    return numpy.mean(finite)
15 |
16 |
def get_std(values):
    """Return the standard deviation of the finite entries of *values*."""
    finite = [value for value in values if numpy.isfinite(value)]
    return numpy.std(finite)
19 |
20 |
def get_min(values):
    """Return the smallest finite entry of *values*."""
    finite = [value for value in values if numpy.isfinite(value)]
    return numpy.min(finite)
23 |
24 |
def get_max(values):
    """Return the largest finite entry of *values*."""
    finite = [value for value in values if numpy.isfinite(value)]
    return numpy.max(finite)
27 |
28 |
def get_size_min(values):
    """Return the second element of the smallest entry of *values* (entries compare as a whole)."""
    smallest = min(values)
    return smallest[1]
31 |
32 |
def get_size_max(values):
    """Return the second element of the largest entry of *values* (entries compare as a whole)."""
    largest = max(values)
    return largest[1]
35 |
36 |
def get_fitness_size(ind):
    """Return the pair ``(first fitness value, len(ind))``."""
    first_value = ind.fitness.values[0]
    size = len(ind)
    return first_value, size
39 |
40 |
def configure_inf_protected_stats():
    """
    Build the multi-statistics object for fitness, size and height.

    The ``avg``, ``std``, ``min`` and ``max`` reducers that ignore non-finite values are
    registered on the multi-statistics, and a ``best_tree`` chapter reporting ``size_min`` and
    ``size_max`` is attached.
    :return: The configured ``tools.MultiStatistics`` instance.
    """
    mstats = tools.MultiStatistics(
        fitness=tools.Statistics(get_fitness),
        size=tools.Statistics(len),
        height=tools.Statistics(operator.attrgetter("height")),
    )
    reducers = (("avg", get_mean), ("std", get_std), ("min", get_min), ("max", get_max))
    for label, reducer in reducers:
        mstats.register(label, reducer)

    best_tree = tools.Statistics(get_fitness_size)
    best_tree.register("size_min", get_size_min)
    best_tree.register("size_max", get_size_max)
    mstats["best_tree"] = best_tree
    return mstats
56 |
57 |
def is_parametrized_terminal(node):
    """Tell whether *node* is an instance of ``sp.SimpleParametrizedTerminal``."""
    parametrized_type = sp.SimpleParametrizedTerminal
    return isinstance(node, parametrized_type)
60 |
61 |
def get_param_ratio(ind):
    """Return the fraction of nodes of *ind* that are parametrized terminals."""
    parametrized = sum(1 for node in ind if is_parametrized_terminal(node))
    return parametrized / len(ind)
66 |
67 |
def configure_parametrized_inf_protected_stats():
    """
    Build the multi-statistics object for fitness, size, height and parametrized-terminal ratio.

    The ``avg``, ``std``, ``min`` and ``max`` reducers that ignore non-finite values are
    registered on the multi-statistics, and a ``best_tree`` chapter reporting ``size_min`` and
    ``size_max`` is attached.
    :return: The configured ``tools.MultiStatistics`` instance.
    """
    mstats = tools.MultiStatistics(
        fitness=tools.Statistics(get_fitness),
        size=tools.Statistics(len),
        height=tools.Statistics(operator.attrgetter("height")),
        parametrized=tools.Statistics(get_param_ratio),
    )
    reducers = (("avg", get_mean), ("std", get_std), ("min", get_min), ("max", get_max))
    for label, reducer in reducers:
        mstats.register(label, reducer)

    best_tree = tools.Statistics(get_fitness_size)
    best_tree.register("size_min", get_size_min)
    best_tree.register("size_max", get_size_max)
    mstats["best_tree"] = best_tree
    return mstats
85 |
86 |
def get_age(ind):
    """Return the ``age`` attribute of *ind*."""
    age = ind.age
    return age
89 |
90 |
def add_age_to_stats(mstats):
    """
    Attach an ``age`` chapter reporting the average, standard deviation and maximum age.
    :param mstats: Multi-statistics object supporting item assignment.
    :return: The same *mstats* object.
    """
    age_stats = tools.Statistics(get_age)
    for label, reducer in (("avg", numpy.mean), ("std", numpy.std), ("max", numpy.max)):
        age_stats.register(label, reducer)
    mstats["age"] = age_stats
    return mstats
98 |
99 |
def save_log_to_csv(log, file_path):
    """
    Dump the ``cpu_time`` column and every column of every chapter of *log* to a CSV file.

    Chapter columns are named ``<column>_<chapter>``; the column names of a chapter are taken
    from its first record.
    :param log: Logbook exposing ``select`` and ``chapters``.
    :param file_path: Output path without extension; ``.csv`` is appended.
    """
    header = ["cpu_time"]
    series = [log.select("cpu_time")]
    for chapter_name, chapter in log.chapters.items():
        for column in chapter[0].keys():
            header.append("{}_{}".format(str(column), str(chapter_name)))
            series.append(chapter.select(column))

    with open(file_path + '.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        # Transpose the columns into rows.
        writer.writerows(zip(*series))
114 |
115 |
def save_hof(hof, test_toolbox=None):
    """
    Decorator factory: after the wrapped report function ran, write the historical trees to CSV.

    The output file is ``"trees_" + file_name`` with the columns ``gen``, ``fitness`` and ``tree``;
    when *test_toolbox* is given a fourth column ``test_error`` holds the first element of
    ``test_toolbox.test_evaluate(ind)``.
    :param hof: Object exposing ``historical_trees``, one individual per generation.
    :param test_toolbox: Optional toolbox providing ``test_evaluate``.
    :return: Decorator for functions with the signature ``(pop, log, file_name)``.
    """
    def decorator(func):
        def wrapper(pop, log, file_name):
            result = func(pop, log, file_name)
            hof_file_name = "trees_" + file_name
            header = ["gen", "fitness", "tree"]
            if test_toolbox is not None:
                # Keep the header in step with the extra column written below.
                header.append("test_error")
            # Text mode: csv.writer emits str, which a file opened with 'wb' rejects on Python 3.
            with open(hof_file_name, 'w') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                for gen, ind in enumerate(hof.historical_trees):
                    row = [gen, ind.fitness, str(ind)]
                    if test_toolbox is not None:
                        row.append(test_toolbox.test_evaluate(ind)[0])
                    writer.writerow(row)
            # Pass the wrapped function's result through instead of dropping it.
            return result
        return wrapper
    return decorator
132 |
133 |
def save_archive(archive):
    """
    Decorator factory: after the wrapped report function ran, call ``archive.save(file_name)``.
    :param archive: Object exposing ``save``.
    :return: Decorator for functions with the signature ``(pop, log, file_name)``.
    """
    def decorator(report):
        def wrapper(pop, log, file_name):
            report(pop, log, file_name)
            archive.save(file_name)
        return wrapper
    return decorator
141 |
--------------------------------------------------------------------------------
/fastgp/parametrized/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'mszubert'
2 |
--------------------------------------------------------------------------------
/fastgp/parametrized/mutation.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 |
def multi_mutation(ind, mutations, probs):
    """
    Apply each mutation independently with its own probability.

    Every mutation is expected to follow the convention used throughout this package and return
    a sequence whose first element is the mutated individual.
    :param ind: Individual to mutate.
    :param mutations: Sequence of mutation callables.
    :param probs: Sequence of probabilities, one per mutation.
    :return: One-element tuple holding the resulting individual.
    """
    for mutation, probability in zip(mutations, probs):
        if random.random() < probability:
            # Unwrap the result; re-wrapping it in a tuple here handed a tuple (not an
            # individual) to the next mutation and to the caller.
            ind = mutation(ind)[0]
    return ind,
9 |
10 |
def multi_mutation_exclusive(ind, mutations, probs):
    """
    Apply at most one of the given mutations, chosen according to ``probs``.

    A single uniform random number selects the mutation whose cumulative probability interval
    contains it; if the probabilities sum to less than 1 the remainder is the chance that no
    mutation is applied. The chosen mutation is called for its in-place effect on *ind*; its
    return value is not used.
    :param ind: Individual to mutate.
    :param mutations: Sequence of mutation callables.
    :param probs: Sequence (list, tuple, ...) of probabilities, one per mutation.
    :return: One-element tuple holding *ind*.
    :raises ValueError: If the lengths differ or the probabilities sum to more than 1.
    """
    if len(mutations) != len(probs):
        raise ValueError("Must have the same number of mutations as probabilities.")
    if sum(probs) > 1:
        raise ValueError("Probabilities must not sum to more than 1.")
    value = random.random()
    lower = 0
    for mutation, probability in zip(mutations, probs):
        upper = lower + probability
        if lower <= value < upper:
            mutation(ind)
            return ind,
        lower = upper
    return ind,
26 |
--------------------------------------------------------------------------------
/fastgp/parametrized/simple_parametrized_terminals.py:
--------------------------------------------------------------------------------
1 | import random
2 | import time
3 | import itertools
4 | from functools import partial
5 | import math
6 | import re
7 |
8 | import cachetools
9 | import numpy as np
10 | from scipy.stats import skew, moment
11 | from copy import deepcopy
12 |
13 | from deap import gp
14 |
15 |
class SimpleParametrizedPrimitiveSet(gp.PrimitiveSet):
    """Primitive set that additionally carries the variable type indices and variable names."""

    def __init__(self, name, arity, variable_type_indices, variable_names, prefix="ARG"):
        """
        :param name: Forwarded to ``gp.PrimitiveSet``.
        :param arity: Forwarded to ``gp.PrimitiveSet``.
        :param variable_type_indices: Stored on the instance as ``variable_type_indices``.
        :param variable_names: Stored on the instance as ``variable_names``.
        :param prefix: Forwarded to ``gp.PrimitiveSet``.
        """
        gp.PrimitiveSet.__init__(self, name, arity, prefix)
        self.variable_names = variable_names
        self.variable_type_indices = variable_type_indices

    def add_parametrized_terminal(self, parametrized_terminal_class):
        """
        Register a parametrized terminal class and expose its ``call`` attribute in the context
        under the class name.
        :param parametrized_terminal_class: Terminal class to register.
        """
        terminal_class = parametrized_terminal_class
        self._add(terminal_class)
        self.context[terminal_class.__name__] = terminal_class.call
25 |
26 |
class SimpleParametrizedPrimitiveTree(gp.PrimitiveTree):
    """Primitive tree whose parametrized terminals are copied individually on deep copy."""

    def __init__(self, content):
        gp.PrimitiveTree.__init__(self, content)

    def __deepcopy__(self, memo):
        """
        Copy the tree; every ``SimpleParametrizedTerminal`` node is replaced by its own deep copy
        and the instance dictionary is deep-copied as well.
        :param memo: Memo dictionary handed over by ``copy.deepcopy``.
        :return: The copied tree.
        """
        clone = type(self)(self)
        for position, node in enumerate(self):
            if not isinstance(node, SimpleParametrizedTerminal):
                continue
            clone[position] = deepcopy(node)
        clone.__dict__.update(deepcopy(self.__dict__, memo))
        return clone

    @classmethod
    def from_string(cls, string, pset):
        """Try to convert a string expression into a PrimitiveTree given a
        PrimitiveSet *pset*. The primitive set needs to contain every primitive
        present in the expression.

        :param string: String representation of a Python expression.
        :param pset: Primitive set from which primitives are selected.
        :returns: PrimitiveTree populated with the deserialized primitives.
        """
        def get_parts(token_string):
            # Layout is "<Name>_<operation>_<begin name>_<end name>".
            pieces = token_string.split('_')
            return pieces[1], pieces[2], pieces[3]

        # Checked in this order, after the lookup in pset.mapping.
        parametrized_classes = (RangeOperationTerminal, MomentFindingTerminal)

        expr = []
        for token in re.split("[ \t\n\r\f\v(),]", string):
            if token == '':
                continue
            if token in pset.mapping:
                expr.append(pset.mapping[token])
                continue
            terminal_class = next((candidate for candidate in parametrized_classes
                                   if candidate.NAME in token), None)
            if terminal_class is not None:
                operation, begin_range_name, end_range_name = get_parts(token)
                terminal = terminal_class()
                terminal.initialize_parameters(pset.variable_type_indices, pset.variable_names,
                                               operation, begin_range_name, end_range_name)
                expr.append(terminal)
                continue
            # NOTE(review): eval() runs arbitrary code; only feed this method trusted strings.
            try:
                value = eval(token)
            except NameError:
                raise TypeError("Unable to evaluate terminal: {}.".format(token))
            expr.append(gp.Terminal(value, False, gp.__type__))
        return cls(expr)
83 |
84 |
class SimpleParametrizedTerminal(gp.Terminal):
    """Base class of terminals whose parameters are initialized after construction."""
    ret = object

    def __init__(self, name="SimpleParametrizedTerminal", ret_type=object):
        gp.Terminal.__init__(self, name, True, ret_type)

    def __deepcopy__(self, memo):
        """
        Create a default-constructed instance of the same class and fill it with a deep copy of
        this instance's attributes.
        :param memo: Memo dictionary handed over by ``copy.deepcopy``.
        :return: The copied terminal.
        """
        clone = type(self)()
        copied_state = deepcopy(self.__dict__, memo)
        clone.__dict__.update(copied_state)
        return clone

    def initialize_parameters(self, variable_type_indices, names):
        """Must be provided by subclasses."""
        raise NotImplementedError

    def create_input_vector(self, predictors):
        """Must be provided by subclasses."""
        raise NotImplementedError

    def call(*parameters):
        """Placeholder; implement this method to make the class work with standard gp.compile."""
104 |
105 |
def name_operation(operation, name):
    """
    Set ``__name__`` of *operation* to *name* and hand the very same object back.
    :param operation: Callable to rename (modified in place).
    :param name: New value for ``__name__``.
    :return: *operation*.
    """
    setattr(operation, '__name__', name)
    return operation
109 |
110 |
class RangeOperationTerminal(SimpleParametrizedTerminal):
    """
    Terminal that applies an operation (sum, min or max) across a contiguous range of columns.

    ``begin_range`` is the first column of the range and ``end_range`` is exclusive;
    ``lower_bound`` and ``upper_bound`` are taken from one entry of the variable type indices and
    limit where the range may move during mutation.
    """
    NAME = 'RangeOperation'

    def __init__(self):
        SimpleParametrizedTerminal.__init__(self, RangeOperationTerminal.__name__)
        self.begin_range = None
        self.end_range = None
        self.operation = None
        self.names = None
        self.lower_bound = None
        self.upper_bound = None
        self.operations = {
            'sum': name_operation(np.sum, 'sum'),
            'min': name_operation(np.min, 'min'),
            'max': name_operation(np.max, 'max')
        }

    def initialize_parameters(self, variable_type_indices, names, operation=None, begin_range_name=None,
                              end_range_name=None, *args):
        """
        Set the operation and the column range, either explicitly or at random.

        When ``operation``, ``begin_range_name`` and ``end_range_name`` are all given they are
        validated and used; ``end_range_name`` names the last column included in the range, which
        matches what ``format`` emits. Otherwise an operation and a range inside a randomly chosen
        variable type are drawn.
        :param variable_type_indices: A sequence of variable type indices where each entry defines the
        index of a variable type in the design matrix. For example a design matrix with two variable types will have
        indices [j,n] where variable type A spans 0 to j and variable type B spans j + 1 to n.
        :param names: Sequence of column names, indexed like the columns of the design matrix.
        :param args: Ignored.
        :param operation: Key of ``self.operations`` selecting the operation.
        :param begin_range_name: Name of the first column of the range.
        :param end_range_name: Name of the last column of the range (inclusive).
        :return: None
        :raises ValueError: If a variable type spans fewer than two columns, or the operation, a
        range name, or the resulting range is invalid.
        """
        self.names = names
        for r in variable_type_indices:
            if r[1] - r[0] < 2:
                raise ValueError('Invalid range provided to Range Terminal: ' + str(r))
        rng = random.choice(variable_type_indices)
        self.lower_bound = rng[0]
        self.upper_bound = rng[1]
        if operation is not None and begin_range_name is not None and end_range_name is not None:
            if self.operations.get(operation) is None:
                raise ValueError('Invalid operation provided to Range Terminal: ' + operation)
            if begin_range_name not in self.names:
                raise ValueError('Invalid range name provided to Range Termnial: ' + str(begin_range_name))
            if end_range_name not in names:
                raise ValueError('Invalid range name provided to Range Termnial: ' + str(end_range_name))
            begin_range = self.names.index(begin_range_name)
            # format() writes names[end_range - 1], so the exclusive end is one past the named
            # column; without the "+ 1" a formatted terminal could not be parsed back.
            end_range = self.names.index(end_range_name) + 1
            containing = None
            for r in variable_type_indices:
                if r[0] <= begin_range < end_range <= r[1]:
                    containing = r
                    break
            if containing is None:
                raise ValueError('Invalid range provided to Range Terminal: (' + str(begin_range) + ',' +
                                 str(end_range) + ')')
            # The bounds must describe the variable type that actually holds the range; a
            # randomly chosen one would let mutations clamp the range into the wrong columns.
            self.lower_bound = containing[0]
            self.upper_bound = containing[1]
            self.operation = self.operations[operation]
            self.begin_range = begin_range
            self.end_range = end_range
        else:
            self.operation = random.choice(list(self.operations.values()))
            self.begin_range = np.random.randint(self.lower_bound, self.upper_bound - 1)
            self.end_range = np.random.randint(self.begin_range + 1, self.upper_bound)

    def mutate_parameters(self, stdev_calc):
        """
        Shift one end of the range by a Gaussian amount.

        Either the lower or the upper end is picked at random. The shift is drawn from a
        zero-mean Gaussian whose standard deviation is ``stdev_calc(span)``, rounded away from
        zero. The moved end is clamped so that the range stays inside
        ``[lower_bound, upper_bound]`` and, where the bounds allow it, spans at least two columns.
        :param stdev_calc: Callable mapping the current span to a standard deviation.
        :return: None
        """
        mutation = random.choice(['low', 'high'])
        span = self.end_range - self.begin_range
        if span == 0:
            span = 1
        value = random.gauss(0, stdev_calc(span))
        amount = int(math.ceil(abs(value)))
        if value < 0:
            amount *= -1
        if mutation == 'low':
            location = amount + self.begin_range
            # Highest admissible start: two columns before the end, but never below the lower
            # bound (the unclamped "end_range - 2" could fall below it for a one-column range).
            highest = max(self.lower_bound, self.end_range - 2)
            self.begin_range = min(max(location, self.lower_bound), highest)
        elif mutation == 'high':
            location = amount + self.end_range
            # Lowest admissible end: two columns after the start, but never above the upper bound.
            lowest = min(self.upper_bound, self.begin_range + 2)
            self.end_range = max(min(location, self.upper_bound), lowest)

    def create_input_vector(self, predictors):
        """
        Apply the selected operation across the columns ``begin_range:end_range`` of *predictors*.
        :param predictors: Two-dimensional array indexed as ``[rows, columns]``.
        :return: The result of ``self.operation(..., axis=1)`` on the column slice, or a column of
        zeros with one row per predictor row when the slice contains no columns.
        """
        array = predictors[:, self.begin_range:self.end_range]
        if array.shape[1] == 0:
            return np.zeros((array.shape[0], 1))
        else:
            return self.operation(array, axis=1)

    def format(self):
        """
        :return: ``RangeOperation_<operation name>_<first column name>_<last column name>``.
        """
        return "RangeOperation_{}_{}_{}".format(self.operation.__name__, self.names[self.begin_range],
                                                self.names[self.end_range - 1])
212 |
213 |
class MomentFindingTerminal(RangeOperationTerminal):
    """Range terminal whose operations are mean, variance and skew."""
    NAME = 'MomentOperation'

    def __init__(self):
        super(MomentFindingTerminal, self).__init__()
        self.operations = {
            'mean': name_operation(np.mean, 'mean'),
            'vari': name_operation(np.var, 'vari'),
            'skew': name_operation(skew, 'skew')
        }

    def initialize_parameters(self, variable_type_indices, names, operation=None, begin_range_name=None,
                              end_range_name=None, *args):
        """
        Initialize like the parent class. Without an explicit ``operation`` the parent performs a
        random initialization and the operation is then drawn once more from ``self.operations``.
        """
        parent = super(MomentFindingTerminal, self)
        if operation is not None:
            parent.initialize_parameters(variable_type_indices, names, operation,
                                         begin_range_name, end_range_name, *args)
            return
        parent.initialize_parameters(variable_type_indices, names)
        self.operation = random.choice(list(self.operations.values()))

    def format(self):
        """
        :return: ``MomentOperation_<operation name>_<first column name>_<last column name>``.
        """
        operation_name = self.operation.__name__
        first_column = self.names[self.begin_range]
        last_column = self.names[self.end_range - 1]
        return "MomentOperation_{}_{}_{}".format(operation_name, first_column, last_column)
237 |
238 |
class PolynomialFindingTerminal(RangeOperationTerminal):
    """Range terminal whose operations sum the powers of the columns up to order 1, 2 or 3."""
    NAME = 'PolynomialOperation'

    def __init__(self):
        super(PolynomialFindingTerminal, self).__init__()
        self.operations = {
            'first': self.first,
            'second': self.second,
            'third': self.third
        }

    def first(self, X, axis=1):
        """Order-1 polynomial of *X*; ``axis`` is accepted for call compatibility and ignored."""
        return self.polynomial(X, 1)

    def second(self, X, axis=1):
        """Order-2 polynomial of *X*; ``axis`` is accepted for call compatibility and ignored."""
        return self.polynomial(X, 2)

    def third(self, X, axis=1):
        """Order-3 polynomial of *X*; ``axis`` is accepted for call compatibility and ignored."""
        return self.polynomial(X, 3)

    def polynomial(self, X, order, interactions=False):
        """
        Combine the element-wise powers ``X**1 .. X**order`` of the columns of *X* per row.
        :param X: Two-dimensional array indexed as ``[rows, columns]``.
        :param order: Highest power to include.
        :param interactions: When false, the powers are summed per row. When true, the products of
        every combination of 1 to ``cols - 1`` power columns are summed per row.
        :return: One value per row.
        """
        # np.power is element-wise, so no per-row apply_along_axis is needed.
        orders = [np.power(X, o) for o in range(1, order + 1)]
        matrix = np.concatenate(orders, axis=1)
        rows = matrix.shape[0]
        cols = matrix.shape[1]
        result = np.zeros(rows)
        if interactions:
            indices = [x for x in range(cols)]
            for c in range(1, cols):
                for comb in itertools.combinations(indices, c):
                    M = np.ones(rows)
                    for j in comb:
                        M *= matrix[:, j].reshape(rows)
                    result += M
        else:
            result = np.sum(matrix, axis=1)
        return result

    def initialize_parameters(self, variable_type_indices, names, operation=None, begin_range_name=None,
                              end_range_name=None, *args):
        """
        Initialize like the parent class. Without an explicit ``operation`` the parent performs a
        random initialization and the operation is then drawn once more from ``self.operations``.
        """
        if operation is None:
            super(PolynomialFindingTerminal, self).initialize_parameters(variable_type_indices, names)
            self.operation = random.choice(list(self.operations.values()))
        else:
            super(PolynomialFindingTerminal, self).initialize_parameters(variable_type_indices, names, operation,
                                                                         begin_range_name, end_range_name, *args)

    def format(self):
        """
        :return: ``PolynomialOperation_<operation name>_<first column name>_<last column name>``.
        """
        # The underscore after the prefix was missing, unlike the sibling terminals' formats.
        return "PolynomialOperation_{}_{}_{}".format(self.operation.__name__, self.names[self.begin_range],
                                                     self.names[self.end_range - 1])
292 |
293 |
def named_moment(number):
    """
    Build a function computing the moment of order *number* via ``scipy.stats.moment``.
    :param number: Order of the moment.
    :return: Function ``f(vector, axis=0)`` whose ``__name__`` is ``"moment_<number>"``.
    """
    def f(vector, axis=0):
        return moment(vector, moment=number, axis=axis)

    f.__name__ = "moment_{}".format(number)
    return f
299 |
300 |
def generate_parametrized_expression(generate_expression, variable_type_indices, names):
    """
    Generate an expression and initialize the parameters of its parametrized terminals.
    :param generate_expression: Callable without arguments returning the expression.
    :param variable_type_indices: Forwarded to ``initialize_parameters`` of each terminal.
    :param names: Forwarded to ``initialize_parameters`` of each terminal.
    :return: The generated expression.
    """
    expr = generate_expression()
    terminals = (node for node in expr if isinstance(node, SimpleParametrizedTerminal))
    for terminal in terminals:
        terminal.initialize_parameters(variable_type_indices, names)
    return expr
307 |
308 |
def evolve_parametrized_expression(stdev_calc):
    """
    Decorator factory: mutate the parameters of every parametrized terminal in the individuals
    returned by the wrapped function.
    :param stdev_calc: Forwarded to ``mutate_parameters`` of each parametrized terminal.
    :return: Decorator; the decorated function returns its offspring as a list.
    """
    def decorator(func):
        def wrapper(*args, **kargs):
            offspring = list(func(*args, **kargs))
            for individual in offspring:
                for node in individual:
                    if not isinstance(node, SimpleParametrizedTerminal):
                        continue
                    node.mutate_parameters(stdev_calc)
            return offspring
        return wrapper
    return decorator
320 |
321 |
def get_parametrized_nodes(ind):
    """Return a list with the nodes of *ind* that are ``SimpleParametrizedTerminal`` instances."""
    return [node for node in ind if isinstance(node, SimpleParametrizedTerminal)]
324 |
325 |
def mutate_parametrized_nodes(ind, stdev_calc):
    """
    Mutate the parameters of every parametrized terminal of *ind*.
    :param ind: Individual whose nodes are mutated in place.
    :param stdev_calc: Forwarded to ``mutate_parameters`` of each parametrized terminal.
    :return: One-element tuple holding *ind*.
    """
    # Explicit loop: a bare map() is lazy on Python 3 and would never run the mutations.
    for node in get_parametrized_nodes(ind):
        node.mutate_parameters(stdev_calc)
    return ind,
330 |
331 |
def mutate_single_parametrized_node(ind, stdev_calc):
    """
    Mutate the parameters of one randomly chosen parametrized terminal of *ind*, if there is one.
    :param ind: Individual whose node is mutated in place.
    :param stdev_calc: Forwarded to ``mutate_parameters`` of the chosen terminal.
    :return: One-element tuple holding *ind*.
    """
    candidates = get_parametrized_nodes(ind)
    if len(candidates) == 0:
        return ind,
    chosen = random.choice(candidates)
    chosen.mutate_parameters(stdev_calc)
    return ind,
337 |
338 |
def search_entire_space(node, evaluate_function):
    """
    Evaluate every range ``(begin, end)`` with ``lower_bound <= begin < end <= upper_bound``.

    For each combination the range is written to *node* and ``evaluate_function()`` is called;
    *node* is left at the last combination visited.
    :param node: Object exposing ``lower_bound``, ``upper_bound``, ``begin_range``, ``end_range``.
    :param evaluate_function: Callable without arguments returning the fitness of the current
    state.
    :return: Tuple ``(parameters, fitness)`` of parallel lists.
    """
    parameters, fitness = [], []
    for begin in range(node.lower_bound, node.upper_bound + 1):
        for end in range(begin + 1, node.upper_bound + 1):
            node.begin_range, node.end_range = begin, end
            fitness.append(evaluate_function())
            parameters.append((begin, end))
    return parameters, fitness
353 |
354 |
def optimize_node(node, evaluate_function, optimization_objective_function):
    """
    Search every range of *node* and set it to the first one achieving the best fitness.
    :param node: Node whose ``begin_range`` and ``end_range`` are optimized.
    :param evaluate_function: Callable without arguments returning the fitness of the current
    state.
    :param optimization_objective_function: Callable picking the best value out of the fitness
    list (for example ``min`` or ``max``).
    :return: Tuple ``(parameters, fitness)`` as produced by ``search_entire_space``.
    """
    parameters, fitness = search_entire_space(node, evaluate_function)
    winner = fitness.index(optimization_objective_function(fitness))
    node.begin_range, node.end_range = parameters[winner]
    return parameters, fitness
363 |
364 |
def mutate_single_parametrized_node_optimal(ind, evaluate_function, optimization_objective_function):
    """
    Set one randomly chosen parametrized terminal of *ind* to its best range, if there is one.
    :param ind: Individual to optimize in place.
    :param evaluate_function: Callable accepting ``ind`` as keyword argument and returning the
    fitness.
    :param optimization_objective_function: Callable picking the best value out of a fitness list.
    :return: One-element tuple holding *ind*.
    """
    candidates = get_parametrized_nodes(ind)
    if len(candidates) == 0:
        return ind,
    chosen = random.choice(candidates)
    evaluate_individual = partial(evaluate_function, ind=ind)
    optimize_node(chosen, evaluate_individual, optimization_objective_function)
    return ind,
371 |
372 |
def simple_parametrized_evaluate(ind, context, predictors, error_function=None, expression_dict=None):
    """
    Evaluate an individual bottom-up, caching the semantics of every sub-expression.

    The nodes of ``ind`` are visited in reverse order. For each node its formatted expression
    (built from the expressions of its children) is used as the cache key; on a miss the
    semantics are obtained from ``get_node_semantics`` and stored.
    :param ind: Sequence of nodes; each node exposes ``arity`` and ``format(*child_expressions)``.
    :param context: Passed through to ``get_node_semantics``.
    :param predictors: Passed through to ``get_node_semantics``.
    :param error_function: Optional callable applied to the root semantics.
    :param expression_dict: Optional mapping used as cache; an LRU cache of size 100 is created
    when omitted.
    :return: The root semantics, or ``error_function`` applied to them.
    """
    cache = cachetools.LRUCache(maxsize=100) if expression_dict is None else expression_dict
    pending_expressions = []
    pending_semantics = []

    for node in reversed(ind):
        child_expressions = [pending_expressions.pop() for _ in range(node.arity)]
        key = node.format(*child_expressions)
        child_semantics = [pending_semantics.pop() for _ in range(node.arity)]

        if key not in cache:
            vector = get_node_semantics(node, child_semantics, predictors, context)
            cache[key] = vector
        else:
            vector = cache[key]

        pending_expressions.append(key)
        pending_semantics.append(vector)

    root = pending_semantics.pop()
    return root if error_function is None else error_function(root)
397 |
398 |
def get_terminal_semantics(node, context, predictors):
    """Return the semantics vector of a terminal node.

    :param node: terminal exposing ``value`` and ``name``.
    :param context: mapping of named constants to their values.
    :param predictors: 2-D array indexed as ``predictors[:, column]``.
    :return: a constant vector of length ``len(predictors)`` for ephemeral or
        numeric terminals and for values found in ``context``; otherwise the
        predictor column whose index is the first run of digits in
        ``node.name``.
    """
    if isinstance(node, gp.Ephemeral) or isinstance(node.value, (float, int)):
        return np.ones(len(predictors)) * node.value

    if node.value in context:
        return np.ones(len(predictors)) * context[node.value]

    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    arg_index = re.findall(r'\d+', node.name)
    return predictors[:, int(arg_index[0])]
408 |
409 |
def get_node_semantics(node, subtree_semantics, predictors, context):
    """Compute the semantics vector of ``node`` from its children's semantics.

    :param node: tree node; parametrized terminals build their own vector,
        plain terminals go through ``get_terminal_semantics``, and any other
        node is looked up by ``node.name`` in ``context`` and called.
    :param subtree_semantics: semantics of the node's children, in call order.
    :param predictors: predictor matrix handed to terminals.
    :param context: mapping from primitive names to callables (and constants).
    :return: the semantics of the subtree rooted at ``node``.
    """
    if isinstance(node, SimpleParametrizedTerminal):
        return node.create_input_vector(predictors)
    if isinstance(node, gp.Terminal):
        return get_terminal_semantics(node, context, predictors)

    # Numeric warnings are silenced while the primitive is applied.
    with np.errstate(over='ignore', divide='ignore', invalid='ignore'):
        operator = context[node.name]
        # Anything that is not exactly a Python float is cast to a float array.
        arguments = [arg if type(arg) == float else arg.astype(float)
                     for arg in subtree_semantics]
        return operator(*arguments)
420 |
421 |
def graph(expr):
    """Build node, edge and label collections describing the tree ``expr``.

    :param expr: sequence of nodes in prefix order, each exposing ``arity``.
    :return: ``(nodes, edges, labels)`` where ``nodes`` is
        ``range(len(expr))``, ``edges`` is a list of ``(parent, child)`` index
        pairs and ``labels`` maps each index to the node's name (primitives),
        formatted string (parametrized terminals) or value (anything else).
    """
    edges = []
    labels = {}
    # Each entry is [node index, number of children still to be attached].
    open_parents = []
    for index, node in enumerate(expr):
        if open_parents:
            parent = open_parents[-1]
            edges.append((parent[0], index))
            parent[1] -= 1

        if isinstance(node, gp.Primitive):
            label = node.name
        elif isinstance(node, SimpleParametrizedTerminal):
            label = node.format()
        else:
            label = node.value
        labels[index] = label

        open_parents.append([index, node.arity])
        # Drop every parent whose children have all been seen.
        while open_parents and open_parents[-1][1] == 0:
            open_parents.pop()
    return range(len(expr)), edges, labels
442 |
--------------------------------------------------------------------------------
/fastgp/utilities/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cfusting/fastgp/1951d0d992119ec86dc60b6fde636f903c638428/fastgp/utilities/__init__.py
--------------------------------------------------------------------------------
/fastgp/utilities/benchmark_problems.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 |
4 | # 4 * x^4 + 3 * x^3 + 2 * x^2 + x
def mod_quartic(x):
    """Return ``4x^4 + 3x^3 + 2x^2 + x`` evaluated in nested form."""
    acc = 3 + x * 4
    acc = 2 + x * acc
    acc = 1 + x * acc
    return x * acc
7 |
8 |
9 | # Koza-1: x^4 + x^3 + x^2 + x
def quartic(x):
    """Return ``x^4 + x^3 + x^2 + x`` evaluated in nested form."""
    acc = 1 + x
    for _ in range(2):
        acc = 1 + x * acc
    return x * acc
12 |
13 |
14 | # Koza-2: x^5 - 2x^3 + x
def quintic(x):
    """Return ``x^5 - 2x^3 + x``."""
    squared = x * x
    bracket = 1 - squared * (2 - squared)
    return x * bracket
17 |
18 |
19 | # Koza-3: x^6 - 2x^4 + x^2
def sextic(x):
    """Return ``x^6 - 2x^4 + x^2``."""
    squared = x * x
    bracket = 1 - squared * (2 - squared)
    return squared * bracket
22 |
23 |
24 | # x^7 - 2x^6 + x^5 - x^4 + x^3 - 2x^2 + x
def septic(x):
    """Return ``x^7 - 2x^6 + x^5 - x^4 + x^3 - 2x^2 + x`` in nested form."""
    acc = 2 - x
    # Constants of the enclosing brackets, innermost first.
    for constant in (1, 1, 1, 2, 1):
        acc = constant - x * acc
    return x * acc
27 |
28 |
29 | # sum_{1}^9{x^i}
def nonic(x):
    """Return the sum of ``x^i`` for ``i`` from 1 to 9 in nested form."""
    acc = 1 + x
    for _ in range(7):
        acc = 1 + x * acc
    return x * acc
32 |
33 |
34 | # x^3 + x^2 + x
def nguyen1(x):
    """Return ``x^3 + x^2 + x`` evaluated in nested form."""
    inner = 1 + x
    outer = 1 + x * inner
    return x * outer
37 |
38 |
39 | # x^5 + x^4 + x^3 + x^2 + x
def nguyen3(x):
    """Return ``x^5 + x^4 + x^3 + x^2 + x`` evaluated in nested form."""
    acc = 1 + x
    for _ in range(3):
        acc = 1 + x * acc
    return x * acc
42 |
43 |
44 | # x^6 + x^5 + x^4 + x^3 + x^2 + x
def nguyen4(x):
    """Return ``x^6 + x^5 + x^4 + x^3 + x^2 + x`` evaluated in nested form."""
    acc = 1 + x
    for _ in range(4):
        acc = 1 + x * acc
    return x * acc
47 |
48 |
def nguyen5(x):
    """Return ``sin(x^2) * cos(x) - 1`` for a scalar ``x``."""
    product = math.sin(x * x) * math.cos(x)
    return product - 1
51 |
52 |
def nguyen6(x):
    """Return ``sin(x) + sin(x + x^2)`` for a scalar ``x``."""
    first = math.sin(x)
    second = math.sin(x * (1 + x))
    return first + second
55 |
56 |
def nguyen7(x):
    """Return ``log(x + 1) + log(x^2 + 1)`` for a scalar ``x``."""
    first = math.log(x + 1)
    second = math.log(x * x + 1)
    return first + second
59 |
60 |
def nguyen9(x, y):
    """Return ``sin(x) + sin(y^2)`` for scalar ``x`` and ``y``."""
    first = math.sin(x)
    second = math.sin(y * y)
    return first + second
63 |
64 |
def nguyen10(x, y):
    """Return ``2 * sin(x) * cos(y)`` for scalar ``x`` and ``y``."""
    doubled_sine = 2 * math.sin(x)
    return doubled_sine * math.cos(y)
67 |
68 |
def nguyen12(x, y):
    """Return ``x^4 - x^3 + y^2 / 2 - y``."""
    x_part = x ** 4 - x ** 3
    half_y_squared = y ** 2 / 2.0
    return x_part + half_y_squared - y
71 |
72 |
def keijzer1(x):
    """Return ``0.3 * x * sin(2 * pi * x)`` for a scalar ``x``."""
    amplitude = 0.3 * x
    return amplitude * math.sin(2 * math.pi * x)
75 |
76 |
def keijzer4(x):
    """Return ``x^3 e^-x cos(x) sin(x) (sin(x)^2 cos(x) - 1)`` for scalar ``x``."""
    sine = math.sin(x)
    cosine = math.cos(x)
    envelope = x ** 3 * math.exp(-x)
    bracket = sine ** 2 * cosine - 1
    return envelope * cosine * sine * bracket
79 |
80 |
def keijzer11(x, y):
    """Return ``x * y + sin((x - 1) * (y - 1))`` for scalar ``x`` and ``y``."""
    product = x * y
    shifted = (x - 1) * (y - 1)
    return product + math.sin(shifted)
83 |
84 |
def keijzer12(x, y):
    """Return ``x^4 - x^3 + y^2 / 2 - y``."""
    x_part = x ** 4 - x ** 3
    half_y_squared = y ** 2 / 2.0
    return x_part + half_y_squared - y
87 |
88 |
def keijzer13(x, y):
    """Return ``6 * sin(x) * cos(y)`` for scalar ``x`` and ``y``."""
    scaled_sine = 6 * math.sin(x)
    return scaled_sine * math.cos(y)
91 |
92 |
def keijzer14(x, y):
    """Return ``8 / (2 + x^2 + y^2)``."""
    denominator = 2 + x ** 2 + y ** 2
    return 8.0 / denominator
95 |
96 |
def keijzer15(x, y):
    """Return ``x^3 / 5 + y^3 / 2 - x - y``."""
    x_cubic = x ** 3 / 5.0
    y_cubic = y ** 3 / 2.0
    return x_cubic + y_cubic - x - y
99 |
100 |
def r1(x):
    """Return ``(x + 1)^3 / (x^2 - x + 1)``."""
    numerator = (x + 1) ** 3
    denominator = x ** 2 - x + 1
    return numerator / denominator
103 |
104 |
def r2(x):
    """Return ``(x^5 - 3x^3 + 1) / (x^2 + 1)``."""
    numerator = x ** 5 - (3 * (x ** 3)) + 1
    denominator = x ** 2 + 1
    return numerator / denominator
107 |
108 |
def r3(x):
    """Return ``(x^6 + x^5) / (x^4 + x^3 + x^2 + x + 1)``."""
    numerator = x ** 6 + x ** 5
    denominator = x ** 4 + x ** 3 + x ** 2 + x + 1
    return numerator / denominator
111 |
112 |
def _pagie1_term(value):
    """Return ``1 / (1 + value^-4)``, using the limit 0.0 when ``value`` is 0."""
    try:
        return 1 / (1 + value ** -4)
    except ZeroDivisionError:
        # A Python scalar 0 cannot be raised to a negative power; the term
        # tends to 0 as value -> 0, so that limit is returned instead.
        return 0.0


def pagie1(x, y):
    """Return ``1 / (1 + x^-4) + 1 / (1 + y^-4)``.

    A zero argument contributes 0.0 (the limit of its term) instead of
    raising ``ZeroDivisionError``.
    """
    return _pagie1_term(x) + _pagie1_term(y)
115 |
--------------------------------------------------------------------------------
/fastgp/utilities/metrics.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | from scipy.stats import pearsonr, spearmanr
3 |
4 | from fastgp.utilities.symbreg import numpy_protected_div_dividend
5 |
6 |
def mean_absolute_error(vector, response):
    """Return the mean absolute difference between two arrays.

    :return: one-element tuple holding the error as a Python number, or
        ``numpy.inf`` when the mean is not finite.
    """
    mae = numpy.mean(numpy.abs(vector - response))
    if numpy.isfinite(mae):
        return mae.item(),
    return numpy.inf,
13 |
14 |
def euclidean_error(vector, response):
    """Return the Euclidean distance between two arrays.

    :return: one-element tuple holding the distance as a Python number, or
        ``numpy.inf`` when the sum of squared errors is not finite.
    """
    with numpy.errstate(over='ignore', divide='ignore', invalid='ignore'):
        total = numpy.sum(numpy.square(vector - response))
        if numpy.isfinite(total):
            return numpy.sqrt(total).item(),
        return numpy.inf,
23 |
24 |
def root_mean_square_error(vector, response):
    """Return the root of the mean squared difference between two arrays.

    :return: one-element tuple holding the RMSE as a Python number, or
        ``numpy.inf`` when the mean squared error is not finite.
    """
    with numpy.errstate(over='ignore', divide='ignore', invalid='ignore'):
        mean_square = numpy.mean(numpy.square(vector - response))
        if numpy.isfinite(mean_square):
            return numpy.sqrt(mean_square).item(),
        return numpy.inf,
33 |
34 |
def mean_squared_error(vector, response):
    """Return the mean squared difference between two arrays.

    :return: one-element tuple holding the MSE as a Python float, or
        ``numpy.inf`` when it is not finite.
    """
    mse = float(numpy.mean(numpy.square(vector - response)))
    if numpy.isfinite(mse):
        return mse,
    return numpy.inf,
41 |
42 |
def pearson_correlation(vector, response):
    """Return whatever ``scipy.stats.pearsonr`` yields for the two arrays."""
    result = pearsonr(vector, response)
    return result
45 |
46 |
def spearman_correlation(vector, response):
    """Return whatever ``scipy.stats.spearmanr`` yields for the two arrays."""
    result = spearmanr(vector, response)
    return result
49 |
50 |
def normalized_cumulative_absolute_error(vector, response, threshold=0.0):
    """Return ``1 / (1 + sum of absolute errors)``.

    Absolute errors below ``threshold`` are counted as zero.

    :return: one-element tuple; ``(0.0,)`` when the raw error sum is not
        finite.
    """
    errors = numpy.abs(vector - response)
    if not numpy.isfinite(numpy.sum(errors)):
        return 0.0,

    negligible = errors < threshold
    errors[negligible] = 0
    cumulative_error = numpy.sum(errors).item()
    return 1 / (1 + cumulative_error),
60 |
61 |
def mean_absolute_percentage_error(vector, response):
    """Return the mean absolute error relative to ``response``.

    Both divisions go through ``numpy_protected_div_dividend``.

    :return: one-element tuple holding the error, or ``numpy.inf`` when it is
        NaN or infinite.
    """
    with numpy.errstate(over='ignore', divide='ignore', invalid='ignore'):
        relative = numpy_protected_div_dividend((vector - response), response)
        scaled = numpy_protected_div_dividend(relative, float(len(response)))
        mean_error = numpy.sum(numpy.abs(scaled))
        # isfinite is False for NaN as well as for +/-inf.
        if not numpy.isfinite(mean_error):
            return numpy.inf,
        return mean_error,
70 |
71 |
def percentage_error(vector, response, threshold=0.0):
    """Return the summed absolute error divided by the summed response.

    Absolute errors below ``threshold`` are counted as zero; the division goes
    through ``numpy_protected_div_dividend``.

    :return: one-element tuple; ``(0.0,)`` when the raw error sum is not
        finite.
    """
    errors = numpy.abs(vector - response)
    if not numpy.isfinite(numpy.sum(errors)):
        return 0.0,

    negligible = errors < threshold
    errors[negligible] = 0
    total_error = numpy.sum(errors).item()
    total_response = numpy.sum(response).item()
    return numpy_protected_div_dividend(total_error, total_response),
82 |
83 |
def cumulative_absolute_error(vector, response):
    """Return the sum of absolute differences between two arrays.

    :return: one-element tuple holding the sum as a Python number, or
        ``numpy.inf`` when it is not finite.
    """
    total = numpy.sum(numpy.abs(vector - response))
    if numpy.isfinite(total):
        return total.item(),
    return numpy.inf,
90 |
91 |
def normalized_mean_squared_error(vector, response):
    """Return the mean squared error divided by the variance of ``response``.

    :return: one-element tuple holding the normalised error as a Python
        number.  ``numpy.inf`` is returned when the MSE is not finite, and
        also when the normalised value is not finite (e.g. ``response`` has
        zero variance), so callers never receive NaN.
    """
    with numpy.errstate(over='ignore', divide='ignore', invalid='ignore'):
        mse = numpy.mean(numpy.square(vector - response))
        if not numpy.isfinite(mse):
            return numpy.inf,
        normalized_mse = mse / numpy.var(response)
    # A constant response gives x / 0 (inf) or 0 / 0 (NaN); report both with
    # the same sentinel the other metrics in this module use.
    if not numpy.isfinite(normalized_mse):
        return numpy.inf,
    return normalized_mse.item(),
99 |
100 |
--------------------------------------------------------------------------------
/fastgp/utilities/operators.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from functools import wraps
3 | import random
4 |
5 |
def static_limit(key, max_value):
    """Build a decorator that rejects offspring exceeding ``max_value``.

    The decorated operator's results are returned as a list; every result for
    which ``key(result) > max_value`` is replaced by a deep copy of a randomly
    chosen positional argument, snapshotted before the operator ran.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            originals = [copy.deepcopy(arg) for arg in args]
            offspring = list(func(*args, **kwargs))
            for position, child in enumerate(offspring):
                if not key(child) > max_value:
                    continue
                fallback = random.choice(originals)
                offspring[position] = copy.deepcopy(fallback)
            return offspring
        return wrapper
    return decorator
18 |
19 |
def stats_collector(archive):
    """Build a decorator that reports (before, after) pairs to ``archive``.

    Each positional argument is deep-copied before the decorated operator
    runs; afterwards ``archive.submit(copy, result)`` is called for every
    copy/result pair and the results are returned as a list.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            snapshots = [copy.deepcopy(arg) for arg in args]
            offspring = list(func(*args, **kwargs))
            for before, after in zip(snapshots, offspring):
                archive.submit(before, after)
            return offspring
        return wrapper
    return decorator
31 |
32 |
def internally_biased_node_selector(individual, bias):
    """Pick a node index, preferring internal nodes with probability ``bias``.

    :param individual: sequence of nodes exposing ``arity``.
    :param bias: probability of choosing among internal nodes when any exist.
    :return: index of a random internal node, otherwise of a random leaf.
    """
    leaves, internal_nodes = [], []
    for index, node in enumerate(individual):
        bucket = leaves if node.arity == 0 else internal_nodes
        bucket.append(index)

    # The random draw only happens when there is an internal node to pick.
    if internal_nodes and random.random() < bias:
        return random.choice(internal_nodes)
    return random.choice(leaves)
47 |
48 |
def get_node_indices_at_depth(individual, level):
    """Return the indices of all nodes of ``individual`` at depth ``level``.

    :param individual: sequence of nodes in prefix order exposing ``arity``.
    :param level: depth to collect, the first node being at depth 0.
    :return: list of matching indices in traversal order.
    """
    pending_depths = [0]
    matches = []
    for index, node in enumerate(individual):
        depth = pending_depths.pop()
        if depth == level:
            matches.append(index)
        # Every child of this node sits one level deeper.
        for _ in range(node.arity):
            pending_depths.append(depth + 1)
    return matches
59 |
60 |
def uniform_depth_node_selector(individual):
    """Pick a node index by first drawing a depth uniformly, then a node.

    The depth is drawn from ``0`` to ``individual.height`` inclusive.
    """
    chosen_depth = random.randint(0, individual.height)
    candidates = get_node_indices_at_depth(individual, chosen_depth)
    return random.choice(candidates)
65 |
66 |
def uniform_depth_mutation(individual, expr, pset):
    """Replace a subtree chosen by ``uniform_depth_node_selector``.

    :param individual: tree exposing ``searchSubtree`` and slice assignment.
    :param expr: callable invoked as ``expr(pset=pset, type_=...)`` to build
        the replacement, where ``type_`` is the ``ret`` of the replaced node.
    :param pset: forwarded to ``expr``.
    :return: one-element tuple containing ``individual``.
    """
    target = uniform_depth_node_selector(individual)
    subtree = individual.searchSubtree(target)
    return_type = individual[target].ret
    individual[subtree] = expr(pset=pset, type_=return_type)
    return individual,
73 |
74 |
def multi_mutation(ind, mutations, probs):
    """Apply each mutation in turn with its own probability.

    :param ind: individual to mutate.
    :param mutations: callables that take an individual and return a
        one-element tuple.
    :param probs: probabilities paired positionally with ``mutations``.
    :return: one-element tuple containing the resulting individual.
    """
    for operator, probability in zip(mutations, probs):
        if not random.random() < probability:
            continue
        ind, = operator(ind)
    return ind,
80 |
81 |
def one_point_xover_biased(ind1, ind2, node_selector):
    """Swap one subtree of ``ind1`` with one subtree of ``ind2`` in place.

    :param ind1: first tree, exposing ``searchSubtree`` and slicing.
    :param ind2: second tree, same interface.
    :param node_selector: callable returning the crossover index of a tree.
    :return: ``(ind1, ind2)``; both are returned unchanged when either has
        fewer than two nodes.
    """
    if min(len(ind1), len(ind2)) < 2:
        return ind1, ind2

    point1 = node_selector(ind1)
    point2 = node_selector(ind2)
    cut1 = ind1.searchSubtree(point1)
    cut2 = ind2.searchSubtree(point2)

    # Read both subtrees before writing either one.
    subtree1, subtree2 = ind1[cut1], ind2[cut2]
    ind1[cut1] = subtree2
    ind2[cut2] = subtree1
    return ind1, ind2
93 |
94 |
def mutation_biased(ind, expr, node_selector):
    """Replace the subtree at a selected node with a freshly built one.

    :param ind: tree exposing ``searchSubtree`` and slice assignment.
    :param expr: zero-argument callable producing the replacement subtree.
    :param node_selector: callable returning the index of the node to replace.
    :return: one-element tuple containing ``ind``.
    """
    target = node_selector(ind)
    subtree = ind.searchSubtree(target)
    replacement = expr()
    ind[subtree] = replacement
    return ind,
100 |
101 |
def static_limit_retries(key, max_value, num_retries):
    """Build a decorator that retries an operator until its offspring fit.

    The decorated operator is called up to ``num_retries`` times; the first
    result list in which no element has ``key(element) > max_value`` is
    returned.  If every retry fails, the operator is called once more and each
    oversized element is replaced by a deep copy of a randomly chosen
    positional argument, snapshotted before the first call.

    :param key: callable measuring an individual (e.g. its height).
    :param max_value: largest accepted measurement.
    :param num_retries: number of attempts before falling back to replacement.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            keep_inds = [copy.deepcopy(ind) for ind in args]

            for _ in range(num_retries):
                new_inds = list(func(*args, **kwargs))
                if not any(key(ind) > max_value for ind in new_inds):
                    return new_inds

            new_inds = list(func(*args, **kwargs))
            for i, ind in enumerate(new_inds):
                if key(ind) > max_value:
                    # Deep copy, as static_limit does: without it two results
                    # can end up being the very same object.
                    new_inds[i] = copy.deepcopy(random.choice(keep_inds))
            return new_inds
        return wrapper
    return decorator
125 |
--------------------------------------------------------------------------------
/fastgp/utilities/subset_selection.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from fastgp.algorithms import fast_evaluate
4 |
5 |
class SubsetSelectionArchive(object):
    """Hold a random training subset of the observations.

    A subset of ``subset_size`` distinct rows is drawn on construction.
    Subclasses implement ``update`` to decide when it is redrawn.
    """

    def __init__(self, frequency, predictors, response, subset_size, expression_dict):
        """Store the data and draw the initial subset.

        :param frequency: stored for subclasses; not used by this class.
        :param predictors: 2-D array indexed by a boolean row mask.
        :param response: 1-D array indexed by the same mask.
        :param subset_size: number of rows drawn without replacement.
        :param expression_dict: stored for subclasses; not used by this class.
        """
        self.expression_dict = expression_dict
        self.frequency = frequency
        self.predictors = predictors
        self.response = response
        self.subset_size = subset_size
        self.num_obs = len(predictors)

        selected_indices = np.random.choice(self.num_obs, self.subset_size, replace=False)
        # The builtin ``bool`` is used as dtype: the ``np.bool`` alias was
        # removed in NumPy 1.24.
        self.training_subset = np.zeros(self.num_obs, bool)
        self.training_subset[selected_indices] = True
        self.subset_predictors = self.predictors[self.training_subset, :]
        self.subset_response = self.response[self.training_subset]
        self.generation_counter = 0

    def update(self, population):
        """Refresh the subset; must be provided by subclasses."""
        raise NotImplementedError

    def set_difficulty(self, errors):
        """Hook for subclasses; does nothing here."""
        pass

    def get_data_subset(self):
        """Return ``(subset_predictors, subset_response)``."""
        return self.subset_predictors, self.subset_response

    def get_indices(self):
        """Return the row indices currently in the subset, in ascending order."""
        return np.arange(self.num_obs)[self.training_subset]

    def save(self, log_file):
        """Hook for subclasses; does nothing here."""
        pass
36 |
37 |
class RandomSubsetSelectionArchive(SubsetSelectionArchive):
    """Archive that redraws a uniformly random subset every ``frequency`` updates."""

    def __init__(self, frequency, predictors, response, subset_size, expression_dict):
        SubsetSelectionArchive.__init__(self, frequency, predictors, response, subset_size, expression_dict)

    def update(self, population):
        """Redraw the subset when the generation counter is a multiple of ``frequency``.

        On a redraw ``expression_dict`` is cleared as well.  The generation
        counter is incremented on every call.  ``population`` is not used.
        """
        if self.generation_counter % self.frequency == 0:
            selected_indices = np.random.choice(self.num_obs, self.subset_size, replace=False)
            # The builtin ``bool`` is used as dtype: the ``np.bool`` alias was
            # removed in NumPy 1.24.
            self.training_subset = np.zeros(self.num_obs, bool)
            self.training_subset[selected_indices] = True
            self.subset_predictors = self.predictors[self.training_subset, :]
            self.subset_response = self.response[self.training_subset]
            self.expression_dict.clear()
        self.generation_counter += 1
51 |
52 |
def fast_numpy_evaluate_subset(ind, context, subset_selection_archive, get_node_semantics,
                               inner_evaluate_function=fast_evaluate.fast_numpy_evaluate,
                               error_function=None, expression_dict=None):
    """Evaluate ``ind`` on the archive's current data subset.

    :param ind: individual to evaluate.
    :param context: forwarded to ``inner_evaluate_function``.
    :param subset_selection_archive: object whose ``get_data_subset()`` returns
        ``(predictors, response)``.
    :param get_node_semantics: forwarded to ``inner_evaluate_function``.
    :param inner_evaluate_function: evaluator producing the root semantics; it
        is always called with ``error_function=None``.
    :param error_function: optional callable taking ``(semantics, response)``.
    :param expression_dict: forwarded to ``inner_evaluate_function``.
    :return: ``error_function(semantics, response)``, or the raw semantics when
        no ``error_function`` is given.
    """
    predictors, response = subset_selection_archive.get_data_subset()
    root_semantics = inner_evaluate_function(ind, context, predictors, get_node_semantics, error_function=None,
                                             expression_dict=expression_dict)
    if error_function is None:
        # The default used to be called unconditionally, which raised
        # "TypeError: 'NoneType' object is not callable".
        return root_semantics
    return error_function(root_semantics, response)
60 |
--------------------------------------------------------------------------------
/fastgp/utilities/symbreg.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from deap import gp
4 | import numpy
5 |
6 |
def protected_div_one(left, right):
    """Return ``left / right``, or 1 when that raises ``ZeroDivisionError``."""
    try:
        quotient = left / right
    except ZeroDivisionError:
        quotient = 1
    return quotient
12 |
13 |
def protected_div_zero(left, right):
    """Return ``left / right``, or 0 when that raises ``ZeroDivisionError``."""
    try:
        quotient = left / right
    except ZeroDivisionError:
        quotient = 0
    return quotient
19 |
20 |
def protected_div_dividend(left, right):
    """Return ``left / right``, or ``left`` itself when ``right`` equals 0."""
    if right == 0:
        return left
    return left / right
26 |
27 |
def aq(left, right):
    """Return the analytic quotient ``left / sqrt(1 + right^2)``.

    Positions where the result is infinite or NaN are replaced by the
    corresponding value of ``left``.  ``left`` may be a scalar or an array
    broadcastable to the shape of the result.
    """
    with numpy.errstate(divide='ignore', invalid='ignore'):
        x = numpy.divide(left, numpy.sqrt(1 + numpy.square(right)))
        invalid = numpy.isinf(x) | numpy.isnan(x)
        if isinstance(x, numpy.ndarray):
            if invalid.any():
                # Broadcasting lets a scalar dividend be indexed with the mask.
                x[invalid] = numpy.broadcast_to(left, x.shape)[invalid]
        elif invalid:
            x = left
    return x
37 |
38 |
def numpy_protected_div_dividend(left, right):
    """Return ``left / right``, substituting ``left`` where it is undefined.

    Positions where the quotient is infinite or NaN are replaced by the
    corresponding value of ``left``.  ``left`` may be a scalar or an array
    broadcastable to the shape of the result.
    """
    with numpy.errstate(divide='ignore', invalid='ignore'):
        x = numpy.divide(left, right)
        invalid = numpy.isinf(x) | numpy.isnan(x)
        if isinstance(x, numpy.ndarray):
            if invalid.any():
                # Broadcasting lets a scalar dividend be indexed with the mask.
                x[invalid] = numpy.broadcast_to(left, x.shape)[invalid]
        elif invalid:
            x = left
    return x
48 |
49 |
def numpy_protected_div_zero(left, right):
    """Return ``left / right`` with infinite or NaN results replaced by 0.0."""
    with numpy.errstate(divide='ignore', invalid='ignore'):
        quotient = numpy.divide(left, right)
        undefined = numpy.logical_not(numpy.isfinite(quotient))
        if isinstance(quotient, numpy.ndarray):
            quotient[undefined] = 0.0
        elif undefined:
            quotient = 0.0
    return quotient
59 |
60 |
def numpy_protected_div_one(left, right):
    """Return ``left / right`` with infinite or NaN results replaced by 1.0."""
    with numpy.errstate(divide='ignore', invalid='ignore'):
        quotient = numpy.divide(left, right)
        undefined = numpy.logical_not(numpy.isfinite(quotient))
        if isinstance(quotient, numpy.ndarray):
            quotient[undefined] = 1.0
        elif undefined:
            quotient = 1.0
    return quotient
70 |
71 |
def numpy_protected_sqrt(x):
    """Return ``sqrt(x)`` with NaN results (e.g. negative input) replaced by 0."""
    with numpy.errstate(invalid='ignore'):
        root = numpy.sqrt(x)
        if isinstance(root, numpy.ndarray):
            root[numpy.isnan(root)] = 0
            return root
        if numpy.isnan(root):
            return 0
        return root
80 |
81 |
def protected_log_one(x):
    """Return ``log(x)`` for positive ``x`` and 1 otherwise."""
    return math.log(x) if x > 0 else 1
87 |
88 |
def protected_log_abs(x):
    """Return ``log(|x|)``, or 0 when ``x`` equals 0."""
    if x == 0:
        return 0
    return math.log(abs(x))
94 |
95 |
def cube(x):
    """Return ``x`` raised to the power 3.0 via ``numpy.power``."""
    exponent = 3.0
    return numpy.power(x, exponent)
98 |
99 |
def numpy_protected_log_abs(x):
    """Return ``log(|x|)`` with infinite results set to -1e300 and NaN to 0."""
    with numpy.errstate(divide='ignore', invalid='ignore'):
        magnitude = numpy.abs(x)
        logged = numpy.log(magnitude.astype(float))
        if isinstance(logged, numpy.ndarray):
            infinite = numpy.isinf(logged)
            missing = numpy.isnan(logged)
            logged[infinite] = -1e300
            logged[missing] = 0
        elif numpy.isinf(logged):
            logged = -1e300
        elif numpy.isnan(logged):
            logged = 0
    return logged
112 |
113 |
def numpy_protected_log_one(x):
    """Return ``log(|x|)`` with infinite or NaN results replaced by 1.0."""
    with numpy.errstate(divide='ignore', invalid='ignore'):
        logged = numpy.log(numpy.abs(x))
        undefined = numpy.logical_not(numpy.isfinite(logged))
        if isinstance(logged, numpy.ndarray):
            logged[undefined] = 1.0
        elif undefined:
            logged = 1.0
    return logged
125 |
126 |
def get_terminal_order(node, context=None):
    """Return the order of a terminal: 0 for constants, 1 otherwise.

    A terminal counts as a constant when it is ephemeral, when its value is a
    float or an int, or when its value is a key of ``context``.
    """
    if isinstance(node, gp.Ephemeral):
        return 0
    if isinstance(node.value, float) or isinstance(node.value, int):
        return 0
    if context is not None and node.value in context:
        return 0
    return 1
132 |
133 |
def calculate_order(ind, context=None):
    """Compute the order of the expression ``ind``, working bottom-up.

    Terminals contribute ``get_terminal_order``.  Unary primitives multiply
    their argument's order by 3 (protected log), 4 (exp) or 1.5 (any other).
    Remaining primitives take the maximum of their arguments' orders for
    add/subtract and the sum otherwise.

    :param ind: sequence of nodes in prefix order.
    :param context: forwarded to ``get_terminal_order``.
    :return: order of the root node.
    """
    pending = []
    for node in reversed(ind):
        if isinstance(node, gp.Terminal):
            pending.append(get_terminal_order(node, context))
            continue

        if node.arity == 1:
            operand = pending.pop()
            if node.name == numpy_protected_log_abs.__name__:
                factor = 3
            elif node.name == numpy.exp.__name__:
                factor = 4
            else:
                factor = 1.5
            pending.append(factor * operand)
            continue

        operands = [pending.pop() for _ in range(node.arity)]
        additive = node.name in (numpy.add.__name__, numpy.subtract.__name__)
        pending.append(max(operands) if additive else sum(operands))
    return pending.pop()
155 |
156 |
def get_numpy_infix_symbol_map():
    """Return a dict mapping primitive function names to infix format strings."""
    templates = [
        (numpy.add, "({0} + {1})"),
        (numpy.subtract, "({0} - {1})"),
        (numpy.multiply, "({0} * {1})"),
        (numpy_protected_div_dividend, "({0} / {1})"),
        (numpy_protected_log_abs, "log({0})"),
        (numpy.abs, "abs({0})"),
        (numpy.sin, "sin({0})"),
        (numpy.cos, "cos({0})"),
        (numpy.exp, "exp({0})"),
        (numpy.square, "(({0}) ^ 2)"),
        (cube, "(({0}) ^ 3)"),
        (numpy.sqrt, "sqrt({0})"),
        (numpy.reciprocal, "(1 / {0})"),
        (aq, "({0} // {1})"),
        (numpy.power, "(({0}) ^ {1})"),
    ]
    return {function.__name__: template for function, template in templates}
174 |
175 |
def get_numpy_prefix_symbol_map():
    """Return a list of ``(symbol, primitive function name)`` pairs."""
    pairs = (
        ("+", numpy.add),
        ("-", numpy.subtract),
        ("**", numpy.power),
        ("^", numpy.power),
        ("*", numpy.multiply),
        ("/", numpy_protected_div_dividend),
        ('abs', numpy.abs),
        ("log", numpy_protected_log_abs),
        ("sin", numpy.sin),
        ("cos", numpy.cos),
        ("exp", numpy.exp),
    )
    return [(symbol, function.__name__) for symbol, function in pairs]
189 |
190 |
def get_numpy_commutative_set():
    """Return the set of names of the commutative primitives (add, multiply)."""
    commutative = (numpy.add, numpy.multiply)
    return {function.__name__ for function in commutative}
193 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scipy
2 | deap
3 | pytest
4 | cachetools
5 | numpy
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
# Package metadata for the ``fastgp`` distribution.
setup(
    name='fastgp',
    version='0.1.0',
    description='Fast genetic programming.',
    author='Chris Fusting',
    author_email='cfusting@gmail.com',
    # NOTE(review): GPLv3 is declared here and in the classifier below, while
    # the repository's LICENSE file contains the MIT text - confirm which one
    # applies.
    license='GNU GPLv3',
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
        # NOTE(review): only a Python 3 classifier is listed although
        # python_requires below also admits 2.7 - confirm supported versions.
        'Programming Language :: Python :: 3'
    ],
    keywords='evolution machine learning artificial intelligence',
    # Runtime dependencies only; pytest is listed in requirements.txt but not
    # here.
    install_requires=[
        'scipy',
        'deap',
        'cachetools',
        'numpy',
    ],
    python_requires='>=2.7',
    # Packages named contrib, docs and tests are left out of the distribution.
    packages=find_packages(exclude=['contrib', 'docs', 'tests']),
    url='https://github.com/cfusting/fastgp'
)
28 |
--------------------------------------------------------------------------------
/tests/fastgp/parametrized/test_simple_parametrzied_terminal.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import fastgp.parametrized.simple_parametrized_terminals as sp
4 |
5 |
class TestRangeOperationTerminal:
    """Tests for ``RangeOperationTerminal.initialize_parameters``."""

    def test_initialize(self):
        """Check which variable-index ranges are accepted when parameters are drawn."""
        term = sp.RangeOperationTerminal()
        # A (1, 2) range must be rejected.
        # NOTE(review): presumably because it spans a single variable - confirm
        # against RangeOperationTerminal.
        variable_type_indices = [(1, 2)]
        names = ['xtdlta' + str(x) for x in range(2)]
        with pytest.raises(ValueError):
            term.initialize_parameters(variable_type_indices, names)
        # A range whose start lies after its end must be rejected.
        variable_type_indices = [(2, 0)]
        names = ['xtdlta' + str(x) for x in range(2)]
        with pytest.raises(ValueError):
            term.initialize_parameters(variable_type_indices, names)
        # A (1, 3) range must be accepted without raising.
        variable_type_indices = [(1, 3)]
        names = ['xtdlta' + str(x) for x in range(2)]
        term.initialize_parameters(variable_type_indices, names)
        # Two variable types with their own ranges must be accepted as well.
        variable_type_indices = [(1, 3), (7, 30)]
        names = ['xtdlta' + str(x) for x in range(2)] + ['ytdlta' + str(x) for x in range(7, 30)]
        term.initialize_parameters(variable_type_indices, names)

    def test_initialize_manually(self):
        """Check validation of explicitly supplied operation and range names."""
        term = sp.RangeOperationTerminal()
        variable_type_indices = [(0, 2)]
        names = ['xtdlta' + str(x) for x in range(2)]
        # The operation 'cat' must be rejected even with valid range names.
        with pytest.raises(ValueError):
            term.initialize_parameters(variable_type_indices, names, operation='cat', begin_range_name='xtdlta0',
                                       end_range_name='xtdlta1')
        # A begin name that is not in ``names`` ('xtlta0') must be rejected.
        with pytest.raises(ValueError):
            term.initialize_parameters(variable_type_indices, names, operation='cat', begin_range_name='xtlta0',
                                       end_range_name='xtdlta1')
        # An end name that is not in ``names`` ('xtdlt1') must be rejected.
        with pytest.raises(ValueError):
            term.initialize_parameters(variable_type_indices, names, operation='cat', begin_range_name='xtdlta0',
                                       end_range_name='xtdlt1')
        # The operation 'max' with names taken from ``names`` must be accepted.
        variable_type_indices = [(0, 2)]
        names = ['xtdlta' + str(x) for x in range(2)]
        term.initialize_parameters(variable_type_indices, names, operation='max', begin_range_name='xtdlta0',
                                   end_range_name='xtdlta1')
        # The same holds for a sub-range of a longer list of names.
        variable_type_indices = [(0, 7)]
        names = ['xtdlta' + str(x) for x in range(7)]
        term.initialize_parameters(variable_type_indices, names, operation='max', begin_range_name='xtdlta3',
                                   end_range_name='xtdlta4')
46 |
--------------------------------------------------------------------------------