├── algo
│   ├── ebs
│   │   ├── __init__.py
│   │   ├── eq_functions.py
│   │   ├── eq.py
│   │   └── eq_node.py
│   ├── gene.py
│   ├── operators
│   │   ├── mutation.py
│   │   ├── crossover.py
│   │   ├── next_generation.py
│   │   └── fitness.py
│   ├── population.py
│   ├── multi_tpot_analysis.py
│   ├── genetic_algorithm_feature_selection.py
│   ├── genetic_algorithm_symbolic_fit.py
│   └── equation_brute_search.py
├── experiments
│   ├── __init__.py
│   ├── exp_steady_free_fall_with_drag_case_1.py
│   ├── exp_steady_free_fall_with_drag_case_2.py
│   ├── exp_steady_free_fall_with_drag_case_2_with_educated_guess.py
│   ├── exp_steady_free_fall_with_drag_case_3.py
│   ├── exp_constant_acceleration.py
│   ├── exp_drag_force.py
│   ├── exp_steady_free_fall_with_drag.py
│   └── exp_noise.py
├── data
│   └── Readme.md
├── requirements.txt
├── Dockerfile
├── setup.py
├── LICENSE
├── demo
│   ├── demo.py
│   └── demo.csv
├── utills
│   ├── logger_config.py
│   ├── fitness_methods.py
│   ├── tpot_results_extractor.py
│   ├── symbolic_regression_to_latex_text.py
│   ├── consts.py
│   ├── result_tracker.py
│   └── plotter.py
├── data_generators
│   ├── N_frequency_generator.py
│   ├── constant_acceleration_data_generator.py
│   ├── drag_force_data_generator.py
│   └── steady_free_fall_with_drag_data_generator.py
├── running_sciemed.py
├── .gitignore
├── paper_exp_runner.py
├── README.md
└── scimed.py

/algo/ebs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/Readme.md:
--------------------------------------------------------------------------------
1 | This folder will contain the data generated for training
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | numpy
3 | matplotlib
4 | seaborn
5 | scikit-learn
6 | scipy
7 | TPOT
8 | gplearn
9 | torch
10 | termcolor
11 | sympy
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 |
3 | ARG PYTHON_VERSION=3.10
4 |
5 | FROM python:${PYTHON_VERSION}
6 |
7 | WORKDIR /src
8 |
9 | COPY requirements.txt /src/requirements.txt
10 | RUN pip install --no-cache-dir -r requirements.txt
11 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import setup, find_packages
4 |
5 | setup(name='SciMED',
6 |       version='1.0',
7 |       description='A computational framework for finding symbolic expressions from physical datasets.',
8 |       author='Liron Simon Keren, Alex Liberzon, Teddy Lazebnik',
9 |       author_email='lazebnik.teddy@gmail.com',
10 |       url='https://github.com/LironSimon/SciMED',
11 |       packages=find_packages(),
12 |       )
13 |
--------------------------------------------------------------------------------
/algo/ebs/eq_functions.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import numpy as np
3 | import pandas as pd
4 |
5 | # project import
6 |
7 |
8 | def add(a: pd.Series,
9 |         b: pd.Series):
10 |     return a + b
11 |
12 |
13 | def sub(a: pd.Series,
14 |         b: pd.Series):
15 |     return a - b
16 |
17 |
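# Note: div (below) sanitizes its output -- NaN values (from 0/0) and ±inf
# values (from division by zero) are mapped to 0, so candidate equations
# never propagate invalid values through the brute-force search.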
18 | def div(a: pd.Series,
19 |         b: pd.Series):
20 |     return (a / b).fillna(0).replace([np.inf, -np.inf], 0)
21 |
22 |
23 | def mul(a: pd.Series,
24 |         b: pd.Series):
25 |     return a * b
26 |
27 |
28 | FUNCTION_MAPPER = {
29 |     add: "add",
30 |     sub: "sub",
31 |     mul: "mul",
32 |     div: "div"
33 | }
34 |
35 | FUNCTION_LIST = [add, sub, mul, div]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Teddy Lazebnik
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/demo/demo.py:
--------------------------------------------------------------------------------
1 | """
2 | This file provides a demo on how to use SciMED on a captured dataset that is stored in a CSV file format
3 | """
4 |
5 | """
6 | 1. Needed library imports
7 | """
8 | import os
9 | import pandas as pd
10 | from sklearn.model_selection import train_test_split
11 | """
12 | 2. Import SciMED's instance
13 | """
14 | from scimed import scimed
15 | """
16 | 3. Load the data into a pandas.DataFrame and split it into the source and target features
17 | """
18 | df = pd.read_csv("demo.csv")
19 | Y_COL_NAME = ""  # set this to the name of the target column in demo.csv
20 | x = df.drop([Y_COL_NAME], axis=1)
21 | y = df[Y_COL_NAME]
22 | x_train, x_test, y_train, y_test = train_test_split(x,
23 |                                                     y,
24 |                                                     test_size=0.2,  # most of the time, we divide 80%-20%
25 |                                                     random_state=73)  # Sheldon's number - just for fun
26 | """
27 | 4. Run SciMED and observe the results in the 'results_folder'
28 | """
29 | scimed.run(train_data_x=x_train,
30 |            train_data_y=y_train,
31 |            test_data_x=x_test,
32 |            test_data_y=y_test,
33 |            results_folder=os.path.join(os.path.dirname(__file__), "results")
34 |            )
--------------------------------------------------------------------------------
/utills/logger_config.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import logging
3 | from termcolor import colored
4 |
5 | # project imports
6 |
7 |
8 | class Logger:
9 |     """
10 |     Our project logger
11 |     """
12 |
13 |     logger = logging.getLogger()
14 |     logging.getLogger('matplotlib.font_manager').disabled = True
15 |     logger.setLevel(logging.DEBUG)
16 |
17 |     def __init__(self, save_path):
18 |         logging.basicConfig(filename=save_path,
19 |                             format='%(asctime)s %(message)s',
20 |                             filemode='w')
21 |
22 |     @staticmethod
23 |     def print(message: str):
24 |         Logger.logger.info(message)
25 |         print("Info: {}".format(message))
26 |
27 |     @staticmethod
28 |     def info(message: str):
29 |         Logger.print(message=message)
30 |
31 |     @staticmethod
32 |     def important(message: str):
33 |         Logger.logger.critical(message)
34 |         print("Important: {}".format(colored(message, attrs=["bold"])))  # "bold" is an attribute, not a color
35 |
36 |     @staticmethod
37 |     def debug(message: str):
38 |         Logger.logger.debug(message)
39 |         print("Debug: {}".format(colored(message, "red")))
40 |
41 |     def __repr__(self):
42 |         return self.__str__()
43 |
44 |     def __str__(self):
45 |         return "<Logger>"
46 |
--------------------------------------------------------------------------------
/algo/gene.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | from random import randint
3 | import pandas as pd
4 |
5 | # project imports
6 |
7 |
8 | class Gene:
9 |     """
10 |     A gene to represent a subset of features,
11 |     ensuring there is only one feature representation
12 |     from each group of similarly-created features.
13 |     """
14 |
15 |     def __init__(self,
16 |                  feature_indexes: list,
17 |                  scores: pd.DataFrame = pd.DataFrame(),
18 |                  fitness: float = 0,
19 |                  model_object=None):
20 |         self.feature_indexes = feature_indexes
21 |         self.scoring_history = scores
22 |         self.fitness = fitness
23 |         self.model_object = model_object
24 |
25 |
26 |     def __repr__(self):
27 |         return self.__str__()
28 |
29 |     def __str__(self):
30 |         return "<Gene: feature_indexes={}, fitness={}>".format(self.feature_indexes,
31 |                                                                self.fitness)
32 |
33 |     @staticmethod
34 |     def random(feature_indexes_ranges: list,
35 |                feature_count: int):
36 |         return Gene(feature_indexes=[randint(feature_indexes_ranges[i][0],
37 |                                              feature_indexes_ranges[i][1])
38 |                                      for i in range(feature_count)])
39 |
40 |     def length(self):
41 |         return len(self.feature_indexes)
42 |
--------------------------------------------------------------------------------
/algo/operators/mutation.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import numpy as np
3 | from random import randint, random
4 |
5 | # project imports
6 | from algo.population import Population
7 |
8 |
9 | class Mutation:
10 |     """
11 |     mutation operations class
12 |     """
13 |
14 |     def __init__(self):
15 |         pass
16 |
17 |     @staticmethod
18 |     def simple(population: Population,
19 |                feature_indexes_ranges: list,
20 |                mutation_rate: float,
21 |                w: list = None):
22 |         """
23 |         Just a simple random mutation: changes the selected feature
24 |         of one feature-group to a different feature from the same group.
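        For example, with feature_indexes_ranges = [[0, 2], [3, 5]], a gene
        holding [1, 4] may become [1, 5]: one position is drawn (weighted by w,
        uniform by default) and re-sampled from that group's index range.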
25 |         """
26 |         if w is None:
27 |             w = [1 / len(population[0].feature_indexes) for _ in range(len(population[0].feature_indexes))]
28 |         w = np.asarray(w)
29 |         w = w / w.sum()
30 |         for gene in population:
31 |             if random() < mutation_rate:
32 |                 pick_index = np.random.choice(range(len(gene.feature_indexes)),
33 |                                               1,
34 |                                               p=w)[0]
35 |                 gene.feature_indexes[pick_index] = randint(feature_indexes_ranges[pick_index][0],
36 |                                                            feature_indexes_ranges[pick_index][1])
37 |         return population
38 |
39 |     def __repr__(self):
40 |         return self.__str__()
41 |
42 |     def __str__(self):
43 |         return "<Mutation>"
44 |
--------------------------------------------------------------------------------
/data_generators/N_frequency_generator.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import numpy as np
3 | import pandas as pd
4 |
5 | # project imports
6 | from utills.consts import *
7 |
8 |
9 | class NFrequencyGenerator:
10 |     """
11 |     This class is responsible for creating all possible combinations
12 |     representing a fluid frequency N [1/s].
13 |     """
14 |
15 |     def __init__(self):
16 |         pass
17 |
18 |     @staticmethod
19 |     def add_all_combos(df: pd.DataFrame,
20 |                        g: float):
21 |         """
22 |         Single entry point.
23 |         Adds all possible combinations of N as new columns in a given df.
24 |         Returns the modified df and a list of suffixes that indicate
25 |         how N was calculated.
26 |         """
27 |         for suff in N_FREQ_SUFFIX:
28 |             # choose rho_up
29 |             if suff[-2] == '1':
30 |                 rho_up = df["rhop"] - df["rhoa"]
31 |             elif suff[-2] == '2':
32 |                 rho_up = df["rhop"]
33 |             else:
34 |                 rho_up = df["rhoa"]
35 |
36 |             # choose rho_down
37 |             if suff[-1] == '1':
38 |                 rho_down = 0.5 * (df["rhop"] + df["rhoa"])
39 |             elif suff[-1] == '2':
40 |                 rho_down = df["rhop"]
41 |             else:
42 |                 rho_down = df["rhoa"]
43 |
44 |             # calc N and add to Ns (.loc, not .at -- .at only accepts scalar labels)
45 |             df.loc[:, "N{}".format(suff)] = np.sqrt((g * rho_up) / (df["d"] * rho_down))
46 |         # return answer
47 |         return df, N_FREQ_SUFFIX
48 |
--------------------------------------------------------------------------------
/algo/population.py:
--------------------------------------------------------------------------------
1 | # library imports
2 |
3 | # project imports
4 | from algo.gene import Gene
5 |
6 |
7 | class Population:
8 |     """
9 |     A population of genes
10 |     """
11 |
12 |     def __init__(self,
13 |                  genes: list = None):
14 |         self.genes = genes if isinstance(genes, list) and len(genes) > 0 else []
15 |
16 |     def __getitem__(self, item):
17 |         return self.genes[item]  # delegate to list.__getitem__
18 |
19 |     def __repr__(self):
20 |         return self.__str__()
21 |
22 |     def __str__(self):
23 |         return "<Population: {} genes>".format(len(self.genes))
24 |
25 |     @staticmethod
26 |     def random(size: int,
27 |                feature_count: int,
28 |                feature_indexes_ranges: list):
29 |         return Population(genes=[Gene.random(feature_indexes_ranges=feature_indexes_ranges,
30 |                                              feature_count=feature_count)
31 |                                  for _ in range(size)])
32 |
33 |     def get_best(self):
34 |         best_gene = self.genes[0]
35 |         best_gene_fitness = self.genes[0].fitness
36 |         for gene in self.genes:
37 |             if gene.fitness > best_gene_fitness:
38 |                 best_gene_fitness = gene.fitness
39 |                 best_gene = gene
40 |         return best_gene
41 |
42 |     def get_scores(self):
43 |         return [gene.fitness for gene in self.genes]
44 |
45 |     def get(self,
46 |             index: int):
47 |         return self.genes[index]
48 |
49 |     def remove(self,
50 |                index: int):
51 |         self.genes.remove(self.genes[index])
52 |
53 |     def add(self,
54 |             gene: Gene):
55 |         self.genes.append(gene)
56 |
57 |     def size(self):
58 |         return len(self.genes)
59 |
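# A minimal usage sketch (the group count and index ranges below are
# illustrative only, not taken from the project):
if __name__ == "__main__":
    pop = Population.random(size=4,
                            feature_count=2,
                            feature_indexes_ranges=[[0, 2], [3, 5]])
    print(pop.size())        # -> 4
    print(pop.get_scores())  # -> [0, 0, 0, 0] until a fitness operator scores the genes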
--------------------------------------------------------------------------------
/algo/operators/crossover.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | from random import randint
3 |
4 | # project imports
5 | from algo.gene import Gene
6 | from algo.population import Population
7 |
8 |
9 | class Crossover:
10 |     """
11 |     crossover operations class
12 |     """
13 |
14 |     def __init__(self):
15 |         pass
16 |
17 |     @staticmethod
18 |     def simple(population: Population):
19 |         """
20 |         Just a simple single-point random crossover:
21 |         returns a new population by randomly picking pairs of gene
22 |         parents + a breaking point, to generate a pair of offspring.
23 |         Note: this assumes an even population size -- with an odd size, the
24 |         last unpaired gene would make the loop below spin forever.
25 |         """
26 |         new_pop = Population()
27 |         while population.size() > 0:
28 |             # pick two different genes
29 |             i = randint(0, population.size() - 1)
30 |             j = randint(0, population.size() - 1)
31 |             while i == j:
32 |                 i = randint(0, population.size() - 1)
33 |                 j = randint(0, population.size() - 1)
34 |             gene_i = population.get(i)
35 |             gene_j = population.get(j)
36 |             # pick a single breaking point
37 |             break_index = randint(1, len(gene_i.feature_indexes) - 2)
38 |             # recall to new list
39 |             new_pop.add(gene=Gene(feature_indexes=gene_i.feature_indexes[:break_index] + gene_j.feature_indexes[break_index:]))
40 |             new_pop.add(gene=Gene(feature_indexes=gene_j.feature_indexes[:break_index] + gene_i.feature_indexes[break_index:]))
41 |             # remove from previous list -- the higher index must be removed first,
42 |             # otherwise removing the lower one shifts the other gene's position by one
43 |             if i < j:
44 |                 population.remove(index=j)
45 |                 population.remove(index=i)
46 |             else:
47 |                 population.remove(index=i)
48 |                 population.remove(index=j)
49 |         return new_pop
50 |
51 |     def __repr__(self):
52 |         return self.__str__()
53 |
54 |     def __str__(self):
55 |         return "<Crossover>"
56 |
--------------------------------------------------------------------------------
/running_sciemed.py:
--------------------------------------------------------------------------------
1 | """
2 | 1. Needed library imports
3 | """
4 | import os
5 | import pandas as pd
6 | from sklearn.preprocessing import MinMaxScaler
7 | from sklearn.model_selection import train_test_split
8 | """
9 | 2. Import SciMED's instance
10 | """
11 | from scimed import scimed
12 | """
13 | 3. Load the data into a pandas.DataFrame, min-max normalize it, and split it into the source and target features
14 | """
15 |
16 |
17 | def run():
18 |     df = pd.read_csv("412_dataset.csv")
19 |     Y_COL_NAME = "tau/t_expected"
20 |     df = (df - df.min()) / (df.max() - df.min())
21 |     x = df.drop([Y_COL_NAME], axis=1)
22 |     y = df[Y_COL_NAME]
23 |     x_train, x_test, y_train, y_test = train_test_split(x,
24 |                                                         y,
25 |                                                         test_size=0.2,  # most of the time, we divide 80%-20%
26 |                                                         random_state=73)  # Sheldon's number - just for fun
27 |     feature_indexes_ranges = [[0, 0], [1, 4], [5, 20], [21, 21], [22, 27], [28, 412]]
28 |     """
29 |     4.
Run SciMED and observe the results in the 'results_folder' 30 | """ 31 | scimed.run(train_data_x=x_train, 32 | train_data_y=y_train, 33 | test_data_x=x_test, 34 | test_data_y=y_test, 35 | results_folder=os.path.join(os.path.dirname(__file__), "results"), 36 | k_fold = 5, 37 | numerical_bool = True, 38 | numerical_run_times = 1, 39 | numerical_generations = 25, 40 | numerical_population = 40, 41 | analytical_bool = False, 42 | force_ebs_bool = True, 43 | ebs_size_range = (3, 13), 44 | feature_indexes_ranges = feature_indexes_ranges, 45 | feature_selection_generations = 30, 46 | feature_selection_pop_size = 26, 47 | feature_selection_mutation_rate = 0.03, 48 | feature_selection_royalty=0.05 49 | ) 50 | 51 | 52 | if __name__ == '__main__': 53 | run() 54 | -------------------------------------------------------------------------------- /algo/operators/next_generation.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import random 3 | 4 | # project imports 5 | from algo.population import Population 6 | 7 | 8 | class NextGeneration: 9 | """ 10 | A next generation metric 11 | """ 12 | 13 | def __init__(self): 14 | pass 15 | 16 | @staticmethod 17 | def tournament_with_royalty(population: Population, 18 | royalty: float): 19 | """ 20 | A tournament next generation with royalty 21 | """ 22 | # calc the probability of selecting a gene to a tournament 23 | sum_fitness = sum(population.get_scores()) 24 | if sum_fitness > 0: 25 | fitness_probabilities = [score / sum_fitness for score in population.get_scores()] 26 | else: 27 | fitness_probabilities = population.get_scores() 28 | # sort the population by probability of selection 29 | genes_with_fitness = zip(fitness_probabilities, population.genes) 30 | genes_with_fitness = sorted(genes_with_fitness, key=lambda x: x[0], reverse=True) 31 | # pick the most probable genes (those with the largest fitness scores) 32 | royalty_pop = [val[1] for val in genes_with_fitness[:round(len(genes_with_fitness) * royalty)]] 33 | # tournament around the other genes 34 | left_genes = [val[1] for val in genes_with_fitness[round(len(genes_with_fitness) * royalty):]] 35 | left_fitness = [val[0] for val in genes_with_fitness[round(len(genes_with_fitness) * royalty):]] 36 | pick_genes = [] 37 | left_count = len(population.genes) - len(royalty_pop) 38 | while len(pick_genes) < left_count: 39 | pick_gene = random.choices(left_genes, weights=left_fitness)[0] 40 | pick_genes.append(pick_gene) 41 | # add the royalty to the genes selected in the tournament 42 | pick_genes = list(pick_genes) 43 | pick_genes.extend(royalty_pop) 44 | return Population(genes=pick_genes) 45 | 46 | def __repr__(self): 47 | return self.__str__() 48 | 49 | def __str__(self): 50 | return "" 51 | -------------------------------------------------------------------------------- /utills/fitness_methods.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import numpy as np 3 | from sklearn.metrics import make_scorer 4 | from gplearn.fitness import make_fitness 5 | from sklearn.metrics import mean_absolute_error, mean_squared_error 6 | 7 | # functions we might want to use as part of the TPOT process 8 | neg_mean_squared_error_scorer = make_scorer(mean_squared_error, greater_is_better=False) 9 | 10 | 11 | def simple_symbolic_reg_fitness(y: np.ndarray, 12 | y_pred: np.ndarray, 13 | sample_weight: np.ndarray = None) -> np.float64: 14 | """ 15 | Just the MSE 16 | :param y_true: the list of baseline 
values to compare with 17 | :param y_pred: the list of model predicted values to evaluate 18 | :return: the error value from 0 to inf 19 | """ 20 | if sample_weight is None: 21 | return mean_squared_error(y_true=y, y_pred=y_pred) 22 | else: 23 | return mean_squared_error(y_true=y, y_pred=y_pred, sample_weight=sample_weight) 24 | 25 | 26 | def better_symbolic_reg_fitness(y: np.ndarray, 27 | y_pred: np.ndarray, 28 | sample_weight: np.ndarray = None) -> np.float64: 29 | """ 30 | Taking ideas from https://arxiv.org/abs/1904.05417 for better overall results 31 | :param y_true: the list of baseline values to compare with 32 | :param y_pred: the list of model predicted values to evaluate 33 | :return: the error value from 0 to inf 34 | """ 35 | if sample_weight is None: 36 | return mean_squared_error(y_true=y, y_pred=y_pred) + mean_absolute_error(y_true=y, y_pred=y_pred) + np.max(y-y_pred) 37 | else: 38 | return mean_squared_error(y_true=y, y_pred=y_pred, sample_weight=sample_weight) + mean_absolute_error(y_true=y, y_pred=y_pred, sample_weight=sample_weight) + np.max(y-y_pred) 39 | 40 | 41 | # common functions 42 | function_mapper = { 43 | "simple_symbolic_reg_fitness": make_fitness(function=simple_symbolic_reg_fitness, greater_is_better=False), 44 | "better_symbolic_reg_fitness": make_fitness(function=better_symbolic_reg_fitness, greater_is_better=False) 45 | } 46 | -------------------------------------------------------------------------------- /data_generators/constant_acceleration_data_generator.py: -------------------------------------------------------------------------------- 1 | # project imports 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # library imports 6 | 7 | 8 | class ConstantAccelerationDataGenerator: 9 | """ 10 | This class is responsible for the generation of measurements of motion with 11 | constant acceleration to test the model. 12 | """ 13 | 14 | # CONSTS # 15 | 16 | # END - CONSTS # 17 | 18 | def __init__(self): 19 | pass 20 | 21 | # Logic - start # 22 | 23 | @staticmethod 24 | def generate(samples: int, 25 | a_range: tuple, 26 | t_range: tuple, 27 | v0_range: tuple, 28 | noise_range: tuple, 29 | save_path: str): 30 | """ 31 | Generate a pandas dataframe of experiments to represent motion with constant acceleration. 32 | We assume we sample 3 parameters: 33 | v0: initial velocity [m/s] 34 | a: acceleration [m/s2] 35 | t: time pass [s] 36 | and calculate with them: 37 | v: velocity at time t [m/s] 38 | via v = v0 + a * t. 
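        For example, v0 = 2.0 [m/s], a = 3.0 [m/s2] and t = 4.0 [s] give
        v = 2.0 + 3.0 * 4.0 = 14.0 [m/s], before the multiplicative noise
        term is applied.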
39 | """ 40 | a_range_delta = a_range[1] - a_range[0] 41 | t_range_delta = t_range[1] - t_range[0] 42 | v0_range_delta = v0_range[1] - v0_range[0] 43 | noise_range_delta = noise_range[1] - noise_range[0] 44 | data = [] 45 | # generate samples 46 | for i in range(samples): 47 | a = round(np.random.random_sample() * a_range_delta + a_range[0], 2) 48 | v0 = round(np.random.random_sample() * v0_range_delta + v0_range[0], 2) 49 | t = round(np.random.random_sample() * t_range_delta + t_range[0], 2) 50 | noise = round(np.random.random_sample() * noise_range_delta + noise_range[0], 2) * np.random.choice((-1, 1)) 51 | v_sampled = v0 + a * t 52 | v_sampled = round(v_sampled * (1 + noise), 2) 53 | data.append([v0, a, t, v_sampled]) 54 | # make a Pandas.DataFrame and save it as a CSV file 55 | pd.DataFrame(data=data, columns=["v0", "a", "t", "v"]).to_csv(save_path, index=False) 56 | 57 | # Logic - end # 58 | -------------------------------------------------------------------------------- /utills/tpot_results_extractor.py: -------------------------------------------------------------------------------- 1 | # library import 2 | import os 3 | import re 4 | import json 5 | from glob import glob 6 | 7 | 8 | # project import 9 | 10 | 11 | class TPOTresultsExtractor: 12 | """ 13 | This class is responsible to get a file or folder with files produced by the 14 | MultiTPOTrunner library with pipeline configurations and extract only the description of the pipeline itself 15 | """ 16 | 17 | # CONSTS # 18 | OPEN_DIVIDER = "make_pipeline(" 19 | CLOSE_DIVIDER = "exported_pipeline." 20 | # END - CONSTS # 21 | 22 | def __init__(self): 23 | pass 24 | 25 | @staticmethod 26 | def process_folder(folder_path: str, 27 | answer_path: str = None): 28 | """ 29 | :param folder_path: path to a folder with MultiTPOTrunner produced <>.py files 30 | :param answer_path: path to the results, if None, just return the string with the answer 31 | """ 32 | answer = [TPOTresultsExtractor.process_file(file_path=file_path, 33 | answer_path=None) 34 | for file_path in glob(os.path.join(folder_path, "*.py"))] 35 | if answer_path is not None: 36 | with open(answer_path, "w") as answer_file: 37 | json.dump(answer, answer_file) 38 | else: 39 | return answer 40 | 41 | @staticmethod 42 | def process_file(file_path: str, 43 | answer_path: str = None): 44 | """ 45 | :param file_path: path to the MultiTPOTrunner produced <>.py file 46 | :param answer_path: path to the results, if None, just return the string with the answer 47 | """ 48 | data_text = "" 49 | with open(file_path, "r") as input_file: 50 | data_text = input_file.read() 51 | open_index = data_text.find(TPOTresultsExtractor.OPEN_DIVIDER) 52 | close_index = data_text.find(TPOTresultsExtractor.CLOSE_DIVIDER) 53 | pipeline = data_text[open_index + len(TPOTresultsExtractor.OPEN_DIVIDER):close_index].strip().replace("\n", "") 54 | while ", " in pipeline: 55 | pipeline = pipeline.replace(", ", ",") 56 | if answer_path is not None: 57 | with open(answer_path, "w") as result_file: 58 | result_file.write(pipeline) 59 | else: 60 | return pipeline 61 | -------------------------------------------------------------------------------- /experiments/exp_steady_free_fall_with_drag_case_1.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | 3 | # project imports 4 | from utills.consts import * 5 | from experiments.exp_steady_free_fall_with_drag import ExpSFF 6 | from data_generators.steady_free_fall_with_drag_data_generator import SFFWDdataGenerator 
7 | 8 | 9 | class ExpSFF1(ExpSFF): 10 | """ 11 | The first case of the SFF - 12 | Program receives a dataset that is missing an essential feature 13 | (particle velocity), that is needed to deduce the target (drag coefficient). 14 | 15 | Failure of both numerical and analytical parts of the program prove 16 | that if the user neglects to measure a key physical component in the 17 | unknown physical phenomena, the user is alerted by bad results. 18 | """ 19 | 20 | def __init__(self): 21 | ExpSFF.__init__(self) 22 | 23 | @staticmethod 24 | def perform(numerical_bool: bool, 25 | analytical_bool: bool, 26 | force_ebs_bool: bool): 27 | """ 28 | Entry point 29 | """ 30 | ExpSFF.run(numerical_bool=numerical_bool, 31 | analytical_bool=analytical_bool, 32 | force_ebs_bool=force_ebs_bool, 33 | results_folder=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_1_RESULTS_FOLDER_NAME), 34 | data_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_1_DATA_FOLDER_NAME), 35 | data_generation_function=SFFWDdataGenerator.generate_case_1, 36 | numerical_run_times=SFF_NUMERICAL_RUN_TIMES, 37 | numerical_generations=SFF_NUMERICAL_GENERATION_COUNT, 38 | numerical_population=SFF_NUMERICAL_POP_SIZE, 39 | analytical_run_times=SFF_ANALYTICAL_RUN_TIMES, 40 | analytical_generations=SFF_ANALYTICAL_GENERATION_COUNT, 41 | analytical_population=SFF_NUMERICAL_POP_SIZE, 42 | parsimony_coefficient=SFF_ANALYTICAL_PARSIMONY_COEFFICIENT, 43 | k_fold=K_FOLD, 44 | samples=SFF_NUMERICAL_NUM_SAMPLES, 45 | rhoa_range=SFF_RHOA_RANGE, 46 | rhop_range=SFF_RHOP_RANGE, 47 | nu_range=SFF_NU_RANGE, 48 | re_range=SFF_RE_RANGE, 49 | expected_eq="unknown", 50 | ebs_size_range=SFF_DIMENSIONAL_EBS_SIZE_RANGE_1_2 51 | ) 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | .idea/* 7 | *.png 8 | *.json 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 |
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 | data/*
135 | results/*
136 |
--------------------------------------------------------------------------------
/experiments/exp_steady_free_fall_with_drag_case_2.py:
--------------------------------------------------------------------------------
1 | # library imports
2 |
3 | # project imports
4 | from utills.consts import *
5 | from experiments.exp_steady_free_fall_with_drag import ExpSFF
6 | from data_generators.steady_free_fall_with_drag_data_generator import SFFWDdataGenerator
7 |
8 |
9 | class ExpSFF2(ExpSFF):
10 |     """
11 |     The second case of the SFF -
12 |     Program receives a dataset with all essential features needed
13 |     to deduce a "noisy" target (drag coefficient), except for
14 |     the gravitational acceleration constant.
15 |
16 |     Success of both the numerical and analytical parts of the program proves
17 |     that:
18 |     1) The program is able to learn the non-linear physical relation between
19 |     features, even with noisy data.
20 |     2) The program was able to learn the gravitational acceleration constant.
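    (For steady fall, balancing drag against the net weight gives
    Cd = 4 * g * (rhop - rhoa) * d / (3 * rhoa * V**2), so recovering the
    constant 13.08 ~= 4 * 9.81 / 3 from data amounts to learning g.)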
21 |     """
22 |
23 |     def __init__(self):
24 |         ExpSFF.__init__(self)
25 |
26 |     @staticmethod
27 |     def perform(numerical_bool: bool,
28 |                 analytical_bool: bool,
29 |                 force_ebs_bool: bool):
30 |         """
31 |         Entry point
32 |         """
33 |         ExpSFF.run(numerical_bool=numerical_bool,
34 |                    analytical_bool=analytical_bool,
35 |                    force_ebs_bool=force_ebs_bool,
36 |                    results_folder=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_2_RESULTS_FOLDER_NAME),
37 |                    data_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_2_DATA_FOLDER_NAME),
38 |                    data_generation_function=SFFWDdataGenerator.generate_case_2,
39 |                    numerical_run_times=SFF_NUMERICAL_RUN_TIMES,
40 |                    numerical_generations=SFF_NUMERICAL_GENERATION_COUNT,
41 |                    numerical_population=SFF_NUMERICAL_POP_SIZE,
42 |                    analytical_run_times=SFF_ANALYTICAL_RUN_TIMES,
43 |                    analytical_generations=SFF_ANALYTICAL_GENERATION_COUNT,
44 |                    analytical_population=SFF_NUMERICAL_POP_SIZE,
45 |                    parsimony_coefficient=SFF_ANALYTICAL_PARSIMONY_COEFFICIENT,
46 |                    k_fold=K_FOLD,
47 |                    samples=SFF_NUMERICAL_NUM_SAMPLES,
48 |                    rhoa_range=SFF_RHOA_RANGE,
49 |                    rhop_range=SFF_RHOP_RANGE,
50 |                    nu_range=SFF_NU_RANGE,
51 |                    re_range=SFF_RE_RANGE,
52 |                    expected_eq="13.08 * (rhop - rhoa) * d / (rhoa * V * V)",
53 |                    ebs_size_range=SFF_DIMENSIONAL_EBS_SIZE_RANGE_1_2
54 |                    )
55 |
--------------------------------------------------------------------------------
/experiments/exp_steady_free_fall_with_drag_case_2_with_educated_guess.py:
--------------------------------------------------------------------------------
1 | # library imports
2 |
3 | # project imports
4 | from utills.consts import *
5 | from experiments.exp_steady_free_fall_with_drag import ExpSFF
6 | from data_generators.steady_free_fall_with_drag_data_generator import SFFWDdataGenerator
7 |
8 |
9 | class ExpSFF2WithGuess(ExpSFF):
10 |     """
11 |     Similar to the second case of the SFF, but with educated guesses -
12 |     Program receives a dataset with all essential features needed
13 |     to deduce a "noisy" target (drag coefficient), except for
14 |     the gravitational acceleration constant. To that dataset of
15 |     dimensional features, two dimensional educated guesses are added.
16 |
17 |     Success of both the numerical and analytical parts of the program proves
18 |     that:
19 |     1) Educated guesses may improve results, despite the addition of features.
20 |     2) The program was able to learn the gravitational acceleration constant.
21 | """ 22 | 23 | def __init__(self): 24 | ExpSFF.__init__(self) 25 | 26 | @staticmethod 27 | def perform(numerical_bool: bool, 28 | analytical_bool: bool, 29 | force_ebs_bool: bool): 30 | """ 31 | Entry point 32 | """ 33 | ExpSFF.run(numerical_bool=numerical_bool, 34 | analytical_bool=analytical_bool, 35 | force_ebs_bool=force_ebs_bool, 36 | results_folder=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_2_WITH_GUESS_RESULTS_FOLDER_NAME), 37 | data_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_2_WITH_GUESS_DATA_FOLDER_NAME), 38 | data_generation_function=SFFWDdataGenerator.generate_case_2_with_guess, 39 | numerical_run_times=SFF_NUMERICAL_RUN_TIMES, 40 | numerical_generations=SFF_NUMERICAL_GENERATION_COUNT, 41 | numerical_population=SFF_NUMERICAL_POP_SIZE, 42 | analytical_run_times=SFF_ANALYTICAL_RUN_TIMES, 43 | analytical_generations=SFF_ANALYTICAL_GENERATION_COUNT, 44 | analytical_population=SFF_NUMERICAL_POP_SIZE, 45 | parsimony_coefficient=SFF_ANALYTICAL_PARSIMONY_COEFFICIENT, 46 | k_fold=K_FOLD, 47 | samples=SFF_NUMERICAL_NUM_SAMPLES, 48 | rhoa_range=SFF_RHOA_RANGE, 49 | rhop_range=SFF_RHOP_RANGE, 50 | nu_range=SFF_NU_RANGE, 51 | re_range=SFF_RE_RANGE, 52 | expected_eq="13.08 * (rhop - rhoa) * d / (rhoa * V * V)", 53 | ebs_size_range=SFF_DIMENSIONAL_EBS_SIZE_RANGE_2_easy 54 | ) 55 | -------------------------------------------------------------------------------- /algo/operators/fitness.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import pandas as pd 3 | 4 | # project imports 5 | from algo.population import Population 6 | from utills.logger_config import Logger 7 | from algo.multi_tpot_analysis import MultiTPOTrunner 8 | 9 | 10 | class Fitness: 11 | """ 12 | An AutoML wrapper class that responsible to find the best 13 | ML model + hyperparameters for a given dataset 14 | """ 15 | 16 | def __init__(self): 17 | pass 18 | 19 | @staticmethod 20 | def tpot(run_times: int, 21 | train_data_x: pd.DataFrame, 22 | train_data_y: pd.DataFrame, 23 | test_data_x: pd.DataFrame, 24 | test_data_y: pd.DataFrame, 25 | generations: int, 26 | population: Population, 27 | population_size: int, 28 | k_fold: int, 29 | performance_metric, 30 | save_dir: str, 31 | n_jobs: int = -1): 32 | """ 33 | Loops over all genes in the population, reduces the dataset 34 | according to their sequence, and uses a TPOTRegressor to 35 | find the best ML model + hyperparameters for the reduced data. 36 | 37 | Saves the fittness score and best ML model to the memory of each gene, 38 | and returns the whole population. 
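        Note: gene.fitness is set from best_model.score(...) on the held-out
        test data, which follows the scikit-learn scorer convention of
        "greater is better" -- consistent with the maximisation done in
        Population.get_best.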
39 | """ 40 | for gene_index, gene in enumerate(population.genes): 41 | Logger.print("\nAssigning to gene #{}/{}:".format(gene_index, population.size()-1)) 42 | # reduce data according to gene sequence 43 | reduced_train_data_x = train_data_x.iloc[:, gene.feature_indexes] 44 | reduced_test_data_x = test_data_x.iloc[:, gene.feature_indexes] 45 | # run TPOT analysis multiple times on the reduced data 46 | results, best_model = MultiTPOTrunner.run_and_analyze(run_times=run_times, 47 | train_data_x=reduced_train_data_x, 48 | train_data_y=train_data_y, 49 | test_data_x=reduced_test_data_x, 50 | test_data_y=test_data_y, 51 | generations=generations, 52 | population_size=population_size, 53 | k_fold=k_fold, 54 | performance_metric=performance_metric, 55 | save_dir=save_dir, 56 | n_jobs=n_jobs) 57 | # save the best performing model & scoring history to gene's data 58 | gene.model_object = best_model 59 | gene.scoring_history = results 60 | # assign fittness acc to performance score of the model 61 | gene.fitness = best_model.score(reduced_test_data_x, test_data_y) 62 | return population 63 | 64 | def __repr__(self): 65 | return self.__str__() 66 | 67 | def __str__(self): 68 | return "" 69 | -------------------------------------------------------------------------------- /experiments/exp_steady_free_fall_with_drag_case_3.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | 3 | # project imports 4 | from utills.consts import * 5 | from experiments.exp_steady_free_fall_with_drag import ExpSFF 6 | from data_generators.steady_free_fall_with_drag_data_generator import SFFWDdataGenerator 7 | 8 | 9 | class ExpSFF3(ExpSFF): 10 | """ 11 | The third case of the SFF - 12 | Program receives a dataset with all non-dimensional 13 | combinations of possible dimensional variables, relating to the 14 | target (a total of 34 features created from 5 variables). 15 | 16 | The program selects the best feature from each group of 17 | similar non-dimensional numbers, to create an improved 18 | dataset. This is then used to find numerical and analytical relations. 19 | 20 | Success of both numerical and analytical parts of the program proves 21 | that: 22 | 1) The program is capable of discovering the governing non-dimensional 23 | numbers that explain the physical phenomena, with no prior physical 24 | knowledge given by the user. 25 | 2) The program is able to learn the physical relation between 26 | non-dimensional features and the target 27 | 3) The program was able to learn the gravitational acceleration constant. 
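    (The expected non-dimensional relation is
    Cd = (4/3) * (delta_rho/rhoa) * (g*d/V**2), i.e. the coefficient
    1.33 ~= 4/3, with the density ratio and the Froude-like group g*d/V**2
    acting as the governing non-dimensional numbers.)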
28 | """ 29 | 30 | def __init__(self): 31 | ExpSFF.__init__(self) 32 | 33 | @staticmethod 34 | def perform(numerical_bool: bool, 35 | analytical_bool: bool, 36 | force_ebs_bool: bool): 37 | """ 38 | Entry point 39 | """ 40 | ExpSFF.run(numerical_bool=numerical_bool, 41 | analytical_bool=analytical_bool, 42 | force_ebs_bool=force_ebs_bool, 43 | results_folder=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_3_RESULTS_FOLDER_NAME), 44 | data_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_3_DATA_FOLDER_NAME), 45 | data_generation_function=SFFWDdataGenerator.generate_case_3, 46 | numerical_run_times=SFF_NUMERICAL_RUN_TIMES, 47 | numerical_generations=SFF_NUMERICAL_GENERATION_COUNT, 48 | numerical_population=SFF_NUMERICAL_POP_SIZE, 49 | analytical_run_times=SFF_ANALYTICAL_RUN_TIMES, 50 | analytical_generations=SFF_ANALYTICAL_GENERATION_COUNT, 51 | analytical_population=SFF_NUMERICAL_POP_SIZE, 52 | parsimony_coefficient=SFF_ANALYTICAL_PARSIMONY_COEFFICIENT, 53 | k_fold=K_FOLD, 54 | samples=SFF_NUMERICAL_NUM_SAMPLES, 55 | rhoa_range=SFF_RHOA_RANGE, 56 | rhop_range=SFF_RHOP_RANGE, 57 | nu_range=SFF_NU_RANGE, 58 | re_range=SFF_RE_RANGE, 59 | expected_eq="1) 1.33 * (delta_rho/rhoa) * (g*d/V**2)", 60 | feature_selection_generations=FEATURE_SELECTION_GENERATIONS_COUNT, 61 | feature_selection_pop_size=FEATURE_SELECTION_POP_SIZE, 62 | feature_selection_mutation_rate=FEATURE_SELECTION_MUTATION_RATE, 63 | feature_selection_royalty=FEATURE_SELECTION_ROYALTY, 64 | ebs_size_range=SFF_DIMENSIONAL_EBS_SIZE_RANGE_3 65 | ) 66 | -------------------------------------------------------------------------------- /algo/multi_tpot_analysis.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import os 3 | import pickle 4 | import pandas as pd 5 | from scipy import stats 6 | from tpot import TPOTRegressor 7 | from sklearn.model_selection import KFold 8 | from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error 9 | 10 | # project imports 11 | from utills.logger_config import Logger 12 | from utills.tpot_results_extractor import TPOTresultsExtractor 13 | 14 | 15 | class MultiTPOTrunner: 16 | """ 17 | This class is responsible for generating a numerical 18 | prediction of a target value from a given set of features, 19 | using a TPOTRegressor. 20 | 21 | Input dataset is used to train and test the model 22 | multiple times (named as run_times), to gain statistical 23 | insight on the performance. 24 | """ 25 | 26 | def __init__(self): 27 | pass 28 | 29 | @staticmethod 30 | def run_and_analyze(run_times: int, 31 | train_data_x: pd.DataFrame, 32 | train_data_y: pd.DataFrame, 33 | test_data_x: pd.DataFrame, 34 | test_data_y: pd.DataFrame, 35 | generations: int, 36 | population_size: int, 37 | k_fold: int, 38 | performance_metric, 39 | save_dir: str, 40 | n_jobs: int = -1): 41 | """ 42 | Run the TPOTRegressor algorithm with some hyper-parameters 43 | for multiple times and analyze the stability of the results. 44 | Returns a pandas dataframe of all results and the best model from 45 | all runs. 
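    The "best" model is the one with the lowest performance_metric value on
    the held-out test split, i.e. the metric is treated as an error score
    (lower is better).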
46 | """ 47 | # const 48 | tpot_model_file_path = os.path.join(save_dir, 'current_tpot_pipeline.py') 49 | tpot_object_file_path = os.path.join(save_dir, 'current_tpot_pipeline_as_object') 50 | # prepare DF for results 51 | best_model = None 52 | results = pd.DataFrame() 53 | current_best_performance_score = 99999 54 | for test in range(run_times): 55 | Logger.print("TPOT run {}/{}".format(test + 1, run_times)) 56 | model = TPOTRegressor(generations=generations, 57 | population_size=population_size, 58 | cv=KFold(n_splits=k_fold), 59 | scoring=performance_metric, 60 | verbosity=2, 61 | n_jobs=n_jobs) 62 | model.fit(train_data_x, train_data_y) 63 | pred = model.predict(test_data_x) 64 | # store test scores 65 | try: 66 | # we assume this is just a function 67 | performance_score = performance_metric(test_data_y, pred) 68 | except: 69 | # maybe it is a scorer wrapper of a function and we want to overcome it 70 | performance_score = performance_metric._score_func(test_data_y, pred) 71 | results.at[test, "performance_score"] = performance_score 72 | results.at[test, "mae"] = mean_absolute_error(test_data_y, pred) 73 | results.at[test, "mse"] = mean_squared_error(test_data_y, pred) 74 | results.at[test, "r2"] = r2_score(test_data_y, pred) 75 | results.at[test, "t_test_p_value"] = stats.ttest_ind(test_data_y, pred)[1] 76 | # store exported pipeline 77 | model.export(tpot_model_file_path) 78 | pipeline = TPOTresultsExtractor.process_file(tpot_model_file_path) 79 | results.at[test, 'pipeline'] = pipeline 80 | # update best mae score and model 81 | if performance_score < current_best_performance_score: 82 | best_model = model 83 | current_best_performance_score = performance_score 84 | # remove unnecessary file 85 | os.remove(tpot_model_file_path) 86 | # Logger.print and save scoring results of all runs 87 | Logger.print("\nFinished all MultiTPOT runner runs") 88 | [Logger.print("{}: {:.3}+-{:.3}".format(score, results[score].mean(), results[score].std())) for score in results.keys() if score != "pipeline"] 89 | return results, best_model 90 | 91 | def __repr__(self): 92 | return self.__str__() 93 | 94 | def __str__(self): 95 | return "" 96 | -------------------------------------------------------------------------------- /algo/ebs/eq.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # project imports 6 | from algo.ebs.eq_node import EqNode 7 | 8 | 9 | class Eq: 10 | """ 11 | This class represents an equation that constructed from simple functions 12 | """ 13 | 14 | def __init__(self, 15 | tree: EqNode): 16 | self.tree = tree 17 | self.linear_a = 1 18 | self.linear_b = 0 19 | 20 | def eval(self, 21 | x_values: pd.DataFrame) -> pd.Series: 22 | """ 23 | Eval the y_pred from the input 24 | :param x_values: the input 25 | """ 26 | return self.linear_a * self.tree.eval(x_values=x_values) + self.linear_b 27 | 28 | def predict(self, 29 | x_values: pd.DataFrame) -> pd.Series: 30 | """ 31 | Eval the y_pred from the input 32 | :param x_values: the input 33 | """ 34 | return self.linear_a * self.tree.eval(x_values=x_values) + self.linear_b 35 | 36 | def fix_nodes(self) -> None: 37 | """ fix nodes' is_leaf flag if has been corrupted by other process """ 38 | return self.tree.fix_node() 39 | 40 | def to_id_str(self) -> str: 41 | """ print equation in a narrow way for hash mapping """ 42 | return self.tree.to_id_str() 43 | 44 | def to_string(self) -> str: 45 | """ print the node as a string """ 46 | if 
self.linear_a == 1: 47 | if self.linear_b > 0: 48 | return "add({}, {:.3f})".format(self.tree.to_string(), self.linear_b) 49 | elif self.linear_b < 0: 50 | return "sub({}, {:.3f})".format(self.tree.to_string(), -1*self.linear_b) 51 | else: 52 | return "{}".format(self.tree.to_string()) 53 | else: 54 | if self.linear_b > 0: 55 | return "add(mul({:.3f}, {}), {:.3f})".format(self.linear_a, self.tree.to_string(), self.linear_b) 56 | elif self.linear_b < 0: 57 | return "sub(mul({:.3f}, {}), {:.3f})".format(self.linear_a, self.tree.to_string(), -1*self.linear_b) 58 | else: 59 | return "mul({:.3f}, {})".format(self.linear_a, self.tree.to_string()) 60 | 61 | def size(self) -> int: 62 | """ calc the size of the equation """ 63 | return self.tree.size() 64 | 65 | def populate(self, 66 | not_leaf_values: list, 67 | leaf_values: list) -> list: 68 | """ provide a list with all possible combinations """ 69 | # set index to all and get which one is leaf and not leaf 70 | leaf_dict = {} 71 | self.tree.set_index(leaf_dict) 72 | possible_allocations_list = [leaf_values if is_leaf else not_leaf_values 73 | for index, is_leaf in leaf_dict.items()] 74 | possible_allocations_index_list = [len(val) for val in possible_allocations_list] 75 | combinations_count = np.prod(possible_allocations_index_list) 76 | # run on all possible permutations 77 | answer = [] 78 | for index in range(combinations_count): 79 | allocation_option = [0 for _ in range(len(possible_allocations_index_list))] 80 | current_index = index 81 | set_index = 0 82 | while current_index != 0: 83 | this_val = current_index % possible_allocations_index_list[set_index] 84 | allocation_option[set_index] = this_val 85 | current_index = current_index // possible_allocations_index_list[set_index] 86 | set_index += 1 87 | answer.append(Eq(tree=self.tree._copy_and_put_values(allocation={index: possible_allocations_list[index][val] for index, val in enumerate(allocation_option)}))) 88 | return answer 89 | 90 | @staticmethod 91 | def all_possible_fbt(n: int) -> list: 92 | """ Return all full binary trees of inputted size 'n' """ 93 | return [Eq(tree=eq) for eq in EqNode.all_possible_fbt(n=n)] 94 | 95 | def __repr__(self): 96 | return self.__str__() 97 | 98 | def __str__(self): 99 | return "".format(self.linear_a, 100 | self.tree.__str__(), 101 | self.linear_b) 102 | -------------------------------------------------------------------------------- /data_generators/drag_force_data_generator.py: -------------------------------------------------------------------------------- 1 | # project imports 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # library imports 6 | 7 | 8 | class DragForceDataGenerator: 9 | """ 10 | This class generates measurements of aerodynamic drag force Fd, 11 | exerted on a sphere. 12 | """ 13 | 14 | # CONSTS # 15 | 16 | # END - CONSTS # 17 | 18 | def __init__(self): 19 | pass 20 | 21 | # Logic - start # 22 | 23 | @staticmethod 24 | def generate(samples: int, 25 | cd_range: tuple, 26 | rhoa_range: tuple, 27 | v_range: tuple, 28 | d_range: tuple, 29 | noise_range: tuple, 30 | save_path: str, 31 | rhop_range: tuple=(0.15,0.4), 32 | L_range: tuple=(0.15,0.4), 33 | p1_range: tuple=(15,35), 34 | p2_range: tuple=(40,60), 35 | p3_range: tuple=(1e-4,2e-4), 36 | p4_range: tuple=(200,300), 37 | p5_range: tuple=(1,3), 38 | p6_range: tuple=(1000,2500)): 39 | """ 40 | Generate a pandas dataframe of experiments to represent the measurments. 
41 |         We assume we sample 4 parameters:
42 |         cd: drag coefficient on a sphere [-]
43 |         rhoa: air density [kg/m3]
44 |         v: momentary velocity of the sphere [m/s]
45 |         d: diameter of the sphere [m]
46 |         and calculate with them:
47 |         fd: drag exerted on the sphere [kg*m/s2]
48 |         via fd = pi*cd*rho*(v**2)*(d**2)/8.
49 |         Measurements include an additional 8 parameters
50 |         that do not take part in the fd calculation.
51 |         """
52 |         cd_range_delta = cd_range[1] - cd_range[0]
53 |         rhoa_range_delta = rhoa_range[1] - rhoa_range[0]
54 |         v_range_delta = v_range[1] - v_range[0]
55 |         d_range_delta = d_range[1] - d_range[0]
56 |         rhop_range_delta = rhop_range[1] - rhop_range[0]
57 |         L_range_delta = L_range[1] - L_range[0]
58 |         p1_range_delta = p1_range[1] - p1_range[0]
59 |         p2_range_delta = p2_range[1] - p2_range[0]
60 |         p3_range_delta = p3_range[1] - p3_range[0]
61 |         p4_range_delta = p4_range[1] - p4_range[0]
62 |         p5_range_delta = p5_range[1] - p5_range[0]
63 |         p6_range_delta = p6_range[1] - p6_range[0]
64 |         noise_range_delta = noise_range[1] - noise_range[0]
65 |         data = []
66 |         # generate samples
67 |         for i in range(samples):
68 |             cd = round(np.random.random_sample() * cd_range_delta + cd_range[0], 2)
69 |             rho = round(np.random.random_sample() * rhoa_range_delta + rhoa_range[0], 2)
70 |             v = round(np.random.random_sample() * v_range_delta + v_range[0], 2)
71 |             d = round(np.random.random_sample() * d_range_delta + d_range[0], 2)
72 |             rhop = round(np.random.random_sample() * rhop_range_delta + rhop_range[0], 2)
73 |             L = round(np.random.random_sample() * L_range_delta + L_range[0], 2)
74 |             p1 = round(np.random.random_sample() * p1_range_delta + p1_range[0], 2)
75 |             p2 = round(np.random.random_sample() * p2_range_delta + p2_range[0], 2)
76 |             p3 = round(np.random.random_sample() * p3_range_delta + p3_range[0], 2)
77 |             p4 = round(np.random.random_sample() * p4_range_delta + p4_range[0], 2)
78 |             p5 = round(np.random.random_sample() * p5_range_delta + p5_range[0], 2)
79 |             p6 = round(np.random.random_sample() * p6_range_delta + p6_range[0], 2)
80 |             noise = round(np.random.random_sample() * noise_range_delta + noise_range[0], 2) * np.random.choice((-1, 1))
81 |             fd_sampled = np.pi * cd * rho * (v**2) * (d**2) / 8
82 |             fd_sampled = round(fd_sampled * (1 + noise), 2)
83 |             data.append([cd, rho, rhop, v, d, L, p1, p2, p3, p4, p5, p6, fd_sampled])
84 |         # make a Pandas.DataFrame and save it as a CSV file
85 |         pd.DataFrame(data=data, columns=["Cd", "rhoa", "rhop", "v", "d", "L", "p1", "p2",
86 |                                          "p3", "p4", "p5", "p6", "Fd"]).to_csv(save_path, index=False)
87 |         # return indices of feature groups: rhoa-rhop and d-L form a group.
The rest do not have a selection option 88 | return [[0,0],[1,2],[2,2],[3,4],[5,5],[6,6],[7,7],[8,8],[9,9],[10,10],[11,11]] 89 | 90 | # Logic - end # 91 | -------------------------------------------------------------------------------- /utills/symbolic_regression_to_latex_text.py: -------------------------------------------------------------------------------- 1 | # library import 2 | from sympy import * 3 | import re as regular_exp 4 | 5 | 6 | class SymbolicRegressionToLatexText: 7 | """ 8 | This class is responsible to convert the standard symbolic regression's result style to latex style 9 | """ 10 | 11 | # CONSTS # 12 | SR_FUNCS_NAMES = ["add", "sub", "mul", "div"] 13 | 14 | # END - CONSTS # 15 | 16 | def __init__(self): 17 | pass 18 | 19 | @staticmethod 20 | def run(eq: str): 21 | """ 22 | Single entry point - run the convertor from EQ of the symbolic regression class to LATEX style 23 | :param eq: the EQ to convert 24 | :return: the same EQ in LATEX format 25 | """ 26 | # replace the text to use the static methods of this class 27 | for func_name in SymbolicRegressionToLatexText.SR_FUNCS_NAMES: 28 | eq = eq.replace(func_name, 29 | "SymbolicRegressionToLatexText._{}".format(func_name)) 30 | eq = eq.replace("^", "power") 31 | # collect possible var names 32 | eq_vars = regular_exp.findall(r'(\w*),', 33 | eq) 34 | eq_vars.extend(regular_exp.findall(r', (\w*)\)', 35 | eq)) 36 | # filter just vars names 37 | eq_vars = list(set([eq_var.strip() for eq_var in eq_vars if len(eq_var) > 0 and not eq_var.isnumeric() and eq_var not in SymbolicRegressionToLatexText.SR_FUNCS_NAMES])) 38 | # name them as strings 39 | eq_vars = sorted(eq_vars, 40 | key=lambda x: len(x), 41 | reverse=True) 42 | for eq_var in eq_vars: 43 | eq = eq.replace("{},".format(eq_var), '"{}",'.format(eq_var)) 44 | eq = eq.replace(", {}".format(eq_var), ', "{}"'.format(eq_var)) 45 | # run the code 46 | ex_locals = {} 47 | exec("answer = {}".format(eq), None, ex_locals) 48 | answer_eq = ex_locals["answer"] 49 | # small fixes to style 50 | try: 51 | answer_eq = SymbolicRegressionToLatexText._post_fixes(answer_eq=answer_eq) 52 | except: 53 | pass 54 | return answer_eq 55 | 56 | @staticmethod 57 | def _add(x: str, 58 | y: str): 59 | x = str(x) 60 | y = str(y) 61 | x_number = x.isnumeric() or (x[1:].isnumeric() and x[0] == "-") 62 | y_number = y.isnumeric() or (y[1:].isnumeric() and y[0] == "-") 63 | if x_number and y_number: 64 | return "{}".format(float(x) + float(y)) 65 | return "({} + {})".format(x, y) 66 | 67 | @staticmethod 68 | def _sub(x: str, 69 | y: str): 70 | x = str(x) 71 | y = str(y) 72 | x_number = x.isnumeric() or (x[1:].isnumeric() and x[0] == "-") 73 | y_number = y.isnumeric() or (y[1:].isnumeric() and y[0] == "-") 74 | if x_number and y_number: 75 | return "{}".format(float(x) - float(y)) 76 | return "({} - {})".format(x, y) 77 | 78 | @staticmethod 79 | def _mul(x: str, 80 | y: str): 81 | x = str(x) 82 | y = str(y) 83 | x_number = x.isnumeric() or (x[1:].isnumeric() and x[0] == "-") 84 | y_number = y.isnumeric() or (y[1:].isnumeric() and y[0] == "-") 85 | if x_number and y_number: 86 | return "{}".format(float(x) * float(y)) 87 | elif x_number and not y_number: 88 | return "{}{}".format(x, y) 89 | elif not x_number and y_number: 90 | return "{}{}".format(y, x) 91 | else: 92 | return "{} \\cdot {}".format(x, y) 93 | 94 | @staticmethod 95 | def _div(x: str, 96 | y: str): 97 | x = str(x) 98 | y = str(y) 99 | x_number = x.isnumeric() or (x[1:].isnumeric() and x[0] == "-") 100 | y_number = y.isnumeric() or 
(y[1:].isnumeric() and y[0] == "-") 101 | if x_number and y_number: 102 | return "{}".format(float(x) / float(y)) 103 | else: 104 | return "\\frac{" + str(x) + "}{" + str(y) + "}" 105 | 106 | @staticmethod 107 | def _post_fixes(answer_eq: str): 108 | change_symbol = True 109 | while change_symbol: 110 | answer_eq_before = answer_eq 111 | answer_eq = answer_eq.replace("--", "+") 112 | answer_eq = answer_eq.replace(" - -", " + ") 113 | answer_eq = answer_eq.replace("-+", "-") 114 | answer_eq = answer_eq.replace("+-", "-") 115 | answer_eq = answer_eq.replace("++", "+") 116 | answer_eq = answer_eq.replace(" + +", "+") 117 | answer_eq = answer_eq.replace(" - +", " - ") 118 | answer_eq = answer_eq.replace(" + -", " - ") 119 | answer_eq = answer_eq.replace("power", "^") 120 | change_symbol = answer_eq_before != answer_eq 121 | # try to simplify results 122 | try: 123 | answer_eq = simplify(answer_eq) 124 | except Exception as error: 125 | pass 126 | # return answer 127 | return answer_eq 128 | -------------------------------------------------------------------------------- /paper_exp_runner.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import os 3 | 4 | # project imports 5 | from experiments.exp_steady_free_fall_with_drag_case_1 import ExpSFF1 6 | from experiments.exp_steady_free_fall_with_drag_case_2 import ExpSFF2 7 | from experiments.exp_steady_free_fall_with_drag_case_3 import ExpSFF3 8 | from experiments.exp_constant_acceleration import ExpConstantAcceleration 9 | from experiments.exp_steady_free_fall_with_drag_case_2_with_educated_guess import ExpSFF2WithGuess 10 | 11 | 12 | class PaperExpRunner: 13 | """ 14 | Single entry point for the project. 15 | This file runs all the experiments in the project and save the raw results 16 | for the manuscript 17 | """ 18 | 19 | # CONSTS # 20 | RESULTS_FOLDER_NAME = "results" 21 | 22 | # END - CONSTS # 23 | 24 | def __init__(self): 25 | pass 26 | 27 | @staticmethod 28 | def run(const_acc_numerical_bool: bool = True, 29 | const_acc_analytical_bool: bool = True, 30 | const_acc_force_ebs_bool: bool = True, 31 | sff1_numerical_bool: bool = True, 32 | sff1_analytical_bool: bool = True, 33 | sff1_force_ebs_bool: bool = True, 34 | sff2_numerical_bool: bool = True, 35 | sff2_analytical_bool: bool = True, 36 | sff2_force_ebs_bool: bool = True, 37 | sff2_with_guess_numerical_bool: bool = True, 38 | sff2_with_guess_analytical_bool: bool = True, 39 | sff2_with_guess_force_ebs_bool: bool = True, 40 | sff3_numerical_bool: bool = True, 41 | sff3_analytical_bool: bool = True, 42 | sff3_force_ebs_bool: bool = True, 43 | drag_force_numerical_bool: bool = True, 44 | drag_force_analytical_bool: bool = True, 45 | drag_force_force_ebs_bool: bool = True): 46 | """ 47 | Single method to use in the class. 
48 |         Run the experiments, if requested
49 |         """
50 |         # prepare IO
51 |         os.makedirs(os.path.join(os.path.dirname(__file__), PaperExpRunner.RESULTS_FOLDER_NAME),
52 |                     exist_ok=True)
53 |         # run all the experiments
54 |         if const_acc_numerical_bool or const_acc_analytical_bool or const_acc_force_ebs_bool:
55 |             ExpConstantAcceleration.run(numerical_bool=const_acc_numerical_bool,
56 |                                         analytical_bool=const_acc_analytical_bool,
57 |                                         force_ebs_bool=const_acc_force_ebs_bool)
58 |
59 |         if sff1_numerical_bool or sff1_analytical_bool or sff1_force_ebs_bool:
60 |             ExpSFF1.perform(numerical_bool=sff1_numerical_bool,
61 |                             analytical_bool=sff1_analytical_bool,
62 |                             force_ebs_bool=sff1_force_ebs_bool)
63 |
64 |         if sff2_numerical_bool or sff2_analytical_bool or sff2_force_ebs_bool:
65 |             ExpSFF2.perform(numerical_bool=sff2_numerical_bool,
66 |                             analytical_bool=sff2_analytical_bool,
67 |                             force_ebs_bool=sff2_force_ebs_bool)
68 |
69 |         if sff2_with_guess_numerical_bool or sff2_with_guess_analytical_bool or sff2_with_guess_force_ebs_bool:
70 |             ExpSFF2WithGuess.perform(numerical_bool=sff2_with_guess_numerical_bool,
71 |                                      analytical_bool=sff2_with_guess_analytical_bool,
72 |                                      force_ebs_bool=sff2_with_guess_force_ebs_bool)
73 |
74 |         if sff3_numerical_bool or sff3_analytical_bool or sff3_force_ebs_bool:
75 |             ExpSFF3.perform(numerical_bool=sff3_numerical_bool,
76 |                             analytical_bool=sff3_analytical_bool,
77 |                             force_ebs_bool=sff3_force_ebs_bool)
78 |
79 |         if drag_force_numerical_bool or drag_force_analytical_bool or drag_force_force_ebs_bool:
80 |             ExpDragForce.perform(numerical_bool=drag_force_numerical_bool,  # was ExpSFF3 again, a copy-paste slip
81 |                                  analytical_bool=drag_force_analytical_bool,
82 |                                  force_ebs_bool=drag_force_force_ebs_bool)
83 |
84 |
85 | if __name__ == '__main__':
86 |     PaperExpRunner.run(const_acc_numerical_bool=False,
87 |                        const_acc_analytical_bool=False,
88 |                        const_acc_force_ebs_bool=False,
89 |
90 |                        sff1_numerical_bool=False,
91 |                        sff1_analytical_bool=False,
92 |                        sff1_force_ebs_bool=False,
93 |
94 |                        sff2_numerical_bool=False,
95 |                        sff2_analytical_bool=False,
96 |                        sff2_force_ebs_bool=False,
97 |
98 |                        sff2_with_guess_numerical_bool=False,
99 |                        sff2_with_guess_analytical_bool=False,
100 |                        sff2_with_guess_force_ebs_bool=False,
101 |
102 |                        sff3_numerical_bool=False,
103 |                        sff3_analytical_bool=False,
104 |                        sff3_force_ebs_bool=False,
105 |
106 |                        drag_force_numerical_bool=True,
107 |                        drag_force_analytical_bool=True,
108 |                        drag_force_force_ebs_bool=True)
109 |
--------------------------------------------------------------------------------
/utills/consts.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 |
4 | # consts #
5 |
6 | # 1) General for all experiments:
7 | DATA_FOLDER = "data"
8 | RESULTS_FOLDER = "results"
9 | g_force = 9.81
10 | REL_ERR_OF_STD = 0.05
11 | DEFAULT_FIG_SIZE = 8
12 | DEFAULT_DPI = 600
13 | FEATURE_IMPORTANCE_SIMULATION_COUNT = 5  # 100
14 | JSON_INDENT = 2
15 | K_FOLD = 5
16 | RANDOM_STATE = 73
17 | SYMBOLIC_PERCENT_OF_MAJORITY = 0.6
18 | SYMBOLIC_P_VALUE_THRESHOLD = 0.8
19 | SYMBOLIC_EQ_RANKING_METRIC = "r2"
20 | SYMBOLIC_TOP_EQS_MAX_NUM = 5
21 |
22 |
23 | # 2) Constant acceleration exp:
24 | # - data generation:
25 | CONST_ACCELERATION_NUM_SAMPLES = 400
26 | CONST_ACCELERATION_TEST_SIZE_PORTION = 0.75
27 | # - experiment run:
28 | CONST_ACCELERATION_NUMERICAL_RUN_TIMES = 20
29 | CONST_ACCELERATION_NUMERICAL_GENERATION_COUNT = 5
30 | CONST_ACCELERATION_NUMERICAL_POP_SIZE = 30
31 | CONST_ACCELERATION_ANALYTICAL_RUN_TIMES = 20
32 | CONST_ACCELERATION_ANALYTICAL_GENERATION_COUNT = 5
33 | CONST_ACCELERATION_ANALYTICAL_POP_SIZE = 50
34 | CONST_ACCELERATION_NOISE_RANGE = (0, 0.02)
35 | CONST_ACCELERATION_ANALYTICAL_PARSIMONY_COEFFICIENT = 0.02
36 | CONST_ACCELERATION_EBS_SIZE_RANGE = (5,)
37 | # - result path
38 | CONST_ACCELERATION_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
39 |                                                       "constant_acceleration_results",
40 |                                                       "{}_samples".format(CONST_ACCELERATION_NUM_SAMPLES))
41 |
42 |
43 | # 3) Steady free fall with drag exp:
44 | # - data generation:
45 | N_FREQ_SUFFIX = ['_11', '_12', '_13', '_21', '_23', '_31', '_32']
46 | STEADY_FALL_MINIZE_TOL = 1e-25
47 | SFF_RHOA_RANGE = (998., 1300.)  # fresh water -> salt water at 20C [kg/m3]
48 | SFF_RHOP_RANGE = (0, 5000)
49 | SFF_NU_RANGE = (1e-6, 1.4e-6)  # viscosity corresponding to rhoa [m2/s]
50 | SFF_RE_RANGE = (1., 100.)  # Reynolds range where Cd changes significantly
51 | SFF_CASE_2_NOISE_RANGE = (0, 0.02)
52 | SFF_TEST_SIZE_PORTION = 0.2
53 | SFF_DIMENSIONAL_EBS_SIZE_RANGE_1_2 = (11,)
54 | SFF_DIMENSIONAL_EBS_SIZE_RANGE_2_easy = (7,)
55 | SFF_DIMENSIONAL_EBS_SIZE_RANGE_3 = (3,)
56 | SFF_1_DROP_PARAM = "V"
57 | FORCE_DATA_OVERRIDE_FLAG = False
58 | # - experiment run:
59 | SFF_NUMERICAL_NUM_SAMPLES = 10**4
60 | SFF_NUMERICAL_RUN_TIMES = 20
61 | SFF_NUMERICAL_GENERATION_COUNT = 3
62 | SFF_NUMERICAL_POP_SIZE = 25
63 | SFF_ANALYTICAL_RUN_TIMES = 20
64 | SFF_ANALYTICAL_GENERATION_COUNT = 10
65 | SFF_ANALYTICAL_POP_SIZE = 2000
66 | SFF_ANALYTICAL_PARSIMONY_COEFFICIENT = 0.025
67 | # - feature selection:
68 | FEATURE_SELECTION_GENERATIONS_COUNT = 2
69 | FEATURE_SELECTION_POP_SIZE = 8
70 | FEATURE_SELECTION_MUTATION_RATE = 0.1
71 | FEATURE_SELECTION_ROYALTY = 0.05
72 | # - data and result paths
73 | SFF_N_SAMPLES_STR = str(round(SFF_NUMERICAL_NUM_SAMPLES/1000)) + "k"
74 | SFF_1_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
75 |                                          "steady_fall_case_1_results",
76 |                                          "{}_samples_without_{}".format(SFF_N_SAMPLES_STR, SFF_1_DROP_PARAM))
77 | SFF_1_DATA_FOLDER_NAME = os.path.join(DATA_FOLDER,
78 |                                       "case_1_steady_fall_with_drag_data_" +
79 |                                       "{}_samples_no_{}.csv".format(SFF_N_SAMPLES_STR, SFF_1_DROP_PARAM))
80 |
81 | SFF_2_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
82 |                                          "steady_fall_case_2_results_{}_samples".format(SFF_N_SAMPLES_STR))
83 | SFF_2_DATA_FOLDER_NAME = os.path.join(DATA_FOLDER,
84 |                                       "case_2_steady_fall_with_drag_data_{}_samples.csv".format(SFF_N_SAMPLES_STR))
85 |
86 | SFF_2_WITH_GUESS_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
87 |                                                     "steady_fall_case_2_with_guess_results_{}_samples".format(SFF_N_SAMPLES_STR))
88 | SFF_2_WITH_GUESS_DATA_FOLDER_NAME = os.path.join(DATA_FOLDER,
89 |                                                  "case_2_with_guess_steady_fall_with_drag_data_{}_samples.csv".format(SFF_N_SAMPLES_STR))
90 |
91 | SFF_3_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
92 |                                          "steady_fall_case_3_results_{}_samples".format(SFF_N_SAMPLES_STR))
93 | SFF_3_DATA_FOLDER_NAME = os.path.join(DATA_FOLDER,
94 |                                       "case_3_steady_fall_with_drag_data_{}_samples.csv".format(SFF_N_SAMPLES_STR))
95 |
96 |
97 | # 4) Drag force exp:
98 | # - data generation:
99 | DRAG_FORCE_NUM_SAMPLES = 10000
100 | DRAG_FORCE_TEST_SIZE_PORTION = 0.75
101 | # - experiment run:
102 | DRAG_FORCE_NUMERICAL_RUN_TIMES = 20
103 | DRAG_FORCE_NUMERICAL_GENERATION_COUNT = 5
104 | DRAG_FORCE_NUMERICAL_POP_SIZE = 30
105 | DRAG_FORCE_FEATURE_GENERATIONS_COUNT = 5
106 | DRAG_FORCE_FEATURE_POP_SIZE = 30
107 | DRAG_FORCE_MUTATION_RATE = 0.1
108 | DRAG_FORCE_ROYALTY = 0.05
109 | DRAG_FORCE_ANALYTICAL_RUN_TIMES = 20
110 | DRAG_FORCE_ANALYTICAL_GENERATION_COUNT = 5
111 | DRAG_FORCE_ANALYTICAL_POP_SIZE = 50
112 | DRAG_FORCE_NOISE_RANGE = (0, 0.02)
113 | DRAG_FORCE_ANALYTICAL_PARSIMONY_COEFFICIENT = 0.02
114 | DRAG_FORCE_EBS_SIZE_RANGE = (13,)
115 | # - result path
116 | DRAG_FORCE_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
117 |                                               "drag_force_results",
118 |                                               "{}_samples".format(DRAG_FORCE_NUM_SAMPLES))
119 |
120 | # end - consts #
121 |
--------------------------------------------------------------------------------
/algo/genetic_algorithm_feature_selection.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import pandas as pd
3 | import os
4 |
5 | # project imports
6 | from algo.operators.fitness import Fitness
7 | from algo.population import Population
8 | from algo.operators.mutation import Mutation
9 | from utills.logger_config import Logger
10 | from algo.operators.crossover import Crossover
11 | from algo.operators.next_generation import NextGeneration
12 |
13 |
14 | class GAFS:
15 |     """
16 |     A classical genetic algorithm for grouped feature selection, wrapped around
17 |     an AutoML (MultiTPOTrunner-driven) pipeline search
18 |     """
19 |
20 |     def __init__(self):
21 |         pass
22 |
23 |     @staticmethod
24 |     def run(tpot_run_times: int,
25 |             feature_generations: int,
26 |             tpot_regressor_generations: int,
27 |             feature_population_size: int,
28 |             tpot_regressor_population_size: int,
29 |             mutation_rate: float,
30 |             feature_indexes_ranges: list,
31 |             mutation_w: list,
32 |             royalty: float,
33 |             k_fold: int,
34 |             performance_metric: str,
35 |             train_data_x: pd.DataFrame,
36 |             train_data_y: pd.DataFrame,
37 |             test_data_x: pd.DataFrame,
38 |             test_data_y: pd.DataFrame,
39 |             save_dir: str,
40 |             cores: int = -1):
41 |         """
42 |         Run the GAFS algorithm with some hyper-parameters
43 |         """
44 |         assert len(mutation_w) == len(feature_indexes_ranges)
45 |         assert feature_generations > 0
46 |         assert tpot_regressor_generations > 0
47 |         assert feature_population_size > 0
48 |         assert (feature_population_size % 2) == 0
49 |         assert tpot_regressor_population_size > 0
50 |         assert k_fold > 0
51 |         assert 0 < royalty < 1
52 |         assert train_data_x.shape[0] == train_data_y.shape[0]
53 |         assert test_data_x.shape[0] == test_data_y.shape[0]
54 |         assert train_data_x.shape[1] == test_data_x.shape[1]
55 |
56 |         # generate population of genes dictating how to trim data
57 |         pop = Population.random(size=feature_population_size,
58 |                                 feature_count=len(feature_indexes_ranges),
59 |                                 feature_indexes_ranges=feature_indexes_ranges)
60 |         # create a dict to store selected features through generations
61 |         selected_fs = {"feature_indices": [],
62 |                        "feature_names": []}
63 |         for generation in range(feature_generations):
64 |             # manipulate gene population
65 |             pop = Mutation.simple(population=pop,
66 |                                   feature_indexes_ranges=feature_indexes_ranges,
67 |                                   mutation_rate=mutation_rate,
68 |                                   w=mutation_w)
69 |             pop = Crossover.simple(population=pop)
70 |             # assign fitness score and best ML pipeline to each gene in pop
71 |             Logger.print("\nGeneration #{}/{} | Assign Fitness and Pipeline to each gene:".format(generation + 1,
72 |                                                                                                   feature_generations))
73 |             pop = Fitness.tpot(run_times=tpot_run_times,
74 |                                train_data_x=train_data_x,
75 |                                train_data_y=train_data_y,
76 |                                test_data_x=test_data_x,
77 |                                test_data_y=test_data_y,
78 |                                generations=tpot_regressor_generations,
79 |                                population=pop,
80 |                                population_size=tpot_regressor_population_size,
81 |                                k_fold=k_fold,
82 |                                performance_metric=performance_metric,
83 |                                n_jobs=cores,
84 |                                save_dir=save_dir)
85 |             # alert user
86 |             current_best_gene = pop.get_best()
87 |             feature_names = list(test_data_x.columns[current_best_gene.feature_indexes])
88 |             selected_fs["feature_indices"].append(current_best_gene.feature_indexes)
89 |             selected_fs["feature_names"].append(feature_names)
90 |             Logger.print("Generation #{}/{} | Best gene's fitness: {:.3f} selected features: {}".format(generation + 1,
91 |                                                                                                          feature_generations,
92 |                                                                                                          current_best_gene.fitness,
93 |                                                                                                          feature_names))
94 |             # prepare population for next generation
95 |             pop = NextGeneration.tournament_with_royalty(population=pop,
96 |                                                          royalty=royalty)
97 |         # save selected features from all generations
98 |         pd.DataFrame(selected_fs, dtype=object).to_csv(os.path.join(save_dir, "selected_features_history.csv"),
99 |                                                        index=False)
100 |         return pop.get_best()
101 |
102 |     def __repr__(self):
103 |         return self.__str__()
104 |
105 |     def __str__(self):
106 |         return ""
--------------------------------------------------------------------------------
/algo/ebs/eq_node.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import pandas as pd
3 |
4 | # project imports
5 | from algo.ebs.eq_functions import *
6 |
7 |
8 | class EqNode:
9 |     """
10 |     This class represents a single node in an equation tree
11 |     """
12 |
13 |     # CONSTS #
14 |     NO_INDEX = -1
15 |     # END - CONSTS #
16 |
17 |     # CACHE FOR OPTIMIZATION #
18 |     dp = {}
19 |     # END - CACHE FOR OPTIMIZATION #
20 |
21 |     def __init__(self,
22 |                  value=None,
23 |                  is_leaf: bool = True,
24 |                  left_child=None,
25 |                  right_child=None,
26 |                  index: int = NO_INDEX):
27 |         self.value = value
28 |         self.is_leaf = is_leaf
29 |         self.index = index
30 |         if is_leaf:
31 |             self.left_child = None
32 |             self.right_child = None
33 |         elif left_child is not None and right_child is not None:
34 |             self.left_child = left_child
35 |             self.right_child = right_child
36 |         else:
37 |             raise ValueError("If an EqNode is not a leaf node, it must have both left and right children")
38 |
39 |     def eval(self,
40 |              x_values: pd.DataFrame) -> pd.Series:
41 |         """ eval the node """
42 |         if self.is_leaf:
43 |             return x_values[self.value]
44 |         else:
45 |             return self.value(self.left_child.eval(x_values),
46 |                               self.right_child.eval(x_values))
47 |
48 |     def fix_node(self) -> None:
49 |         """ fix the node's is_leaf flag if it has been corrupted by another process """
50 |         if self.is_leaf and self.left_child is not None and self.right_child is not None:
51 |             self.is_leaf = False
52 |         elif not self.is_leaf and self.left_child is None and self.right_child is None:
53 |             self.is_leaf = True
54 |         elif self.is_leaf and (self.left_child is None or self.right_child is None):
55 |             self.left_child = None
56 |             self.right_child = None
57 |
58 |         if not self.is_leaf:
59 |             self.left_child.fix_node()
60 |             self.right_child.fix_node()
61 |
62 |     def to_string(self) -> str:
63 |         """ print the node as a string """
64 |         if self.is_leaf:
65 |             return str(self.value)
66 |         else:
67 |             return "{}({}, {})".format(FUNCTION_MAPPER[self.value],
68 |                                        self.left_child.to_string(),
69 |                                        self.right_child.to_string())
70 |
71 |     def to_id_str(self) -> str:
72 |         """ print equation in a narrow way for hash mapping """
73 |         if self.is_leaf:
74 |             return "L"
75 |         return "{}N{}".format(self.left_child.to_id_str(),
76 |                               self.right_child.to_id_str())
77 |
78 |     def size(self) -> int:
79 |         """ calc the size of the equation """
80 |         if self.is_leaf:
81 |             return 1
82 |         return 1 + self.right_child.size() + self.left_child.size()
83 |
84 |     def set_index(self,
85 |                   leaf_dict: dict,
86 |                   index: int = 0) -> int:
87 |         """ add an index to each node and record in leaf_dict whether it is a leaf """
88 |         self.index = index
89 |         leaf_dict[self.index] = self.is_leaf
90 |         if not self.is_leaf:
91 |             index = self.left_child.set_index(leaf_dict=leaf_dict, index=index + 1)
92 |             index = self.right_child.set_index(leaf_dict=leaf_dict, index=index + 1)
93 |         return index
94 |
95 |     def _copy_and_put_values(self,
96 |                              allocation: dict):
97 |         """ copy the current topology and put values in order according to their index """
98 |         if self.is_leaf:
99 |             return EqNode(value=allocation[self.index],
100 |                           index=self.index,
101 |                           left_child=None,  # self.left_child
102 |                           right_child=None,  # self.right_child
103 |                           is_leaf=self.is_leaf)
104 |         return EqNode(value=allocation[self.index],
105 |                       index=self.index,
106 |                       left_child=self.left_child._copy_and_put_values(allocation=allocation),
107 |                       right_child=self.right_child._copy_and_put_values(allocation=allocation),
108 |                       is_leaf=self.is_leaf)
109 |
110 |     @staticmethod
111 |     def all_possible_fbt(n: int) -> list:
112 |         """ Return all full binary trees of inputted size 'n' """
113 |         if n == 0:
114 |             return []
115 |         if n == 1:
116 |             return [EqNode(is_leaf=True)]
117 |         if n in EqNode.dp:
118 |             return EqNode.dp[n]
119 |
120 |         result = []
121 |         for l in range(n):
122 |             r = n - 1 - l
123 |             left_trees = EqNode.all_possible_fbt(n=l)
124 |             right_trees = EqNode.all_possible_fbt(n=r)
125 |             for t1 in left_trees:
126 |                 for t2 in right_trees:
127 |                     result.append(EqNode(is_leaf=False,
128 |                                          left_child=t1,
129 |                                          right_child=t2))
130 |         EqNode.dp[n] = result
131 |         return result
132 |
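    # Illustrative note: a full binary tree, in which every internal node has
    # exactly two children, exists only for an odd node count, so even sizes
    # return an empty list. The number of distinct shapes follows the Catalan
    # numbers, e.g.:
    #
    #   len(EqNode.all_possible_fbt(n=1))  # -> 1
    #   len(EqNode.all_possible_fbt(n=5))  # -> 2
    #   len(EqNode.all_possible_fbt(n=7))  # -> 5
    #
    # EqNode.dp memoizes the result lists, so repeated calls are cheap; note that
    # the cached trees are shared objects, so callers that mutate them should
    # copy first.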
133 |     def __repr__(self):
134 |         return "<EqNode: value={}, is_leaf={}, index={}>".format(self.value,
135 |                                                                  self.is_leaf,
136 |                                                                  self.index)
137 |
138 |     def __str__(self):
139 |         if self.is_leaf:
140 |             return "([#{}]{})".format(self.index,
141 |                                       self.value)
142 |         return "([#{}]{} -> {} & {})".format(self.index,
143 |                                              self.value,
144 |                                              self.left_child.__str__(),
145 |                                              self.right_child.__str__())
146 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SciMED: A Computational Framework For Physics-Informed Symbolic Regression with Scientist-In-The-Loop
2 |
3 | ## Abstract
4 | Discovering a meaningful, dimensionally homogeneous, symbolic expression explaining
5 | experimental data is a fundamental challenge in physics. In this study, we present a
6 | novel, open-source computational framework called Scientist-Machine Equation
7 | Detector (SciMED), which integrates scientific discipline wisdom in a
8 | scientist-in-the-loop approach with state-of-the-art SR methods. SciMED combines a
9 | genetic algorithm-based wrapper selection method with automatic machine learning
10 | and two levels of symbolic regression methods. We test SciMED on four configurations
11 | of the settling of a sphere with and without a non-linear aerodynamic drag force. We
12 | show that SciMED is sufficiently robust to discover the correct physically meaningful
13 | symbolic expressions of each configuration from noisy data. Our results indicate better
14 | performance on these tasks than the state-of-the-art SR software package.
15 |
16 | ## Table of contents
17 | 1. [Code usage](#code_usage)
18 | 2. [The algorithm](#the_algorithm)
19 | 3. [Data](#data_preparation)
20 | 4. [How to cite](#how_to_cite)
21 | 5. [Dependencies](#dependencies)
22 | 6. [Contributing](#contributing)
23 | 7. [Bug Reports](#bug_reports)
24 | 8. [Contact](#contact)
25 |
26 | <a name="code_usage"/>
27 |
28 | ## Code usage
29 | ### Run the experiments shown in the paper:
30 | 1. Clone the repo
31 | 2. Install the `requirements.txt` file.
32 | 3. Run the project from the `paper_exp_runner.py` file, making sure all the arguments are set to **True**.
33 |
34 | ### Use in your project:
35 | 1. Clone the repo
36 | 2. Install the `requirements.txt` file.
37 | 3. Include the following code in the relevant part of your project:
38 | ```
39 | from scimed import scimed
40 | scimed.run(train_data_x: pandas.DataFrame, train_data_y: pandas.Series, test_data_x: pandas.DataFrame, test_data_y: pandas.Series, results_folder: str, ...)
41 | ```
42 | ### Demo:
43 | A demo of how to use SciMED with data from a CSV file (using Pandas) is shown in the "/demo" folder; a minimal sketch of it follows.
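The sketch below is illustrative (it assumes the bundled `demo/demo.csv`, whose columns `v0,a,t,v` follow `v = v0 + a*t`; see `demo/demo.py` for the repository's own version):
```
from sklearn.model_selection import train_test_split
import pandas as pd

from scimed import scimed

# load the demo data and split off the target column (the last one, "v")
df = pd.read_csv("demo/demo.csv")
x, y = df.drop("v", axis=1), df["v"]
# hold out a test set, then hand the four frames to the SciMED pipeline
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.25, random_state=73)
scimed.run(train_data_x=train_x, train_data_y=train_y,
           test_data_x=test_x, test_data_y=test_y,
           results_folder="results/demo")
```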
44 |
45 | <a name="the_algorithm"/>
46 |
47 | ## The algorithm
48 | SciMED is constructed from four components:
49 | 1. **A genetic algorithm-based feature selection:** Reduces the search space by selecting a single, most explainable feature from each group of features that are considered to be the same in physical essence. This division into groups is provided by the user, applying their domain knowledge.
50 | 2. **A genetic algorithm-based automatic machine learning (AutoML):** Trains an ML model to produce synthetic data that facilitates the SR task by enriching the data domain.
51 | 3. **A genetic algorithm-based symbolic regression (SR):** A less resource- and time-consuming but stochastic SR search. May result in a sub-optimal outcome.
52 | 4. **A Las Vegas search SR:** A more computationally expensive SR search that, on average, produces a more stable and accurate outcome.
53 |
54 | Each component allows the user to easily insert physical knowledge or assumptions specific to the current task, directing the search process for a more credible result
55 | with fewer required resources. The motivation for this structure is derived from the way human scientists work, where more promising directions get more attention and resources.
56 |
57 | ![Algo_structure](https://user-images.githubusercontent.com/72650415/230033829-9e283c9c-80ab-43d1-9385-6999074ae836.png)
58 |
59 | <a name="data_preparation"/>
60 |
61 | ## Data preparation
62 | The data file to be analyzed should be a CSV file, with each column containing the numerical values of each variable. If the variables can be grouped into variables of similar essence, from which only one can appear in the mystery equation, then they should appear sequentially and the index ranges of each group should be passed to the function (see the sketch below).
63 |
64 | The solution files will be saved in the directory called "results" under the name of the specific component that generated them. For example, there will be three {component}_target_vs_pred.pdf files demonstrating the prediction capabilities of the specific outcome from each component.
65 |
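A hedged sketch of such grouping (the column names here are hypothetical, and each entry is assumed to be an inclusive `[start, end]` pair of column indices, matching the per-group format used in the code):
```
# columns: rho_a, rho_p, v_mean, v_max, v_rms, t   (hypothetical data)
# -> two density candidates, three velocity candidates, and a stand-alone time
feature_indexes_ranges = [[0, 1], [2, 4], [5, 5]]
scimed.run(..., feature_indexes_ranges=feature_indexes_ranges)
```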
66 | <a name="how_to_cite"/>
67 |
68 | ## How to cite
69 | Please cite the SciMED work if you compare, use, or build on it:
70 | ```
71 | @article{keren2023computational,
72 |   title={A computational framework for physics-informed symbolic regression with straightforward integration of domain knowledge},
73 |   author={Keren, Liron Simon and Liberzon, Alex and Lazebnik, Teddy},
74 |   journal={Scientific Reports},
75 |   volume={13},
76 |   number={1},
77 |   pages={1249},
78 |   year={2023},
79 |   publisher={Nature Publishing Group UK London}
80 | }
81 | ```
82 |
83 | <a name="dependencies"/>
84 |
85 | ## Dependencies
86 | 1. pandas
87 | 2. numpy
88 | 3. matplotlib
89 | 4. seaborn
90 | 5. scikit-learn
91 | 6. scipy
92 | 7. TPOT
93 | 8. gplearn
94 | 9. pytorch
95 | 10. termcolor
96 | 11. sympy
97 |
98 | <a name="contributing"/>
99 |
100 | ## Contributing
101 | We would love you to contribute to this project; pull requests are very welcome! Please send us an email with your suggestions or requests.
102 |
103 | <a name="bug_reports"/>
104 |
105 | ## Bug Reports
106 | Report [here](https://github.com/LironSimon/SciMED/issues). We will reply as fast as we can :)
107 |
108 | <a name="contact"/>
109 |
110 | ## Contact
111 | * Liron Simon - [email](mailto:lirons.gb@gmail.com) | [LinkedIn](https://www.linkedin.com/in/liron-simon/)
112 | * Teddy Lazebnik - [email](mailto:t.lazebnik@ucl.ac.uk) | [LinkedIn](https://www.linkedin.com/in/teddy-lazebnik/)
113 | * Alex Liberzon - [email](mailto:alexlib@tauex.tau.ac.il) | [LinkedIn](https://www.linkedin.com/in/alexliberzon/)
114 |
115 |
116 | ## Run online using Mybinder.org
117 |
118 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/LironSimon/SciMED/master)
119 |
120 | Open a new terminal and run `python main.py`
121 |
122 |
123 | ## Run using Docker
124 |
125 |     docker run alexlib/scimed:latest
126 |
127 |
--------------------------------------------------------------------------------
/utills/result_tracker.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import json
4 | import pickle
5 | import pandas as pd
6 |
7 | # project imports
8 | from utills.consts import *
9 | from utills.plotter import Plotter
10 | from utills.symbolic_regression_to_latex_text import SymbolicRegressionToLatexText
11 |
12 |
13 | class ResultTracker:
14 |     """
15 |     This class is responsible for saving plots and data
16 |     for each part of the program.
17 |     """
18 |
19 |     def __init__(self):
20 |         pass
21 |
22 |     @staticmethod
23 |     def run(program_part: str,
24 |             run_times: int,
25 |             all_scores: pd.DataFrame,
26 |             model,
27 |             train_data_x: pd.DataFrame,
28 |             train_data_y: pd.DataFrame,
29 |             test_data_x: pd.DataFrame,
30 |             test_data_y: pd.DataFrame,
31 |             save_dir: str):
32 |         """
33 |         Save the model, its scoring history, and the related plots for a single program part
34 |         """
35 |         assert program_part in ["tpot", "symbolic"]
36 |
37 |         # 1) save model
38 |         if program_part == "tpot":
39 |             model.export(os.path.join(os.path.dirname(os.path.dirname(__file__)),
40 |                                       save_dir,
41 |                                       "tpot_exported_pipeline.py"))
42 |         else:
43 |             with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), save_dir,
44 |                                    "symbolic_model"), "wb") as symbolic_fit_file:
45 |                 pickle.dump(model, symbolic_fit_file)
46 |
47 |         # 2) save scoring history of model as a whole and as averaged
48 |         all_scores.to_csv(os.path.join(save_dir, program_part + "_scoring_history.csv"), index=False)
49 |         with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), save_dir,
50 |                                program_part + "_fit_results.json"), "w") as test_file:
51 |             json.dump({key: all_scores[key].mean() for key in all_scores.keys()[:-1]},
52 |                       test_file,
53 |                       indent=JSON_INDENT)
54 |
55 |         # 3) plot model's predictions vs true values
56 |         Plotter.y_test_vs_y_pred(model=model,
57 |                                  x_test=pd.concat([train_data_x, test_data_x]),
58 |                                  y_test=pd.concat([train_data_y, test_data_y]),
59 |                                  save_path=os.path.join(save_dir, program_part + "_target_vs_pred.pdf"))
60 |
61 |         # 4) verify that mae scores are stable
62 |         if run_times > 1:
63 |             Plotter.std_check(data=all_scores["mae"],
64 |                               save_path=os.path.join(os.path.dirname(os.path.dirname(__file__)),
65 |                                                      save_dir,
66 |                                                      program_part + "_mae_stability.pdf"))
67 |         # 5) plot feature importance
68 |         dataset_x = pd.concat([train_data_x, test_data_x])
69 |         dataset_y = pd.concat([train_data_y, test_data_y])
70 |         dataset = pd.concat([dataset_x, dataset_y], axis=1)
71 |
72 |         # if program_part == "tpot":
73 |         #     Plotter.feature_importance(model=model,
74 |         #                                dataset=dataset,
75 |         #                                save_dir=save_dir,
76 |         #                                program_part=program_part,
77 |         #                                simulations=FEATURE_IMPORTANCE_SIMULATION_COUNT)
78 |
79 |         if program_part == "symbolic":
80 |             p_value = all_scores["t_test_p_value"].mean()
81 |             if p_value < SYMBOLIC_P_VALUE_THRESHOLD:
82 |                 continue_to_ebs_flag = True
83 |             else:
84 |                 continue_to_ebs_flag = False
85 |             return continue_to_ebs_flag
86 |
87 |     @staticmethod
88 |     def summaries_symbolic_results(run_times: int,
89 |                                    percent_of_majority: float,
90 |                                    eq_ranking_metric: str,
91 |                                    top_eqs_max_num: int,
92 |                                    save_dir: str):
93 |         """
94 |         Summarize the equations found over all symbolic runs and decide whether the EBS search is needed
95 |         """
96 |         # load data
97 |         eqs = pd.read_csv(os.path.join(save_dir, "symbolic_scoring_history.csv"))["found_eq"]
98 |         eq_ranking = pd.read_csv(os.path.join(save_dir, "symbolic_scoring_history.csv"))[eq_ranking_metric]
99 |         # write summary file:
100 |         with open(os.path.join(save_dir, "symbolic_results_summary.txt"), 'w') as f:
101 |             f.write("Symbolic run count: {}\n\n".format(run_times))
102 |             # check if the program needs to continue to the EBS search
103 |             if list(eqs.value_counts())[0] >= percent_of_majority * run_times:
104 |                 f.write("The function that repeated in {}% of the runs: \n {}\n\n".format(
105 |                     round(list(eqs.value_counts())[0] * 100 / run_times, 2),
106 |                     eqs.value_counts().index[0]  # SymbolicRegressionToLatexText.run(eq=str(eqs.value_counts().index[0]))
107 |                 ))
108 |                 continue_to_ebs_flag = False
109 |             else:
110 |                 f.write("No function was found for at least {}% of the runs\n\n".format(round(percent_of_majority * 100,
111 |                                                                                              2)))
112 |                 continue_to_ebs_flag = True
113 |             # rank the eqs found by metric:
114 |             top_eqs_index = eq_ranking.sort_values(ascending=False)[:top_eqs_max_num].index
115 |             f.write("{} best equations found (according to {} score):\n".format(len(eqs[top_eqs_index].unique()),
116 |                                                                                 eq_ranking_metric))
117 |             for i, eq in enumerate(eqs[top_eqs_index].unique()):
118 |                 f.write("  {}) {}\n".format(i + 1, eq))  # SymbolicRegressionToLatexText.run(eq=str(eq))
119 |         # return whether the EBS search is needed
120 |         return continue_to_ebs_flag
121 |
122 |     @staticmethod
123 |     def ebs_results(model,
124 |                     all_scores: pd.DataFrame,
125 |                     save_dir: str):
126 |         with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), save_dir,
127 |                                "ebs_fit_score.json"), "w") as ebs_fit_score_file:
128 |             answer = {"k_fold": K_FOLD}
129 |             for key in all_scores.keys()[:-1]:
130 |                 answer[key] = all_scores[key].mean()
131 |             json.dump(answer,
132 |                       ebs_fit_score_file,
133 |                       indent=JSON_INDENT)
134 |         # save best fitted model
135 |         with open(os.path.join(os.path.dirname(os.path.dirname(__file__)),
136 |                                save_dir,
137 |                                "ebs_model"),
138 |                   "wb") as ebs_fit_file:
139 |             pickle.dump(model, ebs_fit_file)
140 |
141 |     def __repr__(self):
142 |         return self.__str__()
143 |
144 |     def __str__(self):
145 |         return ""
146 |
--------------------------------------------------------------------------------
/algo/genetic_algorithm_symbolic_fit.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import json
4 | import pandas as pd
5 | from scipy import stats
6 | from sklearn.metrics import r2_score
7 | from sklearn.model_selection import KFold
8 | from gplearn.genetic import SymbolicRegressor
9 |
10 | # project imports
11 | from utills.plotter import Plotter
12 | from utills.fitness_methods import *
13 | from utills.logger_config import Logger
14 | from utills.symbolic_regression_to_latex_text import SymbolicRegressionToLatexText
15 |
16 |
17 | class GASF:
18 |     """
19 |     This class is responsible for generating a symbolic equation
20 |     of a target value from a given set of features, using a
21 |     SymbolicRegressor.
22 |
23 |     The class contains 2 functions:
24 |     1. run: trains a model with k-fold cross validation and returns it fitted
25 |     2. run_and_analyze: applies the run function multiple times
26 |        to gain statistical insight on the performance.
27 |     """
28 |
29 |     # CONSTS #
30 |     DEFAULT_TEST_FIT_FUNCTION = better_symbolic_reg_fitness
31 |     # END - CONSTS #
32 |
33 |     def __init__(self):
34 |         pass
35 |
36 |     @staticmethod
37 |     def run(non_normalized_data: pd.DataFrame,
38 |             generations: int,
39 |             population_size: int,
40 |             k_fold: int,
41 |             performance_metric,
42 |             parsimony_coefficient: float,
43 |             verbose: int,
44 |             expected_eq='Unknown',
45 |             cores: int = -1):
46 |         """
47 |         Run the GASF algorithm with some hyper-parameters.
48 |         Initially the model is trained on a k-fold portion of the data,
49 |         and then on the dataset as a whole.
50 |         The model of the latter case is returned.
51 |         """
52 |         y_col = non_normalized_data.keys()[-1]
53 |         x_values = non_normalized_data.drop([y_col], axis=1)
54 |         y_values = non_normalized_data[y_col]
55 |         # make a k-fold cross validation so we can trust the results better
56 |         kf = KFold(n_splits=k_fold)
57 |         scores = []
58 |         fold_index = 1
59 |         for train_index, test_index in kf.split(x_values):
60 |             # say we do fold
61 |             Logger.print(message=" Symbolic regression {} fold".format(fold_index))
62 |             fold_index += 1
63 |             # prepare data
64 |             X_train, X_test = x_values.iloc[train_index, :], x_values.iloc[test_index, :]
65 |             y_train, y_test = y_values.iloc[train_index], y_values.iloc[test_index]
66 |             # prepare model
67 |             est = SymbolicRegressor(population_size=population_size,
68 |                                     generations=generations,
69 |                                     metric=performance_metric,
70 |                                     n_jobs=cores,
71 |                                     verbose=verbose,
72 |                                     parsimony_coefficient=parsimony_coefficient,
73 |                                     random_state=73)
74 |             est.fit(X_train, y_train)
75 |             y_pred = est.predict(X_test)
76 |             score = performance_metric(y_test, y_pred) if not isinstance(performance_metric, str) else function_mapper[performance_metric](y_test, y_pred)
77 |             scores.append(score)
78 |
79 |         # train a symbolic regression on all the data, it is at least as good as the previous ones
80 |         est = SymbolicRegressor(population_size=population_size,
81 |                                 generations=generations,
82 |                                 n_jobs=cores,
83 |                                 feature_names=non_normalized_data.keys()[:-1],
84 |                                 parsimony_coefficient=parsimony_coefficient,
85 |                                 verbose=verbose,
86 |                                 random_state=73)
87 |         est.fit(x_values, y_values)
88 |         # if we want to compare to the expected EQ.
89 |         if expected_eq != 'Unknown':
90 |             Logger.print(message='Expected eq: {}, Found eq: {} | Found eq as latex: {}'.format(expected_eq,
91 |                                                                                                 est,
92 |                                                                                                 "NA"))  # SymbolicRegressionToLatexText.run(eq=str(est))
93 |         else:
94 |             Logger.print(message='Found eq: {}'.format(est))
95 |         return est
96 |
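    # A hedged usage sketch (mirroring how exp_constant_acceleration calls this
    # class; the DataFrame is assumed to hold the target as its last column and
    # demo/demo.csv to be reachable from the repo root):
    #
    #   df = pd.read_csv("demo/demo.csv")  # columns v0, a, t, v
    #   est = GASF.run(non_normalized_data=df,
    #                  generations=5,
    #                  population_size=50,
    #                  k_fold=5,
    #                  performance_metric=function_mapper["better_symbolic_reg_fitness"],
    #                  parsimony_coefficient=0.02,
    #                  verbose=0,
    #                  expected_eq='add(v0, mul(a, t))')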
97 |     @staticmethod
98 |     def run_and_analyze(run_times: int,
99 |                         non_normalized_data: pd.DataFrame,
100 |                         generations: int,
101 |                         population_size: int,
102 |                         k_fold: int,
103 |                         performance_metric,
104 |                         parsimony_coefficient,  # a float, or a list of floats to grid-search over
105 |                         save_dir: str,
106 |                         expected_eq='Unknown',
107 |                         cores: int = -1):
108 |         """
109 |         Run the GASF algorithm several times and save the results from all runs.
110 |         Returns a pandas dataframe of all results and the best model from
111 |         all runs.
112 |         """
113 |         results = pd.DataFrame()
114 |         y_col = non_normalized_data.keys()[-1]
115 |         x_values = non_normalized_data.drop(y_col, axis=1)
116 |         y_values = non_normalized_data[y_col]
117 |         current_best_wanted_loss = 99999
118 |         best_model = None
119 |         for test in range(run_times):
120 |             Logger.print(message="Symbolic regression run {}".format(test + 1))
121 |             if isinstance(parsimony_coefficient, float) and 0 <= parsimony_coefficient <= 1:
122 |                 fit_model = GASF.run(non_normalized_data=non_normalized_data,
123 |                                      generations=generations,
124 |                                      population_size=population_size,
125 |                                      k_fold=k_fold,
126 |                                      performance_metric=performance_metric,
127 |                                      parsimony_coefficient=parsimony_coefficient,
128 |                                      verbose=1 if test == 0 else 0,
129 |                                      expected_eq=expected_eq,
130 |                                      cores=cores)
131 |             elif isinstance(parsimony_coefficient, list) and len(parsimony_coefficient) > 0 and all([isinstance(val, float) for val in parsimony_coefficient]):
132 |                 best_score = 99999
133 |                 best_inner_model = None
134 |                 best_parsimony_coefficient = 0
135 |                 score_history = {}
136 |                 for parsimony_coefficient_val in parsimony_coefficient:
137 |                     fit_model = GASF.run(non_normalized_data=non_normalized_data,
138 |                                          generations=generations,
139 |                                          population_size=population_size,
140 |                                          k_fold=k_fold,
141 |                                          performance_metric=performance_metric,
142 |                                          parsimony_coefficient=parsimony_coefficient_val,
143 |                                          verbose=1 if test == 0 else 0,
144 |                                          expected_eq=expected_eq,
145 |                                          cores=cores)
146 |                     try:
147 |                         this_score = performance_metric(y_values, fit_model.predict(x_values))
148 |                     except Exception as error:
149 |                         this_score = GASF.DEFAULT_TEST_FIT_FUNCTION(y_values, fit_model.predict(x_values))
150 |                     score_history[parsimony_coefficient_val] = this_score
151 |                     if this_score < best_score:  # lower loss is better, matching the 99999 init above
152 |                         best_score = this_score
153 |                         best_inner_model = fit_model
154 |                         best_parsimony_coefficient = parsimony_coefficient_val
155 |                 # save grid search results
156 |                 Logger.print("The best parsimony_coefficient value is: {}".format(best_parsimony_coefficient))
157 |                 with open(os.path.join(save_dir, "parsimony_coefficient_grid_search.json"), "w") as grid_search_value:
158 |                     json.dump(score_history, grid_search_value)
159 |                 # continue with the best model
160 |                 fit_model = best_inner_model
161 |
162 |             else:
163 |                 raise ValueError("The parsimony_coefficient argument must be either a float between 0 and 1 or a non-empty list of floats")
164 |             pred = fit_model.predict(x_values)
165 |             # save test scores
166 |             try:
167 |                 wanted_loss = performance_metric(y_values, pred)
168 |             except Exception as error:
169 |                 wanted_loss = GASF.DEFAULT_TEST_FIT_FUNCTION(y_values, pred)
170 |             results.at[test, "wanted_loss"] = wanted_loss
171 |             results.at[test, "mae"] = mean_absolute_error(y_values, pred)
172 |             results.at[test, "mse"] = mean_squared_error(y_values, pred)
173 |             results.at[test, "r2"] = r2_score(y_values, pred)
174 |             results.at[test, "t_test_p_value"] = stats.ttest_ind(y_values, pred)[1]
175 |             results.at[test, "found_eq"] = fit_model
176 |             if wanted_loss < current_best_wanted_loss:
177 |                 best_model = fit_model
178 |                 current_best_wanted_loss = wanted_loss
179 |
180 |         # print and save scoring results of all runs
181 |         Logger.print(message="Finished all symbolic runs - ")
182 |         [Logger.print(message="{}: {:.3} +- {:.3}".format(score, results[score].mean(), results[score].std()))
183 |          for score in ["mae", "mse", "r2", "t_test_p_value"]]
184 |         return results, best_model
185 |
186 |     def __repr__(self):
187 |         return self.__str__()
188 |
189 |     def __str__(self):
190 |         return 
"" 191 | -------------------------------------------------------------------------------- /demo/demo.csv: -------------------------------------------------------------------------------- 1 | v0,a,t,v 2 | 3.43,7.47,4.46,36.75 3 | 11.02,6.68,1.63,22.13 4 | 9.29,7.99,6.44,60.14 5 | 14.05,5.71,2.37,28.13 6 | 7.64,7.12,6.7,54.79 7 | 17.61,5.91,5.37,49.84 8 | 18.87,7.08,9.66,87.26 9 | 17.07,6.66,0.61,21.56 10 | 10.69,8.26,9.51,89.24 11 | 13.26,6.58,3.18,34.53 12 | 14.57,6.49,9.1,75.1 13 | 11.87,5.93,6.01,47.51 14 | 11.14,6.23,6.37,50.32 15 | 7.94,7.84,4.03,39.14 16 | 16.74,7.8,2.91,39.83 17 | 11.76,7.11,5.54,50.64 18 | 4.82,8.9,0.26,7.21 19 | 4.99,8.78,6.87,65.96 20 | 6.49,6.85,8.4,64.67 21 | 12.98,8.44,6.29,66.73 22 | 8.05,5.71,3.94,30.85 23 | 6.46,5.96,1.42,15.07 24 | 12.62,7.58,6.47,62.28 25 | 17.22,7.61,9.16,87.8 26 | 3.25,5.85,9.24,58.45 27 | 9.32,7.56,1.36,19.6 28 | 2.04,6.94,4.5,33.27 29 | 14.61,8.15,5.52,59.6 30 | 16.36,5.47,4.61,40.75 31 | 2.53,6.28,6.8,44.33 32 | 14,5.17,3.31,31.11 33 | 18.04,7.39,4.45,51.94 34 | 7.44,8.17,4.39,43.74 35 | 2.24,5.88,5.48,34.12 36 | 11.37,7.21,9.73,80.71 37 | 2.9,5.96,3.09,21.1 38 | 14.23,7.93,8.16,79.73 39 | 15.46,8.94,7.18,78.06 40 | 18.21,6.68,8.91,76.95 41 | 1.92,7.38,0.28,4.07 42 | 11.85,6.87,2.43,28.26 43 | 13.8,8.28,8.56,83.83 44 | 6.06,6.85,5.27,42.58 45 | 17.72,6.51,5.46,53.26 46 | 19.54,7.01,0.56,23.23 47 | 18.06,8.54,4.12,52.18 48 | 1.59,7.79,4.54,36.96 49 | 1.53,5.14,1.27,7.98 50 | 3.06,6.34,4.44,30.59 51 | 10.89,7.15,1.79,23.45 52 | 4.76,8.61,7.16,67.07 53 | 9.51,7.36,9.71,80.17 54 | 2.5,7.55,3.36,27.59 55 | 17.17,7.51,8.82,82.57 56 | 18.42,8.21,5.35,61.72 57 | 8.02,8.22,2.77,31.41 58 | 19.13,8.83,7.18,80.88 59 | 14.27,5.9,7.58,58.4 60 | 19.94,8.82,0.3,22.36 61 | 13.38,5.41,6.29,46.93 62 | 5.88,6.4,2.19,20.09 63 | 2.73,8.33,5.65,49.79 64 | 12.09,6.31,3.17,31.77 65 | 14.31,8.13,4.11,47.72 66 | 13.76,8.6,1.21,23.68 67 | 3.51,7.78,8.82,72.85 68 | 4.54,7.64,5.9,49.62 69 | 2.96,5.07,4.45,25.52 70 | 16.42,8,4.28,51.17 71 | 4.19,8.17,5.62,50.61 72 | 19.14,5.67,6.69,57.64 73 | 3.44,5.39,0.17,4.27 74 | 14.61,6.41,9.65,75.7 75 | 8.01,5.7,6.73,46.37 76 | 18.11,7.91,2.8,39.45 77 | 4.92,7.23,8.28,65.43 78 | 4.42,5.01,1.97,14.43 79 | 16.31,6.57,8.77,72.45 80 | 12.88,5.46,0.31,14.72 81 | 17.03,8.31,1.15,26.59 82 | 10.18,7.34,5.77,52.53 83 | 5.99,6.47,0.85,11.49 84 | 11.45,8.74,6.84,70.52 85 | 1.13,6.85,4.12,29.35 86 | 13.8,6.62,4.63,44.01 87 | 7.61,7.62,6.35,56.56 88 | 1.88,5.32,0.98,7.24 89 | 5.04,6.61,7.47,53.87 90 | 3.31,7.68,4.81,39.45 91 | 3,8.68,7.12,64.15 92 | 7.97,7.11,5.33,45.87 93 | 3.86,8.9,2.7,28.17 94 | 1.16,7.72,0.69,6.42 95 | 3.12,7.25,3.3,26.5 96 | 4.99,8.77,7.62,73.25 97 | 17.43,6.22,1.23,25.08 98 | 10.82,5.32,5.12,38.06 99 | 2.96,5.15,6.05,34.8 100 | 18.77,8.08,4.71,57.4 101 | 3.09,8.32,2.83,26.64 102 | 8.23,5.53,1.37,15.65 103 | 4.2,8.31,9.55,82.72 104 | 18.56,6.18,6.6,58.75 105 | 1.2,6.48,4.33,28.97 106 | 13.22,6.27,4.98,44.44 107 | 5.12,8.65,2.28,24.35 108 | 14.64,7.89,9.68,90.11 109 | 15.33,6.65,0.34,17.24 110 | 12.87,6.82,6.37,57.44 111 | 1.59,5.46,7.76,43.52 112 | 8.52,8.38,5.08,50.07 113 | 1.85,5.17,5.01,28.03 114 | 9.97,6.6,6.5,53.4 115 | 14.62,6.25,9.44,74.36 116 | 0.54,7.89,1.56,12.59 117 | 2.53,5.18,4.18,24.42 118 | 0.16,5.47,8.42,45.29 119 | 12.65,7.18,1.89,25.96 120 | 19.31,5.23,5.29,47.45 121 | 0.99,5.49,5.95,33.99 122 | 15.15,7.47,4.11,45.39 123 | 14.52,6.58,6.21,54.83 124 | 2.09,8.4,9.98,85.92 125 | 18.25,7.42,7.34,72.71 126 | 11.77,7.42,8.32,72.77 127 | 7.18,6.25,5.36,41.09 128 | 7.27,5.27,0.87,11.62 129 | 
11.89,6.99,4.89,46.07 130 | 3.22,5.13,5.65,32.53 131 | 6.21,8.43,5.52,52.22 132 | 12.44,6.86,6.5,57.6 133 | 6.31,7.6,5.45,48.68 134 | 9.68,8.95,0.13,10.84 135 | 1.56,6.45,4.18,29.09 136 | 13.13,7.49,1.53,25.08 137 | 15.53,5.17,6.81,51.75 138 | 9.67,6.13,9.96,72.14 139 | 16.61,7.93,1.77,31.26 140 | 16.24,7.92,9.18,88.95 141 | 17.13,8.88,8.04,87.64 142 | 18.7,6.73,2.83,37.75 143 | 19.49,6.93,2.32,34.86 144 | 17.88,6.72,8.25,74.79 145 | 5.04,6.06,2.63,20.98 146 | 5.01,7.01,0.88,11.4 147 | 8.87,6.78,6.89,55.58 148 | 5.61,8.32,8.49,75.48 149 | 2.42,7.06,0.16,3.55 150 | 18.85,8.21,6.7,75.33 151 | 8.09,6.78,4.4,37.54 152 | 13.61,6.47,2.86,32.11 153 | 16.23,5.85,1.81,27.35 154 | 11.85,5.42,9.22,61.2 155 | 9.44,6.06,1.24,17.29 156 | 2.8,5.83,1.5,11.31 157 | 15.43,5.73,2.55,30.04 158 | 19.97,6.9,5.1,54.61 159 | 3.92,7.83,5.82,49.99 160 | 8,7.9,7.5,67.92 161 | 14.92,6.73,9.24,76.33 162 | 18.94,8.52,0.58,23.88 163 | 2.18,7.84,5.41,44.15 164 | 8.46,8.88,2.37,30.1 165 | 0.66,8.92,9.24,84.74 166 | 10.84,7.72,8.05,72.99 167 | 5.45,8.46,2.73,28.26 168 | 0.93,6.44,7.23,48.44 169 | 4.25,8.02,5.51,48.44 170 | 13.56,7.48,0.95,20.25 171 | 11.66,5.49,6.97,49.93 172 | 3.38,6.07,7.21,48.09 173 | 0.17,5.07,8.34,42.88 174 | 11.78,8.54,0.4,14.89 175 | 13.67,5.69,8.56,61.13 176 | 4.34,6.7,3.63,28.95 177 | 1.75,8.04,3.18,27.59 178 | 19.59,5.79,1.17,26.89 179 | 10.96,6.07,2.81,28.3 180 | 18.37,8.58,0.98,26.51 181 | 13.95,7.25,8.78,76.83 182 | 11.33,5.04,3.69,30.23 183 | 11.33,8.22,4.29,46.13 184 | 3.84,5.04,5.43,31.52 185 | 19.81,5.61,9.34,72.93 186 | 5.24,7.93,7.3,63.13 187 | 12.57,5.49,1.35,20.18 188 | 6.3,6.67,0.52,9.96 189 | 14.56,8.73,1.71,29.49 190 | 2.15,6.4,8.73,58.02 191 | 1.28,7.23,6.26,47.47 192 | 14.1,8.22,7.92,79.99 193 | 5.77,8.93,9.51,90.69 194 | 12.91,7.02,1.79,25.99 195 | 9.01,5.68,3.27,27.31 196 | 4.05,7.69,9.85,79.8 197 | 13.2,5.32,9.32,63.41 198 | 2.23,5.97,3.03,20.12 199 | 2.85,7.58,3.89,32.01 200 | 6.78,6.97,9.74,74.67 201 | 0.68,7.02,0.76,5.96 202 | 4.97,6.31,4.04,30.16 203 | 14.16,6.76,2.65,32.07 204 | 2.4,8.97,6.27,58.64 205 | 19.91,5.44,1.62,28.44 206 | 13.32,7.81,0.74,19.1 207 | 11.54,6.22,2.82,28.79 208 | 5.79,6.6,6.87,50.62 209 | 10.13,6.36,7.98,60.27 210 | 9.29,6.77,8.09,62.78 211 | 0.18,6.31,1.46,9.3 212 | 4.63,6.56,1.51,14.54 213 | 19.8,7.05,9.94,88.98 214 | 19.76,7.57,0.3,22.25 215 | 12.17,6.79,4.13,39.41 216 | 11.95,6.47,5.39,47.29 217 | 8.96,8.37,6.59,64.76 218 | 2.92,8.32,5.38,46.73 219 | 9.43,6.22,9.83,69.16 220 | 13.67,7.55,8.18,76.18 221 | 1.42,8.88,8.23,74.5 222 | 0.76,8.38,9.68,83.52 223 | 11.77,8.1,10,90.91 224 | 4.19,8.34,7.94,70.41 225 | 8.62,5.93,7.24,52.07 226 | 8.55,7.4,5.4,48.02 227 | 1.6,8.91,5.5,50.1 228 | 1.16,6.53,7.54,49.89 229 | 14.32,7.65,8.1,77.05 230 | 14.94,5.39,0.34,16.94 231 | 1.1,5.41,4.55,25.46 232 | 19.45,5.05,0.66,22.56 233 | 13.12,5.49,0.93,18.41 234 | 6.89,5.95,4.84,35.33 235 | 4.7,6.71,7.1,52.86 236 | 11.17,6.99,8.56,70.29 237 | 1.23,8.06,2.13,18.58 238 | 3.24,6.58,7.82,55.79 239 | 13.5,7.73,3.02,36.48 240 | 17.45,8.17,2.87,40.08 241 | 2.57,6.38,1.21,10.08 242 | 11.84,7,9.27,77.5 243 | 19.04,5.49,4.34,43.3 244 | 14.11,7.6,8.26,78.42 245 | 5.9,8.47,1.34,17.25 246 | 15.62,8,2.29,34.28 247 | 13.02,6.31,6.13,52.22 248 | 14.61,8.87,1.71,29.48 249 | 3.87,8.4,3.24,30.46 250 | 15.39,5.17,0.35,17.2 251 | 16.34,7.19,1.86,29.42 252 | 1.14,5.73,1.78,11.45 253 | 8.13,8.07,5.04,49.29 254 | 7.49,5.08,8.27,49.01 255 | 8.75,6.19,4.24,35 256 | 19.21,7.81,6.5,68.58 257 | 15.95,5.99,9.04,70.8 258 | 12.43,8.83,9.52,95.53 259 | 9.37,7.85,2.13,26.09 260 | 7.47,5.36,8.07,50.73 
261 | 11.64,6.94,3.15,33.17 262 | 17,6.81,2.11,31.68 263 | 5.17,8.59,0.65,10.65 264 | 12.2,7.92,9.67,88.79 265 | 10.49,6,8,59.07 266 | 11.71,8.66,7.96,79.84 267 | 14.1,8.77,0.94,22.57 268 | 5.67,6.18,4.34,32.49 269 | 19.28,7.16,9.03,84.77 270 | 8.79,5.04,0.86,13.12 271 | 4.08,8.8,2.08,22.16 272 | 1.94,6.24,5.52,36.38 273 | 3.11,5.65,8.01,47.88 274 | 18.89,7.95,9.18,90.03 275 | 6.73,6.96,1.84,19.73 276 | 7.97,5.92,1.01,14.23 277 | 0.88,6.29,4.27,28.29 278 | 4.01,5.88,0.62,7.66 279 | 1.53,5.81,3.25,20.21 280 | 0.59,5.02,6.07,31.37 281 | 16.18,5.05,2.76,30.42 282 | 7.29,8.33,7.2,67.94 283 | 2.71,8.29,4.52,40.18 284 | 15.28,5.51,5.43,45.65 285 | 7.31,5.79,5.06,36.97 286 | 5.97,7.09,3.32,28.92 287 | 9.1,7.59,0.05,9.48 288 | 13.55,5.39,0.59,16.73 289 | 6.16,5.37,8.86,54.28 290 | 17.52,5.15,6.46,50.79 291 | 17.09,6.62,1.94,30.23 292 | 1.85,8.77,2.25,21.8 293 | 16.37,8.36,4.63,55.08 294 | 5.89,7.44,4.8,41.19 295 | 2.43,5.86,4.65,30.27 296 | 4.41,7.39,9.41,72.47 297 | 19.67,6.16,9.97,81.09 298 | 10.39,6.75,7.81,63.11 299 | 18.78,6.23,6.8,60.53 300 | 19.95,5.45,1.04,25.36 301 | 5.19,8.37,5.2,48.71 302 | 5.19,5.15,9.68,55.59 303 | 14.29,6.2,4.05,39.79 304 | 13.1,8.48,0.29,15.4 305 | 12.34,7.06,7.05,60.87 306 | 7.65,7.12,6.23,50.97 307 | 12.88,7.59,9.96,86.71 308 | 19.84,5.42,0.83,24.58 309 | 8.93,6.35,9.03,67.6 310 | 10.79,8.93,6.91,73.95 311 | 9.04,7.74,8.03,70.48 312 | 17.81,7.51,2.91,40.06 313 | 1.61,8.81,8.63,77.64 314 | 10.11,5.2,6.19,41.88 315 | 9.57,5.25,6.75,45.46 316 | 14.85,5.85,6.74,54.28 317 | 15.63,5.34,1.84,25.71 318 | 6.26,6.35,6.31,45.87 319 | 9.58,5.19,6.52,43.85 320 | 14.55,6.25,2.08,27.55 321 | 14.62,6.83,7.38,65.68 322 | 18.53,8.11,8.77,91.45 323 | 16.29,6.49,7.97,68.02 324 | 13.87,7.2,2.24,30.6 325 | 16.88,6.67,8.92,77.14 326 | 13.92,5.64,9.21,66.52 327 | 16.13,8.44,5.75,65.31 328 | 15.89,8.29,6.83,73.24 329 | 14.72,8.19,3.58,44.48 330 | 9.25,7.49,6.67,58.62 331 | 8.32,6.8,4.76,41.09 332 | 19.5,5.16,5.99,49.4 333 | 3.77,7.7,6.83,55.8 334 | 15.85,5.97,6.84,57.25 335 | 17.86,6.93,4.69,51.37 336 | 18.85,8.77,3.1,45.58 337 | 10.19,8.8,3.3,39.23 338 | 8.94,8.75,3.8,43.03 339 | 8.39,5.15,6.61,43.28 340 | 7.18,5.64,8.25,54.25 341 | 3.36,6.18,0.14,4.31 342 | 1,7.64,7.39,58.61 343 | 4.9,8.91,5.84,56.93 344 | 4.89,6.06,9.27,60.46 345 | 10.84,8.52,4.52,48.86 346 | 19.23,8.42,2.61,41.21 347 | 15.57,5.95,3.42,35.92 348 | 7.85,8.84,4.57,47.77 349 | 10.95,7.55,6.56,59.87 350 | 6.85,8.31,7.46,67.47 351 | 18.28,6.65,8.09,72.8 352 | 19.59,5.35,0.13,20.49 353 | 17.52,7.71,1.12,26.42 354 | 3.55,8.29,7.51,65.15 355 | 6.53,6.07,0.81,11.33 356 | 7.45,5.86,7.12,48.68 357 | 0.74,5.47,0.93,5.77 358 | 13.95,8.11,8.29,81.18 359 | 11.35,8.21,0.7,16.76 360 | 1.86,5.83,5.73,34.91 361 | 8.73,5.22,9.82,60.59 362 | 15.84,8.31,6.1,66.53 363 | 11.09,5.49,4.31,35.1 364 | 15.6,5.61,7.61,57.13 365 | 0.29,5.39,0.91,5.19 366 | 12.33,6.17,7.2,56.75 367 | 12.1,6.79,5.6,50.12 368 | 11.42,8.29,1.26,21.65 369 | 8.36,8.97,8.04,81.28 370 | 2.69,6.41,1.27,10.72 371 | 16.13,6.5,7.75,67.17 372 | 3.03,8.44,6.16,53.92 373 | 15.22,8,9.1,88.9 374 | 12,6.62,5.04,44.46 375 | 6.3,8.28,5.34,50.52 376 | 9.28,8.13,3.2,34.94 377 | 2.98,5.63,0.38,5.17 378 | 15.37,6.76,7.72,67.56 379 | 9.78,6.41,2.78,27.32 380 | 19.73,5.55,0.32,21.51 381 | 19.64,5.56,8.05,65.69 382 | 18.97,7.39,2.4,37.44 383 | 0.36,5.1,5.52,28.51 384 | 4.71,6.94,5.24,40.66 385 | 13.31,8.08,5.12,55.23 386 | 12.1,7.3,8.43,74.38 387 | 6.99,6.93,3.35,30.81 388 | 6.18,5.11,0.95,10.92 389 | 5.04,7.22,2.66,23.76 390 | 5.14,5.59,8.12,50.53 391 | 11.94,7.41,5.37,51.73 392 | 
4.82,6.4,0.18,5.97
393 | 15.87,8.38,7.42,77.27
394 | 8.38,8.07,2.35,27.34
395 | 13.94,6.66,4.87,45.91
396 | 5.13,8.76,1.87,21.08
397 | 17.62,8.29,6.02,66.85
398 | 5.8,7.73,3.11,29.54
399 | 15.5,7.27,0.86,21.97
400 | 18.29,8.89,8.42,95.01
401 | 16.86,7.05,5.59,56.27
--------------------------------------------------------------------------------
/utills/plotter.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import matplotlib
3 | import numpy as np
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 | from sklearn.metrics import r2_score
7 | from sklearn.linear_model import LinearRegression
8 | from sklearn.model_selection import train_test_split
9 |
10 | # project imports
11 | from utills.consts import *
12 | from utills.logger_config import Logger
13 |
14 | # fix for windows
15 | matplotlib.use('Agg')
16 |
17 |
18 | class Plotter:
19 |     """
20 |     A plotter class for the results of the model
21 |     """
22 |
23 |     def __init__(self):
24 |         pass
25 |
26 |     @staticmethod
27 |     def noise_graph(noise_range: list,
28 |                     y_list: dict,
29 |                     save_path: str):
30 |         colors = ["red", "blue", "green"]
31 |         symbols = ["o", "*", "P"]
32 |
33 |         fig = plt.figure(figsize=(DEFAULT_FIG_SIZE, DEFAULT_FIG_SIZE))
34 |         # calc predictions and save to csv
35 |         index = 0
36 |         for name, y in y_list.items():
37 |             plt.plot(noise_range,
38 |                      y,
39 |                      "-{}".format(symbols[index]),
40 |                      color=colors[index],
41 |                      markersize=5,  # plt.plot takes markersize; scatter's s= kwarg raises here
42 |                      alpha=0.5,
43 |                      label="{}".format(name))
44 |             index += 1
45 |         # set parameters and save plot
46 |         plt.xlim((min(noise_range), max(noise_range)))
47 |         plt.ylim((0, 1))
48 |         plt.xlabel("Noise Level", fontsize=16)
49 |         plt.ylabel("Success rate", fontsize=16)
50 |         plt.legend(frameon=True, fontsize=13)
51 |         plt.grid(alpha=0.5)
52 |         ax = plt.gca()
53 |         ax.yaxis.set_ticks_position('left')
54 |         ax.xaxis.set_ticks_position('bottom')
55 |         ax.spines['right'].set_visible(False)
56 |         ax.spines['top'].set_visible(False)
57 |         plt.savefig(save_path, dpi=DEFAULT_DPI)
58 |         plt.close()
59 |
60 |     @staticmethod
61 |     def y_test_vs_y_pred(model,
62 |                          x_test,
63 |                          y_test,
64 |                          save_path: str):
65 |         fig = plt.figure(figsize=(DEFAULT_FIG_SIZE, DEFAULT_FIG_SIZE))
66 |         # calc predictions and save to csv
67 |         y_pred = model.predict(x_test)
68 |         pd.DataFrame({'y_pred': y_pred, 'y_true': y_test}).to_csv(save_path[:-4] + '.csv', index=False)
69 |         pts_range = (min([min(y_test), min(y_pred)]), max([max(y_test), max(y_pred)]))
70 |         # plot predictions against actual values
71 |         y_test = np.array(y_test).reshape(-1, 1)
72 |         lg = LinearRegression().fit(y_test, y_pred)
73 |         r2 = lg.score(y_test, y_pred)
74 |         plt.scatter(x=y_test,
75 |                     y=y_pred,
76 |                     color="blue",
77 |                     s=20,
78 |                     alpha=0.5)
79 |         # plot y_pred = y_true for ref
80 |         plt.plot([min(y_test), max(y_test)],
81 |                  [min(y_test), max(y_test)],
82 |                  "-",
83 |                  color="black",
84 |                  linewidth=1,
85 |                  alpha=0.75)
86 |         # plot actual y_pred = f(y_true) relation
87 |         plt.plot([min(y_test), max(y_test)],
88 |                  [lg.predict([min(y_test)])[0], lg.predict([max(y_test)])[0]],
89 |                  "--",
90 |                  color="gray",
91 |                  linewidth=2,
92 |                  alpha=0.75,
93 |                  label="$R^2$ = " + str(round(r2, 3)) + " | $y_{pred} = y_{exp} * " + str(
94 |                      round(lg.coef_[0], 3)) + " + " + str(round(lg.intercept_, 3)) + "$")
95 |         # set parameters and save plot
96 |         plt.xlim(pts_range)
97 |         plt.ylim(pts_range)
98 |         plt.xlabel("True value", fontsize=16)
99 |         plt.ylabel("Predicted value", fontsize=16)
100 |         plt.legend(frameon=True, fontsize=13)
101 |         plt.grid(alpha=0.5)
102 |         ax = plt.gca()
103 |         ax.yaxis.set_ticks_position('left')
104 |         ax.xaxis.set_ticks_position('bottom')
105 |         ax.spines['right'].set_visible(False)
106 |         ax.spines['top'].set_visible(False)
107 |         plt.savefig(save_path, dpi=DEFAULT_DPI)
108 |         plt.close()
109 |
110 |     @staticmethod
111 |     def parameter_sensitivity_graph(model,
112 |                                     baseline_x: list,
113 |                                     parameter_col_index: int,
114 |                                     parameter_start_range: float,
115 |                                     parameter_end_range: float,
116 |                                     parameter_steps_count: int,
117 |                                     parameter_name: str,
118 |                                     target_name: str,
119 |                                     save_path: str):
120 |         fig = plt.figure(figsize=(DEFAULT_FIG_SIZE, DEFAULT_FIG_SIZE))
121 |         x_data = []
122 |         x_values = []
123 |         # prepare data
124 |         step_size = (parameter_end_range - parameter_start_range) / parameter_steps_count
125 |         for i in range(parameter_steps_count):
126 |             new_row = baseline_x.copy()
127 |             new_row[parameter_col_index] = parameter_start_range + i * step_size
128 |             x_values.append(new_row[parameter_col_index])
129 |             x_data.append(new_row)
130 |         df = pd.DataFrame(x_data)
131 |         y_pred = model.predict(df)
132 |         plt.plot(x_values,
133 |                  y_pred,
134 |                  "-o",
135 |                  color="black")
136 |         plt.xlim((parameter_start_range, parameter_end_range))
137 |         plt.ylim((min(y_pred), max(y_pred)))
138 |         plt.xlabel(parameter_name, fontsize=16)
139 |         plt.ylabel(target_name, fontsize=16)
140 |         ax = plt.gca()
141 |         plt.grid()
142 |         ax.yaxis.set_ticks_position('left')
143 |         ax.xaxis.set_ticks_position('bottom')
144 |         plt.savefig(save_path, dpi=DEFAULT_DPI)
145 |         plt.close()
146 |
147 |     @staticmethod
148 |     def std_check(data, save_path):
149 |         # calc std on increasing num of samples
150 |         vals = []
151 |         for i in data.index[1:]:
152 |             vals.append(float(data.loc[:i].std()))
153 |         # plot std development over iterations
154 |         fig = plt.figure(figsize=(DEFAULT_FIG_SIZE, DEFAULT_FIG_SIZE))
155 |         plt.scatter(x=range(1, 1 + len(vals)),
156 |                     y=vals,
157 |                     color="blue",
158 |                     s=20,
159 |                     alpha=0.5)
160 |         plt.axhline(y=vals[-1] * (1 + REL_ERR_OF_STD),
161 |                     linestyle="--",
162 |                     color="red",
163 |                     linewidth=1)
164 |         plt.axhline(y=vals[-1] * (1 - REL_ERR_OF_STD),
165 |                     linestyle="--",
166 |                     color="red",
167 |                     linewidth=1)
168 |         # set parameters and save plot
169 |         plt.xlim([1, len(vals)])
170 |         plt.xlabel("Iteration", fontsize=16)
171 |         plt.ylabel("Standard Deviation", fontsize=16)
172 |         plt.grid(alpha=0.5)
173 |         ax = plt.gca()
174 |         ax.yaxis.set_ticks_position('left')
175 |         ax.xaxis.set_ticks_position('bottom')
176 |         ax.spines['right'].set_visible(False)
177 |         ax.spines['top'].set_visible(False)
178 |         plt.savefig(save_path, dpi=DEFAULT_DPI)
179 |         plt.close()
180 |
181 |     @staticmethod
182 |     def feature_importance(model,
183 |                            dataset: pd.DataFrame,
184 |                            save_dir: str,
185 |                            program_part: str,
186 |                            simulations: int = 100):
187 |         # alert user
188 |         Logger.print("\nTest feature importance with best {} ML model:".format(program_part))
189 |         fig = plt.figure(figsize=(DEFAULT_FIG_SIZE, DEFAULT_FIG_SIZE))
190 |         # create a df to save r2 scores of simulations
191 |         sim_results = pd.DataFrame()
192 |         y_col = dataset.keys()[-1]
193 |         for sim in range(simulations):
194 |             # prepare data
195 |             train_Xs, test_Xs, train_y, test_y = train_test_split(dataset.drop(y_col, axis=1),
196 |                                                                   dataset[y_col],
197 |                                                                   shuffle=True)
198 |             # train & test model on data
199 |             sim_model = model
200 |             sim_model.fit(train_Xs, train_y)
201 |             pred = sim_model.predict(test_Xs)
202 |             sim_results.at[sim, 'r2'] = r2_score(test_y, pred)
203 |             # check r2 loss on data without a specific feature
204 |             for feature in train_Xs.keys():
205 |                 new_train_Xs, new_test_Xs = train_Xs.drop(feature, axis=1), test_Xs.drop(feature, axis=1)
206 |                 new_model = model
207 |                 new_model.fit(new_train_Xs, train_y)
208 |                 new_pred = new_model.predict(new_test_Xs)
209 |                 sim_results.at[sim, '{}_r2_loss'.format(feature)] = sim_results.loc[sim, 'r2'] - \
210 |                                                                     r2_score(test_y, new_pred)
211 |         # save all r2 scores
212 |         sim_results.to_csv(os.path.join(os.path.dirname(os.path.dirname(__file__)), save_dir,
213 |                                         "{}_feature_importance.csv".format(program_part)),
214 |                            index=False)
215 |         # prepare new df of averaged feature importance according to r2 loss
216 |         f_importance = pd.DataFrame()
217 |         for i, f in enumerate(train_Xs.keys()):
218 |             name = f + '_r2_loss'
219 |             f_importance.at[i, 'feature'] = f
220 |             f_importance.at[i, ['r2_loss', 'r2_err']] = sim_results[name].mean(), sim_results[name].std()
221 |         # plot data
222 |         f_importance.plot.barh(x='feature',
223 |                                y='r2_loss',
224 |                                xerr=f_importance['r2_err'].T.values,
225 |                                color="grey")
226 |         plt.ylabel("Feature Name", fontsize=16)
227 |         plt.xlabel("$R^2$ Loss Without Feature", fontsize=16)
228 |         ax = plt.gca()
229 |         plt.grid(axis='x')
230 |         ax.yaxis.set_ticks_position('left')
231 |         ax.xaxis.set_ticks_position('bottom')
232 |         plt.savefig(os.path.join(os.path.dirname(os.path.dirname(__file__)), save_dir,
233 |                                  "{}_feature_importance.pdf".format(program_part)),
234 |                     dpi=DEFAULT_DPI)
235 |         plt.close()
236 |
237 |     def __repr__(self):
238 |         return self.__str__()
239 |
240 |     def __str__(self):
241 |         return ""
--------------------------------------------------------------------------------
/experiments/exp_constant_acceleration.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import time
4 | import pandas as pd
5 | from datetime import timedelta
6 | from sklearn.metrics import mean_squared_error
7 | from sklearn.model_selection import train_test_split
8 |
9 | # project imports
10 | from utills.consts import *
11 | from utills.fitness_methods import *
12 | from utills.logger_config import Logger
13 | from algo.equation_brute_search import EBS
14 | from utills.result_tracker import ResultTracker
15 | from algo.multi_tpot_analysis import MultiTPOTrunner
16 | from algo.genetic_algorithm_symbolic_fit import GASF
17 | from data_generators.constant_acceleration_data_generator import ConstantAccelerationDataGenerator
18 |
19 |
20 | class ExpConstantAcceleration:
21 |     """
22 |     Program receives a dataset with all essential features needed
23 |     to deduce a "noisy" target (momentary velocity).
24 |
25 |     Success of both numerical and analytical parts of the program proves
26 |     that the program is able to learn a simple linear relation between
27 |     features, even with noisy data.
28 |     """
29 |
30 |     def __init__(self):
31 |         pass
32 |
33 |     @staticmethod
34 |     def run(numerical_bool: bool,
35 |             analytical_bool: bool,
36 |             force_ebs_bool: bool):
37 |         """
38 |         Entry point
39 |         """
40 |         # config logging
41 |         start_time = time.time()
42 |
43 |         # prepare IO
44 |         os.makedirs(os.path.join(os.path.dirname(os.path.dirname(__file__)), CONST_ACCELERATION_RESULTS_FOLDER_NAME),
45 |                     exist_ok=True)
46 |         Logger(os.path.join(os.path.dirname(os.path.dirname(__file__)),
47 |                             CONST_ACCELERATION_RESULTS_FOLDER_NAME,
48 |                             "run.log"))
49 |
50 |         # 1) generate data
51 |         data_path = os.path.join(os.path.dirname(__file__), "..", "data",
52 |                                  "constant_acceleration_data_" + str(CONST_ACCELERATION_NUM_SAMPLES) + "_samples.csv")
53 |         ConstantAccelerationDataGenerator.generate(samples=CONST_ACCELERATION_NUM_SAMPLES,
54 |                                                    a_range=(5, 9),
55 |                                                    t_range=(0, 10),
56 |                                                    v0_range=(0, 20),
57 |                                                    noise_range=CONST_ACCELERATION_NOISE_RANGE,
58 |                                                    save_path=data_path)
59 |         # 1.1) load data, normalize and split
60 |         df = pd.read_csv(data_path)
61 |         Logger.print('Generated data:\n{}'.format(df.describe()))
62 |         y_col = df.keys()[-1]
63 |         normalized_df = (df - df.min()) / (df.max() - df.min())
64 |         train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(normalized_df.drop([y_col], axis=1),
65 |                                                                                 normalized_df[y_col],
66 |                                                                                 shuffle=True,
67 |                                                                                 test_size=CONST_ACCELERATION_TEST_SIZE_PORTION,
68 |                                                                                 random_state=RANDOM_STATE)
69 |         # 1.2) log elapsed time
70 |         data_end_time = time.time()
71 |         Logger.print(" --- Finished. Elapsed time: {} ---".format(timedelta(seconds=data_end_time - start_time)))
72 |
73 |         # 2) continue to the MultiTPOTrunner regression
74 |         Logger.print('Training MultiTPOTrunner:')
75 |         if numerical_bool:
76 |             # 2.1) find the best ML model for data
77 |             all_t_scores, best_t_model = MultiTPOTrunner.run_and_analyze(run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES,
78 |                                                                          train_data_x=train_data_x,
79 |                                                                          train_data_y=train_data_y,
80 |                                                                          test_data_x=test_data_x,
81 |                                                                          test_data_y=test_data_y,
82 |                                                                          generations=CONST_ACCELERATION_NUMERICAL_GENERATION_COUNT,
83 |                                                                          population_size=CONST_ACCELERATION_NUMERICAL_POP_SIZE,
84 |                                                                          k_fold=K_FOLD,
85 |                                                                          performance_metric=neg_mean_squared_error_scorer,
86 |                                                                          save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)),
87 |                                                                                                CONST_ACCELERATION_RESULTS_FOLDER_NAME),
88 |                                                                          n_jobs=-1)
89 |             # 2.2) save results of best model from all runs
90 |             ResultTracker.run(program_part="tpot",
91 |                               run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES,
92 |                               all_scores=all_t_scores,
93 |                               model=best_t_model,
94 |                               train_data_x=train_data_x,
95 |                               train_data_y=train_data_y,
96 |                               test_data_x=test_data_x,
97 |                               test_data_y=test_data_y,
98 |                               save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)),
99 |                                                     CONST_ACCELERATION_RESULTS_FOLDER_NAME))
100 |         # 2.3) log elapsed time
101 |         tpot_end_time = time.time()
Elapsed time: {} ---".format(timedelta(seconds=tpot_end_time - data_end_time))) 103 | 104 | # 3) continue to the symbolic regression 105 | Logger.print('Searching for a symbolic expression:') 106 | if analytical_bool: 107 | # 3.1) run symbolic regressor multiple times 108 | all_s_scores, best_s_model = GASF.run_and_analyze(run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES, 109 | non_normalized_data=df, 110 | performance_metric=function_mapper["better_symbolic_reg_fitness"], 111 | generations=CONST_ACCELERATION_ANALYTICAL_GENERATION_COUNT, 112 | population_size=CONST_ACCELERATION_ANALYTICAL_POP_SIZE, 113 | k_fold=K_FOLD, 114 | cores=-1, 115 | parsimony_coefficient=CONST_ACCELERATION_ANALYTICAL_PARSIMONY_COEFFICIENT, 116 | expected_eq='add(v0, mul(a, t))', 117 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 118 | CONST_ACCELERATION_RESULTS_FOLDER_NAME)) 119 | # 3.2) save results of best model from all runs 120 | ResultTracker.run(program_part="symbolic", 121 | run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES, 122 | all_scores=all_s_scores, 123 | model=best_s_model, 124 | train_data_x=train_data_x, 125 | train_data_y=train_data_y, 126 | test_data_x=test_data_x, 127 | test_data_y=test_data_y, 128 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 129 | CONST_ACCELERATION_RESULTS_FOLDER_NAME)) 130 | # 3.3) save a summary of the eqs found & figure whether to continue to ebf 131 | ebs_flag = ResultTracker.summaries_symbolic_results(run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES, 132 | percent_of_majority=SYMBOLIC_PERCENT_OF_MAJORITY, 133 | eq_ranking_metric=SYMBOLIC_EQ_RANKING_METRIC, 134 | top_eqs_max_num=SYMBOLIC_TOP_EQS_MAX_NUM, 135 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 136 | CONST_ACCELERATION_RESULTS_FOLDER_NAME)) 137 | # 3.4) log elapsed time 138 | symbolic_end_time = time.time() 139 | Logger.print("Finished. Elapsed time: {}".format(timedelta(seconds=symbolic_end_time - tpot_end_time))) 140 | 141 | # 4) continue to the EBS 142 | if ebs_flag or force_ebs_bool: 143 | Logger.print('Searching for a symbolic expression using EBF:') 144 | # 4.1) run EBS multiple times 145 | all_ebs_scores, best_ebs_model = EBS.run_and_analyze(run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES, 146 | non_normalized_data=df, 147 | performance_metric=function_mapper["better_symbolic_reg_fitness"], 148 | cores=-1, 149 | size_range=CONST_ACCELERATION_EBS_SIZE_RANGE, 150 | expected_eq='add(v0, mul(a, t))', 151 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 152 | CONST_ACCELERATION_RESULTS_FOLDER_NAME)) 153 | # 4.2) save the fitting score results 154 | ResultTracker.ebs_results(model=best_ebs_model, 155 | all_scores=all_ebs_scores, 156 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 157 | CONST_ACCELERATION_RESULTS_FOLDER_NAME)) 158 | else: 159 | Logger.print("EBF search of a symbolic equation wasn't needed") 160 | # 4.3) log elapsed time 161 | ebs_end_time = time.time() 162 | Logger.print("Finished. 
Elapsed time: {}".format(timedelta(seconds=ebs_end_time - symbolic_end_time)))
163 | 
164 |         # 5) alert results to the user
165 |         Logger.print("TOTAL ELAPSED TIME: {}".format(timedelta(seconds=time.time() - start_time)))
166 | 
--------------------------------------------------------------------------------
/scimed.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os, json
3 | import pandas as pd
4 | from sklearn.model_selection import train_test_split
5 | 
6 | # project imports
7 | from utills.consts import *
8 | from utills.fitness_methods import *
9 | from utills.logger_config import Logger
10 | from algo.equation_brute_search import EBS
11 | from utills.result_tracker import ResultTracker
12 | from algo.multi_tpot_analysis import MultiTPOTrunner
13 | from algo.genetic_algorithm_symbolic_fit import GASF
14 | from algo.genetic_algorithm_feature_selection import GAFS
15 | 
16 | 
17 | class scimed:
18 |     """
19 |     The main class of the project. It allows other developers to load
20 |     it and use the whole SciMED pipeline at once.
21 |     """
22 | 
23 |     def __init__(self):
24 |         pass
25 | 
26 |     @staticmethod
27 |     def run(train_data_x: pd.DataFrame,
28 |             train_data_y: pd.DataFrame,
29 |             test_data_x: pd.DataFrame,
30 |             test_data_y: pd.DataFrame,
31 |             results_folder: str,
32 |             analytical_reachment_portion: float = 0,
33 |             numerical_run_times: int = 20,
34 |             numerical_generations: int = 50,
35 |             numerical_population: int = 100,
36 |             analytical_run_times: int = 20,
37 |             analytical_generations: int = 50,
38 |             analytical_population: int = 100,
39 |             parsimony_coefficient: float = 0.05,
40 |             k_fold: int = 5,
41 |             ebs_size_range: tuple = (5, 9),
42 |             numerical_bool: bool = True,
43 |             analytical_bool: bool = True,
44 |             force_ebs_bool: bool = True,
45 |             feature_indexes_ranges = "Not applicable",
46 |             feature_selection_generations: int = None,
47 |             feature_selection_pop_size: int = None,
48 |             feature_selection_mutation_rate: float = None,
49 |             feature_selection_royalty: float = None):
50 |         """
51 |         Single entry point
52 |         """
53 | 
54 |         # 1) prepare IO
55 |         os.makedirs(results_folder, exist_ok=True)
56 | 
57 |         # init logger
58 |         Logger(save_path=os.path.join(results_folder, "logger.txt"))
59 | 
60 |         # 2) run the numerical part
61 |         if numerical_bool:
62 |             # 2.1) run multi-tpot analysis if feature selection isn't needed
63 |             if feature_indexes_ranges == "Not applicable":
64 |                 # 2.1.1) find the best ML model for all the data
65 |                 all_t_scores, best_t_model = MultiTPOTrunner.run_and_analyze(run_times=numerical_run_times,
66 |                                                                              train_data_x=train_data_x,
67 |                                                                              train_data_y=train_data_y,
68 |                                                                              test_data_x=test_data_x,
69 |                                                                              test_data_y=test_data_y,
70 |                                                                              generations=numerical_generations,
71 |                                                                              population_size=numerical_population,
72 |                                                                              k_fold=k_fold,
73 |                                                                              performance_metric=neg_mean_squared_error_scorer,
74 |                                                                              save_dir=results_folder,
75 |                                                                              n_jobs=-1)
76 |                 # 2.1.2) save results of best model from all runs
77 |                 ResultTracker.run(program_part="tpot",
78 |                                   run_times=numerical_run_times,
79 |                                   all_scores=all_t_scores,
80 |                                   model=best_t_model,
81 |                                   train_data_x=train_data_x,
82 |                                   train_data_y=train_data_y,
83 |                                   test_data_x=test_data_x,
84 |                                   test_data_y=test_data_y,
85 |                                   save_dir=results_folder)
86 |             # 2.2) run multi-tpot analysis with feature selection
87 |             else:
88 |                 # 2.2.1) find the best ML model for a subset of the data
89 |                 best_gene = GAFS.run(tpot_run_times=numerical_run_times,
90 |                                      feature_generations=feature_selection_generations,
91 | 
tpot_regressor_generations=numerical_generations, 92 | feature_population_size=feature_selection_pop_size, 93 | tpot_regressor_population_size=numerical_population, 94 | mutation_rate=feature_selection_mutation_rate, 95 | feature_indexes_ranges=feature_indexes_ranges, 96 | mutation_w=[val[1]-val[0] for val in feature_indexes_ranges], 97 | royalty=feature_selection_royalty, 98 | k_fold=k_fold, 99 | performance_metric=neg_mean_squared_error_scorer, 100 | train_data_x=train_data_x, 101 | train_data_y=train_data_y, 102 | test_data_x=test_data_x, 103 | test_data_y=test_data_y, 104 | save_dir=results_folder, 105 | cores=-1) 106 | # 2.2.2) save results of best model from all runs 107 | ResultTracker.run(program_part="tpot", 108 | run_times=numerical_run_times, 109 | all_scores=best_gene.scoring_history, 110 | model=best_gene.model_object, 111 | train_data_x=train_data_x.iloc[:, best_gene.feature_indexes], 112 | train_data_y=train_data_y, 113 | test_data_x=test_data_x.iloc[:, best_gene.feature_indexes], 114 | test_data_y=test_data_y, 115 | save_dir=results_folder) 116 | # 2.2.3) save selected features of best gene 117 | with open(os.path.join(os.path.dirname(__file__), results_folder, "best_features_selected.json"), 118 | "w") as features_file: 119 | json.dump({"index": best_gene.feature_indexes, 120 | "names": list(test_data_x.columns[best_gene.feature_indexes])}, 121 | features_file) 122 | # 2.2.4) reduce the dataset of non-normalized samples for next part 123 | train_data_x = train_data_x.iloc[:, best_gene.feature_indexes+[-1]] 124 | # 2.3 add more data to the original data with the model 125 | # TODO: add to the next release after fixing the sample method in production 126 | 127 | # 3) continue to the symbolic regression 128 | if analytical_bool: 129 | # 3.1) run symbolic regression multiple times 130 | all_s_scores, best_s_model = GASF.run_and_analyze(run_times=analytical_run_times, 131 | non_normalized_data=train_data_x, 132 | performance_metric=function_mapper["better_symbolic_reg_fitness"], 133 | generations=analytical_generations, 134 | population_size=analytical_population, 135 | k_fold=k_fold, 136 | cores=-1, 137 | parsimony_coefficient=parsimony_coefficient, 138 | save_dir=results_folder) 139 | # 3.2) save results of best model from all runs 140 | non_norm_train_x, non_norm_test_x, non_norm_train_y, non_norm_test_y = train_test_split(train_data_x, 141 | train_data_y, 142 | shuffle=True, 143 | test_size=SFF_TEST_SIZE_PORTION, 144 | random_state=RANDOM_STATE) 145 | p_value_flag = ResultTracker.run(program_part="symbolic", 146 | run_times=analytical_run_times, 147 | all_scores=all_s_scores, 148 | model=best_s_model, 149 | train_data_x=non_norm_train_x, 150 | train_data_y=non_norm_train_y, 151 | test_data_x=non_norm_test_x, 152 | test_data_y=non_norm_test_y, 153 | save_dir=results_folder) 154 | # 3.3) save a summary of the eqs found & figure whether to continue to ebf 155 | stability_flag = ResultTracker.summaries_symbolic_results(run_times=analytical_run_times, 156 | percent_of_majority=SYMBOLIC_PERCENT_OF_MAJORITY, 157 | eq_ranking_metric=SYMBOLIC_EQ_RANKING_METRIC, 158 | top_eqs_max_num=SYMBOLIC_TOP_EQS_MAX_NUM, 159 | save_dir=results_folder) 160 | 161 | ebs_flag = p_value_flag or stability_flag 162 | else: 163 | ebs_flag = False 164 | 165 | # 4) continue to the EBS 166 | if ebs_flag or force_ebs_bool: 167 | # 4.1) run EBS multiple times 168 | all_ebs_scores, best_ebs_model = EBS.run_and_analyze(run_times=analytical_run_times, 169 | non_normalized_data=train_data_x, 170 | 
performance_metric=function_mapper[
171 |                                                                      "better_symbolic_reg_fitness"],
172 |                                                                  cores=-1,
173 |                                                                  size_range=ebs_size_range,
174 |                                                                  save_dir=results_folder)
175 |             # 4.2) save the fitting score results
176 |             ResultTracker.ebs_results(model=best_ebs_model,
177 |                                       all_scores=all_ebs_scores,
178 |                                       save_dir=results_folder)
179 | 
--------------------------------------------------------------------------------
/experiments/exp_drag_force.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os, json
3 | import time
4 | import pandas as pd
5 | from datetime import timedelta
6 | from sklearn.metrics import mean_squared_error
7 | from sklearn.model_selection import train_test_split
8 | 
9 | # project imports
10 | from utills.consts import *
11 | from utills.fitness_methods import *
12 | from utills.logger_config import Logger
13 | from algo.equation_brute_search import EBS
14 | from utills.result_tracker import ResultTracker
15 | from algo.multi_tpot_analysis import MultiTPOTrunner
16 | from algo.genetic_algorithm_symbolic_fit import GASF
17 | from data_generators.drag_force_data_generator import DragForceDataGenerator
18 | from algo.genetic_algorithm_feature_selection import GAFS
19 | 
20 | class ExpDragFroce:
21 |     """
22 |     Program receives a dataset with all essential features needed
23 |     to deduce a "noisy" target (drag on sphere).
24 |     Success of both numerical and analytical parts of the program proves
25 |     that the program is able to learn a complex polynomial relation between
26 |     features, even with noisy data.
27 |     """
28 | 
29 |     def __init__(self):
30 |         pass
31 | 
32 |     @staticmethod
33 |     def run(numerical_bool: bool,
34 |             analytical_bool: bool,
35 |             force_ebs_bool: bool):
36 |         """
37 |         Entry point
38 |         """
39 |         # config logging
40 |         start_time = time.time()
41 | 
42 |         # prepare IO
43 |         os.makedirs(os.path.join(os.path.dirname(os.path.dirname(__file__)), DRAG_FORCE_RESULTS_FOLDER_NAME),
44 |                     exist_ok=True)
45 |         Logger(os.path.join(os.path.dirname(os.path.dirname(__file__)),
46 |                             DRAG_FORCE_RESULTS_FOLDER_NAME,
47 |                             "run.log"))
48 | 
49 |         # 1) generate data
50 |         data_path = os.path.join(os.path.dirname(__file__), "..", "data",
51 |                                  "drag_force_{}_samples.csv".format(DRAG_FORCE_NUM_SAMPLES))
52 |         feature_indexes_ranges = DragForceDataGenerator.generate(samples=DRAG_FORCE_NUM_SAMPLES,
53 |                                                                  cd_range=(1, 10),
54 |                                                                  rhoa_range=(30, 50),
55 |                                                                  v_range=(1, 10),
56 |                                                                  d_range=(0.01, 0.1),
57 |                                                                  noise_range=DRAG_FORCE_NOISE_RANGE,
58 |                                                                  save_path=data_path)
59 |         # feature_indexes_ranges = [[i, i] for i in range(len(df.keys()) - 1)]  # uncomment (after df is loaded below) if you don't want grouped feature selection
60 | 
61 |         # 1.1) load data, normalize and split
62 |         df = pd.read_csv(data_path)
63 |         Logger.print('Generated data:\n{}'.format(df.describe()))
64 |         y_col = df.keys()[-1]
65 |         normalized_df = (df - df.min()) / (df.max() - df.min())
66 |         train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(normalized_df.drop([y_col], axis=1),
67 |                                                                                  normalized_df[y_col],
68 |                                                                                  shuffle=True,
69 |                                                                                  test_size=DRAG_FORCE_TEST_SIZE_PORTION,
70 |                                                                                  random_state=RANDOM_STATE)
71 |         # 1.2) log elapsed time
72 |         data_end_time = time.time()
73 |         Logger.print(" --- Finished. Elapsed time: {} ---".format(timedelta(seconds=data_end_time - start_time)))
74 | 
75 |         # 2) continue to the MultiTPOTrunner regression
76 |         Logger.print('Training MultiTPOTrunner:')
77 |         if numerical_bool:
78 |             # 2.1.1) find the best sub-set of features and ML model
79 |             best_gene = GAFS.run(tpot_run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES,
80 |                                  feature_generations=DRAG_FORCE_FEATURE_GENERATIONS_COUNT,
81 |                                  tpot_regressor_generations=DRAG_FORCE_NUMERICAL_GENERATION_COUNT,
82 |                                  feature_population_size=DRAG_FORCE_FEATURE_POP_SIZE,
83 |                                  tpot_regressor_population_size=DRAG_FORCE_NUMERICAL_POP_SIZE,
84 |                                  mutation_rate=DRAG_FORCE_MUTATION_RATE,
85 |                                  feature_indexes_ranges=feature_indexes_ranges,
86 |                                  mutation_w=[val[1]-val[0] for val in feature_indexes_ranges],
87 |                                  royalty=DRAG_FORCE_ROYALTY,
88 |                                  k_fold=K_FOLD,
89 |                                  performance_metric=neg_mean_squared_error_scorer,
90 |                                  train_data_x=train_data_x,
91 |                                  train_data_y=train_data_y,
92 |                                  test_data_x=test_data_x,
93 |                                  test_data_y=test_data_y,
94 |                                  save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)),
95 |                                                        DRAG_FORCE_RESULTS_FOLDER_NAME),
96 |                                  cores=-1)
97 |             # 2.1.2) save results of best model from all runs
98 |             ResultTracker.run(program_part="tpot",
99 |                               run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES,
100 |                               all_scores=best_gene.scoring_history,
101 |                               model=best_gene.model_object,
102 |                               train_data_x=train_data_x.iloc[:, best_gene.feature_indexes],
103 |                               train_data_y=train_data_y,
104 |                               test_data_x=test_data_x.iloc[:, best_gene.feature_indexes],
105 |                               test_data_y=test_data_y,
106 |                               save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)),
107 |                                                     DRAG_FORCE_RESULTS_FOLDER_NAME))
108 |             # 2.1.3) save selected features of best gene
109 |             with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), DRAG_FORCE_RESULTS_FOLDER_NAME, "best_features_selected.json"),
110 |                       "w") as features_file:
111 |                 json.dump({"index": best_gene.feature_indexes,
112 |                            "names": list(test_data_x.columns[best_gene.feature_indexes])},
113 |                           features_file)
114 |             Logger.print("Best gene features: {}".format(list(test_data_x.columns[best_gene.feature_indexes])))
115 |             # 2.1.4) reduce the dataset of non-normalized samples for next part
116 |             df = df.iloc[:, best_gene.feature_indexes+[-1]]
117 |         # 2.2) log elapsed time
118 |         tpot_end_time = time.time()
119 |         Logger.print(" --- Finished. 
Elapsed time: {} ---".format(timedelta(seconds=tpot_end_time - data_end_time))) 120 | 121 | # 3) continue to the symbolic regression 122 | Logger.print('Searching for a symbolic expression:') 123 | if analytical_bool: 124 | # 3.1) run symbolic regressor multiple times 125 | all_s_scores, best_s_model = GASF.run_and_analyze(run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES, 126 | non_normalized_data=df, 127 | performance_metric=function_mapper["better_symbolic_reg_fitness"], 128 | generations=DRAG_FORCE_ANALYTICAL_GENERATION_COUNT, 129 | population_size=DRAG_FORCE_ANALYTICAL_POP_SIZE, 130 | k_fold=K_FOLD, 131 | cores=-1, 132 | parsimony_coefficient=DRAG_FORCE_ANALYTICAL_PARSIMONY_COEFFICIENT, 133 | expected_eq='mul(0.392, mul(cd, mul(rho, mul(v, mul(v, mul(d, d))))))', 134 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 135 | DRAG_FORCE_RESULTS_FOLDER_NAME)) 136 | # 3.2) save results of best model from all runs 137 | ResultTracker.run(program_part="symbolic", 138 | run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES, 139 | all_scores=all_s_scores, 140 | model=best_s_model, 141 | train_data_x=train_data_x, 142 | train_data_y=train_data_y, 143 | test_data_x=test_data_x, 144 | test_data_y=test_data_y, 145 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 146 | DRAG_FORCE_RESULTS_FOLDER_NAME)) 147 | # 3.3) save a summary of the eqs found & figure whether to continue to ebf 148 | ebs_flag = ResultTracker.summaries_symbolic_results(run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES, 149 | percent_of_majority=SYMBOLIC_PERCENT_OF_MAJORITY, 150 | eq_ranking_metric=SYMBOLIC_EQ_RANKING_METRIC, 151 | top_eqs_max_num=SYMBOLIC_TOP_EQS_MAX_NUM, 152 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 153 | DRAG_FORCE_RESULTS_FOLDER_NAME)) 154 | # 3.4) log elapsed time 155 | symbolic_end_time = time.time() 156 | Logger.print("Finished. Elapsed time: {}".format(timedelta(seconds=symbolic_end_time - tpot_end_time))) 157 | 158 | # 4) continue to the EBS 159 | if ebs_flag or force_ebs_bool: 160 | Logger.print('Searching for a symbolic expression using EBF:') 161 | # 4.1) run EBS multiple times 162 | all_ebs_scores, best_ebs_model = EBS.run_and_analyze(run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES, 163 | non_normalized_data=df, 164 | performance_metric=function_mapper["better_symbolic_reg_fitness"], 165 | cores=-1, 166 | size_range=DRAG_FORCE_EBS_SIZE_RANGE, 167 | expected_eq='mul(0.392, mul(cd, mul(rho, mul(v, mul(v, mul(d, d))))))', 168 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 169 | DRAG_FORCE_RESULTS_FOLDER_NAME)) 170 | # 4.2) save the fitting score results 171 | ResultTracker.ebs_results(model=best_ebs_model, 172 | all_scores=all_ebs_scores, 173 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 174 | DRAG_FORCE_RESULTS_FOLDER_NAME)) 175 | else: 176 | Logger.print("EBS search of a symbolic equation wasn't needed") 177 | # 4.3) log elapsed time 178 | ebs_end_time = time.time() 179 | Logger.print("Finished. 
Elapsed time: {}".format(timedelta(seconds=ebs_end_time - symbolic_end_time)))
180 | 
181 |         # 5) alert results to the user
182 |         Logger.print("TOTAL ELAPSED TIME: {}".format(timedelta(seconds=time.time() - start_time)))
183 | 
--------------------------------------------------------------------------------
/algo/equation_brute_search.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import json
4 | import pandas as pd
5 | from scipy import stats
6 | import concurrent.futures
7 | from sklearn.model_selection import KFold
8 | from sklearn.linear_model import LinearRegression
9 | from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
10 | 
11 | # project imports
12 | from algo.ebs.eq import Eq
13 | from utills.plotter import Plotter
14 | from algo.ebs.eq_node import EqNode
15 | from algo.ebs.eq_functions import *
16 | from utills.fitness_methods import *
17 | from utills.logger_config import Logger
18 | 
19 | 
20 | class EBS:
21 |     """
22 |     This class is responsible for generating a symbolic equation
23 |     of a target value from a given set of features, using a tree-structure brute-search.
24 | 
25 |     The class contains 2 functions:
26 |     1. run: k-fold trains a model and returns it fitted
27 |     2. run_and_analyze: applies the run function multiple times
28 |                         to gain statistical insight into the performance.
29 |     """
30 | 
31 |     # CONSTS #
32 |     DEFAULT_TEST_FIT_FUNCTION = better_symbolic_reg_fitness
33 |     # END - CONSTS #
34 | 
35 |     # CACHE FOR FASTER COMPUTATION #
36 |     TOPOLOGY_TREES = {}
37 |     ALLOCATED_EQS = {}
38 |     # END - CACHE FOR FASTER COMPUTATION #
39 | 
40 |     def __init__(self):
41 |         pass
42 | 
43 |     @staticmethod
44 |     def run(non_normalized_data: pd.DataFrame,
45 |             k_fold: int,
46 |             performance_metric,
47 |             verbose: int,
48 |             size_range: tuple,
49 |             expected_eq='Unknown',
50 |             cores: int = -1):
51 |         """
52 |         Run the EBS algorithm with some hyper-parameters.
53 |         Initially the model is trained on k-fold splits of the data,
54 |         and then on the dataset as a whole.
55 |         The model of the latter case is returned.
56 |         """
57 |         y_col = non_normalized_data.keys()[-1]
58 |         x_values = non_normalized_data.drop([y_col], axis=1)
59 |         y_values = non_normalized_data[y_col]
60 |         # make a k-fold cross validation so we can trust the results better
61 |         kf = KFold(n_splits=k_fold)
62 |         scores = []
63 |         fold_index = 1
64 |         for train_index, test_index in kf.split(x_values):
65 |             # report the current fold
66 |             Logger.print(message="Equation brute force fold {}".format(fold_index))
67 |             fold_index += 1
68 |             # prepare data
69 |             X_train, X_test = x_values.iloc[train_index, :], x_values.iloc[test_index, :]
70 |             y_train, y_test = y_values.iloc[train_index], y_values.iloc[test_index]
71 |             # prepare model
72 |             eq, best_score, answer = EBS._search_equation(x=X_train,
73 |                                                           y=y_train,
74 |                                                           performance_metric=performance_metric,
75 |                                                           verbose=verbose,
76 |                                                           cores=cores,
77 |                                                           size_range=size_range)
78 |             y_pred = eq.eval(X_test)
79 |             score = performance_metric(y_test, y_pred) if not isinstance(performance_metric, str) else function_mapper[
80 |                 performance_metric](y_test, y_pred)
81 |             scores.append(score)
82 | 
83 |         # train a symbolic regression on all the data, it is at least as good as the previous ones
84 |         eq, best_score, answer = EBS._search_equation(x=x_values,
85 |                                                       y=y_values,
86 |                                                       performance_metric=performance_metric,
87 |                                                       verbose=verbose,
88 |                                                       cores=cores,
89 |                                                       size_range=size_range)
90 |         # if we want to compare to the expected equation
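        # (the expected equation, when supplied, is only used for the side-by-side
        #  log line below; it does not steer the brute search itself)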
91 |         if expected_eq != 'Unknown':
92 |             Logger.print(message='Expected eq: {}, Found eq: {}'.format(expected_eq,
93 |                                                                         eq.to_string()))
94 |         else:
95 |             Logger.print(message='Found eq: {}'.format(eq.to_string()))
96 |         return eq
97 | 
98 |     @staticmethod
99 |     def run_and_analyze(run_times: int,
100 |                         non_normalized_data: pd.DataFrame,
101 |                         performance_metric,
102 |                         save_dir: str,
103 |                         size_range: tuple,
104 |                         expected_eq='Unknown',
105 |                         cores: int = -1):
106 |         """
107 |         Run the EBS algorithm several times and save results from all runs.
108 |         Returns a pandas dataframe of all results and the best model from
109 |         all runs.
110 | 
111 |         :param size_range: start, end, and step size of the EQ tree's number of nodes
112 |         """
113 |         results = pd.DataFrame()
114 |         y_col = non_normalized_data.keys()[-1]
115 |         x_values = non_normalized_data.drop(y_col, axis=1)
116 |         y_values = non_normalized_data[y_col]
117 |         current_best_wanted_loss = 99999
118 |         best_model = None
119 |         for test in range(run_times):
120 |             Logger.print(message="run {}".format(test + 1))
121 |             eq, best_score, answer = EBS._search_equation(x=x_values,
122 |                                                           y=y_values,
123 |                                                           verbose=1 if test == 0 else 0,
124 |                                                           performance_metric=performance_metric,
125 |                                                           size_range=size_range,
126 |                                                           cores=cores)
127 |             pred = eq.eval(x_values)
128 |             # save test scores
129 |             try:
130 |                 wanted_loss = performance_metric(y_values, pred)
131 |             except Exception as error:
132 |                 wanted_loss = EBS.DEFAULT_TEST_FIT_FUNCTION(y_values, pred)
133 |             results.at[test, "wanted_loss"] = wanted_loss
134 |             results.at[test, "mae"] = mean_absolute_error(y_values, pred)
135 |             results.at[test, "mse"] = mean_squared_error(y_values, pred)
136 |             results.at[test, "r2"] = r2_score(y_values, pred)
137 |             results.at[test, "t_test_p_value"] = stats.ttest_ind(y_values, pred)[1]
138 |             results.at[test, "found_eq"] = eq.to_string()
139 |             if wanted_loss < current_best_wanted_loss or best_model is None:
140 |                 best_model = eq
141 |                 current_best_wanted_loss = wanted_loss
142 | 
143 |         # print and save scoring results of all runs
144 |         Logger.print(message="Finished all EBS runs - ")
145 |         if expected_eq != 'Unknown':
146 |             Logger.print(message='Expected eq: {}, Found eq: {}'.format(expected_eq,
147 |                                                                         best_model.to_string()))
148 |         else:
149 |             Logger.print(message='Found eq: {}'.format(best_model.to_string()))
150 |         [Logger.print(message="{}: {:.3} +- {:.3}".format(score, results[score].mean(), results[score].std()))
151 |          for score in ["mae", "mse", "r2", "t_test_p_value"]]
152 |         results.to_csv(os.path.join(save_dir, "ebs_scoring_history.csv"))
153 |         # plot best model's predictions vs true values
154 |         Plotter.y_test_vs_y_pred(model=best_model,
155 |                                  x_test=x_values,
156 |                                  y_test=y_values,
157 |                                  save_path=os.path.join(save_dir, "ebs_target_vs_pred.pdf"))
158 |         return results, best_model
159 | 
160 |     @staticmethod
161 |     def _search_equation(x: pd.DataFrame,
162 |                          y: pd.Series,
163 |                          verbose: int,
164 |                          performance_metric,
165 |                          size_range: tuple,
166 |                          cores: int) -> tuple:
167 |         """
168 |         Search for the equation
169 |         # TODO: think how to use multi-thread later
170 |         """
171 |         # run over the needed range to generate all possible tree topologies
172 |         for n in size_range:
173 |             if verbose == 1:
174 |                 Logger.print(message="EBS._search_equation: Generating all possible binary tree topologies for size {}".format(n))
175 | 
176 |             if n < 1:  # full binary trees are only defined for positive (odd) sizes
177 |                 continue
178 |             elif (n % 2) == 0:
179 |                 EBS.TOPOLOGY_TREES[n] = []
180 |             elif n == 1:
181 |                 EBS.TOPOLOGY_TREES[1] = [Eq(tree=EqNode(value=None))]
182 |             elif n not in EBS.TOPOLOGY_TREES:  # do not calc the same topology twice
183 |                 EBS.TOPOLOGY_TREES[n] = EBS._all_possible_fbt(n=n)
184 |         # find best equation for the data
185 |         answer = {}
186 |         best_eq = ""
187 |         best_score = 9999
188 |         # TODO: replace the organized search with a random Monte-Carlo sample later in production once the mapping order is fixed
189 |         # run over the tree topologies sizes
190 |         for n in size_range:
191 |             if verbose == 1:
192 |                 Logger.print(message="EBS._search_equation: Testing {} possible binary tree topologies for size {}".format(len(EBS.TOPOLOGY_TREES[n]), n))
193 | 
194 |             # run over all tree populations
195 |             for tree_topology in EBS.TOPOLOGY_TREES[n]:
196 |                 tree_topology.fix_nodes()  # just to make sure the meta-values are fine
197 |                 # avoid computing the same allocations twice: reuse the cached list if we have it, compute it if we don't
198 |                 if tree_topology.to_id_str() not in EBS.ALLOCATED_EQS:
199 |                     # populate each tree with all possible combinations
200 |                     possible_trees = EBS._populate_tree(eq=tree_topology,
201 |                                                         not_leaf_values=FUNCTION_LIST,
202 |                                                         leaf_values=list(x))
203 |                     EBS.ALLOCATED_EQS[tree_topology.to_id_str()] = possible_trees  # recall the allocation list
204 |                 else:
205 |                     possible_trees = EBS.ALLOCATED_EQS[tree_topology.to_id_str()]
206 | 
207 |                 if verbose == 1:
208 |                     Logger.print(message="EBS._search_equation: Found {} possible populated trees to check for this topology".format(len(possible_trees)))
209 |                 # for each combination compute performance
210 |                 for eq_index, this_eq in enumerate(possible_trees):
211 |                     # TODO: change this magic number later
212 |                     if (eq_index % 100) == 0 and verbose == 1:
213 |                         Logger.print(message="EBS._search_equation: Test of {} / {} ({:.3f}%) equations done".format(eq_index, len(possible_trees), eq_index*100/len(possible_trees)))
214 |                     try:
215 |                         y_pred = this_eq.eval(x_values=x)
216 |                         reg = LinearRegression().fit([[val] for val in y_pred], y)
217 |                         this_eq.linear_a = reg.coef_[0]
218 |                         this_eq.linear_b = reg.intercept_
219 |                         y_pred = this_eq.linear_a * y_pred + this_eq.linear_b  # re-calibrate results
220 |                         score = performance_metric(y, y_pred)  # calc the performance
221 |                         answer[this_eq.to_string()] = score
222 |                         if score < best_score:
223 |                             best_score = score
224 |                             best_eq = this_eq
225 |                     except Exception as error:
226 |                         # skip equations that fail to evaluate on the given data
227 |                         Logger.debug(message="Error at EBS._search_equation, saying: {}".format(error))
228 |         return best_eq, best_score, answer
229 | 
230 |     @staticmethod
231 |     def _all_possible_fbt(n: int) -> list:
232 |         return Eq.all_possible_fbt(n=n)
233 | 
234 |     @staticmethod
235 |     def _populate_tree(eq: Eq,
236 |                        not_leaf_values: list,
237 |                        leaf_values: list) -> list:
238 |         """
239 |         Gets a tree topology and returns all possible value allocations to the tree
240 |         :param eq: the tree topology
241 |         :param not_leaf_values: the functions that can be allocated to the non-leaf nodes
242 |         :param leaf_values: the leaves' values
243 |         :return: all possible allocations of the values to the given topology
244 |         """
245 |         return eq.populate(not_leaf_values=not_leaf_values,
246 |                            leaf_values=leaf_values)
247 | 
248 |     def __repr__(self):
249 |         return self.__str__()
250 | 
251 |     def __str__(self):
252 |         return ""
253 | 
--------------------------------------------------------------------------------
/data_generators/steady_free_fall_with_drag_data_generator.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import numpy as np
4 | import pandas as pd
5 | from tqdm import tqdm
6 | 
7 | # project imports
8 | from utills.consts import *
9 | from utills.logger_config import Logger
10 | from data_generators.N_frequency_generator import NFrequencyGenerator
11 | 
12 | 
13 | class SFFWDdataGenerator:
14 |     """
15 |     This class is responsible for the generation of the physical
16 |     steady (no acceleration) free-fall-with-drag data to test the model.
17 |     The motion is dictated by:
18 |     0 = Weight - Buoyancy - 0.5*Cd*rhoa*V*V*area
19 |     -> Cd = (rhop - rhoa) * volume * g * 2/(rhoa*V*V*area)
20 | 
21 |     Variables in this file:
22 |     g    : gravitational acceleration [m/s2]
23 |     rhop : particle density [kg/m3]
24 |     d    : particle diameter [m]
25 |     rhoa : fluid density [kg/m3]
26 |     V    : settling velocity [m/s]
27 |     nu   : kinematic viscosity of the fluid [m2/s]
28 |     Re   : Reynolds number [-]
29 | 
30 |     This class is responsible for producing a pandas DataFrame for three different model experiments.
31 |     """
32 | 
33 |     def __init__(self):
34 |         pass
35 | 
36 |     @staticmethod
37 |     def generate_noiseless(samples: int,
38 |                            rhoa_range: tuple,
39 |                            rhop_range: tuple,
40 |                            nu_range: tuple,
41 |                            re_range: tuple,
42 |                            show_progress_bar: bool = True):
43 |         """
44 |         Generates a pandas dataframe of experiments to represent steady free fall measurements.
45 |         """
46 |         rhoa_range_delta = rhoa_range[1] - rhoa_range[0]
47 |         rhop_range_delta = rhop_range[1] - rhop_range[0]
48 |         nu_range_delta = nu_range[1] - nu_range[0]
49 |         re_range_delta = re_range[1] - re_range[0]
50 |         data = []
51 |         # generate samples
52 |         pbar = tqdm(total=samples, desc="Generating baseline data") if show_progress_bar else None
53 |         for sample_index in range(samples):
54 |             # sample data from ranges
55 |             rhoa = np.random.random_sample() * rhoa_range_delta + rhoa_range[0]
56 |             nu = np.random.random_sample() * nu_range_delta + nu_range[0]
57 |             re = np.random.random_sample() * re_range_delta + re_range[0]
58 |             # calc rhop from rhoa with range
59 |             rhop = rhoa + np.random.random_sample() * rhop_range_delta + rhop_range[0]
60 |             # calc Cd from Re according to a known drag-to-Reynolds relation
61 |             cd = 0.4 + 24.0 / re + 6.0 / (1 + re ** 0.5)
62 |             # calc 'd' from the other parameters; 13.08 = 4*g/3 (g = 9.81 [m/s2]), from Cd = 4*g*d*(rhop - rhoa)/(3*rhoa*V*V) with V = Re*nu/d
63 |             if rhop == rhoa:  # just to make sure we will not divide by zero
64 |                 raise ZeroDivisionError("Rhop cannot be equal to rhoa")
65 |             d = np.power((cd * re * re * nu * nu * rhoa) / (13.08 * (rhop - rhoa)), 1 / 3)
66 |             # recalculate 'v'
67 |             v = re * nu / d
68 |             # add the data and alert the user by a progress bar, if needed
69 |             data.append([rhoa, v, d, rhop, nu, cd])
70 |             if show_progress_bar:
71 |                 pbar.update(1)
72 | 
73 |         if show_progress_bar:
74 |             pbar.close()
75 |         # make a Pandas.DataFrame and save it as a CSV file
76 |         df = pd.DataFrame(data=data, columns=["rhoa", "V", "d", "rhop", "nu", "Cd"])
77 |         df.to_csv(os.path.join(DATA_FOLDER,
78 |                                "SFF_noiseless_baseline_{}_samples.csv".format(SFF_N_SAMPLES_STR)),
79 |                   index=False)
80 |         return df
81 | 
82 |     @staticmethod
83 |     def generate_case_1(samples: int,
84 |                         rhoa_range: tuple,
85 |                         rhop_range: tuple,
86 |                         nu_range: tuple,
87 |                         re_range: tuple,
88 |                         save_path: str,
89 |                         dropped_param: str = SFF_1_DROP_PARAM,
90 |                         force: bool = FORCE_DATA_OVERRIDE_FLAG):
91 |         """
92 |         Generate a pandas dataframe with only 3 out of 4 needed features to calc Cd.
93 |         Saves the dataframe for model experiment.
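        The omitted feature is chosen by the dropped_param argument
        (SFF_1_DROP_PARAM by default); the viscosity nu is dropped as well.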
94 |         """
95 |         baseline_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
96 |                                      DATA_FOLDER,
97 |                                      "SFF_noiseless_baseline_{}_samples.csv".format(SFF_N_SAMPLES_STR))
98 |         if os.path.exists(baseline_path) and not force:
99 |             df = pd.read_csv(baseline_path)
100 |         else:
101 |             df = SFFWDdataGenerator.generate_noiseless(samples=samples,
102 |                                                        rhoa_range=rhoa_range,
103 |                                                        nu_range=nu_range,
104 |                                                        re_range=re_range,
105 |                                                        rhop_range=rhop_range)
106 |         # alert user
107 |         re = df["d"] * df["V"] / df["nu"]
108 |         Logger.print('Generated data with Re: min={:.4} max={:.4}'.format(re.min(), re.max()))
109 |         # build case
110 |         df.drop([dropped_param, 'nu'], axis=1).to_csv(save_path, index=False)
111 |         # alert user that feature selection isn't needed
112 |         feature_indexes_ranges = "Not applicable"
113 |         return feature_indexes_ranges
114 | 
115 |     @staticmethod
116 |     def generate_case_2(samples: int,
117 |                         rhoa_range: tuple,
118 |                         rhop_range: tuple,
119 |                         nu_range: tuple,
120 |                         re_range: tuple,
121 |                         save_path: str,
122 |                         noise_range: tuple = SFF_CASE_2_NOISE_RANGE,
123 |                         force: bool = FORCE_DATA_OVERRIDE_FLAG):
124 |         """
125 |         Generate a pandas dataframe with rhoa,V,d,rhop,Cd measurements.
126 |         Adds noise to Cd, and saves the dataframe for model experiment.
127 |         """
128 |         baseline_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
129 |                                      DATA_FOLDER,
130 |                                      "SFF_noiseless_baseline_{}_samples.csv".format(SFF_N_SAMPLES_STR))
131 |         if os.path.exists(baseline_path) and not force:
132 |             df = pd.read_csv(baseline_path)
133 |         else:
134 |             df = SFFWDdataGenerator.generate_noiseless(samples=samples,
135 |                                                        rhoa_range=rhoa_range,
136 |                                                        nu_range=nu_range,
137 |                                                        re_range=re_range,
138 |                                                        rhop_range=rhop_range)
139 |         # alert user
140 |         re = df["d"] * df["V"] / df["nu"]
141 |         Logger.print('Generated data with Re: min={:.4} max={:.4}'.format(re.min(), re.max()))
142 |         # add noise to target (one multiplicative noise factor, with a random sign, is drawn for the whole dataset)
143 |         noise_range_delta = noise_range[1] - noise_range[0]
144 |         noise = (np.random.random_sample() * noise_range_delta + noise_range[0]) * np.random.choice((-1, 1))
145 |         df.Cd = [val * (1 + noise) for val in df["Cd"]]
146 |         # build case
147 |         df.drop("nu", axis=1).to_csv(save_path, index=False)
148 |         # alert user that feature selection isn't needed
149 |         feature_indexes_ranges = "Not applicable"
150 |         return feature_indexes_ranges
151 | 
152 |     @staticmethod
153 |     def generate_case_2_with_guess(samples: int,
154 |                                    rhoa_range: tuple,
155 |                                    rhop_range: tuple,
156 |                                    nu_range: tuple,
157 |                                    re_range: tuple,
158 |                                    save_path: str,
159 |                                    noise_range: tuple = SFF_CASE_2_NOISE_RANGE,
160 |                                    force: bool = FORCE_DATA_OVERRIDE_FLAG):
161 |         """
162 |         Generate a pandas dataframe similar to that of case 2.
163 |         Adds two educated guesses (rhop-rhoa, and V^2),
164 |         and saves the dataframe for model experiment.
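        Both guesses are terms of the analytic relation
        Cd = 4*g*d*(rhop - rhoa) / (3*rhoa*V^2), so the symbolic
        search should be able to recover a compact expression in them.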
165 | """ 166 | baseline_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 167 | DATA_FOLDER, 168 | "SFF_noiseless_baseline_{}_samples.csv".format(SFF_N_SAMPLES_STR)) 169 | if os.path.exists(baseline_path) and not force: 170 | df = pd.read_csv(baseline_path) 171 | else: 172 | df = SFFWDdataGenerator.generate_noiseless(samples=samples, 173 | rhoa_range=rhoa_range, 174 | nu_range=nu_range, 175 | re_range=re_range, 176 | rhop_range=rhop_range) 177 | # alert user 178 | re = df["d"] * df["V"] / df["nu"] 179 | Logger.print('Generated data with Re: min={:.4} max={:.4}'.format(re.min(), re.max())) 180 | # add noise to target 181 | noise_range_delta = noise_range[1] - noise_range[0] 182 | noise = (np.random.random_sample() * noise_range_delta + noise_range[0]) * np.random.choice((-1, 1)) 183 | df.Cd = [val * (1 + noise) for val in df["Cd"]] 184 | # build case 185 | delta_rho = df["rhop"] - df["rhoa"] 186 | df.insert(0, "delta_rho", delta_rho) 187 | vel_squared = df["V"] * df["V"] 188 | df.insert(0, "V^2", vel_squared) 189 | df.drop("nu", axis=1).to_csv(save_path, index=False) 190 | # alert user that feature selection isn't needed 191 | feature_indexes_ranges = "Not applicable" 192 | return feature_indexes_ranges 193 | 194 | @staticmethod 195 | def generate_case_3(samples: int, 196 | rhoa_range: tuple, 197 | rhop_range: tuple, 198 | nu_range: tuple, 199 | re_range: tuple, 200 | save_path: str, 201 | force: bool = FORCE_DATA_OVERRIDE_FLAG): 202 | """ 203 | Generate a pandas dataframe with rhoa,V,d,rhop,nu,Cd measurements. 204 | Uses the dataframe to create a dataframe of dimensionless features, 205 | and saves it for model experiment. 206 | """ 207 | baseline_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 208 | DATA_FOLDER, 209 | "SFF_noiseless_baseline_{}_samples.csv".format(SFF_N_SAMPLES_STR)) 210 | if os.path.exists(baseline_path) and not force: 211 | df = pd.read_csv(baseline_path) 212 | else: 213 | df = SFFWDdataGenerator.generate_noiseless(samples=samples, 214 | rhoa_range=rhoa_range, 215 | nu_range=nu_range, 216 | re_range=re_range, 217 | rhop_range=rhop_range) 218 | # alert user 219 | re = df["d"] * df["V"] / df["nu"] 220 | Logger.print('Generated data with Re: min={:.4} max={:.4}'.format(re.min(), re.max())) 221 | # add all possible N frequency combinations 222 | df, n_suffix = NFrequencyGenerator.add_all_combos(df=df, 223 | g=g_force) 224 | # create non-dimensional features 225 | features = pd.DataFrame() 226 | # density ratio: 227 | features["rhop/rhoa"] = df["rhop"] / df["rhoa"] 228 | # density delta ratio: 229 | features["delta_rho/rhoa"] = (df["rhop"] - df["rhoa"]) / df["rhoa"] 230 | # Reynolds number: 231 | features["Re"] = re 232 | # Unknown number - nu*g/V**3: 233 | features["nu*g/V**3"] = g_force * df["nu"] / df["V"] ** 3 234 | # Unknown number - g*d/V**2: 235 | features["g*d/V**2"] = g_force * df["d"] / df["V"] ** 2 236 | # add 4 more groups of features 237 | for suff in n_suffix: 238 | # Froude number: 239 | features["Fr{}".format(suff)] = df["V"] / (df["d"] * df["N{}".format(suff)]) 240 | # Froude number from acceleration - g/(V*N_i): 241 | features["AccFr{}".format(suff)] = g_force / (df["V"] * df["N{}".format(suff)]) 242 | # Unknown number (Num1) - g*d/(nu*N_i) 243 | features["1Num{}".format(suff)] = g_force * df["d"] / (df["nu"] * df["N{}".format(suff)]) 244 | # Unknown number (Num2) - V*V/(nu*N_i) 245 | features["2Num{}".format(suff)] = df["V"] * df["V"] / (df["nu"] * df["N{}".format(suff)]) 246 | # reorder column names to be in groups 247 | 
features = features[sorted(features.keys(), key=lambda x: x[0])]
248 |         # set index ranges for all 9 feature groups
249 |         feature_indexes_ranges = []
250 |         index = 0
251 |         for i in range(9):
252 |             if i < 4:
253 |                 feature_indexes_ranges.append([index, index + len(n_suffix) - 1])
254 |                 index += len(n_suffix)
255 |             else:
256 |                 feature_indexes_ranges.append([index, index])
257 |                 index += 1
258 |         # add target
259 |         features["Cd"] = df["Cd"]
260 |         # save the result to a csv file
261 |         features.to_csv(save_path, index=False)
262 | 
263 |         return feature_indexes_ranges
264 | 
--------------------------------------------------------------------------------
/experiments/exp_steady_free_fall_with_drag.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import json
4 | import time
5 | import pandas as pd
6 | from datetime import timedelta
7 | from sklearn.model_selection import train_test_split
8 | 
9 | # project imports
10 | from utills.consts import *
11 | from utills.fitness_methods import *
12 | from utills.logger_config import Logger
13 | from algo.equation_brute_search import EBS
14 | from utills.result_tracker import ResultTracker
15 | from algo.multi_tpot_analysis import MultiTPOTrunner
16 | from algo.genetic_algorithm_symbolic_fit import GASF
17 | from algo.genetic_algorithm_feature_selection import GAFS
18 | 
19 | 
20 | class ExpSFF:
21 |     """
22 |     A parent class of the SFF experiments, responsible for
23 |     the run function for all SFF cases.
24 |     Here, only the data generation function changes from
25 |     case to case.
26 |     """
27 | 
28 |     def __init__(self):
29 |         pass
30 | 
31 |     @staticmethod
32 |     def run(numerical_bool: bool,
33 |             analytical_bool: bool,
34 |             force_ebs_bool: bool,
35 |             results_folder: str,
36 |             data_path: str,
37 |             data_generation_function,
38 |             numerical_run_times: int,
39 |             numerical_generations: int,
40 |             numerical_population: int,
41 |             analytical_run_times: int,
42 |             analytical_generations: int,
43 |             analytical_population: int,
44 |             parsimony_coefficient: float,
45 |             k_fold: int,
46 |             samples: int,
47 |             rhoa_range: tuple,
48 |             rhop_range: tuple,
49 |             nu_range: tuple,
50 |             re_range: tuple,
51 |             ebs_size_range: tuple,
52 |             expected_eq: str = "Unknown",  # capital-U "Unknown" is the sentinel EBS checks against
53 |             feature_selection_generations: int = None,
54 |             feature_selection_pop_size: int = None,
55 |             feature_selection_mutation_rate: float = None,
56 |             feature_selection_royalty: float = None):
57 | 
58 |         # config logging
59 |         start_time = time.time()
60 | 
61 |         # prepare IO
62 |         os.makedirs(results_folder, exist_ok=True)
63 |         Logger(os.path.join(os.path.dirname(os.path.dirname(__file__)),
64 |                             results_folder,
65 |                             "run.log"))
66 |         # 1) generate data
67 |         feature_indexes_ranges = data_generation_function(samples=samples,
68 |                                                           rhoa_range=rhoa_range,
69 |                                                           nu_range=nu_range,
70 |                                                           re_range=re_range,
71 |                                                           rhop_range=rhop_range,
72 |                                                           save_path=data_path)
73 |         # 1.1) load data, normalize and split
74 |         df = pd.read_csv(data_path)
75 |         Logger.print('Generated data:\n{}'.format(df.describe()))
76 |         y_col = df.keys()[-1]
77 |         normalized_df = (df - df.min()) / (df.max() - df.min())
78 |         train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(normalized_df.drop([y_col], axis=1),
79 |                                                                                  normalized_df[y_col],
80 |                                                                                  shuffle=True,
81 |                                                                                  test_size=SFF_TEST_SIZE_PORTION,
82 |                                                                                  random_state=RANDOM_STATE)
83 |         # 1.2) log elapsed time
84 |         data_end_time = time.time()
85 |         Logger.print(" --- Finished. 
Elapsed time: {} ---".format(timedelta(seconds=data_end_time - start_time))) 86 | 87 | # 2) continue to the MultiTPOTrunner regression 88 | Logger.print('Training MultiTPOTrunner:') 89 | if numerical_bool: 90 | # 2.1) run multi-tpot analysis if feature selection isn't needed 91 | if feature_indexes_ranges == "Not applicable": 92 | # 2.1.1) find the best ML model for all the data 93 | all_t_scores, best_t_model = MultiTPOTrunner.run_and_analyze(run_times=numerical_run_times, 94 | train_data_x=train_data_x, 95 | train_data_y=train_data_y, 96 | test_data_x=test_data_x, 97 | test_data_y=test_data_y, 98 | generations=numerical_generations, 99 | population_size=numerical_population, 100 | k_fold=k_fold, 101 | performance_metric=neg_mean_squared_error_scorer, 102 | save_dir=results_folder, 103 | n_jobs=-1) 104 | # 2.1.2) save results of best model from all runs 105 | ResultTracker.run(program_part="tpot", 106 | run_times=numerical_run_times, 107 | all_scores=all_t_scores, 108 | model=best_t_model, 109 | train_data_x=train_data_x, 110 | train_data_y=train_data_y, 111 | test_data_x=test_data_x, 112 | test_data_y=test_data_y, 113 | save_dir=results_folder) 114 | # 2.2) run multi-tpot analysis with feature selection 115 | else: 116 | # 2.2.1) find the best ML model for a subset of the data 117 | best_gene = GAFS.run(tpot_run_times=numerical_run_times, 118 | feature_generations=feature_selection_generations, 119 | tpot_regressor_generations=numerical_generations, 120 | feature_population_size=feature_selection_pop_size, 121 | tpot_regressor_population_size=numerical_population, 122 | mutation_rate=feature_selection_mutation_rate, 123 | feature_indexes_ranges=feature_indexes_ranges, 124 | mutation_w=[val[1]-val[0] for val in feature_indexes_ranges], 125 | royalty=feature_selection_royalty, 126 | k_fold=k_fold, 127 | performance_metric=neg_mean_squared_error_scorer, 128 | train_data_x=train_data_x, 129 | train_data_y=train_data_y, 130 | test_data_x=test_data_x, 131 | test_data_y=test_data_y, 132 | save_dir=results_folder, 133 | cores=-1) 134 | # 2.2.2) save results of best model from all runs 135 | ResultTracker.run(program_part="tpot", 136 | run_times=numerical_run_times, 137 | all_scores=best_gene.scoring_history, 138 | model=best_gene.model_object, 139 | train_data_x=train_data_x.iloc[:, best_gene.feature_indexes], 140 | train_data_y=train_data_y, 141 | test_data_x=test_data_x.iloc[:, best_gene.feature_indexes], 142 | test_data_y=test_data_y, 143 | save_dir=results_folder) 144 | # 2.2.3) save selected features of best gene 145 | with open(os.path.join(os.path.dirname(__file__), results_folder, "best_features_selected.json"), 146 | "w") as features_file: 147 | json.dump({"index": best_gene.feature_indexes, 148 | "names": list(test_data_x.columns[best_gene.feature_indexes])}, 149 | features_file) 150 | Logger.print("Best gene features: {}".format(list(test_data_x.columns[best_gene.feature_indexes]))) 151 | # 2.2.4) reduce the dataset of non-normalized samples for next part 152 | df = df.iloc[:, best_gene.feature_indexes+[-1]] 153 | # 2.3) log elapsed time 154 | tpot_end_time = time.time() 155 | symbolic_end_time = time.time() 156 | Logger.print(" --- Finished. 
Elapsed time: {} ---".format(timedelta(seconds=tpot_end_time-data_end_time)))
157 | 
158 |         # 3) continue to the symbolic regression
159 |         if analytical_bool:
160 |             Logger.print('Searching for a symbolic expression:')
161 |             # 3.1) run symbolic regressor multiple times
162 |             all_s_scores, best_s_model = GASF.run_and_analyze(run_times=analytical_run_times,
163 |                                                               non_normalized_data=df,
164 |                                                               performance_metric=function_mapper["better_symbolic_reg_fitness"],
165 |                                                               generations=analytical_generations,
166 |                                                               population_size=analytical_population,
167 |                                                               k_fold=k_fold,
168 |                                                               cores=-1,
169 |                                                               parsimony_coefficient=parsimony_coefficient,
170 |                                                               expected_eq=expected_eq,
171 |                                                               save_dir=results_folder)
172 |             # 3.2) save results of best model from all runs
173 |             non_norm_train_x, non_norm_test_x, non_norm_train_y, non_norm_test_y = train_test_split(df.drop([y_col], axis=1),
174 |                                                                                                     df[y_col],
175 |                                                                                                     shuffle=True,
176 |                                                                                                     test_size=SFF_TEST_SIZE_PORTION,
177 |                                                                                                     random_state=RANDOM_STATE)
178 |             p_value_flag = ResultTracker.run(program_part="symbolic",
179 |                                              run_times=analytical_run_times,
180 |                                              all_scores=all_s_scores,
181 |                                              model=best_s_model,
182 |                                              train_data_x=non_norm_train_x,
183 |                                              train_data_y=non_norm_train_y,
184 |                                              test_data_x=non_norm_test_x,
185 |                                              test_data_y=non_norm_test_y,
186 |                                              save_dir=results_folder)
187 |             # 3.3) save a summary of the eqs found & decide whether to continue to the EBS
188 |             stability_flag = ResultTracker.summaries_symbolic_results(run_times=analytical_run_times,
189 |                                                                       percent_of_majority=SYMBOLIC_PERCENT_OF_MAJORITY,
190 |                                                                       eq_ranking_metric=SYMBOLIC_EQ_RANKING_METRIC,
191 |                                                                       top_eqs_max_num=SYMBOLIC_TOP_EQS_MAX_NUM,
192 |                                                                       save_dir=results_folder)
193 | 
194 |             ebs_flag = p_value_flag or stability_flag
195 |             # 3.4) log elapsed time
196 |             symbolic_end_time = time.time()
197 |             Logger.print("Finished. Elapsed time: {}".format(timedelta(seconds=symbolic_end_time - tpot_end_time)))
198 |         else:
199 |             ebs_flag = False
200 | 
201 |         # 4) continue to the EBS
202 |         if ebs_flag or force_ebs_bool:
203 |             Logger.print('Searching for a symbolic expression using EBF:')
204 |             # 4.1) run EBS multiple times
205 |             all_ebs_scores, best_ebs_model = EBS.run_and_analyze(run_times=analytical_run_times,
206 |                                                                  non_normalized_data=df,
207 |                                                                  performance_metric=function_mapper[
208 |                                                                      "better_symbolic_reg_fitness"],
209 |                                                                  cores=-1,
210 |                                                                  size_range=ebs_size_range,
211 |                                                                  expected_eq=expected_eq,
212 |                                                                  save_dir=results_folder)
213 |             # 4.2) save the fitting score results
214 |             ResultTracker.ebs_results(model=best_ebs_model,
215 |                                       all_scores=all_ebs_scores,
216 |                                       save_dir=results_folder)
217 |         else:
218 |             Logger.print("EBF search of a symbolic equation wasn't needed")
219 |         # 4.3) log elapsed time
220 |         ebs_end_time = time.time()
221 |         Logger.print("Finished. Elapsed time: {}".format(timedelta(seconds=ebs_end_time - symbolic_end_time)))
222 | 
223 |         # 5) alert results to the user
224 |         Logger.print("\n --- TOTAL ELAPSED TIME: {} ---".format(timedelta(seconds=time.time() - start_time)))
225 | 
--------------------------------------------------------------------------------
/experiments/exp_noise.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import json
4 | import time
5 | import pandas as pd
6 | from datetime import timedelta
7 | from sklearn.model_selection import train_test_split
8 | 
9 | # project imports
10 | from utills.consts import *
11 | from utills.fitness_methods import *
12 | from utills.logger_config import Logger
13 | from algo.equation_brute_search import EBS
14 | from utills.result_tracker import ResultTracker
15 | from algo.multi_tpot_analysis import MultiTPOTrunner
16 | from algo.genetic_algorithm_symbolic_fit import GASF
17 | from algo.genetic_algorithm_feature_selection import GAFS
18 | 
19 | 
20 | class ExpNoise:
21 |     """
22 |     Runs an SFF-style experiment repeatedly, once per noise level
23 |     in noise_list, to test how the injected measurement noise
24 |     affects the numerical and analytical parts of the pipeline.
25 |     Otherwise, the flow matches the SFF experiments.
26 |     """
27 | 
28 |     def __init__(self):
29 |         pass
30 | 
31 |     @staticmethod
32 |     def run(numerical_bool: bool,
33 |             analytical_bool: bool,
34 |             force_ebs_bool: bool,
35 |             results_folder: str,
36 |             data_path: str,
37 |             data_generation_function,
38 |             numerical_run_times: int,
39 |             numerical_generations: int,
40 |             numerical_population: int,
41 |             analytical_run_times: int,
42 |             analytical_generations: int,
43 |             analytical_population: int,
44 |             parsimony_coefficient: float,
45 |             k_fold: int,
46 |             samples: int,
47 |             rhoa_range: tuple,
48 |             rhop_range: tuple,
49 |             nu_range: tuple,
50 |             re_range: tuple,
51 |             ebs_size_range: tuple,
52 |             noise_list: list,
53 |             expected_eq: str = "Unknown",  # capital-U "Unknown" is the sentinel EBS checks against
54 |             feature_selection_generations: int = None,
55 |             feature_selection_pop_size: int = None,
56 |             feature_selection_mutation_rate: float = None,
57 |             feature_selection_royalty: float = None):
58 |         global SFF_CASE_2_NOISE_RANGE
59 | 
60 |         answer = []
61 |         for noise in noise_list:
62 |             start_time = time.time()  # time this noise level; prepare IO next
63 |             os.makedirs(results_folder, exist_ok=True)
64 |             Logger(os.path.join(os.path.dirname(os.path.dirname(__file__)),
65 |                                 results_folder,
66 |                                 "run.log"))
67 |             # 1) generate data
68 |             SFF_CASE_2_NOISE_RANGE = noise
69 |             feature_indexes_ranges = data_generation_function(samples=samples,
70 |                                                               rhoa_range=rhoa_range,
71 |                                                               nu_range=nu_range,
72 |                                                               re_range=re_range,
73 |                                                               rhop_range=rhop_range,
74 |                                                               save_path=data_path)
75 |             # 1.1) load data, normalize and split
76 |             df = pd.read_csv(data_path)
77 |             Logger.print('Generated data:\n{}'.format(df.describe()))
78 |             y_col = df.keys()[-1]
79 |             normalized_df = (df - df.min()) / (df.max() - df.min())
80 |             train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(normalized_df.drop([y_col], axis=1),
81 |                                                                                      normalized_df[y_col],
82 |                                                                                      shuffle=True,
83 |                                                                                      test_size=SFF_TEST_SIZE_PORTION,
84 |                                                                                      random_state=RANDOM_STATE)
85 |             # 1.2) log elapsed time
86 |             data_end_time = time.time()
87 |             Logger.print(" --- Finished. 
Elapsed time: {} ---".format(timedelta(seconds=data_end_time - start_time))) 88 | 89 | # 2) continue to the MultiTPOTrunner regression 90 | Logger.print('Training MultiTPOTrunner:') 91 | if numerical_bool: 92 | # 2.1) run multi-tpot analysis if feature selection isn't needed 93 | if feature_indexes_ranges == "Not applicable": 94 | # 2.1.1) find the best ML model for all the data 95 | all_t_scores, best_t_model = MultiTPOTrunner.run_and_analyze(run_times=numerical_run_times, 96 | train_data_x=train_data_x, 97 | train_data_y=train_data_y, 98 | test_data_x=test_data_x, 99 | test_data_y=test_data_y, 100 | generations=numerical_generations, 101 | population_size=numerical_population, 102 | k_fold=k_fold, 103 | performance_metric="neg_mean_absolute_error", 104 | save_dir=results_folder, 105 | n_jobs=-1) 106 | # 2.1.2) save results of best model from all runs 107 | ResultTracker.run(program_part="tpot", 108 | run_times=numerical_run_times, 109 | all_scores=all_t_scores, 110 | model=best_t_model, 111 | train_data_x=train_data_x, 112 | train_data_y=train_data_y, 113 | test_data_x=test_data_x, 114 | test_data_y=test_data_y, 115 | save_dir=results_folder) 116 | # 2.2) run multi-tpot analysis with feature selection 117 | else: 118 | # 2.2.1) find the best ML model for a subset of the data 119 | best_gene = GAFS.run(tpot_run_times=numerical_run_times, 120 | feature_generations=feature_selection_generations, 121 | tpot_regressor_generations=numerical_generations, 122 | feature_population_size=feature_selection_pop_size, 123 | tpot_regressor_population_size=numerical_population, 124 | mutation_rate=feature_selection_mutation_rate, 125 | feature_indexes_ranges=feature_indexes_ranges, 126 | mutation_w=[val[1]-val[0] for val in feature_indexes_ranges], 127 | royalty=feature_selection_royalty, 128 | k_fold=k_fold, 129 | performance_metric="neg_mean_absolute_error", 130 | train_data_x=train_data_x, 131 | train_data_y=train_data_y, 132 | test_data_x=test_data_x, 133 | test_data_y=test_data_y, 134 | save_dir=results_folder, 135 | cores=-1) 136 | # 2.2.2) save results of best model from all runs 137 | ResultTracker.run(program_part="tpot", 138 | run_times=numerical_run_times, 139 | all_scores=best_gene.scoring_history, 140 | model=best_gene.model_object, 141 | train_data_x=train_data_x.iloc[:, best_gene.feature_indexes], 142 | train_data_y=train_data_y, 143 | test_data_x=test_data_x.iloc[:, best_gene.feature_indexes], 144 | test_data_y=test_data_y, 145 | save_dir=results_folder) 146 | # 2.2.3) save selected features of best gene 147 | with open(os.path.join(os.path.dirname(__file__), results_folder, "best_features_selected.json"), 148 | "w") as features_file: 149 | json.dump({"index": best_gene.feature_indexes, 150 | "names": list(test_data_x.columns[best_gene.feature_indexes])}, 151 | features_file) 152 | Logger.print("Best gene features: {}".format(list(test_data_x.columns[best_gene.feature_indexes]))) 153 | # 2.2.4) reduce the dataset of non-normalized samples for next part 154 | df = df.iloc[:, best_gene.feature_indexes+[-1]] 155 | # 2.3) log elapsed time 156 | tpot_end_time = time.time() 157 | symbolic_end_time = time.time() 158 | Logger.print(" --- Finished. 
Elapsed time: {} ---".format(timedelta(seconds=tpot_end_time-data_end_time)))
159 | 
160 |             # 3) continue to the symbolic regression
161 |             if analytical_bool:
162 |                 Logger.print('Searching for a symbolic expression:')
163 |                 # 3.1) run symbolic regressor multiple times
164 |                 all_s_scores, best_s_model = GASF.run_and_analyze(run_times=analytical_run_times,
165 |                                                                   non_normalized_data=df,
166 |                                                                   performance_metric=function_mapper["better_symbolic_reg_fitness"],
167 |                                                                   generations=analytical_generations,
168 |                                                                   population_size=analytical_population,
169 |                                                                   k_fold=k_fold,
170 |                                                                   cores=-1,
171 |                                                                   parsimony_coefficient=parsimony_coefficient,
172 |                                                                   expected_eq=expected_eq,
173 |                                                                   save_dir=results_folder)
174 |                 # 3.2) save results of best model from all runs
175 |                 non_norm_train_x, non_norm_test_x, non_norm_train_y, non_norm_test_y = train_test_split(df.drop([y_col], axis=1),
176 |                                                                                                         df[y_col],
177 |                                                                                                         shuffle=True,
178 |                                                                                                         test_size=SFF_TEST_SIZE_PORTION,
179 |                                                                                                         random_state=RANDOM_STATE)
180 |                 ResultTracker.run(program_part="symbolic",
181 |                                   run_times=analytical_run_times,
182 |                                   all_scores=all_s_scores,
183 |                                   model=best_s_model,
184 |                                   train_data_x=non_norm_train_x,
185 |                                   train_data_y=non_norm_train_y,
186 |                                   test_data_x=non_norm_test_x,
187 |                                   test_data_y=non_norm_test_y,
188 |                                   save_dir=results_folder)
189 |                 # 3.3) save a summary of the eqs found & decide whether to continue to the EBS
190 |                 ebs_flag = ResultTracker.summaries_symbolic_results(run_times=analytical_run_times,
191 |                                                                     percent_of_majority=SYMBOLIC_PERCENT_OF_MAJORITY,
192 |                                                                     eq_ranking_metric=SYMBOLIC_EQ_RANKING_METRIC,
193 |                                                                     top_eqs_max_num=SYMBOLIC_TOP_EQS_MAX_NUM,
194 |                                                                     save_dir=results_folder)
195 |                 # 3.4) log elapsed time
196 |                 symbolic_end_time = time.time()
197 |                 Logger.print("Finished. Elapsed time: {}".format(timedelta(seconds=symbolic_end_time - tpot_end_time)))
198 |             else:
199 |                 ebs_flag = False
200 | 
201 |             # 4) continue to the EBS
202 |             if ebs_flag or force_ebs_bool:
203 |                 Logger.print('Searching for a symbolic expression using EBF:')
204 |                 # 4.1) run EBS multiple times
205 |                 all_ebs_scores, best_ebs_model = EBS.run_and_analyze(run_times=analytical_run_times,
206 |                                                                      non_normalized_data=df,
207 |                                                                      performance_metric=function_mapper[
208 |                                                                          "better_symbolic_reg_fitness"],
209 |                                                                      cores=-1,
210 |                                                                      size_range=ebs_size_range,
211 |                                                                      expected_eq=expected_eq,
212 |                                                                      save_dir=results_folder)
213 |                 # 4.2) save the fitting score results
214 |                 ResultTracker.ebs_results(model=best_ebs_model,
215 |                                           all_scores=all_ebs_scores,
216 |                                           save_dir=results_folder)
217 |                 answer.append(all_ebs_scores)
218 |         return answer  # the EBS scores collected per noise level
--------------------------------------------------------------------------------
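A minimal usage sketch (illustrative, not a file from the repository): the snippet below drives the whole pipeline through scimed.run, following the signature shown in scimed.py above. The CSV path, the assumption that the last column is the target, and the small run counts are placeholders chosen for a quick smoke test; demo/demo.py and demo/demo.csv in the repository serve the same purpose.

# illustrative sketch -- assumes a CSV whose last column is the target, as in the experiment scripts
import pandas as pd
from sklearn.model_selection import train_test_split
from scimed import scimed

df = pd.read_csv("demo/demo.csv")                        # placeholder dataset path
y_col = df.keys()[-1]                                    # target assumed to be the last column
normalized_df = (df - df.min()) / (df.max() - df.min())  # min-max normalization, as in the experiments
train_x, test_x, train_y, test_y = train_test_split(normalized_df.drop([y_col], axis=1),
                                                    normalized_df[y_col],
                                                    shuffle=True,
                                                    test_size=0.2)
scimed.run(train_data_x=train_x,
           train_data_y=train_y,
           test_data_x=test_x,
           test_data_y=test_y,
           results_folder="results",
           numerical_run_times=2,      # small values so the sketch finishes quickly
           numerical_generations=5,
           numerical_population=20,
           analytical_run_times=2,
           analytical_generations=5,
           analytical_population=20)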