├── algo
│   ├── ebs
│   │   ├── __init__.py
│   │   ├── eq_functions.py
│   │   ├── eq.py
│   │   └── eq_node.py
│   ├── gene.py
│   ├── operators
│   │   ├── mutation.py
│   │   ├── crossover.py
│   │   ├── next_generation.py
│   │   └── fitness.py
│   ├── population.py
│   ├── multi_tpot_analysis.py
│   ├── genetic_algorithm_feature_selection.py
│   ├── genetic_algorithm_symbolic_fit.py
│   └── equation_brute_search.py
├── experiments
│   ├── __init__.py
│   ├── exp_steady_free_fall_with_drag_case_1.py
│   ├── exp_steady_free_fall_with_drag_case_2.py
│   ├── exp_steady_free_fall_with_drag_case_2_with_educated_guess.py
│   ├── exp_steady_free_fall_with_drag_case_3.py
│   ├── exp_constant_acceleration.py
│   ├── exp_drag_force.py
│   ├── exp_steady_free_fall_with_drag.py
│   └── exp_noise.py
├── data
│   └── Readme.md
├── requirements.txt
├── Dockerfile
├── setup.py
├── LICENSE
├── demo
│   ├── demo.py
│   └── demo.csv
├── utills
│   ├── logger_config.py
│   ├── fitness_methods.py
│   ├── tpot_results_extractor.py
│   ├── symbolic_regression_to_latex_text.py
│   ├── consts.py
│   ├── result_tracker.py
│   └── plotter.py
├── data_generators
│   ├── N_frequency_generator.py
│   ├── constant_acceleration_data_generator.py
│   ├── drag_force_data_generator.py
│   └── steady_free_fall_with_drag_data_generator.py
├── running_sciemed.py
├── .gitignore
├── paper_exp_runner.py
├── README.md
└── scimed.py

/algo/ebs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/Readme.md:
--------------------------------------------------------------------------------
1 | This folder will contain the data generated for training
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | numpy
3 | matplotlib
4 | seaborn
5 | scikit-learn
6 | scipy
7 | TPOT
8 | gplearn
9 | torch
10 | termcolor
11 | sympy
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 |
3 | ARG PYTHON_VERSION=3.10
4 |
5 | FROM python:${PYTHON_VERSION}
6 |
7 | WORKDIR /src
8 |
9 | COPY requirements.txt /src/requirements.txt
10 | RUN pip install --no-cache-dir -r requirements.txt
11 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import setup, find_packages
4 |
5 | setup(name='SciMED',
6 |       version='1.0',
7 |       description='A computational framework for finding symbolic expressions from physical datasets.',
8 |       author='Liron Simon Keren, Alex Liberzon, Teddy Lazebnik',
9 |       author_email='lazebnik.teddy@gmail.com',
10 |       url='https://github.com/LironSimon/SciMED',
11 |       packages=find_packages(),
12 |       )
13 |
--------------------------------------------------------------------------------
/algo/ebs/eq_functions.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import numpy as np
3 | import pandas as pd
4 |
5 | # project import
6 |
7 |
8 | def add(a: pd.Series,
9 |         b: pd.Series):
10 |     return a + b
11 |
12 |
13 | def sub(a: pd.Series,
14 |         b: pd.Series):
15 |     return a - b
16 |
17 |
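# Note: div (below) sanitizes its output -- NaN values (from 0/0) and ±inf
# values (from division by zero) are mapped to 0, so candidate equations
# never propagate invalid values through the brute-force search.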
18 | def div(a: pd.Series,
19 |         b: pd.Series):
20 |     return (a / b).fillna(0).replace([np.inf, -np.inf], 0)
21 |
22 |
23 | def mul(a: pd.Series,
24 |         b: pd.Series):
25 |     return a * b
26 |
27 |
28 | FUNCTION_MAPPER = {
29 |     add: "add",
30 |     sub: "sub",
31 |     mul: "mul",
32 |     div: "div"
33 | }
34 |
35 | FUNCTION_LIST = [add, sub, mul, div]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Teddy Lazebnik
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/demo/demo.py:
--------------------------------------------------------------------------------
1 | """
2 | This file provides a demo on how to use SciMED on a captured dataset that is stored in a CSV file format
3 | """
4 |
5 | """
6 | 1. Needed library imports
7 | """
8 | import os
9 | import pandas as pd
10 | from sklearn.model_selection import train_test_split
11 | """
12 | 2. Import SciMED's instance
13 | """
14 | from scimed import scimed
15 | """
16 | 3. Load the data into a pandas.DataFrame and split it into the source and target features
17 | """
18 | df = pd.read_csv("demo.csv")
19 | Y_COL_NAME = ""  # set this to the name of the target column in demo.csv
20 | x = df.drop([Y_COL_NAME], axis=1)
21 | y = df[Y_COL_NAME]
22 | x_train, x_test, y_train, y_test = train_test_split(x,
23 |                                                     y,
24 |                                                     test_size=0.2,  # most of the time, we divide 80%-20%
25 |                                                     random_state=73)  # Sheldon's number - just for fun
26 | """
27 | 4. Run SciMED and observe the results in the 'results_folder'
28 | """
29 | scimed.run(train_data_x=x_train,
30 |            train_data_y=y_train,
31 |            test_data_x=x_test,
32 |            test_data_y=y_test,
33 |            results_folder=os.path.join(os.path.dirname(__file__), "results")
34 |            )
--------------------------------------------------------------------------------
/utills/logger_config.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import logging
3 | from termcolor import colored
4 |
5 | # project imports
6 |
7 |
8 | class Logger:
9 |     """
10 |     Our project logger
11 |     """
12 |
13 |     logger = logging.getLogger()
14 |     logging.getLogger('matplotlib.font_manager').disabled = True
15 |     logger.setLevel(logging.DEBUG)
16 |
17 |     def __init__(self, save_path):
18 |         logging.basicConfig(filename=save_path,
19 |                             format='%(asctime)s %(message)s',
20 |                             filemode='w')
21 |
22 |     @staticmethod
23 |     def print(message: str):
24 |         Logger.logger.info(message)
25 |         print("Info: {}".format(message))
26 |
27 |     @staticmethod
28 |     def info(message: str):
29 |         Logger.print(message=message)
30 |
31 |     @staticmethod
32 |     def important(message: str):
33 |         Logger.logger.critical(message)
34 |         print("Important: {}".format(colored(message, attrs=["bold"])))  # "bold" is an attribute, not a color
35 |
36 |     @staticmethod
37 |     def debug(message: str):
38 |         Logger.logger.debug(message)
39 |         print("Debug: {}".format(colored(message, "red")))
40 |
41 |     def __repr__(self):
42 |         return self.__str__()
43 |
44 |     def __str__(self):
45 |         return "<Logger>"
46 |
--------------------------------------------------------------------------------
/algo/gene.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | from random import randint
3 | import pandas as pd
4 |
5 | # project imports
6 |
7 |
8 | class Gene:
9 |     """
10 |     A gene to represent a subset of features,
11 |     ensuring there is only one feature representation
12 |     from each group of similarly-created features.
13 |     """
14 |
15 |     def __init__(self,
16 |                  feature_indexes: list,
17 |                  scores: pd.DataFrame = pd.DataFrame(),
18 |                  fitness: float = 0,
19 |                  model_object=None):
20 |         self.feature_indexes = feature_indexes
21 |         self.scoring_history = scores
22 |         self.fitness = fitness
23 |         self.model_object = model_object
24 |
25 |
26 |     def __repr__(self):
27 |         return self.__str__()
28 |
29 |     def __str__(self):
30 |         return "<Gene: feature_indexes={}, fitness={}>".format(self.feature_indexes,
31 |                                                                self.fitness)
32 |
33 |     @staticmethod
34 |     def random(feature_indexes_ranges: list,
35 |                feature_count: int):
36 |         return Gene(feature_indexes=[randint(feature_indexes_ranges[i][0],
37 |                                              feature_indexes_ranges[i][1])
38 |                                      for i in range(feature_count)])
39 |
40 |     def length(self):
41 |         return len(self.feature_indexes)
42 |
--------------------------------------------------------------------------------
/algo/operators/mutation.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import numpy as np
3 | from random import randint, random
4 |
5 | # project imports
6 | from algo.population import Population
7 |
8 |
9 | class Mutation:
10 |     """
11 |     mutation operations class
12 |     """
13 |
14 |     def __init__(self):
15 |         pass
16 |
17 |     @staticmethod
18 |     def simple(population: Population,
19 |                feature_indexes_ranges: list,
20 |                mutation_rate: float,
21 |                w: list = None):
22 |         """
23 |         Just a simple random mutation: changes the selected feature
24 |         of one feature-group to a different feature from the same group.
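        For example, with feature_indexes_ranges = [[0, 2], [3, 5]], a gene
        holding [1, 4] may become [1, 5]: one position is drawn (weighted by w,
        uniform by default) and re-sampled from that group's index range.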
25 |         """
26 |         if w is None:
27 |             w = [1 / len(population[0].feature_indexes) for _ in range(len(population[0].feature_indexes))]
28 |         w = np.asarray(w)
29 |         w = w / w.sum()
30 |         for gene in population:
31 |             if random() < mutation_rate:
32 |                 pick_index = np.random.choice(range(len(gene.feature_indexes)),
33 |                                               1,
34 |                                               p=w)[0]
35 |                 gene.feature_indexes[pick_index] = randint(feature_indexes_ranges[pick_index][0],
36 |                                                            feature_indexes_ranges[pick_index][1])
37 |         return population
38 |
39 |     def __repr__(self):
40 |         return self.__str__()
41 |
42 |     def __str__(self):
43 |         return "<Mutation>"
44 |
--------------------------------------------------------------------------------
/data_generators/N_frequency_generator.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import numpy as np
3 | import pandas as pd
4 |
5 | # project imports
6 | from utills.consts import *
7 |
8 |
9 | class NFrequencyGenerator:
10 |     """
11 |     This class is responsible for creating all possible combinations
12 |     representing a fluid frequency N [1/s].
13 |     """
14 |
15 |     def __init__(self):
16 |         pass
17 |
18 |     @staticmethod
19 |     def add_all_combos(df: pd.DataFrame,
20 |                        g: float):
21 |         """
22 |         Single entry point.
23 |         Adds all possible combinations of N as new columns in a given df.
24 |         Returns the modified df and a list of suffixes that indicate
25 |         how N was calculated.
26 |         """
27 |         for suff in N_FREQ_SUFFIX:
28 |             # choose rho_up
29 |             if suff[-2] == '1':
30 |                 rho_up = df["rhop"] - df["rhoa"]
31 |             elif suff[-2] == '2':
32 |                 rho_up = df["rhop"]
33 |             else:
34 |                 rho_up = df["rhoa"]
35 |
36 |             # choose rho_down
37 |             if suff[-1] == '1':
38 |                 rho_down = 0.5 * (df["rhop"] + df["rhoa"])
39 |             elif suff[-1] == '2':
40 |                 rho_down = df["rhop"]
41 |             else:
42 |                 rho_down = df["rhoa"]
43 |
44 |             # calc N and add to Ns (.loc, not .at -- .at only accepts scalar labels)
45 |             df.loc[:, "N{}".format(suff)] = np.sqrt((g * rho_up) / (df["d"] * rho_down))
46 |         # return answer
47 |         return df, N_FREQ_SUFFIX
48 |
--------------------------------------------------------------------------------
/algo/population.py:
--------------------------------------------------------------------------------
1 | # library imports
2 |
3 | # project imports
4 | from algo.gene import Gene
5 |
6 |
7 | class Population:
8 |     """
9 |     A population of genes
10 |     """
11 |
12 |     def __init__(self,
13 |                  genes: list = None):
14 |         self.genes = genes if isinstance(genes, list) and len(genes) > 0 else []
15 |
16 |     def __getitem__(self, item):
17 |         return self.genes[item]  # delegate to list.__getitem__
18 |
19 |     def __repr__(self):
20 |         return self.__str__()
21 |
22 |     def __str__(self):
23 |         return "<Population: {} genes>".format(len(self.genes))
24 |
25 |     @staticmethod
26 |     def random(size: int,
27 |                feature_count: int,
28 |                feature_indexes_ranges: list):
29 |         return Population(genes=[Gene.random(feature_indexes_ranges=feature_indexes_ranges,
30 |                                              feature_count=feature_count)
31 |                                  for _ in range(size)])
32 |
33 |     def get_best(self):
34 |         best_gene = self.genes[0]
35 |         best_gene_fitness = self.genes[0].fitness
36 |         for gene in self.genes:
37 |             if gene.fitness > best_gene_fitness:
38 |                 best_gene_fitness = gene.fitness
39 |                 best_gene = gene
40 |         return best_gene
41 |
42 |     def get_scores(self):
43 |         return [gene.fitness for gene in self.genes]
44 |
45 |     def get(self,
46 |             index: int):
47 |         return self.genes[index]
48 |
49 |     def remove(self,
50 |                index: int):
51 |         self.genes.remove(self.genes[index])
52 |
53 |     def add(self,
54 |             gene: Gene):
55 |         self.genes.append(gene)
56 |
57 |     def size(self):
58 |         return len(self.genes)
59 |
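# A minimal usage sketch (the group count and index ranges below are
# illustrative only, not taken from the project):
if __name__ == "__main__":
    pop = Population.random(size=4,
                            feature_count=2,
                            feature_indexes_ranges=[[0, 2], [3, 5]])
    print(pop.size())        # -> 4
    print(pop.get_scores())  # -> [0, 0, 0, 0] until a fitness operator scores the genes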
--------------------------------------------------------------------------------
/algo/operators/crossover.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | from random import randint
3 |
4 | # project imports
5 | from algo.gene import Gene
6 | from algo.population import Population
7 |
8 |
9 | class Crossover:
10 |     """
11 |     crossover operations class
12 |     """
13 |
14 |     def __init__(self):
15 |         pass
16 |
17 |     @staticmethod
18 |     def simple(population: Population):
19 |         """
20 |         Just a simple single-point random crossover:
21 |         returns a new population by randomly picking pairs of gene
22 |         parents + a breaking point, to generate a pair of offspring.
23 |         Note: this assumes an even population size -- with an odd size, the
24 |         last unpaired gene would make the loop below spin forever.
25 |         """
26 |         new_pop = Population()
27 |         while population.size() > 0:
28 |             # pick two different genes
29 |             i = randint(0, population.size() - 1)
30 |             j = randint(0, population.size() - 1)
31 |             while i == j:
32 |                 i = randint(0, population.size() - 1)
33 |                 j = randint(0, population.size() - 1)
34 |             gene_i = population.get(i)
35 |             gene_j = population.get(j)
36 |             # pick a single breaking point
37 |             break_index = randint(1, len(gene_i.feature_indexes) - 2)
38 |             # recall to new list
39 |             new_pop.add(gene=Gene(feature_indexes=gene_i.feature_indexes[:break_index] + gene_j.feature_indexes[break_index:]))
40 |             new_pop.add(gene=Gene(feature_indexes=gene_j.feature_indexes[:break_index] + gene_i.feature_indexes[break_index:]))
41 |             # remove from previous list -- the higher index must be removed first,
42 |             # otherwise removing the lower one shifts the other gene's position by one
43 |             if i < j:
44 |                 population.remove(index=j)
45 |                 population.remove(index=i)
46 |             else:
47 |                 population.remove(index=i)
48 |                 population.remove(index=j)
49 |         return new_pop
50 |
51 |     def __repr__(self):
52 |         return self.__str__()
53 |
54 |     def __str__(self):
55 |         return "<Crossover>"
56 |
--------------------------------------------------------------------------------
/running_sciemed.py:
--------------------------------------------------------------------------------
1 | """
2 | 1. Needed library imports
3 | """
4 | import os
5 | import pandas as pd
6 | from sklearn.preprocessing import MinMaxScaler
7 | from sklearn.model_selection import train_test_split
8 | """
9 | 2. Import SciMED's instance
10 | """
11 | from scimed import scimed
12 | """
13 | 3. Load the data into a pandas.DataFrame, min-max normalize it, and split it into the source and target features
14 | """
15 |
16 |
17 | def run():
18 |     df = pd.read_csv("412_dataset.csv")
19 |     Y_COL_NAME = "tau/t_expected"
20 |     df = (df - df.min()) / (df.max() - df.min())
21 |     x = df.drop([Y_COL_NAME], axis=1)
22 |     y = df[Y_COL_NAME]
23 |     x_train, x_test, y_train, y_test = train_test_split(x,
24 |                                                         y,
25 |                                                         test_size=0.2,  # most of the time, we divide 80%-20%
26 |                                                         random_state=73)  # Sheldon's number - just for fun
27 |     feature_indexes_ranges = [[0, 0], [1, 4], [5, 20], [21, 21], [22, 27], [28, 412]]
28 |     """
29 |     4.
Run SciMED and observe the results in the 'results_folder' 30 | """ 31 | scimed.run(train_data_x=x_train, 32 | train_data_y=y_train, 33 | test_data_x=x_test, 34 | test_data_y=y_test, 35 | results_folder=os.path.join(os.path.dirname(__file__), "results"), 36 | k_fold = 5, 37 | numerical_bool = True, 38 | numerical_run_times = 1, 39 | numerical_generations = 25, 40 | numerical_population = 40, 41 | analytical_bool = False, 42 | force_ebs_bool = True, 43 | ebs_size_range = (3, 13), 44 | feature_indexes_ranges = feature_indexes_ranges, 45 | feature_selection_generations = 30, 46 | feature_selection_pop_size = 26, 47 | feature_selection_mutation_rate = 0.03, 48 | feature_selection_royalty=0.05 49 | ) 50 | 51 | 52 | if __name__ == '__main__': 53 | run() 54 | -------------------------------------------------------------------------------- /algo/operators/next_generation.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import random 3 | 4 | # project imports 5 | from algo.population import Population 6 | 7 | 8 | class NextGeneration: 9 | """ 10 | A next generation metric 11 | """ 12 | 13 | def __init__(self): 14 | pass 15 | 16 | @staticmethod 17 | def tournament_with_royalty(population: Population, 18 | royalty: float): 19 | """ 20 | A tournament next generation with royalty 21 | """ 22 | # calc the probability of selecting a gene to a tournament 23 | sum_fitness = sum(population.get_scores()) 24 | if sum_fitness > 0: 25 | fitness_probabilities = [score / sum_fitness for score in population.get_scores()] 26 | else: 27 | fitness_probabilities = population.get_scores() 28 | # sort the population by probability of selection 29 | genes_with_fitness = zip(fitness_probabilities, population.genes) 30 | genes_with_fitness = sorted(genes_with_fitness, key=lambda x: x[0], reverse=True) 31 | # pick the most probable genes (those with the largest fitness scores) 32 | royalty_pop = [val[1] for val in genes_with_fitness[:round(len(genes_with_fitness) * royalty)]] 33 | # tournament around the other genes 34 | left_genes = [val[1] for val in genes_with_fitness[round(len(genes_with_fitness) * royalty):]] 35 | left_fitness = [val[0] for val in genes_with_fitness[round(len(genes_with_fitness) * royalty):]] 36 | pick_genes = [] 37 | left_count = len(population.genes) - len(royalty_pop) 38 | while len(pick_genes) < left_count: 39 | pick_gene = random.choices(left_genes, weights=left_fitness)[0] 40 | pick_genes.append(pick_gene) 41 | # add the royalty to the genes selected in the tournament 42 | pick_genes = list(pick_genes) 43 | pick_genes.extend(royalty_pop) 44 | return Population(genes=pick_genes) 45 | 46 | def __repr__(self): 47 | return self.__str__() 48 | 49 | def __str__(self): 50 | return "" 51 | -------------------------------------------------------------------------------- /utills/fitness_methods.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import numpy as np 3 | from sklearn.metrics import make_scorer 4 | from gplearn.fitness import make_fitness 5 | from sklearn.metrics import mean_absolute_error, mean_squared_error 6 | 7 | # functions we might want to use as part of the TPOT process 8 | neg_mean_squared_error_scorer = make_scorer(mean_squared_error, greater_is_better=False) 9 | 10 | 11 | def simple_symbolic_reg_fitness(y: np.ndarray, 12 | y_pred: np.ndarray, 13 | sample_weight: np.ndarray = None) -> np.float64: 14 | """ 15 | Just the MSE 16 | :param y_true: the list of baseline 
values to compare with 17 | :param y_pred: the list of model predicted values to evaluate 18 | :return: the error value from 0 to inf 19 | """ 20 | if sample_weight is None: 21 | return mean_squared_error(y_true=y, y_pred=y_pred) 22 | else: 23 | return mean_squared_error(y_true=y, y_pred=y_pred, sample_weight=sample_weight) 24 | 25 | 26 | def better_symbolic_reg_fitness(y: np.ndarray, 27 | y_pred: np.ndarray, 28 | sample_weight: np.ndarray = None) -> np.float64: 29 | """ 30 | Taking ideas from https://arxiv.org/abs/1904.05417 for better overall results 31 | :param y_true: the list of baseline values to compare with 32 | :param y_pred: the list of model predicted values to evaluate 33 | :return: the error value from 0 to inf 34 | """ 35 | if sample_weight is None: 36 | return mean_squared_error(y_true=y, y_pred=y_pred) + mean_absolute_error(y_true=y, y_pred=y_pred) + np.max(y-y_pred) 37 | else: 38 | return mean_squared_error(y_true=y, y_pred=y_pred, sample_weight=sample_weight) + mean_absolute_error(y_true=y, y_pred=y_pred, sample_weight=sample_weight) + np.max(y-y_pred) 39 | 40 | 41 | # common functions 42 | function_mapper = { 43 | "simple_symbolic_reg_fitness": make_fitness(function=simple_symbolic_reg_fitness, greater_is_better=False), 44 | "better_symbolic_reg_fitness": make_fitness(function=better_symbolic_reg_fitness, greater_is_better=False) 45 | } 46 | -------------------------------------------------------------------------------- /data_generators/constant_acceleration_data_generator.py: -------------------------------------------------------------------------------- 1 | # project imports 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # library imports 6 | 7 | 8 | class ConstantAccelerationDataGenerator: 9 | """ 10 | This class is responsible for the generation of measurements of motion with 11 | constant acceleration to test the model. 12 | """ 13 | 14 | # CONSTS # 15 | 16 | # END - CONSTS # 17 | 18 | def __init__(self): 19 | pass 20 | 21 | # Logic - start # 22 | 23 | @staticmethod 24 | def generate(samples: int, 25 | a_range: tuple, 26 | t_range: tuple, 27 | v0_range: tuple, 28 | noise_range: tuple, 29 | save_path: str): 30 | """ 31 | Generate a pandas dataframe of experiments to represent motion with constant acceleration. 32 | We assume we sample 3 parameters: 33 | v0: initial velocity [m/s] 34 | a: acceleration [m/s2] 35 | t: time pass [s] 36 | and calculate with them: 37 | v: velocity at time t [m/s] 38 | via v = v0 + a * t. 
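        For example, v0 = 2.0 [m/s], a = 3.0 [m/s2] and t = 4.0 [s] give
        v = 2.0 + 3.0 * 4.0 = 14.0 [m/s], before the multiplicative noise
        term is applied.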
39 | """ 40 | a_range_delta = a_range[1] - a_range[0] 41 | t_range_delta = t_range[1] - t_range[0] 42 | v0_range_delta = v0_range[1] - v0_range[0] 43 | noise_range_delta = noise_range[1] - noise_range[0] 44 | data = [] 45 | # generate samples 46 | for i in range(samples): 47 | a = round(np.random.random_sample() * a_range_delta + a_range[0], 2) 48 | v0 = round(np.random.random_sample() * v0_range_delta + v0_range[0], 2) 49 | t = round(np.random.random_sample() * t_range_delta + t_range[0], 2) 50 | noise = round(np.random.random_sample() * noise_range_delta + noise_range[0], 2) * np.random.choice((-1, 1)) 51 | v_sampled = v0 + a * t 52 | v_sampled = round(v_sampled * (1 + noise), 2) 53 | data.append([v0, a, t, v_sampled]) 54 | # make a Pandas.DataFrame and save it as a CSV file 55 | pd.DataFrame(data=data, columns=["v0", "a", "t", "v"]).to_csv(save_path, index=False) 56 | 57 | # Logic - end # 58 | -------------------------------------------------------------------------------- /utills/tpot_results_extractor.py: -------------------------------------------------------------------------------- 1 | # library import 2 | import os 3 | import re 4 | import json 5 | from glob import glob 6 | 7 | 8 | # project import 9 | 10 | 11 | class TPOTresultsExtractor: 12 | """ 13 | This class is responsible to get a file or folder with files produced by the 14 | MultiTPOTrunner library with pipeline configurations and extract only the description of the pipeline itself 15 | """ 16 | 17 | # CONSTS # 18 | OPEN_DIVIDER = "make_pipeline(" 19 | CLOSE_DIVIDER = "exported_pipeline." 20 | # END - CONSTS # 21 | 22 | def __init__(self): 23 | pass 24 | 25 | @staticmethod 26 | def process_folder(folder_path: str, 27 | answer_path: str = None): 28 | """ 29 | :param folder_path: path to a folder with MultiTPOTrunner produced <>.py files 30 | :param answer_path: path to the results, if None, just return the string with the answer 31 | """ 32 | answer = [TPOTresultsExtractor.process_file(file_path=file_path, 33 | answer_path=None) 34 | for file_path in glob(os.path.join(folder_path, "*.py"))] 35 | if answer_path is not None: 36 | with open(answer_path, "w") as answer_file: 37 | json.dump(answer, answer_file) 38 | else: 39 | return answer 40 | 41 | @staticmethod 42 | def process_file(file_path: str, 43 | answer_path: str = None): 44 | """ 45 | :param file_path: path to the MultiTPOTrunner produced <>.py file 46 | :param answer_path: path to the results, if None, just return the string with the answer 47 | """ 48 | data_text = "" 49 | with open(file_path, "r") as input_file: 50 | data_text = input_file.read() 51 | open_index = data_text.find(TPOTresultsExtractor.OPEN_DIVIDER) 52 | close_index = data_text.find(TPOTresultsExtractor.CLOSE_DIVIDER) 53 | pipeline = data_text[open_index + len(TPOTresultsExtractor.OPEN_DIVIDER):close_index].strip().replace("\n", "") 54 | while ", " in pipeline: 55 | pipeline = pipeline.replace(", ", ",") 56 | if answer_path is not None: 57 | with open(answer_path, "w") as result_file: 58 | result_file.write(pipeline) 59 | else: 60 | return pipeline 61 | -------------------------------------------------------------------------------- /experiments/exp_steady_free_fall_with_drag_case_1.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | 3 | # project imports 4 | from utills.consts import * 5 | from experiments.exp_steady_free_fall_with_drag import ExpSFF 6 | from data_generators.steady_free_fall_with_drag_data_generator import SFFWDdataGenerator 
7 | 8 | 9 | class ExpSFF1(ExpSFF): 10 | """ 11 | The first case of the SFF - 12 | Program receives a dataset that is missing an essential feature 13 | (particle velocity), that is needed to deduce the target (drag coefficient). 14 | 15 | Failure of both numerical and analytical parts of the program prove 16 | that if the user neglects to measure a key physical component in the 17 | unknown physical phenomena, the user is alerted by bad results. 18 | """ 19 | 20 | def __init__(self): 21 | ExpSFF.__init__(self) 22 | 23 | @staticmethod 24 | def perform(numerical_bool: bool, 25 | analytical_bool: bool, 26 | force_ebs_bool: bool): 27 | """ 28 | Entry point 29 | """ 30 | ExpSFF.run(numerical_bool=numerical_bool, 31 | analytical_bool=analytical_bool, 32 | force_ebs_bool=force_ebs_bool, 33 | results_folder=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_1_RESULTS_FOLDER_NAME), 34 | data_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_1_DATA_FOLDER_NAME), 35 | data_generation_function=SFFWDdataGenerator.generate_case_1, 36 | numerical_run_times=SFF_NUMERICAL_RUN_TIMES, 37 | numerical_generations=SFF_NUMERICAL_GENERATION_COUNT, 38 | numerical_population=SFF_NUMERICAL_POP_SIZE, 39 | analytical_run_times=SFF_ANALYTICAL_RUN_TIMES, 40 | analytical_generations=SFF_ANALYTICAL_GENERATION_COUNT, 41 | analytical_population=SFF_NUMERICAL_POP_SIZE, 42 | parsimony_coefficient=SFF_ANALYTICAL_PARSIMONY_COEFFICIENT, 43 | k_fold=K_FOLD, 44 | samples=SFF_NUMERICAL_NUM_SAMPLES, 45 | rhoa_range=SFF_RHOA_RANGE, 46 | rhop_range=SFF_RHOP_RANGE, 47 | nu_range=SFF_NU_RANGE, 48 | re_range=SFF_RE_RANGE, 49 | expected_eq="unknown", 50 | ebs_size_range=SFF_DIMENSIONAL_EBS_SIZE_RANGE_1_2 51 | ) 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | .idea/* 7 | *.png 8 | *.json 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 |
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 | data/*
135 | results/*
136 |
--------------------------------------------------------------------------------
/experiments/exp_steady_free_fall_with_drag_case_2.py:
--------------------------------------------------------------------------------
1 | # library imports
2 |
3 | # project imports
4 | from utills.consts import *
5 | from experiments.exp_steady_free_fall_with_drag import ExpSFF
6 | from data_generators.steady_free_fall_with_drag_data_generator import SFFWDdataGenerator
7 |
8 |
9 | class ExpSFF2(ExpSFF):
10 |     """
11 |     The second case of the SFF -
12 |     Program receives a dataset with all essential features needed
13 |     to deduce a "noisy" target (drag coefficient), except for
14 |     the gravitational acceleration constant.
15 |
16 |     Success of both the numerical and analytical parts of the program proves
17 |     that:
18 |     1) The program is able to learn the non-linear physical relation between
19 |     features, even with noisy data.
20 |     2) The program was able to learn the gravitational acceleration constant.
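    (For steady fall, balancing drag against the net weight gives
    Cd = 4 * g * (rhop - rhoa) * d / (3 * rhoa * V**2), so recovering the
    constant 13.08 ~= 4 * 9.81 / 3 from data amounts to learning g.)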
21 |     """
22 |
23 |     def __init__(self):
24 |         ExpSFF.__init__(self)
25 |
26 |     @staticmethod
27 |     def perform(numerical_bool: bool,
28 |                 analytical_bool: bool,
29 |                 force_ebs_bool: bool):
30 |         """
31 |         Entry point
32 |         """
33 |         ExpSFF.run(numerical_bool=numerical_bool,
34 |                    analytical_bool=analytical_bool,
35 |                    force_ebs_bool=force_ebs_bool,
36 |                    results_folder=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_2_RESULTS_FOLDER_NAME),
37 |                    data_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_2_DATA_FOLDER_NAME),
38 |                    data_generation_function=SFFWDdataGenerator.generate_case_2,
39 |                    numerical_run_times=SFF_NUMERICAL_RUN_TIMES,
40 |                    numerical_generations=SFF_NUMERICAL_GENERATION_COUNT,
41 |                    numerical_population=SFF_NUMERICAL_POP_SIZE,
42 |                    analytical_run_times=SFF_ANALYTICAL_RUN_TIMES,
43 |                    analytical_generations=SFF_ANALYTICAL_GENERATION_COUNT,
44 |                    analytical_population=SFF_NUMERICAL_POP_SIZE,
45 |                    parsimony_coefficient=SFF_ANALYTICAL_PARSIMONY_COEFFICIENT,
46 |                    k_fold=K_FOLD,
47 |                    samples=SFF_NUMERICAL_NUM_SAMPLES,
48 |                    rhoa_range=SFF_RHOA_RANGE,
49 |                    rhop_range=SFF_RHOP_RANGE,
50 |                    nu_range=SFF_NU_RANGE,
51 |                    re_range=SFF_RE_RANGE,
52 |                    expected_eq="13.08 * (rhop - rhoa) * d / (rhoa * V * V)",
53 |                    ebs_size_range=SFF_DIMENSIONAL_EBS_SIZE_RANGE_1_2
54 |                    )
55 |
--------------------------------------------------------------------------------
/experiments/exp_steady_free_fall_with_drag_case_2_with_educated_guess.py:
--------------------------------------------------------------------------------
1 | # library imports
2 |
3 | # project imports
4 | from utills.consts import *
5 | from experiments.exp_steady_free_fall_with_drag import ExpSFF
6 | from data_generators.steady_free_fall_with_drag_data_generator import SFFWDdataGenerator
7 |
8 |
9 | class ExpSFF2WithGuess(ExpSFF):
10 |     """
11 |     Similar to the second case of the SFF, but with educated guesses -
12 |     Program receives a dataset with all essential features needed
13 |     to deduce a "noisy" target (drag coefficient), except for
14 |     the gravitational acceleration constant. To that dataset of
15 |     dimensional features, two dimensional educated guesses are added.
16 |
17 |     Success of both the numerical and analytical parts of the program proves
18 |     that:
19 |     1) Educated guesses may improve results, despite the addition of features.
20 |     2) The program was able to learn the gravitational acceleration constant.
21 | """ 22 | 23 | def __init__(self): 24 | ExpSFF.__init__(self) 25 | 26 | @staticmethod 27 | def perform(numerical_bool: bool, 28 | analytical_bool: bool, 29 | force_ebs_bool: bool): 30 | """ 31 | Entry point 32 | """ 33 | ExpSFF.run(numerical_bool=numerical_bool, 34 | analytical_bool=analytical_bool, 35 | force_ebs_bool=force_ebs_bool, 36 | results_folder=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_2_WITH_GUESS_RESULTS_FOLDER_NAME), 37 | data_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_2_WITH_GUESS_DATA_FOLDER_NAME), 38 | data_generation_function=SFFWDdataGenerator.generate_case_2_with_guess, 39 | numerical_run_times=SFF_NUMERICAL_RUN_TIMES, 40 | numerical_generations=SFF_NUMERICAL_GENERATION_COUNT, 41 | numerical_population=SFF_NUMERICAL_POP_SIZE, 42 | analytical_run_times=SFF_ANALYTICAL_RUN_TIMES, 43 | analytical_generations=SFF_ANALYTICAL_GENERATION_COUNT, 44 | analytical_population=SFF_NUMERICAL_POP_SIZE, 45 | parsimony_coefficient=SFF_ANALYTICAL_PARSIMONY_COEFFICIENT, 46 | k_fold=K_FOLD, 47 | samples=SFF_NUMERICAL_NUM_SAMPLES, 48 | rhoa_range=SFF_RHOA_RANGE, 49 | rhop_range=SFF_RHOP_RANGE, 50 | nu_range=SFF_NU_RANGE, 51 | re_range=SFF_RE_RANGE, 52 | expected_eq="13.08 * (rhop - rhoa) * d / (rhoa * V * V)", 53 | ebs_size_range=SFF_DIMENSIONAL_EBS_SIZE_RANGE_2_easy 54 | ) 55 | -------------------------------------------------------------------------------- /algo/operators/fitness.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import pandas as pd 3 | 4 | # project imports 5 | from algo.population import Population 6 | from utills.logger_config import Logger 7 | from algo.multi_tpot_analysis import MultiTPOTrunner 8 | 9 | 10 | class Fitness: 11 | """ 12 | An AutoML wrapper class that responsible to find the best 13 | ML model + hyperparameters for a given dataset 14 | """ 15 | 16 | def __init__(self): 17 | pass 18 | 19 | @staticmethod 20 | def tpot(run_times: int, 21 | train_data_x: pd.DataFrame, 22 | train_data_y: pd.DataFrame, 23 | test_data_x: pd.DataFrame, 24 | test_data_y: pd.DataFrame, 25 | generations: int, 26 | population: Population, 27 | population_size: int, 28 | k_fold: int, 29 | performance_metric, 30 | save_dir: str, 31 | n_jobs: int = -1): 32 | """ 33 | Loops over all genes in the population, reduces the dataset 34 | according to their sequence, and uses a TPOTRegressor to 35 | find the best ML model + hyperparameters for the reduced data. 36 | 37 | Saves the fittness score and best ML model to the memory of each gene, 38 | and returns the whole population. 
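        Note: gene.fitness is set from best_model.score(...) on the held-out
        test data, which follows the scikit-learn scorer convention of
        "greater is better" -- consistent with the maximisation done in
        Population.get_best.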
39 | """ 40 | for gene_index, gene in enumerate(population.genes): 41 | Logger.print("\nAssigning to gene #{}/{}:".format(gene_index, population.size()-1)) 42 | # reduce data according to gene sequence 43 | reduced_train_data_x = train_data_x.iloc[:, gene.feature_indexes] 44 | reduced_test_data_x = test_data_x.iloc[:, gene.feature_indexes] 45 | # run TPOT analysis multiple times on the reduced data 46 | results, best_model = MultiTPOTrunner.run_and_analyze(run_times=run_times, 47 | train_data_x=reduced_train_data_x, 48 | train_data_y=train_data_y, 49 | test_data_x=reduced_test_data_x, 50 | test_data_y=test_data_y, 51 | generations=generations, 52 | population_size=population_size, 53 | k_fold=k_fold, 54 | performance_metric=performance_metric, 55 | save_dir=save_dir, 56 | n_jobs=n_jobs) 57 | # save the best performing model & scoring history to gene's data 58 | gene.model_object = best_model 59 | gene.scoring_history = results 60 | # assign fittness acc to performance score of the model 61 | gene.fitness = best_model.score(reduced_test_data_x, test_data_y) 62 | return population 63 | 64 | def __repr__(self): 65 | return self.__str__() 66 | 67 | def __str__(self): 68 | return "" 69 | -------------------------------------------------------------------------------- /experiments/exp_steady_free_fall_with_drag_case_3.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | 3 | # project imports 4 | from utills.consts import * 5 | from experiments.exp_steady_free_fall_with_drag import ExpSFF 6 | from data_generators.steady_free_fall_with_drag_data_generator import SFFWDdataGenerator 7 | 8 | 9 | class ExpSFF3(ExpSFF): 10 | """ 11 | The third case of the SFF - 12 | Program receives a dataset with all non-dimensional 13 | combinations of possible dimensional variables, relating to the 14 | target (a total of 34 features created from 5 variables). 15 | 16 | The program selects the best feature from each group of 17 | similar non-dimensional numbers, to create an improved 18 | dataset. This is then used to find numerical and analytical relations. 19 | 20 | Success of both numerical and analytical parts of the program proves 21 | that: 22 | 1) The program is capable of discovering the governing non-dimensional 23 | numbers that explain the physical phenomena, with no prior physical 24 | knowledge given by the user. 25 | 2) The program is able to learn the physical relation between 26 | non-dimensional features and the target 27 | 3) The program was able to learn the gravitational acceleration constant. 
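    (The expected non-dimensional relation is
    Cd = (4/3) * (delta_rho/rhoa) * (g*d/V**2), i.e. the coefficient
    1.33 ~= 4/3, with the density ratio and the Froude-like group g*d/V**2
    acting as the governing non-dimensional numbers.)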
28 | """ 29 | 30 | def __init__(self): 31 | ExpSFF.__init__(self) 32 | 33 | @staticmethod 34 | def perform(numerical_bool: bool, 35 | analytical_bool: bool, 36 | force_ebs_bool: bool): 37 | """ 38 | Entry point 39 | """ 40 | ExpSFF.run(numerical_bool=numerical_bool, 41 | analytical_bool=analytical_bool, 42 | force_ebs_bool=force_ebs_bool, 43 | results_folder=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_3_RESULTS_FOLDER_NAME), 44 | data_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), SFF_3_DATA_FOLDER_NAME), 45 | data_generation_function=SFFWDdataGenerator.generate_case_3, 46 | numerical_run_times=SFF_NUMERICAL_RUN_TIMES, 47 | numerical_generations=SFF_NUMERICAL_GENERATION_COUNT, 48 | numerical_population=SFF_NUMERICAL_POP_SIZE, 49 | analytical_run_times=SFF_ANALYTICAL_RUN_TIMES, 50 | analytical_generations=SFF_ANALYTICAL_GENERATION_COUNT, 51 | analytical_population=SFF_NUMERICAL_POP_SIZE, 52 | parsimony_coefficient=SFF_ANALYTICAL_PARSIMONY_COEFFICIENT, 53 | k_fold=K_FOLD, 54 | samples=SFF_NUMERICAL_NUM_SAMPLES, 55 | rhoa_range=SFF_RHOA_RANGE, 56 | rhop_range=SFF_RHOP_RANGE, 57 | nu_range=SFF_NU_RANGE, 58 | re_range=SFF_RE_RANGE, 59 | expected_eq="1) 1.33 * (delta_rho/rhoa) * (g*d/V**2)", 60 | feature_selection_generations=FEATURE_SELECTION_GENERATIONS_COUNT, 61 | feature_selection_pop_size=FEATURE_SELECTION_POP_SIZE, 62 | feature_selection_mutation_rate=FEATURE_SELECTION_MUTATION_RATE, 63 | feature_selection_royalty=FEATURE_SELECTION_ROYALTY, 64 | ebs_size_range=SFF_DIMENSIONAL_EBS_SIZE_RANGE_3 65 | ) 66 | -------------------------------------------------------------------------------- /algo/multi_tpot_analysis.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import os 3 | import pickle 4 | import pandas as pd 5 | from scipy import stats 6 | from tpot import TPOTRegressor 7 | from sklearn.model_selection import KFold 8 | from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error 9 | 10 | # project imports 11 | from utills.logger_config import Logger 12 | from utills.tpot_results_extractor import TPOTresultsExtractor 13 | 14 | 15 | class MultiTPOTrunner: 16 | """ 17 | This class is responsible for generating a numerical 18 | prediction of a target value from a given set of features, 19 | using a TPOTRegressor. 20 | 21 | Input dataset is used to train and test the model 22 | multiple times (named as run_times), to gain statistical 23 | insight on the performance. 24 | """ 25 | 26 | def __init__(self): 27 | pass 28 | 29 | @staticmethod 30 | def run_and_analyze(run_times: int, 31 | train_data_x: pd.DataFrame, 32 | train_data_y: pd.DataFrame, 33 | test_data_x: pd.DataFrame, 34 | test_data_y: pd.DataFrame, 35 | generations: int, 36 | population_size: int, 37 | k_fold: int, 38 | performance_metric, 39 | save_dir: str, 40 | n_jobs: int = -1): 41 | """ 42 | Run the TPOTRegressor algorithm with some hyper-parameters 43 | for multiple times and analyze the stability of the results. 44 | Returns a pandas dataframe of all results and the best model from 45 | all runs. 
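    The "best" model is the one with the lowest performance_metric value on
    the held-out test split, i.e. the metric is treated as an error score
    (lower is better).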
46 | """ 47 | # const 48 | tpot_model_file_path = os.path.join(save_dir, 'current_tpot_pipeline.py') 49 | tpot_object_file_path = os.path.join(save_dir, 'current_tpot_pipeline_as_object') 50 | # prepare DF for results 51 | best_model = None 52 | results = pd.DataFrame() 53 | current_best_performance_score = 99999 54 | for test in range(run_times): 55 | Logger.print("TPOT run {}/{}".format(test + 1, run_times)) 56 | model = TPOTRegressor(generations=generations, 57 | population_size=population_size, 58 | cv=KFold(n_splits=k_fold), 59 | scoring=performance_metric, 60 | verbosity=2, 61 | n_jobs=n_jobs) 62 | model.fit(train_data_x, train_data_y) 63 | pred = model.predict(test_data_x) 64 | # store test scores 65 | try: 66 | # we assume this is just a function 67 | performance_score = performance_metric(test_data_y, pred) 68 | except: 69 | # maybe it is a scorer wrapper of a function and we want to overcome it 70 | performance_score = performance_metric._score_func(test_data_y, pred) 71 | results.at[test, "performance_score"] = performance_score 72 | results.at[test, "mae"] = mean_absolute_error(test_data_y, pred) 73 | results.at[test, "mse"] = mean_squared_error(test_data_y, pred) 74 | results.at[test, "r2"] = r2_score(test_data_y, pred) 75 | results.at[test, "t_test_p_value"] = stats.ttest_ind(test_data_y, pred)[1] 76 | # store exported pipeline 77 | model.export(tpot_model_file_path) 78 | pipeline = TPOTresultsExtractor.process_file(tpot_model_file_path) 79 | results.at[test, 'pipeline'] = pipeline 80 | # update best mae score and model 81 | if performance_score < current_best_performance_score: 82 | best_model = model 83 | current_best_performance_score = performance_score 84 | # remove unnecessary file 85 | os.remove(tpot_model_file_path) 86 | # Logger.print and save scoring results of all runs 87 | Logger.print("\nFinished all MultiTPOT runner runs") 88 | [Logger.print("{}: {:.3}+-{:.3}".format(score, results[score].mean(), results[score].std())) for score in results.keys() if score != "pipeline"] 89 | return results, best_model 90 | 91 | def __repr__(self): 92 | return self.__str__() 93 | 94 | def __str__(self): 95 | return "" 96 | -------------------------------------------------------------------------------- /algo/ebs/eq.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # project imports 6 | from algo.ebs.eq_node import EqNode 7 | 8 | 9 | class Eq: 10 | """ 11 | This class represents an equation that constructed from simple functions 12 | """ 13 | 14 | def __init__(self, 15 | tree: EqNode): 16 | self.tree = tree 17 | self.linear_a = 1 18 | self.linear_b = 0 19 | 20 | def eval(self, 21 | x_values: pd.DataFrame) -> pd.Series: 22 | """ 23 | Eval the y_pred from the input 24 | :param x_values: the input 25 | """ 26 | return self.linear_a * self.tree.eval(x_values=x_values) + self.linear_b 27 | 28 | def predict(self, 29 | x_values: pd.DataFrame) -> pd.Series: 30 | """ 31 | Eval the y_pred from the input 32 | :param x_values: the input 33 | """ 34 | return self.linear_a * self.tree.eval(x_values=x_values) + self.linear_b 35 | 36 | def fix_nodes(self) -> None: 37 | """ fix nodes' is_leaf flag if has been corrupted by other process """ 38 | return self.tree.fix_node() 39 | 40 | def to_id_str(self) -> str: 41 | """ print equation in a narrow way for hash mapping """ 42 | return self.tree.to_id_str() 43 | 44 | def to_string(self) -> str: 45 | """ print the node as a string """ 46 | if 
self.linear_a == 1: 47 | if self.linear_b > 0: 48 | return "add({}, {:.3f})".format(self.tree.to_string(), self.linear_b) 49 | elif self.linear_b < 0: 50 | return "sub({}, {:.3f})".format(self.tree.to_string(), -1*self.linear_b) 51 | else: 52 | return "{}".format(self.tree.to_string()) 53 | else: 54 | if self.linear_b > 0: 55 | return "add(mul({:.3f}, {}), {:.3f})".format(self.linear_a, self.tree.to_string(), self.linear_b) 56 | elif self.linear_b < 0: 57 | return "sub(mul({:.3f}, {}), {:.3f})".format(self.linear_a, self.tree.to_string(), -1*self.linear_b) 58 | else: 59 | return "mul({:.3f}, {})".format(self.linear_a, self.tree.to_string()) 60 | 61 | def size(self) -> int: 62 | """ calc the size of the equation """ 63 | return self.tree.size() 64 | 65 | def populate(self, 66 | not_leaf_values: list, 67 | leaf_values: list) -> list: 68 | """ provide a list with all possible combinations """ 69 | # set index to all and get which one is leaf and not leaf 70 | leaf_dict = {} 71 | self.tree.set_index(leaf_dict) 72 | possible_allocations_list = [leaf_values if is_leaf else not_leaf_values 73 | for index, is_leaf in leaf_dict.items()] 74 | possible_allocations_index_list = [len(val) for val in possible_allocations_list] 75 | combinations_count = np.prod(possible_allocations_index_list) 76 | # run on all possible permutations 77 | answer = [] 78 | for index in range(combinations_count): 79 | allocation_option = [0 for _ in range(len(possible_allocations_index_list))] 80 | current_index = index 81 | set_index = 0 82 | while current_index != 0: 83 | this_val = current_index % possible_allocations_index_list[set_index] 84 | allocation_option[set_index] = this_val 85 | current_index = current_index // possible_allocations_index_list[set_index] 86 | set_index += 1 87 | answer.append(Eq(tree=self.tree._copy_and_put_values(allocation={index: possible_allocations_list[index][val] for index, val in enumerate(allocation_option)}))) 88 | return answer 89 | 90 | @staticmethod 91 | def all_possible_fbt(n: int) -> list: 92 | """ Return all full binary trees of inputted size 'n' """ 93 | return [Eq(tree=eq) for eq in EqNode.all_possible_fbt(n=n)] 94 | 95 | def __repr__(self): 96 | return self.__str__() 97 | 98 | def __str__(self): 99 | return "".format(self.linear_a, 100 | self.tree.__str__(), 101 | self.linear_b) 102 | -------------------------------------------------------------------------------- /data_generators/drag_force_data_generator.py: -------------------------------------------------------------------------------- 1 | # project imports 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # library imports 6 | 7 | 8 | class DragForceDataGenerator: 9 | """ 10 | This class generates measurements of aerodynamic drag force Fd, 11 | exerted on a sphere. 12 | """ 13 | 14 | # CONSTS # 15 | 16 | # END - CONSTS # 17 | 18 | def __init__(self): 19 | pass 20 | 21 | # Logic - start # 22 | 23 | @staticmethod 24 | def generate(samples: int, 25 | cd_range: tuple, 26 | rhoa_range: tuple, 27 | v_range: tuple, 28 | d_range: tuple, 29 | noise_range: tuple, 30 | save_path: str, 31 | rhop_range: tuple=(0.15,0.4), 32 | L_range: tuple=(0.15,0.4), 33 | p1_range: tuple=(15,35), 34 | p2_range: tuple=(40,60), 35 | p3_range: tuple=(1e-4,2e-4), 36 | p4_range: tuple=(200,300), 37 | p5_range: tuple=(1,3), 38 | p6_range: tuple=(1000,2500)): 39 | """ 40 | Generate a pandas dataframe of experiments to represent the measurments. 
41 |         We assume we sample 4 parameters:
42 |         cd: drag coefficient on a sphere [-]
43 |         rhoa: air density [kg/m3]
44 |         v: momentary velocity of the sphere [m/s]
45 |         d: diameter of the sphere [m]
46 |         and calculate with them:
47 |         fd: drag exerted on the sphere [kg*m/s2]
48 |         via fd = pi*cd*rho*(v**2)*(d**2)/8.
49 |         Measurements include an additional 8 parameters
50 |         that do not take part in the fd calculation.
51 |         """
52 |         cd_range_delta = cd_range[1] - cd_range[0]
53 |         rhoa_range_delta = rhoa_range[1] - rhoa_range[0]
54 |         v_range_delta = v_range[1] - v_range[0]
55 |         d_range_delta = d_range[1] - d_range[0]
56 |         rhop_range_delta = rhop_range[1] - rhop_range[0]
57 |         L_range_delta = L_range[1] - L_range[0]
58 |         p1_range_delta = p1_range[1] - p1_range[0]
59 |         p2_range_delta = p2_range[1] - p2_range[0]
60 |         p3_range_delta = p3_range[1] - p3_range[0]
61 |         p4_range_delta = p4_range[1] - p4_range[0]
62 |         p5_range_delta = p5_range[1] - p5_range[0]
63 |         p6_range_delta = p6_range[1] - p6_range[0]
64 |         noise_range_delta = noise_range[1] - noise_range[0]
65 |         data = []
66 |         # generate samples
67 |         for i in range(samples):
68 |             cd = round(np.random.random_sample() * cd_range_delta + cd_range[0], 2)
69 |             rho = round(np.random.random_sample() * rhoa_range_delta + rhoa_range[0], 2)
70 |             v = round(np.random.random_sample() * v_range_delta + v_range[0], 2)
71 |             d = round(np.random.random_sample() * d_range_delta + d_range[0], 2)
72 |             rhop = round(np.random.random_sample() * rhop_range_delta + rhop_range[0], 2)
73 |             L = round(np.random.random_sample() * L_range_delta + L_range[0], 2)
74 |             p1 = round(np.random.random_sample() * p1_range_delta + p1_range[0], 2)
75 |             p2 = round(np.random.random_sample() * p2_range_delta + p2_range[0], 2)
76 |             p3 = round(np.random.random_sample() * p3_range_delta + p3_range[0], 2)
77 |             p4 = round(np.random.random_sample() * p4_range_delta + p4_range[0], 2)
78 |             p5 = round(np.random.random_sample() * p5_range_delta + p5_range[0], 2)
79 |             p6 = round(np.random.random_sample() * p6_range_delta + p6_range[0], 2)
80 |             noise = round(np.random.random_sample() * noise_range_delta + noise_range[0], 2) * np.random.choice((-1, 1))
81 |             fd_sampled = np.pi * cd * rho * (v**2) * (d**2) / 8
82 |             fd_sampled = round(fd_sampled * (1 + noise), 2)
83 |             data.append([cd, rho, rhop, v, d, L, p1, p2, p3, p4, p5, p6, fd_sampled])
84 |         # make a Pandas.DataFrame and save it as a CSV file
85 |         pd.DataFrame(data=data, columns=["Cd", "rhoa", "rhop", "v", "d", "L", "p1", "p2",
86 |                                          "p3", "p4", "p5", "p6", "Fd"]).to_csv(save_path, index=False)
87 |         # return indices of feature groups: rhoa-rhop and d-L form a group.
The rest do not have a selection option 88 | return [[0,0],[1,2],[2,2],[3,4],[5,5],[6,6],[7,7],[8,8],[9,9],[10,10],[11,11]] 89 | 90 | # Logic - end # 91 | -------------------------------------------------------------------------------- /utills/symbolic_regression_to_latex_text.py: -------------------------------------------------------------------------------- 1 | # library import 2 | from sympy import * 3 | import re as regular_exp 4 | 5 | 6 | class SymbolicRegressionToLatexText: 7 | """ 8 | This class is responsible to convert the standard symbolic regression's result style to latex style 9 | """ 10 | 11 | # CONSTS # 12 | SR_FUNCS_NAMES = ["add", "sub", "mul", "div"] 13 | 14 | # END - CONSTS # 15 | 16 | def __init__(self): 17 | pass 18 | 19 | @staticmethod 20 | def run(eq: str): 21 | """ 22 | Single entry point - run the convertor from EQ of the symbolic regression class to LATEX style 23 | :param eq: the EQ to convert 24 | :return: the same EQ in LATEX format 25 | """ 26 | # replace the text to use the static methods of this class 27 | for func_name in SymbolicRegressionToLatexText.SR_FUNCS_NAMES: 28 | eq = eq.replace(func_name, 29 | "SymbolicRegressionToLatexText._{}".format(func_name)) 30 | eq = eq.replace("^", "power") 31 | # collect possible var names 32 | eq_vars = regular_exp.findall(r'(\w*),', 33 | eq) 34 | eq_vars.extend(regular_exp.findall(r', (\w*)\)', 35 | eq)) 36 | # filter just vars names 37 | eq_vars = list(set([eq_var.strip() for eq_var in eq_vars if len(eq_var) > 0 and not eq_var.isnumeric() and eq_var not in SymbolicRegressionToLatexText.SR_FUNCS_NAMES])) 38 | # name them as strings 39 | eq_vars = sorted(eq_vars, 40 | key=lambda x: len(x), 41 | reverse=True) 42 | for eq_var in eq_vars: 43 | eq = eq.replace("{},".format(eq_var), '"{}",'.format(eq_var)) 44 | eq = eq.replace(", {}".format(eq_var), ', "{}"'.format(eq_var)) 45 | # run the code 46 | ex_locals = {} 47 | exec("answer = {}".format(eq), None, ex_locals) 48 | answer_eq = ex_locals["answer"] 49 | # small fixes to style 50 | try: 51 | answer_eq = SymbolicRegressionToLatexText._post_fixes(answer_eq=answer_eq) 52 | except: 53 | pass 54 | return answer_eq 55 | 56 | @staticmethod 57 | def _add(x: str, 58 | y: str): 59 | x = str(x) 60 | y = str(y) 61 | x_number = x.isnumeric() or (x[1:].isnumeric() and x[0] == "-") 62 | y_number = y.isnumeric() or (y[1:].isnumeric() and y[0] == "-") 63 | if x_number and y_number: 64 | return "{}".format(float(x) + float(y)) 65 | return "({} + {})".format(x, y) 66 | 67 | @staticmethod 68 | def _sub(x: str, 69 | y: str): 70 | x = str(x) 71 | y = str(y) 72 | x_number = x.isnumeric() or (x[1:].isnumeric() and x[0] == "-") 73 | y_number = y.isnumeric() or (y[1:].isnumeric() and y[0] == "-") 74 | if x_number and y_number: 75 | return "{}".format(float(x) - float(y)) 76 | return "({} - {})".format(x, y) 77 | 78 | @staticmethod 79 | def _mul(x: str, 80 | y: str): 81 | x = str(x) 82 | y = str(y) 83 | x_number = x.isnumeric() or (x[1:].isnumeric() and x[0] == "-") 84 | y_number = y.isnumeric() or (y[1:].isnumeric() and y[0] == "-") 85 | if x_number and y_number: 86 | return "{}".format(float(x) * float(y)) 87 | elif x_number and not y_number: 88 | return "{}{}".format(x, y) 89 | elif not x_number and y_number: 90 | return "{}{}".format(y, x) 91 | else: 92 | return "{} \\cdot {}".format(x, y) 93 | 94 | @staticmethod 95 | def _div(x: str, 96 | y: str): 97 | x = str(x) 98 | y = str(y) 99 | x_number = x.isnumeric() or (x[1:].isnumeric() and x[0] == "-") 100 | y_number = y.isnumeric() or 
(y[1:].isnumeric() and y[0] == "-") 101 | if x_number and y_number: 102 | return "{}".format(float(x) / float(y)) 103 | else: 104 | return "\\frac{" + str(x) + "}{" + str(y) + "}" 105 | 106 | @staticmethod 107 | def _post_fixes(answer_eq: str): 108 | change_symbol = True 109 | while change_symbol: 110 | answer_eq_before = answer_eq 111 | answer_eq = answer_eq.replace("--", "+") 112 | answer_eq = answer_eq.replace(" - -", " + ") 113 | answer_eq = answer_eq.replace("-+", "-") 114 | answer_eq = answer_eq.replace("+-", "-") 115 | answer_eq = answer_eq.replace("++", "+") 116 | answer_eq = answer_eq.replace(" + +", "+") 117 | answer_eq = answer_eq.replace(" - +", " - ") 118 | answer_eq = answer_eq.replace(" + -", " - ") 119 | answer_eq = answer_eq.replace("power", "^") 120 | change_symbol = answer_eq_before != answer_eq 121 | # try to simplify results 122 | try: 123 | answer_eq = simplify(answer_eq) 124 | except Exception as error: 125 | pass 126 | # return answer 127 | return answer_eq 128 | -------------------------------------------------------------------------------- /paper_exp_runner.py: -------------------------------------------------------------------------------- 1 | # library imports 2 | import os 3 | 4 | # project imports 5 | from experiments.exp_steady_free_fall_with_drag_case_1 import ExpSFF1 6 | from experiments.exp_steady_free_fall_with_drag_case_2 import ExpSFF2 7 | from experiments.exp_steady_free_fall_with_drag_case_3 import ExpSFF3 8 | from experiments.exp_constant_acceleration import ExpConstantAcceleration 9 | from experiments.exp_steady_free_fall_with_drag_case_2_with_educated_guess import ExpSFF2WithGuess 10 | 11 | 12 | class PaperExpRunner: 13 | """ 14 | Single entry point for the project. 15 | This file runs all the experiments in the project and save the raw results 16 | for the manuscript 17 | """ 18 | 19 | # CONSTS # 20 | RESULTS_FOLDER_NAME = "results" 21 | 22 | # END - CONSTS # 23 | 24 | def __init__(self): 25 | pass 26 | 27 | @staticmethod 28 | def run(const_acc_numerical_bool: bool = True, 29 | const_acc_analytical_bool: bool = True, 30 | const_acc_force_ebs_bool: bool = True, 31 | sff1_numerical_bool: bool = True, 32 | sff1_analytical_bool: bool = True, 33 | sff1_force_ebs_bool: bool = True, 34 | sff2_numerical_bool: bool = True, 35 | sff2_analytical_bool: bool = True, 36 | sff2_force_ebs_bool: bool = True, 37 | sff2_with_guess_numerical_bool: bool = True, 38 | sff2_with_guess_analytical_bool: bool = True, 39 | sff2_with_guess_force_ebs_bool: bool = True, 40 | sff3_numerical_bool: bool = True, 41 | sff3_analytical_bool: bool = True, 42 | sff3_force_ebs_bool: bool = True, 43 | drag_force_numerical_bool: bool = True, 44 | drag_force_analytical_bool: bool = True, 45 | drag_force_force_ebs_bool: bool = True): 46 | """ 47 | Single method to use in the class. 
48 |         Run the experiments, if requested
49 |         """
50 |         # prepare IO
51 |         os.makedirs(os.path.join(os.path.dirname(__file__), PaperExpRunner.RESULTS_FOLDER_NAME),
52 |                     exist_ok=True)
53 |         # run all the experiments
54 |         if const_acc_numerical_bool or const_acc_analytical_bool or const_acc_force_ebs_bool:
55 |             ExpConstantAcceleration.run(numerical_bool=const_acc_numerical_bool,
56 |                                         analytical_bool=const_acc_analytical_bool,
57 |                                         force_ebs_bool=const_acc_force_ebs_bool)
58 |
59 |         if sff1_numerical_bool or sff1_analytical_bool or sff1_force_ebs_bool:
60 |             ExpSFF1.perform(numerical_bool=sff1_numerical_bool,
61 |                             analytical_bool=sff1_analytical_bool,
62 |                             force_ebs_bool=sff1_force_ebs_bool)
63 |
64 |         if sff2_numerical_bool or sff2_analytical_bool or sff2_force_ebs_bool:
65 |             ExpSFF2.perform(numerical_bool=sff2_numerical_bool,
66 |                             analytical_bool=sff2_analytical_bool,
67 |                             force_ebs_bool=sff2_force_ebs_bool)
68 |
69 |         if sff2_with_guess_numerical_bool or sff2_with_guess_analytical_bool or sff2_with_guess_force_ebs_bool:
70 |             ExpSFF2WithGuess.perform(numerical_bool=sff2_with_guess_numerical_bool,
71 |                                      analytical_bool=sff2_with_guess_analytical_bool,
72 |                                      force_ebs_bool=sff2_with_guess_force_ebs_bool)
73 |
74 |         if sff3_numerical_bool or sff3_analytical_bool or sff3_force_ebs_bool:
75 |             ExpSFF3.perform(numerical_bool=sff3_numerical_bool,
76 |                             analytical_bool=sff3_analytical_bool,
77 |                             force_ebs_bool=sff3_force_ebs_bool)
78 |
79 |         if drag_force_numerical_bool or drag_force_analytical_bool or drag_force_force_ebs_bool:
80 |             ExpDragForce.perform(numerical_bool=drag_force_numerical_bool,  # was ExpSFF3 again, a copy-paste slip
81 |                                  analytical_bool=drag_force_analytical_bool,
82 |                                  force_ebs_bool=drag_force_force_ebs_bool)
83 |
84 |
85 | if __name__ == '__main__':
86 |     PaperExpRunner.run(const_acc_numerical_bool=False,
87 |                        const_acc_analytical_bool=False,
88 |                        const_acc_force_ebs_bool=False,
89 |
90 |                        sff1_numerical_bool=False,
91 |                        sff1_analytical_bool=False,
92 |                        sff1_force_ebs_bool=False,
93 |
94 |                        sff2_numerical_bool=False,
95 |                        sff2_analytical_bool=False,
96 |                        sff2_force_ebs_bool=False,
97 |
98 |                        sff2_with_guess_numerical_bool=False,
99 |                        sff2_with_guess_analytical_bool=False,
100 |                        sff2_with_guess_force_ebs_bool=False,
101 |
102 |                        sff3_numerical_bool=False,
103 |                        sff3_analytical_bool=False,
104 |                        sff3_force_ebs_bool=False,
105 |
106 |                        drag_force_numerical_bool=True,
107 |                        drag_force_analytical_bool=True,
108 |                        drag_force_force_ebs_bool=True)
109 |
--------------------------------------------------------------------------------
/utills/consts.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 |
4 | # consts #
5 |
6 | # 1) General for all experiments:
7 | DATA_FOLDER = "data"
8 | RESULTS_FOLDER = "results"
9 | g_force = 9.81
10 | REL_ERR_OF_STD = 0.05
11 | DEFAULT_FIG_SIZE = 8
12 | DEFAULT_DPI = 600
13 | FEATURE_IMPORTANCE_SIMULATION_COUNT = 5  # 100
14 | JSON_INDENT = 2
15 | K_FOLD = 5
16 | RANDOM_STATE = 73
17 | SYMBOLIC_PERCENT_OF_MAJORITY = 0.6
18 | SYMBOLIC_P_VALUE_THRESHOLD = 0.8
19 | SYMBOLIC_EQ_RANKING_METRIC = "r2"
20 | SYMBOLIC_TOP_EQS_MAX_NUM = 5
21 |
22 |
23 | # 2) Constant acceleration exp:
24 | # - data generation:
25 | CONST_ACCELERATION_NUM_SAMPLES = 400
26 | CONST_ACCELERATION_TEST_SIZE_PORTION = 0.75
27 | # - experiment run:
28 | CONST_ACCELERATION_NUMERICAL_RUN_TIMES = 20
29 | CONST_ACCELERATION_NUMERICAL_GENERATION_COUNT = 5
30 | CONST_ACCELERATION_NUMERICAL_POP_SIZE = 30
31 | CONST_ACCELERATION_ANALYTICAL_RUN_TIMES = 20
32 | CONST_ACCELERATION_ANALYTICAL_GENERATION_COUNT = 5
33 | CONST_ACCELERATION_ANALYTICAL_POP_SIZE = 50
34 | CONST_ACCELERATION_NOISE_RANGE = (0, 0.02)
35 | CONST_ACCELERATION_ANALYTICAL_PARSIMONY_COEFFICIENT = 0.02
36 | CONST_ACCELERATION_EBS_SIZE_RANGE = (5,)
37 | # - result path
38 | CONST_ACCELERATION_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
39 |                                                       "constant_acceleration_results",
40 |                                                       "{}_samples".format(CONST_ACCELERATION_NUM_SAMPLES))
41 |
42 |
43 | # 3) Steady free fall with drag exp:
44 | # - data generation:
45 | N_FREQ_SUFFIX = ['_11', '_12', '_13', '_21', '_23', '_31', '_32']
46 | STEADY_FALL_MINIZE_TOL = 1e-25
47 | SFF_RHOA_RANGE = (998., 1300.)  # fresh water -> salt water at 20C [kg/m3]
48 | SFF_RHOP_RANGE = (0, 5000)
49 | SFF_NU_RANGE = (1e-6, 1.4e-6)  # viscosity corresponding to rhoa [m2/s]
50 | SFF_RE_RANGE = (1., 100.)  # Reynolds range where Cd changes significantly
51 | SFF_CASE_2_NOISE_RANGE = (0, 0.02)
52 | SFF_TEST_SIZE_PORTION = 0.2
53 | SFF_DIMENSIONAL_EBS_SIZE_RANGE_1_2 = (11,)
54 | SFF_DIMENSIONAL_EBS_SIZE_RANGE_2_easy = (7,)
55 | SFF_DIMENSIONAL_EBS_SIZE_RANGE_3 = (3,)
56 | SFF_1_DROP_PARAM = "V"
57 | FORCE_DATA_OVERRIDE_FLAG = False
58 | # - experiment run:
59 | SFF_NUMERICAL_NUM_SAMPLES = 10**4
60 | SFF_NUMERICAL_RUN_TIMES = 20
61 | SFF_NUMERICAL_GENERATION_COUNT = 3
62 | SFF_NUMERICAL_POP_SIZE = 25
63 | SFF_ANALYTICAL_RUN_TIMES = 20
64 | SFF_ANALYTICAL_GENERATION_COUNT = 10
65 | SFF_ANALYTICAL_POP_SIZE = 2000
66 | SFF_ANALYTICAL_PARSIMONY_COEFFICIENT = 0.025
67 | # - feature selection:
68 | FEATURE_SELECTION_GENERATIONS_COUNT = 2
69 | FEATURE_SELECTION_POP_SIZE = 8
70 | FEATURE_SELECTION_MUTATION_RATE = 0.1
71 | FEATURE_SELECTION_ROYALTY = 0.05
72 | # - data and result paths
73 | SFF_N_SAMPLES_STR = str(round(SFF_NUMERICAL_NUM_SAMPLES/1000)) + "k"
74 | SFF_1_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
75 |                                          "steady_fall_case_1_results",
76 |                                          "{}_samples_without_{}".format(SFF_N_SAMPLES_STR, SFF_1_DROP_PARAM))
77 | SFF_1_DATA_FOLDER_NAME = os.path.join(DATA_FOLDER,
78 |                                       "case_1_steady_fall_with_drag_data_" +
79 |                                       "{}_samples_no_{}.csv".format(SFF_N_SAMPLES_STR, SFF_1_DROP_PARAM))
80 |
81 | SFF_2_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
82 |                                          "steady_fall_case_2_results_{}_samples".format(SFF_N_SAMPLES_STR))
83 | SFF_2_DATA_FOLDER_NAME = os.path.join(DATA_FOLDER,
84 |                                       "case_2_steady_fall_with_drag_data_{}_samples.csv".format(SFF_N_SAMPLES_STR))
85 |
86 | SFF_2_WITH_GUESS_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
87 |                                                     "steady_fall_case_2_with_guess_results_{}_samples".format(SFF_N_SAMPLES_STR))
88 | SFF_2_WITH_GUESS_DATA_FOLDER_NAME = os.path.join(DATA_FOLDER,
89 |                                                  "case_2_with_guess_steady_fall_with_drag_data_{}_samples.csv".format(SFF_N_SAMPLES_STR))
90 |
91 | SFF_3_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
92 |                                          "steady_fall_case_3_results_{}_samples".format(SFF_N_SAMPLES_STR))
93 | SFF_3_DATA_FOLDER_NAME = os.path.join(DATA_FOLDER,
94 |                                       "case_3_steady_fall_with_drag_data_{}_samples.csv".format(SFF_N_SAMPLES_STR))
95 |
96 |
97 | # 4) Drag force exp:
98 | # - data generation:
99 | DRAG_FORCE_NUM_SAMPLES = 10000
100 | DRAG_FORCE_TEST_SIZE_PORTION = 0.75
101 | # - experiment run:
102 | DRAG_FORCE_NUMERICAL_RUN_TIMES = 20
103 | DRAG_FORCE_NUMERICAL_GENERATION_COUNT = 5
104 | DRAG_FORCE_NUMERICAL_POP_SIZE = 30
105 | DRAG_FORCE_FEATURE_GENERATIONS_COUNT = 5
106 | DRAG_FORCE_FEATURE_POP_SIZE = 30
107 | DRAG_FORCE_MUTATION_RATE = 0.1
108 | DRAG_FORCE_ROYALTY = 0.05
109 | DRAG_FORCE_ANALYTICAL_RUN_TIMES = 20
110 | DRAG_FORCE_ANALYTICAL_GENERATION_COUNT = 5
111 | DRAG_FORCE_ANALYTICAL_POP_SIZE = 50
112 | DRAG_FORCE_NOISE_RANGE = (0, 0.02)
113 | DRAG_FORCE_ANALYTICAL_PARSIMONY_COEFFICIENT = 0.02
114 | DRAG_FORCE_EBS_SIZE_RANGE = (13,)
115 | # - result path
116 | DRAG_FORCE_RESULTS_FOLDER_NAME = os.path.join(RESULTS_FOLDER,
117 |                                               "drag_force_results",
118 |                                               "{}_samples".format(DRAG_FORCE_NUM_SAMPLES))
119 |
120 | # end - consts #
121 |
--------------------------------------------------------------------------------
/algo/genetic_algorithm_feature_selection.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import pandas as pd
3 | import os
4 |
5 | # project imports
6 | from algo.operators.fitness import Fitness
7 | from algo.population import Population
8 | from algo.operators.mutation import Mutation
9 | from utills.logger_config import Logger
10 | from algo.operators.crossover import Crossover
11 | from algo.operators.next_generation import NextGeneration
12 |
13 |
14 | class GAFS:
15 |     """
16 |     A classical genetic algorithm for grouped feature selection, wrapped around
17 |     an AutoML (MultiTPOTrunner-driven) pipeline search
18 |     """
19 |
20 |     def __init__(self):
21 |         pass
22 |
23 |     @staticmethod
24 |     def run(tpot_run_times: int,
25 |             feature_generations: int,
26 |             tpot_regressor_generations: int,
27 |             feature_population_size: int,
28 |             tpot_regressor_population_size: int,
29 |             mutation_rate: float,
30 |             feature_indexes_ranges: list,
31 |             mutation_w: list,
32 |             royalty: float,
33 |             k_fold: int,
34 |             performance_metric: str,
35 |             train_data_x: pd.DataFrame,
36 |             train_data_y: pd.DataFrame,
37 |             test_data_x: pd.DataFrame,
38 |             test_data_y: pd.DataFrame,
39 |             save_dir: str,
40 |             cores: int = -1):
41 |         """
42 |         Run the GAFS algorithm with some hyper-parameters
43 |         """
44 |         assert len(mutation_w) == len(feature_indexes_ranges)
45 |         assert feature_generations > 0
46 |         assert tpot_regressor_generations > 0
47 |         assert feature_population_size > 0
48 |         assert (feature_population_size % 2) == 0
49 |         assert tpot_regressor_population_size > 0
50 |         assert k_fold > 0
51 |         assert 0 < royalty < 1
52 |         assert train_data_x.shape[0] == train_data_y.shape[0]
53 |         assert test_data_x.shape[0] == test_data_y.shape[0]
54 |         assert train_data_x.shape[1] == test_data_x.shape[1]
55 |
56 |         # generate population of genes dictating how to trim data
57 |         pop = Population.random(size=feature_population_size,
58 |                                 feature_count=len(feature_indexes_ranges),
59 |                                 feature_indexes_ranges=feature_indexes_ranges)
60 |         # create a dict to store selected features through generations
61 |         selected_fs = {"feature_indices": [],
62 |                        "feature_names": []}
63 |         for generation in range(feature_generations):
64 |             # manipulate gene population
65 |             pop = Mutation.simple(population=pop,
66 |                                   feature_indexes_ranges=feature_indexes_ranges,
67 |                                   mutation_rate=mutation_rate,
68 |                                   w=mutation_w)
69 |             pop = Crossover.simple(population=pop)
70 |             # assign fitness score and best ML pipeline to each gene in pop
71 |             Logger.print("\nGeneration #{}/{} | Assign Fitness and Pipeline to each gene:".format(generation + 1,
72 |                                                                                                   feature_generations))
73 |             pop = Fitness.tpot(run_times=tpot_run_times,
74 |                                train_data_x=train_data_x,
75 |                                train_data_y=train_data_y,
76 |                                test_data_x=test_data_x,
77 |                                test_data_y=test_data_y,
78 |                                generations=tpot_regressor_generations,
79 |                                population=pop,
80 |                                population_size=tpot_regressor_population_size,
81 |                                k_fold=k_fold,
82 |                                performance_metric=performance_metric,
83 |                                n_jobs=cores,
84 |                                save_dir=save_dir)
85 |             # alert user
86 |             current_best_gene = pop.get_best()
87 |             feature_names = list(test_data_x.columns[current_best_gene.feature_indexes])
88 |             selected_fs["feature_indices"].append(current_best_gene.feature_indexes)
89 |             selected_fs["feature_names"].append(feature_names)
90 |             Logger.print("Generation #{}/{} | Best gene's fitness: {:.3f} selected features: {}".format(generation + 1,
91 |                                                                                                          feature_generations,
92 |                                                                                                          current_best_gene.fitness,
93 |                                                                                                          feature_names))
94 |             # prepare population for next generation
95 |             pop = NextGeneration.tournament_with_royalty(population=pop,
96 |                                                          royalty=royalty)
97 |         # save selected features from all generations
98 |         pd.DataFrame(selected_fs, dtype=object).to_csv(os.path.join(save_dir, "selected_features_history.csv"),
99 |                                                        index=False)
100 |         return pop.get_best()
101 |
102 |     def __repr__(self):
103 |         return self.__str__()
104 |
105 |     def __str__(self):
106 |         return ""
--------------------------------------------------------------------------------
/algo/ebs/eq_node.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import pandas as pd
3 |
4 | # project imports
5 | from algo.ebs.eq_functions import *
6 |
7 |
8 | class EqNode:
9 |     """
10 |     This class represents a single node in an equation tree
11 |     """
12 |
13 |     # CONSTS #
14 |     NO_INDEX = -1
15 |     # END - CONSTS #
16 |
17 |     # CACHE FOR OPTIMIZATION #
18 |     dp = {}
19 |     # END - CACHE FOR OPTIMIZATION #
20 |
21 |     def __init__(self,
22 |                  value=None,
23 |                  is_leaf: bool = True,
24 |                  left_child=None,
25 |                  right_child=None,
26 |                  index: int = NO_INDEX):
27 |         self.value = value
28 |         self.is_leaf = is_leaf
29 |         self.index = index
30 |         if is_leaf:
31 |             self.left_child = None
32 |             self.right_child = None
33 |         elif left_child is not None and right_child is not None:
34 |             self.left_child = left_child
35 |             self.right_child = right_child
36 |         else:
37 |             raise ValueError("If an EqNode is not a leaf node, it must have both left and right children")
38 |
39 |     def eval(self,
40 |              x_values: pd.DataFrame) -> pd.Series:
41 |         """ eval the node """
42 |         if self.is_leaf:
43 |             return x_values[self.value]
44 |         else:
45 |             return self.value(self.left_child.eval(x_values),
46 |                               self.right_child.eval(x_values))
47 |
48 |     def fix_node(self) -> None:
49 |         """ fix the node's is_leaf flag if it has been corrupted by another process """
50 |         if self.is_leaf and self.left_child is not None and self.right_child is not None:
51 |             self.is_leaf = False
52 |         elif not self.is_leaf and self.left_child is None and self.right_child is None:
53 |             self.is_leaf = True
54 |         elif self.is_leaf and (self.left_child is None or self.right_child is None):
55 |             self.left_child = None
56 |             self.right_child = None
57 |
58 |         if not self.is_leaf:
59 |             self.left_child.fix_node()
60 |             self.right_child.fix_node()
61 |
62 |     def to_string(self) -> str:
63 |         """ print the node as a string """
64 |         if self.is_leaf:
65 |             return str(self.value)
66 |         else:
67 |             return "{}({}, {})".format(FUNCTION_MAPPER[self.value],
68 |                                        self.left_child.to_string(),
69 |                                        self.right_child.to_string())
70 |
71 |     def to_id_str(self) -> str:
72 |         """ print equation in a narrow way for hash mapping """
73 |         if self.is_leaf:
74 |             return "L"
75 |         return "{}N{}".format(self.left_child.to_id_str(),
76 |                               self.right_child.to_id_str())
77 |
78 |     def size(self) -> int:
79 |         """ calc the size of the equation """
80 |         if self.is_leaf:
81 |             return 1
82 |         return 1 + self.right_child.size() + self.left_child.size()
83 |
84 |     def set_index(self,
85 |                   leaf_dict: dict,
86 |                   index: int = 0) -> int:
87 |         """ add an index to each node and record in leaf_dict whether it is a leaf """
88 |         self.index = index
89 |         leaf_dict[self.index] = self.is_leaf
90 |         if not self.is_leaf:
91 |             index = self.left_child.set_index(leaf_dict=leaf_dict, index=index + 1)
92 |             index = self.right_child.set_index(leaf_dict=leaf_dict, index=index + 1)
93 |         return index
94 |
95 |     def _copy_and_put_values(self,
96 |                              allocation: dict):
97 |         """ copy the current topology and put values in order according to their index """
98 |         if self.is_leaf:
99 |             return EqNode(value=allocation[self.index],
100 |                           index=self.index,
101 |                           left_child=None,  # self.left_child
102 |                           right_child=None,  # self.right_child
103 |                           is_leaf=self.is_leaf)
104 |         return EqNode(value=allocation[self.index],
105 |                       index=self.index,
106 |                       left_child=self.left_child._copy_and_put_values(allocation=allocation),
107 |                       right_child=self.right_child._copy_and_put_values(allocation=allocation),
108 |                       is_leaf=self.is_leaf)
109 |
110 |     @staticmethod
111 |     def all_possible_fbt(n: int) -> list:
112 |         """ Return all full binary trees of inputted size 'n' """
113 |         if n == 0:
114 |             return []
115 |         if n == 1:
116 |             return [EqNode(is_leaf=True)]
117 |         if n in EqNode.dp:
118 |             return EqNode.dp[n]
119 |
120 |         result = []
121 |         for l in range(n):
122 |             r = n - 1 - l
123 |             left_trees = EqNode.all_possible_fbt(n=l)
124 |             right_trees = EqNode.all_possible_fbt(n=r)
125 |             for t1 in left_trees:
126 |                 for t2 in right_trees:
127 |                     result.append(EqNode(is_leaf=False,
128 |                                          left_child=t1,
129 |                                          right_child=t2))
130 |         EqNode.dp[n] = result
131 |         return result
132 |
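    # Illustrative note: a full binary tree, in which every internal node has
    # exactly two children, exists only for an odd node count, so even sizes
    # return an empty list. The number of distinct shapes follows the Catalan
    # numbers, e.g.:
    #
    #   len(EqNode.all_possible_fbt(n=1))  # -> 1
    #   len(EqNode.all_possible_fbt(n=5))  # -> 2
    #   len(EqNode.all_possible_fbt(n=7))  # -> 5
    #
    # EqNode.dp memoizes the result lists, so repeated calls are cheap; note that
    # the cached trees are shared objects, so callers that mutate them should
    # copy first.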
133 |     def __repr__(self):
134 |         return "<EqNode: value={}, is_leaf={}, index={}>".format(self.value,
135 |                                                                  self.is_leaf,
136 |                                                                  self.index)
137 |
138 |     def __str__(self):
139 |         if self.is_leaf:
140 |             return "([#{}]{})".format(self.index,
141 |                                       self.value)
142 |         return "([#{}]{} -> {} & {})".format(self.index,
143 |                                              self.value,
144 |                                              self.left_child.__str__(),
145 |                                              self.right_child.__str__())
146 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SciMED: A Computational Framework For Physics-Informed Symbolic Regression with Scientist-In-The-Loop
2 |
3 | ## Abstract
4 | Discovering a meaningful, dimensionally homogeneous, symbolic expression explaining
5 | experimental data is a fundamental challenge in physics. In this study, we present a
6 | novel, open-source computational framework called Scientist-Machine Equation
7 | Detector (SciMED), which integrates scientific discipline wisdom in a
8 | scientist-in-the-loop approach with state-of-the-art SR methods. SciMED combines a
9 | genetic algorithm-based wrapper selection method with automatic machine learning
10 | and two levels of symbolic regression methods. We test SciMED on four configurations
11 | of the settling of a sphere with and without a non-linear aerodynamic drag force. We
12 | show that SciMED is sufficiently robust to discover the correct physically meaningful
13 | symbolic expressions of each configuration from noisy data. Our results indicate better
14 | performance on these tasks than the state-of-the-art SR software package.
15 |
16 | ## Table of contents
17 | 1. [Code usage](#code_usage)
18 | 2. [The algorithm](#the_algorithm)
19 | 3. [Data](#data_preparation)
20 | 4. [How to cite](#how_to_cite)
21 | 5. [Dependencies](#dependencies)
22 | 6. [Contributing](#contributing)
23 | 7. [Bug Reports](#bug_reports)
24 | 8. [Contact](#contact)
25 |
26 | <a name="code_usage"/>
27 |
28 | ## Code usage
29 | ### Run the experiments shown in the paper:
30 | 1. Clone the repo
31 | 2. Install the `requirements.txt` file.
32 | 3. Run the project from the `paper_exp_runner.py` file, making sure all the arguments are set to **True**.
33 |
34 | ### Use in your project:
35 | 1. Clone the repo
36 | 2. Install the `requirements.txt` file.
37 | 3. Include the following code in the relevant part of your project:
38 | ```
39 | from scimed import scimed
40 | scimed.run(train_data_x: pandas.DataFrame, train_data_y: pandas.Series, test_data_x: pandas.DataFrame, test_data_y: pandas.Series, results_folder: str, ...)
41 | ```
42 | ### Demo:
43 | A demo of how to use SciMED with data from a CSV file (using Pandas) is shown in the "/demo" folder; a minimal sketch of it follows.
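The sketch below is illustrative (it assumes the bundled `demo/demo.csv`, whose columns `v0,a,t,v` follow `v = v0 + a*t`; see `demo/demo.py` for the repository's own version):
```
from sklearn.model_selection import train_test_split
import pandas as pd

from scimed import scimed

# load the demo data and split off the target column (the last one, "v")
df = pd.read_csv("demo/demo.csv")
x, y = df.drop("v", axis=1), df["v"]
# hold out a test set, then hand the four frames to the SciMED pipeline
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.25, random_state=73)
scimed.run(train_data_x=train_x, train_data_y=train_y,
           test_data_x=test_x, test_data_y=test_y,
           results_folder="results/demo")
```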
44 |
45 | <a name="the_algorithm"/>
46 |
47 | ## The algorithm
48 | SciMED is constructed from four components:
49 | 1. **A genetic algorithm-based feature selection:** Reduces the search space by selecting a single, most explainable feature from each group of features that are considered to be the same in physical essence. This division into groups is provided by the user, applying their domain knowledge.
50 | 2. **A genetic algorithm-based automatic machine learning (AutoML):** Trains an ML model to produce synthetic data that facilitates the SR task by enriching the data domain.
51 | 3. **A genetic algorithm-based symbolic regression (SR):** A less resource- and time-consuming but stochastic SR search. May result in a sub-optimal outcome.
52 | 4. **A Las Vegas search SR:** A more computationally expensive SR search that, on average, produces a more stable and accurate outcome.
53 |
54 | Each component allows the user to easily insert physical knowledge or assumptions specific to the current task, directing the search process for a more credible result
55 | with fewer required resources. The motivation for this structure is derived from the way human scientists work, where more promising directions get more attention and resources.
56 |
57 | ![Algo_structure](https://user-images.githubusercontent.com/72650415/230033829-9e283c9c-80ab-43d1-9385-6999074ae836.png)
58 |
59 | <a name="data_preparation"/>
60 |
61 | ## Data preparation
62 | The data file to be analyzed should be a CSV file, with each column containing the numerical values of each variable. If the variables can be grouped into variables of similar essence, from which only one can appear in the mystery equation, then they should appear sequentially and the index ranges of each group should be passed to the function (see the sketch below).
63 |
64 | The solution files will be saved in the directory called "results" under the name of the specific component that generated them. For example, there will be three {component}_target_vs_pred.pdf files demonstrating the prediction capabilities of the specific outcome from each component.
65 |
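A hedged sketch of such grouping (the column names here are hypothetical, and each entry is assumed to be an inclusive `[start, end]` pair of column indices, matching the per-group format used in the code):
```
# columns: rho_a, rho_p, v_mean, v_max, v_rms, t   (hypothetical data)
# -> two density candidates, three velocity candidates, and a stand-alone time
feature_indexes_ranges = [[0, 1], [2, 4], [5, 5]]
scimed.run(..., feature_indexes_ranges=feature_indexes_ranges)
```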
66 | <a name="how_to_cite"/>
67 |
68 | ## How to cite
69 | Please cite the SciMED work if you compare, use, or build on it:
70 | ```
71 | @article{keren2023computational,
72 |   title={A computational framework for physics-informed symbolic regression with straightforward integration of domain knowledge},
73 |   author={Keren, Liron Simon and Liberzon, Alex and Lazebnik, Teddy},
74 |   journal={Scientific Reports},
75 |   volume={13},
76 |   number={1},
77 |   pages={1249},
78 |   year={2023},
79 |   publisher={Nature Publishing Group UK London}
80 | }
81 | ```
82 |
83 | <a name="dependencies"/>
84 |
85 | ## Dependencies
86 | 1. pandas
87 | 2. numpy
88 | 3. matplotlib
89 | 4. seaborn
90 | 5. scikit-learn
91 | 6. scipy
92 | 7. TPOT
93 | 8. gplearn
94 | 9. pytorch
95 | 10. termcolor
96 | 11. sympy
97 |
98 | <a name="contributing"/>
99 |
100 | ## Contributing
101 | We would love you to contribute to this project; pull requests are very welcome! Please send us an email with your suggestions or requests.
102 |
103 | <a name="bug_reports"/>
104 |
105 | ## Bug Reports
106 | Report [here](https://github.com/LironSimon/SciMED/issues). We will reply as fast as we can :)
107 |
108 | <a name="contact"/>
109 |
110 | ## Contact
111 | * Liron Simon - [email](mailto:lirons.gb@gmail.com) | [LinkedIn](https://www.linkedin.com/in/liron-simon/)
112 | * Teddy Lazebnik - [email](mailto:t.lazebnik@ucl.ac.uk) | [LinkedIn](https://www.linkedin.com/in/teddy-lazebnik/)
113 | * Alex Liberzon - [email](mailto:alexlib@tauex.tau.ac.il) | [LinkedIn](https://www.linkedin.com/in/alexliberzon/)
114 |
115 |
116 | ## Run online using Mybinder.org
117 |
118 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/LironSimon/SciMED/master)
119 |
120 | Open a new terminal and run `python main.py`
121 |
122 |
123 | ## Run using Docker
124 |
125 |     docker run alexlib/scimed:latest
126 |
127 |
--------------------------------------------------------------------------------
/utills/result_tracker.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import json
4 | import pickle
5 | import pandas as pd
6 |
7 | # project imports
8 | from utills.consts import *
9 | from utills.plotter import Plotter
10 | from utills.symbolic_regression_to_latex_text import SymbolicRegressionToLatexText
11 |
12 |
13 | class ResultTracker:
14 |     """
15 |     This class is responsible for saving plots and data
16 |     for each part of the program.
17 |     """
18 |
19 |     def __init__(self):
20 |         pass
21 |
22 |     @staticmethod
23 |     def run(program_part: str,
24 |             run_times: int,
25 |             all_scores: pd.DataFrame,
26 |             model,
27 |             train_data_x: pd.DataFrame,
28 |             train_data_y: pd.DataFrame,
29 |             test_data_x: pd.DataFrame,
30 |             test_data_y: pd.DataFrame,
31 |             save_dir: str):
32 |         """
33 |         Save the model, its scoring history, and the related plots for a single program part
34 |         """
35 |         assert program_part in ["tpot", "symbolic"]
36 |
37 |         # 1) save model
38 |         if program_part == "tpot":
39 |             model.export(os.path.join(os.path.dirname(os.path.dirname(__file__)),
40 |                                       save_dir,
41 |                                       "tpot_exported_pipeline.py"))
42 |         else:
43 |             with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), save_dir,
44 |                                    "symbolic_model"), "wb") as symbolic_fit_file:
45 |                 pickle.dump(model, symbolic_fit_file)
46 |
47 |         # 2) save scoring history of model as a whole and as averaged
48 |         all_scores.to_csv(os.path.join(save_dir, program_part + "_scoring_history.csv"), index=False)
49 |         with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), save_dir,
50 |                                program_part + "_fit_results.json"), "w") as test_file:
51 |             json.dump({key: all_scores[key].mean() for key in all_scores.keys()[:-1]},
52 |                       test_file,
53 |                       indent=JSON_INDENT)
54 |
55 |         # 3) plot model's predictions vs true values
56 |         Plotter.y_test_vs_y_pred(model=model,
57 |                                  x_test=pd.concat([train_data_x, test_data_x]),
58 |                                  y_test=pd.concat([train_data_y, test_data_y]),
59 |                                  save_path=os.path.join(save_dir, program_part + "_target_vs_pred.pdf"))
60 |
61 |         # 4) verify that mae scores are stable
62 |         if run_times > 1:
63 |             Plotter.std_check(data=all_scores["mae"],
64 |                               save_path=os.path.join(os.path.dirname(os.path.dirname(__file__)),
65 |                                                      save_dir,
66 |                                                      program_part + "_mae_stability.pdf"))
67 |         # 5) plot feature importance
68 |         dataset_x = pd.concat([train_data_x, test_data_x])
69 |         dataset_y = pd.concat([train_data_y, test_data_y])
70 |         dataset = pd.concat([dataset_x, dataset_y], axis=1)
71 |
72 |         # if program_part == "tpot":
73 |         #     Plotter.feature_importance(model=model,
74 |         #                                dataset=dataset,
75 |         #                                save_dir=save_dir,
76 |         #                                program_part=program_part,
77 |         #                                simulations=FEATURE_IMPORTANCE_SIMULATION_COUNT)
78 |
79 |         if program_part == "symbolic":
80 |             p_value = all_scores["t_test_p_value"].mean()
81 |             if p_value < SYMBOLIC_P_VALUE_THRESHOLD:
82 |                 continue_to_ebs_flag = True
83 |             else:
84 |                 continue_to_ebs_flag = False
85 |             return continue_to_ebs_flag
86 |
87 |     @staticmethod
88 |     def summaries_symbolic_results(run_times: int,
89 |                                    percent_of_majority: float,
90 |                                    eq_ranking_metric: str,
91 |                                    top_eqs_max_num: int,
92 |                                    save_dir: str):
93 |         """
94 |         Summarize the equations found over all symbolic runs and decide whether the EBS search is needed
95 |         """
96 |         # load data
97 |         eqs = pd.read_csv(os.path.join(save_dir, "symbolic_scoring_history.csv"))["found_eq"]
98 |         eq_ranking = pd.read_csv(os.path.join(save_dir, "symbolic_scoring_history.csv"))[eq_ranking_metric]
99 |         # write summary file:
100 |         with open(os.path.join(save_dir, "symbolic_results_summary.txt"), 'w') as f:
101 |             f.write("Symbolic run count: {}\n\n".format(run_times))
102 |             # check if the program needs to continue to the EBS search
103 |             if list(eqs.value_counts())[0] >= percent_of_majority * run_times:
104 |                 f.write("The function that repeated in {}% of the runs: \n {}\n\n".format(
105 |                     round(list(eqs.value_counts())[0] * 100 / run_times, 2),
106 |                     eqs.value_counts().index[0]  # SymbolicRegressionToLatexText.run(eq=str(eqs.value_counts().index[0]))
107 |                 ))
108 |                 continue_to_ebs_flag = False
109 |             else:
110 |                 f.write("No function was found for at least {}% of the runs\n\n".format(round(percent_of_majority * 100,
111 |                                                                                              2)))
112 |                 continue_to_ebs_flag = True
113 |             # rank the eqs found by metric:
114 |             top_eqs_index = eq_ranking.sort_values(ascending=False)[:top_eqs_max_num].index
115 |             f.write("{} best equations found (according to {} score):\n".format(len(eqs[top_eqs_index].unique()),
116 |                                                                                 eq_ranking_metric))
117 |             for i, eq in enumerate(eqs[top_eqs_index].unique()):
118 |                 f.write("  {}) {}\n".format(i + 1, eq))  # SymbolicRegressionToLatexText.run(eq=str(eq))
119 |         # return whether the EBS search is needed
120 |         return continue_to_ebs_flag
121 |
122 |     @staticmethod
123 |     def ebs_results(model,
124 |                     all_scores: pd.DataFrame,
125 |                     save_dir: str):
126 |         with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), save_dir,
127 |                                "ebs_fit_score.json"), "w") as ebs_fit_score_file:
128 |             answer = {"k_fold": K_FOLD}
129 |             for key in all_scores.keys()[:-1]:
130 |                 answer[key] = all_scores[key].mean()
131 |             json.dump(answer,
132 |                       ebs_fit_score_file,
133 |                       indent=JSON_INDENT)
134 |         # save best fitted model
135 |         with open(os.path.join(os.path.dirname(os.path.dirname(__file__)),
136 |                                save_dir,
137 |                                "ebs_model"),
138 |                   "wb") as ebs_fit_file:
139 |             pickle.dump(model, ebs_fit_file)
140 |
141 |     def __repr__(self):
142 |         return self.__str__()
143 |
144 |     def __str__(self):
145 |         return ""
146 |
--------------------------------------------------------------------------------
/algo/genetic_algorithm_symbolic_fit.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import json
4 | import pandas as pd
5 | from scipy import stats
6 | from sklearn.metrics import r2_score
7 | from sklearn.model_selection import KFold
8 | from gplearn.genetic import SymbolicRegressor
9 |
10 | # project imports
11 | from utills.plotter import Plotter
12 | from utills.fitness_methods import *
13 | from utills.logger_config import Logger
14 | from utills.symbolic_regression_to_latex_text import SymbolicRegressionToLatexText
15 |
16 |
17 | class GASF:
18 |     """
19 |     This class is responsible for generating a symbolic equation
20 |     of a target value from a given set of features, using a
21 |     SymbolicRegressor.
22 |
23 |     The class contains 2 functions:
24 |     1. run: trains a model with k-fold cross validation and returns it fitted
25 |     2. run_and_analyze: applies the run function multiple times
26 |        to gain statistical insight on the performance.
27 |     """
28 |
29 |     # CONSTS #
30 |     DEFAULT_TEST_FIT_FUNCTION = better_symbolic_reg_fitness
31 |     # END - CONSTS #
32 |
33 |     def __init__(self):
34 |         pass
35 |
36 |     @staticmethod
37 |     def run(non_normalized_data: pd.DataFrame,
38 |             generations: int,
39 |             population_size: int,
40 |             k_fold: int,
41 |             performance_metric,
42 |             parsimony_coefficient: float,
43 |             verbose: int,
44 |             expected_eq='Unknown',
45 |             cores: int = -1):
46 |         """
47 |         Run the GASF algorithm with some hyper-parameters.
48 |         Initially the model is trained on a k-fold portion of the data,
49 |         and then on the dataset as a whole.
50 |         The model of the latter case is returned.
51 |         """
52 |         y_col = non_normalized_data.keys()[-1]
53 |         x_values = non_normalized_data.drop([y_col], axis=1)
54 |         y_values = non_normalized_data[y_col]
55 |         # make a k-fold cross validation so we can trust the results better
56 |         kf = KFold(n_splits=k_fold)
57 |         scores = []
58 |         fold_index = 1
59 |         for train_index, test_index in kf.split(x_values):
60 |             # say we do fold
61 |             Logger.print(message=" Symbolic regression {} fold".format(fold_index))
62 |             fold_index += 1
63 |             # prepare data
64 |             X_train, X_test = x_values.iloc[train_index, :], x_values.iloc[test_index, :]
65 |             y_train, y_test = y_values.iloc[train_index], y_values.iloc[test_index]
66 |             # prepare model
67 |             est = SymbolicRegressor(population_size=population_size,
68 |                                     generations=generations,
69 |                                     metric=performance_metric,
70 |                                     n_jobs=cores,
71 |                                     verbose=verbose,
72 |                                     parsimony_coefficient=parsimony_coefficient,
73 |                                     random_state=73)
74 |             est.fit(X_train, y_train)
75 |             y_pred = est.predict(X_test)
76 |             score = performance_metric(y_test, y_pred) if not isinstance(performance_metric, str) else function_mapper[performance_metric](y_test, y_pred)
77 |             scores.append(score)
78 |
79 |         # train a symbolic regression on all the data, it is at least as good as the previous ones
80 |         est = SymbolicRegressor(population_size=population_size,
81 |                                 generations=generations,
82 |                                 n_jobs=cores,
83 |                                 feature_names=non_normalized_data.keys()[:-1],
84 |                                 parsimony_coefficient=parsimony_coefficient,
85 |                                 verbose=verbose,
86 |                                 random_state=73)
87 |         est.fit(x_values, y_values)
88 |         # if we want to compare to the expected EQ.
89 |         if expected_eq != 'Unknown':
90 |             Logger.print(message='Expected eq: {}, Found eq: {} | Found eq as latex: {}'.format(expected_eq,
91 |                                                                                                 est,
92 |                                                                                                 "NA"))  # SymbolicRegressionToLatexText.run(eq=str(est))
93 |         else:
94 |             Logger.print(message='Found eq: {}'.format(est))
95 |         return est
96 |
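    # A hedged usage sketch (mirroring how exp_constant_acceleration calls this
    # class; the DataFrame is assumed to hold the target as its last column and
    # demo/demo.csv to be reachable from the repo root):
    #
    #   df = pd.read_csv("demo/demo.csv")  # columns v0, a, t, v
    #   est = GASF.run(non_normalized_data=df,
    #                  generations=5,
    #                  population_size=50,
    #                  k_fold=5,
    #                  performance_metric=function_mapper["better_symbolic_reg_fitness"],
    #                  parsimony_coefficient=0.02,
    #                  verbose=0,
    #                  expected_eq='add(v0, mul(a, t))')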
97 |     @staticmethod
98 |     def run_and_analyze(run_times: int,
99 |                         non_normalized_data: pd.DataFrame,
100 |                         generations: int,
101 |                         population_size: int,
102 |                         k_fold: int,
103 |                         performance_metric,
104 |                         parsimony_coefficient,  # a float, or a list of floats to grid-search over
105 |                         save_dir: str,
106 |                         expected_eq='Unknown',
107 |                         cores: int = -1):
108 |         """
109 |         Run the GASF algorithm several times and save the results from all runs.
110 |         Returns a pandas dataframe of all results and the best model from
111 |         all runs.
112 |         """
113 |         results = pd.DataFrame()
114 |         y_col = non_normalized_data.keys()[-1]
115 |         x_values = non_normalized_data.drop(y_col, axis=1)
116 |         y_values = non_normalized_data[y_col]
117 |         current_best_wanted_loss = 99999
118 |         best_model = None
119 |         for test in range(run_times):
120 |             Logger.print(message="Symbolic regression run {}".format(test + 1))
121 |             if isinstance(parsimony_coefficient, float) and 0 <= parsimony_coefficient <= 1:
122 |                 fit_model = GASF.run(non_normalized_data=non_normalized_data,
123 |                                      generations=generations,
124 |                                      population_size=population_size,
125 |                                      k_fold=k_fold,
126 |                                      performance_metric=performance_metric,
127 |                                      parsimony_coefficient=parsimony_coefficient,
128 |                                      verbose=1 if test == 0 else 0,
129 |                                      expected_eq=expected_eq,
130 |                                      cores=cores)
131 |             elif isinstance(parsimony_coefficient, list) and len(parsimony_coefficient) > 0 and all([isinstance(val, float) for val in parsimony_coefficient]):
132 |                 best_score = 99999
133 |                 best_inner_model = None
134 |                 best_parsimony_coefficient = 0
135 |                 score_history = {}
136 |                 for parsimony_coefficient_val in parsimony_coefficient:
137 |                     fit_model = GASF.run(non_normalized_data=non_normalized_data,
138 |                                          generations=generations,
139 |                                          population_size=population_size,
140 |                                          k_fold=k_fold,
141 |                                          performance_metric=performance_metric,
142 |                                          parsimony_coefficient=parsimony_coefficient_val,
143 |                                          verbose=1 if test == 0 else 0,
144 |                                          expected_eq=expected_eq,
145 |                                          cores=cores)
146 |                     try:
147 |                         this_score = performance_metric(y_values, fit_model.predict(x_values))
148 |                     except Exception as error:
149 |                         this_score = GASF.DEFAULT_TEST_FIT_FUNCTION(y_values, fit_model.predict(x_values))
150 |                     score_history[parsimony_coefficient_val] = this_score
151 |                     if this_score < best_score:  # lower loss is better, matching the 99999 init above
152 |                         best_score = this_score
153 |                         best_inner_model = fit_model
154 |                         best_parsimony_coefficient = parsimony_coefficient_val
155 |                 # save grid search results
156 |                 Logger.print("The best parsimony_coefficient value is: {}".format(best_parsimony_coefficient))
157 |                 with open(os.path.join(save_dir, "parsimony_coefficient_grid_search.json"), "w") as grid_search_value:
158 |                     json.dump(score_history, grid_search_value)
159 |                 # continue with the best model
160 |                 fit_model = best_inner_model
161 |
162 |             else:
163 |                 raise ValueError("The parsimony_coefficient argument must be either a float between 0 and 1 or a non-empty list of floats")
164 |             pred = fit_model.predict(x_values)
165 |             # save test scores
166 |             try:
167 |                 wanted_loss = performance_metric(y_values, pred)
168 |             except Exception as error:
169 |                 wanted_loss = GASF.DEFAULT_TEST_FIT_FUNCTION(y_values, pred)
170 |             results.at[test, "wanted_loss"] = wanted_loss
171 |             results.at[test, "mae"] = mean_absolute_error(y_values, pred)
172 |             results.at[test, "mse"] = mean_squared_error(y_values, pred)
173 |             results.at[test, "r2"] = r2_score(y_values, pred)
174 |             results.at[test, "t_test_p_value"] = stats.ttest_ind(y_values, pred)[1]
175 |             results.at[test, "found_eq"] = fit_model
176 |             if wanted_loss < current_best_wanted_loss:
177 |                 best_model = fit_model
178 |                 current_best_wanted_loss = wanted_loss
179 |
180 |         # print and save scoring results of all runs
181 |         Logger.print(message="Finished all symbolic runs - ")
182 |         [Logger.print(message="{}: {:.3} +- {:.3}".format(score, results[score].mean(), results[score].std()))
183 |          for score in ["mae", "mse", "r2", "t_test_p_value"]]
184 |         return results, best_model
185 |
186 |     def __repr__(self):
187 |         return self.__str__()
188 |
189 |     def __str__(self):
190 |         return 
"" 191 | -------------------------------------------------------------------------------- /demo/demo.csv: -------------------------------------------------------------------------------- 1 | v0,a,t,v 2 | 3.43,7.47,4.46,36.75 3 | 11.02,6.68,1.63,22.13 4 | 9.29,7.99,6.44,60.14 5 | 14.05,5.71,2.37,28.13 6 | 7.64,7.12,6.7,54.79 7 | 17.61,5.91,5.37,49.84 8 | 18.87,7.08,9.66,87.26 9 | 17.07,6.66,0.61,21.56 10 | 10.69,8.26,9.51,89.24 11 | 13.26,6.58,3.18,34.53 12 | 14.57,6.49,9.1,75.1 13 | 11.87,5.93,6.01,47.51 14 | 11.14,6.23,6.37,50.32 15 | 7.94,7.84,4.03,39.14 16 | 16.74,7.8,2.91,39.83 17 | 11.76,7.11,5.54,50.64 18 | 4.82,8.9,0.26,7.21 19 | 4.99,8.78,6.87,65.96 20 | 6.49,6.85,8.4,64.67 21 | 12.98,8.44,6.29,66.73 22 | 8.05,5.71,3.94,30.85 23 | 6.46,5.96,1.42,15.07 24 | 12.62,7.58,6.47,62.28 25 | 17.22,7.61,9.16,87.8 26 | 3.25,5.85,9.24,58.45 27 | 9.32,7.56,1.36,19.6 28 | 2.04,6.94,4.5,33.27 29 | 14.61,8.15,5.52,59.6 30 | 16.36,5.47,4.61,40.75 31 | 2.53,6.28,6.8,44.33 32 | 14,5.17,3.31,31.11 33 | 18.04,7.39,4.45,51.94 34 | 7.44,8.17,4.39,43.74 35 | 2.24,5.88,5.48,34.12 36 | 11.37,7.21,9.73,80.71 37 | 2.9,5.96,3.09,21.1 38 | 14.23,7.93,8.16,79.73 39 | 15.46,8.94,7.18,78.06 40 | 18.21,6.68,8.91,76.95 41 | 1.92,7.38,0.28,4.07 42 | 11.85,6.87,2.43,28.26 43 | 13.8,8.28,8.56,83.83 44 | 6.06,6.85,5.27,42.58 45 | 17.72,6.51,5.46,53.26 46 | 19.54,7.01,0.56,23.23 47 | 18.06,8.54,4.12,52.18 48 | 1.59,7.79,4.54,36.96 49 | 1.53,5.14,1.27,7.98 50 | 3.06,6.34,4.44,30.59 51 | 10.89,7.15,1.79,23.45 52 | 4.76,8.61,7.16,67.07 53 | 9.51,7.36,9.71,80.17 54 | 2.5,7.55,3.36,27.59 55 | 17.17,7.51,8.82,82.57 56 | 18.42,8.21,5.35,61.72 57 | 8.02,8.22,2.77,31.41 58 | 19.13,8.83,7.18,80.88 59 | 14.27,5.9,7.58,58.4 60 | 19.94,8.82,0.3,22.36 61 | 13.38,5.41,6.29,46.93 62 | 5.88,6.4,2.19,20.09 63 | 2.73,8.33,5.65,49.79 64 | 12.09,6.31,3.17,31.77 65 | 14.31,8.13,4.11,47.72 66 | 13.76,8.6,1.21,23.68 67 | 3.51,7.78,8.82,72.85 68 | 4.54,7.64,5.9,49.62 69 | 2.96,5.07,4.45,25.52 70 | 16.42,8,4.28,51.17 71 | 4.19,8.17,5.62,50.61 72 | 19.14,5.67,6.69,57.64 73 | 3.44,5.39,0.17,4.27 74 | 14.61,6.41,9.65,75.7 75 | 8.01,5.7,6.73,46.37 76 | 18.11,7.91,2.8,39.45 77 | 4.92,7.23,8.28,65.43 78 | 4.42,5.01,1.97,14.43 79 | 16.31,6.57,8.77,72.45 80 | 12.88,5.46,0.31,14.72 81 | 17.03,8.31,1.15,26.59 82 | 10.18,7.34,5.77,52.53 83 | 5.99,6.47,0.85,11.49 84 | 11.45,8.74,6.84,70.52 85 | 1.13,6.85,4.12,29.35 86 | 13.8,6.62,4.63,44.01 87 | 7.61,7.62,6.35,56.56 88 | 1.88,5.32,0.98,7.24 89 | 5.04,6.61,7.47,53.87 90 | 3.31,7.68,4.81,39.45 91 | 3,8.68,7.12,64.15 92 | 7.97,7.11,5.33,45.87 93 | 3.86,8.9,2.7,28.17 94 | 1.16,7.72,0.69,6.42 95 | 3.12,7.25,3.3,26.5 96 | 4.99,8.77,7.62,73.25 97 | 17.43,6.22,1.23,25.08 98 | 10.82,5.32,5.12,38.06 99 | 2.96,5.15,6.05,34.8 100 | 18.77,8.08,4.71,57.4 101 | 3.09,8.32,2.83,26.64 102 | 8.23,5.53,1.37,15.65 103 | 4.2,8.31,9.55,82.72 104 | 18.56,6.18,6.6,58.75 105 | 1.2,6.48,4.33,28.97 106 | 13.22,6.27,4.98,44.44 107 | 5.12,8.65,2.28,24.35 108 | 14.64,7.89,9.68,90.11 109 | 15.33,6.65,0.34,17.24 110 | 12.87,6.82,6.37,57.44 111 | 1.59,5.46,7.76,43.52 112 | 8.52,8.38,5.08,50.07 113 | 1.85,5.17,5.01,28.03 114 | 9.97,6.6,6.5,53.4 115 | 14.62,6.25,9.44,74.36 116 | 0.54,7.89,1.56,12.59 117 | 2.53,5.18,4.18,24.42 118 | 0.16,5.47,8.42,45.29 119 | 12.65,7.18,1.89,25.96 120 | 19.31,5.23,5.29,47.45 121 | 0.99,5.49,5.95,33.99 122 | 15.15,7.47,4.11,45.39 123 | 14.52,6.58,6.21,54.83 124 | 2.09,8.4,9.98,85.92 125 | 18.25,7.42,7.34,72.71 126 | 11.77,7.42,8.32,72.77 127 | 7.18,6.25,5.36,41.09 128 | 7.27,5.27,0.87,11.62 129 | 
11.89,6.99,4.89,46.07 130 | 3.22,5.13,5.65,32.53 131 | 6.21,8.43,5.52,52.22 132 | 12.44,6.86,6.5,57.6 133 | 6.31,7.6,5.45,48.68 134 | 9.68,8.95,0.13,10.84 135 | 1.56,6.45,4.18,29.09 136 | 13.13,7.49,1.53,25.08 137 | 15.53,5.17,6.81,51.75 138 | 9.67,6.13,9.96,72.14 139 | 16.61,7.93,1.77,31.26 140 | 16.24,7.92,9.18,88.95 141 | 17.13,8.88,8.04,87.64 142 | 18.7,6.73,2.83,37.75 143 | 19.49,6.93,2.32,34.86 144 | 17.88,6.72,8.25,74.79 145 | 5.04,6.06,2.63,20.98 146 | 5.01,7.01,0.88,11.4 147 | 8.87,6.78,6.89,55.58 148 | 5.61,8.32,8.49,75.48 149 | 2.42,7.06,0.16,3.55 150 | 18.85,8.21,6.7,75.33 151 | 8.09,6.78,4.4,37.54 152 | 13.61,6.47,2.86,32.11 153 | 16.23,5.85,1.81,27.35 154 | 11.85,5.42,9.22,61.2 155 | 9.44,6.06,1.24,17.29 156 | 2.8,5.83,1.5,11.31 157 | 15.43,5.73,2.55,30.04 158 | 19.97,6.9,5.1,54.61 159 | 3.92,7.83,5.82,49.99 160 | 8,7.9,7.5,67.92 161 | 14.92,6.73,9.24,76.33 162 | 18.94,8.52,0.58,23.88 163 | 2.18,7.84,5.41,44.15 164 | 8.46,8.88,2.37,30.1 165 | 0.66,8.92,9.24,84.74 166 | 10.84,7.72,8.05,72.99 167 | 5.45,8.46,2.73,28.26 168 | 0.93,6.44,7.23,48.44 169 | 4.25,8.02,5.51,48.44 170 | 13.56,7.48,0.95,20.25 171 | 11.66,5.49,6.97,49.93 172 | 3.38,6.07,7.21,48.09 173 | 0.17,5.07,8.34,42.88 174 | 11.78,8.54,0.4,14.89 175 | 13.67,5.69,8.56,61.13 176 | 4.34,6.7,3.63,28.95 177 | 1.75,8.04,3.18,27.59 178 | 19.59,5.79,1.17,26.89 179 | 10.96,6.07,2.81,28.3 180 | 18.37,8.58,0.98,26.51 181 | 13.95,7.25,8.78,76.83 182 | 11.33,5.04,3.69,30.23 183 | 11.33,8.22,4.29,46.13 184 | 3.84,5.04,5.43,31.52 185 | 19.81,5.61,9.34,72.93 186 | 5.24,7.93,7.3,63.13 187 | 12.57,5.49,1.35,20.18 188 | 6.3,6.67,0.52,9.96 189 | 14.56,8.73,1.71,29.49 190 | 2.15,6.4,8.73,58.02 191 | 1.28,7.23,6.26,47.47 192 | 14.1,8.22,7.92,79.99 193 | 5.77,8.93,9.51,90.69 194 | 12.91,7.02,1.79,25.99 195 | 9.01,5.68,3.27,27.31 196 | 4.05,7.69,9.85,79.8 197 | 13.2,5.32,9.32,63.41 198 | 2.23,5.97,3.03,20.12 199 | 2.85,7.58,3.89,32.01 200 | 6.78,6.97,9.74,74.67 201 | 0.68,7.02,0.76,5.96 202 | 4.97,6.31,4.04,30.16 203 | 14.16,6.76,2.65,32.07 204 | 2.4,8.97,6.27,58.64 205 | 19.91,5.44,1.62,28.44 206 | 13.32,7.81,0.74,19.1 207 | 11.54,6.22,2.82,28.79 208 | 5.79,6.6,6.87,50.62 209 | 10.13,6.36,7.98,60.27 210 | 9.29,6.77,8.09,62.78 211 | 0.18,6.31,1.46,9.3 212 | 4.63,6.56,1.51,14.54 213 | 19.8,7.05,9.94,88.98 214 | 19.76,7.57,0.3,22.25 215 | 12.17,6.79,4.13,39.41 216 | 11.95,6.47,5.39,47.29 217 | 8.96,8.37,6.59,64.76 218 | 2.92,8.32,5.38,46.73 219 | 9.43,6.22,9.83,69.16 220 | 13.67,7.55,8.18,76.18 221 | 1.42,8.88,8.23,74.5 222 | 0.76,8.38,9.68,83.52 223 | 11.77,8.1,10,90.91 224 | 4.19,8.34,7.94,70.41 225 | 8.62,5.93,7.24,52.07 226 | 8.55,7.4,5.4,48.02 227 | 1.6,8.91,5.5,50.1 228 | 1.16,6.53,7.54,49.89 229 | 14.32,7.65,8.1,77.05 230 | 14.94,5.39,0.34,16.94 231 | 1.1,5.41,4.55,25.46 232 | 19.45,5.05,0.66,22.56 233 | 13.12,5.49,0.93,18.41 234 | 6.89,5.95,4.84,35.33 235 | 4.7,6.71,7.1,52.86 236 | 11.17,6.99,8.56,70.29 237 | 1.23,8.06,2.13,18.58 238 | 3.24,6.58,7.82,55.79 239 | 13.5,7.73,3.02,36.48 240 | 17.45,8.17,2.87,40.08 241 | 2.57,6.38,1.21,10.08 242 | 11.84,7,9.27,77.5 243 | 19.04,5.49,4.34,43.3 244 | 14.11,7.6,8.26,78.42 245 | 5.9,8.47,1.34,17.25 246 | 15.62,8,2.29,34.28 247 | 13.02,6.31,6.13,52.22 248 | 14.61,8.87,1.71,29.48 249 | 3.87,8.4,3.24,30.46 250 | 15.39,5.17,0.35,17.2 251 | 16.34,7.19,1.86,29.42 252 | 1.14,5.73,1.78,11.45 253 | 8.13,8.07,5.04,49.29 254 | 7.49,5.08,8.27,49.01 255 | 8.75,6.19,4.24,35 256 | 19.21,7.81,6.5,68.58 257 | 15.95,5.99,9.04,70.8 258 | 12.43,8.83,9.52,95.53 259 | 9.37,7.85,2.13,26.09 260 | 7.47,5.36,8.07,50.73 
261 | 11.64,6.94,3.15,33.17 262 | 17,6.81,2.11,31.68 263 | 5.17,8.59,0.65,10.65 264 | 12.2,7.92,9.67,88.79 265 | 10.49,6,8,59.07 266 | 11.71,8.66,7.96,79.84 267 | 14.1,8.77,0.94,22.57 268 | 5.67,6.18,4.34,32.49 269 | 19.28,7.16,9.03,84.77 270 | 8.79,5.04,0.86,13.12 271 | 4.08,8.8,2.08,22.16 272 | 1.94,6.24,5.52,36.38 273 | 3.11,5.65,8.01,47.88 274 | 18.89,7.95,9.18,90.03 275 | 6.73,6.96,1.84,19.73 276 | 7.97,5.92,1.01,14.23 277 | 0.88,6.29,4.27,28.29 278 | 4.01,5.88,0.62,7.66 279 | 1.53,5.81,3.25,20.21 280 | 0.59,5.02,6.07,31.37 281 | 16.18,5.05,2.76,30.42 282 | 7.29,8.33,7.2,67.94 283 | 2.71,8.29,4.52,40.18 284 | 15.28,5.51,5.43,45.65 285 | 7.31,5.79,5.06,36.97 286 | 5.97,7.09,3.32,28.92 287 | 9.1,7.59,0.05,9.48 288 | 13.55,5.39,0.59,16.73 289 | 6.16,5.37,8.86,54.28 290 | 17.52,5.15,6.46,50.79 291 | 17.09,6.62,1.94,30.23 292 | 1.85,8.77,2.25,21.8 293 | 16.37,8.36,4.63,55.08 294 | 5.89,7.44,4.8,41.19 295 | 2.43,5.86,4.65,30.27 296 | 4.41,7.39,9.41,72.47 297 | 19.67,6.16,9.97,81.09 298 | 10.39,6.75,7.81,63.11 299 | 18.78,6.23,6.8,60.53 300 | 19.95,5.45,1.04,25.36 301 | 5.19,8.37,5.2,48.71 302 | 5.19,5.15,9.68,55.59 303 | 14.29,6.2,4.05,39.79 304 | 13.1,8.48,0.29,15.4 305 | 12.34,7.06,7.05,60.87 306 | 7.65,7.12,6.23,50.97 307 | 12.88,7.59,9.96,86.71 308 | 19.84,5.42,0.83,24.58 309 | 8.93,6.35,9.03,67.6 310 | 10.79,8.93,6.91,73.95 311 | 9.04,7.74,8.03,70.48 312 | 17.81,7.51,2.91,40.06 313 | 1.61,8.81,8.63,77.64 314 | 10.11,5.2,6.19,41.88 315 | 9.57,5.25,6.75,45.46 316 | 14.85,5.85,6.74,54.28 317 | 15.63,5.34,1.84,25.71 318 | 6.26,6.35,6.31,45.87 319 | 9.58,5.19,6.52,43.85 320 | 14.55,6.25,2.08,27.55 321 | 14.62,6.83,7.38,65.68 322 | 18.53,8.11,8.77,91.45 323 | 16.29,6.49,7.97,68.02 324 | 13.87,7.2,2.24,30.6 325 | 16.88,6.67,8.92,77.14 326 | 13.92,5.64,9.21,66.52 327 | 16.13,8.44,5.75,65.31 328 | 15.89,8.29,6.83,73.24 329 | 14.72,8.19,3.58,44.48 330 | 9.25,7.49,6.67,58.62 331 | 8.32,6.8,4.76,41.09 332 | 19.5,5.16,5.99,49.4 333 | 3.77,7.7,6.83,55.8 334 | 15.85,5.97,6.84,57.25 335 | 17.86,6.93,4.69,51.37 336 | 18.85,8.77,3.1,45.58 337 | 10.19,8.8,3.3,39.23 338 | 8.94,8.75,3.8,43.03 339 | 8.39,5.15,6.61,43.28 340 | 7.18,5.64,8.25,54.25 341 | 3.36,6.18,0.14,4.31 342 | 1,7.64,7.39,58.61 343 | 4.9,8.91,5.84,56.93 344 | 4.89,6.06,9.27,60.46 345 | 10.84,8.52,4.52,48.86 346 | 19.23,8.42,2.61,41.21 347 | 15.57,5.95,3.42,35.92 348 | 7.85,8.84,4.57,47.77 349 | 10.95,7.55,6.56,59.87 350 | 6.85,8.31,7.46,67.47 351 | 18.28,6.65,8.09,72.8 352 | 19.59,5.35,0.13,20.49 353 | 17.52,7.71,1.12,26.42 354 | 3.55,8.29,7.51,65.15 355 | 6.53,6.07,0.81,11.33 356 | 7.45,5.86,7.12,48.68 357 | 0.74,5.47,0.93,5.77 358 | 13.95,8.11,8.29,81.18 359 | 11.35,8.21,0.7,16.76 360 | 1.86,5.83,5.73,34.91 361 | 8.73,5.22,9.82,60.59 362 | 15.84,8.31,6.1,66.53 363 | 11.09,5.49,4.31,35.1 364 | 15.6,5.61,7.61,57.13 365 | 0.29,5.39,0.91,5.19 366 | 12.33,6.17,7.2,56.75 367 | 12.1,6.79,5.6,50.12 368 | 11.42,8.29,1.26,21.65 369 | 8.36,8.97,8.04,81.28 370 | 2.69,6.41,1.27,10.72 371 | 16.13,6.5,7.75,67.17 372 | 3.03,8.44,6.16,53.92 373 | 15.22,8,9.1,88.9 374 | 12,6.62,5.04,44.46 375 | 6.3,8.28,5.34,50.52 376 | 9.28,8.13,3.2,34.94 377 | 2.98,5.63,0.38,5.17 378 | 15.37,6.76,7.72,67.56 379 | 9.78,6.41,2.78,27.32 380 | 19.73,5.55,0.32,21.51 381 | 19.64,5.56,8.05,65.69 382 | 18.97,7.39,2.4,37.44 383 | 0.36,5.1,5.52,28.51 384 | 4.71,6.94,5.24,40.66 385 | 13.31,8.08,5.12,55.23 386 | 12.1,7.3,8.43,74.38 387 | 6.99,6.93,3.35,30.81 388 | 6.18,5.11,0.95,10.92 389 | 5.04,7.22,2.66,23.76 390 | 5.14,5.59,8.12,50.53 391 | 11.94,7.41,5.37,51.73 392 | 
4.82,6.4,0.18,5.97
393 | 15.87,8.38,7.42,77.27
394 | 8.38,8.07,2.35,27.34
395 | 13.94,6.66,4.87,45.91
396 | 5.13,8.76,1.87,21.08
397 | 17.62,8.29,6.02,66.85
398 | 5.8,7.73,3.11,29.54
399 | 15.5,7.27,0.86,21.97
400 | 18.29,8.89,8.42,95.01
401 | 16.86,7.05,5.59,56.27
--------------------------------------------------------------------------------
/utills/plotter.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import matplotlib
3 | import numpy as np
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 | from sklearn.metrics import r2_score
7 | from sklearn.linear_model import LinearRegression
8 | from sklearn.model_selection import train_test_split
9 |
10 | # project imports
11 | from utills.consts import *
12 | from utills.logger_config import Logger
13 |
14 | # fix for windows
15 | matplotlib.use('Agg')
16 |
17 |
18 | class Plotter:
19 |     """
20 |     A plotter class for the results of the model
21 |     """
22 |
23 |     def __init__(self):
24 |         pass
25 |
26 |     @staticmethod
27 |     def noise_graph(noise_range: list,
28 |                     y_list: dict,
29 |                     save_path: str):
30 |         colors = ["red", "blue", "green"]
31 |         symbols = ["o", "*", "P"]
32 |
33 |         fig = plt.figure(figsize=(DEFAULT_FIG_SIZE, DEFAULT_FIG_SIZE))
34 |         # calc predictions and save to csv
35 |         index = 0
36 |         for name, y in y_list.items():
37 |             plt.plot(noise_range,
38 |                      y,
39 |                      "-{}".format(symbols[index]),
40 |                      color=colors[index],
41 |                      markersize=5,  # plt.plot takes markersize; scatter's s= kwarg raises here
42 |                      alpha=0.5,
43 |                      label="{}".format(name))
44 |             index += 1
45 |         # set parameters and save plot
46 |         plt.xlim((min(noise_range), max(noise_range)))
47 |         plt.ylim((0, 1))
48 |         plt.xlabel("Noise Level", fontsize=16)
49 |         plt.ylabel("Success rate", fontsize=16)
50 |         plt.legend(frameon=True, fontsize=13)
51 |         plt.grid(alpha=0.5)
52 |         ax = plt.gca()
53 |         ax.yaxis.set_ticks_position('left')
54 |         ax.xaxis.set_ticks_position('bottom')
55 |         ax.spines['right'].set_visible(False)
56 |         ax.spines['top'].set_visible(False)
57 |         plt.savefig(save_path, dpi=DEFAULT_DPI)
58 |         plt.close()
59 |
60 |     @staticmethod
61 |     def y_test_vs_y_pred(model,
62 |                          x_test,
63 |                          y_test,
64 |                          save_path: str):
65 |         fig = plt.figure(figsize=(DEFAULT_FIG_SIZE, DEFAULT_FIG_SIZE))
66 |         # calc predictions and save to csv
67 |         y_pred = model.predict(x_test)
68 |         pd.DataFrame({'y_pred': y_pred, 'y_true': y_test}).to_csv(save_path[:-4] + '.csv', index=False)
69 |         pts_range = (min([min(y_test), min(y_pred)]), max([max(y_test), max(y_pred)]))
70 |         # plot predictions against actual values
71 |         y_test = np.array(y_test).reshape(-1, 1)
72 |         lg = LinearRegression().fit(y_test, y_pred)
73 |         r2 = lg.score(y_test, y_pred)
74 |         plt.scatter(x=y_test,
75 |                     y=y_pred,
76 |                     color="blue",
77 |                     s=20,
78 |                     alpha=0.5)
79 |         # plot y_pred = y_true for ref
80 |         plt.plot([min(y_test), max(y_test)],
81 |                  [min(y_test), max(y_test)],
82 |                  "-",
83 |                  color="black",
84 |                  linewidth=1,
85 |                  alpha=0.75)
86 |         # plot actual y_pred = f(y_true) relation
87 |         plt.plot([min(y_test), max(y_test)],
88 |                  [lg.predict([min(y_test)])[0], lg.predict([max(y_test)])[0]],
89 |                  "--",
90 |                  color="gray",
91 |                  linewidth=2,
92 |                  alpha=0.75,
93 |                  label="$R^2$ = " + str(round(r2, 3)) + " | $y_{pred} = y_{exp} * " + str(
94 |                      round(lg.coef_[0], 3)) + " + " + str(round(lg.intercept_, 3)) + "$")
95 |         # set parameters and save plot
96 |         plt.xlim(pts_range)
97 |         plt.ylim(pts_range)
98 |         plt.xlabel("True value", fontsize=16)
99 |         plt.ylabel("Predicted value", fontsize=16)
100 |         plt.legend(frameon=True, fontsize=13)
101 |         plt.grid(alpha=0.5)
102 |         ax = plt.gca()
103 |         ax.yaxis.set_ticks_position('left')
104 |         ax.xaxis.set_ticks_position('bottom')
105 |         ax.spines['right'].set_visible(False)
106 |         ax.spines['top'].set_visible(False)
107 |         plt.savefig(save_path, dpi=DEFAULT_DPI)
108 |         plt.close()
109 |
110 |     @staticmethod
111 |     def parameter_sensitivity_graph(model,
112 |                                     baseline_x: list,
113 |                                     parameter_col_index: int,
114 |                                     parameter_start_range: float,
115 |                                     parameter_end_range: float,
116 |                                     parameter_steps_count: int,
117 |                                     parameter_name: str,
118 |                                     target_name: str,
119 |                                     save_path: str):
120 |         fig = plt.figure(figsize=(DEFAULT_FIG_SIZE, DEFAULT_FIG_SIZE))
121 |         x_data = []
122 |         x_values = []
123 |         # prepare data
124 |         step_size = (parameter_end_range - parameter_start_range) / parameter_steps_count
125 |         for i in range(parameter_steps_count):
126 |             new_row = baseline_x.copy()
127 |             new_row[parameter_col_index] = parameter_start_range + i * step_size
128 |             x_values.append(new_row[parameter_col_index])
129 |             x_data.append(new_row)
130 |         df = pd.DataFrame(x_data)
131 |         y_pred = model.predict(df)
132 |         plt.plot(x_values,
133 |                  y_pred,
134 |                  "-o",
135 |                  color="black")
136 |         plt.xlim((parameter_start_range, parameter_end_range))
137 |         plt.ylim((min(y_pred), max(y_pred)))
138 |         plt.xlabel(parameter_name, fontsize=16)
139 |         plt.ylabel(target_name, fontsize=16)
140 |         ax = plt.gca()
141 |         plt.grid()
142 |         ax.yaxis.set_ticks_position('left')
143 |         ax.xaxis.set_ticks_position('bottom')
144 |         plt.savefig(save_path, dpi=DEFAULT_DPI)
145 |         plt.close()
146 |
147 |     @staticmethod
148 |     def std_check(data, save_path):
149 |         # calc std on increasing num of samples
150 |         vals = []
151 |         for i in data.index[1:]:
152 |             vals.append(float(data.loc[:i].std()))
153 |         # plot std development over iterations
154 |         fig = plt.figure(figsize=(DEFAULT_FIG_SIZE, DEFAULT_FIG_SIZE))
155 |         plt.scatter(x=range(1, 1 + len(vals)),
156 |                     y=vals,
157 |                     color="blue",
158 |                     s=20,
159 |                     alpha=0.5)
160 |         plt.axhline(y=vals[-1] * (1 + REL_ERR_OF_STD),
161 |                     linestyle="--",
162 |                     color="red",
163 |                     linewidth=1)
164 |         plt.axhline(y=vals[-1] * (1 - REL_ERR_OF_STD),
165 |                     linestyle="--",
166 |                     color="red",
167 |                     linewidth=1)
168 |         # set parameters and save plot
169 |         plt.xlim([1, len(vals)])
170 |         plt.xlabel("Iteration", fontsize=16)
171 |         plt.ylabel("Standard Deviation", fontsize=16)
172 |         plt.grid(alpha=0.5)
173 |         ax = plt.gca()
174 |         ax.yaxis.set_ticks_position('left')
175 |         ax.xaxis.set_ticks_position('bottom')
176 |         ax.spines['right'].set_visible(False)
177 |         ax.spines['top'].set_visible(False)
178 |         plt.savefig(save_path, dpi=DEFAULT_DPI)
179 |         plt.close()
180 |
181 |     @staticmethod
182 |     def feature_importance(model,
183 |                            dataset: pd.DataFrame,
184 |                            save_dir: str,
185 |                            program_part: str,
186 |                            simulations: int = 100):
187 |         # alert user
188 |         Logger.print("\nTest feature importance with best {} ML model:".format(program_part))
189 |         fig = plt.figure(figsize=(DEFAULT_FIG_SIZE, DEFAULT_FIG_SIZE))
190 |         # create a df to save r2 scores of simulations
191 |         sim_results = pd.DataFrame()
192 |         y_col = dataset.keys()[-1]
193 |         for sim in range(simulations):
194 |             # prepare data
195 |             train_Xs, test_Xs, train_y, test_y = train_test_split(dataset.drop(y_col, axis=1),
196 |                                                                   dataset[y_col],
197 |                                                                   shuffle=True)
198 |             # train & test model on data
199 |             sim_model = model
200 |             sim_model.fit(train_Xs, train_y)
201 |             pred = sim_model.predict(test_Xs)
202 |             sim_results.at[sim, 'r2'] = r2_score(test_y, pred)
203 |             # check r2 loss on data without a specific feature
204 |             for feature in train_Xs.keys():
205 |                 new_train_Xs, new_test_Xs = train_Xs.drop(feature, axis=1), test_Xs.drop(feature, axis=1)
206 |                 new_model = model
207 |                 new_model.fit(new_train_Xs, train_y)
208 |                 new_pred = new_model.predict(new_test_Xs)
209 |                 sim_results.at[sim, '{}_r2_loss'.format(feature)] = sim_results.loc[sim, 'r2'] - \
210 |                                                                     r2_score(test_y, new_pred)
211 |         # save all r2 scores
212 |         sim_results.to_csv(os.path.join(os.path.dirname(os.path.dirname(__file__)), save_dir,
213 |                                         "{}_feature_importance.csv".format(program_part)),
214 |                            index=False)
215 |         # prepare new df of averaged feature importance according to r2 loss
216 |         f_importance = pd.DataFrame()
217 |         for i, f in enumerate(train_Xs.keys()):
218 |             name = f + '_r2_loss'
219 |             f_importance.at[i, 'feature'] = f
220 |             f_importance.at[i, ['r2_loss', 'r2_err']] = sim_results[name].mean(), sim_results[name].std()
221 |         # plot data
222 |         f_importance.plot.barh(x='feature',
223 |                                y='r2_loss',
224 |                                xerr=f_importance['r2_err'].T.values,
225 |                                color="grey")
226 |         plt.ylabel("Feature Name", fontsize=16)
227 |         plt.xlabel("$R^2$ Loss Without Feature", fontsize=16)
228 |         ax = plt.gca()
229 |         plt.grid(axis='x')
230 |         ax.yaxis.set_ticks_position('left')
231 |         ax.xaxis.set_ticks_position('bottom')
232 |         plt.savefig(os.path.join(os.path.dirname(os.path.dirname(__file__)), save_dir,
233 |                                  "{}_feature_importance.pdf".format(program_part)),
234 |                     dpi=DEFAULT_DPI)
235 |         plt.close()
236 |
237 |     def __repr__(self):
238 |         return self.__str__()
239 |
240 |     def __str__(self):
241 |         return ""
--------------------------------------------------------------------------------
/experiments/exp_constant_acceleration.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import time
4 | import pandas as pd
5 | from datetime import timedelta
6 | from sklearn.metrics import mean_squared_error
7 | from sklearn.model_selection import train_test_split
8 |
9 | # project imports
10 | from utills.consts import *
11 | from utills.fitness_methods import *
12 | from utills.logger_config import Logger
13 | from algo.equation_brute_search import EBS
14 | from utills.result_tracker import ResultTracker
15 | from algo.multi_tpot_analysis import MultiTPOTrunner
16 | from algo.genetic_algorithm_symbolic_fit import GASF
17 | from data_generators.constant_acceleration_data_generator import ConstantAccelerationDataGenerator
18 |
19 |
20 | class ExpConstantAcceleration:
21 |     """
22 |     Program receives a dataset with all essential features needed
23 |     to deduce a "noisy" target (momentary velocity).
24 |
25 |     Success of both numerical and analytical parts of the program proves
26 |     that the program is able to learn a simple linear relation between
27 |     features, even with noisy data.
28 |     """
29 |
30 |     def __init__(self):
31 |         pass
32 |
33 |     @staticmethod
34 |     def run(numerical_bool: bool,
35 |             analytical_bool: bool,
36 |             force_ebs_bool: bool):
37 |         """
38 |         Entry point
39 |         """
40 |         # config logging
41 |         start_time = time.time()
42 |
43 |         # prepare IO
44 |         os.makedirs(os.path.join(os.path.dirname(os.path.dirname(__file__)), CONST_ACCELERATION_RESULTS_FOLDER_NAME),
45 |                     exist_ok=True)
46 |         Logger(os.path.join(os.path.dirname(os.path.dirname(__file__)),
47 |                             CONST_ACCELERATION_RESULTS_FOLDER_NAME,
48 |                             "run.log"))
49 |
50 |         # 1) generate data
51 |         data_path = os.path.join(os.path.dirname(__file__), "..", "data",
52 |                                  "constant_acceleration_data_" + str(CONST_ACCELERATION_NUM_SAMPLES) + "_samples.csv")
53 |         ConstantAccelerationDataGenerator.generate(samples=CONST_ACCELERATION_NUM_SAMPLES,
54 |                                                    a_range=(5, 9),
55 |                                                    t_range=(0, 10),
56 |                                                    v0_range=(0, 20),
57 |                                                    noise_range=CONST_ACCELERATION_NOISE_RANGE,
58 |                                                    save_path=data_path)
59 |         # 1.1) load data, normalize and split
60 |         df = pd.read_csv(data_path)
61 |         Logger.print('Generated data:\n{}'.format(df.describe()))
62 |         y_col = df.keys()[-1]
63 |         normalized_df = (df - df.min()) / (df.max() - df.min())
64 |         train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(normalized_df.drop([y_col], axis=1),
65 |                                                                                 normalized_df[y_col],
66 |                                                                                 shuffle=True,
67 |                                                                                 test_size=CONST_ACCELERATION_TEST_SIZE_PORTION,
68 |                                                                                 random_state=RANDOM_STATE)
69 |         # 1.2) log elapsed time
70 |         data_end_time = time.time()
71 |         Logger.print(" --- Finished. Elapsed time: {} ---".format(timedelta(seconds=data_end_time - start_time)))
72 |
73 |         # 2) continue to the MultiTPOTrunner regression
74 |         Logger.print('Training MultiTPOTrunner:')
75 |         if numerical_bool:
76 |             # 2.1) find the best ML model for data
77 |             all_t_scores, best_t_model = MultiTPOTrunner.run_and_analyze(run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES,
78 |                                                                          train_data_x=train_data_x,
79 |                                                                          train_data_y=train_data_y,
80 |                                                                          test_data_x=test_data_x,
81 |                                                                          test_data_y=test_data_y,
82 |                                                                          generations=CONST_ACCELERATION_NUMERICAL_GENERATION_COUNT,
83 |                                                                          population_size=CONST_ACCELERATION_NUMERICAL_POP_SIZE,
84 |                                                                          k_fold=K_FOLD,
85 |                                                                          performance_metric=neg_mean_squared_error_scorer,
86 |                                                                          save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)),
87 |                                                                                                CONST_ACCELERATION_RESULTS_FOLDER_NAME),
88 |                                                                          n_jobs=-1)
89 |             # 2.2) save results of best model from all runs
90 |             ResultTracker.run(program_part="tpot",
91 |                               run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES,
92 |                               all_scores=all_t_scores,
93 |                               model=best_t_model,
94 |                               train_data_x=train_data_x,
95 |                               train_data_y=train_data_y,
96 |                               test_data_x=test_data_x,
97 |                               test_data_y=test_data_y,
98 |                               save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)),
99 |                                                     CONST_ACCELERATION_RESULTS_FOLDER_NAME))
100 |         # 2.3) log elapsed time
101 |         tpot_end_time = time.time()
Elapsed time: {} ---".format(timedelta(seconds=tpot_end_time - data_end_time))) 103 | 104 | # 3) continue to the symbolic regression 105 | Logger.print('Searching for a symbolic expression:') 106 | if analytical_bool: 107 | # 3.1) run symbolic regressor multiple times 108 | all_s_scores, best_s_model = GASF.run_and_analyze(run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES, 109 | non_normalized_data=df, 110 | performance_metric=function_mapper["better_symbolic_reg_fitness"], 111 | generations=CONST_ACCELERATION_ANALYTICAL_GENERATION_COUNT, 112 | population_size=CONST_ACCELERATION_ANALYTICAL_POP_SIZE, 113 | k_fold=K_FOLD, 114 | cores=-1, 115 | parsimony_coefficient=CONST_ACCELERATION_ANALYTICAL_PARSIMONY_COEFFICIENT, 116 | expected_eq='add(v0, mul(a, t))', 117 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 118 | CONST_ACCELERATION_RESULTS_FOLDER_NAME)) 119 | # 3.2) save results of best model from all runs 120 | ResultTracker.run(program_part="symbolic", 121 | run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES, 122 | all_scores=all_s_scores, 123 | model=best_s_model, 124 | train_data_x=train_data_x, 125 | train_data_y=train_data_y, 126 | test_data_x=test_data_x, 127 | test_data_y=test_data_y, 128 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 129 | CONST_ACCELERATION_RESULTS_FOLDER_NAME)) 130 | # 3.3) save a summary of the eqs found & figure whether to continue to ebf 131 | ebs_flag = ResultTracker.summaries_symbolic_results(run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES, 132 | percent_of_majority=SYMBOLIC_PERCENT_OF_MAJORITY, 133 | eq_ranking_metric=SYMBOLIC_EQ_RANKING_METRIC, 134 | top_eqs_max_num=SYMBOLIC_TOP_EQS_MAX_NUM, 135 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 136 | CONST_ACCELERATION_RESULTS_FOLDER_NAME)) 137 | # 3.4) log elapsed time 138 | symbolic_end_time = time.time() 139 | Logger.print("Finished. Elapsed time: {}".format(timedelta(seconds=symbolic_end_time - tpot_end_time))) 140 | 141 | # 4) continue to the EBS 142 | if ebs_flag or force_ebs_bool: 143 | Logger.print('Searching for a symbolic expression using EBF:') 144 | # 4.1) run EBS multiple times 145 | all_ebs_scores, best_ebs_model = EBS.run_and_analyze(run_times=CONST_ACCELERATION_NUMERICAL_RUN_TIMES, 146 | non_normalized_data=df, 147 | performance_metric=function_mapper["better_symbolic_reg_fitness"], 148 | cores=-1, 149 | size_range=CONST_ACCELERATION_EBS_SIZE_RANGE, 150 | expected_eq='add(v0, mul(a, t))', 151 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 152 | CONST_ACCELERATION_RESULTS_FOLDER_NAME)) 153 | # 4.2) save the fitting score results 154 | ResultTracker.ebs_results(model=best_ebs_model, 155 | all_scores=all_ebs_scores, 156 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 157 | CONST_ACCELERATION_RESULTS_FOLDER_NAME)) 158 | else: 159 | Logger.print("EBF search of a symbolic equation wasn't needed") 160 | # 4.3) log elapsed time 161 | ebs_end_time = time.time() 162 | Logger.print("Finished. 
Elapsed time: {}".format(timedelta(seconds=ebs_end_time - symbolic_end_time)))
163 | 
164 |         # 5) alert results to the user
165 |         Logger.print("TOTAL ELAPSED TIME: {}".format(timedelta(seconds=time.time() - start_time)))
166 | 
--------------------------------------------------------------------------------
/scimed.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os, json
3 | import pandas as pd
4 | from sklearn.model_selection import train_test_split
5 | 
6 | # project imports
7 | from utills.consts import *
8 | from utills.fitness_methods import *
9 | from utills.logger_config import Logger
10 | from algo.equation_brute_search import EBS
11 | from utills.result_tracker import ResultTracker
12 | from algo.multi_tpot_analysis import MultiTPOTrunner
13 | from algo.genetic_algorithm_symbolic_fit import GASF
14 | from algo.genetic_algorithm_feature_selection import GAFS
15 | 
16 | 
17 | class scimed:
18 |     """
19 |     The main class of the project. It allows other developers to load
20 |     it and use the whole SciMED pipeline at once.
21 |     """
22 | 
23 |     def __init__(self):
24 |         pass
25 | 
26 |     @staticmethod
27 |     def run(train_data_x: pd.DataFrame,
28 |             train_data_y: pd.DataFrame,
29 |             test_data_x: pd.DataFrame,
30 |             test_data_y: pd.DataFrame,
31 |             results_folder: str,
32 |             analytical_reachment_portion: float = 0,
33 |             numerical_run_times: int = 20,
34 |             numerical_generations: int = 50,
35 |             numerical_population: int = 100,
36 |             analytical_run_times: int = 20,
37 |             analytical_generations: int = 50,
38 |             analytical_population: int = 100,
39 |             parsimony_coefficient: float = 0.05,
40 |             k_fold: int = 5,
41 |             ebs_size_range: tuple = (5, 9),
42 |             numerical_bool: bool = True,
43 |             analytical_bool: bool = True,
44 |             force_ebs_bool: bool = True,
45 |             feature_indexes_ranges = "Not applicable",
46 |             feature_selection_generations: int = None,
47 |             feature_selection_pop_size: int = None,
48 |             feature_selection_mutation_rate: float = None,
49 |             feature_selection_royalty: float = None):
50 |         """
51 |         Single entry point
52 |         """
53 | 
54 |         # 1) prepare IO
55 |         os.makedirs(results_folder, exist_ok=True)
56 | 
57 |         # init logger
58 |         Logger(save_path=os.path.join(results_folder, "logger.txt"))
59 | 
60 |         # 2) run the numerical part
61 |         if numerical_bool:
62 |             # 2.1) run multi-tpot analysis if feature selection isn't needed
63 |             if feature_indexes_ranges == "Not applicable":
64 |                 # 2.1.1) find the best ML model for all the data
65 |                 all_t_scores, best_t_model = MultiTPOTrunner.run_and_analyze(run_times=numerical_run_times,
66 |                                                                              train_data_x=train_data_x,
67 |                                                                              train_data_y=train_data_y,
68 |                                                                              test_data_x=test_data_x,
69 |                                                                              test_data_y=test_data_y,
70 |                                                                              generations=numerical_generations,
71 |                                                                              population_size=numerical_population,
72 |                                                                              k_fold=k_fold,
73 |                                                                              performance_metric=neg_mean_squared_error_scorer,
74 |                                                                              save_dir=results_folder,
75 |                                                                              n_jobs=-1)
76 |                 # 2.1.2) save results of best model from all runs
77 |                 ResultTracker.run(program_part="tpot",
78 |                                   run_times=numerical_run_times,
79 |                                   all_scores=all_t_scores,
80 |                                   model=best_t_model,
81 |                                   train_data_x=train_data_x,
82 |                                   train_data_y=train_data_y,
83 |                                   test_data_x=test_data_x,
84 |                                   test_data_y=test_data_y,
85 |                                   save_dir=results_folder)
86 |             # 2.2) run multi-tpot analysis with feature selection
87 |             else:
88 |                 # 2.2.1) find the best ML model for a subset of the data
89 |                 best_gene = GAFS.run(tpot_run_times=numerical_run_times,
90 |                                      feature_generations=feature_selection_generations,
91 | 
tpot_regressor_generations=numerical_generations, 92 | feature_population_size=feature_selection_pop_size, 93 | tpot_regressor_population_size=numerical_population, 94 | mutation_rate=feature_selection_mutation_rate, 95 | feature_indexes_ranges=feature_indexes_ranges, 96 | mutation_w=[val[1]-val[0] for val in feature_indexes_ranges], 97 | royalty=feature_selection_royalty, 98 | k_fold=k_fold, 99 | performance_metric=neg_mean_squared_error_scorer, 100 | train_data_x=train_data_x, 101 | train_data_y=train_data_y, 102 | test_data_x=test_data_x, 103 | test_data_y=test_data_y, 104 | save_dir=results_folder, 105 | cores=-1) 106 | # 2.2.2) save results of best model from all runs 107 | ResultTracker.run(program_part="tpot", 108 | run_times=numerical_run_times, 109 | all_scores=best_gene.scoring_history, 110 | model=best_gene.model_object, 111 | train_data_x=train_data_x.iloc[:, best_gene.feature_indexes], 112 | train_data_y=train_data_y, 113 | test_data_x=test_data_x.iloc[:, best_gene.feature_indexes], 114 | test_data_y=test_data_y, 115 | save_dir=results_folder) 116 | # 2.2.3) save selected features of best gene 117 | with open(os.path.join(os.path.dirname(__file__), results_folder, "best_features_selected.json"), 118 | "w") as features_file: 119 | json.dump({"index": best_gene.feature_indexes, 120 | "names": list(test_data_x.columns[best_gene.feature_indexes])}, 121 | features_file) 122 | # 2.2.4) reduce the dataset of non-normalized samples for next part 123 | train_data_x = train_data_x.iloc[:, best_gene.feature_indexes+[-1]] 124 | # 2.3 add more data to the original data with the model 125 | # TODO: add to the next release after fixing the sample method in production 126 | 127 | # 3) continue to the symbolic regression 128 | if analytical_bool: 129 | # 3.1) run symbolic regression multiple times 130 | all_s_scores, best_s_model = GASF.run_and_analyze(run_times=analytical_run_times, 131 | non_normalized_data=train_data_x, 132 | performance_metric=function_mapper["better_symbolic_reg_fitness"], 133 | generations=analytical_generations, 134 | population_size=analytical_population, 135 | k_fold=k_fold, 136 | cores=-1, 137 | parsimony_coefficient=parsimony_coefficient, 138 | save_dir=results_folder) 139 | # 3.2) save results of best model from all runs 140 | non_norm_train_x, non_norm_test_x, non_norm_train_y, non_norm_test_y = train_test_split(train_data_x, 141 | train_data_y, 142 | shuffle=True, 143 | test_size=SFF_TEST_SIZE_PORTION, 144 | random_state=RANDOM_STATE) 145 | p_value_flag = ResultTracker.run(program_part="symbolic", 146 | run_times=analytical_run_times, 147 | all_scores=all_s_scores, 148 | model=best_s_model, 149 | train_data_x=non_norm_train_x, 150 | train_data_y=non_norm_train_y, 151 | test_data_x=non_norm_test_x, 152 | test_data_y=non_norm_test_y, 153 | save_dir=results_folder) 154 | # 3.3) save a summary of the eqs found & figure whether to continue to ebf 155 | stability_flag = ResultTracker.summaries_symbolic_results(run_times=analytical_run_times, 156 | percent_of_majority=SYMBOLIC_PERCENT_OF_MAJORITY, 157 | eq_ranking_metric=SYMBOLIC_EQ_RANKING_METRIC, 158 | top_eqs_max_num=SYMBOLIC_TOP_EQS_MAX_NUM, 159 | save_dir=results_folder) 160 | 161 | ebs_flag = p_value_flag or stability_flag 162 | else: 163 | ebs_flag = False 164 | 165 | # 4) continue to the EBS 166 | if ebs_flag or force_ebs_bool: 167 | # 4.1) run EBS multiple times 168 | all_ebs_scores, best_ebs_model = EBS.run_and_analyze(run_times=analytical_run_times, 169 | non_normalized_data=train_data_x, 170 | 
performance_metric=function_mapper[
171 |                                                                      "better_symbolic_reg_fitness"],
172 |                                                                  cores=-1,
173 |                                                                  size_range=ebs_size_range,
174 |                                                                  save_dir=results_folder)
175 |             # 4.2) save the fitting score results
176 |             ResultTracker.ebs_results(model=best_ebs_model,
177 |                                       all_scores=all_ebs_scores,
178 |                                       save_dir=results_folder)
179 | 
--------------------------------------------------------------------------------
/experiments/exp_drag_force.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os, json
3 | import time
4 | import pandas as pd
5 | from datetime import timedelta
6 | from sklearn.metrics import mean_squared_error
7 | from sklearn.model_selection import train_test_split
8 | 
9 | # project imports
10 | from utills.consts import *
11 | from utills.fitness_methods import *
12 | from utills.logger_config import Logger
13 | from algo.equation_brute_search import EBS
14 | from utills.result_tracker import ResultTracker
15 | from algo.multi_tpot_analysis import MultiTPOTrunner
16 | from algo.genetic_algorithm_symbolic_fit import GASF
17 | from data_generators.drag_force_data_generator import DragForceDataGenerator
18 | from algo.genetic_algorithm_feature_selection import GAFS
19 | 
20 | class ExpDragFroce:
21 |     """
22 |     Program receives a dataset with all essential features needed
23 |     to deduce a "noisy" target (drag on sphere).
24 |     Success of both numerical and analytical parts of the program proves
25 |     that the program is able to learn a complex polynomial relation between
26 |     features, even with noisy data.
27 |     """
28 | 
29 |     def __init__(self):
30 |         pass
31 | 
32 |     @staticmethod
33 |     def run(numerical_bool: bool,
34 |             analytical_bool: bool,
35 |             force_ebs_bool: bool):
36 |         """
37 |         Entry point
38 |         """
39 |         # config logging
40 |         start_time = time.time()
41 | 
42 |         # prepare IO
43 |         os.makedirs(os.path.join(os.path.dirname(os.path.dirname(__file__)), DRAG_FORCE_RESULTS_FOLDER_NAME),
44 |                     exist_ok=True)
45 |         Logger(os.path.join(os.path.dirname(os.path.dirname(__file__)),
46 |                             DRAG_FORCE_RESULTS_FOLDER_NAME,
47 |                             "run.log"))
48 | 
49 |         # 1) generate data
50 |         data_path = os.path.join(os.path.dirname(__file__), "..", "data",
51 |                                  "drag_force_{}_samples.csv".format(DRAG_FORCE_NUM_SAMPLES))
52 |         feature_indexes_ranges = DragForceDataGenerator.generate(samples=DRAG_FORCE_NUM_SAMPLES,
53 |                                                                  cd_range=(1, 10),
54 |                                                                  rhoa_range=(30, 50),
55 |                                                                  v_range=(1, 10),
56 |                                                                  d_range=(0.01, 0.1),
57 |                                                                  noise_range=DRAG_FORCE_NOISE_RANGE,
58 |                                                                  save_path=data_path)
59 |         # feature_indexes_ranges = [[i, i] for i in range(len(df.keys()) - 1)]  # uncomment (after df is loaded below) if you don't want grouped feature selection
60 | 
61 |         # 1.1) load data, normalize and split
62 |         df = pd.read_csv(data_path)
63 |         Logger.print('Generated data:\n{}'.format(df.describe()))
64 |         y_col = df.keys()[-1]
65 |         normalized_df = (df - df.min()) / (df.max() - df.min())
66 |         train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(normalized_df.drop([y_col], axis=1),
67 |                                                                                  normalized_df[y_col],
68 |                                                                                  shuffle=True,
69 |                                                                                  test_size=DRAG_FORCE_TEST_SIZE_PORTION,
70 |                                                                                  random_state=RANDOM_STATE)
71 |         # 1.2) log elapsed time
72 |         data_end_time = time.time()
73 |         Logger.print(" --- Finished. Elapsed time: {} ---".format(timedelta(seconds=data_end_time - start_time)))
74 | 
75 |         # 2) continue to the MultiTPOTrunner regression
76 |         Logger.print('Training MultiTPOTrunner:')
77 |         if numerical_bool:
78 |             # 2.1.1) find the best sub-set of features and ML model
79 |             best_gene = GAFS.run(tpot_run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES,
80 |                                  feature_generations=DRAG_FORCE_FEATURE_GENERATIONS_COUNT,
81 |                                  tpot_regressor_generations=DRAG_FORCE_NUMERICAL_GENERATION_COUNT,
82 |                                  feature_population_size=DRAG_FORCE_FEATURE_POP_SIZE,
83 |                                  tpot_regressor_population_size=DRAG_FORCE_NUMERICAL_POP_SIZE,
84 |                                  mutation_rate=DRAG_FORCE_MUTATION_RATE,
85 |                                  feature_indexes_ranges=feature_indexes_ranges,
86 |                                  mutation_w=[val[1]-val[0] for val in feature_indexes_ranges],
87 |                                  royalty=DRAG_FORCE_ROYALTY,
88 |                                  k_fold=K_FOLD,
89 |                                  performance_metric=neg_mean_squared_error_scorer,
90 |                                  train_data_x=train_data_x,
91 |                                  train_data_y=train_data_y,
92 |                                  test_data_x=test_data_x,
93 |                                  test_data_y=test_data_y,
94 |                                  save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)),
95 |                                                        DRAG_FORCE_RESULTS_FOLDER_NAME),
96 |                                  cores=-1)
97 |             # 2.1.2) save results of best model from all runs
98 |             ResultTracker.run(program_part="tpot",
99 |                               run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES,
100 |                               all_scores=best_gene.scoring_history,
101 |                               model=best_gene.model_object,
102 |                               train_data_x=train_data_x.iloc[:, best_gene.feature_indexes],
103 |                               train_data_y=train_data_y,
104 |                               test_data_x=test_data_x.iloc[:, best_gene.feature_indexes],
105 |                               test_data_y=test_data_y,
106 |                               save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)),
107 |                                                     DRAG_FORCE_RESULTS_FOLDER_NAME))
108 |             # 2.1.3) save selected features of best gene
109 |             with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), DRAG_FORCE_RESULTS_FOLDER_NAME, "best_features_selected.json"),
110 |                       "w") as features_file:
111 |                 json.dump({"index": best_gene.feature_indexes,
112 |                            "names": list(test_data_x.columns[best_gene.feature_indexes])},
113 |                           features_file)
114 |             Logger.print("Best gene features: {}".format(list(test_data_x.columns[best_gene.feature_indexes])))
115 |             # 2.1.4) reduce the dataset of non-normalized samples for next part
116 |             df = df.iloc[:, best_gene.feature_indexes+[-1]]
117 |         # 2.2) log elapsed time
118 |         tpot_end_time = time.time()
119 |         Logger.print(" --- Finished. 
Elapsed time: {} ---".format(timedelta(seconds=tpot_end_time - data_end_time))) 120 | 121 | # 3) continue to the symbolic regression 122 | Logger.print('Searching for a symbolic expression:') 123 | if analytical_bool: 124 | # 3.1) run symbolic regressor multiple times 125 | all_s_scores, best_s_model = GASF.run_and_analyze(run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES, 126 | non_normalized_data=df, 127 | performance_metric=function_mapper["better_symbolic_reg_fitness"], 128 | generations=DRAG_FORCE_ANALYTICAL_GENERATION_COUNT, 129 | population_size=DRAG_FORCE_ANALYTICAL_POP_SIZE, 130 | k_fold=K_FOLD, 131 | cores=-1, 132 | parsimony_coefficient=DRAG_FORCE_ANALYTICAL_PARSIMONY_COEFFICIENT, 133 | expected_eq='mul(0.392, mul(cd, mul(rho, mul(v, mul(v, mul(d, d))))))', 134 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 135 | DRAG_FORCE_RESULTS_FOLDER_NAME)) 136 | # 3.2) save results of best model from all runs 137 | ResultTracker.run(program_part="symbolic", 138 | run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES, 139 | all_scores=all_s_scores, 140 | model=best_s_model, 141 | train_data_x=train_data_x, 142 | train_data_y=train_data_y, 143 | test_data_x=test_data_x, 144 | test_data_y=test_data_y, 145 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 146 | DRAG_FORCE_RESULTS_FOLDER_NAME)) 147 | # 3.3) save a summary of the eqs found & figure whether to continue to ebf 148 | ebs_flag = ResultTracker.summaries_symbolic_results(run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES, 149 | percent_of_majority=SYMBOLIC_PERCENT_OF_MAJORITY, 150 | eq_ranking_metric=SYMBOLIC_EQ_RANKING_METRIC, 151 | top_eqs_max_num=SYMBOLIC_TOP_EQS_MAX_NUM, 152 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 153 | DRAG_FORCE_RESULTS_FOLDER_NAME)) 154 | # 3.4) log elapsed time 155 | symbolic_end_time = time.time() 156 | Logger.print("Finished. Elapsed time: {}".format(timedelta(seconds=symbolic_end_time - tpot_end_time))) 157 | 158 | # 4) continue to the EBS 159 | if ebs_flag or force_ebs_bool: 160 | Logger.print('Searching for a symbolic expression using EBF:') 161 | # 4.1) run EBS multiple times 162 | all_ebs_scores, best_ebs_model = EBS.run_and_analyze(run_times=DRAG_FORCE_NUMERICAL_RUN_TIMES, 163 | non_normalized_data=df, 164 | performance_metric=function_mapper["better_symbolic_reg_fitness"], 165 | cores=-1, 166 | size_range=DRAG_FORCE_EBS_SIZE_RANGE, 167 | expected_eq='mul(0.392, mul(cd, mul(rho, mul(v, mul(v, mul(d, d))))))', 168 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 169 | DRAG_FORCE_RESULTS_FOLDER_NAME)) 170 | # 4.2) save the fitting score results 171 | ResultTracker.ebs_results(model=best_ebs_model, 172 | all_scores=all_ebs_scores, 173 | save_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), 174 | DRAG_FORCE_RESULTS_FOLDER_NAME)) 175 | else: 176 | Logger.print("EBS search of a symbolic equation wasn't needed") 177 | # 4.3) log elapsed time 178 | ebs_end_time = time.time() 179 | Logger.print("Finished. 
Elapsed time: {}".format(timedelta(seconds=ebs_end_time - symbolic_end_time)))
180 | 
181 |         # 5) alert results to the user
182 |         Logger.print("TOTAL ELAPSED TIME: {}".format(timedelta(seconds=time.time() - start_time)))
183 | 
--------------------------------------------------------------------------------
/algo/equation_brute_search.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import json
4 | import pandas as pd
5 | from scipy import stats
6 | import concurrent.futures
7 | from sklearn.model_selection import KFold
8 | from sklearn.linear_model import LinearRegression
9 | from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
10 | 
11 | # project imports
12 | from algo.ebs.eq import Eq
13 | from utills.plotter import Plotter
14 | from algo.ebs.eq_node import EqNode
15 | from algo.ebs.eq_functions import *
16 | from utills.fitness_methods import *
17 | from utills.logger_config import Logger
18 | 
19 | 
20 | class EBS:
21 |     """
22 |     This class is responsible for generating a symbolic equation
23 |     of a target value from a given set of features, using a tree-structure brute-search.
24 | 
25 |     The class contains 2 functions:
26 |     1. run: k-fold trains a model and returns it fitted
27 |     2. run_and_analyze: applies the run function multiple times
28 |                         to gain statistical insight into the performance.
29 |     """
30 | 
31 |     # CONSTS #
32 |     DEFAULT_TEST_FIT_FUNCTION = better_symbolic_reg_fitness
33 |     # END - CONSTS #
34 | 
35 |     # CACHE FOR FASTER COMPUTATION #
36 |     TOPOLOGY_TREES = {}
37 |     ALLOCATED_EQS = {}
38 |     # END - CACHE FOR FASTER COMPUTATION #
39 | 
40 |     def __init__(self):
41 |         pass
42 | 
43 |     @staticmethod
44 |     def run(non_normalized_data: pd.DataFrame,
45 |             k_fold: int,
46 |             performance_metric,
47 |             verbose: int,
48 |             size_range: tuple,
49 |             expected_eq='Unknown',
50 |             cores: int = -1):
51 |         """
52 |         Run the EBS algorithm with some hyper-parameters.
53 |         Initially the model is trained on k-fold splits of the data,
54 |         and then on the dataset as a whole.
55 |         The model of the latter case is returned.
56 |         """
57 |         y_col = non_normalized_data.keys()[-1]
58 |         x_values = non_normalized_data.drop([y_col], axis=1)
59 |         y_values = non_normalized_data[y_col]
60 |         # make a k-fold cross validation so we can trust the results better
61 |         kf = KFold(n_splits=k_fold)
62 |         scores = []
63 |         fold_index = 1
64 |         for train_index, test_index in kf.split(x_values):
65 |             # report the current fold
66 |             Logger.print(message="Equation brute force fold {}".format(fold_index))
67 |             fold_index += 1
68 |             # prepare data
69 |             X_train, X_test = x_values.iloc[train_index, :], x_values.iloc[test_index, :]
70 |             y_train, y_test = y_values.iloc[train_index], y_values.iloc[test_index]
71 |             # prepare model
72 |             eq, best_score, answer = EBS._search_equation(x=X_train,
73 |                                                           y=y_train,
74 |                                                           performance_metric=performance_metric,
75 |                                                           verbose=verbose,
76 |                                                           cores=cores,
77 |                                                           size_range=size_range)
78 |             y_pred = eq.eval(X_test)
79 |             score = performance_metric(y_test, y_pred) if not isinstance(performance_metric, str) else function_mapper[
80 |                 performance_metric](y_test, y_pred)
81 |             scores.append(score)
82 | 
83 |         # train a symbolic regression on all the data, it is at least as good as the previous ones
84 |         eq, best_score, answer = EBS._search_equation(x=x_values,
85 |                                                       y=y_values,
86 |                                                       performance_metric=performance_metric,
87 |                                                       verbose=verbose,
88 |                                                       cores=cores,
89 |                                                       size_range=size_range)
90 |         # if we want to compare to the expected equation
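        # (the expected equation, when supplied, is only used for the side-by-side
        #  log line below; it does not steer the brute search itself)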
91 |         if expected_eq != 'Unknown':
92 |             Logger.print(message='Expected eq: {}, Found eq: {}'.format(expected_eq,
93 |                                                                         eq.to_string()))
94 |         else:
95 |             Logger.print(message='Found eq: {}'.format(eq.to_string()))
96 |         return eq
97 | 
98 |     @staticmethod
99 |     def run_and_analyze(run_times: int,
100 |                         non_normalized_data: pd.DataFrame,
101 |                         performance_metric,
102 |                         save_dir: str,
103 |                         size_range: tuple,
104 |                         expected_eq='Unknown',
105 |                         cores: int = -1):
106 |         """
107 |         Run the EBS algorithm several times and save results from all runs.
108 |         Returns a pandas dataframe of all results and the best model from
109 |         all runs.
110 | 
111 |         :param size_range: start, end, and step size of the EQ tree's number of nodes
112 |         """
113 |         results = pd.DataFrame()
114 |         y_col = non_normalized_data.keys()[-1]
115 |         x_values = non_normalized_data.drop(y_col, axis=1)
116 |         y_values = non_normalized_data[y_col]
117 |         current_best_wanted_loss = 99999
118 |         best_model = None
119 |         for test in range(run_times):
120 |             Logger.print(message="run {}".format(test + 1))
121 |             eq, best_score, answer = EBS._search_equation(x=x_values,
122 |                                                           y=y_values,
123 |                                                           verbose=1 if test == 0 else 0,
124 |                                                           performance_metric=performance_metric,
125 |                                                           size_range=size_range,
126 |                                                           cores=cores)
127 |             pred = eq.eval(x_values)
128 |             # save test scores
129 |             try:
130 |                 wanted_loss = performance_metric(y_values, pred)
131 |             except Exception as error:
132 |                 wanted_loss = EBS.DEFAULT_TEST_FIT_FUNCTION(y_values, pred)
133 |             results.at[test, "wanted_loss"] = wanted_loss
134 |             results.at[test, "mae"] = mean_absolute_error(y_values, pred)
135 |             results.at[test, "mse"] = mean_squared_error(y_values, pred)
136 |             results.at[test, "r2"] = r2_score(y_values, pred)
137 |             results.at[test, "t_test_p_value"] = stats.ttest_ind(y_values, pred)[1]
138 |             results.at[test, "found_eq"] = eq.to_string()
139 |             if wanted_loss < current_best_wanted_loss or best_model is None:
140 |                 best_model = eq
141 |                 current_best_wanted_loss = wanted_loss
142 | 
143 |         # print and save scoring results of all runs
144 |         Logger.print(message="Finished all EBS runs - ")
145 |         if expected_eq != 'Unknown':
146 |             Logger.print(message='Expected eq: {}, Found eq: {}'.format(expected_eq,
147 |                                                                         best_model.to_string()))
148 |         else:
149 |             Logger.print(message='Found eq: {}'.format(best_model.to_string()))
150 |         [Logger.print(message="{}: {:.3} +- {:.3}".format(score, results[score].mean(), results[score].std()))
151 |          for score in ["mae", "mse", "r2", "t_test_p_value"]]
152 |         results.to_csv(os.path.join(save_dir, "ebs_scoring_history.csv"))
153 |         # plot best model's predictions vs true values
154 |         Plotter.y_test_vs_y_pred(model=best_model,
155 |                                  x_test=x_values,
156 |                                  y_test=y_values,
157 |                                  save_path=os.path.join(save_dir, "ebs_target_vs_pred.pdf"))
158 |         return results, best_model
159 | 
160 |     @staticmethod
161 |     def _search_equation(x: pd.DataFrame,
162 |                          y: pd.Series,
163 |                          verbose: int,
164 |                          performance_metric,
165 |                          size_range: tuple,
166 |                          cores: int) -> tuple:
167 |         """
168 |         Search for the equation
169 |         # TODO: think how to use multi-thread later
170 |         """
171 |         # run over the needed range to generate all possible tree topologies
172 |         for n in size_range:
173 |             if verbose == 1:
174 |                 Logger.print(message="EBS._search_equation: Generating all possible binary tree topologies for size {}".format(n))
175 | 
176 |             if n < 1:  # full binary trees are only defined for positive (odd) sizes
177 |                 continue
178 |             elif (n % 2) == 0:
179 |                 EBS.TOPOLOGY_TREES[n] = []
180 |             elif n == 1:
181 |                 EBS.TOPOLOGY_TREES[1] = [Eq(tree=EqNode(value=None))]
182 |             elif n not in EBS.TOPOLOGY_TREES:  # do not calc the same topology twice
183 |                 EBS.TOPOLOGY_TREES[n] = EBS._all_possible_fbt(n=n)
184 |         # find best equation for the data
185 |         answer = {}
186 |         best_eq = ""
187 |         best_score = 9999
188 |         # TODO: replace the organized search with a random Monte-Carlo sample later in production once the mapping order is fixed
189 |         # run over the tree topologies sizes
190 |         for n in size_range:
191 |             if verbose == 1:
192 |                 Logger.print(message="EBS._search_equation: Testing {} possible binary tree topologies for size {}".format(len(EBS.TOPOLOGY_TREES[n]), n))
193 | 
194 |             # run over all tree populations
195 |             for tree_topology in EBS.TOPOLOGY_TREES[n]:
196 |                 tree_topology.fix_nodes()  # just to make sure the meta-values are fine
197 |                 # avoid computing the same allocations twice: reuse the cached list if we have it, compute it if we don't
198 |                 if tree_topology.to_id_str() not in EBS.ALLOCATED_EQS:
199 |                     # populate each tree with all possible combinations
200 |                     possible_trees = EBS._populate_tree(eq=tree_topology,
201 |                                                         not_leaf_values=FUNCTION_LIST,
202 |                                                         leaf_values=list(x))
203 |                     EBS.ALLOCATED_EQS[tree_topology.to_id_str()] = possible_trees  # recall the allocation list
204 |                 else:
205 |                     possible_trees = EBS.ALLOCATED_EQS[tree_topology.to_id_str()]
206 | 
207 |                 if verbose == 1:
208 |                     Logger.print(message="EBS._search_equation: Found {} possible populated trees to check for this topology".format(len(possible_trees)))
209 |                 # for each combination compute performance
210 |                 for eq_index, this_eq in enumerate(possible_trees):
211 |                     # TODO: change this magic number later
212 |                     if (eq_index % 100) == 0 and verbose == 1:
213 |                         Logger.print(message="EBS._search_equation: Test of {} / {} ({:.3f}%) equations done".format(eq_index, len(possible_trees), eq_index*100/len(possible_trees)))
214 |                     try:
215 |                         y_pred = this_eq.eval(x_values=x)
216 |                         reg = LinearRegression().fit([[val] for val in y_pred], y)
217 |                         this_eq.linear_a = reg.coef_[0]
218 |                         this_eq.linear_b = reg.intercept_
219 |                         y_pred = this_eq.linear_a * y_pred + this_eq.linear_b  # re-calibrate results
220 |                         score = performance_metric(y, y_pred)  # calc the performance
221 |                         answer[this_eq.to_string()] = score
222 |                         if score < best_score:
223 |                             best_score = score
224 |                             best_eq = this_eq
225 |                     except Exception as error:
226 |                         # skip equations that fail to evaluate on the given data
227 |                         Logger.debug(message="Error at EBS._search_equation, saying: {}".format(error))
228 |         return best_eq, best_score, answer
229 | 
230 |     @staticmethod
231 |     def _all_possible_fbt(n: int) -> list:
232 |         return Eq.all_possible_fbt(n=n)
233 | 
234 |     @staticmethod
235 |     def _populate_tree(eq: Eq,
236 |                        not_leaf_values: list,
237 |                        leaf_values: list) -> list:
238 |         """
239 |         Gets a tree topology and returns all possible value allocations to the tree
240 |         :param eq: the tree topology
241 |         :param not_leaf_values: the functions that can be allocated to the non-leaf nodes
242 |         :param leaf_values: the leaves' values
243 |         :return: all possible allocations of the values to the given topology
244 |         """
245 |         return eq.populate(not_leaf_values=not_leaf_values,
246 |                            leaf_values=leaf_values)
247 | 
248 |     def __repr__(self):
249 |         return self.__str__()
250 | 
251 |     def __str__(self):
252 |         return ""
253 | 
--------------------------------------------------------------------------------
/data_generators/steady_free_fall_with_drag_data_generator.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import numpy as np
4 | import pandas as pd
5 | from tqdm import tqdm
6 | 
7 | # project imports
8 | from utills.consts import *
9 | from utills.logger_config import Logger
10 | from data_generators.N_frequency_generator import NFrequencyGenerator
11 | 
12 | 
13 | class SFFWDdataGenerator:
14 |     """
15 |     This class is responsible for the generation of the physical
16 |     steady (no acceleration) free-fall-with-drag data to test the model.
17 |     The motion is dictated by:
18 |     0 = Weight - Buoyancy - 0.5*Cd*rhoa*V*V*area
19 |     -> Cd = (rhop - rhoa) * volume * g * 2/(rhoa*V*V*area)
20 | 
21 |     Variables in this file:
22 |     g    : gravitational acceleration [m/s2]
23 |     rhop : particle density [kg/m3]
24 |     d    : particle diameter [m]
25 |     rhoa : fluid density [kg/m3]
26 |     V    : settling velocity [m/s]
27 |     nu   : kinematic viscosity of the fluid [m2/s]
28 |     Re   : Reynolds number [-]
29 | 
30 |     This class is responsible for producing a pandas DataFrame for three different model experiments.
31 |     """
32 | 
33 |     def __init__(self):
34 |         pass
35 | 
36 |     @staticmethod
37 |     def generate_noiseless(samples: int,
38 |                            rhoa_range: tuple,
39 |                            rhop_range: tuple,
40 |                            nu_range: tuple,
41 |                            re_range: tuple,
42 |                            show_progress_bar: bool = True):
43 |         """
44 |         Generates a pandas dataframe of experiments to represent steady free fall measurements.
45 |         """
46 |         rhoa_range_delta = rhoa_range[1] - rhoa_range[0]
47 |         rhop_range_delta = rhop_range[1] - rhop_range[0]
48 |         nu_range_delta = nu_range[1] - nu_range[0]
49 |         re_range_delta = re_range[1] - re_range[0]
50 |         data = []
51 |         # generate samples
52 |         pbar = tqdm(total=samples, desc="Generating baseline data") if show_progress_bar else None
53 |         for sample_index in range(samples):
54 |             # sample data from ranges
55 |             rhoa = np.random.random_sample() * rhoa_range_delta + rhoa_range[0]
56 |             nu = np.random.random_sample() * nu_range_delta + nu_range[0]
57 |             re = np.random.random_sample() * re_range_delta + re_range[0]
58 |             # calc rhop from rhoa with range
59 |             rhop = rhoa + np.random.random_sample() * rhop_range_delta + rhop_range[0]
60 |             # calc Cd from Re according to a known drag-to-Reynolds relation
61 |             cd = 0.4 + 24.0 / re + 6.0 / (1 + re ** 0.5)
62 |             # calc 'd' from the other parameters; 13.08 = 4*g/3 (g = 9.81 [m/s2]), from Cd = 4*g*d*(rhop - rhoa)/(3*rhoa*V*V) with V = Re*nu/d
63 |             if rhop == rhoa:  # just to make sure we will not divide by zero
64 |                 raise ZeroDivisionError("Rhop cannot be equal to rhoa")
65 |             d = np.power((cd * re * re * nu * nu * rhoa) / (13.08 * (rhop - rhoa)), 1 / 3)
66 |             # recalculate 'v'
67 |             v = re * nu / d
68 |             # add the data and alert the user by a progress bar, if needed
69 |             data.append([rhoa, v, d, rhop, nu, cd])
70 |             if show_progress_bar:
71 |                 pbar.update(1)
72 | 
73 |         if show_progress_bar:
74 |             pbar.close()
75 |         # make a Pandas.DataFrame and save it as a CSV file
76 |         df = pd.DataFrame(data=data, columns=["rhoa", "V", "d", "rhop", "nu", "Cd"])
77 |         df.to_csv(os.path.join(DATA_FOLDER,
78 |                                "SFF_noiseless_baseline_{}_samples.csv".format(SFF_N_SAMPLES_STR)),
79 |                   index=False)
80 |         return df
81 | 
82 |     @staticmethod
83 |     def generate_case_1(samples: int,
84 |                         rhoa_range: tuple,
85 |                         rhop_range: tuple,
86 |                         nu_range: tuple,
87 |                         re_range: tuple,
88 |                         save_path: str,
89 |                         dropped_param: str = SFF_1_DROP_PARAM,
90 |                         force: bool = FORCE_DATA_OVERRIDE_FLAG):
91 |         """
92 |         Generate a pandas dataframe with only 3 out of 4 needed features to calc Cd.
93 |         Saves the dataframe for model experiment.
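        The omitted feature is chosen by the dropped_param argument
        (SFF_1_DROP_PARAM by default); the viscosity nu is dropped as well.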
94 |         """
95 |         baseline_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
96 |                                      DATA_FOLDER,
97 |                                      "SFF_noiseless_baseline_{}_samples.csv".format(SFF_N_SAMPLES_STR))
98 |         if os.path.exists(baseline_path) and not force:
99 |             df = pd.read_csv(baseline_path)
100 |         else:
101 |             df = SFFWDdataGenerator.generate_noiseless(samples=samples,
102 |                                                        rhoa_range=rhoa_range,
103 |                                                        nu_range=nu_range,
104 |                                                        re_range=re_range,
105 |                                                        rhop_range=rhop_range)
106 |         # alert user
107 |         re = df["d"] * df["V"] / df["nu"]
108 |         Logger.print('Generated data with Re: min={:.4} max={:.4}'.format(re.min(), re.max()))
109 |         # build case
110 |         df.drop([dropped_param, 'nu'], axis=1).to_csv(save_path, index=False)
111 |         # alert user that feature selection isn't needed
112 |         feature_indexes_ranges = "Not applicable"
113 |         return feature_indexes_ranges
114 | 
115 |     @staticmethod
116 |     def generate_case_2(samples: int,
117 |                         rhoa_range: tuple,
118 |                         rhop_range: tuple,
119 |                         nu_range: tuple,
120 |                         re_range: tuple,
121 |                         save_path: str,
122 |                         noise_range: tuple = SFF_CASE_2_NOISE_RANGE,
123 |                         force: bool = FORCE_DATA_OVERRIDE_FLAG):
124 |         """
125 |         Generate a pandas dataframe with rhoa,V,d,rhop,Cd measurements.
126 |         Adds noise to Cd, and saves the dataframe for model experiment.
127 |         """
128 |         baseline_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
129 |                                      DATA_FOLDER,
130 |                                      "SFF_noiseless_baseline_{}_samples.csv".format(SFF_N_SAMPLES_STR))
131 |         if os.path.exists(baseline_path) and not force:
132 |             df = pd.read_csv(baseline_path)
133 |         else:
134 |             df = SFFWDdataGenerator.generate_noiseless(samples=samples,
135 |                                                        rhoa_range=rhoa_range,
136 |                                                        nu_range=nu_range,
137 |                                                        re_range=re_range,
138 |                                                        rhop_range=rhop_range)
139 |         # alert user
140 |         re = df["d"] * df["V"] / df["nu"]
141 |         Logger.print('Generated data with Re: min={:.4} max={:.4}'.format(re.min(), re.max()))
142 |         # add noise to target (one multiplicative noise factor, with a random sign, is drawn for the whole dataset)
143 |         noise_range_delta = noise_range[1] - noise_range[0]
144 |         noise = (np.random.random_sample() * noise_range_delta + noise_range[0]) * np.random.choice((-1, 1))
145 |         df.Cd = [val * (1 + noise) for val in df["Cd"]]
146 |         # build case
147 |         df.drop("nu", axis=1).to_csv(save_path, index=False)
148 |         # alert user that feature selection isn't needed
149 |         feature_indexes_ranges = "Not applicable"
150 |         return feature_indexes_ranges
151 | 
152 |     @staticmethod
153 |     def generate_case_2_with_guess(samples: int,
154 |                                    rhoa_range: tuple,
155 |                                    rhop_range: tuple,
156 |                                    nu_range: tuple,
157 |                                    re_range: tuple,
158 |                                    save_path: str,
159 |                                    noise_range: tuple = SFF_CASE_2_NOISE_RANGE,
160 |                                    force: bool = FORCE_DATA_OVERRIDE_FLAG):
161 |         """
162 |         Generate a pandas dataframe similar to that of case 2.
163 |         Adds two educated guesses (rhop-rhoa, and V^2),
164 |         and saves the dataframe for model experiment.
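        Both guesses are terms of the analytic relation
        Cd = 4*g*d*(rhop - rhoa) / (3*rhoa*V^2), so the symbolic
        search should be able to recover a compact expression in them.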
165 | """ 166 | baseline_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 167 | DATA_FOLDER, 168 | "SFF_noiseless_baseline_{}_samples.csv".format(SFF_N_SAMPLES_STR)) 169 | if os.path.exists(baseline_path) and not force: 170 | df = pd.read_csv(baseline_path) 171 | else: 172 | df = SFFWDdataGenerator.generate_noiseless(samples=samples, 173 | rhoa_range=rhoa_range, 174 | nu_range=nu_range, 175 | re_range=re_range, 176 | rhop_range=rhop_range) 177 | # alert user 178 | re = df["d"] * df["V"] / df["nu"] 179 | Logger.print('Generated data with Re: min={:.4} max={:.4}'.format(re.min(), re.max())) 180 | # add noise to target 181 | noise_range_delta = noise_range[1] - noise_range[0] 182 | noise = (np.random.random_sample() * noise_range_delta + noise_range[0]) * np.random.choice((-1, 1)) 183 | df.Cd = [val * (1 + noise) for val in df["Cd"]] 184 | # build case 185 | delta_rho = df["rhop"] - df["rhoa"] 186 | df.insert(0, "delta_rho", delta_rho) 187 | vel_squared = df["V"] * df["V"] 188 | df.insert(0, "V^2", vel_squared) 189 | df.drop("nu", axis=1).to_csv(save_path, index=False) 190 | # alert user that feature selection isn't needed 191 | feature_indexes_ranges = "Not applicable" 192 | return feature_indexes_ranges 193 | 194 | @staticmethod 195 | def generate_case_3(samples: int, 196 | rhoa_range: tuple, 197 | rhop_range: tuple, 198 | nu_range: tuple, 199 | re_range: tuple, 200 | save_path: str, 201 | force: bool = FORCE_DATA_OVERRIDE_FLAG): 202 | """ 203 | Generate a pandas dataframe with rhoa,V,d,rhop,nu,Cd measurements. 204 | Uses the dataframe to create a dataframe of dimensionless features, 205 | and saves it for model experiment. 206 | """ 207 | baseline_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 208 | DATA_FOLDER, 209 | "SFF_noiseless_baseline_{}_samples.csv".format(SFF_N_SAMPLES_STR)) 210 | if os.path.exists(baseline_path) and not force: 211 | df = pd.read_csv(baseline_path) 212 | else: 213 | df = SFFWDdataGenerator.generate_noiseless(samples=samples, 214 | rhoa_range=rhoa_range, 215 | nu_range=nu_range, 216 | re_range=re_range, 217 | rhop_range=rhop_range) 218 | # alert user 219 | re = df["d"] * df["V"] / df["nu"] 220 | Logger.print('Generated data with Re: min={:.4} max={:.4}'.format(re.min(), re.max())) 221 | # add all possible N frequency combinations 222 | df, n_suffix = NFrequencyGenerator.add_all_combos(df=df, 223 | g=g_force) 224 | # create non-dimensional features 225 | features = pd.DataFrame() 226 | # density ratio: 227 | features["rhop/rhoa"] = df["rhop"] / df["rhoa"] 228 | # density delta ratio: 229 | features["delta_rho/rhoa"] = (df["rhop"] - df["rhoa"]) / df["rhoa"] 230 | # Reynolds number: 231 | features["Re"] = re 232 | # Unknown number - nu*g/V**3: 233 | features["nu*g/V**3"] = g_force * df["nu"] / df["V"] ** 3 234 | # Unknown number - g*d/V**2: 235 | features["g*d/V**2"] = g_force * df["d"] / df["V"] ** 2 236 | # add 4 more groups of features 237 | for suff in n_suffix: 238 | # Froude number: 239 | features["Fr{}".format(suff)] = df["V"] / (df["d"] * df["N{}".format(suff)]) 240 | # Froude number from acceleration - g/(V*N_i): 241 | features["AccFr{}".format(suff)] = g_force / (df["V"] * df["N{}".format(suff)]) 242 | # Unknown number (Num1) - g*d/(nu*N_i) 243 | features["1Num{}".format(suff)] = g_force * df["d"] / (df["nu"] * df["N{}".format(suff)]) 244 | # Unknown number (Num2) - V*V/(nu*N_i) 245 | features["2Num{}".format(suff)] = df["V"] * df["V"] / (df["nu"] * df["N{}".format(suff)]) 246 | # reorder column names to be in groups 247 | 
features = features[sorted(features.keys(), key=lambda x: x[0])]
248 |         # set index ranges for all 9 feature groups
249 |         feature_indexes_ranges = []
250 |         index = 0
251 |         for i in range(9):
252 |             if i < 4:
253 |                 feature_indexes_ranges.append([index, index + len(n_suffix) - 1])
254 |                 index += len(n_suffix)
255 |             else:
256 |                 feature_indexes_ranges.append([index, index])
257 |                 index += 1
258 |         # add target
259 |         features["Cd"] = df["Cd"]
260 |         # save the result to a csv file
261 |         features.to_csv(save_path, index=False)
262 | 
263 |         return feature_indexes_ranges
264 | 
--------------------------------------------------------------------------------
/experiments/exp_steady_free_fall_with_drag.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import json
4 | import time
5 | import pandas as pd
6 | from datetime import timedelta
7 | from sklearn.model_selection import train_test_split
8 | 
9 | # project imports
10 | from utills.consts import *
11 | from utills.fitness_methods import *
12 | from utills.logger_config import Logger
13 | from algo.equation_brute_search import EBS
14 | from utills.result_tracker import ResultTracker
15 | from algo.multi_tpot_analysis import MultiTPOTrunner
16 | from algo.genetic_algorithm_symbolic_fit import GASF
17 | from algo.genetic_algorithm_feature_selection import GAFS
18 | 
19 | 
20 | class ExpSFF:
21 |     """
22 |     A parent class of the SFF experiments, responsible for
23 |     the run function for all SFF cases.
24 |     Here, only the data generation function changes from
25 |     case to case.
26 |     """
27 | 
28 |     def __init__(self):
29 |         pass
30 | 
31 |     @staticmethod
32 |     def run(numerical_bool: bool,
33 |             analytical_bool: bool,
34 |             force_ebs_bool: bool,
35 |             results_folder: str,
36 |             data_path: str,
37 |             data_generation_function,
38 |             numerical_run_times: int,
39 |             numerical_generations: int,
40 |             numerical_population: int,
41 |             analytical_run_times: int,
42 |             analytical_generations: int,
43 |             analytical_population: int,
44 |             parsimony_coefficient: float,
45 |             k_fold: int,
46 |             samples: int,
47 |             rhoa_range: tuple,
48 |             rhop_range: tuple,
49 |             nu_range: tuple,
50 |             re_range: tuple,
51 |             ebs_size_range: tuple,
52 |             expected_eq: str = "Unknown",  # capital-U "Unknown" is the sentinel EBS checks against
53 |             feature_selection_generations: int = None,
54 |             feature_selection_pop_size: int = None,
55 |             feature_selection_mutation_rate: float = None,
56 |             feature_selection_royalty: float = None):
57 | 
58 |         # config logging
59 |         start_time = time.time()
60 | 
61 |         # prepare IO
62 |         os.makedirs(results_folder, exist_ok=True)
63 |         Logger(os.path.join(os.path.dirname(os.path.dirname(__file__)),
64 |                             results_folder,
65 |                             "run.log"))
66 |         # 1) generate data
67 |         feature_indexes_ranges = data_generation_function(samples=samples,
68 |                                                           rhoa_range=rhoa_range,
69 |                                                           nu_range=nu_range,
70 |                                                           re_range=re_range,
71 |                                                           rhop_range=rhop_range,
72 |                                                           save_path=data_path)
73 |         # 1.1) load data, normalize and split
74 |         df = pd.read_csv(data_path)
75 |         Logger.print('Generated data:\n{}'.format(df.describe()))
76 |         y_col = df.keys()[-1]
77 |         normalized_df = (df - df.min()) / (df.max() - df.min())
78 |         train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(normalized_df.drop([y_col], axis=1),
79 |                                                                                  normalized_df[y_col],
80 |                                                                                  shuffle=True,
81 |                                                                                  test_size=SFF_TEST_SIZE_PORTION,
82 |                                                                                  random_state=RANDOM_STATE)
83 |         # 1.2) log elapsed time
84 |         data_end_time = time.time()
85 |         Logger.print(" --- Finished. 
Elapsed time: {} ---".format(timedelta(seconds=data_end_time - start_time))) 86 | 87 | # 2) continue to the MultiTPOTrunner regression 88 | Logger.print('Training MultiTPOTrunner:') 89 | if numerical_bool: 90 | # 2.1) run multi-tpot analysis if feature selection isn't needed 91 | if feature_indexes_ranges == "Not applicable": 92 | # 2.1.1) find the best ML model for all the data 93 | all_t_scores, best_t_model = MultiTPOTrunner.run_and_analyze(run_times=numerical_run_times, 94 | train_data_x=train_data_x, 95 | train_data_y=train_data_y, 96 | test_data_x=test_data_x, 97 | test_data_y=test_data_y, 98 | generations=numerical_generations, 99 | population_size=numerical_population, 100 | k_fold=k_fold, 101 | performance_metric=neg_mean_squared_error_scorer, 102 | save_dir=results_folder, 103 | n_jobs=-1) 104 | # 2.1.2) save results of best model from all runs 105 | ResultTracker.run(program_part="tpot", 106 | run_times=numerical_run_times, 107 | all_scores=all_t_scores, 108 | model=best_t_model, 109 | train_data_x=train_data_x, 110 | train_data_y=train_data_y, 111 | test_data_x=test_data_x, 112 | test_data_y=test_data_y, 113 | save_dir=results_folder) 114 | # 2.2) run multi-tpot analysis with feature selection 115 | else: 116 | # 2.2.1) find the best ML model for a subset of the data 117 | best_gene = GAFS.run(tpot_run_times=numerical_run_times, 118 | feature_generations=feature_selection_generations, 119 | tpot_regressor_generations=numerical_generations, 120 | feature_population_size=feature_selection_pop_size, 121 | tpot_regressor_population_size=numerical_population, 122 | mutation_rate=feature_selection_mutation_rate, 123 | feature_indexes_ranges=feature_indexes_ranges, 124 | mutation_w=[val[1]-val[0] for val in feature_indexes_ranges], 125 | royalty=feature_selection_royalty, 126 | k_fold=k_fold, 127 | performance_metric=neg_mean_squared_error_scorer, 128 | train_data_x=train_data_x, 129 | train_data_y=train_data_y, 130 | test_data_x=test_data_x, 131 | test_data_y=test_data_y, 132 | save_dir=results_folder, 133 | cores=-1) 134 | # 2.2.2) save results of best model from all runs 135 | ResultTracker.run(program_part="tpot", 136 | run_times=numerical_run_times, 137 | all_scores=best_gene.scoring_history, 138 | model=best_gene.model_object, 139 | train_data_x=train_data_x.iloc[:, best_gene.feature_indexes], 140 | train_data_y=train_data_y, 141 | test_data_x=test_data_x.iloc[:, best_gene.feature_indexes], 142 | test_data_y=test_data_y, 143 | save_dir=results_folder) 144 | # 2.2.3) save selected features of best gene 145 | with open(os.path.join(os.path.dirname(__file__), results_folder, "best_features_selected.json"), 146 | "w") as features_file: 147 | json.dump({"index": best_gene.feature_indexes, 148 | "names": list(test_data_x.columns[best_gene.feature_indexes])}, 149 | features_file) 150 | Logger.print("Best gene features: {}".format(list(test_data_x.columns[best_gene.feature_indexes]))) 151 | # 2.2.4) reduce the dataset of non-normalized samples for next part 152 | df = df.iloc[:, best_gene.feature_indexes+[-1]] 153 | # 2.3) log elapsed time 154 | tpot_end_time = time.time() 155 | symbolic_end_time = time.time() 156 | Logger.print(" --- Finished. 
Elapsed time: {} ---".format(timedelta(seconds=tpot_end_time-data_end_time)))
157 | 
158 |         # 3) continue to the symbolic regression
159 |         if analytical_bool:
160 |             Logger.print('Searching for a symbolic expression:')
161 |             # 3.1) run symbolic regressor multiple times
162 |             all_s_scores, best_s_model = GASF.run_and_analyze(run_times=analytical_run_times,
163 |                                                               non_normalized_data=df,
164 |                                                               performance_metric=function_mapper["better_symbolic_reg_fitness"],
165 |                                                               generations=analytical_generations,
166 |                                                               population_size=analytical_population,
167 |                                                               k_fold=k_fold,
168 |                                                               cores=-1,
169 |                                                               parsimony_coefficient=parsimony_coefficient,
170 |                                                               expected_eq=expected_eq,
171 |                                                               save_dir=results_folder)
172 |             # 3.2) save results of best model from all runs
173 |             non_norm_train_x, non_norm_test_x, non_norm_train_y, non_norm_test_y = train_test_split(df.drop([y_col], axis=1),
174 |                                                                                                     df[y_col],
175 |                                                                                                     shuffle=True,
176 |                                                                                                     test_size=SFF_TEST_SIZE_PORTION,
177 |                                                                                                     random_state=RANDOM_STATE)
178 |             p_value_flag = ResultTracker.run(program_part="symbolic",
179 |                                              run_times=analytical_run_times,
180 |                                              all_scores=all_s_scores,
181 |                                              model=best_s_model,
182 |                                              train_data_x=non_norm_train_x,
183 |                                              train_data_y=non_norm_train_y,
184 |                                              test_data_x=non_norm_test_x,
185 |                                              test_data_y=non_norm_test_y,
186 |                                              save_dir=results_folder)
187 |             # 3.3) save a summary of the eqs found & decide whether to continue to the EBS
188 |             stability_flag = ResultTracker.summaries_symbolic_results(run_times=analytical_run_times,
189 |                                                                       percent_of_majority=SYMBOLIC_PERCENT_OF_MAJORITY,
190 |                                                                       eq_ranking_metric=SYMBOLIC_EQ_RANKING_METRIC,
191 |                                                                       top_eqs_max_num=SYMBOLIC_TOP_EQS_MAX_NUM,
192 |                                                                       save_dir=results_folder)
193 | 
194 |             ebs_flag = p_value_flag or stability_flag
195 |             # 3.4) log elapsed time
196 |             symbolic_end_time = time.time()
197 |             Logger.print("Finished. Elapsed time: {}".format(timedelta(seconds=symbolic_end_time - tpot_end_time)))
198 |         else:
199 |             ebs_flag = False
200 | 
201 |         # 4) continue to the EBS
202 |         if ebs_flag or force_ebs_bool:
203 |             Logger.print('Searching for a symbolic expression using EBF:')
204 |             # 4.1) run EBS multiple times
205 |             all_ebs_scores, best_ebs_model = EBS.run_and_analyze(run_times=analytical_run_times,
206 |                                                                  non_normalized_data=df,
207 |                                                                  performance_metric=function_mapper[
208 |                                                                      "better_symbolic_reg_fitness"],
209 |                                                                  cores=-1,
210 |                                                                  size_range=ebs_size_range,
211 |                                                                  expected_eq=expected_eq,
212 |                                                                  save_dir=results_folder)
213 |             # 4.2) save the fitting score results
214 |             ResultTracker.ebs_results(model=best_ebs_model,
215 |                                       all_scores=all_ebs_scores,
216 |                                       save_dir=results_folder)
217 |         else:
218 |             Logger.print("EBF search of a symbolic equation wasn't needed")
219 |         # 4.3) log elapsed time
220 |         ebs_end_time = time.time()
221 |         Logger.print("Finished. Elapsed time: {}".format(timedelta(seconds=ebs_end_time - symbolic_end_time)))
222 | 
223 |         # 5) alert results to the user
224 |         Logger.print("\n --- TOTAL ELAPSED TIME: {} ---".format(timedelta(seconds=time.time() - start_time)))
225 | 
--------------------------------------------------------------------------------
/experiments/exp_noise.py:
--------------------------------------------------------------------------------
1 | # library imports
2 | import os
3 | import json
4 | import time
5 | import pandas as pd
6 | from datetime import timedelta
7 | from sklearn.model_selection import train_test_split
8 | 
9 | # project imports
10 | from utills.consts import *
11 | from utills.fitness_methods import *
12 | from utills.logger_config import Logger
13 | from algo.equation_brute_search import EBS
14 | from utills.result_tracker import ResultTracker
15 | from algo.multi_tpot_analysis import MultiTPOTrunner
16 | from algo.genetic_algorithm_symbolic_fit import GASF
17 | from algo.genetic_algorithm_feature_selection import GAFS
18 | 
19 | 
20 | class ExpNoise:
21 |     """
22 |     Runs an SFF-style experiment repeatedly, once per noise level
23 |     in noise_list, to test how the injected measurement noise
24 |     affects the numerical and analytical parts of the pipeline.
25 |     Otherwise, the flow matches the SFF experiments.
26 |     """
27 | 
28 |     def __init__(self):
29 |         pass
30 | 
31 |     @staticmethod
32 |     def run(numerical_bool: bool,
33 |             analytical_bool: bool,
34 |             force_ebs_bool: bool,
35 |             results_folder: str,
36 |             data_path: str,
37 |             data_generation_function,
38 |             numerical_run_times: int,
39 |             numerical_generations: int,
40 |             numerical_population: int,
41 |             analytical_run_times: int,
42 |             analytical_generations: int,
43 |             analytical_population: int,
44 |             parsimony_coefficient: float,
45 |             k_fold: int,
46 |             samples: int,
47 |             rhoa_range: tuple,
48 |             rhop_range: tuple,
49 |             nu_range: tuple,
50 |             re_range: tuple,
51 |             ebs_size_range: tuple,
52 |             noise_list: list,
53 |             expected_eq: str = "Unknown",  # capital-U "Unknown" is the sentinel EBS checks against
54 |             feature_selection_generations: int = None,
55 |             feature_selection_pop_size: int = None,
56 |             feature_selection_mutation_rate: float = None,
57 |             feature_selection_royalty: float = None):
58 |         global SFF_CASE_2_NOISE_RANGE
59 | 
60 |         answer = []
61 |         for noise in noise_list:
62 |             start_time = time.time()  # time this noise level; prepare IO next
63 |             os.makedirs(results_folder, exist_ok=True)
64 |             Logger(os.path.join(os.path.dirname(os.path.dirname(__file__)),
65 |                                 results_folder,
66 |                                 "run.log"))
67 |             # 1) generate data
68 |             SFF_CASE_2_NOISE_RANGE = noise
69 |             feature_indexes_ranges = data_generation_function(samples=samples,
70 |                                                               rhoa_range=rhoa_range,
71 |                                                               nu_range=nu_range,
72 |                                                               re_range=re_range,
73 |                                                               rhop_range=rhop_range,
74 |                                                               save_path=data_path)
75 |             # 1.1) load data, normalize and split
76 |             df = pd.read_csv(data_path)
77 |             Logger.print('Generated data:\n{}'.format(df.describe()))
78 |             y_col = df.keys()[-1]
79 |             normalized_df = (df - df.min()) / (df.max() - df.min())
80 |             train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(normalized_df.drop([y_col], axis=1),
81 |                                                                                      normalized_df[y_col],
82 |                                                                                      shuffle=True,
83 |                                                                                      test_size=SFF_TEST_SIZE_PORTION,
84 |                                                                                      random_state=RANDOM_STATE)
85 |             # 1.2) log elapsed time
86 |             data_end_time = time.time()
87 |             Logger.print(" --- Finished. 
Elapsed time: {} ---".format(timedelta(seconds=data_end_time - start_time))) 88 | 89 | # 2) continue to the MultiTPOTrunner regression 90 | Logger.print('Training MultiTPOTrunner:') 91 | if numerical_bool: 92 | # 2.1) run multi-tpot analysis if feature selection isn't needed 93 | if feature_indexes_ranges == "Not applicable": 94 | # 2.1.1) find the best ML model for all the data 95 | all_t_scores, best_t_model = MultiTPOTrunner.run_and_analyze(run_times=numerical_run_times, 96 | train_data_x=train_data_x, 97 | train_data_y=train_data_y, 98 | test_data_x=test_data_x, 99 | test_data_y=test_data_y, 100 | generations=numerical_generations, 101 | population_size=numerical_population, 102 | k_fold=k_fold, 103 | performance_metric="neg_mean_absolute_error", 104 | save_dir=results_folder, 105 | n_jobs=-1) 106 | # 2.1.2) save results of best model from all runs 107 | ResultTracker.run(program_part="tpot", 108 | run_times=numerical_run_times, 109 | all_scores=all_t_scores, 110 | model=best_t_model, 111 | train_data_x=train_data_x, 112 | train_data_y=train_data_y, 113 | test_data_x=test_data_x, 114 | test_data_y=test_data_y, 115 | save_dir=results_folder) 116 | # 2.2) run multi-tpot analysis with feature selection 117 | else: 118 | # 2.2.1) find the best ML model for a subset of the data 119 | best_gene = GAFS.run(tpot_run_times=numerical_run_times, 120 | feature_generations=feature_selection_generations, 121 | tpot_regressor_generations=numerical_generations, 122 | feature_population_size=feature_selection_pop_size, 123 | tpot_regressor_population_size=numerical_population, 124 | mutation_rate=feature_selection_mutation_rate, 125 | feature_indexes_ranges=feature_indexes_ranges, 126 | mutation_w=[val[1]-val[0] for val in feature_indexes_ranges], 127 | royalty=feature_selection_royalty, 128 | k_fold=k_fold, 129 | performance_metric="neg_mean_absolute_error", 130 | train_data_x=train_data_x, 131 | train_data_y=train_data_y, 132 | test_data_x=test_data_x, 133 | test_data_y=test_data_y, 134 | save_dir=results_folder, 135 | cores=-1) 136 | # 2.2.2) save results of best model from all runs 137 | ResultTracker.run(program_part="tpot", 138 | run_times=numerical_run_times, 139 | all_scores=best_gene.scoring_history, 140 | model=best_gene.model_object, 141 | train_data_x=train_data_x.iloc[:, best_gene.feature_indexes], 142 | train_data_y=train_data_y, 143 | test_data_x=test_data_x.iloc[:, best_gene.feature_indexes], 144 | test_data_y=test_data_y, 145 | save_dir=results_folder) 146 | # 2.2.3) save selected features of best gene 147 | with open(os.path.join(os.path.dirname(__file__), results_folder, "best_features_selected.json"), 148 | "w") as features_file: 149 | json.dump({"index": best_gene.feature_indexes, 150 | "names": list(test_data_x.columns[best_gene.feature_indexes])}, 151 | features_file) 152 | Logger.print("Best gene features: {}".format(list(test_data_x.columns[best_gene.feature_indexes]))) 153 | # 2.2.4) reduce the dataset of non-normalized samples for next part 154 | df = df.iloc[:, best_gene.feature_indexes+[-1]] 155 | # 2.3) log elapsed time 156 | tpot_end_time = time.time() 157 | symbolic_end_time = time.time() 158 | Logger.print(" --- Finished. 
Elapsed time: {} ---".format(timedelta(seconds=tpot_end_time-data_end_time)))
159 | 
160 |             # 3) continue to the symbolic regression
161 |             if analytical_bool:
162 |                 Logger.print('Searching for a symbolic expression:')
163 |                 # 3.1) run symbolic regressor multiple times
164 |                 all_s_scores, best_s_model = GASF.run_and_analyze(run_times=analytical_run_times,
165 |                                                                   non_normalized_data=df,
166 |                                                                   performance_metric=function_mapper["better_symbolic_reg_fitness"],
167 |                                                                   generations=analytical_generations,
168 |                                                                   population_size=analytical_population,
169 |                                                                   k_fold=k_fold,
170 |                                                                   cores=-1,
171 |                                                                   parsimony_coefficient=parsimony_coefficient,
172 |                                                                   expected_eq=expected_eq,
173 |                                                                   save_dir=results_folder)
174 |                 # 3.2) save results of best model from all runs
175 |                 non_norm_train_x, non_norm_test_x, non_norm_train_y, non_norm_test_y = train_test_split(df.drop([y_col], axis=1),
176 |                                                                                                         df[y_col],
177 |                                                                                                         shuffle=True,
178 |                                                                                                         test_size=SFF_TEST_SIZE_PORTION,
179 |                                                                                                         random_state=RANDOM_STATE)
180 |                 ResultTracker.run(program_part="symbolic",
181 |                                   run_times=analytical_run_times,
182 |                                   all_scores=all_s_scores,
183 |                                   model=best_s_model,
184 |                                   train_data_x=non_norm_train_x,
185 |                                   train_data_y=non_norm_train_y,
186 |                                   test_data_x=non_norm_test_x,
187 |                                   test_data_y=non_norm_test_y,
188 |                                   save_dir=results_folder)
189 |                 # 3.3) save a summary of the eqs found & decide whether to continue to the EBS
190 |                 ebs_flag = ResultTracker.summaries_symbolic_results(run_times=analytical_run_times,
191 |                                                                     percent_of_majority=SYMBOLIC_PERCENT_OF_MAJORITY,
192 |                                                                     eq_ranking_metric=SYMBOLIC_EQ_RANKING_METRIC,
193 |                                                                     top_eqs_max_num=SYMBOLIC_TOP_EQS_MAX_NUM,
194 |                                                                     save_dir=results_folder)
195 |                 # 3.4) log elapsed time
196 |                 symbolic_end_time = time.time()
197 |                 Logger.print("Finished. Elapsed time: {}".format(timedelta(seconds=symbolic_end_time - tpot_end_time)))
198 |             else:
199 |                 ebs_flag = False
200 | 
201 |             # 4) continue to the EBS
202 |             if ebs_flag or force_ebs_bool:
203 |                 Logger.print('Searching for a symbolic expression using EBF:')
204 |                 # 4.1) run EBS multiple times
205 |                 all_ebs_scores, best_ebs_model = EBS.run_and_analyze(run_times=analytical_run_times,
206 |                                                                      non_normalized_data=df,
207 |                                                                      performance_metric=function_mapper[
208 |                                                                          "better_symbolic_reg_fitness"],
209 |                                                                      cores=-1,
210 |                                                                      size_range=ebs_size_range,
211 |                                                                      expected_eq=expected_eq,
212 |                                                                      save_dir=results_folder)
213 |                 # 4.2) save the fitting score results
214 |                 ResultTracker.ebs_results(model=best_ebs_model,
215 |                                           all_scores=all_ebs_scores,
216 |                                           save_dir=results_folder)
217 |                 answer.append(all_ebs_scores)
218 |         return answer  # the EBS scores collected per noise level
--------------------------------------------------------------------------------
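A minimal usage sketch (illustrative, not a file from the repository): the snippet below drives the whole pipeline through scimed.run, following the signature shown in scimed.py above. The CSV path, the assumption that the last column is the target, and the small run counts are placeholders chosen for a quick smoke test; demo/demo.py and demo/demo.csv in the repository serve the same purpose.

# illustrative sketch -- assumes a CSV whose last column is the target, as in the experiment scripts
import pandas as pd
from sklearn.model_selection import train_test_split
from scimed import scimed

df = pd.read_csv("demo/demo.csv")                        # placeholder dataset path
y_col = df.keys()[-1]                                    # target assumed to be the last column
normalized_df = (df - df.min()) / (df.max() - df.min())  # min-max normalization, as in the experiments
train_x, test_x, train_y, test_y = train_test_split(normalized_df.drop([y_col], axis=1),
                                                    normalized_df[y_col],
                                                    shuffle=True,
                                                    test_size=0.2)
scimed.run(train_data_x=train_x,
           train_data_y=train_y,
           test_data_x=test_x,
           test_data_y=test_y,
           results_folder="results",
           numerical_run_times=2,      # small values so the sketch finishes quickly
           numerical_generations=5,
           numerical_population=20,
           analytical_run_times=2,
           analytical_generations=5,
           analytical_population=20)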