├── EDBOLogo.png ├── LICENSE ├── README.md ├── edbo └── plus │ ├── __init__.py │ ├── benchmark │ ├── __init__.py │ └── multiobjective_benchmark.py │ ├── model.py │ ├── optimizer_botorch.py │ ├── scope_generator.py │ └── utils.py ├── examples ├── publication │ ├── BMS_yield_cost │ │ ├── 0_data_preprocessing.ipynb │ │ ├── 1_preprocess_data.py │ │ ├── 2_plot_ground_truth.py │ │ ├── 3_run_edbo_cost_yield_performance.py │ │ ├── 4_plot_performance_hypervol.py │ │ ├── 5_plot_MAE_and_RMSE.py │ │ ├── 6_distrib_plots.py │ │ ├── 7_plot_scope_expansion.py │ │ ├── 8_optimization_expanding_scope.py │ │ ├── 9_optimization_constraints.py │ │ └── data │ │ │ ├── PCI_PMI_cost_full.csv │ │ │ ├── PCI_PMI_cost_full_update.csv │ │ │ ├── base_dft.csv │ │ │ ├── clean_dft.csv │ │ │ ├── experiments_yield_and_cost.csv │ │ │ ├── ligand_dft.csv │ │ │ └── solvent_dft.csv │ ├── Crosscoupling │ │ ├── 1_run_experiments.py │ │ ├── campaigns │ │ │ ├── 0_recalculate_predictions.py │ │ │ ├── 1_analysis.py │ │ │ ├── challenging_campaign_cvt │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round1.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round2.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round3.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round4.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round5.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round6.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round7.csv │ │ │ │ ├── predictions_1.csv │ │ │ │ ├── predictions_2.csv │ │ │ │ ├── predictions_3.csv │ │ │ │ ├── predictions_4.csv │ │ │ │ ├── predictions_5.csv │ │ │ │ ├── predictions_6.csv │ │ │ │ └── predictions_7.csv │ │ │ ├── challenging_campaign_random │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round1.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round2.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round3.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round4.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round5.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round6.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round7.csv │ │ │ │ ├── predictions_1.csv │ │ │ │ ├── predictions_2.csv │ │ │ │ ├── predictions_3.csv │ │ │ │ ├── predictions_4.csv │ │ │ │ ├── predictions_5.csv │ │ │ │ ├── predictions_6.csv │ │ │ │ └── predictions_7.csv │ │ │ ├── crosscoupling_results_challenging_campaign_cvt.csv │ │ │ └── easy_campaign │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round0.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round1.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round2.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round3.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round4.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round5.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round6.csv │ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round7.csv │ │ │ │ ├── predictions_1.csv │ │ │ │ ├── predictions_2.csv │ │ │ │ ├── predictions_3.csv │ │ │ │ ├── predictions_4.csv │ │ │ │ ├── predictions_5.csv │ │ │ │ ├── predictions_6.csv │ │ │ │ └── predictions_7.csv │ │ └── edbo_crosscoupling_photoredox_yield_ee.csv │ ├── Suzuki │ │ ├── 0_clean_dft.py │ │ ├── 0_clean_mordred.py │ │ ├── 1_run_ohe.py │ │ ├── 2_run_dft.py │ │ ├── 3_run_mordred.py │ │ ├── 4_random_features.py │ │ ├── data │ │ │ ├── dataset_B1.csv │ │ │ ├── dataset_B2.csv │ │ │ ├── dataset_B2_DFT_clean.csv │ │ │ ├── dataset_B3.csv │ │ │ └── dataset_B3_Mordred_clean.csv │ │ ├── performance │ │ │ ├── 1_merge_all.py │ │ │ ├── 2_plot_ground_truth.py │ │ │ ├── 3_plot_decision_pathways_objectives.py │ │ │ ├── 4_plot_performance.py │ │ │ ├── 5_find_entry.py │ │ │ └── 7_plot_performance_acquisition_function.py │ │ └── performance_acq │ │ │ ├── 1_merge_all.py │ │ │ └── 2_plot_acq_batch.py │ └── Virtual-experimentation │ │ ├── 1_benchmark.py │ │ ├── data │ │ └── data.csv │ │ └── performance │ │ ├── 1_merge_all.py │ │ ├── 2_plot_ground_truth.py │ │ ├── 3_plot_performance_acquisition_function.py │ │ └── 4_hypervol_sampling.py └── tutorials │ ├── 1_CLI_example.ipynb │ └── 2_EDBO_WebApp_Tutorial.pdf ├── requirements.txt ├── setup.cfg └── setup.py /EDBOLogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doyle-lab-ucla/edboplus/f5d1687e834d33c77598767c5383a919f9a58034/EDBOLogo.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Jose A. Garrido Torres 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # 3 | 4 | ## **EDBO+**. Bayesian reaction optimization as a tool for chemical synthesis 5 | 6 | WebApp: https://www.edbowebapp.com 7 | 8 | **Reference:** Garrido Torres, Jose A.; Lau, Sii Hong; Anchuri, Pranay; Stevens, Jason M.; Tabora, Jose E.; Li, Jun; Borovika, Alina; Adams, Ryan P.; Doyle, Abigail G. "A Multi-Objective Active Learning Platform and Web App for Reaction Optimization". 9 | 10 | **DOI:** 11 | 12 | 10.26434/chemrxiv-2022-cljcp 13 | 14 | 10.1021/jacs.2c08592 15 | 16 | **Links**: 17 | [ChemRxiv](https://chemrxiv.org/engage/chemrxiv/article-details/62f6966269f3a5df46b5584b), 18 | [JACS](https://pubs.acs.org/doi/full/10.1021/jacs.2c08592) 19 | 20 | 21 |
22 | 23 | --- 24 | 25 |
26 | 27 | ### Installation: 28 | 29 |
30 | 31 | (1) Create anaconda environment: 32 | 33 | ``` 34 | conda create --name edbo_env python=3.8 35 | ``` 36 | 37 | (2) Activate conda environment: 38 | 39 | ``` 40 | conda activate edbo_env 41 | ``` 42 | 43 | (3) Install EDBO+ dependencies: 44 | 45 | ``` 46 | pip install -e . 47 | ``` 48 | 49 |
50 | 51 | --- 52 | 53 |
54 | 55 | #### **Note**: to run the notebook tutorials install JupyterLab 56 | 57 | ``` 58 | conda install jupyterlab 59 | ``` 60 | -------------------------------------------------------------------------------- /edbo/plus/__init__.py: -------------------------------------------------------------------------------- 1 | from .optimizer_botorch import * -------------------------------------------------------------------------------- /edbo/plus/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doyle-lab-ucla/edboplus/f5d1687e834d33c77598767c5383a919f9a58034/edbo/plus/benchmark/__init__.py -------------------------------------------------------------------------------- /edbo/plus/model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import gpytorch 4 | from gpytorch.kernels import MaternKernel, ScaleKernel 5 | from gpytorch.priors import GammaPrior 6 | from gpytorch.constraints import GreaterThan 7 | import numpy as np 8 | 9 | tkwargs = { 10 | "dtype": torch.double, 11 | "device": torch.device("cpu"), 12 | } 13 | 14 | def build_and_optimize_model(train_x, train_y): 15 | """ Builds model and optimizes it.""" 16 | 17 | gp_options = { 18 | 'ls_prior1': 2.0, 'ls_prior2': 0.2, 'ls_prior3': 5.0, 19 | 'out_prior1': 5.0, 'out_prior2': 0.5, 'out_prior3': 8.0, 20 | 'noise_prior1': 1.5, 'noise_prior2': 0.1, 'noise_prior3': 5.0, 21 | 'noise_constraint': 1e-5, 22 | } 23 | 24 | n_features = np.shape(train_x)[1] 25 | 26 | class ExactGPModel(gpytorch.models.ExactGP): 27 | def __init__(self, train_x, train_y, likelihood): 28 | super(ExactGPModel, self).__init__(train_x, train_y, 29 | likelihood) 30 | self.mean_module = gpytorch.means.ConstantMean() 31 | 32 | kernels = MaternKernel( 33 | ard_num_dims=n_features, 34 | lengthscale_prior=GammaPrior(gp_options['ls_prior1'], 35 | gp_options['ls_prior2']) 36 | ) 37 | 38 | self.covar_module = ScaleKernel( 39 | kernels, 40 | outputscale_prior=GammaPrior(gp_options['out_prior1'], 41 | gp_options['out_prior2'])) 42 | try: 43 | ls_init = gp_options['ls_prior3'] 44 | self.covar_module.base_kernel.lengthscale = ls_init 45 | except: 46 | uniform = gp_options['ls_prior3'] 47 | ls_init = torch.ones(n_features).to(**tkwargs) * uniform 48 | self.covar_module.base_kernel.lengthscale = ls_init 49 | 50 | def forward(self, x): 51 | mean_x = self.mean_module(x) 52 | covar_x = self.covar_module(x) 53 | return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) 54 | 55 | # initialize likelihood and model 56 | likelihood = gpytorch.likelihoods.GaussianLikelihood( 57 | GammaPrior(gp_options['noise_prior1'], gp_options['noise_prior2']) 58 | ) 59 | 60 | likelihood.noise = gp_options['noise_prior3'] 61 | model = ExactGPModel(train_x, train_y, likelihood).to(**tkwargs) 62 | 63 | model.likelihood.noise_covar.register_constraint( 64 | "raw_noise", GreaterThan(gp_options['noise_constraint']) 65 | ) 66 | 67 | model.train() 68 | likelihood.train() 69 | optimizer = torch.optim.Adam([ 70 | {'params': model.parameters()}, 71 | ], lr=0.1) 72 | 73 | # "Loss" for GPs - the marginal log likelihood 74 | mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model) 75 | 76 | training_iter = 1000 77 | for i in range(training_iter): 78 | # Zero gradients from previous iteration 79 | optimizer.zero_grad() 80 | # Output from model 81 | output = model(train_x) 82 | # Calc loss and backprop gradients 83 | loss = -mll(output, train_y.squeeze(-1).to(**tkwargs)) 84 | loss.backward() 85 | optimizer.step() 86 | 87 | model.eval() 88 | likelihood.eval() 89 | return model, likelihood # Optimized model 90 | 91 | -------------------------------------------------------------------------------- /edbo/plus/optimizer_botorch.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import random 4 | import sys 5 | import warnings 6 | 7 | from botorch.acquisition.monte_carlo import qExpectedImprovement 8 | from botorch.acquisition.multi_objective.monte_carlo import \ 9 | qExpectedHypervolumeImprovement, qNoisyExpectedHypervolumeImprovement 10 | from botorch.models import SingleTaskGP, ModelListGP 11 | from botorch.optim import optimize_acqf_discrete 12 | from botorch.sampling.samplers import SobolQMCNormalSampler, IIDNormalSampler 13 | from botorch.utils.multi_objective.box_decompositions import \ 14 | NondominatedPartitioning 15 | from idaes.surrogate.pysmo.sampling import LatinHypercubeSampling, CVTSampling 16 | import numpy as np 17 | from ordered_set import OrderedSet 18 | import pandas as pd 19 | from scipy.stats import norm 20 | from sklearn.preprocessing import MinMaxScaler 21 | from scipy.spatial.distance import cdist 22 | import torch 23 | 24 | from .utils import EDBOStandardScaler 25 | from .model import build_and_optimize_model 26 | from .scope_generator import create_reaction_scope 27 | 28 | tkwargs = { 29 | "dtype": torch.double, 30 | "device": torch.device("cpu"), 31 | } 32 | 33 | 34 | class EDBOplus: 35 | 36 | def __init__(self): 37 | 38 | self.predicted_mean = [] 39 | self.predicted_variance = [] 40 | 41 | @staticmethod 42 | def generate_reaction_scope(components, directory='./', filename='reaction.csv', 43 | check_overwrite=True): 44 | """ 45 | Creates a reaction scope from a dictionary of components and values. 46 | """ 47 | print("Generating a reaction scope...") 48 | df, n_combinations = create_reaction_scope(components=components, directory=directory, 49 | filename=filename, 50 | check_overwrite=check_overwrite) 51 | print(f"The scope was generated and contains {n_combinations} possible reactions!") 52 | return df 53 | 54 | @staticmethod 55 | def _init_sampling(df, batch, sampling_method, seed): 56 | 57 | np.random.seed(seed) 58 | random.seed(seed) 59 | numeric_cols = df._get_numeric_data().columns 60 | ohe_columns = list(OrderedSet(df.columns) - OrderedSet(numeric_cols)) 61 | if len(ohe_columns) > 0: 62 | print(f"The following columns are categorical and will be encoded" 63 | f" using One-Hot-Encoding: {ohe_columns}") 64 | # Encode OHE. 65 | df_sampling = pd.get_dummies(df, prefix=ohe_columns, 66 | columns=ohe_columns, drop_first=True) 67 | 68 | class HiddenPrints: 69 | def __enter__(self): 70 | self._original_stdout = sys.stdout 71 | sys.stdout = open(os.devnull, 'w') 72 | 73 | def __exit__(self, exc_type, exc_val, exc_tb): 74 | sys.stdout.close() 75 | sys.stdout = self._original_stdout 76 | 77 | # Order df according to initial sampling method (random samples). 78 | with HiddenPrints(): 79 | idaes = None 80 | if sampling_method == 'random': 81 | samples = df_sampling.sample(n=batch, random_state=seed) 82 | elif sampling_method.lower() == 'lhs': 83 | idaes = LatinHypercubeSampling(df_sampling, batch, sampling_type="selection") 84 | elif sampling_method.lower() == 'cvt': 85 | idaes = CVTSampling(df_sampling, batch, sampling_type="selection") 86 | 87 | if idaes is not None: 88 | samples = idaes.sample_points() 89 | 90 | # Sometimes the LHS or CVT sampling methods return less samples than requested. Add random samples in this case. 91 | additional_samples = None 92 | if len(samples) < batch: 93 | additional_samples = df.sample(n=batch-len(samples), random_state=seed, replace=True) 94 | additional_samples = additional_samples.reset_index(drop=True) 95 | # Add the additional samples to the samples dataframe. If some of the additional_samples are already in samples, generate new ones until the batch size is reached. 96 | extra_seed = 1 97 | while len(samples) < batch: 98 | samples = pd.concat([samples,additional_samples]).drop_duplicates(ignore_index=True) 99 | additional_samples = df.sample(n=batch-len(samples), random_state=seed+extra_seed, replace=True) 100 | extra_seed +=1 101 | 102 | # Get index of the best samples according to the random sampling method. 103 | df_sampling_matrix = df_sampling.to_numpy() 104 | priority_list = np.zeros_like(df_sampling.index) 105 | 106 | for sample in samples.to_numpy(): 107 | d_i = cdist([sample], df_sampling_matrix, metric='cityblock') 108 | a = np.argmin(d_i) 109 | priority_list[a] = 1. 110 | df['priority'] = priority_list 111 | 112 | print(f"Generated {len(samples)} initial samples using {sampling_method} sampling (seed = {seed}). Run finished!") 113 | 114 | return df 115 | 116 | 117 | def run(self, 118 | objectives, objective_mode, objective_thresholds=None, 119 | directory='.', filename='reaction.csv', 120 | columns_features='all', 121 | batch=5, init_sampling_method='cvt', seed=0, 122 | scaler_features=MinMaxScaler(), 123 | scaler_objectives=EDBOStandardScaler(), 124 | acquisition_function='NoisyEHVI', 125 | acquisition_function_sampler='SobolQMCNormalSampler'): 126 | 127 | """ 128 | Parameters 129 | ---------- 130 | objectives: list 131 | list of string containing the name for each objective. 132 | Example: 133 | objectives = ['yield', 'cost', 'impurity'] 134 | 135 | objective_mode: list 136 | list to select whether the objective should be maximized or minimized. 137 | Examples: 138 | A) Example for single-objective optimization: 139 | objective_mode = ['max'] 140 | B) Example for multi-objective optimization: 141 | objective_mode = ['max', 'min', 'min'] 142 | 143 | objective_thresholds: list 144 | List of worst case values for each objective. 145 | Example: 146 | objective_threshold = [50.0, 10.0, 10.0] 147 | 148 | columns_features: list 149 | List containing the names of the columns to be included in the regression model. By default set to 150 | 'all', which means the algorithm will automatically select all the columns that are not in 151 | the *objectives* list. 152 | 153 | batch: int 154 | Number of experiments that you want to run in parallel. For instance *batch = 5* means that you 155 | will run 5 experiments in each EDBO+ run. You can change this number at any stage of the optimization, 156 | so don't worry if you change your mind after creating or initializing the reaction scope. 157 | 158 | get_predictions: boolean 159 | If True it will print out a *csv file* with the predictions. 160 | You can also access the *predicted_mean* and *predicted_variance* through the EDBOPlus class. 161 | 162 | directory: string 163 | name of the directory to save the results of the optimization. 164 | 165 | filename: string 166 | Name of the file to save a *csv* with the priority list. If *get_predictions=True* EDBO+ will automatically 167 | save a second file including the predictions (*pred_filename.csv*). 168 | 169 | init_sampling_method: string: 170 | Method for selecting the first samples in the scope (in absence) Choices are: 171 | - 'random' : Random seed (as implemented in Pandas). 172 | - 'lhs' : LatinHypercube sampling. 173 | - 'cvt' : CVT sampling. 174 | 175 | scaler_features: sklearn class 176 | sklearn.preprocessing class for transforming the features. 177 | Example: 178 | sklearn.preprocessing.MinMaxScaler() 179 | 180 | scaler_objectives: sklearn class 181 | sklearn.preprocessing class for transforming the objective values. 182 | Examples: 183 | - sklearn.preprocessing.StandardScaler() 184 | Default: 185 | EDBOStandardScaler() 186 | 187 | seed: int 188 | Seed for the random initialization. 189 | 190 | acquisition_function_sampler: string 191 | Options are: 'SobolQMCNormalSampler' or 'IIDNormalSampler'. 192 | 193 | """ 194 | 195 | wdir = Path(directory) 196 | csv_filename = wdir.joinpath(filename) 197 | torch.manual_seed(seed=seed) 198 | np.random.seed(seed) 199 | self.acquisition_sampler = acquisition_function_sampler 200 | 201 | # 1. Safe checks. 202 | self.objective_names = objectives 203 | # Check whether the columns_features contains the objectives. 204 | if columns_features != 'all': 205 | for objective in objectives: 206 | if objective in columns_features: 207 | columns_features.remove(objective) 208 | if 'priority' in columns_features: 209 | columns_features.remove('priority') 210 | 211 | # Check objectives is a list (even for single objective optimization). 212 | ohe_features = False 213 | if type(objectives) != list: 214 | objectives = [objectives] 215 | if type(objective_mode) != list: 216 | objective_mode = [objective_mode] 217 | 218 | # Check that the user's scope exists. 219 | msg = "Scope was not found. Please create a scope (csv file)." 220 | assert os.path.exists(csv_filename), msg 221 | 222 | # 2. Load reaction. 223 | df = pd.read_csv(f"{csv_filename}") 224 | df = df.dropna(axis='columns', how='all') 225 | original_df = df.copy(deep=True) # Make a copy of the original data. 226 | 227 | # 2.1. Initialize sampling (only in the first iteration). 228 | obj_in_df = list(filter(lambda x: x in df.columns.values, objectives)) 229 | 230 | # TODO CHECK: Check whether new objective has been added – if not add PENDING. 231 | for obj_i in self.objective_names: 232 | if obj_i not in original_df.columns.values: 233 | original_df[obj_i] = ['PENDING'] * len(original_df.values) 234 | 235 | if columns_features != 'all': 236 | if 'priority' in df.columns.values: 237 | for obj_i in objectives: 238 | if obj_i not in df.columns.values: 239 | df[obj_i] = ['PENDING'] * len(df.values) 240 | 241 | df = df[columns_features + objectives + ['priority']] 242 | else: 243 | if len(obj_in_df) == 0: 244 | df = df[columns_features] 245 | else: 246 | df = df[columns_features + objectives] 247 | 248 | # No objectives columns in the scope? Then random initialization. 249 | if len(obj_in_df) == 0: 250 | print("There are no experimental observations yet. Random samples will be drawn.") 251 | df = self._init_sampling(df=df, batch=batch, seed=seed, 252 | sampling_method=init_sampling_method) 253 | original_df['priority'] = df['priority'] 254 | # Append objectives. 255 | for objective in objectives: 256 | if objective not in original_df.columns.values: 257 | original_df[objective] = ['PENDING'] * len(original_df) 258 | 259 | # Sort values and save dataframe. 260 | original_df = original_df.sort_values('priority', ascending=False) 261 | original_df = original_df.loc[:,~original_df.columns.str.contains('^Unnamed')] 262 | original_df.to_csv(csv_filename, index=False) 263 | return original_df 264 | 265 | if columns_features == 'all': # replacing with actual list of all features for printout 266 | columns_features = list(set(df.columns.tolist())- set(objectives) - set(['priority'])) 267 | print(f"This run will optimize for the following objectives: {objectives}") 268 | print(f"The following features will be used: {columns_features}") 269 | 270 | # 3. Separate train and test data. 271 | 272 | # 3.1. Auto-detect dummy features (one-hot-encoding). 273 | numeric_cols = df._get_numeric_data().columns 274 | for nc in numeric_cols: 275 | df[nc] = pd.to_numeric(df[nc], downcast='float') 276 | ohe_columns = list(OrderedSet(df.columns) - OrderedSet(numeric_cols)) 277 | ohe_columns = list(OrderedSet(ohe_columns) - OrderedSet(objectives)) 278 | 279 | if len(ohe_columns) > 0: 280 | print(f"The following columns are categorical and will be encoded" 281 | f" using One-Hot-Encoding: {ohe_columns}") 282 | ohe_features = True 283 | 284 | data = pd.get_dummies(df, prefix=ohe_columns, columns=ohe_columns, drop_first=True) 285 | 286 | # 3.2. Any sample with a value 'PENDING' in any objective is a test. 287 | idx_test = (data[data.apply(lambda r: r.astype(str).str.contains('PENDING', case=False).any(), axis=1)]).index.values 288 | idx_train = (data[~data.apply(lambda r: r.astype(str).str.contains('PENDING', case=False).any(), axis=1)]).index.values 289 | 290 | # Data only contains featurized information (train and test). 291 | df_train_y = data.loc[idx_train][objectives] 292 | if 'priority' in data.columns.tolist(): 293 | data = data.drop(columns=objectives + ['priority']) 294 | else: 295 | data = data.drop(columns=objectives) 296 | df_train_x = data.loc[idx_train] 297 | df_test_x = data.loc[idx_test] 298 | 299 | if len(df_train_x.values) == 0: 300 | msg = 'The scope was already generated, please ' \ 301 | 'insert at least one experimental observation ' \ 302 | 'value and then press run.' 303 | print(msg) 304 | return original_df 305 | 306 | # Run the BO process. 307 | priority_list = self._model_run( 308 | data=data, 309 | df_train_x=df_train_x, 310 | df_test_x=df_test_x, 311 | df_train_y=df_train_y, 312 | batch=batch, 313 | objective_mode=objective_mode, 314 | objective_thresholds=objective_thresholds, 315 | seed=seed, 316 | scaler_x=scaler_features, 317 | scaler_y=scaler_objectives, 318 | acquisition_function=acquisition_function 319 | ) 320 | 321 | # Low priority to the samples that have been already collected. 322 | for i in range(0, len(idx_train)): 323 | priority_list[idx_train[i]] = -1 324 | 325 | original_df['priority'] = priority_list 326 | 327 | cols_sort = ['priority'] + original_df.columns.values.tolist() 328 | # Attach objectives predictions and expected improvement. 329 | cols_for_preds = [] 330 | for idx_obj in range(0, len(objectives)): 331 | name = objectives[idx_obj] 332 | mean = self.predicted_mean[:, idx_obj] 333 | var = self.predicted_variance[:, idx_obj] 334 | ei = self.ei[:, idx_obj] 335 | original_df[f"{name}_predicted_mean"] = mean 336 | original_df[f"{name}_predicted_variance"] = var 337 | original_df[f"{name}_expected_improvement"] = ei 338 | cols_for_preds.append([f"{name}_predicted_mean", 339 | f"{name}_predicted_variance", 340 | f"{name}_expected_improvement" 341 | ]) 342 | cols_for_preds = np.ravel(cols_for_preds) 343 | 344 | original_df = original_df.sort_values(cols_sort, ascending=False) 345 | # Save extra df containing predictions, uncertainties and EI. 346 | original_df.to_csv(f"{directory}/pred_{filename}", index=False) 347 | # Drop predictions, uncertainties and EI. 348 | original_df = original_df.drop(columns=cols_for_preds, axis='columns') 349 | original_df = original_df.sort_values(cols_sort, ascending=False) 350 | original_df.to_csv(csv_filename, index=False) 351 | 352 | print("Run finished!") 353 | return original_df 354 | 355 | def _model_run(self, data, df_train_x, df_test_x, df_train_y, batch, 356 | objective_mode, objective_thresholds, seed, 357 | scaler_x, scaler_y, acquisition_function): 358 | """ 359 | Runs the surrogate machine learning model. 360 | Returns a priority list for a given scope (top priority to low priority). 361 | """ 362 | 363 | # Check number of objectives. 364 | n_objectives = len(df_train_y.columns.values) 365 | 366 | scaler_x.fit(df_train_x.to_numpy()) 367 | init_train = scaler_x.transform(df_train_x.to_numpy()) 368 | test_xnp = scaler_x.transform(df_test_x.to_numpy()) 369 | test_x = torch.tensor(test_xnp.tolist()).double().to(**tkwargs) 370 | y = df_train_y.astype(float).to_numpy() # not scaled. 371 | 372 | individual_models = [] 373 | for i in range(0, n_objectives): 374 | if objective_mode[i].lower() == 'min': 375 | y[:, i] = -y[:, i] 376 | y = scaler_y.fit_transform(y) 377 | 378 | print("Generating surrogate model...") 379 | for i in range(0, n_objectives): 380 | train_x = torch.tensor(init_train).to(**tkwargs).double() 381 | train_y = np.array(y)[:, i] 382 | train_y = (np.atleast_2d(train_y).reshape(len(train_y), -1)) 383 | train_y_i = torch.tensor(train_y.tolist()).to(**tkwargs).double() 384 | 385 | gp, likelihood = build_and_optimize_model(train_x=train_x, train_y=train_y_i,) 386 | 387 | model_i = SingleTaskGP(train_X=train_x, train_Y=train_y_i, 388 | covar_module=gp.covar_module, likelihood=likelihood) 389 | individual_models.append(model_i) 390 | 391 | print("Model generated!") 392 | 393 | # Reference point is the minimum seen so far. 394 | ref_mins = np.min(y, axis=0) 395 | if objective_thresholds is None: 396 | ref_point = torch.tensor(ref_mins).double().to(**tkwargs) 397 | else: 398 | ref_point = np.zeros(n_objectives) 399 | for i in range(0, n_objectives): 400 | if objective_thresholds[i] is None: 401 | ref_point[i] = ref_mins[i] 402 | else: 403 | ref_point[i] = objective_thresholds[i] 404 | if objective_mode[i].lower() == 'min': 405 | ref_point[i] = -ref_point[i] 406 | # Scale. 407 | ref_point = scaler_y.transform(np.array([ref_point])) 408 | # Loop again. 409 | for i in range(0, n_objectives): 410 | if objective_thresholds[i] is None: 411 | ref_point[0][i] = ref_mins[i] 412 | ref_point = torch.tensor(ref_point[0]).double().to(**tkwargs) 413 | 414 | if len(data.values) > 100000: 415 | sobol_num_samples = 64 416 | elif len(data.values) > 50000: 417 | sobol_num_samples = 128 418 | elif len(data.values) > 10000: 419 | sobol_num_samples = 256 420 | else: 421 | sobol_num_samples = 512 422 | 423 | y_torch = torch.tensor(y).to(**tkwargs).double() 424 | 425 | if self.acquisition_sampler == 'IIDNormalSampler': 426 | sampler = IIDNormalSampler(num_samples=sobol_num_samples, collapse_batch_dims=True, seed=seed) 427 | if self.acquisition_sampler == 'SobolQMCNormalSampler': 428 | sampler = SobolQMCNormalSampler(num_samples=sobol_num_samples, collapse_batch_dims=True, seed=seed) 429 | 430 | print ("Optimizing acqusition function...") 431 | 432 | surrogate_model = None 433 | 434 | if acquisition_function.lower() == 'ehvi': 435 | 436 | partitioning = NondominatedPartitioning( 437 | ref_point=ref_point, 438 | Y=y_torch) 439 | 440 | surrogate_model = ModelListGP(*individual_models) 441 | individual_models = [] # empty to reuduce memory 442 | 443 | EHVI = qExpectedHypervolumeImprovement( 444 | model=surrogate_model, sampler=sampler, 445 | ref_point=ref_point, # use known reference point 446 | partitioning=partitioning 447 | ) 448 | 449 | acq_result = optimize_acqf_discrete( 450 | acq_function=EHVI, 451 | choices=test_x, 452 | q=batch, 453 | unique=True 454 | ) 455 | 456 | 457 | if acquisition_function.lower() == 'noisyehvi': 458 | with warnings.catch_warnings(): 459 | warnings.simplefilter("ignore") 460 | acq_fct = None 461 | if n_objectives > 1: # NOTE: NoisyEHVI fails in case of n_objectives = 1 --> added that it uses EI in this case 462 | surrogate_model = ModelListGP(*individual_models) 463 | train_x = torch.tensor(init_train).to(**tkwargs).double() 464 | acq_fct = qNoisyExpectedHypervolumeImprovement( 465 | model=surrogate_model, sampler=sampler, 466 | ref_point=ref_point, 467 | alpha = 0.0, 468 | incremental_nehvi=True, X_baseline=train_x, prune_baseline=True 469 | ) 470 | else: 471 | surrogate_model = individual_models[0] 472 | best_value = y_torch.max() 473 | acq_fct = qExpectedImprovement( 474 | model = surrogate_model, 475 | best_f = best_value, 476 | sampler = sampler 477 | ) 478 | 479 | acq_result = optimize_acqf_discrete( 480 | acq_function=acq_fct, 481 | choices=test_x, 482 | q=batch, 483 | unique=True 484 | ) 485 | 486 | best_samples = scaler_x.inverse_transform(acq_result[0].detach().cpu().numpy()) 487 | 488 | print('Acquisition function optimized.') 489 | 490 | # Save rescaled predictions (only for first fantasy). 491 | 492 | # Get predictions in chunks. 493 | chunk_size = 1000 494 | n_chunks = len(data.values) // chunk_size 495 | 496 | if n_chunks == 0: 497 | n_chunks = 1 498 | 499 | self.predicted_mean = np.zeros(shape=(len(data.values), n_objectives)) 500 | self.predicted_variance = np.zeros(shape=(len(data.values), n_objectives)) 501 | self.ei = np.zeros(shape=(len(data.values), n_objectives)) 502 | 503 | observed_raw_values = df_train_y.astype(float).to_numpy() 504 | 505 | for i in range(0, len(data.values), n_chunks): 506 | vals = data.values[i:i+n_chunks] 507 | data_tensor = torch.tensor(scaler_x.transform(vals)).double().to(**tkwargs) 508 | preds = surrogate_model.posterior(X=data_tensor) 509 | self.predicted_mean[i:i+n_chunks] = scaler_y.inverse_transform(preds.mean.detach().cpu().numpy()) 510 | self.predicted_variance[i:i+n_chunks] = scaler_y.inverse_transform_var(preds.variance.detach().cpu().numpy()) 511 | 512 | for j in range(0, len(objective_mode)): 513 | maximizing = False 514 | if objective_mode[j] == 'max': 515 | maximizing = True 516 | self.ei[i:i+n_chunks, j] = self.expected_improvement( 517 | train_y=observed_raw_values[:, j], 518 | mean=self.predicted_mean[i:i+n_chunks, j], 519 | variance=self.predicted_variance[i:i+n_chunks, j], 520 | maximizing=maximizing 521 | ) 522 | 523 | print('Predictions and expected improvement obtained.') 524 | 525 | # Flip predictions if needed. 526 | for i in range(0, len(objective_mode)): 527 | if objective_mode[i] == 'min': 528 | self.predicted_mean[:, i] = -self.predicted_mean[:, i] 529 | 530 | # Rescale samples. 531 | all_samples = data.values 532 | 533 | priority_list = [0] * len(data.values) 534 | 535 | # Find best samples in data. 536 | for sample in best_samples: 537 | d_i = cdist([sample], all_samples, metric='cityblock') 538 | a = np.argmin(d_i) 539 | priority_list[a] = 1. 540 | 541 | return priority_list 542 | 543 | def expected_improvement(self, train_y, mean, variance, 544 | maximizing=False): 545 | """ expected_improvement 546 | Expected improvement acquisition function. 547 | Arguments: 548 | ---------- 549 | mean: Numpy array. 550 | predicted mean of the Gaussian Process. 551 | variance: Numpy array. 552 | predicted variance of the Gaussian Process. 553 | train_y: Numpy array. 554 | Numpy array that contains the values of previously observed train targets. 555 | maximizing: Boolean. 556 | Boolean flag that indicates whether the loss function is to be maximised or minimised. 557 | """ 558 | 559 | sigma = variance * 2. 560 | 561 | if maximizing: 562 | loss_optimum = np.max(train_y) 563 | else: 564 | loss_optimum = np.min(train_y) 565 | 566 | scaling_factor = (-1) ** (not maximizing) 567 | 568 | # In case sigma equals zero 569 | with np.errstate(divide='ignore'): 570 | Z = scaling_factor * (mean - loss_optimum) / sigma 571 | expected_improvement = scaling_factor * (mean - loss_optimum) * norm.cdf(Z) + sigma * norm.pdf(Z) 572 | expected_improvement[sigma == 0.0] = 0.0 573 | 574 | return expected_improvement 575 | 576 | -------------------------------------------------------------------------------- /edbo/plus/scope_generator.py: -------------------------------------------------------------------------------- 1 | 2 | import itertools 3 | import pandas as pd 4 | import os 5 | from pathlib import Path 6 | 7 | 8 | def create_reaction_scope(components, directory='./', filename='reaction.csv', 9 | check_overwrite=True): 10 | 11 | """ 12 | Reaction scope generator. Pass components dictionary, each 13 | dictionary key contains a list of the choices for a given component. 14 | 15 | ---------------------------------------------------------------------- 16 | Example: 17 | 18 | components = {'temperature': [30, 40, 50], 19 | 'solvent': ['THF', 'DMSO'], 20 | 'concentration': [0.1, 0.2, 0.3, 0.4, 0.5]} 21 | ---------------------------------------------------------------------- 22 | 23 | ---------------------------------------------------------------------- 24 | Note: 25 | - All non-numerical choices are encoded using a One-Hot-Encoder. 26 | ---------------------------------------------------------------------- 27 | 28 | ---------------------------------------------------------------------- 29 | Returns: 30 | A dataframe with name *{label}.csv* including the entire 31 | set of choices (reaction scope). 32 | ---------------------------------------------------------------------- 33 | """ 34 | 35 | msg = "You need to pass a dictionary for components. \n" 36 | assert type(components) == dict, msg 37 | 38 | wdir = Path(directory) 39 | csv_filename = wdir.joinpath(filename) 40 | # Ask to overwrite previous scope. 41 | 42 | if os.path.exists(csv_filename) and check_overwrite is True: 43 | overwrite = input('Scope already exists. Overwrite? Y = yes, N = no\n') 44 | if overwrite.lower() != 'y': 45 | return 46 | 47 | # Predict how large will the scope be. 48 | n_combinations = 0 49 | for key in list(components.keys()): 50 | if n_combinations == 0: 51 | n_combinations = len(components[key]) 52 | else: 53 | n_combinations *= len(components[key]) 54 | 55 | # Generate initial scope. 56 | keys = components.keys() 57 | values = (components[key] for key in keys) 58 | 59 | scope = [dict(zip(keys, combination)) for combination in 60 | itertools.product(*values)] 61 | df_scope = pd.DataFrame(scope) 62 | df_scope.to_csv(csv_filename, index=False, mode='w', 63 | header=list(keys)) 64 | 65 | return df_scope, n_combinations 66 | -------------------------------------------------------------------------------- /edbo/plus/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | class EDBOStandardScaler: 6 | """ 7 | Custom standard scaler for EDBO. 8 | """ 9 | def __init__(self): 10 | pass 11 | 12 | def fit(self, x): 13 | self.mu = np.mean(x, axis=0) 14 | self.std = np.std(x, axis=0) 15 | 16 | def transform(self, x): 17 | for obj in range(0, len(self.std)): 18 | if self.std[obj] == 0.0: 19 | self.std[obj] = 1e-6 20 | return (x-[self.mu])/[self.std] 21 | 22 | def fit_transform(self, x): 23 | self.mu = np.mean(x, axis=0) 24 | self.std = np.std(x, axis=0) 25 | 26 | for obj in range(0, len(self.std)): 27 | if self.std[obj] == 0.0: 28 | self.std[obj] = 1e-6 29 | return (x-[self.mu])/[self.std] 30 | 31 | def inverse_transform(self, x): 32 | return x * [self.std] + [self.mu] 33 | 34 | def inverse_transform_var(self, x): 35 | return x * [self.std] 36 | -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/0_data_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "bfc9fc54", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "id": "54a71f91", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "Index(['Ligand_inchi', 'Base_inchi', 'Solvent_inchi', 'Product_inchi',\n", 24 | " 'Electrophile_inchi', 'Nucleophile_inchi', 'Precatalyst_inchi', 'Base',\n", 25 | " 'Electrophile', 'Electrophile_PCI_Name', 'Ligand', 'Nucleophile',\n", 26 | " 'Nucleophile_PCI_Name', 'Precatalyst', 'Product', 'Solvent',\n", 27 | " 'Screen_ID', 'umol_Screen', 'Entry', 'Well', 'Row', 'Column',\n", 28 | " 'Base_Equiv', 'Electrophile_Equiv', 'Ligand_Equiv', 'Nucleophile_Equiv',\n", 29 | " 'Precatalyst_Equiv', 'Concentration', 'Time_h', 'Temp_C', 'SampleName',\n", 30 | " 'Vial', 'AP_ISO', 'AP_PDT', 'AP_STD', 'Mean_AP', 'Max_AP', 'SD_AP',\n", 31 | " 'Z_Score_AP', 'RelYield_PDT', 'Mean_RY', 'Max_RY', 'SD_RY',\n", 32 | " 'Z_Score_RY', 'Yield', 'Mean_Yield', 'Max_Yield', 'SD_Yield',\n", 33 | " 'Z_Score_Yield', 'Product_MW', 'Solvent_density', 'Solvent_mass',\n", 34 | " 'Product_mg', 'Base_Cost', 'Base_amt', 'Base_MW', 'Base_price.mol',\n", 35 | " 'Solvent_Cost', 'Solvent_amt', 'Solvent_MW', 'Solvent_price.mol',\n", 36 | " 'Ligand_Cost', 'Ligand_amt', 'Ligand_MW', 'Ligand_price.mol',\n", 37 | " 'Ligand_dol', 'Base_dol', 'Solvent_dol', 'reagent_cost',\n", 38 | " 'Nucleophile_MW', 'Electrophile_MW', 'Precatalyst_MW', 'Nucleophile_mg',\n", 39 | " 'Electrophile_mg', 'Precatalyst_mg', 'Ligand_mg', 'Base_mg', 'Total_mg',\n", 40 | " 'PMI', 'solvent mg', 'ligand_dol_will', 'base_dol_will',\n", 41 | " 'solvent_dol_will', 'total_cost_update'],\n", 42 | " dtype='object')" 43 | ] 44 | }, 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "df = pd.read_csv('./data/PCI_PMI_cost_full_update.csv')\n", 52 | "df.columns" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "d4d208a8", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "Ligand_inchi\n", 66 | "Base_inchi\n", 67 | "Solvent_inchi\n", 68 | "Base\n", 69 | "Ligand\n", 70 | "Solvent\n", 71 | "Screen_ID\n", 72 | "Entry\n", 73 | "Well\n", 74 | "Row\n", 75 | "Column\n", 76 | "Concentration\n", 77 | "Temp_C\n", 78 | "SampleName\n", 79 | "Vial\n", 80 | "AP_ISO\n", 81 | "AP_PDT\n", 82 | "AP_STD\n", 83 | "Mean_AP\n", 84 | "Max_AP\n", 85 | "SD_AP\n", 86 | "Z_Score_AP\n", 87 | "RelYield_PDT\n", 88 | "Mean_RY\n", 89 | "Max_RY\n", 90 | "SD_RY\n", 91 | "Z_Score_RY\n", 92 | "Yield\n", 93 | "Mean_Yield\n", 94 | "Max_Yield\n", 95 | "SD_Yield\n", 96 | "Z_Score_Yield\n", 97 | "Solvent_density\n", 98 | "Solvent_mass\n", 99 | "Product_mg\n", 100 | "Base_Cost\n", 101 | "Base_amt\n", 102 | "Base_MW\n", 103 | "Base_price.mol\n", 104 | "Solvent_Cost\n", 105 | "Solvent_amt\n", 106 | "Solvent_MW\n", 107 | "Solvent_price.mol\n", 108 | "Ligand_Cost\n", 109 | "Ligand_amt\n", 110 | "Ligand_MW\n", 111 | "Ligand_price.mol\n", 112 | "Ligand_dol\n", 113 | "Base_dol\n", 114 | "Solvent_dol\n", 115 | "reagent_cost\n", 116 | "Ligand_mg\n", 117 | "Base_mg\n", 118 | "Total_mg\n", 119 | "PMI\n", 120 | "solvent mg\n", 121 | "ligand_dol_will\n", 122 | "base_dol_will\n", 123 | "solvent_dol_will\n", 124 | "total_cost_update\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "for i in range(0, len(df.columns)):\n", 130 | " if len(np.unique(df[df.columns[i]])) > 1:\n", 131 | " print(df.columns[i])\n", 132 | "\n", 133 | "# np.unique(df['Concentration'].values)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 4, 139 | "id": "d4a6a824", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/html": [ 145 | "
\n", 146 | "\n", 159 | "\n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | "
baseligandsolventconcentrationtemperatureyieldcostPMI
0KOAcBrettPhosDMAc0.1001055.470.145775917.668323
1KOAcPPhtBu2DMAc0.1001050.000.043201inf
2KOActBPh-CPhosDMAc0.10010578.950.26914064.469151
3KOAcPCy3 HBF4DMAc0.1001057.260.032181691.080949
4KOAcPPh3DMAc0.10010528.150.026373178.881165
...........................
1723CsOPivPPh2Mep-Xylene0.1531201.600.1106532091.688946
1724CsOPivGorlosPhos HBF4p-Xylene0.1531208.390.121732400.447659
1725CsOPivJackiePhosp-Xylene0.15312013.340.439356252.868372
1726CsOPivCgMe-PPhp-Xylene0.15312019.130.141130175.981223
1727CsOPivPPhMe2p-Xylene0.1531200.000.111903inf
\n", 297 | "

1728 rows × 8 columns

\n", 298 | "
" 299 | ], 300 | "text/plain": [ 301 | " base ligand solvent concentration temperature yield \\\n", 302 | "0 KOAc BrettPhos DMAc 0.100 105 5.47 \n", 303 | "1 KOAc PPhtBu2 DMAc 0.100 105 0.00 \n", 304 | "2 KOAc tBPh-CPhos DMAc 0.100 105 78.95 \n", 305 | "3 KOAc PCy3 HBF4 DMAc 0.100 105 7.26 \n", 306 | "4 KOAc PPh3 DMAc 0.100 105 28.15 \n", 307 | "... ... ... ... ... ... ... \n", 308 | "1723 CsOPiv PPh2Me p-Xylene 0.153 120 1.60 \n", 309 | "1724 CsOPiv GorlosPhos HBF4 p-Xylene 0.153 120 8.39 \n", 310 | "1725 CsOPiv JackiePhos p-Xylene 0.153 120 13.34 \n", 311 | "1726 CsOPiv CgMe-PPh p-Xylene 0.153 120 19.13 \n", 312 | "1727 CsOPiv PPhMe2 p-Xylene 0.153 120 0.00 \n", 313 | "\n", 314 | " cost PMI \n", 315 | "0 0.145775 917.668323 \n", 316 | "1 0.043201 inf \n", 317 | "2 0.269140 64.469151 \n", 318 | "3 0.032181 691.080949 \n", 319 | "4 0.026373 178.881165 \n", 320 | "... ... ... \n", 321 | "1723 0.110653 2091.688946 \n", 322 | "1724 0.121732 400.447659 \n", 323 | "1725 0.439356 252.868372 \n", 324 | "1726 0.141130 175.981223 \n", 325 | "1727 0.111903 inf \n", 326 | "\n", 327 | "[1728 rows x 8 columns]" 328 | ] 329 | }, 330 | "execution_count": 4, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "df_sel = df[['Base', 'Ligand', 'Solvent', 'Concentration', 'Temp_C', 'Yield', 'total_cost_update', 'PMI']]\n", 337 | "\n", 338 | "df_all_exp_index = df_sel.rename(columns={'Base': 'base', 'Solvent': 'solvent',\n", 339 | " 'Ligand': 'ligand', 'Concentration': 'concentration',\n", 340 | " 'Temp_C': 'temperature', 'Yield': 'yield',\n", 341 | " 'total_cost_update': 'cost'\n", 342 | " })\n", 343 | "\n", 344 | "# df_all_exp_index.to_csv('./data/experiment_index.csv', index=False)\n", 345 | "df_all_exp_index" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 5, 351 | "id": "c81b98e3", 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "df_yield_cost = df_all_exp_index.drop(columns=['PMI'])\n", 356 | "# df_yield_cost['new_index'] = np.arange(0, len(df_yield_cost))\n", 357 | "df_yield_cost.to_csv('./data/experiments_yield_and_cost.csv')\n", 358 | "\n" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 6, 364 | "id": "3d0e53be", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "df = pd.read_csv('./data/experiments_yield_and_cost.csv')\n", 369 | "df.rename(columns={'Unnamed: 0': 'new_index'}, inplace=True)\n", 370 | "df.to_csv('./data/experiments_yield_and_cost.csv', index=False)\n" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 18, 376 | "id": "5469a317", 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 12, 384 | "metadata": { 385 | "collapsed": false, 386 | "pycharm": { 387 | "name": "#%%\n" 388 | } 389 | }, 390 | "outputs": [], 391 | "source": [] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 12, 396 | "metadata": { 397 | "collapsed": false, 398 | "pycharm": { 399 | "name": "#%%\n" 400 | } 401 | }, 402 | "outputs": [], 403 | "source": [] 404 | } 405 | ], 406 | "metadata": { 407 | "kernelspec": { 408 | "display_name": "Python 3.7.5 ('edboplus')", 409 | "language": "python", 410 | "name": "python3" 411 | }, 412 | "language_info": { 413 | "codemirror_mode": { 414 | "name": "ipython", 415 | "version": 3 416 | }, 417 | "file_extension": ".py", 418 | "mimetype": "text/x-python", 419 | "name": "python", 420 | "nbconvert_exporter": "python", 421 | "pygments_lexer": "ipython3", 422 | "version": "3.7.5" 423 | }, 424 | "vscode": { 425 | "interpreter": { 426 | "hash": "f6b50c482b94d49566f339c9bbaa80fe4f4c53d65f91d29ce8fa084769027490" 427 | } 428 | } 429 | }, 430 | "nbformat": 4, 431 | "nbformat_minor": 5 432 | } -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/1_preprocess_data.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | 5 | df_exp = pd.read_csv('./data/experiments_yield_and_cost.csv') 6 | 7 | 8 | # Base features. 9 | for i in ['base', 'ligand', 'solvent']: 10 | df_i = pd.read_csv(f"data/{i}_dft.csv") 11 | df_i.rename(columns={f"{i}_file_name": i}, inplace=True) 12 | df_exp = pd.merge(df_exp, df_i, on=i) 13 | 14 | df_edbo = df_exp.copy(deep=True) 15 | # Remove correlated features. 16 | corr_matrix = df_edbo.corr().abs() 17 | # Select upper triangle of correlation matrix 18 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 19 | # Find features with correlation greater than 0.95. 20 | to_drop = [column for column in upper.columns if any(upper[column] > 0.95)] 21 | # Drop features 22 | df_edbo.drop(to_drop, axis=1, inplace=True) 23 | 24 | # Remove columns that have only one or two unique values. 25 | extra_columns_to_remove = [] 26 | for column in df_edbo.columns.values: 27 | if len(np.unique(df_edbo[column].values)) <= 1: 28 | extra_columns_to_remove.append(column) 29 | df_edbo.drop(extra_columns_to_remove, axis=1, inplace=True) 30 | 31 | # Remove non numerical. 32 | df_edbo_numeric = df_edbo.select_dtypes(include=np.number) 33 | df_edbo_numeric.to_csv('./data/clean_dft.csv', index=0) 34 | print(df_edbo_numeric) 35 | -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/2_plot_ground_truth.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import seaborn as sns 7 | sns.set_style("ticks") 8 | sns.despine() 9 | import matplotlib as mpl 10 | mpl.rcParams['grid.linestyle'] = ':' 11 | mpl.rcParams['grid.linewidth'] = 0.1 12 | plt.rcParams['font.family'] = 'Helvetica' 13 | plt.rcParams['font.size'] = 10 14 | import pareto 15 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto 16 | from pymoo.mcdm.high_tradeoff import HighTradeoffPoints 17 | from sklearn.preprocessing import MinMaxScaler 18 | import seaborn as sns 19 | 20 | 21 | def get_pareto_points(objective_values): 22 | """ Get pareto for the ground truth function. 23 | NOTE: Assumes maximization.""" 24 | pareto_ground = pareto.eps_sort(tables=objective_values, 25 | objectives=np.arange(2), 26 | maximize_all=True) 27 | idx_pareto = is_pareto(objectives=-objective_values) 28 | return np.array(pareto_ground), idx_pareto 29 | 30 | 31 | def get_high_tradeoff_points(pareto_points): 32 | """ Pass a numpy array with the pareto points and returns a numpy 33 | array with the high tradeoff points.""" 34 | 35 | scaler_pareto = MinMaxScaler() 36 | pareto_scaled = scaler_pareto.fit_transform(pareto_points) 37 | try: 38 | tradeoff = HighTradeoffPoints() 39 | 40 | tradeoff_args = tradeoff.do(-pareto_scaled) # Always minimizing. 41 | tradeoff_points = pareto_points[tradeoff_args] 42 | except: 43 | tradeoff_points = [] 44 | pass 45 | return tradeoff_points 46 | 47 | 48 | df_exp = pd.read_csv('./data/experiments_yield_and_cost.csv') 49 | 50 | df_exp['cost'] = -df_exp['cost'] 51 | objective_vals = df_exp[['yield', 'cost']].values 52 | pareto_points, idx_pareto = get_pareto_points(objective_vals) 53 | high_tradeoff_points = get_high_tradeoff_points(pareto_points) 54 | 55 | print(np.unique(df_exp['base'].values)) 56 | 57 | fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5)) 58 | 59 | 60 | hues = ['ligand', 'base', 'solvent', 'concentration'] 61 | 62 | sns.scatterplot(x=df_exp['cost'], y=df_exp['yield'], 63 | hue=df_exp['ligand'], s=80, 64 | lw=0.01, edgecolor='black', 65 | ax=ax, palette='Spectral', 66 | style=df_exp['solvent'], 67 | ) 68 | sns.lineplot(x=pareto_points[:, 1], y=pareto_points[:, 0], 69 | linewidth=2, color='grey', ls='dotted', ax=ax) 70 | if not os.path.exists('results_plots'): 71 | os.mkdir('results_plots') 72 | plt.savefig(f'./results_plots/dataset.svg', format='svg', dpi=500) 73 | plt.show() 74 | -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/3_run_edbo_cost_yield_performance.py: -------------------------------------------------------------------------------- 1 | 2 | import shutil 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark 7 | import os 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | import pandas as pd 12 | sns.set_style("darkgrid") 13 | sns.set_context("talk") 14 | 15 | # Benchmark filename 16 | for batch in [1, 2, 3, 5]: 17 | for acq_i in ['EHVI']: 18 | for sampling_method in ['seed', 'lhs', 'cvtsampling']: 19 | budget = 60 20 | acq = acq_i 21 | seed = 1 22 | 23 | df_exp = pd.read_csv('./data/clean_dft.csv') 24 | sort_column = 'new_index' 25 | 26 | columns_regression = df_exp.columns 27 | columns_regression = columns_regression.drop([sort_column, 'yield', 'cost']).tolist() 28 | objectives = ['yield', 'cost'] 29 | objective_modes = ['max', 'min'] 30 | objective_thresholds = [None, None] 31 | print(f"Columns for regression: {columns_regression}") 32 | 33 | label_benchmark = f"benchmark_dft_acq_{acq}_batch_{batch}_seed_{seed}_init_sampling_{sampling_method}.csv" 34 | 35 | if not os.path.exists(f"./results/{label_benchmark}"): 36 | 37 | # Remove previous files 38 | if os.path.exists(label_benchmark): 39 | os.remove(label_benchmark) 40 | 41 | if os.path.exists(f'pred_{label_benchmark}'): 42 | os.remove(f'pred_{label_benchmark}') 43 | 44 | if os.path.exists(f'results_{label_benchmark}'): 45 | os.remove(f'results_{label_benchmark}') 46 | 47 | bench = Benchmark(df_ground=df_exp, 48 | features_regression=columns_regression, 49 | objective_names=objectives, 50 | objective_modes=objective_modes, 51 | objective_thresholds=objective_thresholds, 52 | filename=label_benchmark, 53 | filename_results=f'results_{label_benchmark}', 54 | index_column=sort_column, 55 | acquisition_function=acq) 56 | 57 | bench.run(steps=int(budget/batch), batch=batch, seed=seed, 58 | plot_ground=False, 59 | plot_predictions=False, plot_train=False, 60 | init_method=sampling_method) 61 | 62 | # Move results. 63 | if not os.path.exists('results'): 64 | os.mkdir('results') 65 | shutil.move(label_benchmark, f'results/{label_benchmark}') 66 | shutil.move(f'pred_{label_benchmark}', f'results/pred_{label_benchmark}') 67 | shutil.move(f'results_{label_benchmark}', f'results/results_{label_benchmark}') 68 | 69 | -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/4_plot_performance_hypervol.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import pandas as pd 6 | import os 7 | 8 | 9 | # sns.set_style("ticks") 10 | # sns.set_context("paper") 11 | import matplotlib as mpl 12 | mpl.rcParams['grid.linestyle'] = ':' 13 | mpl.rcParams['grid.linewidth'] = 0.1 14 | 15 | objective_1 = 'conversion' 16 | objective_2 = 'selectivity' 17 | 18 | plt.rcParams['font.family'] = 'Helvetica' 19 | # mpl.rc('font', **{'family':'sans-serif', 'sans-serif':['HelveticaLight']}) 20 | 21 | # Best objectives. 22 | best_conversion_in_scope = 100. 23 | best_selectivity_in_scope = 100. 24 | n_steps = 60 25 | n_experiments = 60 26 | feat_iter = 0 27 | 28 | if not os.path.exists('./results_plots'): 29 | os.mkdir('results_plots') 30 | 31 | fig, ax = plt.subplots(figsize=(7., 2.5), dpi=500, nrows=1, ncols=3) 32 | 33 | colors_sampling = ['#DC143C', '#0343DF', '#FAC205', '#15B01A'] 34 | 35 | alphas = [0.4, 0.6, 0.7, 1.0] 36 | i = -1 37 | for sampling_method in ['seed', 'lhs', 'cvtsampling']: 38 | 39 | i += 1 40 | j = -1 41 | for batch in [1, 2, 3, 5]: 42 | j += 1 43 | acq = 'EHVI' 44 | 45 | df_i = pd.read_csv(f'./results/results_benchmark_dft_acq_{acq}_batch_{batch}_seed_1_init_sampling_{sampling_method}.csv') 46 | df_i = df_i[df_i['n_experiments'] <= n_experiments] 47 | 48 | # Hypervolume. 49 | hypervol = df_i['hypervolume completed (%)'].values[:] 50 | 51 | # Plot performance for each acquisition function. 52 | n_exp = df_i['n_experiments'].values[:] 53 | 54 | ax[i].plot(n_exp, hypervol, color=colors_sampling[j], lw=2.5, 55 | label=f"{batch}", 56 | alpha=alphas[j]) 57 | 58 | ax[i].set_title(f"{sampling_method}") 59 | ax[i].set_xlabel('Samples') 60 | ax[i].set_ylabel('Hypervolume (%)') 61 | ax[i].set_ylim(0, 100) 62 | 63 | ax[i].legend() 64 | plt.tight_layout() 65 | plt.savefig(f"results_plots/benchmark_hypervol.svg") 66 | 67 | plt.show() 68 | 69 | -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/5_plot_MAE_and_RMSE.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import pareto 6 | from pymoo.mcdm.high_tradeoff import HighTradeoffPoints 7 | from sklearn.preprocessing import MinMaxScaler 8 | 9 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto 10 | 11 | sns.set_style("ticks") 12 | import matplotlib as mpl 13 | # mpl.rcParams['grid.linestyle'] = ':' 14 | # mpl.rcParams['grid.linewidth'] = 0.1 15 | plt.rcParams['font.family'] = 'Helvetica' 16 | import joypy 17 | from matplotlib import cm 18 | 19 | ############## 20 | 21 | def get_pareto_points(objective_values): 22 | """ Get pareto for the ground truth function. 23 | NOTE: Assumes maximization.""" 24 | pareto_ground = pareto.eps_sort(tables=objective_values, 25 | objectives=np.arange(2), 26 | maximize_all=True) 27 | idx_pareto = is_pareto(objectives=-objective_values) 28 | return np.array(pareto_ground), idx_pareto 29 | 30 | def get_high_tradeoff_points(pareto_points): 31 | """ Pass a numpy array with the pareto points and returns a numpy 32 | array with the high tradeoff points.""" 33 | 34 | scaler_pareto = MinMaxScaler() 35 | pareto_scaled = scaler_pareto.fit_transform(pareto_points) 36 | try: 37 | tradeoff = HighTradeoffPoints() 38 | 39 | tradeoff_args = tradeoff.do(-pareto_scaled) # Always minimizing. 40 | tradeoff_points = pareto_points[tradeoff_args] 41 | except: 42 | tradeoff_points = [] 43 | pass 44 | return tradeoff_points 45 | 46 | 47 | df_exp = pd.read_csv('./data/experiments_yield_and_cost.csv') 48 | df_exp['cost'] = -df_exp['cost'] 49 | 50 | objective_vals = df_exp[['yield', 'cost']].values 51 | pareto_points, idx_pareto = get_pareto_points(objective_vals) 52 | high_tradeoff_points = get_high_tradeoff_points(pareto_points) 53 | 54 | ###### 55 | 56 | 57 | samplings = ['seed', 'lhs', 'cvtsampling'] 58 | batch_sizes = [1, 2, 3, 5] 59 | # colorpalettes = ['Blues', 'Reds', 'Greens', 'Oranges'] 60 | max_number_experiments = 45 61 | objective_1 = 'yield' 62 | objective_2 = 'cost' 63 | 64 | colors = ['blue', 'green', 'red'] 65 | 66 | df_all = pd.read_csv(f'./results/results_benchmark_dft_acq_EHVI_batch_{batch_sizes[0]}_seed_1_init_sampling_{samplings[0]}.csv') 67 | for i in batch_sizes: 68 | for j in samplings: 69 | df_i = pd.read_csv(f'./results/results_benchmark_dft_acq_EHVI_batch_{i}_seed_1_init_sampling_{j}.csv') 70 | df_i = df_i[df_i['n_experiments'] <= max_number_experiments] 71 | df_all = df_all.append(df_i, ignore_index=True) 72 | 73 | 74 | df_all.drop_duplicates(inplace=True) 75 | 76 | df_finish = df_all[(df_all['n_experiments'] < max_number_experiments+2) & (df_all['n_experiments'] > max_number_experiments-2)] 77 | 78 | fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(14, 2.2)) 79 | 80 | sns.barplot(data=df_finish, x='init_method', y='MAE_yield', 81 | hue='batch', ax=ax[0], palette='Blues', 82 | lw=0.7, edgecolor='black', ci=None) 83 | # ax[0].set_ylim((5, 18)) 84 | 85 | sns.barplot(data=df_finish, x='init_method', y='MAE_cost', 86 | hue='batch', ax=ax[1], palette='Reds', 87 | lw=0.7, edgecolor='black', ci=None) 88 | # ax[1].set_ylim(0.01) 89 | 90 | 91 | sns.barplot(data=df_finish, x='init_method', y='RMSE_yield', 92 | hue='batch', ax=ax[2], palette='Blues', 93 | lw=0.7, edgecolor='black', ci=None) 94 | # ax[2].set_ylim(10, 25) 95 | 96 | sns.barplot(data=df_finish, x='init_method', y='RMSE_cost', 97 | hue='batch', ax=ax[3], palette='Reds', 98 | lw=0.7, edgecolor='black', ci=None) 99 | # ax[3].set_ylim(0.01, 0.06) 100 | 101 | 102 | plt.savefig('./results_plots/fig2c.svg', format='svg', dpi=500) 103 | plt.tight_layout() 104 | plt.show() 105 | -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/6_distrib_plots.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | sns.set_style("ticks") 6 | import matplotlib as mpl 7 | # mpl.rcParams['grid.linestyle'] = ':' 8 | # mpl.rcParams['grid.linewidth'] = 0.1 9 | plt.rcParams['font.family'] = 'Helvetica' 10 | import joypy 11 | from matplotlib import cm 12 | 13 | samplings = ['seed', 'lhs', 'cvtsampling'] 14 | objective_1 = 'yield' 15 | objective_2 = 'cost' 16 | max_num_experiments = 46 17 | 18 | df_0 = pd.read_csv(f'./results/results_benchmark_dft_acq_EHVI_batch_3_seed_1_init_sampling_{samplings[0]}.csv') 19 | df_0['step'] += 1 20 | df_0 = df_0[df_0['n_experiments'] < max_num_experiments] 21 | 22 | df_1 = pd.read_csv(f'./results/results_benchmark_dft_acq_EHVI_batch_3_seed_1_init_sampling_{samplings[1]}.csv') 23 | df_1['step'] += 1 24 | df_1 = df_1[df_1['n_experiments'] < max_num_experiments] 25 | 26 | df_2 = pd.read_csv(f'./results/results_benchmark_dft_acq_EHVI_batch_3_seed_1_init_sampling_{samplings[2]}.csv') 27 | df_2['step'] += 1 28 | df_2 = df_2[df_2['n_experiments'] < max_num_experiments] 29 | 30 | frames = [df_0, df_1, df_2] 31 | colormaps_obj_1 = [cm.Blues] * 3 32 | colormaps_obj_2 = [cm.Reds] * 3 33 | # colormaps_obj_2 = [cm.PuRd] * 3 34 | # colormaps = [cm.autumn_r, cm.autumn_r, cm.cool, cm.summer] 35 | # pal = sns.cubehelix_palette(10, rot=-.25, light=.7, as_cmap=True) 36 | 37 | for i in range(0, 3): 38 | df = pd.concat(frames) 39 | 40 | plt.figure() 41 | ax, fig = joypy.joyplot( 42 | data=eval(f"df_{i}")[['step', f"{objective_1}_collected_values"]], 43 | by='step', 44 | linecolor='black', 45 | linewidth=0.7, 46 | ylim='own', 47 | column=['yield_collected_values'], 48 | colormap=colormaps_obj_1[i], 49 | legend=False, 50 | alpha=0.95, #bins=10, 51 | normalize=False, 52 | grid=False, 53 | figsize=(3, 3), #x_range=(0, 100) 54 | x_range=(0, 100) 55 | ) 56 | 57 | plt.savefig(f'./results_plots/subplot_{samplings[i]}_{objective_1}.svg', format='svg', dpi=500) 58 | plt.show() 59 | ax, fig = joypy.joyplot( 60 | data=eval(f"df_{i}")[['step', f"{objective_2}_collected_values"]], 61 | by='step', 62 | linecolor='black', 63 | linewidth=0.7, 64 | # hist=True, 65 | ylim='own', 66 | column=[f'{objective_2}_collected_values'], 67 | # color=['#686de0'], 68 | colormap=colormaps_obj_2[i], 69 | legend=False, 70 | alpha=0.95, #bins=10, 71 | normalize=False, grid=False, 72 | figsize=(3, 3), 73 | x_range=(0, 0.4) 74 | ) 75 | plt.savefig(f'./results_plots/subplot_{samplings[i]}_{objective_2}.svg', format='svg', dpi=500) 76 | plt.show() 77 | -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/7_plot_scope_expansion.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import seaborn as sns 7 | sns.set_style("ticks") 8 | sns.despine() 9 | import matplotlib as mpl 10 | mpl.rcParams['grid.linestyle'] = ':' 11 | mpl.rcParams['grid.linewidth'] = 0.1 12 | plt.rcParams['font.family'] = 'Helvetica' 13 | plt.rcParams['font.size'] = 10 14 | import pareto 15 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto 16 | from pymoo.mcdm.high_tradeoff import HighTradeoffPoints 17 | from sklearn.preprocessing import MinMaxScaler 18 | import seaborn as sns 19 | 20 | 21 | def get_pareto_points(objective_values): 22 | """ Get pareto for the ground truth function. 23 | NOTE: Assumes maximization.""" 24 | pareto_ground = pareto.eps_sort(tables=objective_values, 25 | objectives=np.arange(2), 26 | maximize_all=True) 27 | idx_pareto = is_pareto(objectives=-objective_values) 28 | return np.array(pareto_ground), idx_pareto 29 | 30 | 31 | def get_high_tradeoff_points(pareto_points): 32 | """ Pass a numpy array with the pareto points and returns a numpy 33 | array with the high tradeoff points.""" 34 | 35 | scaler_pareto = MinMaxScaler() 36 | pareto_scaled = scaler_pareto.fit_transform(pareto_points) 37 | try: 38 | tradeoff = HighTradeoffPoints() 39 | 40 | tradeoff_args = tradeoff.do(-pareto_scaled) # Always minimizing. 41 | tradeoff_points = pareto_points[tradeoff_args] 42 | except: 43 | tradeoff_points = [] 44 | pass 45 | return tradeoff_points 46 | 47 | 48 | df_exp = pd.read_csv('./data/experiments_yield_and_cost.csv') 49 | 50 | df_exp['cost'] = -df_exp['cost'] 51 | objective_vals = df_exp[['yield', 'cost']].values 52 | pareto_points, idx_pareto = get_pareto_points(objective_vals) 53 | high_tradeoff_points = get_high_tradeoff_points(pareto_points) 54 | 55 | print(np.unique(df_exp['base'].values)) 56 | 57 | fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5)) 58 | 59 | 60 | hues = ['ligand', 'base', 'solvent', 'concentration'] 61 | 62 | sns.scatterplot(x=df_exp['cost'], y=df_exp['yield'], 63 | hue=df_exp['ligand'], s=80, 64 | lw=0.01, edgecolor='black', 65 | ax=ax, palette='Spectral', 66 | style=df_exp['solvent'], 67 | ) 68 | sns.lineplot(x=pareto_points[:, 1], y=pareto_points[:, 0], 69 | linewidth=2, color='grey', ls='dotted', ax=ax) 70 | ax.set_xlim(-0.5, 0.02) 71 | ax.set_ylim(-10, 110) 72 | 73 | if not os.path.exists('results_plots'): 74 | os.mkdir('results_plots') 75 | 76 | plt.savefig(f'./results_plots/dataset.svg', format='svg', dpi=500) 77 | # plt.show() 78 | 79 | # Reduced space 80 | 81 | df_exp = pd.read_csv('./data/experiments_yield_and_cost.csv') 82 | 83 | # Removing a ligand. 84 | df_exp = df_exp[df_exp["ligand"].str.contains("CgMe-PPh")==False] 85 | df_exp = df_exp[df_exp["ligand"].str.contains("PPh3")==False] 86 | 87 | df_exp['cost'] = -df_exp['cost'] 88 | objective_vals = df_exp[['yield', 'cost']].values 89 | pareto_points, idx_pareto = get_pareto_points(objective_vals) 90 | high_tradeoff_points = get_high_tradeoff_points(pareto_points) 91 | 92 | print(np.unique(df_exp['base'].values)) 93 | 94 | fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5)) 95 | 96 | hues = ['ligand', 'base', 'solvent', 'concentration'] 97 | 98 | sns.scatterplot(x=df_exp['cost'], y=df_exp['yield'], 99 | hue=df_exp['ligand'], s=80, 100 | lw=0.01, edgecolor='black', 101 | ax=ax, palette='Spectral', 102 | style=df_exp['solvent'], 103 | ) 104 | sns.lineplot(x=pareto_points[:, 1], y=pareto_points[:, 0], 105 | linewidth=2, color='grey', ls='dotted', ax=ax) 106 | ax.set_xlim(-0.5, 0.02) 107 | ax.set_ylim(-10, 110) 108 | 109 | if not os.path.exists('results_plots'): 110 | os.mkdir('results_plots') 111 | plt.savefig(f'./results_plots/dataset_reduced.svg', format='svg', dpi=500) 112 | # plt.show() 113 | 114 | -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/8_optimization_expanding_scope.py: -------------------------------------------------------------------------------- 1 | from edbo.plus.optimizer_botorch import EDBOplus 2 | import pandas as pd 3 | import numpy as np 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | 7 | df_lookup = pd.read_csv('./data/experiments_yield_and_cost.csv') 8 | df_large = pd.read_csv('./data/experiments_yield_and_cost.csv') 9 | 10 | condition1 = df_large["ligand"].str.contains("CgMe-PPh")==False 11 | condition2 = df_large["ligand"].str.contains("PPh3")==False 12 | df_small = df_large[condition1 & condition2] 13 | 14 | # Refereces for plots. 15 | ref_best_yield_small_scope = np.max(df_small['yield']) 16 | ref_best_cost_small_scope = np.min(df_small['cost']) 17 | 18 | ref_best_yield_large_scope = np.max(df_large['yield']) 19 | ref_best_cost_large_scope = np.min(df_large['cost']) 20 | 21 | df_small.to_csv('./data/small_scope_lookup.csv', index=False) 22 | df_large.to_csv('./data/large_scope_lookup.csv', index=False) 23 | 24 | df_small.drop(columns=['yield', 'cost'], inplace=True) 25 | df_large.drop(columns=['yield', 'cost'], inplace=True) 26 | 27 | df_small.to_csv('./small_scope.csv', index=False) 28 | df_large.to_csv('./large_scope.csv', index=False) 29 | 30 | # Expand scope. 31 | df_expand = df_large.copy() 32 | condition1 = df_large["ligand"].str.contains("CgMe-PPh")==True 33 | condition2 = df_large["ligand"].str.contains("PPh3")==True 34 | df_expand = df_large[condition1 | condition2] 35 | df_expand['priority'] = np.zeros(len(df_expand)) 36 | df_expand['yield'] = ['PENDING'] * len(df_expand) 37 | df_expand['cost'] = ['PENDING'] * len(df_expand) 38 | 39 | print('References:') 40 | print('Small scope (best yield / best cost):', ref_best_yield_small_scope, ref_best_cost_small_scope) 41 | print('Large scope (best yield / best cost):',ref_best_yield_large_scope, ref_best_cost_large_scope) 42 | 43 | # Run optimization loops. 44 | n_rounds_small = 6 45 | n_round_large = 5 46 | batch_size = 3 47 | columns_regression = df_small.drop(columns=['new_index']).columns.tolist() 48 | 49 | n_experiments = 0 50 | 51 | track_results_dict = { 52 | 'n_experiments': [], 53 | 'best_yield': [], 54 | 'best_cost': [], 55 | 'max_ei_yield': [], 56 | 'max_ei_cost': [], 57 | 'max_uncertainty_yield': [], 58 | 'max_uncertainty_cost': [], 59 | 'avg_uncertainty_yield': [], 60 | 'avg_uncertainty_cost': [], 61 | } 62 | 63 | collected_yields = [] 64 | collected_costs = [] 65 | 66 | for round in range(0, n_rounds_small): 67 | EDBOplus().run( 68 | filename='small_scope.csv', # Previously generated scope. 69 | objectives=['yield', 'cost'], # Objectives to be optimized. 70 | objective_mode=['max', 'min'], # Maximize yield and ee but minimize side_product. 71 | batch=batch_size, # Number of experiments in parallel that we want to perform in this round. 72 | columns_features=columns_regression, # features to be included in the model. 73 | init_sampling_method='cvtsampling' # initialization method. 74 | ) 75 | 76 | n_experiments += batch_size 77 | # Update with experimental values (observations). 78 | df_results = pd.read_csv('small_scope.csv') 79 | arg_lookup = df_results.loc[0:batch_size-1]['new_index'].values 80 | 81 | for a in range(len(arg_lookup)): 82 | df_results.at[a,'yield'] = df_lookup.loc[arg_lookup[a]]['yield'] 83 | df_results.at[a,'cost'] = df_lookup.loc[arg_lookup[a]]['cost'] 84 | collected_yields.append(df_lookup.loc[arg_lookup[a]]['yield']) 85 | collected_costs.append(df_lookup.loc[arg_lookup[a]]['cost']) 86 | 87 | df_results.to_csv('small_scope.csv', index=False) 88 | 89 | if round > 0: 90 | # Save all predicted values. 91 | df_pred = pd.read_csv('pred_small_scope.csv') 92 | max_ei_yield = np.max(df_pred['yield_expected_improvement']) 93 | max_ei_cost = np.max(df_pred['cost_expected_improvement']) 94 | max_uncertainty_yield = np.max((df_pred['yield_predicted_variance'])) 95 | max_uncertainty_cost = np.max((df_pred['cost_predicted_variance'])) 96 | avg_uncertainty_yield = np.average((df_pred['yield_predicted_variance'])) 97 | avg_uncertainty_cost = np.average((df_pred['cost_predicted_variance'])) 98 | best_yield = np.max(collected_yields) 99 | best_cost = np.min(collected_costs) 100 | track_results_dict['n_experiments'].append(n_experiments) 101 | track_results_dict['best_yield'].append(best_yield) 102 | track_results_dict['best_cost'].append(best_cost) 103 | track_results_dict['max_ei_yield'].append(max_ei_yield) 104 | track_results_dict['max_ei_cost'].append(max_ei_cost) 105 | track_results_dict['max_uncertainty_yield'].append(max_uncertainty_yield) 106 | track_results_dict['max_uncertainty_cost'].append(max_uncertainty_cost) 107 | track_results_dict['avg_uncertainty_yield'].append(avg_uncertainty_yield) 108 | track_results_dict['avg_uncertainty_cost'].append(avg_uncertainty_cost) 109 | 110 | # Plot before expanding: 111 | fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(7, 7)) 112 | 113 | sns.scatterplot( 114 | x=np.array(track_results_dict['n_experiments']), 115 | y=np.array(track_results_dict['max_ei_yield']), ax=ax[0][0], color='C1', s=100, 116 | zorder=100 117 | ) 118 | sns.scatterplot( 119 | x=track_results_dict['n_experiments'], 120 | y=track_results_dict['max_ei_cost'], ax=ax[0][1], color='C1', s=100, 121 | zorder=100 122 | ) 123 | sns.scatterplot( 124 | x=track_results_dict['n_experiments'], 125 | y=track_results_dict['best_yield'], ax=ax[1][0], color='C1', s=100, 126 | zorder=100 127 | ) 128 | sns.scatterplot( 129 | x=track_results_dict['n_experiments'], 130 | y=track_results_dict['best_cost'], ax=ax[1][1], color='C1',s=100, 131 | zorder=100 132 | ) 133 | 134 | ax[0][0].set_xlabel('Number of experiments') 135 | ax[0][1].set_xlabel('Number of experiments') 136 | ax[1][0].set_xlabel('Number of experiments') 137 | ax[1][1].set_xlabel('Number of experiments') 138 | ax[0][0].set_ylabel('Max EI (yield)') 139 | ax[0][1].set_ylabel('Max EI (cost)') 140 | ax[1][0].set_ylabel('Highest yield found') 141 | ax[1][1].set_ylabel('Lowest cost found') 142 | 143 | 144 | # Expand scope: 145 | df_small = pd.read_csv('small_scope.csv') 146 | df_expand = df_expand.append(df_small) 147 | df_expand.sort_values(by=['priority'], ascending=False, inplace=True) 148 | df_expand.to_csv('expanded_scope.csv', index=False) 149 | 150 | n_experiments -= batch_size 151 | 152 | # Keep optimizing after expanding. 153 | for round in range(0, n_round_large): 154 | EDBOplus().run( 155 | filename='expanded_scope.csv', # Previously generated scope. 156 | objectives=['yield', 'cost'], # Objectives to be optimized. 157 | objective_mode=['max', 'min'], # Maximize yield and ee but minimize side_product. 158 | batch=batch_size, # Number of experiments in parallel that we want to perform in this round. 159 | columns_features=columns_regression, # features to be included in the model. 160 | init_sampling_method='cvtsampling' # initialization method. 161 | ) 162 | 163 | n_experiments += batch_size 164 | # Update with experimental values (observations). 165 | df_results = pd.read_csv('expanded_scope.csv') 166 | arg_lookup = df_results.loc[0:batch_size-1]['new_index'].values 167 | 168 | for a in range(len(arg_lookup)): 169 | df_results.at[a,'yield'] = df_lookup.loc[arg_lookup[a]]['yield'] 170 | df_results.at[a,'cost'] = df_lookup.loc[arg_lookup[a]]['cost'] 171 | collected_yields.append(df_lookup.loc[arg_lookup[a]]['yield']) 172 | collected_costs.append(df_lookup.loc[arg_lookup[a]]['cost']) 173 | 174 | df_results.to_csv('expanded_scope.csv', index=False) 175 | 176 | if round > 0: 177 | # Save all predicted values. 178 | df_pred = pd.read_csv('pred_expanded_scope.csv') 179 | max_ei_yield = np.max(df_pred['yield_expected_improvement']) 180 | max_ei_cost = np.max(df_pred['cost_expected_improvement']) 181 | max_uncertainty_yield = np.max((df_pred['yield_predicted_variance'])) 182 | max_uncertainty_cost = np.max((df_pred['cost_predicted_variance'])) 183 | avg_uncertainty_yield = np.average((df_pred['yield_predicted_variance'])) 184 | avg_uncertainty_cost = np.average((df_pred['cost_predicted_variance'])) 185 | best_yield = np.max(collected_yields) 186 | best_cost = np.min(collected_costs) 187 | track_results_dict['n_experiments'].append(n_experiments) 188 | track_results_dict['best_yield'].append(best_yield) 189 | track_results_dict['best_cost'].append(best_cost) 190 | track_results_dict['max_ei_yield'].append(max_ei_yield) 191 | track_results_dict['max_ei_cost'].append(max_ei_cost) 192 | track_results_dict['max_uncertainty_yield'].append(max_uncertainty_yield) 193 | track_results_dict['avg_uncertainty_yield'].append(avg_uncertainty_yield) 194 | track_results_dict['avg_uncertainty_cost'].append(avg_uncertainty_cost) 195 | 196 | 197 | sns.scatterplot( 198 | x=np.array(track_results_dict['n_experiments']), 199 | y=np.array(track_results_dict['max_ei_yield']), ax=ax[0][0], color='C0', s=95, 200 | zorder=10 201 | ) 202 | sns.scatterplot( 203 | x=track_results_dict['n_experiments'], 204 | y=track_results_dict['max_ei_cost'], ax=ax[0][1], color='C0', s=95, 205 | ) 206 | sns.scatterplot( 207 | x=track_results_dict['n_experiments'], 208 | y=track_results_dict['best_yield'], ax=ax[1][0], color='C0', s=95, 209 | zorder=10 210 | ) 211 | sns.scatterplot( 212 | x=track_results_dict['n_experiments'], 213 | y=track_results_dict['best_cost'], ax=ax[1][1], color='C0',s=95, 214 | zorder=10 215 | ) 216 | 217 | plt.tight_layout() 218 | plt.savefig('./results_plots/expand_scope.svg', format='svg') 219 | plt.show() 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/9_optimization_constraints.py: -------------------------------------------------------------------------------- 1 | 2 | from edbo.plus.optimizer_botorch import EDBOplus 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import pareto 7 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto 8 | import torch 9 | from botorch.utils.multi_objective.hypervolume import Hypervolume 10 | import copy 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | import matplotlib as mpl 15 | 16 | # # Metrics. 17 | # def get_pareto_points(objective_values): 18 | # """ Get pareto for the ground truth function. 19 | # NOTE: Assumes maximization.""" 20 | # pareto_ground = pareto.eps_sort(tables=objective_values, 21 | # objectives=np.arange(2), 22 | # maximize_all=True) 23 | # idx_pareto = is_pareto(objectives=-objective_values) 24 | # return np.array(pareto_ground), idx_pareto 25 | 26 | # def get_hypervolume(pareto_points, ref_mins): 27 | # """ 28 | # Calculate hypervolume. 29 | # """ 30 | # pareto_torch = torch.Tensor(pareto_points) 31 | # hv = Hypervolume(ref_point=torch.Tensor(ref_mins)) 32 | # hypervolume = hv.compute(pareto_Y=pareto_torch) 33 | # return hypervolume 34 | 35 | 36 | # # Combinations of constraints tested in this example. 37 | # # Columns that remain constant after EDBO suggest the best sample using batch=1. 38 | set_constraints = [ 39 | ['ligand'], 40 | ['ligand', 'base'], 41 | ['solvent', 'concentration', 'temperature'], 42 | ] 43 | 44 | # df_results = pd.DataFrame(columns=['seed', 'constraints', 45 | # 'n_exp', 'hypervolume']) 46 | 47 | # for columns_to_constrain in set_constraints: 48 | # # Parameters. 49 | # batch_size = 5 50 | # # columns_to_constrain = ['solvent', 'concentration', 'temperature'] 51 | # n_rounds = 7 52 | # n_seeds = 5 53 | # # Load lookup tables. 54 | # df_hte = pd.read_csv('./data/experiments_yield_and_cost.csv') 55 | # # Get targets for hypervolume indicator. 56 | # targets_hte = np.zeros((len(df_hte), 2)) 57 | # targets_hte[:, 0] = df_hte['yield'].to_numpy() 58 | # targets_hte[:, 1] = -df_hte['cost'].to_numpy() 59 | # worst_targets = np.min(targets_hte, axis=0) 60 | # pareto_ref = get_pareto_points(objective_values=targets_hte)[0] 61 | # hypervolume_ref = get_hypervolume(pareto_points=pareto_ref, ref_mins=worst_targets) 62 | 63 | # # Get columns names for regression and search space. 64 | # columns_search_space = df_hte.drop(columns=['yield', 'cost']).columns.tolist() 65 | # columns_regression = df_hte.drop(columns=['new_index', 'yield', 'cost']).columns.tolist() 66 | # df_full_space = df_hte[columns_search_space] 67 | 68 | # # Initialize optimization campaing. 69 | # for seed in range(0, n_seeds): 70 | # n_exp = 0 71 | # df_full_space.to_csv('optimization.csv', index=False) 72 | # for round in range(0, n_rounds): 73 | # EDBOplus().run( 74 | # filename='optimization.csv', 75 | # seed=seed, 76 | # objectives=['yield', 'cost'], 77 | # objective_mode=['max', 'min'], # Maximize yield but minimize cost. 78 | # batch=1, 79 | # columns_features=columns_regression, # features to be included in the model. 80 | # init_sampling_method='cvtsampling' # initialization method. 81 | # ) 82 | 83 | # df_opt = pd.read_csv('optimization.csv') 84 | 85 | # # Initial optimization to obtain the best sample in the entire search space. 86 | # best_suggested_sample = df_opt.loc[0] 87 | # df_reduced_space = df_opt.copy() 88 | # for col in columns_to_constrain: 89 | # df_reduced_space = df_reduced_space[df_reduced_space[col] == best_suggested_sample[col]] 90 | 91 | # df_reduced_space.drop(columns=['yield', 'cost', 'priority'], inplace=True) 92 | # df_reduced_space.to_csv('optimization_reduced.csv', index=False) 93 | 94 | # EDBOplus().run( 95 | # filename='optimization_reduced.csv', # Previously generated scope. 96 | # objectives=['yield', 'cost'], # Objectives to be optimized. 97 | # objective_mode=['max', 'min'], # Maximize yield and ee but minimize side_product. 98 | # batch=batch_size, 99 | # seed=seed, 100 | # columns_features=columns_regression, # features to be included in the model. 101 | # init_sampling_method='cvtsampling' # initialization method. 102 | # ) 103 | 104 | # df_opt_reduced = pd.read_csv('optimization_reduced.csv') 105 | 106 | # idx_best_samples = df_opt_reduced['new_index'].values.tolist()[:batch_size] 107 | # print('Index best samples:', idx_best_samples) 108 | # df_opt = df_opt.sort_values(by='new_index') 109 | # df_opt.reset_index(inplace=True) 110 | # df_opt.drop(columns=['index'], inplace=True) 111 | 112 | # for a in range(len(idx_best_samples)): 113 | # df_opt.at[idx_best_samples[a],'yield'] = df_hte.loc[idx_best_samples[a]]['yield'] 114 | # df_opt.at[idx_best_samples[a],'cost'] = df_hte.loc[idx_best_samples[a]]['cost'] 115 | # df_opt.at[idx_best_samples[a],'priority'] = 1 116 | 117 | # df_opt = df_opt.sort_values(by='priority', ascending=False) 118 | # df_opt.to_csv('optimization.csv', index=False) 119 | 120 | # # Monitoring hypervolume. 121 | # df_train = df_opt[df_opt['yield'] != 'PENDING'] 122 | # df_train['yield'] = copy.deepcopy(pd.to_numeric(df_train['yield'])) 123 | # df_train['cost'] = copy.deepcopy(pd.to_numeric(df_train['cost'])) 124 | 125 | # targets_train = np.zeros((len(df_train), 2)) 126 | # targets_train[:, 0] = df_train['yield'].to_numpy() 127 | # targets_train[:, 1] = -df_train['cost'].to_numpy() 128 | # pareto_train = get_pareto_points(objective_values=targets_train)[0] 129 | # hypervolume_train = get_hypervolume(pareto_points=pareto_train, 130 | # ref_mins=worst_targets) 131 | # hypervolume_explored = (hypervolume_train/hypervolume_ref) * 100 132 | 133 | # n_exp += batch_size 134 | # print(f"Number of samples: {n_exp}") 135 | # print(f"Hypervolume: {hypervolume_explored}") 136 | 137 | # dict_results = {'seed': seed, 138 | # 'constraints': columns_to_constrain, 139 | # 'n_exp': n_exp, 140 | # 'hypervolume': hypervolume_explored} 141 | # df_results = df_results.append(dict_results, ignore_index=True) 142 | # df_results.to_csv('constraint_optimization_results.csv') 143 | 144 | 145 | # Plot results. 146 | df_results = pd.read_csv('constraint_optimization_results.csv') 147 | colors = [ '#0343DF', '#FAC205', '#DC143C'] 148 | count = 0 149 | 150 | mpl.rcParams['grid.linestyle'] = ':' 151 | mpl.rcParams['grid.linewidth'] = 0.1 152 | plt.rcParams['font.family'] = 'Helvetica' 153 | 154 | fig, ax = plt.subplots(figsize=(4., 4.0), dpi=500, nrows=1, ncols=1) 155 | 156 | for constraints in set_constraints: 157 | # Get subset for constraints. 158 | constraints = str(constraints) 159 | df_constraint = df_results[df_results['constraints'] == constraints] 160 | 161 | # Get average, max and min hypervolume explored at each step. 162 | df_avg = df_constraint.groupby(['n_exp']).agg([np.average]) 163 | df_max = df_constraint.groupby(['n_exp']).agg([np.max]) 164 | df_min = df_constraint.groupby(['n_exp']).agg([np.min]) 165 | 166 | 167 | n_exp = np.unique(df_results['n_exp'].values).flatten() 168 | hypervol_avg = df_avg['hypervolume'].values.flatten() 169 | hypervol_max = df_max['hypervolume'].values.flatten() 170 | hypervol_min = df_min['hypervolume'].values.flatten() 171 | 172 | color = colors[count] 173 | 174 | ax.plot(n_exp, hypervol_avg, color=color, lw=2.5, 175 | label=str(constraints)) 176 | ax.fill_between(x=n_exp, 177 | y1=hypervol_avg, 178 | y2=hypervol_max, color=color, alpha=0.3, lw=0.) 179 | ax.fill_between(x=n_exp, 180 | y1=hypervol_min, 181 | y2=hypervol_avg, color=color, alpha=0.3, lw=0.) 182 | ax.plot(n_exp, hypervol_min, color=color, alpha=1., lw=1., ls='--') 183 | ax.plot(n_exp, hypervol_max, color=color, alpha=1., lw=1., ls='--') 184 | ax.plot(n_exp, np.ones_like(n_exp)*100, 185 | dashes=[8, 4], color='black', linewidth=0.8) 186 | ax.scatter(n_exp, hypervol_avg, marker='o', s=0., color=color) 187 | count += 1 188 | 189 | ax.set_xticks(np.arange(0, 120, 10)) 190 | ax.set_xlim(0, np.max(n_exp[:-1])) 191 | ax.set_ylim(0, 100) 192 | ax.set_xlabel('Number of experiments') 193 | ax.set_ylabel('Hypervolume (%)') 194 | plt.legend() 195 | plt.savefig('./results_plots/optimization_constraints.svg', format='svg') 196 | -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/data/base_dft.csv: -------------------------------------------------------------------------------- 1 | base_file_name,base_SMILES,base_stoichiometry,base_number_of_atoms,base_charge,base_multiplicity,base_convergence_criteria,base_dipole,base_molar_mass,base_molar_volume,base_electronic_spatial_extent,base_homo_energy,base_lumo_energy,base_electronegativity,base_hardness,base_electrophilicity,base_E_scf,base_zero_point_correction,base_E_thermal_correction,base_H_thermal_correction,base_G_thermal_correction,base_E_zpe,base_E,base_H,base_G,base_ES_root_dipole,base_ES_root_molar_volume,base_ES_root_electronic_spatial_extent,base_ES1_transition,base_ES1_osc_strength,base_ES1_,base_ES2_transition,base_ES2_osc_strength,base_ES2_,base_ES3_transition,base_ES3_osc_strength,base_ES3_,base_ES4_transition,base_ES4_osc_strength,base_ES4_,base_ES5_transition,base_ES5_osc_strength,base_ES5_,base_ES6_transition,base_ES6_osc_strength,base_ES6_,base_ES7_transition,base_ES7_osc_strength,base_ES7_,base_ES8_transition,base_ES8_osc_strength,base_ES8_,base_ES9_transition,base_ES9_osc_strength,base_ES9_,base_ES10_transition,base_ES10_osc_strength,base_ES10_,base_atom1_atom,base_atom1_Mulliken_charge,base_atom1_APT_charge,base_atom1_NPA_charge,base_atom1_NPA_core,base_atom1_NPA_valence,base_atom1_NPA_Rydberg,base_atom1_NPA_total,base_atom1_NMR_shift,base_atom1_NMR_anisotropy,base_atom1_ES_root_Mulliken_charge,base_atom1_ES_root_NPA_charge,base_atom1_ES_root_NPA_core,base_atom1_ES_root_NPA_valence,base_atom1_ES_root_NPA_Rydberg,base_atom1_ES_root_NPA_total,base_atom2_atom,base_atom2_Mulliken_charge,base_atom2_APT_charge,base_atom2_NPA_charge,base_atom2_NPA_core,base_atom2_NPA_valence,base_atom2_NPA_Rydberg,base_atom2_NPA_total,base_atom2_NMR_shift,base_atom2_NMR_anisotropy,base_atom2_ES_root_Mulliken_charge,base_atom2_ES_root_NPA_charge,base_atom2_ES_root_NPA_core,base_atom2_ES_root_NPA_valence,base_atom2_ES_root_NPA_Rydberg,base_atom2_ES_root_NPA_total,base_atom3_atom,base_atom3_Mulliken_charge,base_atom3_APT_charge,base_atom3_NPA_charge,base_atom3_NPA_core,base_atom3_NPA_valence,base_atom3_NPA_Rydberg,base_atom3_NPA_total,base_atom3_NMR_shift,base_atom3_NMR_anisotropy,base_atom3_ES_root_Mulliken_charge,base_atom3_ES_root_NPA_charge,base_atom3_ES_root_NPA_core,base_atom3_ES_root_NPA_valence,base_atom3_ES_root_NPA_Rydberg,base_atom3_ES_root_NPA_total,base_c_min_atom_number,base_c_min_atom,base_c_min_atom=O,base_c_min_Mulliken_charge,base_c_min_APT_charge,base_c_min_NPA_charge,base_c_min_NPA_core,base_c_min_NPA_valence,base_c_min_NPA_Rydberg,base_c_min_NPA_total,base_c_min_NMR_shift,base_c_min_NMR_anisotropy,base_c_min_ES_root_Mulliken_charge,base_c_min_ES_root_NPA_charge,base_c_min_ES_root_NPA_core,base_c_min_ES_root_NPA_valence,base_c_min_ES_root_NPA_Rydberg,base_c_min_ES_root_NPA_total,base_c_min+1_atom_number,base_c_min+1_atom,base_c_min+1_atom=O,base_c_min+1_Mulliken_charge,base_c_min+1_APT_charge,base_c_min+1_NPA_charge,base_c_min+1_NPA_core,base_c_min+1_NPA_valence,base_c_min+1_NPA_Rydberg,base_c_min+1_NPA_total,base_c_min+1_NMR_shift,base_c_min+1_NMR_anisotropy,base_c_min+1_ES_root_Mulliken_charge,base_c_min+1_ES_root_NPA_charge,base_c_min+1_ES_root_NPA_core,base_c_min+1_ES_root_NPA_valence,base_c_min+1_ES_root_NPA_Rydberg,base_c_min+1_ES_root_NPA_total,base_c_max_atom_number,base_c_max_atom,base_c_max_atom=Cs,base_c_max_atom=K,base_c_max_Mulliken_charge,base_c_max_APT_charge,base_c_max_NPA_charge,base_c_max_NPA_core,base_c_max_NPA_valence,base_c_max_NPA_Rydberg,base_c_max_NPA_total,base_c_max_NMR_shift,base_c_max_NMR_anisotropy,base_c_max_ES_root_Mulliken_charge,base_c_max_ES_root_NPA_charge,base_c_max_ES_root_NPA_core,base_c_max_ES_root_NPA_valence,base_c_max_ES_root_NPA_Rydberg,base_c_max_ES_root_NPA_total,base_c_max-1_atom_number,base_c_max-1_atom,base_c_max-1_atom=C,base_c_max-1_Mulliken_charge,base_c_max-1_APT_charge,base_c_max-1_NPA_charge,base_c_max-1_NPA_core,base_c_max-1_NPA_valence,base_c_max-1_NPA_Rydberg,base_c_max-1_NPA_total,base_c_max-1_NMR_shift,base_c_max-1_NMR_anisotropy,base_c_max-1_ES_root_Mulliken_charge,base_c_max-1_ES_root_NPA_charge,base_c_max-1_ES_root_NPA_core,base_c_max-1_ES_root_NPA_valence,base_c_max-1_ES_root_NPA_Rydberg,base_c_max-1_ES_root_NPA_total,base_vib_1_vibration,base_vib_1_standard_vibration,base_vib_1_correlation,base_vib_1_frequency,base_vib_1_reduced_mass,base_vib_1_frc_const,base_vib_1_IR_intensity,base_vib_1_dip_strength,base_vib_1_rot_strength,base_vib_1_E-M_angle,base_vib_1_standard_frequency,base_vib_1_standard_reduced_mass,base_vib_1_standard_frc_const,base_vib_1_standard_IR_intensity,base_vib_1_standard_dip_strength,base_vib_1_standard_rot_strength,base_vib_1_standard_E-M_angle,base_vib_2_vibration,base_vib_2_standard_vibration,base_vib_2_correlation,base_vib_2_frequency,base_vib_2_reduced_mass,base_vib_2_frc_const,base_vib_2_IR_intensity,base_vib_2_dip_strength,base_vib_2_rot_strength,base_vib_2_E-M_angle,base_vib_2_standard_frequency,base_vib_2_standard_reduced_mass,base_vib_2_standard_frc_const,base_vib_2_standard_IR_intensity,base_vib_2_standard_dip_strength,base_vib_2_standard_rot_strength,base_vib_2_standard_E-M_angle,base_vib_3_vibration,base_vib_3_standard_vibration,base_vib_3_correlation,base_vib_3_frequency,base_vib_3_reduced_mass,base_vib_3_frc_const,base_vib_3_IR_intensity,base_vib_3_dip_strength,base_vib_3_rot_strength,base_vib_3_E-M_angle,base_vib_3_standard_frequency,base_vib_3_standard_reduced_mass,base_vib_3_standard_frc_const,base_vib_3_standard_IR_intensity,base_vib_3_standard_dip_strength,base_vib_3_standard_rot_strength,base_vib_3_standard_E-M_angle,base_vib_4_vibration,base_vib_4_standard_vibration,base_vib_4_correlation,base_vib_4_frequency,base_vib_4_reduced_mass,base_vib_4_frc_const,base_vib_4_IR_intensity,base_vib_4_dip_strength,base_vib_4_rot_strength,base_vib_4_E-M_angle,base_vib_4_standard_frequency,base_vib_4_standard_reduced_mass,base_vib_4_standard_frc_const,base_vib_4_standard_IR_intensity,base_vib_4_standard_dip_strength,base_vib_4_standard_rot_strength,base_vib_4_standard_E-M_angle,base_vib_5_vibration,base_vib_5_standard_vibration,base_vib_5_correlation,base_vib_5_frequency,base_vib_5_reduced_mass,base_vib_5_frc_const,base_vib_5_IR_intensity,base_vib_5_dip_strength,base_vib_5_rot_strength,base_vib_5_E-M_angle,base_vib_5_standard_frequency,base_vib_5_standard_reduced_mass,base_vib_5_standard_frc_const,base_vib_5_standard_IR_intensity,base_vib_5_standard_dip_strength,base_vib_5_standard_rot_strength,base_vib_5_standard_E-M_angle,base_vib_6_vibration,base_vib_6_standard_vibration,base_vib_6_correlation,base_vib_6_frequency,base_vib_6_reduced_mass,base_vib_6_frc_const,base_vib_6_IR_intensity,base_vib_6_dip_strength,base_vib_6_rot_strength,base_vib_6_E-M_angle,base_vib_6_standard_frequency,base_vib_6_standard_reduced_mass,base_vib_6_standard_frc_const,base_vib_6_standard_IR_intensity,base_vib_6_standard_dip_strength,base_vib_6_standard_rot_strength,base_vib_6_standard_E-M_angle,base_vib_7_vibration,base_vib_7_standard_vibration,base_vib_7_correlation,base_vib_7_frequency,base_vib_7_reduced_mass,base_vib_7_frc_const,base_vib_7_IR_intensity,base_vib_7_dip_strength,base_vib_7_rot_strength,base_vib_7_E-M_angle,base_vib_7_standard_frequency,base_vib_7_standard_reduced_mass,base_vib_7_standard_frc_const,base_vib_7_standard_IR_intensity,base_vib_7_standard_dip_strength,base_vib_7_standard_rot_strength,base_vib_7_standard_E-M_angle,base_vib_8_vibration,base_vib_8_standard_vibration,base_vib_8_correlation,base_vib_8_frequency,base_vib_8_reduced_mass,base_vib_8_frc_const,base_vib_8_IR_intensity,base_vib_8_dip_strength,base_vib_8_rot_strength,base_vib_8_E-M_angle,base_vib_8_standard_frequency,base_vib_8_standard_reduced_mass,base_vib_8_standard_frc_const,base_vib_8_standard_IR_intensity,base_vib_8_standard_dip_strength,base_vib_8_standard_rot_strength,base_vib_8_standard_E-M_angle,base_atom1_%VBur,base_atom2_%VBur,base_atom3_%VBur,base_c_min_%VBur,base_c_min+1_%VBur,base_c_max_%VBur,base_c_max-1_%VBur 2 | CsOAc,O=C([O-])C.[Cs+],C2H3CsO2,8,0,1,met,10.1478,191.9499,406.01,876.2473,-0.16646,-0.05377,0.11011499999999999,0.056345,0.056345,-248.406264045,0.049939,0.056406,0.05735,0.015986,-248.356325,-248.349858,-248.348914,-248.390278,8.2546,1380.434,909.9558,498.95,0.0037,0.000,432.49,0.1696,0.000,429.03,0.0000,0.000,324.62,0.0000,0.000,312.85,0.0122,0.000,308.18,0.0425,0.000,299.13,0.0286,0.000,290.79,0.0138,0.000,290.15,0.0014,0.000,281.64,0.0007,0.000,O1,-0.630819,-0.948476,-0.82325,1.99975,6.80840,0.01510,8.82325,37.0240,325.6241,-0.367329,-0.45721,1.99977,6.44316,0.01428,8.45721,O2,-0.627311,-0.929421,-0.81310,1.99975,6.79835,0.01500,8.81310,12.2909,315.3062,-0.370284,-0.45523,1.99977,6.44127,0.01419,8.45523,Cs3,0.845450,0.946019,0.93442,53.99587,0.05739,0.01232,54.06558,77.7746,4.3447,0.102280,0.12092,54.00014,0.75463,0.12431,54.87908,1,O,1,-0.630819,-0.948476,-0.82325,1.99975,6.80840,0.01510,8.82325,37.0240,325.6241,-0.367329,-0.45721,1.99977,6.44316,0.01428,8.45721,3,O,1,-0.627311,-0.929421,-0.81310,1.99975,6.79835,0.01500,8.81310,12.2909,315.3062,-0.370284,-0.45523,1.99977,6.44127,0.01419,8.45523,5,Cs,1,0,0.845450,0.946019,0.93442,53.99587,0.05739,0.01232,54.06558,77.7746,4.3447,0.102280,0.12092,54.00014,0.75463,0.12431,54.87908,2,C,1,0.503244,1.052880,0.79419,1.99945,3.15120,0.05516,5.20581,23.9298,101.6692,0.597363,0.80166,1.99946,3.14820,0.05067,5.19834,6,6,1.,618.7872,2.451,0.5529,7.6617,49.3959,0.0056,89.9964,618.7872,2.451,0.5529,7.6617,49.3959,0.0056,89.9964,7,7,1.,646.437,6.0553,1.4909,18.1353,111.9193,-0.0052,90.08,646.437,6.0553,1.4909,18.1353,111.9193,-0.0052,90.08,8,8,1.0000000000000002,916.3218,7.2493,3.5863,16.7246,72.8142,0.0002,89.9953,916.3218,7.2493,3.5863,16.7246,72.8142,0.0002,89.9953,9,9,1.0000000000000002,1027.1507,1.4579,0.9063,6.8743,26.6993,0.0013,89.997,1027.1507,1.4579,0.9063,6.8743,26.6993,0.0013,89.997,10,10,1.,1063.8484,1.8673,1.2451,6.1805,23.1767,-0.0021,90.0022,1063.8484,1.8673,1.2451,6.1805,23.1767,-0.0021,90.0022,11,11,1.0000000000000002,1377.1441,1.4156,1.5818,25.4311,73.6705,0.0003,89.9992,1377.1441,1.4156,1.5818,25.4311,73.6705,0.0003,89.9992,12,12,1.,1428.8053,5.1995,6.254,180.9276,505.1719,0.0002,89.9995,1428.8053,5.1995,6.254,180.9276,505.1719,0.0002,89.9995,15,15,0.9999999999999999,1677.2563,8.6498,14.3368,430.3053,1023.4925,0.,90.,1677.2563,8.6498,14.3368,430.3053,1023.4925,0.,90.,0.4621717912637346,0.4583096314481602,0.4355558216819856,0.4621717912637346,0.4583096314481602,0.4355558216819856,0.5387263854375617 3 | CsOPiv,O=C([O-])C(C)(C)C.[Cs+],C5H9CsO2,17,0,1,met,11.0112,234.0303,1542.838,1831.5572,-0.17387,-0.05538,0.114625,0.059245,0.059245,-366.348293098,0.135133,0.145329,0.146273,0.096898,-366.21316,-366.202964,-366.20202,-366.251395,8.9444,1218.03,1885.5803,469.32,0.0031,0.000,434.42,0.1679,0.000,409.02,0.0000,0.000,311.54,0.0012,0.000,307.21,0.0401,0.000,301.54,0.0076,0.000,297.70,0.0363,0.000,281.05,0.0009,0.000,280.56,0.0077,0.000,275.55,0.0018,0.000,O1,-0.626594,-0.907646,-0.80886,1.99975,6.79434,0.01477,8.80886,36.4140,345.8849,-0.363461,-0.43354,1.99977,6.41981,0.01396,8.43354,O2,-0.640045,-0.928433,-0.82255,1.99975,6.80799,0.01481,8.82255,56.9329,339.6416,-0.390175,-0.47421,1.99977,6.46010,0.01434,8.47421,Cs3,0.848316,0.958216,0.93820,53.99606,0.05620,0.00954,54.06180,77.7495,5.0811,0.075126,0.10338,53.99940,0.78274,0.11448,54.89662,3,O,1,-0.640045,-0.928433,-0.82255,1.99975,6.80799,0.01481,8.82255,56.9329,339.6416,-0.390175,-0.47421,1.99977,6.46010,0.01434,8.47421,1,O,1,-0.626594,-0.907646,-0.80886,1.99975,6.79434,0.01477,8.80886,36.4140,345.8849,-0.363461,-0.43354,1.99977,6.41981,0.01396,8.43354,8,Cs,1,0,0.848316,0.958216,0.93820,53.99606,0.05620,0.00954,54.06180,77.7495,5.0811,0.075126,0.10338,53.99940,0.78274,0.11448,54.89662,2,C,1,0.513009,0.941821,0.81589,1.99929,3.12499,0.05983,5.18411,16.0679,110.6588,0.611765,0.82753,1.99928,3.11710,0.05610,5.17247,15,6,0.9959074339135122,794.1344,6.9049,2.5656,5.4827,27.5427,0.0024,89.9968,618.7872,2.451,0.5529,7.6617,49.3959,0.0056,89.9964,14,7,0.9816918876799305,588.297,3.5036,0.7144,10.1352,68.7297,0.0015,89.864,646.437,6.0553,1.4909,18.1353,111.9193,-0.0052,90.08,17,8,-0.9966831868163546,888.0265,3.386,1.5732,30.4925,136.9854,0.009,89.8991,916.3218,7.2493,3.5863,16.7246,72.8142,0.0002,89.9953,22,9,0.9443745773600293,1059.5097,1.4077,0.9311,0.7714,2.9047,-0.0667,90.3188,1027.1507,1.4579,0.9063,6.8743,26.6993,0.0013,89.997,23,10,-0.9999999970969979,1243.8906,2.7116,2.4719,4.866,15.6061,-0.0039,90.0381,1063.8484,1.8673,1.2451,6.1805,23.1767,-0.0021,90.0022,29,11,-0.999939389883036,1456.51,1.5233,1.904,45.7368,125.2737,0.0018,89.9899,1377.1441,1.4156,1.5818,25.4311,73.6705,0.0003,89.9992,26,12,0.9995646393653587,1395.7311,2.9236,3.3556,87.7367,250.7766,-0.0087,90.0441,1428.8053,5.1995,6.254,180.9276,505.1719,0.0002,89.9995,36,15,0.9971974621484632,1661.3087,11.2165,18.2392,378.225,908.2539,-0.0008,90.0001,1677.2563,8.6498,14.3368,430.3053,1023.4925,0.,90.,0.5230981108350049,0.539564683692105,0.43534624711834974,0.539564683692105,0.5230981108350049,0.43534624711834974,0.664291488278794 4 | KOAc,O=C([O-])C.[K+],C2H3KO2,8,0,1,met,7.1686,98.1428,545.524,641.7535,-0.19743,-0.04683,0.12212999999999999,0.0753,0.0753,-828.445320274,0.050381,0.056594,0.057539,0.018561,-828.394939,-828.388726,-828.387782,-828.426759,8.5245,606.138,685.0531,363.32,0.0031,0.000,327.15,0.0000,0.000,321.58,0.1037,0.000,261.01,0.0000,0.000,251.07,0.0078,0.000,244.35,0.0107,0.000,241.63,0.0001,0.000,239.66,0.0025,0.000,234.54,0.0124,0.000,232.06,0.0018,0.000,O1,-0.627463,-0.943989,-0.82067,1.99980,6.81192,0.00895,8.82067,30.0644,310.1108,-0.363511,-0.43458,1.99983,6.41968,0.01507,8.43458,O2,-0.627534,-0.927130,-0.81465,1.99980,6.79875,0.01610,8.81465,6.9248,303.5098,-0.370326,-0.43362,1.99983,6.41864,0.01515,8.43362,K3,0.725768,0.862762,0.91455,17.99508,0.06358,0.02679,18.08545,1286.0105,40.8048,0.001831,0.05032,18.00078,0.85275,0.09615,18.94968,1,O,1,-0.627463,-0.943989,-0.82067,1.99980,6.81192,0.00895,8.82067,30.0644,310.1108,-0.363511,-0.43458,1.99983,6.41968,0.01507,8.43458,3,O,1,-0.627534,-0.927130,-0.81465,1.99980,6.79875,0.01610,8.81465,6.9248,303.5098,-0.370326,-0.43362,1.99983,6.41864,0.01515,8.43362,5,K,0,1,0.725768,0.862762,0.91455,17.99508,0.06358,0.02679,18.08545,1286.0105,40.8048,0.001831,0.05032,18.00078,0.85275,0.09615,18.94968,2,C,1,0.582472,1.091107,0.78874,1.99958,3.15451,0.05718,5.21126,19.4377,106.5248,0.659809,0.78899,1.99960,3.15956,0.05185,5.21101,6,6,0.9999890370004872,616.9335,2.5135,0.5637,7.6394,49.4004,-0.0238,90.0144,618.7872,2.451,0.5529,7.6617,49.3959,0.0056,89.9964,7,7,0.9987724746696157,659.7903,6.1973,1.5895,29.1201,176.0738,0.0207,89.6665,646.437,6.0553,1.4909,18.1353,111.9193,-0.0052,90.08,8,8,-0.9998826675540469,925.2324,7.2377,3.6505,12.1374,52.3339,-0.0009,90.0186,916.3218,7.2493,3.5863,16.7246,72.8142,0.0002,89.9953,9,9,0.9999989809690724,1031.7256,1.4605,0.916,7.6401,29.5422,-0.0049,90.009,1027.1507,1.4579,0.9063,6.8743,26.6993,0.0013,89.997,10,10,0.9999999998768896,1069.0842,1.8415,1.2401,5.4846,20.4663,0.0076,89.9904,1063.8484,1.8673,1.2451,6.1805,23.1767,-0.0021,90.0022,11,11,0.9999990327735793,1383.254,1.412,1.5918,21.8732,63.0837,-0.0014,90.0059,1377.1441,1.4156,1.5818,25.4311,73.6705,0.0003,89.9992,12,12,0.9997085947517004,1439.3398,4.9649,6.0602,223.0332,618.178,-0.0014,90.0038,1428.8053,5.1995,6.254,180.9276,505.1719,0.0002,89.9995,15,15,-0.9997468732478542,1644.7032,7.706,12.2816,409.2433,992.662,0.0008,89.9999,1677.2563,8.6498,14.3368,430.3053,1023.4925,0.,90.,0.5728870393102002,0.5692344540582618,0.8437172539744319,0.5728870393102002,0.5692344540582618,0.8437172539744319,0.6196521062243645 5 | KOPiv,O=C([O-])C(C)(C)C.[K+],C5H9KO2,17,0,1,met,7.7731,140.2232,1394.117,1500.593,-0.2019,-0.04847,0.125185,0.076715,0.076715,-946.387800763,0.135595,0.145534,0.146478,0.099461,-946.252206,-946.242267,-946.241322,-946.28834,8.9776,909.207,1563.3278,355.25,0.0024,0.000,330.15,0.1106,0.000,321.67,0.0000,0.000,256.20,0.0000,0.000,248.54,0.0117,0.000,247.09,0.0045,0.000,240.58,0.0065,0.000,236.56,0.0018,0.000,233.63,0.0109,0.000,229.18,0.0018,0.000,O1,-0.640102,-0.902338,-0.81130,1.99979,6.79620,0.01531,8.81130,27.8484,331.9497,-0.379796,-0.42010,1.99983,6.40559,0.01469,8.42010,O2,-0.649551,-0.924868,-0.82499,1.99979,6.81009,0.01510,8.82499,46.0592,327.5974,-0.397948,-0.45369,1.99983,6.43914,0.01472,8.45369,K3,0.731425,0.873667,0.91906,17.99520,0.06186,0.02389,18.08094,1287.9448,39.1010,-0.016351,0.04662,18.00055,0.86198,0.09086,18.95338,3,O,1,-0.649551,-0.924868,-0.82499,1.99979,6.81009,0.01510,8.82499,46.0592,327.5974,-0.397948,-0.45369,1.99983,6.43914,0.01472,8.45369,1,O,1,-0.640102,-0.902338,-0.81130,1.99979,6.79620,0.01531,8.81130,27.8484,331.9497,-0.379796,-0.42010,1.99983,6.40559,0.01469,8.42010,8,K,0,1,0.731425,0.873667,0.91906,17.99520,0.06186,0.02389,18.08094,1287.9448,39.1010,-0.016351,0.04662,18.00055,0.86198,0.09086,18.95338,2,C,1,0.614239,0.982954,0.81254,1.99942,3.12952,0.05852,5.18746,11.3401,114.6019,0.697336,0.81335,1.99941,3.12961,0.05763,5.18665,15,6,0.9985454480831925,791.4536,6.8855,2.5412,5.0505,25.4575,0.0014,89.9982,618.7872,2.451,0.5529,7.6617,49.3959,0.0056,89.9964,14,7,-0.9599107742386958,595.6456,3.6165,0.756,15.9905,107.098,0.0007,89.7512,646.437,6.0553,1.4909,18.1353,111.9193,-0.0052,90.08,17,8,0.9954281640750704,893.1298,3.5163,1.6526,27.9244,124.7315,0.0022,89.9738,916.3218,7.2493,3.5863,16.7246,72.8142,0.0002,89.9953,22,9,-0.9145365457535707,1060.0869,1.4056,0.9307,0.9054,3.4074,-0.0275,90.1271,1027.1507,1.4579,0.9063,6.8743,26.6993,0.0013,89.997,23,10,-0.9999999942993534,1242.9392,2.7008,2.4583,3.9264,12.6023,-0.0019,90.0937,1063.8484,1.8673,1.2451,6.1805,23.1767,-0.0021,90.0022,29,11,0.9999328641835897,1460.0476,1.5532,1.9507,61.3992,167.7656,0.0017,89.9895,1377.1441,1.4156,1.5818,25.4311,73.6705,0.0003,89.9992,29,12,-0.999479732422525,1460.0476,1.5532,1.9507,61.3992,167.7656,0.0017,89.9895,1428.8053,5.1995,6.254,180.9276,505.1719,0.0002,89.9995,36,15,-0.9973077260375155,1625.8494,10.5985,16.5065,357.3476,876.8352,-0.0087,90.0138,1677.2563,8.6498,14.3368,430.3053,1023.4925,0.,90.,0.6338732373282236,0.6495613903775336,0.8435376186341725,0.6495613903775336,0.6338732373282236,0.8435376186341725,0.7457261758629982 -------------------------------------------------------------------------------- /examples/publication/BMS_yield_cost/data/solvent_dft.csv: -------------------------------------------------------------------------------- 1 | solvent_file_name,solvent_SMILES,solvent_stoichiometry,solvent_number_of_atoms,solvent_charge,solvent_multiplicity,solvent_convergence_criteria,solvent_dipole,solvent_molar_mass,solvent_molar_volume,solvent_electronic_spatial_extent,solvent_homo_energy,solvent_lumo_energy,solvent_electronegativity,solvent_hardness,solvent_electrophilicity,solvent_E_scf,solvent_zero_point_correction,solvent_E_thermal_correction,solvent_H_thermal_correction,solvent_G_thermal_correction,solvent_E_zpe,solvent_E,solvent_H,solvent_G,solvent_ES_root_dipole,solvent_ES_root_molar_volume,solvent_ES_root_electronic_spatial_extent,solvent_ES1_transition,solvent_ES1_osc_strength,solvent_ES1_,solvent_ES2_transition,solvent_ES2_osc_strength,solvent_ES2_,solvent_ES3_transition,solvent_ES3_osc_strength,solvent_ES3_,solvent_ES4_transition,solvent_ES4_osc_strength,solvent_ES4_,solvent_ES5_transition,solvent_ES5_osc_strength,solvent_ES5_,solvent_ES6_transition,solvent_ES6_osc_strength,solvent_ES6_,solvent_ES7_transition,solvent_ES7_osc_strength,solvent_ES7_,solvent_ES8_transition,solvent_ES8_osc_strength,solvent_ES8_,solvent_ES9_transition,solvent_ES9_osc_strength,solvent_ES9_,solvent_ES10_transition,solvent_ES10_osc_strength,solvent_ES10_,solvent_c_min_atom_number,solvent_c_min_atom,solvent_c_min_atom=N,solvent_c_min_atom=O,solvent_c_min_atom=C,solvent_c_min_Mulliken_charge,solvent_c_min_APT_charge,solvent_c_min_NPA_charge,solvent_c_min_NPA_core,solvent_c_min_NPA_valence,solvent_c_min_NPA_Rydberg,solvent_c_min_NPA_total,solvent_c_min_NMR_shift,solvent_c_min_NMR_anisotropy,solvent_c_min_ES_root_Mulliken_charge,solvent_c_min_ES_root_NPA_charge,solvent_c_min_ES_root_NPA_core,solvent_c_min_ES_root_NPA_valence,solvent_c_min_ES_root_NPA_Rydberg,solvent_c_min_ES_root_NPA_total,solvent_c_min+1_atom_number,solvent_c_min+1_atom,solvent_c_min+1_atom=C,solvent_c_min+1_atom=O,solvent_c_min+1_atom=N,solvent_c_min+1_Mulliken_charge,solvent_c_min+1_APT_charge,solvent_c_min+1_NPA_charge,solvent_c_min+1_NPA_core,solvent_c_min+1_NPA_valence,solvent_c_min+1_NPA_Rydberg,solvent_c_min+1_NPA_total,solvent_c_min+1_NMR_shift,solvent_c_min+1_NMR_anisotropy,solvent_c_min+1_ES_root_Mulliken_charge,solvent_c_min+1_ES_root_NPA_charge,solvent_c_min+1_ES_root_NPA_core,solvent_c_min+1_ES_root_NPA_valence,solvent_c_min+1_ES_root_NPA_Rydberg,solvent_c_min+1_ES_root_NPA_total,solvent_c_max_atom_number,solvent_c_max_atom,solvent_c_max_atom=C,solvent_c_max_atom=H,solvent_c_max_Mulliken_charge,solvent_c_max_APT_charge,solvent_c_max_NPA_charge,solvent_c_max_NPA_core,solvent_c_max_NPA_valence,solvent_c_max_NPA_Rydberg,solvent_c_max_NPA_total,solvent_c_max_NMR_shift,solvent_c_max_NMR_anisotropy,solvent_c_max_ES_root_Mulliken_charge,solvent_c_max_ES_root_NPA_charge,solvent_c_max_ES_root_NPA_core,solvent_c_max_ES_root_NPA_valence,solvent_c_max_ES_root_NPA_Rydberg,solvent_c_max_ES_root_NPA_total,solvent_c_max-1_atom_number,solvent_c_max-1_atom,solvent_c_max-1_atom=H,solvent_c_max-1_Mulliken_charge,solvent_c_max-1_APT_charge,solvent_c_max-1_NPA_charge,solvent_c_max-1_NPA_core,solvent_c_max-1_NPA_valence,solvent_c_max-1_NPA_Rydberg,solvent_c_max-1_NPA_total,solvent_c_max-1_NMR_shift,solvent_c_max-1_NMR_anisotropy,solvent_c_max-1_ES_root_Mulliken_charge,solvent_c_max-1_ES_root_NPA_charge,solvent_c_max-1_ES_root_NPA_core,solvent_c_max-1_ES_root_NPA_valence,solvent_c_max-1_ES_root_NPA_Rydberg,solvent_c_max-1_ES_root_NPA_total,solvent_c_min_%VBur,solvent_c_min+1_%VBur,solvent_c_max_%VBur,solvent_c_max-1_%VBur 2 | BuCN,CCCC#N,C4H7N,12,0,1,met,4.0491,69.106,914.079,571.8195,-0.3186,0.03549,0.141555,0.177045,0.177045,-211.38290967,0.103466,0.109411,0.110356,0.074482,-211.279443,-211.273498,-211.272554,-211.308428,3.124,796.555,571.8803,161.05,0.0001,0.000,154.99,0.0089,0.000,154.72,0.0015,0.000,134.54,0.0145,0.000,132.51,0.0177,0.000,127.49,0.0395,0.000,127.44,0.0007,0.000,125.59,0.0002,0.000,123.78,0.0156,0.000,122.66,0.0268,0.000,5,N,1,0,0,-0.472503,-0.316553,-0.32895,1.99965,5.30799,0.02132,7.32895,0.2592,455.8112,-0.443011,-0.26674,1.99970,5.23124,0.03580,7.26674,1,C,1,0,0,-0.446562,0.081423,-0.68089,1.99946,4.67343,0.00800,6.68089,174.3062,23.2239,-0.452535,-0.68587,1.99946,4.67726,0.00916,6.68587,4,C,1,0,0.347150,0.093010,0.28826,1.99942,3.67740,0.03492,5.71174,83.2576,312.0115,0.357169,0.29020,1.99955,3.66181,0.04844,5.70980,12,H,1,0.190629,-0.001691,0.27199,0.00000,0.72663,0.00138,0.72801,30.2477,6.3175,0.177540,0.26144,0.00000,0.73154,0.00702,0.73856,0.2941528696745606,0.42073590611059547,0.44292087063261576,0.41396964162749617 3 | BuOAc,CCCCOC(C)=O,C6H12O2,20,0,1,met,1.732,116.1596,1374.801,1567.8891,-0.26727,0.01633,0.12547,0.1418,0.1418,-386.334640247,0.176341,0.186348,0.187292,0.139915,-386.158299,-386.148293,-386.147349,-386.194725,1.1576,933.507,1569.2596,213.62,0.0012,0.000,159.76,0.1002,0.000,150.56,0.0020,0.000,142.27,0.0048,0.000,137.73,0.0011,0.000,137.18,0.0008,0.000,134.14,0.1183,0.000,133.21,0.0135,0.000,130.96,0.0016,0.000,128.95,0.0377,0.000,5,O,0,1,0,-0.456115,-0.901813,-0.56061,1.99975,6.54945,0.01141,8.56061,134.3177,142.8309,-0.443534,-0.51973,1.99977,6.50828,0.01167,8.51973,8,O,0,1,0,-0.470030,-0.684614,-0.60235,1.99978,6.58292,0.01964,8.60235,-71.7431,570.4859,-0.303557,-0.28504,1.99981,6.26508,0.02015,8.28504,6,C,1,0,0.600089,1.124279,0.82420,1.99949,3.13174,0.04457,5.17580,29.0632,84.0999,0.462388,0.48792,1.99952,3.47223,0.04032,5.51208,18,H,1,0.180845,0.019608,0.25616,0.00000,0.74297,0.00086,0.74384,30.2399,6.1567,0.160979,0.23070,0.00000,0.76480,0.00451,0.76930,0.5591748750037423,0.43049609293134933,0.5580371845154337,0.3342714289991317 4 | DMAc,CC(N(C)C)=O,C4H9NO,15,0,1,met,3.6595,87.1212,854.473,624.6972,-0.2338,0.03388,0.09996000000000001,0.13384000000000001,0.13384000000000001,-287.830205604,0.131005,0.138714,0.139658,0.099133,-287.699201,-287.691491,-287.690547,-287.731073,2.0521,881.599,624.4849,220.76,0.0009,0.000,178.41,0.2223,0.000,162.71,0.0264,0.000,156.10,0.0121,0.000,147.68,0.0172,0.000,142.30,0.0073,0.000,135.59,0.0005,0.000,132.75,0.0026,0.000,132.39,0.0150,0.000,130.70,0.0003,0.000,6,O,0,1,0,-0.508595,-0.777915,-0.63034,1.99980,6.61277,0.01778,8.63034,-65.8556,569.4492,-0.338968,-0.28910,1.99983,6.27046,0.01882,8.28910,3,N,0,0,1,-0.392215,-0.743329,-0.47855,1.99933,5.46881,0.01041,7.47855,155.0284,111.8179,-0.395470,-0.46828,1.99935,5.45833,0.01061,7.46828,2,C,1,0,0.577639,1.037174,0.69314,1.99938,3.26821,0.03928,5.30686,33.7389,96.6573,0.462343,0.37787,1.99940,3.58626,0.03647,5.62213,10,H,1,0.206649,0.071311,0.26442,0.00000,0.73335,0.00223,0.73558,27.7382,6.7158,0.180961,0.24084,0.00000,0.75824,0.00092,0.75916,0.4078321008353043,0.6535133678632375,0.5860303583725037,0.3953474446872848 5 | p-Xylene,CC1=CC=C(C)C=C1,C8H10,18,0,1,met,0.0011,106.167,1034.845,1072.0641,-0.22568,0.0068,0.10944,0.11624,0.11624,-310.884514007,0.15581,0.163925,0.164869,0.121251,-310.728704,-310.720589,-310.719645,-310.763263,0.0004,862.371,1072.3756,232.87,0.0037,0.000,206.12,0.0838,0.000,177.12,0.3578,0.000,175.96,0.7943,0.000,159.63,0.0002,0.000,159.63,0.0034,0.000,157.51,0.0000,0.000,156.96,0.0015,0.000,154.99,0.0042,0.000,150.37,0.0000,0.000,1,C,0,0,1,-0.529145,0.099551,-0.68665,1.99939,4.68003,0.00723,6.68665,167.9050,34.2593,-0.530333,-0.69452,1.99940,4.68738,0.00774,6.69452,6,C,1,0,0,-0.529145,0.099552,-0.68665,1.99939,4.68003,0.00723,6.68665,167.9049,34.2594,-0.530332,-0.69452,1.99940,4.68738,0.00774,6.69452,9,H,0,1,0.154870,-0.023372,0.23741,0.00000,0.76170,0.00089,0.76259,30.1558,7.2488,0.157494,0.23910,0.00000,0.75928,0.00162,0.76090,14,H,1,0.154871,-0.023375,0.23741,0.00000,0.76170,0.00089,0.76259,30.1556,7.2489,0.157495,0.23911,0.00000,0.75928,0.00162,0.76089,0.4245681267027933,0.4245980659261699,0.3513068471003862,0.3512469686536331 -------------------------------------------------------------------------------- /examples/publication/Crosscoupling/1_run_experiments.py: -------------------------------------------------------------------------------- 1 | 2 | # Cross-coupling photoredox. 3 | import pandas as pd 4 | from edbo.plus.optimizer_botorch import EDBOplus 5 | 6 | filename = 'edbo_crosscoupling_photoredox_yield_ee.csv' 7 | 8 | df_to_opt = pd.read_csv(filename) 9 | regression_columns = df_to_opt.columns.drop(['Ligand', 'priority']).values.tolist() 10 | 11 | opt = EDBOplus() 12 | opt.run( 13 | filename=filename, 14 | objectives=['yield', 'ee'], 15 | objective_mode=['max', 'max'], 16 | objective_thresholds=[None, None], 17 | batch=3, 18 | init_sampling_method='cvtsampling', 19 | columns_features=regression_columns 20 | ) 21 | -------------------------------------------------------------------------------- /examples/publication/Crosscoupling/campaigns/0_recalculate_predictions.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | import shutil 7 | 8 | for campaign in ['challenging_campaign_cvt', 'challenging_campaign_random', 'easy_campaign']: 9 | for round in range(1, 8): 10 | df = pd.read_csv(f"{campaign}/edbo_crosscoupling_photoredox_yield_ee_round{round}.csv") 11 | df.to_csv('optimization.csv', index=False) 12 | 13 | from edbo.plus.optimizer_botorch import EDBOplus 14 | 15 | filename = 'optimization.csv' 16 | 17 | regression_columns = df.columns.drop(['Ligand', 'priority']).values.tolist() 18 | 19 | opt = EDBOplus() 20 | opt.run( 21 | filename=filename, 22 | objectives=['yield', 'ee'], 23 | objective_mode=['max', 'max'], 24 | objective_thresholds=[None, None], 25 | batch=3, 26 | init_sampling_method='cvtsampling', 27 | columns_features=regression_columns 28 | ) 29 | 30 | shutil.copy('pred_optimization.csv', f"{campaign}/predictions_{round}.csv") -------------------------------------------------------------------------------- /examples/publication/Crosscoupling/campaigns/1_analysis.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | import shutil 6 | import seaborn as sns 7 | 8 | for campaign in ['challenging_campaign_cvt', 'challenging_campaign_random', 'easy_campaign']: 9 | 10 | fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(12, 4)) 11 | 12 | av_uncertainties_yield = [] 13 | max_uncertainties_yield = [] 14 | av_uncertainties_ee = [] 15 | max_uncertainties_ee = [] 16 | 17 | for round in range(1, 8): 18 | df = pd.read_csv(f"{campaign}/predictions_{round}.csv") 19 | 20 | 21 | max_uncertainties_yield.append(df['yield_predicted_variance'].max()) 22 | max_uncertainties_ee.append(df['ee_predicted_variance'].max()) 23 | 24 | av_uncertainties_yield.append(df['yield_predicted_variance'].mean()) 25 | av_uncertainties_ee.append(df['ee_predicted_variance'].mean()) 26 | 27 | max_uncertainties_yield = np.sqrt(max_uncertainties_yield) 28 | max_uncertainties_ee = np.sqrt(max_uncertainties_ee) 29 | av_uncertainties_yield = np.sqrt(av_uncertainties_yield) 30 | av_uncertainties_ee = np.sqrt(av_uncertainties_ee) 31 | plt.title(f"{campaign}", loc='center') 32 | sns.scatterplot(x=np.arange(1, 8), y=av_uncertainties_yield, ax=ax[0], label='average_uncertainty_yield') 33 | sns.scatterplot(x=np.arange(1, 8), y=av_uncertainties_ee, ax=ax[0], label='average_uncertainty_ee') 34 | plt.title(f"{campaign}", loc='center') 35 | sns.scatterplot(x=np.arange(1, 8), y=max_uncertainties_yield, ax=ax[1], label='max_uncertainty_yield') 36 | sns.scatterplot(x=np.arange(1, 8), y=max_uncertainties_ee, ax=ax[1], label='max_uncertainty_ee') 37 | 38 | ax[0].set_xlabel('Round') 39 | ax[0].set_ylabel('Uncertainty') 40 | ax[1].set_xlabel('Round') 41 | ax[1].set_ylabel('Uncertainty') 42 | ax[0].set_xticks(np.arange(1, 8)) 43 | ax[1].set_xticks(np.arange(1, 8)) 44 | ax[0].set_ylim(0, 15) 45 | ax[1].set_ylim(0, 15) 46 | 47 | ax[0].legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), 48 | fancybox=True, shadow=True) 49 | ax[1].legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), 50 | fancybox=True, shadow=True) 51 | plt.tight_layout() 52 | 53 | 54 | # Expected improvement. 55 | av_eis_yield = [] 56 | max_eis_yield = [] 57 | av_eis_ee = [] 58 | max_eis_ee = [] 59 | 60 | for round in range(1, 8): 61 | df = pd.read_csv(f"{campaign}/predictions_{round}.csv") 62 | 63 | max_eis_yield.append(df['yield_expected_improvement'].max()) 64 | 65 | max_eis_ee.append(df['ee_expected_improvement'].max()) 66 | 67 | av_eis_yield.append(df['yield_expected_improvement'].mean()) 68 | av_eis_ee.append(df['ee_expected_improvement'].mean()) 69 | 70 | 71 | plt.title(f"{campaign}", loc='center') 72 | sns.scatterplot(x=np.arange(1, 8), y=av_eis_yield, ax=ax[2], label='average_EI_yield') 73 | sns.scatterplot(x=np.arange(1, 8), y=av_eis_ee, ax=ax[2], label='average_EI_ee') 74 | plt.title(f"{campaign}", loc='center') 75 | sns.scatterplot(x=np.arange(1, 8), y=max_eis_yield, ax=ax[3], label='max_EI_yield') 76 | sns.scatterplot(x=np.arange(1, 8), y=max_eis_ee, ax=ax[3], label='max_EI_ee') 77 | 78 | ax[2].set_xlabel('Round') 79 | ax[2].set_ylabel('EI') 80 | ax[3].set_xlabel('Round') 81 | ax[3].set_ylabel('EI') 82 | ax[2].set_xticks(np.arange(1, 8)) 83 | ax[3].set_xticks(np.arange(1, 8)) 84 | ax[2].set_ylim(0, 100) 85 | ax[3].set_ylim(0, 100) 86 | ax[2].legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), 87 | fancybox=True, shadow=True) 88 | ax[3].legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), 89 | fancybox=True, shadow=True) 90 | plt.tight_layout() 91 | plt.savefig(f"./plots/{campaign}.svg", format='svg') 92 | 93 | # Save results in csv file. 94 | df = pd.DataFrame([], 95 | columns=['max_uncertainty_yield', 'avg_uncertainty_yield', 'max_EI_yield', 'avg_EI_yield', 96 | 'max_uncertainty_ee', 'avg_uncertainty_ee', 'max_EI_ee', 'avg_EI_ee']) 97 | df['max_uncertainty_yield'] = max_uncertainties_yield 98 | df['max_uncertainty_ee'] = max_uncertainties_ee 99 | df['avg_uncertainty_yield'] = av_uncertainties_yield 100 | df['avg_uncertainty_yield'] = av_uncertainties_ee 101 | df['max_EI_yield'] = max_eis_yield 102 | df['max_EI_ee'] = max_eis_ee 103 | df['avg_EI_yield'] = av_eis_yield 104 | df['avg_EI_ee'] = av_eis_ee 105 | 106 | df.to_csv(f'crosscoupling_results_{campaign}.csv') 107 | plt.show() 108 | 109 | -------------------------------------------------------------------------------- /examples/publication/Crosscoupling/campaigns/crosscoupling_results_challenging_campaign_cvt.csv: -------------------------------------------------------------------------------- 1 | ,max_uncertainty_yield,avg_uncertainty_yield,max_EI_yield,avg_EI_yield,max_uncertainty_ee,avg_uncertainty_ee,max_EI_ee,avg_EI_ee 2 | 0,6.827581272767426,2.9324467709552584,32.40782689891654,25.602683007714496,3.276152995193164,,7.60101466180589,5.928642147112224 3 | 1,8.210461679254594,5.773323515784001,46.574447681313785,11.421841208309262,9.962907192629936,,74.7088879188402,18.83098803650586 4 | 2,5.128887161645351,3.81806514719726,13.744451293043053,2.709265489045186,5.803851647525387,,21.650273058529116,5.9165346528746054 5 | 3,4.541302003122084,3.2123354371983703,5.162726577974803,0.40422385279901685,5.227188408641633,,16.10033337598427,2.30352295288218 6 | 4,3.859516872139973,2.863183469922444,0.5196149081752943,0.020525537970990253,4.08837032148018,,7.477211599451265,1.2544773840895274 7 | 5,3.795156763474525,2.436198804520339,0.2496501348370021,0.00964674815969636,3.8062975956252423,,6.226783841040771,0.6123865268781998 8 | 6,5.306888155590136,2.6604791781908226,2.778079942380365,0.1453910230805698,6.401147787936088,,12.269211324706786,1.6390130724559002 9 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/0_clean_dft.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | df_dft = pd.read_csv('data/dataset_B2.csv') 8 | 9 | # # Remove correlated features. 10 | corr_matrix = df_dft.corr().abs() 11 | # Select upper triangle of correlation matrix. 12 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 13 | # Find features with correlation greater than 0.95. 14 | to_drop = [column for column in upper.columns if any(upper[column] > 0.95)] 15 | # Drop features 16 | df_dft.drop(to_drop, axis=1, inplace=True) 17 | 18 | # Remove columns that have only one or two unique values. 19 | extra_columns_to_remove = [] 20 | for column in df_dft.columns.values: 21 | if len(np.unique(df_dft[column].values)) <= 1: 22 | extra_columns_to_remove.append(column) 23 | df_dft.drop(extra_columns_to_remove, axis=1, inplace=True) 24 | 25 | # Store SMILES. 26 | solvent_ohe = df_dft['solvent'].values 27 | base_ohe = df_dft['base'].values 28 | ligand_ohe = df_dft['ligand'].values 29 | 30 | # Remove non numerical. 31 | df_edbo_numeric = df_dft.select_dtypes(include=np.number) 32 | 33 | # Add back OHE features. 34 | df_edbo_numeric.insert(1, "solvent", solvent_ohe, False) 35 | df_edbo_numeric.insert(1, "base", base_ohe, False) 36 | df_edbo_numeric.insert(1, "ligand", ligand_ohe, False) 37 | 38 | df_edbo_numeric.to_csv('./data/dataset_B2_DFT_clean.csv', index=0) 39 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/0_clean_mordred.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | df_dft = pd.read_csv('data/dataset_B3.csv') 8 | 9 | # # Remove correlated features. 10 | corr_matrix = df_dft.corr().abs() 11 | # Select upper triangle of correlation matrix. 12 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) 13 | # Find features with correlation greater than 0.95. 14 | to_drop = [column for column in upper.columns if any(upper[column] > 0.95)] 15 | # Drop features 16 | df_dft.drop(to_drop, axis=1, inplace=True) 17 | 18 | # Remove columns that have only one or two unique values. 19 | extra_columns_to_remove = [] 20 | for column in df_dft.columns.values: 21 | if len(np.unique(df_dft[column].values)) <= 1: 22 | extra_columns_to_remove.append(column) 23 | df_dft.drop(extra_columns_to_remove, axis=1, inplace=True) 24 | 25 | # Store SMILES. 26 | solvent_ohe = df_dft['solvent'].values 27 | base_ohe = df_dft['base'].values 28 | ligand_ohe = df_dft['ligand'].values 29 | 30 | # Remove non numerical. 31 | df_edbo_numeric = df_dft.select_dtypes(include=np.number) 32 | 33 | # Add back OHE features. 34 | df_edbo_numeric.insert(1, "solvent", solvent_ohe, False) 35 | df_edbo_numeric.insert(1, "base", base_ohe, False) 36 | df_edbo_numeric.insert(1, "ligand", ligand_ohe, False) 37 | 38 | df_edbo_numeric.to_csv('./data/dataset_B3_Mordred_clean.csv', index=0) 39 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/1_run_ohe.py: -------------------------------------------------------------------------------- 1 | 2 | import shutil 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark 7 | import os 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | import pandas as pd 12 | sns.set_style("darkgrid") 13 | sns.set_context("talk") 14 | 15 | for acq_i in [ 16 | 'EHVI', 17 | 'MOUCB', 18 | 'MOGreedy' 19 | ]: 20 | for seed_i in np.arange(0, 5): 21 | budget = 30 22 | acq = acq_i 23 | batch = 1 24 | seed = seed_i 25 | 26 | df_exp = pd.read_csv('./data/dataset_B1.csv') 27 | df_exp['new_index'] = np.arange(0, len(df_exp.values)) 28 | sort_column = 'new_index' 29 | 30 | # Select the features for the model. 31 | columns_regression = df_exp.columns 32 | columns_regression = columns_regression.drop([sort_column, 'objective_conversion', 'objective_selectivity']).tolist() 33 | objectives = ['objective_conversion', 'objective_selectivity'] 34 | objective_modes = ['max', 'max'] 35 | objective_thresholds = [None, None] 36 | print(f"Columns for regression: {columns_regression}") 37 | ###################### 38 | 39 | label_benchmark = f"benchmark_ohe_acq_{acq}_batch_{batch}_seed_{seed}.csv" 40 | 41 | if not os.path.exists(f"./results_ohe/{label_benchmark}"): 42 | # Remove previous files 43 | if os.path.exists(label_benchmark): 44 | os.remove(label_benchmark) 45 | 46 | if os.path.exists(f'pred_{label_benchmark}'): 47 | os.remove(f'pred_{label_benchmark}') 48 | 49 | if os.path.exists(f'results_{label_benchmark}'): 50 | os.remove(f'results_{label_benchmark}') 51 | 52 | bench = Benchmark(df_ground=df_exp, 53 | features_regression=columns_regression, 54 | objective_names=objectives, 55 | objective_modes=objective_modes, 56 | objective_thresholds=objective_thresholds, 57 | filename=label_benchmark, 58 | filename_results=f'results_{label_benchmark}', 59 | index_column=sort_column, 60 | acquisition_function=acq) 61 | bench.run(steps=int(budget/batch), batch=batch, seed=seed, 62 | plot_predictions=False, 63 | plot_ground=False, 64 | plot_train=False, 65 | init_method='seed') 66 | 67 | # Move results. 68 | if not os.path.exists('results_ohe'): 69 | os.mkdir('results_ohe') 70 | shutil.move(label_benchmark, f'results_ohe/{label_benchmark}') 71 | shutil.move(f'pred_{label_benchmark}', f'results_ohe/pred_{label_benchmark}') 72 | shutil.move(f'results_{label_benchmark}', f'results_ohe/results_{label_benchmark}') 73 | 74 | # Clean. 75 | if os.path.exists('results'): 76 | shutil.rmtree('results') 77 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/2_run_dft.py: -------------------------------------------------------------------------------- 1 | 2 | import shutil 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark 7 | import os 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | import pandas as pd 12 | sns.set_style("darkgrid") 13 | sns.set_context("talk") 14 | 15 | 16 | for batch in [1, 2, 3, 5]: 17 | for acq_i in [ 18 | 'EHVI', 19 | 'MOUCB', 20 | 'MOGreedy' 21 | ]: 22 | for seed_i in np.arange(0, 5): 23 | budget = 30 24 | acq = acq_i 25 | seed = seed_i 26 | 27 | df_exp = pd.read_csv('./data/dataset_B2_DFT_clean.csv') 28 | df_exp['new_index'] = np.arange(0, len(df_exp.values)) 29 | sort_column = 'new_index' 30 | 31 | # Select the features for the model. 32 | columns_regression = df_exp.columns 33 | columns_regression = columns_regression.drop('solvent') 34 | columns_regression = columns_regression.drop('ligand') 35 | 36 | columns_regression = columns_regression.drop([sort_column, 'objective_conversion', 'objective_selectivity']).tolist() 37 | objectives = ['objective_conversion', 'objective_selectivity'] 38 | objective_modes = ['max', 'max'] 39 | objective_thresholds = [None, None] 40 | print(f"Columns for regression: {columns_regression}") 41 | ###################### 42 | 43 | label_benchmark = f"benchmark_dft_acq_{acq}_batch_{batch}_seed_{seed}.csv" 44 | 45 | if not os.path.exists(f"./results_dft/{label_benchmark}"): 46 | # Remove previous files 47 | if os.path.exists(label_benchmark): 48 | os.remove(label_benchmark) 49 | 50 | if os.path.exists(f'pred_{label_benchmark}'): 51 | os.remove(f'pred_{label_benchmark}') 52 | 53 | if os.path.exists(f'results_{label_benchmark}'): 54 | os.remove(f'results_{label_benchmark}') 55 | 56 | bench = Benchmark(df_ground=df_exp, 57 | features_regression=columns_regression, 58 | objective_names=objectives, 59 | objective_modes=objective_modes, 60 | objective_thresholds=objective_thresholds, 61 | filename=label_benchmark, 62 | filename_results=f'results_{label_benchmark}', 63 | index_column=sort_column, 64 | acquisition_function=acq) 65 | bench.run(steps=int(budget/batch), batch=batch, seed=seed, 66 | plot_predictions=False, 67 | plot_ground=False, 68 | plot_train=False) 69 | 70 | # Move results. 71 | if not os.path.exists('results_dft'): 72 | os.mkdir('results_dft') 73 | shutil.move(label_benchmark, f'results_dft/{label_benchmark}') 74 | shutil.move(f'pred_{label_benchmark}', f'results_dft/pred_{label_benchmark}') 75 | shutil.move(f'results_{label_benchmark}', f'results_dft/results_{label_benchmark}') 76 | 77 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/3_run_mordred.py: -------------------------------------------------------------------------------- 1 | 2 | import shutil 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark 7 | import os 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | import pandas as pd 12 | sns.set_style("darkgrid") 13 | sns.set_context("talk") 14 | 15 | 16 | for acq_i in [ 17 | 'EHVI', 18 | 'MOUCB', 19 | 'MOGreedy' 20 | ]: 21 | for seed_i in np.arange(0, 5): 22 | budget = 30 23 | acq = acq_i 24 | batch = 1 25 | seed = seed_i 26 | 27 | df_exp = pd.read_csv('./data/dataset_B3_Mordred_clean.csv') 28 | df_exp['new_index'] = np.arange(0, len(df_exp.values)) 29 | sort_column = 'new_index' 30 | 31 | # Select the features for the model. 32 | columns_regression = df_exp.columns 33 | columns_regression = columns_regression.drop('solvent') 34 | columns_regression = columns_regression.drop('ligand') 35 | 36 | columns_regression = columns_regression.drop([sort_column, 'objective_conversion', 'objective_selectivity']).tolist() 37 | objectives = ['objective_conversion', 'objective_selectivity'] 38 | objective_modes = ['max', 'max'] 39 | objective_thresholds = [None, None] 40 | print(f"Columns for regression: {columns_regression}") 41 | ###################### 42 | 43 | label_benchmark = f"benchmark_mordred_acq_{acq}_batch_{batch}_seed_{seed}.csv" 44 | 45 | if not os.path.exists(f"./results_mordred/{label_benchmark}"): 46 | # Remove previous files 47 | if os.path.exists(label_benchmark): 48 | os.remove(label_benchmark) 49 | 50 | if os.path.exists(f'pred_{label_benchmark}'): 51 | os.remove(f'pred_{label_benchmark}') 52 | 53 | if os.path.exists(f'results_{label_benchmark}'): 54 | os.remove(f'results_{label_benchmark}') 55 | 56 | bench = Benchmark(df_ground=df_exp, 57 | features_regression=columns_regression, 58 | objective_names=objectives, 59 | objective_modes=objective_modes, 60 | objective_thresholds=objective_thresholds, 61 | filename=label_benchmark, 62 | filename_results=f'results_{label_benchmark}', 63 | index_column=sort_column, 64 | acquisition_function=acq) 65 | bench.run(steps=int(budget/batch), batch=batch, seed=seed, 66 | plot_predictions=False, 67 | plot_ground=False, 68 | plot_train=False) 69 | 70 | # Move results. 71 | if not os.path.exists('results_mordred'): 72 | os.mkdir('results_mordred') 73 | shutil.move(label_benchmark, f'results_mordred/{label_benchmark}') 74 | shutil.move(f'pred_{label_benchmark}', f'results_mordred/pred_{label_benchmark}') 75 | shutil.move(f'results_{label_benchmark}', f'results_mordred/results_{label_benchmark}') 76 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/4_random_features.py: -------------------------------------------------------------------------------- 1 | 2 | import shutil 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark 7 | import os 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | import pandas as pd 12 | sns.set_style("darkgrid") 13 | sns.set_context("talk") 14 | 15 | for acq_i in [ 16 | 'EHVI', 17 | 'MOUCB', 18 | 'MOGreedy' 19 | ]: 20 | for seed_i in np.arange(0, 5): 21 | budget = 30 22 | acq = acq_i 23 | batch = 1 24 | seed = seed_i 25 | 26 | df_exp = pd.read_csv('./data/dataset_B1.csv') 27 | df_exp['new_index'] = np.arange(0, len(df_exp.values)) 28 | sort_column = 'new_index' 29 | 30 | # Select the features for the model. 31 | columns_regression = df_exp.columns 32 | columns_regression = columns_regression.drop([sort_column, 'objective_conversion', 'objective_selectivity']).tolist() 33 | objectives = ['objective_conversion', 'objective_selectivity'] 34 | objective_modes = ['max', 'max'] 35 | objective_thresholds = [None, None] 36 | print(f"Columns for regression: {columns_regression}") 37 | ###################### 38 | 39 | label_benchmark = f"benchmark_random_acq_{acq}_batch_{batch}_seed_{seed}.csv" 40 | 41 | if not os.path.exists(f"./results_random/{label_benchmark}"): 42 | # Remove previous files 43 | if os.path.exists(label_benchmark): 44 | os.remove(label_benchmark) 45 | 46 | if os.path.exists(f'pred_{label_benchmark}'): 47 | os.remove(f'pred_{label_benchmark}') 48 | 49 | if os.path.exists(f'results_{label_benchmark}'): 50 | os.remove(f'results_{label_benchmark}') 51 | 52 | bench = Benchmark(df_ground=df_exp, 53 | features_regression=columns_regression, 54 | objective_names=objectives, 55 | objective_modes=objective_modes, 56 | objective_thresholds=objective_thresholds, 57 | filename=label_benchmark, 58 | filename_results=f'results_{label_benchmark}', 59 | index_column=sort_column, 60 | acquisition_function=acq) 61 | bench.run(steps=int(budget/batch), batch=batch, seed=seed, 62 | plot_predictions=False, 63 | plot_ground=False, 64 | plot_train=False, 65 | random_sampling=True) 66 | 67 | # Move results. 68 | if not os.path.exists('results_random'): 69 | os.mkdir('results_random') 70 | shutil.move(label_benchmark, f'results_random/{label_benchmark}') 71 | shutil.move(f'pred_{label_benchmark}', f'results_random/pred_{label_benchmark}') 72 | shutil.move(f'results_{label_benchmark}', f'results_random/results_{label_benchmark}') 73 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/data/dataset_B1.csv: -------------------------------------------------------------------------------- 1 | ligand,base,solvent,ligand_equivalent,objective_conversion,objective_selectivity 2 | P(tBu)3,NaOH(aq.),MeOH,0.125,39.6,67.17171717171718 3 | P(tBu)3,s. NaHCO3(aq.),MeOH,0.125,52.3,74.37858508604207 4 | P(tBu)3,CsF(aq.),MeOH,0.125,50.8,74.01574803149606 5 | P(tBu)3,1M K3PO4(aq.),MeOH,0.125,50.3,73.55864811133202 6 | P(tBu)3,KOH(aq.),MeOH,0.125,61.5,78.21138211382113 7 | P(tBu)3,Cs2CO3(aq.),MeOH,0.125,61.00000000000001,79.01639344262294 8 | P(tBu)3,KOAc,MeOH,0.125,67.5,52.8888888888889 9 | P(tBu)3,None,MeOH,0.125,74.9,53.271028037383175 10 | P(Ph)3,NaOH(aq.),MeOH,0.125,99.2,73.79032258064517 11 | P(Ph)3,s. NaHCO3(aq.),MeOH,0.125,86.60000000000001,85.10392609699768 12 | P(Ph)3,CsF(aq.),MeOH,0.125,83.3,89.07563025210085 13 | P(Ph)3,1M K3PO4(aq.),MeOH,0.125,81.5,92.14723926380368 14 | P(Ph)3,KOH(aq.),MeOH,0.125,82.99999999999999,91.92771084337352 15 | P(Ph)3,Cs2CO3(aq.),MeOH,0.125,82.2,93.06569343065692 16 | P(Ph)3,KOAc,MeOH,0.125,81.4,95.0859950859951 17 | P(Ph)3,None,MeOH,0.125,80.30000000000001,94.89414694894144 18 | AmPhos,NaOH(aq.),MeOH,0.125,75.2,89.49468085106382 19 | AmPhos,s. NaHCO3(aq.),MeOH,0.125,75.39999999999999,90.18567639257296 20 | AmPhos,CsF(aq.),MeOH,0.125,77.3,90.03880983182407 21 | AmPhos,1M K3PO4(aq.),MeOH,0.125,74.3,88.42530282637955 22 | AmPhos,KOH(aq.),MeOH,0.125,56.900000000000006,78.55887521968366 23 | AmPhos,Cs2CO3(aq.),MeOH,0.125,60.1,78.70216306156405 24 | AmPhos,KOAc,MeOH,0.125,43.7,72.31121281464532 25 | AmPhos,None,MeOH,0.125,39.3,70.22900763358778 26 | P(Cy)3,NaOH(aq.),MeOH,0.125,46.5,71.82795698924731 27 | P(Cy)3,s. NaHCO3(aq.),MeOH,0.125,33.9,70.50147492625368 28 | P(Cy)3,CsF(aq.),MeOH,0.125,55.2,75.90579710144928 29 | P(Cy)3,1M K3PO4(aq.),MeOH,0.125,46.7,73.23340471092077 30 | P(Cy)3,KOH(aq.),MeOH,0.125,59.8,77.59197324414716 31 | P(Cy)3,Cs2CO3(aq.),MeOH,0.125,84.39999999999999,92.53554502369668 32 | P(Cy)3,KOAc,MeOH,0.125,80.60000000000001,94.04466501240694 33 | P(Cy)3,None,MeOH,0.125,76.7,92.4380704041721 34 | P(o-Tol)3,NaOH(aq.),MeOH,0.125,83.99999999999999,83.45238095238095 35 | P(o-Tol)3,s. NaHCO3(aq.),MeOH,0.125,76.5,84.70588235294117 36 | P(o-Tol)3,CsF(aq.),MeOH,0.125,83.79999999999998,82.69689737470168 37 | P(o-Tol)3,1M K3PO4(aq.),MeOH,0.125,76.5,80.65359477124183 38 | P(o-Tol)3,KOH(aq.),MeOH,0.125,74.5,75.16778523489933 39 | P(o-Tol)3,Cs2CO3(aq.),MeOH,0.125,79.5,66.41509433962264 40 | P(o-Tol)3,KOAc,MeOH,0.125,67.8,74.63126843657818 41 | P(o-Tol)3,None,MeOH,0.125,59.39999999999999,76.26262626262627 42 | CataCXium A,NaOH(aq.),MeOH,0.125,56.400000000000006,78.0141843971631 43 | CataCXium A,s. NaHCO3(aq.),MeOH,0.125,66.3,81.14630467571644 44 | CataCXium A,CsF(aq.),MeOH,0.125,47.7,74.8427672955975 45 | CataCXium A,1M K3PO4(aq.),MeOH,0.125,60.3,79.93366500829188 46 | CataCXium A,KOH(aq.),MeOH,0.125,63.8,80.87774294670847 47 | CataCXium A,Cs2CO3(aq.),MeOH,0.125,45.99999999999999,73.47826086956523 48 | CataCXium A,KOAc,MeOH,0.125,38.7,69.50904392764858 49 | CataCXium A,None,MeOH,0.125,47.39999999999999,73.41772151898735 50 | SPhos,NaOH(aq.),MeOH,0.0625,45.2,72.34513274336285 51 | SPhos,s. NaHCO3(aq.),MeOH,0.0625,28.0,58.57142857142858 52 | SPhos,CsF(aq.),MeOH,0.0625,38.39999999999999,67.44791666666667 53 | SPhos,1M K3PO4(aq.),MeOH,0.0625,39.3,67.68447837150127 54 | SPhos,KOH(aq.),MeOH,0.0625,36.9,66.66666666666667 55 | SPhos,Cs2CO3(aq.),MeOH,0.0625,74.6,71.58176943699732 56 | SPhos,KOAc,MeOH,0.0625,54.5,67.33944954128441 57 | SPhos,None,MeOH,0.0625,49.7,70.4225352112676 58 | dtbpf,NaOH(aq.),MeOH,0.0625,38.6,47.15025906735752 59 | dtbpf,s. NaHCO3(aq.),MeOH,0.0625,19.6,21.428571428571427 60 | dtbpf,CsF(aq.),MeOH,0.0625,20.7,25.120772946859905 61 | dtbpf,1M K3PO4(aq.),MeOH,0.0625,19.7,21.82741116751269 62 | dtbpf,KOH(aq.),MeOH,0.0625,19.8,24.24242424242425 63 | dtbpf,Cs2CO3(aq.),MeOH,0.0625,15.8,21.51898734177215 64 | dtbpf,KOAc,MeOH,0.0625,16.1,19.875776397515526 65 | dtbpf,None,MeOH,0.0625,13.0,20.0 66 | XPhos,NaOH(aq.),MeOH,0.0625,77.9,83.31193838254171 67 | XPhos,s. NaHCO3(aq.),MeOH,0.0625,79.39999999999999,83.50125944584383 68 | XPhos,CsF(aq.),MeOH,0.0625,72.7,82.80605226960111 69 | XPhos,1M K3PO4(aq.),MeOH,0.0625,53.8,77.32342007434944 70 | XPhos,KOH(aq.),MeOH,0.0625,46.0,72.6086956521739 71 | XPhos,Cs2CO3(aq.),MeOH,0.0625,41.0,70.73170731707317 72 | XPhos,KOAc,MeOH,0.0625,51.4,74.12451361867704 73 | XPhos,None,MeOH,0.0625,33.5,63.28358208955224 74 | dppf,NaOH(aq.),MeOH,0.0625,40.5,52.8395061728395 75 | dppf,s. NaHCO3(aq.),MeOH,0.0625,36.3,67.49311294765839 76 | dppf,CsF(aq.),MeOH,0.0625,35.3,65.43909348441927 77 | dppf,1M K3PO4(aq.),MeOH,0.0625,36.0,62.77777777777778 78 | dppf,KOH(aq.),MeOH,0.0625,28.3,49.1166077738516 79 | dppf,Cs2CO3(aq.),MeOH,0.0625,35.4,40.96045197740113 80 | dppf,KOAc,MeOH,0.0625,25.5,53.333333333333336 81 | dppf,None,MeOH,0.0625,20.0,54.50000000000001 82 | Xanthphos,NaOH(aq.),MeOH,0.0625,12.2,41.80327868852459 83 | Xanthphos,s. NaHCO3(aq.),MeOH,0.0625,7.8,32.05128205128205 84 | Xanthphos,CsF(aq.),MeOH,0.0625,9.7,32.98969072164949 85 | Xanthphos,1M K3PO4(aq.),MeOH,0.0625,8.5,31.764705882352946 86 | Xanthphos,KOH(aq.),MeOH,0.0625,10.2,40.19607843137255 87 | Xanthphos,Cs2CO3(aq.),MeOH,0.0625,12.0,35.833333333333336 88 | Xanthphos,KOAc,MeOH,0.0625,7.6,23.68421052631579 89 | Xanthphos,None,MeOH,0.0625,7.399999999999999,24.324324324324326 90 | P(tBu)3,NaOH(aq.),MeCN,0.125,38.2,69.63350785340315 91 | P(tBu)3,s. NaHCO3(aq.),MeCN,0.125,42.8,52.10280373831775 92 | P(tBu)3,CsF(aq.),MeCN,0.125,21.3,24.413145539906104 93 | P(tBu)3,1M K3PO4(aq.),MeCN,0.125,29.8,54.0268456375839 94 | P(tBu)3,KOH(aq.),MeCN,0.125,24.0,34.583333333333336 95 | P(tBu)3,Cs2CO3(aq.),MeCN,0.125,20.1,54.72636815920397 96 | P(tBu)3,KOAc,MeCN,0.125,18.4,10.326086956521738 97 | P(tBu)3,None,MeCN,0.125,22.1,12.21719457013575 98 | P(Ph)3,NaOH(aq.),MeCN,0.125,16.7,36.52694610778443 99 | P(Ph)3,s. NaHCO3(aq.),MeCN,0.125,34.6,67.05202312138728 100 | P(Ph)3,CsF(aq.),MeCN,0.125,37.9,76.2532981530343 101 | P(Ph)3,1M K3PO4(aq.),MeCN,0.125,25.1,79.6812749003984 102 | P(Ph)3,KOH(aq.),MeCN,0.125,13.3,75.18796992481204 103 | P(Ph)3,Cs2CO3(aq.),MeCN,0.125,22.9,74.67248908296943 104 | P(Ph)3,KOAc,MeCN,0.125,12.1,60.33057851239669 105 | P(Ph)3,None,MeCN,0.125,27.1,79.33579335793357 106 | AmPhos,NaOH(aq.),MeCN,0.125,13.3,31.57894736842105 107 | AmPhos,s. NaHCO3(aq.),MeCN,0.125,31.8,62.8930817610063 108 | AmPhos,CsF(aq.),MeCN,0.125,31.6,63.29113924050633 109 | AmPhos,1M K3PO4(aq.),MeCN,0.125,30.8,62.66233766233766 110 | AmPhos,KOH(aq.),MeCN,0.125,29.4,62.24489795918368 111 | AmPhos,Cs2CO3(aq.),MeCN,0.125,25.3,58.49802371541502 112 | AmPhos,KOAc,MeCN,0.125,21.2,50.943396226415096 113 | AmPhos,None,MeCN,0.125,26.7,55.0561797752809 114 | P(Cy)3,NaOH(aq.),MeCN,0.125,33.2,67.46987951807229 115 | P(Cy)3,s. NaHCO3(aq.),MeCN,0.125,32.2,67.3913043478261 116 | P(Cy)3,CsF(aq.),MeCN,0.125,15.999999999999998,60.62500000000001 117 | P(Cy)3,1M K3PO4(aq.),MeCN,0.125,10.3,66.99029126213593 118 | P(Cy)3,KOH(aq.),MeCN,0.125,7.800000000000001,55.12820512820512 119 | P(Cy)3,Cs2CO3(aq.),MeCN,0.125,7.300000000000001,43.83561643835616 120 | P(Cy)3,KOAc,MeCN,0.125,3.8,23.68421052631579 121 | P(Cy)3,None,MeCN,0.125,7.0,30.0 122 | P(o-Tol)3,NaOH(aq.),MeCN,0.125,11.5,4.3478260869565215 123 | P(o-Tol)3,s. NaHCO3(aq.),MeCN,0.125,13.7,2.18978102189781 124 | P(o-Tol)3,CsF(aq.),MeCN,0.125,12.0,3.333333333333333 125 | P(o-Tol)3,1M K3PO4(aq.),MeCN,0.125,9.7,2.061855670103093 126 | P(o-Tol)3,KOH(aq.),MeCN,0.125,10.5,1.9047619047619049 127 | P(o-Tol)3,Cs2CO3(aq.),MeCN,0.125,10.3,1.9417475728155336 128 | P(o-Tol)3,KOAc,MeCN,0.125,8.6,3.488372093023256 129 | P(o-Tol)3,None,MeCN,0.125,9.1,1.0989010989010988 130 | CataCXium A,NaOH(aq.),MeCN,0.125,9.1,5.494505494505495 131 | CataCXium A,s. NaHCO3(aq.),MeCN,0.125,7.199999999999999,8.333333333333334 132 | CataCXium A,CsF(aq.),MeCN,0.125,7.1,11.267605633802818 133 | CataCXium A,1M K3PO4(aq.),MeCN,0.125,12.9,36.43410852713178 134 | CataCXium A,KOH(aq.),MeCN,0.125,12.4,35.483870967741936 135 | CataCXium A,Cs2CO3(aq.),MeCN,0.125,12.7,33.85826771653544 136 | CataCXium A,KOAc,MeCN,0.125,14.1,36.87943262411348 137 | CataCXium A,None,MeCN,0.125,0.0,0.0 138 | SPhos,NaOH(aq.),MeCN,0.0625,13.3,3.007518796992482 139 | SPhos,s. NaHCO3(aq.),MeCN,0.0625,10.5,3.8095238095238098 140 | SPhos,CsF(aq.),MeCN,0.0625,10.9,5.504587155963303 141 | SPhos,1M K3PO4(aq.),MeCN,0.0625,9.2,4.347826086956522 142 | SPhos,KOH(aq.),MeCN,0.0625,8.0,6.25 143 | SPhos,Cs2CO3(aq.),MeCN,0.0625,10.3,2.912621359223301 144 | SPhos,KOAc,MeCN,0.0625,7.8,7.6923076923076925 145 | SPhos,None,MeCN,0.0625,7.2,6.944444444444445 146 | dtbpf,NaOH(aq.),MeCN,0.0625,9.3,8.602150537634408 147 | dtbpf,s. NaHCO3(aq.),MeCN,0.0625,7.1,7.042253521126761 148 | dtbpf,CsF(aq.),MeCN,0.0625,7.7,9.09090909090909 149 | dtbpf,1M K3PO4(aq.),MeCN,0.0625,6.4,6.25 150 | dtbpf,KOH(aq.),MeCN,0.0625,4.2,11.904761904761903 151 | dtbpf,Cs2CO3(aq.),MeCN,0.0625,7.9,10.126582278481012 152 | dtbpf,KOAc,MeCN,0.0625,5.4,5.555555555555556 153 | dtbpf,None,MeCN,0.0625,4.1,12.195121951219514 154 | XPhos,NaOH(aq.),MeCN,0.0625,9.5,3.1578947368421053 155 | XPhos,s. NaHCO3(aq.),MeCN,0.0625,11.2,1.785714285714286 156 | XPhos,CsF(aq.),MeCN,0.0625,8.7,0.0 157 | XPhos,1M K3PO4(aq.),MeCN,0.0625,9.7,16.494845360824744 158 | XPhos,KOH(aq.),MeCN,0.0625,9.7,11.34020618556701 159 | XPhos,Cs2CO3(aq.),MeCN,0.0625,10.6,24.528301886792452 160 | XPhos,KOAc,MeCN,0.0625,9.2,13.043478260869565 161 | XPhos,None,MeCN,0.0625,9.2,17.39130434782609 162 | dppf,NaOH(aq.),MeCN,0.0625,4.9,10.20408163265306 163 | dppf,s. NaHCO3(aq.),MeCN,0.0625,5.6,17.857142857142858 164 | dppf,CsF(aq.),MeCN,0.0625,5.9,16.949152542372882 165 | dppf,1M K3PO4(aq.),MeCN,0.0625,4.8,20.833333333333336 166 | dppf,KOH(aq.),MeCN,0.0625,4.6,15.217391304347828 167 | dppf,Cs2CO3(aq.),MeCN,0.0625,6.0,15.0 168 | dppf,KOAc,MeCN,0.0625,4.5,15.555555555555555 169 | dppf,None,MeCN,0.0625,4.9,18.367346938775515 170 | Xanthphos,NaOH(aq.),MeCN,0.0625,4.8,0.0 171 | Xanthphos,s. NaHCO3(aq.),MeCN,0.0625,4.4,2.272727272727273 172 | Xanthphos,CsF(aq.),MeCN,0.0625,4.1,0.0 173 | Xanthphos,1M K3PO4(aq.),MeCN,0.0625,6.4,0.0 174 | Xanthphos,KOH(aq.),MeCN,0.0625,4.0,0.0 175 | Xanthphos,Cs2CO3(aq.),MeCN,0.0625,6.2,0.0 176 | Xanthphos,KOAc,MeCN,0.0625,5.4,0.0 177 | Xanthphos,None,MeCN,0.0625,3.4000000000000004,0.0 178 | P(tBu)3,NaOH(aq.),THF,0.125,10.8,9.25925925925926 179 | P(tBu)3,s. NaHCO3(aq.),THF,0.125,33.7,1.1869436201780417 180 | P(tBu)3,CsF(aq.),THF,0.125,6.800000000000001,5.88235294117647 181 | P(tBu)3,1M K3PO4(aq.),THF,0.125,5.5,9.090909090909092 182 | P(tBu)3,KOH(aq.),THF,0.125,7.6,5.2631578947368425 183 | P(tBu)3,Cs2CO3(aq.),THF,0.125,9.6,3.125 184 | P(tBu)3,KOAc,THF,0.125,10.1,1.98019801980198 185 | P(tBu)3,None,THF,0.125,15.3,1.3071895424836604 186 | P(Ph)3,NaOH(aq.),THF,0.125,13.7,15.328467153284672 187 | P(Ph)3,s. NaHCO3(aq.),THF,0.125,11.6,22.41379310344828 188 | P(Ph)3,CsF(aq.),THF,0.125,10.4,34.61538461538461 189 | P(Ph)3,1M K3PO4(aq.),THF,0.125,9.1,27.47252747252747 190 | P(Ph)3,KOH(aq.),THF,0.125,10.0,22.000000000000004 191 | P(Ph)3,Cs2CO3(aq.),THF,0.125,8.4,21.428571428571427 192 | P(Ph)3,KOAc,THF,0.125,4.5,26.666666666666668 193 | P(Ph)3,None,THF,0.125,8.0,15.0 194 | AmPhos,NaOH(aq.),THF,0.125,11.2,1.785714285714286 195 | AmPhos,s. NaHCO3(aq.),THF,0.125,11.1,0.9009009009009008 196 | AmPhos,CsF(aq.),THF,0.125,8.5,0.0 197 | AmPhos,1M K3PO4(aq.),THF,0.125,5.6,0.0 198 | AmPhos,KOH(aq.),THF,0.125,1.7,0.0 199 | AmPhos,Cs2CO3(aq.),THF,0.125,1.8,0.0 200 | AmPhos,KOAc,THF,0.125,3.9,5.128205128205129 201 | AmPhos,None,THF,0.125,6.7,1.4925373134328357 202 | P(Cy)3,NaOH(aq.),THF,0.125,7.6,3.947368421052632 203 | P(Cy)3,s. NaHCO3(aq.),THF,0.125,7.9,0.0 204 | P(Cy)3,CsF(aq.),THF,0.125,7.4,0.0 205 | P(Cy)3,1M K3PO4(aq.),THF,0.125,5.8,0.0 206 | P(Cy)3,KOH(aq.),THF,0.125,4.800000000000001,4.166666666666666 207 | P(Cy)3,Cs2CO3(aq.),THF,0.125,4.6,2.173913043478261 208 | P(Cy)3,KOAc,THF,0.125,6.5,0.0 209 | P(Cy)3,None,THF,0.125,4.5,0.0 210 | P(o-Tol)3,NaOH(aq.),THF,0.125,8.0,5.0 211 | P(o-Tol)3,s. NaHCO3(aq.),THF,0.125,7.6,6.578947368421052 212 | P(o-Tol)3,CsF(aq.),THF,0.125,7.700000000000001,6.493506493506493 213 | P(o-Tol)3,1M K3PO4(aq.),THF,0.125,6.6,6.060606060606061 214 | P(o-Tol)3,KOH(aq.),THF,0.125,7.1,5.633802816901409 215 | P(o-Tol)3,Cs2CO3(aq.),THF,0.125,2.5,16.0 216 | P(o-Tol)3,KOAc,THF,0.125,5.0,8.0 217 | P(o-Tol)3,None,THF,0.125,2.3000000000000003,8.695652173913043 218 | CataCXium A,NaOH(aq.),THF,0.125,4.6,0.0 219 | CataCXium A,s. NaHCO3(aq.),THF,0.125,4.5,0.0 220 | CataCXium A,CsF(aq.),THF,0.125,4.6,0.0 221 | CataCXium A,1M K3PO4(aq.),THF,0.125,2.0,0.0 222 | CataCXium A,KOH(aq.),THF,0.125,1.8,0.0 223 | CataCXium A,Cs2CO3(aq.),THF,0.125,1.8,0.0 224 | CataCXium A,KOAc,THF,0.125,1.8,0.0 225 | CataCXium A,None,THF,0.125,4.0,0.0 226 | SPhos,NaOH(aq.),THF,0.0625,9.5,0.0 227 | SPhos,s. NaHCO3(aq.),THF,0.0625,4.7,0.0 228 | SPhos,CsF(aq.),THF,0.0625,4.2,0.0 229 | SPhos,1M K3PO4(aq.),THF,0.0625,4.6,0.0 230 | SPhos,KOH(aq.),THF,0.0625,4.300000000000001,0.0 231 | SPhos,Cs2CO3(aq.),THF,0.0625,4.8,0.0 232 | SPhos,KOAc,THF,0.0625,2.0,0.0 233 | SPhos,None,THF,0.0625,4.6,0.0 234 | dtbpf,NaOH(aq.),THF,0.0625,5.300000000000001,11.32075471698113 235 | dtbpf,s. NaHCO3(aq.),THF,0.0625,5.4,9.25925925925926 236 | dtbpf,CsF(aq.),THF,0.0625,4.7,14.893617021276595 237 | dtbpf,1M K3PO4(aq.),THF,0.0625,4.9,20.40816326530612 238 | dtbpf,KOH(aq.),THF,0.0625,5.1,13.725490196078432 239 | dtbpf,Cs2CO3(aq.),THF,0.0625,2.0,0.0 240 | dtbpf,KOAc,THF,0.0625,2.4,12.5 241 | dtbpf,None,THF,0.0625,4.800000000000001,4.166666666666666 242 | XPhos,NaOH(aq.),THF,0.0625,5.0,6.0 243 | XPhos,s. NaHCO3(aq.),THF,0.0625,5.0,0.0 244 | XPhos,CsF(aq.),THF,0.0625,4.7,0.0 245 | XPhos,1M K3PO4(aq.),THF,0.0625,2.0,0.0 246 | XPhos,KOH(aq.),THF,0.0625,1.8,0.0 247 | XPhos,Cs2CO3(aq.),THF,0.0625,1.7,0.0 248 | XPhos,KOAc,THF,0.0625,1.7,0.0 249 | XPhos,None,THF,0.0625,1.9,0.0 250 | dppf,NaOH(aq.),THF,0.0625,8.0,2.5 251 | dppf,s. NaHCO3(aq.),THF,0.0625,8.2,0.0 252 | dppf,CsF(aq.),THF,0.0625,15.1,0.0 253 | dppf,1M K3PO4(aq.),THF,0.0625,11.8,0.0 254 | dppf,KOH(aq.),THF,0.0625,4.199999999999999,9.523809523809526 255 | dppf,Cs2CO3(aq.),THF,0.0625,4.9,10.20408163265306 256 | dppf,KOAc,THF,0.0625,3.4,14.705882352941178 257 | dppf,None,THF,0.0625,4.4,13.636363636363637 258 | Xanthphos,NaOH(aq.),THF,0.0625,3.7,5.405405405405405 259 | Xanthphos,s. NaHCO3(aq.),THF,0.0625,2.4,20.833333333333336 260 | Xanthphos,CsF(aq.),THF,0.0625,1.9,0.0 261 | Xanthphos,1M K3PO4(aq.),THF,0.0625,2.7,29.629629629629623 262 | Xanthphos,KOH(aq.),THF,0.0625,3.0,40.0 263 | Xanthphos,Cs2CO3(aq.),THF,0.0625,1.8,0.0 264 | Xanthphos,KOAc,THF,0.0625,1.8,0.0 265 | Xanthphos,None,THF,0.0625,2.6,0.0 266 | P(tBu)3,NaOH(aq.),DMF,0.125,24.0,0.0 267 | P(tBu)3,s. NaHCO3(aq.),DMF,0.125,21.0,0.0 268 | P(tBu)3,CsF(aq.),DMF,0.125,14.7,0.0 269 | P(tBu)3,1M K3PO4(aq.),DMF,0.125,13.7,0.0 270 | P(tBu)3,KOH(aq.),DMF,0.125,18.4,0.0 271 | P(tBu)3,Cs2CO3(aq.),DMF,0.125,19.2,0.0 272 | P(tBu)3,KOAc,DMF,0.125,19.6,0.0 273 | P(tBu)3,None,DMF,0.125,21.9,0.0 274 | P(Ph)3,NaOH(aq.),DMF,0.125,23.8,0.0 275 | P(Ph)3,s. NaHCO3(aq.),DMF,0.125,10.2,0.0 276 | P(Ph)3,CsF(aq.),DMF,0.125,3.4000000000000004,0.0 277 | P(Ph)3,1M K3PO4(aq.),DMF,0.125,3.3,0.0 278 | P(Ph)3,KOH(aq.),DMF,0.125,4.1,0.0 279 | P(Ph)3,Cs2CO3(aq.),DMF,0.125,4.1,0.0 280 | P(Ph)3,KOAc,DMF,0.125,5.3,32.075471698113205 281 | P(Ph)3,None,DMF,0.125,3.8,0.0 282 | AmPhos,NaOH(aq.),DMF,0.125,22.2,0.0 283 | AmPhos,s. NaHCO3(aq.),DMF,0.125,16.3,0.0 284 | AmPhos,CsF(aq.),DMF,0.125,14.9,0.0 285 | AmPhos,1M K3PO4(aq.),DMF,0.125,17.8,0.0 286 | AmPhos,KOH(aq.),DMF,0.125,14.8,0.0 287 | AmPhos,Cs2CO3(aq.),DMF,0.125,15.6,0.0 288 | AmPhos,KOAc,DMF,0.125,4.3,0.0 289 | AmPhos,None,DMF,0.125,67.10000000000001,0.0 290 | P(Cy)3,NaOH(aq.),DMF,0.125,33.9,0.0 291 | P(Cy)3,s. NaHCO3(aq.),DMF,0.125,15.3,0.0 292 | P(Cy)3,CsF(aq.),DMF,0.125,9.7,0.0 293 | P(Cy)3,1M K3PO4(aq.),DMF,0.125,5.4,0.0 294 | P(Cy)3,KOH(aq.),DMF,0.125,3.8,0.0 295 | P(Cy)3,Cs2CO3(aq.),DMF,0.125,11.8,0.0 296 | P(Cy)3,KOAc,DMF,0.125,15.8,0.0 297 | P(Cy)3,None,DMF,0.125,11.3,0.0 298 | P(o-Tol)3,NaOH(aq.),DMF,0.125,8.7,0.0 299 | P(o-Tol)3,s. NaHCO3(aq.),DMF,0.125,0.0,0.0 300 | P(o-Tol)3,CsF(aq.),DMF,0.125,0.0,0.0 301 | P(o-Tol)3,1M K3PO4(aq.),DMF,0.125,0.0,0.0 302 | P(o-Tol)3,KOH(aq.),DMF,0.125,0.0,0.0 303 | P(o-Tol)3,Cs2CO3(aq.),DMF,0.125,0.0,0.0 304 | P(o-Tol)3,KOAc,DMF,0.125,0.0,0.0 305 | P(o-Tol)3,None,DMF,0.125,0.0,0.0 306 | CataCXium A,NaOH(aq.),DMF,0.125,0.0,0.0 307 | CataCXium A,s. NaHCO3(aq.),DMF,0.125,0.0,0.0 308 | CataCXium A,CsF(aq.),DMF,0.125,0.0,0.0 309 | CataCXium A,1M K3PO4(aq.),DMF,0.125,0.0,0.0 310 | CataCXium A,KOH(aq.),DMF,0.125,0.0,0.0 311 | CataCXium A,Cs2CO3(aq.),DMF,0.125,12.6,0.0 312 | CataCXium A,KOAc,DMF,0.125,6.8,0.0 313 | CataCXium A,None,DMF,0.125,0.0,0.0 314 | SPhos,NaOH(aq.),DMF,0.0625,9.2,100.0 315 | SPhos,s. NaHCO3(aq.),DMF,0.0625,0.0,0.0 316 | SPhos,CsF(aq.),DMF,0.0625,0.0,0.0 317 | SPhos,1M K3PO4(aq.),DMF,0.0625,0.0,0.0 318 | SPhos,KOH(aq.),DMF,0.0625,0.0,0.0 319 | SPhos,Cs2CO3(aq.),DMF,0.0625,14.5,0.0 320 | SPhos,KOAc,DMF,0.0625,0.0,0.0 321 | SPhos,None,DMF,0.0625,0.0,0.0 322 | dtbpf,NaOH(aq.),DMF,0.0625,3.8,0.0 323 | dtbpf,s. NaHCO3(aq.),DMF,0.0625,0.0,0.0 324 | dtbpf,CsF(aq.),DMF,0.0625,0.0,0.0 325 | dtbpf,1M K3PO4(aq.),DMF,0.0625,3.1,0.0 326 | dtbpf,KOH(aq.),DMF,0.0625,3.4,0.0 327 | dtbpf,Cs2CO3(aq.),DMF,0.0625,0.0,0.0 328 | dtbpf,KOAc,DMF,0.0625,0.0,0.0 329 | dtbpf,None,DMF,0.0625,0.0,0.0 330 | XPhos,NaOH(aq.),DMF,0.0625,12.1,0.0 331 | XPhos,s. NaHCO3(aq.),DMF,0.0625,21.200000000000003,0.0 332 | XPhos,CsF(aq.),DMF,0.0625,24.2,0.0 333 | XPhos,1M K3PO4(aq.),DMF,0.0625,29.2,0.0 334 | XPhos,KOH(aq.),DMF,0.0625,27.3,0.0 335 | XPhos,Cs2CO3(aq.),DMF,0.0625,22.8,0.0 336 | XPhos,KOAc,DMF,0.0625,18.5,0.0 337 | XPhos,None,DMF,0.0625,22.3,0.0 338 | dppf,NaOH(aq.),DMF,0.0625,19.0,68.94736842105263 339 | dppf,s. NaHCO3(aq.),DMF,0.0625,7.7,55.84415584415584 340 | dppf,CsF(aq.),DMF,0.0625,2.1,0.0 341 | dppf,1M K3PO4(aq.),DMF,0.0625,1.9,0.0 342 | dppf,KOH(aq.),DMF,0.0625,14.4,70.83333333333333 343 | dppf,Cs2CO3(aq.),DMF,0.0625,15.9,72.95597484276729 344 | dppf,KOAc,DMF,0.0625,2.7,33.33333333333333 345 | dppf,None,DMF,0.0625,1.9,0.0 346 | Xanthphos,NaOH(aq.),DMF,0.0625,2.2,0.0 347 | Xanthphos,s. NaHCO3(aq.),DMF,0.0625,1.9,0.0 348 | Xanthphos,CsF(aq.),DMF,0.0625,1.9,0.0 349 | Xanthphos,1M K3PO4(aq.),DMF,0.0625,1.8,0.0 350 | Xanthphos,KOH(aq.),DMF,0.0625,2.1,0.0 351 | Xanthphos,Cs2CO3(aq.),DMF,0.0625,2.1,0.0 352 | Xanthphos,KOAc,DMF,0.0625,2.6,34.61538461538461 353 | Xanthphos,None,DMF,0.0625,1.7,0.0 354 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/performance/1_merge_all.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | batch = 1 5 | 6 | objective_1 = 'objective_conversion' 7 | objective_2 = 'objective_selectivity' 8 | 9 | columns_to_keep = ['step', 'n_experiments', 10 | 'dmaximin_tradeoff', 'hypervolume completed (%)', 11 | f'MAE_{objective_1}', f"MAE_{objective_2}", 12 | f'RMSE_{objective_1}', f'RMSE_{objective_2}', 13 | f'R2_{objective_1}', f'R2_{objective_2}', 14 | f'{objective_1}_best', f'{objective_2}_best' 15 | ] 16 | 17 | for feat in ['ohe', 'dft', 'mordred', 'random']: 18 | for acq in ['EHVI', 'MOUCB', 'MOGreedy']: 19 | df_i = pd.read_csv(f"../results_{feat}/results_benchmark_{feat}_acq_{acq}_batch_{batch}_seed_0.csv") 20 | columns_to_drop = list(set(df_i.columns.values) - set(columns_to_keep)) 21 | df_i.drop(columns=columns_to_drop, inplace=True) 22 | for seed_i in range(0, 5): 23 | df_j = pd.read_csv(f"../results_{feat}/results_benchmark_{feat}_acq_{acq}_batch_{batch}_seed_{seed_i}.csv") 24 | df_j.drop(columns=columns_to_drop, inplace=True) 25 | df_i = df_i.append(df_j) 26 | 27 | df_i.to_csv(f"./{feat}_{acq}_all.csv", index=False) 28 | 29 | df_av = df_i.groupby(['step', 'n_experiments']).agg([np.average]) 30 | df_av['step'] = np.unique(df_i.step.values) 31 | df_av['n_experiments'] = np.unique(df_i.n_experiments.values) 32 | df_av.to_csv(f"./{feat}_{acq}_avg.csv", index=False) 33 | 34 | df_min = df_i.groupby(['step', 'n_experiments']).agg([np.min]) 35 | df_min['step'] = np.unique(df_i.step.values) 36 | df_min['n_experiments'] = np.unique(df_i.n_experiments.values) 37 | df_min.to_csv(f"./{feat}_{acq}_min.csv", index=False) 38 | 39 | df_max = df_i.groupby(['step', 'n_experiments']).agg([np.max]) 40 | df_max['step'] = np.unique(df_i.step.values) 41 | df_max['n_experiments'] = np.unique(df_i.n_experiments.values) 42 | df_max.to_csv(f"./{feat}_{acq}_max.csv", index=False) 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/performance/2_plot_ground_truth.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | sns.set_style("ticks") 7 | sns.despine() 8 | import matplotlib as mpl 9 | mpl.rcParams['grid.linestyle'] = ':' 10 | mpl.rcParams['grid.linewidth'] = 0.1 11 | plt.rcParams['font.family'] = 'Helvetica' 12 | plt.rcParams['font.size'] = 10 13 | import pareto 14 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto 15 | from pymoo.mcdm.high_tradeoff import HighTradeoffPoints 16 | from sklearn.preprocessing import MinMaxScaler 17 | 18 | # Hue: Color (ligand), shape (base), filling (solvent), alpha (ligand_eq). 19 | 20 | import seaborn as sns 21 | 22 | dataset = 'dft' 23 | acq = 'EHVI' 24 | batch = 1 25 | total_restarts = 5 26 | n_steps = 30 27 | seed = 0 28 | 29 | 30 | def get_pareto_points(objective_values): 31 | """ Get pareto for the ground truth function. 32 | NOTE: Assumes maximization.""" 33 | pareto_ground = pareto.eps_sort(tables=objective_values, 34 | objectives=np.arange(2), 35 | maximize_all=True) 36 | idx_pareto = is_pareto(objectives=-objective_values) 37 | return np.array(pareto_ground), idx_pareto 38 | 39 | 40 | def get_high_tradeoff_points(pareto_points): 41 | """ Pass a numpy array with the pareto points and returns a numpy 42 | array with the high tradeoff points.""" 43 | 44 | scaler_pareto = MinMaxScaler() 45 | pareto_scaled = scaler_pareto.fit_transform(pareto_points) 46 | try: 47 | tradeoff = HighTradeoffPoints() 48 | 49 | tradeoff_args = tradeoff.do(-pareto_scaled) # Always minimizing. 50 | tradeoff_points = pareto_points[tradeoff_args] 51 | except: 52 | tradeoff_points = [] 53 | pass 54 | return tradeoff_points 55 | 56 | 57 | df_exp = pd.read_csv('../data/dataset_B1.csv') 58 | objective_vals = df_exp[['objective_conversion', 'objective_selectivity']].values 59 | pareto_points, idx_pareto = get_pareto_points(objective_vals) 60 | high_tradeoff_points = get_high_tradeoff_points(pareto_points) 61 | 62 | 63 | df_benchmark = pd.read_csv(f'../results_{dataset}/results_benchmark_{dataset}_acq_{acq}_batch_{batch}_seed_{seed}.csv') 64 | 65 | fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(9, 15)) 66 | 67 | palettes = [['tab10', 'viridis'], [None, 'Blues']] 68 | 69 | hues = [['ligand', 'base'], ['solvent', 'ligand_equivalent']] 70 | for i in range(0, 2): 71 | for j in range(0, 2): 72 | sns.scatterplot(x=df_exp['objective_conversion'], y=df_exp['objective_selectivity'], 73 | hue=df_exp[hues[i][j]], s=40, lw=1., edgecolor='black', ax=ax[i][j], palette=palettes[i][j]) 74 | sns.lineplot(x=pareto_points[:, 0], y=pareto_points[:, 1], 75 | linewidth=1.2, color='grey', ls='dotted', ax=ax[i][j]) 76 | ax[i][j].set_xlim(-5, 105) 77 | ax[i][j].set_ylim(-5, 105) 78 | ax[i][j].legend(loc=4) 79 | ax[i][j].set_title(hues[i][j]) 80 | 81 | plt.tight_layout() 82 | plt.show() 83 | 84 | palettes = ['tab10', None] 85 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 5)) 86 | hues = ['ligand', 'solvent'] 87 | 88 | for i in range(0, 2): 89 | sns.scatterplot(x=df_exp['objective_conversion'], y=df_exp['objective_selectivity'], 90 | hue=df_exp[hues[i]], s=50, lw=1., edgecolor='black', ax=ax[i], palette=palettes[i]) 91 | sns.lineplot(x=pareto_points[:, 0], y=pareto_points[:, 1], 92 | linewidth=1.2, color='grey', ls='dotted', ax=ax[i]) 93 | ax[i].set_xlim(-5, 105) 94 | ax[i].set_ylim(-5, 105) 95 | ax[i].legend(loc=4) 96 | ax[i].set_title(hues[i]) 97 | 98 | ax[0].legend(scatterpoints=1, loc='best', ncol=2, markerscale=1, fontsize=9) 99 | ax[1].legend(scatterpoints=1, loc='best', ncol=2, markerscale=1, fontsize=9) 100 | 101 | plt.tight_layout() 102 | plt.savefig('Fig2_scope.svg', dpi=500, format='svg') 103 | plt.show() 104 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/performance/3_plot_decision_pathways_objectives.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | sns.set_style("ticks") 7 | sns.despine() 8 | import matplotlib as mpl 9 | mpl.rcParams['grid.linestyle'] = ':' 10 | mpl.rcParams['grid.linewidth'] = 0.1 11 | plt.rcParams['font.family'] = 'Helvetica' 12 | 13 | 14 | datasets = ['ohe', 'dft', 'mordred', 'random'] 15 | acq = 'EHVI' 16 | batch = 1 17 | total_restarts = 5 18 | n_steps = 30 19 | 20 | color_paletes = [sns.color_palette("Blues", n_colors=total_restarts), 21 | sns.color_palette("Reds", n_colors=total_restarts), 22 | sns.color_palette("Greens", n_colors=total_restarts), 23 | sns.color_palette("Oranges", n_colors=total_restarts)] 24 | 25 | cp = 0 26 | for dataset in datasets: 27 | objectives = ['objective_conversion', 'objective_selectivity'] 28 | dict_ratios_plot = {'width_ratios': [0.5, 0.2, 0.5, 0.2], 'wspace': 0.4} 29 | fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(10, 3), 30 | gridspec_kw=dict_ratios_plot) 31 | obj_counter = 0 32 | for obj in objectives: 33 | 34 | for seed in range(total_restarts): 35 | df_benchmark = pd.read_csv(f'../results_{dataset}/results_benchmark_{dataset}_acq_{acq}_batch_{batch}_seed_{seed}.csv') 36 | df_exp = pd.read_csv('../data/dataset_B1.csv') 37 | total_number_of_experiments = len(df_exp) 38 | 39 | trace_xy = [] 40 | for i in range(0, n_steps): 41 | trace_xy.append([df_benchmark['step'][i], df_benchmark[f"{obj}_collected_values"][i]]) 42 | trace_xy = np.reshape(trace_xy, (len(trace_xy), -2)) 43 | ax[0+obj_counter].scatter(trace_xy[:, 0], trace_xy[:, 1], 44 | facecolor='white', s=50, 45 | edgecolors=color_paletes[cp][seed], 46 | zorder=100) 47 | ax[0+obj_counter].plot(trace_xy[:, 0], trace_xy[:, 1], 48 | linestyle='dotted', c=color_paletes[cp][seed], 49 | lw=1.1, alpha=1.) 50 | ax[0+obj_counter].set_xlim(-1, n_steps+1) 51 | ax[0+obj_counter].set_ylim(-5, 100+10) 52 | # ax[0].set_title(f'Objective: {obj}') 53 | sns.despine(trim=True, offset=2, ax=ax[0+obj_counter]) 54 | sns.distplot(a=df_benchmark, x=df_benchmark[f"{obj}_collected_values"], 55 | ax=ax[1+obj_counter], vertical=True, 56 | hist=False, 57 | # bins=20 58 | kde_kws={'shade': True, 59 | 'color': color_paletes[cp][seed], 60 | 'alpha': 0.1}, 61 | color='black' 62 | ) 63 | 64 | ax[1+obj_counter].set_xlim(0, 0.025) 65 | ax[1+obj_counter].set_ylim(-5, 100+10) 66 | ax[1+obj_counter].axvline(x=0.015, color='black', ls='dotted', alpha=0.5) 67 | 68 | ax[0+obj_counter].set_title(dataset) 69 | ax[0+obj_counter].set_xlabel('Number of samples collected') 70 | ax[0+obj_counter].set_ylabel(f"{obj} (in %)") 71 | hlinecolor = 'black' 72 | hlinestyle = 'dotted' 73 | hlinewidth = 0.5 74 | # plt.hlines(y=13, linestyles=hlinestyle, lw=hlinewidth, colors=hlinecolor, xmin=0, xmax=n_steps) 75 | # plt.hlines(y=14, linestyles=hlinestyle, lw=hlinewidth, colors=hlinecolor, xmin=0, xmax=n_steps) 76 | # plt.hlines(y=29, linestyles=hlinestyle, lw=hlinewidth, colors=hlinecolor, xmin=0, xmax=n_steps) 77 | # plt.hlines(y=9, linestyles=hlinestyle, lw=hlinewidth, colors=hlinecolor, xmin=0, xmax=n_steps) 78 | # plt.hlines(y=8, linestyles=hlinestyle, lw=hlinewidth, colors=hlinecolor, xmin=0, xmax=n_steps) 79 | obj_counter += 2 80 | plt.savefig(f"fig_3_{cp}.svg", format='svg', dpi=500) 81 | plt.show() 82 | plt.tight_layout() 83 | cp += 1 84 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/performance/4_plot_performance.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import pandas as pd 6 | import os 7 | 8 | 9 | # sns.set_style("ticks") 10 | # sns.set_context("paper") 11 | import matplotlib as mpl 12 | mpl.rcParams['grid.linestyle'] = ':' 13 | mpl.rcParams['grid.linewidth'] = 0.1 14 | 15 | objective_1 = 'conversion' 16 | objective_2 = 'selectivity' 17 | 18 | plt.rcParams['font.family'] = 'Helvetica' 19 | # mpl.rc('font', **{'family':'sans-serif', 'sans-serif':['HelveticaLight']}) 20 | 21 | # Best objectives. 22 | best_conversion_in_scope = 100. 23 | best_selectivity_in_scope = 100. 24 | n_steps = 30 25 | feat_iter = 0 26 | 27 | if not os.path.exists('./figures'): 28 | os.mkdir('figures') 29 | 30 | 31 | for acq in ['EHVI', 'MOUCB', 'MOGreedy']: 32 | colors = ['#DC143C', '#0343DF', '#FAC205', '#15B01A'] 33 | color_i = 0 34 | fig, ax = plt.subplots(figsize=(8., 8.0), dpi=500, nrows=2, ncols=2) 35 | 36 | for feat in ['ohe', 'dft', 'mordred', 'random']: 37 | avg = pd.read_csv(f"./{feat}_{acq}_avg.csv") 38 | avg = avg.apply(pd.to_numeric, errors='coerce') 39 | max = pd.read_csv(f"./{feat}_{acq}_max.csv") 40 | max = max.apply(pd.to_numeric, errors='coerce') 41 | min = pd.read_csv(f"./{feat}_{acq}_min.csv") 42 | min = min.apply(pd.to_numeric, errors='coerce') 43 | 44 | n_exp = avg['n_experiments'].values[1:] 45 | 46 | # Hypervolume. 47 | hypervol_max = max['hypervolume completed (%)'].values[1:] 48 | hypervol_min = min['hypervolume completed (%)'].values[1:] 49 | hypervol_avg = avg['hypervolume completed (%)'].values[1:] 50 | 51 | # Where hypervolume is 99% completed. 52 | try: 53 | hyper_complete_arg = np.argwhere(hypervol_avg > 99.0)[0] 54 | hyper_complete_y = [hypervol_avg[hyper_complete_arg]] 55 | hyper_complete_x = [n_exp[hyper_complete_arg]] 56 | except: 57 | conversion_complete_x = [] 58 | conversion_complete_y = [] 59 | 60 | # Distance pareto. 61 | dtradeoff_max = max['dmaximin_tradeoff'].values[1:] 62 | dtradeoff_min = min['dmaximin_tradeoff'].values[1:] 63 | dtradeoff_avg = avg['dmaximin_tradeoff'].values[1:] 64 | 65 | 66 | # Best samples at each run. 67 | bestconversion_max = max['objective_conversion_best'].values[1:] 68 | bestselectivity_max = max['objective_selectivity_best'].values[1:] 69 | bestconversion_min = min['objective_conversion_best'].values[1:] 70 | bestselectivity_min = min['objective_selectivity_best'].values[1:] 71 | bestconversion_avg = avg['objective_conversion_best'].values[1:] 72 | bestselectivity_avg = avg['objective_selectivity_best'].values[1:] 73 | 74 | # Where best conversion is sampled. 75 | try: 76 | conversion_complete_arg = np.argwhere(bestconversion_max == best_conversion_in_scope)[0] 77 | conversion_complete_y = [bestconversion_max[conversion_complete_arg]] 78 | conversion_complete_x = [n_exp[conversion_complete_arg]] 79 | except: 80 | conversion_complete_x = [] 81 | conversion_complete_y = [] 82 | 83 | # Where best selectivity is sampled. 84 | try: 85 | selectivity_complete_arg = np.argwhere(bestselectivity_min == best_selectivity_in_scope)[0] 86 | selectivity_complete_y = [bestselectivity_min[selectivity_complete_arg]] 87 | selectivity_complete_x = [n_exp[selectivity_complete_arg]] 88 | except: 89 | selectivity_complete_x = [] 90 | selectivity_complete_y = [] 91 | 92 | # Plot performance for each acquisition function. 93 | ax[0][0].plot(n_exp, hypervol_avg, color=colors[color_i], lw=2.5, 94 | label=feat.upper()) 95 | ax[0][0].fill_between(x=n_exp, 96 | y1=hypervol_avg, 97 | y2=hypervol_max, color=colors[color_i], alpha=0.3, lw=0.) 98 | ax[0][0].fill_between(x=n_exp, 99 | y1=hypervol_min, 100 | y2=hypervol_avg, color=colors[color_i], alpha=0.3, lw=0.) 101 | ax[0][0].plot(n_exp, hypervol_min, color=colors[color_i], alpha=1., lw=1., ls='--') 102 | ax[0][0].plot(n_exp, hypervol_max, color=colors[color_i], alpha=1., lw=1., ls='--') 103 | ax[0][0].plot(n_exp, np.ones_like(n_exp)*100, 104 | dashes=[8, 4], color='black', linewidth=0.8) 105 | ax[0][0].scatter(n_exp, hypervol_avg, marker='o', s=0., color=colors[color_i]) 106 | 107 | ax[0][0].set_xticks(np.arange(0, 120, 10)) 108 | ax[0][0].set_xlim(0, n_steps) 109 | ax[0][0].set_ylim(0, 100) 110 | ax[0][0].set_xlabel('Samples') 111 | ax[0][0].set_ylabel('Hypervolume (%)') 112 | 113 | # Plot distance tradeoff. 114 | ax[0][1].plot(n_exp, dtradeoff_avg, color=colors[color_i], lw=2.5, 115 | label=feat.upper()) 116 | ax[0][1].plot(n_exp, dtradeoff_min, color=colors[color_i], lw=1., ls='--', 117 | label=feat.upper()) 118 | ax[0][1].plot(n_exp, dtradeoff_max, color=colors[color_i], lw=1., ls='--', 119 | label=feat.upper()) 120 | 121 | 122 | ax[0][1].fill_between(x=n_exp, 123 | y1=dtradeoff_avg, 124 | y2=dtradeoff_max, color=colors[color_i], alpha=0.3, 125 | ) 126 | ax[0][1].fill_between(x=n_exp, 127 | y1=dtradeoff_min, 128 | y2=dtradeoff_avg, color=colors[color_i], alpha=0.3, 129 | ) 130 | ax[0][1].plot(n_exp, np.ones_like(n_exp) * 0, 131 | dashes=[8, 4], color='black', linewidth=0.8) 132 | ax[0][1].scatter(n_exp, dtradeoff_avg, marker='o', s=0., 133 | color=colors[color_i]) 134 | 135 | 136 | ax[0][1].set_xticks(np.arange(0, 120, 10)) 137 | ax[0][1].set_xlim(0, n_steps) 138 | ax[0][1].set_ylim(0, 80) 139 | ax[0][1].set_xlabel('Samples') 140 | ax[0][1].set_ylabel(r'$d_{(trade-off)}$') 141 | 142 | # Plot best conversion. 143 | ax[1][0].plot(n_exp, bestconversion_avg, color=colors[color_i], lw=2.5, 144 | label=feat) 145 | ax[1][0].plot(n_exp, bestconversion_min, color=colors[color_i], lw=1, ls='--', 146 | label=feat, alpha=1.) 147 | ax[1][0].plot(n_exp, bestconversion_max, color=colors[color_i], lw=1, ls='--', 148 | label=feat, alpha=1.) 149 | ax[1][0].fill_between(x=n_exp, 150 | y1=bestconversion_avg, 151 | y2=bestconversion_max, color=colors[color_i], alpha=0.3, 152 | ) 153 | ax[1][0].fill_between(x=n_exp, 154 | y1=bestconversion_min, 155 | y2=bestconversion_avg, color=colors[color_i], alpha=0.3, 156 | ) 157 | 158 | ax[1][0].plot(n_exp, np.ones_like(n_exp) * 0, 159 | dashes=[8, 4], color='black', linewidth=0.8) 160 | ax[1][0].scatter(n_exp, bestconversion_avg, marker='o', s=0., 161 | color=colors[color_i]) 162 | 163 | ax[1][0].set_xticks(np.arange(0, 120, 10)) 164 | ax[1][0].set_xlim(0, n_steps) 165 | ax[1][0].set_ylim(20, 100) 166 | ax[1][0].set_xlabel('Samples') 167 | ax[1][0].set_ylabel('Best conversion') 168 | 169 | # Plot best selectivity. 170 | ax[1][1].plot(n_exp, bestselectivity_avg, color=colors[color_i], lw=2.5, 171 | label=feat.upper()) 172 | 173 | ax[1][1].plot(n_exp, bestselectivity_min, color=colors[color_i], lw=1.0, ls='--', 174 | label=feat.upper()) 175 | ax[1][1].plot(n_exp, bestselectivity_max, color=colors[color_i], lw=1.0, ls='--', 176 | label=feat.upper()) 177 | 178 | 179 | ax[1][1].fill_between(x=n_exp, 180 | y1=bestselectivity_avg, 181 | y2=bestselectivity_max, color=colors[color_i], alpha=0.3, 182 | ) 183 | ax[1][1].fill_between(x=n_exp, 184 | y1=bestselectivity_min, 185 | y2=bestselectivity_avg, color=colors[color_i], alpha=0.3, 186 | ) 187 | ax[1][1].plot(n_exp, np.ones_like(n_exp) * 0, 188 | dashes=[8, 4], color='black', linewidth=0.8) 189 | ax[1][1].scatter(n_exp, bestselectivity_avg, marker='o', s=0., 190 | color=colors[color_i]) 191 | 192 | 193 | ax[1][1].set_xticks(np.arange(0, 120, 10)) 194 | ax[1][1].set_xlim(0, n_steps) 195 | ax[1][1].set_ylim(0, 100.) 196 | ax[1][1].set_xlabel('Samples') 197 | ax[1][1].set_ylabel('Best selectivity') 198 | 199 | color_i += 1 200 | plt.legend() 201 | plt.tight_layout() 202 | plt.savefig(f"figures/benchmark_{acq}.svg") 203 | plt.show() 204 | 205 | 206 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/performance/5_find_entry.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | 4 | 5 | df = pd.read_csv('../data/dataset_B1.csv') 6 | 7 | c_ligand, c_base, c_leq, c_solvent = 'SPhos', 'NaOH(aq.)', 0.0625, 'DMF' 8 | 9 | c_ligand, c_base, c_leq, c_solvent = 'P(Ph)3', 'KOAc', 0.125, 'MeOH' 10 | 11 | c_ligand, c_base, c_leq, c_solvent = 'P(Cy)3', 'Cs2CO3(aq.)', 0.125, 'MeOH' 12 | 13 | c_ligand, c_base, c_leq, c_solvent = 'P(Ph)3', 'NaOH(aq.)', 0.125, 'MeOH' 14 | 15 | c_ligand, c_base, c_leq, c_solvent = 'P(Ph)3', 'CsF(aq.)', 0.125, 'MeCN' 16 | 17 | df_new = df[(df['ligand'] == c_ligand) & (df['base'] == c_base) & (df['solvent'] == c_solvent)] 18 | 19 | print(df_new) 20 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/performance/7_plot_performance_acquisition_function.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import pandas as pd 6 | import os 7 | 8 | 9 | # sns.set_style("ticks") 10 | # sns.set_context("paper") 11 | import matplotlib as mpl 12 | mpl.rcParams['grid.linestyle'] = ':' 13 | mpl.rcParams['grid.linewidth'] = 0.1 14 | 15 | objective_1 = 'conversion' 16 | objective_2 = 'selectivity' 17 | 18 | plt.rcParams['font.family'] = 'Helvetica' 19 | # mpl.rc('font', **{'family':'sans-serif', 'sans-serif':['HelveticaLight']}) 20 | 21 | # Best objectives. 22 | best_conversion_in_scope = 100. 23 | best_selectivity_in_scope = 100. 24 | n_steps = 30 25 | feat_iter = 0 26 | 27 | if not os.path.exists('./figures'): 28 | os.mkdir('figures') 29 | 30 | 31 | colors = ['#DC143C', '#0343DF', '#FAC205'] 32 | feat = 'DFT' 33 | color_i = 0 34 | fig, ax = plt.subplots(figsize=(8., 8.), dpi=500, nrows=2, ncols=2) 35 | for acq in ['EHVI', 'MOUCB', 'MOGreedy']: 36 | 37 | avg = pd.read_csv(f"./{feat}_{acq}_avg.csv") 38 | avg = avg.apply(pd.to_numeric, errors='coerce') 39 | max = pd.read_csv(f"./{feat}_{acq}_max.csv") 40 | max = max.apply(pd.to_numeric, errors='coerce') 41 | min = pd.read_csv(f"./{feat}_{acq}_min.csv") 42 | min = min.apply(pd.to_numeric, errors='coerce') 43 | 44 | n_exp = avg['n_experiments'].values[1:] 45 | 46 | # Hypervolume. 47 | hypervol_max = max['hypervolume completed (%)'].values[1:] 48 | hypervol_min = min['hypervolume completed (%)'].values[1:] 49 | hypervol_avg = avg['hypervolume completed (%)'].values[1:] 50 | 51 | # Where hypervolume is 99% completed. 52 | try: 53 | hyper_complete_arg = np.argwhere(hypervol_avg > 99.0)[0] 54 | hyper_complete_y = [hypervol_avg[hyper_complete_arg]] 55 | hyper_complete_x = [n_exp[hyper_complete_arg]] 56 | except: 57 | conversion_complete_x = [] 58 | conversion_complete_y = [] 59 | 60 | # Distance pareto. 61 | dtradeoff_max = max['dmaximin_tradeoff'].values[1:] 62 | dtradeoff_min = min['dmaximin_tradeoff'].values[1:] 63 | dtradeoff_avg = avg['dmaximin_tradeoff'].values[1:] 64 | 65 | 66 | # Best samples at each run. 67 | bestconversion_max = max['objective_conversion_best'].values[1:] 68 | bestselectivity_max = max['objective_selectivity_best'].values[1:] 69 | bestconversion_min = min['objective_conversion_best'].values[1:] 70 | bestselectivity_min = min['objective_selectivity_best'].values[1:] 71 | bestconversion_avg = avg['objective_conversion_best'].values[1:] 72 | bestselectivity_avg = avg['objective_selectivity_best'].values[1:] 73 | 74 | # Where best conversion is sampled. 75 | try: 76 | conversion_complete_arg = np.argwhere(bestconversion_max == best_conversion_in_scope)[0] 77 | conversion_complete_y = [bestconversion_max[conversion_complete_arg]] 78 | conversion_complete_x = [n_exp[conversion_complete_arg]] 79 | except: 80 | conversion_complete_x = [] 81 | conversion_complete_y = [] 82 | 83 | # Where best selectivity is sampled. 84 | try: 85 | selectivity_complete_arg = np.argwhere(bestselectivity_min == best_selectivity_in_scope)[0] 86 | selectivity_complete_y = [bestselectivity_min[selectivity_complete_arg]] 87 | selectivity_complete_x = [n_exp[selectivity_complete_arg]] 88 | except: 89 | selectivity_complete_x = [] 90 | selectivity_complete_y = [] 91 | 92 | # Plot performance for each acquisition function. 93 | ax[0][0].plot(n_exp, hypervol_avg, color=colors[color_i], lw=2.5, label=acq.upper()) 94 | ax[0][0].fill_between(x=n_exp, y1=hypervol_avg, y2=hypervol_max, color=colors[color_i], alpha=0.3, lw=0.) 95 | ax[0][0].fill_between(x=n_exp, y1=hypervol_min, y2=hypervol_avg, color=colors[color_i], alpha=0.3, lw=0.) 96 | ax[0][0].plot(n_exp, hypervol_min, color=colors[color_i], alpha=1., lw=1., ls='--') 97 | ax[0][0].plot(n_exp, hypervol_max, color=colors[color_i], alpha=1., lw=1., ls='--') 98 | ax[0][0].plot(n_exp, np.ones_like(n_exp)*100, dashes=[8, 4], color='black', linewidth=0.8) 99 | ax[0][0].scatter(n_exp, hypervol_avg, marker='o', s=0., color=colors[color_i]) 100 | 101 | ax[0][0].set_xticks(np.arange(0, 120, 10)) 102 | ax[0][0].set_xlim(0, n_steps) 103 | ax[0][0].set_ylim(0, 100) 104 | ax[0][0].set_xlabel('Samples') 105 | ax[0][0].set_ylabel('Hypervolume (%)') 106 | # plt.tick_params(axis="x", direction="in") 107 | # plt.tick_params(axis="y", direction="in") 108 | 109 | # Plot distance tradeoff. 110 | ax[0][1].plot(n_exp, dtradeoff_avg, color=colors[color_i], lw=2.5, label=acq.upper()) 111 | ax[0][1].plot(n_exp, dtradeoff_min, color=colors[color_i], lw=1., ls='--', label=acq.upper()) 112 | ax[0][1].plot(n_exp, dtradeoff_max, color=colors[color_i], lw=1., ls='--', label=acq.upper()) 113 | ax[0][1].fill_between(x=n_exp, y1=dtradeoff_avg, y2=dtradeoff_max, color=colors[color_i], alpha=0.3) 114 | ax[0][1].fill_between(x=n_exp, y1=dtradeoff_min, y2=dtradeoff_avg, color=colors[color_i], alpha=0.3) 115 | ax[0][1].plot(n_exp, np.ones_like(n_exp) * 0, dashes=[8, 4], color='black', linewidth=0.8) 116 | ax[0][1].scatter(n_exp, dtradeoff_avg, marker='o', s=0., color=colors[color_i]) 117 | 118 | ax[0][1].set_xticks(np.arange(0, 120, 10)) 119 | ax[0][1].set_xlim(0, n_steps) 120 | ax[0][1].set_ylim(0, 80) 121 | ax[0][1].set_xlabel('Samples') 122 | ax[0][1].set_ylabel(r'$d_{(trade-off)}$') 123 | 124 | # Plot best conversion. 125 | ax[1][0].plot(n_exp, bestconversion_avg, color=colors[color_i], lw=2.5, label=acq) 126 | ax[1][0].plot(n_exp, bestconversion_min, color=colors[color_i], lw=1, ls='--', label=acq, alpha=1.) 127 | ax[1][0].plot(n_exp, bestconversion_max, color=colors[color_i], lw=1, ls='--', label=acq, alpha=1.) 128 | ax[1][0].fill_between(x=n_exp, y1=bestconversion_avg, y2=bestconversion_max, color=colors[color_i], alpha=0.3) 129 | ax[1][0].fill_between(x=n_exp, y1=bestconversion_min, y2=bestconversion_avg, color=colors[color_i], alpha=0.3) 130 | 131 | ax[1][0].plot(n_exp, np.ones_like(n_exp) * 0, 132 | dashes=[8, 4], color='black', linewidth=0.8) 133 | ax[1][0].scatter(n_exp, bestconversion_avg, marker='o', s=0., 134 | color=colors[color_i]) 135 | 136 | ax[1][0].set_xticks(np.arange(0, 120, 10)) 137 | ax[1][0].set_xlim(0, n_steps) 138 | ax[1][0].set_ylim(20, 100) 139 | ax[1][0].set_xlabel('Samples') 140 | ax[1][0].set_ylabel('Best conversion') 141 | 142 | # Plot best selectivity. 143 | ax[1][1].plot(n_exp, bestselectivity_avg, color=colors[color_i], lw=2.5, 144 | label=acq.upper()) 145 | 146 | ax[1][1].plot(n_exp, bestselectivity_min, color=colors[color_i], lw=1.0, ls='--', 147 | label=acq.upper()) 148 | ax[1][1].plot(n_exp, bestselectivity_max, color=colors[color_i], lw=1.0, ls='--', 149 | label=acq.upper()) 150 | 151 | 152 | ax[1][1].fill_between(x=n_exp, 153 | y1=bestselectivity_avg, 154 | y2=bestselectivity_max, color=colors[color_i], alpha=0.3, 155 | ) 156 | ax[1][1].fill_between(x=n_exp, 157 | y1=bestselectivity_min, 158 | y2=bestselectivity_avg, color=colors[color_i], alpha=0.3, 159 | ) 160 | ax[1][1].plot(n_exp, np.ones_like(n_exp) * 0, 161 | dashes=[8, 4], color='black', linewidth=0.8) 162 | ax[1][1].scatter(n_exp, bestselectivity_avg, marker='o', s=0., 163 | color=colors[color_i]) 164 | 165 | 166 | ax[1][1].set_xticks(np.arange(0, 120, 10)) 167 | ax[1][1].set_xlim(0, n_steps) 168 | ax[1][1].set_ylim(0, 100.) 169 | ax[1][1].set_xlabel('Samples') 170 | ax[1][1].set_ylabel('Best selectivity') 171 | 172 | color_i += 1 173 | 174 | ax[0][1].legend() 175 | plt.tight_layout() 176 | # plt.savefig(f"figures/benchmark_acquisition_functions.svg") 177 | plt.show() 178 | 179 | 180 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/performance_acq/1_merge_all.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | objective_1 = 'objective_conversion' 6 | objective_2 = 'objective_selectivity' 7 | columns_to_keep = ['step', 'n_experiments', 'hypervolume completed (%)'] 8 | 9 | for batch in [1, 2, 3, 5]: 10 | for acq in ['EHVI', 'MOUCB', 'MOGreedy']: 11 | 12 | df_i = pd.read_csv(f"../results/results_benchmark_dft_acq_{acq}_batch_{batch}_seed_0.csv") 13 | columns_to_drop = list(set(df_i.columns.values) - set(columns_to_keep)) 14 | df_i.drop(columns=columns_to_drop, inplace=True) 15 | for seed_i in range(0, 5): 16 | df_j = pd.read_csv(f"../results/results_benchmark_dft_acq_{acq}_batch_{batch}_seed_{seed_i}.csv") 17 | df_j.drop(columns=columns_to_drop, inplace=True) 18 | df_i = df_i.append(df_j) 19 | 20 | df_i.to_csv(f"./dft_{acq}_{batch}_all.csv", index=False) 21 | 22 | df_av = df_i.groupby(['step', 'n_experiments']).agg([np.average]) 23 | df_av['step'] = np.unique(df_i.step.values) 24 | df_av['n_experiments'] = np.unique(df_i.n_experiments.values) 25 | df_av.to_csv(f"./dft_{acq}_{batch}_avg.csv", index=False) 26 | 27 | df_min = df_i.groupby(['step', 'n_experiments']).agg([np.min]) 28 | df_min['step'] = np.unique(df_i.step.values) 29 | df_min['n_experiments'] = np.unique(df_i.n_experiments.values) 30 | df_min.to_csv(f"./dft_{acq}_{batch}_min.csv", index=False) 31 | 32 | df_max = df_i.groupby(['step', 'n_experiments']).agg([np.max]) 33 | df_max['step'] = np.unique(df_i.step.values) 34 | df_max['n_experiments'] = np.unique(df_i.n_experiments.values) 35 | df_max.to_csv(f"./dft_{acq}_{batch}_max.csv", index=False) 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /examples/publication/Suzuki/performance_acq/2_plot_acq_batch.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | n_steps = 30 9 | colors = ['#DC143C', '#0343DF', '#FAC205'] 10 | feat = 'dft' 11 | fig, ax = plt.subplots(figsize=(15., 4.), dpi=500, nrows=1, ncols=4) 12 | 13 | batch_count = 0 14 | for batch in [1, 2, 3, 5]: 15 | color_i = 0 16 | for acq in ['EHVI', 'MOUCB', 'MOGreedy']: 17 | avg = pd.read_csv(f"./{feat}_{acq}_{batch}_avg.csv") 18 | avg = avg.apply(pd.to_numeric, errors='coerce') 19 | max = pd.read_csv(f"./{feat}_{acq}_{batch}_max.csv") 20 | max = max.apply(pd.to_numeric, errors='coerce') 21 | min = pd.read_csv(f"./{feat}_{acq}_{batch}_min.csv") 22 | min = min.apply(pd.to_numeric, errors='coerce') 23 | n_exp = avg['n_experiments'].values[1:] 24 | 25 | hypervol_max = max['hypervolume completed (%)'].values[1:] 26 | hypervol_min = min['hypervolume completed (%)'].values[1:] 27 | hypervol_avg = avg['hypervolume completed (%)'].values[1:] 28 | # Plot performance for each acquisition function. 29 | ax[batch_count].plot(n_exp, hypervol_avg, color=colors[color_i], lw=2.5, label=acq.upper()) 30 | ax[batch_count].fill_between(x=n_exp, y1=hypervol_avg, y2=hypervol_max, color=colors[color_i], alpha=0.3, lw=0.) 31 | ax[batch_count].fill_between(x=n_exp, y1=hypervol_min, y2=hypervol_avg, color=colors[color_i], alpha=0.3, lw=0.) 32 | ax[batch_count].plot(n_exp, hypervol_min, color=colors[color_i], alpha=1., lw=1., ls='--') 33 | ax[batch_count].plot(n_exp, hypervol_max, color=colors[color_i], alpha=1., lw=1., ls='--') 34 | ax[batch_count].plot(n_exp, np.ones_like(n_exp) * 100, dashes=[8, 4], color='black', linewidth=0.8) 35 | ax[batch_count].scatter(n_exp, hypervol_avg, marker='o', s=0., color=colors[color_i]) 36 | 37 | ax[batch_count].set_xticks(np.arange(0, 120, 5)) 38 | ax[batch_count].set_xlim(0, n_steps) 39 | ax[batch_count].set_ylim(0, 100) 40 | ax[batch_count].set_xlabel('Samples') 41 | ax[batch_count].set_ylabel('Hypervolume (%)') 42 | color_i += 1 43 | 44 | batch_count += 1 45 | plt.legend() 46 | 47 | if not os.path.exists('figures'): 48 | os.mkdir('figures') 49 | 50 | plt.tight_layout() 51 | plt.savefig(f"figures/benchmark_acquisition_functions_batch.svg") 52 | plt.show() 53 | 54 | -------------------------------------------------------------------------------- /examples/publication/Virtual-experimentation/1_benchmark.py: -------------------------------------------------------------------------------- 1 | 2 | import shutil 3 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark 4 | import os 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | ####################### 10 | # Benchmark inputs 11 | budget = 30 12 | 13 | acq = 'EHVI' 14 | seed = 1 15 | for sampling_method in ['seed', 'lhs', 'cvtsampling']: 16 | for batch in [1, 2, 3, 5]: 17 | df_exp = pd.read_csv('./data/data.csv') 18 | df_exp['new_index'] = np.arange(0, len(df_exp.values)) 19 | sort_column = 'new_index' 20 | 21 | # Select the features for the model. 22 | columns_regression = ['Temperature', 'Volume', 'D', 23 | 'SM2', 24 | 'W', 25 | 'Mixing', 26 | 'Time', 27 | 'WB' 28 | ] 29 | 30 | # Select objectives. 31 | objectives = ['P', 'I1'] 32 | objective_modes = ['max', 'min'] 33 | objective_thresholds = [None, None] 34 | print(f"Columns for regression: {columns_regression}") 35 | 36 | label_benchmark = f"benchmark_acq_{acq}_batch_{batch}_{sampling_method}.csv" 37 | 38 | # Remove previous files. 39 | if os.path.exists(label_benchmark): 40 | os.remove(label_benchmark) 41 | 42 | if os.path.exists(f'pred_{label_benchmark}'): 43 | os.remove(f'pred_{label_benchmark}') 44 | 45 | if os.path.exists(f'results_{label_benchmark}'): 46 | os.remove(f'results_{label_benchmark}') 47 | 48 | bench = Benchmark( 49 | df_ground=df_exp, 50 | features_regression=columns_regression, 51 | objective_names=objectives, 52 | objective_modes=objective_modes, 53 | objective_thresholds=objective_thresholds, 54 | filename=label_benchmark, 55 | filename_results=f'results_{label_benchmark}', 56 | index_column=sort_column,acquisition_function=acq 57 | ) 58 | 59 | bench.run( 60 | steps=int(budget/batch), batch=batch, seed=seed, 61 | init_method=sampling_method, 62 | plot_train=False, plot_predictions=False 63 | ) 64 | 65 | if not os.path.exists('results'): 66 | os.mkdir('results') 67 | 68 | shutil.move(label_benchmark, f'results/{label_benchmark}') 69 | shutil.move(f'pred_{label_benchmark}', f'results/pred_{label_benchmark}') 70 | shutil.move(f'results_{label_benchmark}', f'results/results_{label_benchmark}') 71 | -------------------------------------------------------------------------------- /examples/publication/Virtual-experimentation/performance/1_merge_all.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | batch = 1 5 | 6 | objective_1 = 'P' 7 | objective_2 = 'I1' 8 | 9 | columns_to_keep = ['step', 'n_experiments', 10 | 'dmaximin_tradeoff', 'hypervolume completed (%)', 11 | f'MAE_{objective_1}', f"MAE_{objective_2}", 12 | f'RMSE_{objective_1}', f'RMSE_{objective_2}', 13 | f'R2_{objective_1}', f'R2_{objective_2}', 14 | f'{objective_1}_best', f'{objective_2}_best' 15 | ] 16 | 17 | acq = 'EHVI' 18 | for sampling in ['seed', 'lhs', 'cvtsampling']: 19 | df_i = pd.read_csv(f"../results/results_benchmark_acq_{acq}_batch_{batch}_{sampling}.csv") 20 | columns_to_drop = list(set(df_i.columns.values) - set(columns_to_keep)) 21 | df_i.drop(columns=columns_to_drop, inplace=True) 22 | 23 | df_i.to_csv(f"./{sampling}_all.csv", index=False) 24 | 25 | df_av = df_i.groupby(['step', 'n_experiments']).agg([np.average]) 26 | df_av['step'] = np.unique(df_i.step.values) 27 | df_av['n_experiments'] = np.unique(df_i.n_experiments.values) 28 | df_av.to_csv(f"./{sampling}_avg.csv", index=False) 29 | 30 | df_min = df_i.groupby(['step', 'n_experiments']).agg([np.min]) 31 | df_min['step'] = np.unique(df_i.step.values) 32 | df_min['n_experiments'] = np.unique(df_i.n_experiments.values) 33 | df_min.to_csv(f"./{sampling}_min.csv", index=False) 34 | 35 | df_max = df_i.groupby(['step', 'n_experiments']).agg([np.max]) 36 | df_max['step'] = np.unique(df_i.step.values) 37 | df_max['n_experiments'] = np.unique(df_i.n_experiments.values) 38 | df_max.to_csv(f"./{sampling}_max.csv", index=False) 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/publication/Virtual-experimentation/performance/2_plot_ground_truth.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import seaborn as sns 7 | sns.set_style("ticks") 8 | sns.despine() 9 | import matplotlib as mpl 10 | mpl.rcParams['grid.linestyle'] = ':' 11 | mpl.rcParams['grid.linewidth'] = 0.1 12 | plt.rcParams['font.family'] = 'Helvetica' 13 | plt.rcParams['font.size'] = 10 14 | import pareto 15 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto 16 | from pymoo.mcdm.high_tradeoff import HighTradeoffPoints 17 | from sklearn.preprocessing import MinMaxScaler 18 | 19 | 20 | # Hue: Color (ligand), shape (base), filling (solvent), alpha (ligand_eq). 21 | 22 | import seaborn as sns 23 | 24 | dataset = 'dft' 25 | acq = 'EHVI' 26 | batch = 1 27 | total_restarts = 5 28 | n_steps = 30 29 | seed = 0 30 | 31 | 32 | def get_pareto_points(objective_values): 33 | """ Get pareto for the ground truth function. 34 | NOTE: Assumes maximization.""" 35 | pareto_ground = pareto.eps_sort(tables=objective_values, 36 | objectives=np.arange(2), 37 | maximize_all=True) 38 | idx_pareto = is_pareto(objectives=-objective_values) 39 | return np.array(pareto_ground), idx_pareto 40 | 41 | def get_high_tradeoff_points(pareto_points): 42 | """ Pass a numpy array with the pareto points and returns a numpy 43 | array with the high tradeoff points.""" 44 | 45 | scaler_pareto = MinMaxScaler() 46 | pareto_scaled = scaler_pareto.fit_transform(pareto_points) 47 | try: 48 | tradeoff = HighTradeoffPoints() 49 | 50 | tradeoff_args = tradeoff.do(-pareto_scaled) # Always minimizing. 51 | tradeoff_points = pareto_points[tradeoff_args] 52 | except: 53 | tradeoff_points = [] 54 | pass 55 | return tradeoff_points 56 | 57 | 58 | df_exp = pd.read_csv('../data/data.csv') 59 | df_exp['I1'] = -df_exp['I1'].values 60 | objective_vals = df_exp[['P', 'I1']].values 61 | pareto_points, idx_pareto = get_pareto_points(objective_vals) 62 | high_tradeoff_points = get_high_tradeoff_points(pareto_points) 63 | 64 | fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(12, 12)) 65 | 66 | print(df_exp.columns) 67 | 68 | palettes = [['Reds', 'Reds', 'Blues'], 69 | ['Greens', 'Oranges', 'Reds'], 70 | ['Blues', 'Greens', 'Oranges'] 71 | ] 72 | 73 | hues = [['Temperature', 'Temperature', 'Volume'], 74 | ['D', 'SM2', 'W'], 75 | ['Mixing', 'Time', 'WB'] 76 | ] 77 | 78 | for i in range(0, 3): 79 | for j in range(0, 3): 80 | sns.scatterplot(x=df_exp['P'], y=df_exp['I1'], 81 | hue=df_exp[hues[i][j]], s=40, lw=1., edgecolor='black', ax=ax[i][j], palette=palettes[i][j]) 82 | sns.lineplot(x=pareto_points[:, 0], y=pareto_points[:, 1], 83 | linewidth=1.2, color='grey', ls='dotted', ax=ax[i][j]) 84 | # ax[i][j].set_xlim(-5, 105) 85 | # ax[i][j].set_ylim(-5, 105) 86 | ax[i][j].legend(loc=3) 87 | ax[i][j].set_title(hues[i][j]) 88 | fig.delaxes(ax[0][0]) 89 | plt.tight_layout() 90 | 91 | if not os.path.exists('../plots'): 92 | os.mkdir('../plots') 93 | plt.savefig('../plots/SI_ground_truth.svg', dpi=500, format='svg') 94 | plt.show() 95 | -------------------------------------------------------------------------------- /examples/publication/Virtual-experimentation/performance/3_plot_performance_acquisition_function.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import pandas as pd 6 | import os 7 | 8 | 9 | # sns.set_style("ticks") 10 | # sns.set_context("paper") 11 | import matplotlib as mpl 12 | mpl.rcParams['grid.linestyle'] = ':' 13 | mpl.rcParams['grid.linewidth'] = 0.1 14 | 15 | objective_1 = 'P' 16 | objective_2 = 'I1' 17 | 18 | plt.rcParams['font.family'] = 'Helvetica' 19 | # mpl.rc('font', **{'family':'sans-serif', 'sans-serif':['HelveticaLight']}) 20 | 21 | # Best objectives. 22 | best_P_in_scope = 100. 23 | best_I1_in_scope = 100. 24 | n_steps = 30 25 | 26 | if not os.path.exists('./figures'): 27 | os.mkdir('figures') 28 | 29 | 30 | colors = ['#DC143C', '#0343DF', '#FAC205'] 31 | color_i = 0 32 | fig, ax = plt.subplots(figsize=(8., 8.), dpi=500, nrows=2, ncols=2) 33 | 34 | acq = 'EHVI' 35 | for sampling in ['seed', 'lhs', 'cvtsampling']: 36 | 37 | avg = pd.read_csv(f"./{sampling}_avg.csv") 38 | 39 | avg = avg.apply(pd.to_numeric, errors='coerce') 40 | max = pd.read_csv(f"./{sampling}_max.csv") 41 | max = max.apply(pd.to_numeric, errors='coerce') 42 | min = pd.read_csv(f"./{sampling}_min.csv") 43 | min = min.apply(pd.to_numeric, errors='coerce') 44 | 45 | n_exp = avg['n_experiments'].values[1:] 46 | 47 | # Hypervolume. 48 | hypervol_max = max['hypervolume completed (%)'].values[1:] 49 | hypervol_min = min['hypervolume completed (%)'].values[1:] 50 | hypervol_avg = avg['hypervolume completed (%)'].values[1:] 51 | 52 | # Where hypervolume is 99% completed. 53 | try: 54 | hyper_complete_arg = np.argwhere(hypervol_avg > 99.0)[0] 55 | hyper_complete_y = [hypervol_avg[hyper_complete_arg]] 56 | hyper_complete_x = [n_exp[hyper_complete_arg]] 57 | except: 58 | P_complete_x = [] 59 | P_complete_y = [] 60 | 61 | # Distance pareto. 62 | dtradeoff_max = max['dmaximin_tradeoff'].values[1:] 63 | dtradeoff_min = min['dmaximin_tradeoff'].values[1:] 64 | dtradeoff_avg = avg['dmaximin_tradeoff'].values[1:] 65 | 66 | 67 | # Best samples at each run. 68 | bestP_max = max[f'{objective_1}_best'].values[1:] 69 | bestI1_max = max[f'{objective_2}_best'].values[1:] 70 | bestP_min = min[f'{objective_1}_best'].values[1:] 71 | bestI1_min = min[f'{objective_2}_best'].values[1:] 72 | bestP_avg = avg[f'{objective_1}_best'].values[1:] 73 | bestI1_avg = avg[f'{objective_2}_best'].values[1:] 74 | 75 | # Where best P is sampled. 76 | try: 77 | P_complete_arg = np.argwhere(bestP_max == best_P_in_scope)[0] 78 | P_complete_y = [bestP_max[P_complete_arg]] 79 | P_complete_x = [n_exp[P_complete_arg]] 80 | except: 81 | P_complete_x = [] 82 | P_complete_y = [] 83 | 84 | # Where best I1 is sampled. 85 | try: 86 | I1_complete_arg = np.argwhere(bestI1_min == best_I1_in_scope)[0] 87 | I1_complete_y = [bestI1_min[I1_complete_arg]] 88 | I1_complete_x = [n_exp[I1_complete_arg]] 89 | except: 90 | I1_complete_x = [] 91 | I1_complete_y = [] 92 | 93 | # Plot performance for each acquisition function. 94 | ax[0][0].plot(n_exp, hypervol_avg, color=colors[color_i], lw=2.5, label=sampling.upper()) 95 | ax[0][0].fill_between(x=n_exp, y1=hypervol_avg, y2=hypervol_max, color=colors[color_i], alpha=0.3, lw=0.) 96 | ax[0][0].fill_between(x=n_exp, y1=hypervol_min, y2=hypervol_avg, color=colors[color_i], alpha=0.3, lw=0.) 97 | ax[0][0].plot(n_exp, hypervol_min, color=colors[color_i], alpha=1., lw=1., ls='--') 98 | ax[0][0].plot(n_exp, hypervol_max, color=colors[color_i], alpha=1., lw=1., ls='--') 99 | # ax[0][0].plot(n_exp, np.ones_like(n_exp)*100, dashes=[8, 4], color='black', linewidth=0.8) 100 | ax[0][0].scatter(n_exp, hypervol_avg, marker='o', s=0., color=colors[color_i]) 101 | 102 | ax[0][0].set_xticks(np.arange(0, 120, 5)) 103 | ax[0][0].set_xlim(0, n_steps) 104 | 105 | # ax[0][0].set_ylim(40, 100) 106 | ax[0][0].set_xlabel('Samples') 107 | ax[0][0].set_ylabel('Hypervolume (%)') 108 | # plt.tick_params(axis="x", direction="in") 109 | # plt.tick_params(axis="y", direction="in") 110 | 111 | # Plot distance tradeoff. 112 | ax[0][1].plot(n_exp, dtradeoff_avg, color=colors[color_i], lw=2.5, label=sampling.upper()) 113 | ax[0][1].plot(n_exp, dtradeoff_min, color=colors[color_i], lw=1., ls='--', label=sampling.upper()) 114 | ax[0][1].plot(n_exp, dtradeoff_max, color=colors[color_i], lw=1., ls='--', label=sampling.upper()) 115 | ax[0][1].fill_between(x=n_exp, y1=dtradeoff_avg, y2=dtradeoff_max, color=colors[color_i], alpha=0.3) 116 | ax[0][1].fill_between(x=n_exp, y1=dtradeoff_min, y2=dtradeoff_avg, color=colors[color_i], alpha=0.3) 117 | # ax[0][1].plot(n_exp, np.ones_like(n_exp) * 0, dashes=[8, 4], color='black', linewidth=0.8) 118 | ax[0][1].scatter(n_exp, dtradeoff_avg, marker='o', s=0., color=colors[color_i]) 119 | 120 | ax[0][1].set_xticks(np.arange(0, 120, 5)) 121 | ax[0][1].set_xlim(0, n_steps) 122 | # ax[0][1].set_ylim(0, 80) 123 | ax[0][1].set_xlabel('Samples') 124 | ax[0][1].set_ylabel(r'$d_{(trade-off)}$') 125 | 126 | # Plot best P. 127 | ax[1][0].plot(n_exp, bestP_avg, color=colors[color_i], lw=2.5, label=sampling) 128 | ax[1][0].plot(n_exp, bestP_min, color=colors[color_i], lw=1, ls='--', label=sampling, alpha=1.) 129 | ax[1][0].plot(n_exp, bestP_max, color=colors[color_i], lw=1, ls='--', label=sampling, alpha=1.) 130 | # ax[1][0].fill_between(x=n_exp, y1=bestP_avg, y2=bestP_max, color=colors[color_i], alpha=0.3) 131 | # ax[1][0].fill_between(x=n_exp, y1=bestP_min, y2=bestP_avg, color=colors[color_i], alpha=0.3) 132 | 133 | # ax[1][0].plot(n_exp, np.ones_like(n_exp) * 0, 134 | # dashes=[8, 4], color='black', linewidth=0.8) 135 | ax[1][0].scatter(n_exp, bestP_avg, marker='o', s=0., 136 | color=colors[color_i]) 137 | 138 | ax[1][0].set_xticks(np.arange(0, 120, 5)) 139 | ax[1][0].set_xlim(0, n_steps) 140 | # ax[1][0].set_ylim(0.8, 1.1) 141 | ax[1][0].set_xlabel('Samples') 142 | ax[1][0].set_ylabel('Best P') 143 | 144 | # Plot best I1. 145 | ax[1][1].plot(n_exp, bestI1_avg, color=colors[color_i], lw=2.5, 146 | label=sampling.upper()) 147 | 148 | ax[1][1].plot(n_exp, bestI1_min, color=colors[color_i], lw=1.0, ls='--', 149 | label=sampling.upper()) 150 | ax[1][1].plot(n_exp, bestI1_max, color=colors[color_i], lw=1.0, ls='--', 151 | label=sampling.upper()) 152 | 153 | ax[1][1].fill_between(x=n_exp, 154 | y1=bestI1_avg, 155 | y2=bestI1_max, color=colors[color_i], alpha=0.3, 156 | ) 157 | ax[1][1].fill_between(x=n_exp, 158 | y1=bestI1_min, 159 | y2=bestI1_avg, color=colors[color_i], alpha=0.3, 160 | ) 161 | # ax[1][1].plot(n_exp, np.ones_like(n_exp) * 0, 162 | # dashes=[8, 4], color='black', linewidth=0.8) 163 | ax[1][1].scatter(n_exp, bestI1_avg, marker='o', s=0., 164 | color=colors[color_i]) 165 | 166 | 167 | ax[1][1].set_xticks(np.arange(0, 120, 5)) 168 | ax[1][1].set_xlim(0, n_steps) 169 | ax[1][1].set_ylim(0.0, 0.005) 170 | ax[1][1].set_xlabel('Samples') 171 | ax[1][1].set_ylabel('Best I1') 172 | 173 | color_i += 1 174 | 175 | ax[0][1].legend() 176 | plt.tight_layout() 177 | plt.savefig(f"figures/benchmark_sampling.svg") 178 | plt.show() 179 | 180 | 181 | -------------------------------------------------------------------------------- /examples/publication/Virtual-experimentation/performance/4_hypervol_sampling.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import pandas as pd 6 | import os 7 | 8 | 9 | # sns.set_style("ticks") 10 | # sns.set_context("paper") 11 | import matplotlib as mpl 12 | mpl.rcParams['grid.linestyle'] = ':' 13 | mpl.rcParams['grid.linewidth'] = 0.1 14 | 15 | objective_1 = 'P' 16 | objective_2 = 'I1' 17 | 18 | plt.rcParams['font.family'] = 'Helvetica' 19 | # mpl.rc('font', **{'family':'sans-serif', 'sans-serif':['HelveticaLight']}) 20 | 21 | # Best objectives. 22 | best_conversion_in_scope = 100. 23 | best_selectivity_in_scope = 100. 24 | 25 | n_experiments = 30 26 | feat_iter = 0 27 | 28 | if not os.path.exists('./figures'): 29 | os.mkdir('figures') 30 | 31 | fig, ax = plt.subplots(figsize=(7., 4.0), dpi=500, nrows=1, ncols=3) 32 | 33 | colors_sampling = ['#DC143C', '#0343DF', '#FAC205', '#15B01A'] 34 | 35 | alphas = [0.4, 0.6, 0.7, 1.0] 36 | i = -1 37 | for sampling_method in ['seed', 'lhs', 'cvtsampling']: 38 | 39 | i += 1 40 | j = -1 41 | for batch in [1, 2, 3, 5]: 42 | j += 1 43 | acq = 'EHVI' 44 | 45 | df_i = pd.read_csv(f'../results/results_benchmark_acq_{acq}_batch_{batch}_{sampling_method}.csv') 46 | df_i = df_i[df_i['n_experiments'] <= n_experiments] 47 | 48 | # Hypervolume. 49 | hypervol = df_i['hypervolume completed (%)'].values[:] 50 | 51 | # Plot performance for each acquisition function. 52 | n_exp = df_i['n_experiments'].values[:] 53 | 54 | ax[i].plot(n_exp, hypervol, color=colors_sampling[j], lw=2.5, 55 | label=f"{batch}", 56 | alpha=alphas[j]) 57 | 58 | ax[i].set_title(f"{sampling_method}") 59 | ax[i].set_xlabel('Samples') 60 | ax[i].set_ylabel('Hypervolume (%)') 61 | ax[i].set_ylim(80, 100) 62 | 63 | ax[i].legend() 64 | plt.tight_layout() 65 | plt.savefig(f"figures/benchmark_hypervol.svg") 66 | 67 | plt.show() 68 | 69 | -------------------------------------------------------------------------------- /examples/tutorials/2_EDBO_WebApp_Tutorial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doyle-lab-ucla/edboplus/f5d1687e834d33c77598767c5383a919f9a58034/examples/tutorials/2_EDBO_WebApp_Tutorial.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | botorch==0.5.0 2 | gpytorch==1.5.1 3 | idaes-pse==1.5.1 4 | ipykernel==6.5.1 5 | ipython==7.29.0 6 | ipywidgets==7.6.5 7 | Jinja2==3.0.3 8 | joypy==0.2.6 9 | lxml==4.6.4 10 | mordred==1.2.0 11 | numpy==1.21.5 12 | ordered-set==4.0.2 13 | pandas==1.3.4 14 | pareto==1.1.1.post3 15 | pymoo==0.5.0 16 | scikit-learn==1.0.1 17 | scipy==1.7.2 18 | seaborn 19 | matplotlib 20 | sympy==1.9 21 | torch==1.10.0 22 | tqdm -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='edbo', 5 | packages=['edbo'], 6 | version='0.2.0', 7 | author='Jose A. Garrido Torres & Abigail Gutmann Doyle', 8 | author_email='josegarridotorres@me.com', 9 | url='https://github.com/doyle-lab-ucla/edboplus', 10 | keywords=['Bayesian Optimization', 'Chemical Reaction Optimization'], 11 | license='MIT', 12 | description='Bayesian reaction optimization as a tool for chemical synthesis.', 13 | install_requires=[ 14 | 'botorch==0.5.0', 15 | 'gpytorch==1.5.1', 16 | 'idaes-pse==1.5.1', 17 | 'ipykernel==6.5.1', 18 | 'ipython==7.29.0', 19 | 'ipywidgets==7.6.5', 20 | 'Jinja2==3.0.3', 21 | 'joypy==0.2.6', 22 | 'lxml==4.6.4', 23 | 'mordred==1.2.0', 24 | 'numpy==1.21.5', 25 | 'ordered-set==4.0.2', 26 | 'pandas==1.3.4', 27 | 'pareto==1.1.1.post3', 28 | 'pymoo==0.5.0', 29 | 'scikit-learn==1.0.1', 30 | 'scipy==1.7.2', 31 | 'seaborn', 32 | 'matplotlib', 33 | 'sympy==1.9', 34 | 'torch==1.10.0', 35 | 'tqdm', 36 | ], 37 | classifiers=[ 38 | 'Development Status :: 3 - Alpha', 39 | 'Intended Audience :: Science/Research', 40 | 'Topic :: Scientific/Engineering :: Chemistry', 41 | 'License :: OSI Approved :: MIT License', 42 | 'Programming Language :: Python :: 3.8', 43 | ], 44 | ) --------------------------------------------------------------------------------