├── EDBOLogo.png
├── LICENSE
├── README.md
├── edbo
└── plus
│ ├── __init__.py
│ ├── benchmark
│ ├── __init__.py
│ └── multiobjective_benchmark.py
│ ├── model.py
│ ├── optimizer_botorch.py
│ ├── scope_generator.py
│ └── utils.py
├── examples
├── publication
│ ├── BMS_yield_cost
│ │ ├── 0_data_preprocessing.ipynb
│ │ ├── 1_preprocess_data.py
│ │ ├── 2_plot_ground_truth.py
│ │ ├── 3_run_edbo_cost_yield_performance.py
│ │ ├── 4_plot_performance_hypervol.py
│ │ ├── 5_plot_MAE_and_RMSE.py
│ │ ├── 6_distrib_plots.py
│ │ ├── 7_plot_scope_expansion.py
│ │ ├── 8_optimization_expanding_scope.py
│ │ ├── 9_optimization_constraints.py
│ │ └── data
│ │ │ ├── PCI_PMI_cost_full.csv
│ │ │ ├── PCI_PMI_cost_full_update.csv
│ │ │ ├── base_dft.csv
│ │ │ ├── clean_dft.csv
│ │ │ ├── experiments_yield_and_cost.csv
│ │ │ ├── ligand_dft.csv
│ │ │ └── solvent_dft.csv
│ ├── Crosscoupling
│ │ ├── 1_run_experiments.py
│ │ ├── campaigns
│ │ │ ├── 0_recalculate_predictions.py
│ │ │ ├── 1_analysis.py
│ │ │ ├── challenging_campaign_cvt
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round1.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round2.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round3.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round4.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round5.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round6.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round7.csv
│ │ │ │ ├── predictions_1.csv
│ │ │ │ ├── predictions_2.csv
│ │ │ │ ├── predictions_3.csv
│ │ │ │ ├── predictions_4.csv
│ │ │ │ ├── predictions_5.csv
│ │ │ │ ├── predictions_6.csv
│ │ │ │ └── predictions_7.csv
│ │ │ ├── challenging_campaign_random
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round1.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round2.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round3.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round4.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round5.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round6.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round7.csv
│ │ │ │ ├── predictions_1.csv
│ │ │ │ ├── predictions_2.csv
│ │ │ │ ├── predictions_3.csv
│ │ │ │ ├── predictions_4.csv
│ │ │ │ ├── predictions_5.csv
│ │ │ │ ├── predictions_6.csv
│ │ │ │ └── predictions_7.csv
│ │ │ ├── crosscoupling_results_challenging_campaign_cvt.csv
│ │ │ └── easy_campaign
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round0.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round1.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round2.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round3.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round4.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round5.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round6.csv
│ │ │ │ ├── edbo_crosscoupling_photoredox_yield_ee_round7.csv
│ │ │ │ ├── predictions_1.csv
│ │ │ │ ├── predictions_2.csv
│ │ │ │ ├── predictions_3.csv
│ │ │ │ ├── predictions_4.csv
│ │ │ │ ├── predictions_5.csv
│ │ │ │ ├── predictions_6.csv
│ │ │ │ └── predictions_7.csv
│ │ └── edbo_crosscoupling_photoredox_yield_ee.csv
│ ├── Suzuki
│ │ ├── 0_clean_dft.py
│ │ ├── 0_clean_mordred.py
│ │ ├── 1_run_ohe.py
│ │ ├── 2_run_dft.py
│ │ ├── 3_run_mordred.py
│ │ ├── 4_random_features.py
│ │ ├── data
│ │ │ ├── dataset_B1.csv
│ │ │ ├── dataset_B2.csv
│ │ │ ├── dataset_B2_DFT_clean.csv
│ │ │ ├── dataset_B3.csv
│ │ │ └── dataset_B3_Mordred_clean.csv
│ │ ├── performance
│ │ │ ├── 1_merge_all.py
│ │ │ ├── 2_plot_ground_truth.py
│ │ │ ├── 3_plot_decision_pathways_objectives.py
│ │ │ ├── 4_plot_performance.py
│ │ │ ├── 5_find_entry.py
│ │ │ └── 7_plot_performance_acquisition_function.py
│ │ └── performance_acq
│ │ │ ├── 1_merge_all.py
│ │ │ └── 2_plot_acq_batch.py
│ └── Virtual-experimentation
│ │ ├── 1_benchmark.py
│ │ ├── data
│ │ └── data.csv
│ │ └── performance
│ │ ├── 1_merge_all.py
│ │ ├── 2_plot_ground_truth.py
│ │ ├── 3_plot_performance_acquisition_function.py
│ │ └── 4_hypervol_sampling.py
└── tutorials
│ ├── 1_CLI_example.ipynb
│ └── 2_EDBO_WebApp_Tutorial.pdf
├── requirements.txt
├── setup.cfg
└── setup.py
/EDBOLogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doyle-lab-ucla/edboplus/f5d1687e834d33c77598767c5383a919f9a58034/EDBOLogo.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Jose A. Garrido Torres
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | #
3 |
4 | ## **EDBO+**. Bayesian reaction optimization as a tool for chemical synthesis
5 |
6 | WebApp: https://www.edbowebapp.com
7 |
8 | **Reference:** Garrido Torres, Jose A.; Lau, Sii Hong; Anchuri, Pranay; Stevens, Jason M.; Tabora, Jose E.; Li, Jun; Borovika, Alina; Adams, Ryan P.; Doyle, Abigail G. "A Multi-Objective Active Learning Platform and Web App for Reaction Optimization".
9 |
10 | **DOI:**
11 |
12 | 10.26434/chemrxiv-2022-cljcp
13 |
14 | 10.1021/jacs.2c08592
15 |
16 | **Links**:
17 | [ChemRxiv](https://chemrxiv.org/engage/chemrxiv/article-details/62f6966269f3a5df46b5584b),
18 | [JACS](https://pubs.acs.org/doi/full/10.1021/jacs.2c08592)
19 |
20 |
21 |
22 |
23 | ---
24 |
25 |
26 |
27 | ### Installation:
28 |
29 |
30 |
31 | (1) Create an Anaconda environment:
32 |
33 | ```
34 | conda create --name edbo_env python=3.8
35 | ```
36 |
37 | (2) Activate the conda environment:
38 |
39 | ```
40 | conda activate edbo_env
41 | ```
42 |
43 | (3) Install EDBO+ and its dependencies (run this from the repository root):
44 |
45 | ```
46 | pip install -e .
47 | ```
48 |
49 |
50 |
51 | ---
52 |
53 |
54 |
55 | #### **Note**: to run the notebook tutorials install JupyterLab
56 |
57 | ```
58 | conda install jupyterlab
59 | ```
60 |
--------------------------------------------------------------------------------
/edbo/plus/__init__.py:
--------------------------------------------------------------------------------
1 | from .optimizer_botorch import *
--------------------------------------------------------------------------------
/edbo/plus/benchmark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doyle-lab-ucla/edboplus/f5d1687e834d33c77598767c5383a919f9a58034/edbo/plus/benchmark/__init__.py
--------------------------------------------------------------------------------
/edbo/plus/model.py:
--------------------------------------------------------------------------------
1 |
2 | import torch
3 | import gpytorch
4 | from gpytorch.kernels import MaternKernel, ScaleKernel
5 | from gpytorch.priors import GammaPrior
6 | from gpytorch.constraints import GreaterThan
7 | import numpy as np
8 |
# Shared tensor settings: double precision, CPU device.
tkwargs = dict(
    dtype=torch.double,
    device=torch.device("cpu"),
)
13 |
def build_and_optimize_model(train_x, train_y, training_iter=1000, lr=0.1):
    """Build an exact GP regression model and fit its hyperparameters.

    The model uses a constant mean and a scaled Matern kernel with one
    lengthscale per input feature (ARD). Hyperparameters are fitted by
    minimising the negative exact marginal log likelihood with Adam.

    Parameters
    ----------
    train_x: torch.Tensor
        Training inputs; the size of the second dimension is used as the
        number of features.
    train_y: torch.Tensor
        Training targets. A trailing singleton dimension is squeezed away
        before the loss is evaluated.
    training_iter: int
        Number of Adam steps (default: 1000).
    lr: float
        Adam learning rate (default: 0.1).

    Returns
    -------
    tuple
        ``(model, likelihood)``, both switched to evaluation mode.
    """

    # Gamma prior parameters ("1" and "2") and initial values ("3") for the
    # lengthscale, output scale and noise, plus the lower bound on the noise.
    gp_options = {
        'ls_prior1': 2.0, 'ls_prior2': 0.2, 'ls_prior3': 5.0,
        'out_prior1': 5.0, 'out_prior2': 0.5, 'out_prior3': 8.0,
        'noise_prior1': 1.5, 'noise_prior2': 0.1, 'noise_prior3': 5.0,
        'noise_constraint': 1e-5,
    }

    n_features = np.shape(train_x)[1]

    class ExactGPModel(gpytorch.models.ExactGP):
        """Constant-mean GP with a scaled ARD Matern kernel."""

        def __init__(self, train_x, train_y, likelihood):
            super().__init__(train_x, train_y, likelihood)
            self.mean_module = gpytorch.means.ConstantMean()

            kernels = MaternKernel(
                ard_num_dims=n_features,
                lengthscale_prior=GammaPrior(gp_options['ls_prior1'],
                                             gp_options['ls_prior2'])
            )

            self.covar_module = ScaleKernel(
                kernels,
                outputscale_prior=GammaPrior(gp_options['out_prior1'],
                                             gp_options['out_prior2']))
            try:
                # Scalar initial value for the lengthscale.
                ls_init = gp_options['ls_prior3']
                self.covar_module.base_kernel.lengthscale = ls_init
            except Exception:
                # Best-effort fallback: pass an explicit per-feature vector
                # if the scalar assignment is rejected. ``Exception`` (not a
                # bare ``except``) so Ctrl-C / SystemExit still propagate.
                uniform = gp_options['ls_prior3']
                ls_init = torch.ones(n_features).to(**tkwargs) * uniform
                self.covar_module.base_kernel.lengthscale = ls_init

        def forward(self, x):
            mean_x = self.mean_module(x)
            covar_x = self.covar_module(x)
            return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

    # Initialize likelihood and model.
    likelihood = gpytorch.likelihoods.GaussianLikelihood(
        GammaPrior(gp_options['noise_prior1'], gp_options['noise_prior2'])
    )

    likelihood.noise = gp_options['noise_prior3']
    model = ExactGPModel(train_x, train_y, likelihood).to(**tkwargs)

    model.likelihood.noise_covar.register_constraint(
        "raw_noise", GreaterThan(gp_options['noise_constraint'])
    )

    model.train()
    likelihood.train()
    optimizer = torch.optim.Adam([
        {'params': model.parameters()},
    ], lr=lr)

    # "Loss" for GPs - the marginal log likelihood.
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # The target does not change between iterations: prepare it once.
    target = train_y.squeeze(-1).to(**tkwargs)

    for _ in range(training_iter):
        # Zero gradients from previous iteration.
        optimizer.zero_grad()
        # Output from model.
        output = model(train_x)
        # Calc loss and backprop gradients.
        loss = -mll(output, target)
        loss.backward()
        optimizer.step()

    model.eval()
    likelihood.eval()
    return model, likelihood  # Optimized model
90 |
91 |
--------------------------------------------------------------------------------
/edbo/plus/optimizer_botorch.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | import random
4 | import sys
5 | import warnings
6 |
7 | from botorch.acquisition.monte_carlo import qExpectedImprovement
8 | from botorch.acquisition.multi_objective.monte_carlo import \
9 | qExpectedHypervolumeImprovement, qNoisyExpectedHypervolumeImprovement
10 | from botorch.models import SingleTaskGP, ModelListGP
11 | from botorch.optim import optimize_acqf_discrete
12 | from botorch.sampling.samplers import SobolQMCNormalSampler, IIDNormalSampler
13 | from botorch.utils.multi_objective.box_decompositions import \
14 | NondominatedPartitioning
15 | from idaes.surrogate.pysmo.sampling import LatinHypercubeSampling, CVTSampling
16 | import numpy as np
17 | from ordered_set import OrderedSet
18 | import pandas as pd
19 | from scipy.stats import norm
20 | from sklearn.preprocessing import MinMaxScaler
21 | from scipy.spatial.distance import cdist
22 | import torch
23 |
24 | from .utils import EDBOStandardScaler
25 | from .model import build_and_optimize_model
26 | from .scope_generator import create_reaction_scope
27 |
# Shared tensor settings: double precision, CPU device.
tkwargs = dict(
    dtype=torch.double,
    device=torch.device("cpu"),
)
32 |
33 |
34 | class EDBOplus:
35 |
36 | def __init__(self):
37 |
38 | self.predicted_mean = []
39 | self.predicted_variance = []
40 |
@staticmethod
def generate_reaction_scope(components, directory='./', filename='reaction.csv',
                            check_overwrite=True):
    """
    Build a reaction scope from a dictionary of components and values.

    Delegates to ``create_reaction_scope``, reports progress on stdout and
    returns the resulting dataframe.
    """
    print("Generating a reaction scope...")
    scope = create_reaction_scope(
        components=components,
        directory=directory,
        filename=filename,
        check_overwrite=check_overwrite,
    )
    scope_df, n_reactions = scope
    print(f"The scope was generated and contains {n_reactions} possible reactions!")
    return scope_df
53 |
54 | @staticmethod
55 | def _init_sampling(df, batch, sampling_method, seed):
56 |
57 | np.random.seed(seed)
58 | random.seed(seed)
59 | numeric_cols = df._get_numeric_data().columns
60 | ohe_columns = list(OrderedSet(df.columns) - OrderedSet(numeric_cols))
61 | if len(ohe_columns) > 0:
62 | print(f"The following columns are categorical and will be encoded"
63 | f" using One-Hot-Encoding: {ohe_columns}")
64 | # Encode OHE.
65 | df_sampling = pd.get_dummies(df, prefix=ohe_columns,
66 | columns=ohe_columns, drop_first=True)
67 |
68 | class HiddenPrints:
69 | def __enter__(self):
70 | self._original_stdout = sys.stdout
71 | sys.stdout = open(os.devnull, 'w')
72 |
73 | def __exit__(self, exc_type, exc_val, exc_tb):
74 | sys.stdout.close()
75 | sys.stdout = self._original_stdout
76 |
77 | # Order df according to initial sampling method (random samples).
78 | with HiddenPrints():
79 | idaes = None
80 | if sampling_method == 'random':
81 | samples = df_sampling.sample(n=batch, random_state=seed)
82 | elif sampling_method.lower() == 'lhs':
83 | idaes = LatinHypercubeSampling(df_sampling, batch, sampling_type="selection")
84 | elif sampling_method.lower() == 'cvt':
85 | idaes = CVTSampling(df_sampling, batch, sampling_type="selection")
86 |
87 | if idaes is not None:
88 | samples = idaes.sample_points()
89 |
90 | # Sometimes the LHS or CVT sampling methods return less samples than requested. Add random samples in this case.
91 | additional_samples = None
92 | if len(samples) < batch:
93 | additional_samples = df.sample(n=batch-len(samples), random_state=seed, replace=True)
94 | additional_samples = additional_samples.reset_index(drop=True)
95 | # Add the additional samples to the samples dataframe. If some of the additional_samples are already in samples, generate new ones until the batch size is reached.
96 | extra_seed = 1
97 | while len(samples) < batch:
98 | samples = pd.concat([samples,additional_samples]).drop_duplicates(ignore_index=True)
99 | additional_samples = df.sample(n=batch-len(samples), random_state=seed+extra_seed, replace=True)
100 | extra_seed +=1
101 |
102 | # Get index of the best samples according to the random sampling method.
103 | df_sampling_matrix = df_sampling.to_numpy()
104 | priority_list = np.zeros_like(df_sampling.index)
105 |
106 | for sample in samples.to_numpy():
107 | d_i = cdist([sample], df_sampling_matrix, metric='cityblock')
108 | a = np.argmin(d_i)
109 | priority_list[a] = 1.
110 | df['priority'] = priority_list
111 |
112 | print(f"Generated {len(samples)} initial samples using {sampling_method} sampling (seed = {seed}). Run finished!")
113 |
114 | return df
115 |
116 |
def run(self,
        objectives, objective_mode, objective_thresholds=None,
        directory='.', filename='reaction.csv',
        columns_features='all',
        batch=5, init_sampling_method='cvt', seed=0,
        scaler_features=MinMaxScaler(),
        scaler_objectives=EDBOStandardScaler(),
        acquisition_function='NoisyEHVI',
        acquisition_function_sampler='SobolQMCNormalSampler'):

    """
    Run one EDBO+ iteration on the scope stored in ``directory/filename``.

    If the scope contains none of the objective columns, an initial batch is
    selected with *init_sampling_method*. Otherwise a surrogate model is
    trained on the rows with observations and the remaining rows are
    prioritised. In both cases the scope file is overwritten with the
    updated ``priority`` column.

    Parameters
    ----------
    objectives: list
        list of string containing the name for each objective.
        A single string is also accepted.
        Example:
            objectives = ['yield', 'cost', 'impurity']

    objective_mode: list
        list to select whether the objective should be maximized or minimized.
        A single string is also accepted.
        Examples:
            A) Example for single-objective optimization:
                objective_mode = ['max']
            B) Example for multi-objective optimization:
                objective_mode = ['max', 'min', 'min']

    objective_thresholds: list
        List of worst case values for each objective.
        Example:
            objective_threshold = [50.0, 10.0, 10.0]

    columns_features: list
        List containing the names of the columns to be included in the regression model. By default set to
        'all', which means the algorithm will automatically select all the columns that are not in
        the *objectives* list. The list passed by the caller is not modified.

    batch: int
        Number of experiments that you want to run in parallel. For instance *batch = 5* means that you
        will run 5 experiments in each EDBO+ run. You can change this number at any stage of the optimization,
        so don't worry if you change your mind after creating or initializing the reaction scope.

    directory: string
        name of the directory to save the results of the optimization.

    filename: string
        Name of the file to save a *csv* with the priority list. Once observations are available, a second
        file including the predictions (*pred_filename.csv*) is saved in the same directory. The
        predictions are also available through the *predicted_mean* and *predicted_variance*
        attributes of the EDBOPlus class.

    init_sampling_method: string:
        Method for selecting the first samples in the scope (in absence of observations). Choices are:
        - 'random' : Random seed (as implemented in Pandas).
        - 'lhs' : LatinHypercube sampling.
        - 'cvt' : CVT sampling.

    scaler_features: sklearn class
        sklearn.preprocessing class for transforming the features.
        Example:
            sklearn.preprocessing.MinMaxScaler()

    scaler_objectives: sklearn class
        sklearn.preprocessing class for transforming the objective values.
        Examples:
            - sklearn.preprocessing.StandardScaler()
        Default:
            EDBOStandardScaler()

    seed: int
        Seed for the random initialization.

    acquisition_function: string
        Options are: 'NoisyEHVI' or 'EHVI' (case-insensitive).

    acquisition_function_sampler: string
        Options are: 'SobolQMCNormalSampler' or 'IIDNormalSampler'.

    Returns
    -------
    pandas.DataFrame
        The scope including the ``priority`` column (and 'PENDING' objective
        columns for experiments without observations).

    """

    wdir = Path(directory)
    csv_filename = wdir.joinpath(filename)
    torch.manual_seed(seed=seed)
    np.random.seed(seed)
    self.acquisition_sampler = acquisition_function_sampler

    # 1. Safe checks.
    # Make sure objectives and modes are lists *before* they are iterated,
    # otherwise a single string would be processed character by character.
    if isinstance(objectives, str):
        objectives = [objectives]
    else:
        objectives = list(objectives)
    if isinstance(objective_mode, str):
        objective_mode = [objective_mode]
    else:
        objective_mode = list(objective_mode)
    self.objective_names = objectives

    # Objectives and the priority column are never features. A new list is
    # built so the caller's list is left untouched.
    use_all_features = isinstance(columns_features, str) and columns_features == 'all'
    if not use_all_features:
        not_features = set(objectives) | {'priority'}
        columns_features = [col for col in columns_features
                            if col not in not_features]

    # Check that the user's scope exists.
    msg = "Scope was not found. Please create a scope (csv file)."
    assert os.path.exists(csv_filename), msg

    # 2. Load reaction.
    df = pd.read_csv(f"{csv_filename}")
    df = df.dropna(axis='columns', how='all')
    original_df = df.copy(deep=True)  # Make a copy of the original data.

    # 2.1. Initialize sampling (only in the first iteration).
    obj_in_df = [obj for obj in objectives if obj in df.columns]
    missing_objectives = [obj for obj in objectives if obj not in df.columns]

    # Objectives that are not in the scope yet are added as PENDING.
    for obj_i in missing_objectives:
        original_df[obj_i] = 'PENDING'

    if not use_all_features:
        if 'priority' in df.columns:
            for obj_i in missing_objectives:
                df[obj_i] = 'PENDING'
            df = df[columns_features + objectives + ['priority']]
        elif not obj_in_df:
            df = df[columns_features]
        else:
            for obj_i in missing_objectives:
                df[obj_i] = 'PENDING'
            df = df[columns_features + objectives]
    elif obj_in_df:
        # A new objective was added to an existing campaign: it also has to
        # exist in the working dataframe, otherwise it cannot be selected below.
        for obj_i in missing_objectives:
            df[obj_i] = 'PENDING'

    # No objectives columns in the scope? Then random initialization.
    if not obj_in_df:
        print("There are no experimental observations yet. Random samples will be drawn.")
        df = self._init_sampling(df=df, batch=batch, seed=seed,
                                 sampling_method=init_sampling_method)
        original_df['priority'] = df['priority']

        # Sort values and save dataframe.
        original_df = original_df.sort_values('priority', ascending=False)
        original_df = original_df.loc[:, ~original_df.columns.str.contains('^Unnamed')]
        original_df.to_csv(csv_filename, index=False)
        return original_df

    if use_all_features:
        # Replace 'all' with the actual list of features (in column order)
        # for the printout.
        not_features = set(objectives) | {'priority'}
        columns_features = [col for col in df.columns
                            if col not in not_features]
    print(f"This run will optimize for the following objectives: {objectives}")
    print(f"The following features will be used: {columns_features}")

    # 3. Separate train and test data.

    # 3.1. Auto-detect dummy features (one-hot-encoding).
    numeric_cols = df._get_numeric_data().columns
    for nc in numeric_cols:
        df[nc] = pd.to_numeric(df[nc], downcast='float')
    not_encoded = set(numeric_cols) | set(objectives)
    ohe_columns = [col for col in df.columns if col not in not_encoded]

    if ohe_columns:
        print(f"The following columns are categorical and will be encoded"
              f" using One-Hot-Encoding: {ohe_columns}")

    data = pd.get_dummies(df, prefix=ohe_columns, columns=ohe_columns, drop_first=True)

    # 3.2. Any sample with a value 'PENDING' in any objective is a test.
    # After the encoding the objectives are the only columns that can still
    # hold text, so only those are inspected (once, column-wise).
    pending_mask = (
        data[objectives].astype(str)
        .apply(lambda col: col.str.contains('PENDING', case=False))
        .any(axis=1)
    )
    idx_test = data[pending_mask].index.values
    idx_train = data[~pending_mask].index.values

    # Data only contains featurized information (train and test).
    df_train_y = data.loc[idx_train][objectives]
    if 'priority' in data.columns.tolist():
        data = data.drop(columns=objectives + ['priority'])
    else:
        data = data.drop(columns=objectives)
    df_train_x = data.loc[idx_train]
    df_test_x = data.loc[idx_test]

    if len(df_train_x) == 0:
        msg = 'The scope was already generated, please ' \
              'insert at least one experimental observation ' \
              'value and then press run.'
        print(msg)
        return original_df

    # Run the BO process.
    priority_list = self._model_run(
        data=data,
        df_train_x=df_train_x,
        df_test_x=df_test_x,
        df_train_y=df_train_y,
        batch=batch,
        objective_mode=objective_mode,
        objective_thresholds=objective_thresholds,
        seed=seed,
        scaler_x=scaler_features,
        scaler_y=scaler_objectives,
        acquisition_function=acquisition_function
    )

    # Low priority to the samples that have been already collected.
    for idx in idx_train:
        priority_list[idx] = -1

    original_df['priority'] = priority_list

    # Sort by priority first, then by the remaining columns.
    cols_sort = ['priority'] + [col for col in original_df.columns
                                if col != 'priority']

    # Attach objectives predictions and expected improvement.
    cols_for_preds = []
    for idx_obj, name in enumerate(objectives):
        original_df[f"{name}_predicted_mean"] = self.predicted_mean[:, idx_obj]
        original_df[f"{name}_predicted_variance"] = self.predicted_variance[:, idx_obj]
        original_df[f"{name}_expected_improvement"] = self.ei[:, idx_obj]
        cols_for_preds.extend([f"{name}_predicted_mean",
                               f"{name}_predicted_variance",
                               f"{name}_expected_improvement"])

    original_df = original_df.sort_values(cols_sort, ascending=False)
    # Save extra df containing predictions, uncertainties and EI.
    original_df.to_csv(wdir.joinpath(f"pred_{filename}"), index=False)
    # Drop predictions, uncertainties and EI (row order is kept).
    original_df = original_df.drop(columns=cols_for_preds)
    original_df.to_csv(csv_filename, index=False)

    print("Run finished!")
    return original_df
354 |
def _model_run(self, data, df_train_x, df_test_x, df_train_y, batch,
               objective_mode, objective_thresholds, seed,
               scaler_x, scaler_y, acquisition_function):
    """
    Runs the surrogate machine learning model.
    Returns a priority list for a given scope (top priority to low priority).

    One GP is trained per objective on the scaled training data, the
    acquisition function is optimized over ``df_test_x`` and the rows of
    ``data`` closest to the selected candidates get priority 1 (0 otherwise).
    As a side effect ``self.predicted_mean``, ``self.predicted_variance``
    and ``self.ei`` are filled for every row of ``data``.

    Objectives whose mode is 'min' (case-insensitive) are negated so that
    every objective is maximised internally; any other mode is maximised.

    Raises
    ------
    ValueError
        If the number of modes does not match the number of objectives, or
        if the acquisition function or its sampler is unknown.
    """

    # Check number of objectives.
    n_objectives = len(df_train_y.columns.values)

    modes = [str(mode).lower() for mode in objective_mode]
    if len(modes) != n_objectives:
        raise ValueError(
            f"Got {len(modes)} objective modes for {n_objectives} objectives.")

    # Validate the options before the (expensive) model fit.
    acq_name = acquisition_function.lower()
    if acq_name not in ('ehvi', 'noisyehvi'):
        raise ValueError(
            f"Unknown acquisition function '{acquisition_function}'. "
            f"Options are: 'EHVI' or 'NoisyEHVI'.")
    sampler_classes = {
        'IIDNormalSampler': IIDNormalSampler,
        'SobolQMCNormalSampler': SobolQMCNormalSampler,
    }
    if self.acquisition_sampler not in sampler_classes:
        raise ValueError(
            f"Unknown sampler '{self.acquisition_sampler}'. "
            f"Options are: 'SobolQMCNormalSampler' or 'IIDNormalSampler'.")

    scaler_x.fit(df_train_x.to_numpy())
    init_train = scaler_x.transform(df_train_x.to_numpy())
    test_xnp = scaler_x.transform(df_test_x.to_numpy())
    # NOTE: tensors built from Python lists are created with torch's default
    # dtype before the cast to double; the conversions are kept as they were
    # so that numerical results do not change.
    test_x = torch.tensor(test_xnp.tolist()).double().to(**tkwargs)

    observed_raw_values = df_train_y.astype(float).to_numpy()  # not scaled.
    y = observed_raw_values.copy()
    for i in range(n_objectives):
        if modes[i] == 'min':
            y[:, i] = -y[:, i]
    y = scaler_y.fit_transform(y)

    print("Generating surrogate model...")
    train_x = torch.tensor(init_train).to(**tkwargs).double()
    individual_models = []
    for i in range(n_objectives):
        train_y = np.array(y)[:, i].reshape(-1, 1)
        train_y_i = torch.tensor(train_y.tolist()).to(**tkwargs).double()

        gp, likelihood = build_and_optimize_model(train_x=train_x, train_y=train_y_i,)

        model_i = SingleTaskGP(train_X=train_x, train_Y=train_y_i,
                               covar_module=gp.covar_module, likelihood=likelihood)
        individual_models.append(model_i)

    print("Model generated!")

    # Reference point is the minimum seen so far.
    ref_mins = np.min(y, axis=0)
    if objective_thresholds is None:
        ref_point = torch.tensor(ref_mins).double().to(**tkwargs)
    else:
        ref_point = np.zeros(n_objectives)
        for i in range(n_objectives):
            if objective_thresholds[i] is None:
                # Placeholder: overwritten with the scaled minimum below.
                ref_point[i] = ref_mins[i]
            else:
                ref_point[i] = objective_thresholds[i]
                if modes[i] == 'min':
                    ref_point[i] = -ref_point[i]
        # Scale.
        ref_point = scaler_y.transform(np.array([ref_point]))
        # Objectives without threshold use the (already scaled) minimum.
        for i in range(n_objectives):
            if objective_thresholds[i] is None:
                ref_point[0][i] = ref_mins[i]
        ref_point = torch.tensor(ref_point[0]).double().to(**tkwargs)

    all_samples = data.values
    n_samples = len(all_samples)

    # Fewer Monte Carlo samples for larger scopes.
    if n_samples > 100000:
        sobol_num_samples = 64
    elif n_samples > 50000:
        sobol_num_samples = 128
    elif n_samples > 10000:
        sobol_num_samples = 256
    else:
        sobol_num_samples = 512

    y_torch = torch.tensor(y).to(**tkwargs).double()

    sampler = sampler_classes[self.acquisition_sampler](
        num_samples=sobol_num_samples, collapse_batch_dims=True, seed=seed)

    print ("Optimizing acqusition function...")

    surrogate_model = None

    if acq_name == 'ehvi':

        partitioning = NondominatedPartitioning(
            ref_point=ref_point,
            Y=y_torch)

        surrogate_model = ModelListGP(*individual_models)
        individual_models = []  # empty to reduce memory

        EHVI = qExpectedHypervolumeImprovement(
            model=surrogate_model, sampler=sampler,
            ref_point=ref_point,  # use known reference point
            partitioning=partitioning
        )

        acq_result = optimize_acqf_discrete(
            acq_function=EHVI,
            choices=test_x,
            q=batch,
            unique=True
        )

    else:  # 'noisyehvi'
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            if n_objectives > 1:
                surrogate_model = ModelListGP(*individual_models)
                acq_fct = qNoisyExpectedHypervolumeImprovement(
                    model=surrogate_model, sampler=sampler,
                    ref_point=ref_point,
                    alpha=0.0,
                    incremental_nehvi=True, X_baseline=train_x, prune_baseline=True
                )
            else:
                # NOTE: NoisyEHVI fails in case of n_objectives = 1,
                # expected improvement is used in this case.
                surrogate_model = individual_models[0]
                best_value = y_torch.max()
                acq_fct = qExpectedImprovement(
                    model=surrogate_model,
                    best_f=best_value,
                    sampler=sampler
                )

            acq_result = optimize_acqf_discrete(
                acq_function=acq_fct,
                choices=test_x,
                q=batch,
                unique=True
            )

    best_samples = scaler_x.inverse_transform(acq_result[0].detach().cpu().numpy())

    print('Acquisition function optimized.')

    # Save rescaled predictions (only for first fantasy).

    # Get predictions in chunks of `chunk_size` rows.
    chunk_size = 1000

    self.predicted_mean = np.zeros(shape=(n_samples, n_objectives))
    self.predicted_variance = np.zeros(shape=(n_samples, n_objectives))
    self.ei = np.zeros(shape=(n_samples, n_objectives))

    for start in range(0, n_samples, chunk_size):
        stop = start + chunk_size
        vals = all_samples[start:stop]
        data_tensor = torch.tensor(scaler_x.transform(vals)).double().to(**tkwargs)
        preds = surrogate_model.posterior(X=data_tensor)
        mean_chunk = np.array(
            scaler_y.inverse_transform(preds.mean.detach().cpu().numpy()))
        var_chunk = np.array(
            scaler_y.inverse_transform_var(preds.variance.detach().cpu().numpy()))

        for j in range(n_objectives):
            minimizing = modes[j] == 'min'
            if minimizing:
                # The model was trained on the negated objective: flip the
                # prediction back *before* comparing it with the raw
                # observations in the expected improvement.
                mean_chunk[:, j] = -mean_chunk[:, j]
            self.ei[start:stop, j] = self.expected_improvement(
                train_y=observed_raw_values[:, j],
                mean=mean_chunk[:, j],
                variance=var_chunk[:, j],
                maximizing=not minimizing
            )

        self.predicted_mean[start:stop] = mean_chunk
        self.predicted_variance[start:stop] = var_chunk

    print('Predictions and expected improvement obtained.')

    priority_list = [0] * n_samples

    # Find best samples in data (closest row, Manhattan distance).
    for sample in best_samples:
        d_i = cdist([sample], all_samples, metric='cityblock')
        a = np.argmin(d_i)
        priority_list[a] = 1.

    return priority_list
542 |
def expected_improvement(self, train_y, mean, variance,
                         maximizing=False):
    """ expected_improvement
    Expected improvement acquisition function.
    Arguments:
    ----------
        mean: Numpy array.
            predicted mean of the Gaussian Process.
        variance: Numpy array.
            predicted variance of the Gaussian Process.
        train_y: Numpy array.
            Numpy array that contains the values of previously observed train targets.
        maximizing: Boolean.
            Boolean flag that indicates whether the loss function is to be maximised or minimised.
    Returns:
    --------
        Numpy array with the expected improvement of each prediction
        (0 where the predicted variance is zero).
    """

    mean = np.asarray(mean, dtype=float)
    # The formula needs the standard deviation, i.e. the square root of the
    # variance. Tiny negative variances (round-off) are clipped to zero.
    sigma = np.sqrt(np.clip(np.asarray(variance, dtype=float), 0.0, None))

    if maximizing:
        loss_optimum = np.max(train_y)
    else:
        loss_optimum = np.min(train_y)

    # Improvement over the best observation, positive when better.
    scaling_factor = 1.0 if maximizing else -1.0
    improvement = scaling_factor * (mean - loss_optimum)

    # In case sigma equals zero: x/0 ('divide') and 0/0 ('invalid').
    with np.errstate(divide='ignore', invalid='ignore'):
        Z = improvement / sigma
        expected_improvement = improvement * norm.cdf(Z) + sigma * norm.pdf(Z)

    return np.where(sigma == 0.0, 0.0, expected_improvement)
575 |
576 |
--------------------------------------------------------------------------------
/edbo/plus/scope_generator.py:
--------------------------------------------------------------------------------
1 |
2 | import itertools
3 | import pandas as pd
4 | import os
5 | from pathlib import Path
6 |
7 |
def create_reaction_scope(components, directory='./', filename='reaction.csv',
                          check_overwrite=True):

    """
    Reaction scope generator. Pass components dictionary, each
    dictionary key contains a list of the choices for a given component.
    The scope is the Cartesian product of all the choices.

    ----------------------------------------------------------------------
    Example:

    components = {'temperature': [30, 40, 50],
                  'solvent': ['THF', 'DMSO'],
                  'concentration': [0.1, 0.2, 0.3, 0.4, 0.5]}
    ----------------------------------------------------------------------

    ----------------------------------------------------------------------
    Arguments:
        components: dictionary mapping component name -> list of choices.
        directory: folder in which the CSV file is written.
        filename: name of the CSV file.
        check_overwrite: if True and the file already exists, the user is
            asked (via input()) whether it should be overwritten.
    ----------------------------------------------------------------------

    ----------------------------------------------------------------------
    Note:
        - The choices are written to the CSV file as given; no encoding
          is applied in this function.
    ----------------------------------------------------------------------

    ----------------------------------------------------------------------
    Returns:
        A tuple (df_scope, n_combinations): the dataframe with the entire
        set of choices (reaction scope), which is also saved to
        *{directory}/{filename}*, and the number of combinations.
        Returns None when the file exists and the user declines to
        overwrite it.
    ----------------------------------------------------------------------
    """

    msg = "You need to pass a dictionary for components. \n"
    assert isinstance(components, dict), msg

    wdir = Path(directory)
    csv_filename = wdir.joinpath(filename)

    # Ask to overwrite previous scope.
    if os.path.exists(csv_filename) and check_overwrite is True:
        overwrite = input('Scope already exists. Overwrite? Y = yes, N = no\n')
        if overwrite.lower() != 'y':
            return

    keys = list(components.keys())

    # Predict how large the scope will be: the product of the number of
    # choices per component (0 for an empty dictionary, as before). A
    # component without choices now correctly yields 0 instead of
    # resetting the running product.
    n_combinations = 1 if keys else 0
    for key in keys:
        n_combinations *= len(components[key])

    # Generate initial scope.
    values = (components[key] for key in keys)
    scope = [dict(zip(keys, combination)) for combination in
             itertools.product(*values)]

    # Passing the columns explicitly keeps the header consistent even when
    # the scope has no rows (otherwise to_csv fails on a header mismatch).
    df_scope = pd.DataFrame(scope, columns=keys or None)
    df_scope.to_csv(csv_filename, index=False, mode='w',
                    header=keys)

    return df_scope, n_combinations
66 |
--------------------------------------------------------------------------------
/edbo/plus/utils.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 |
4 |
class EDBOStandardScaler:
    """
    Custom standard scaler for EDBO.

    Standardizes every column to zero mean and unit standard deviation:
    z = (x - mu) / std. Columns with zero standard deviation use 1e-6
    instead, so the division is always defined.

    Attributes (set by fit / fit_transform):
        mu: per-column mean of the fitted data.
        std: per-column standard deviation of the fitted data.
    """
    def __init__(self):
        pass

    def _replace_zero_std(self):
        # Constant columns have std == 0; use a tiny value to avoid
        # dividing by zero.
        for obj in range(0, len(self.std)):
            if self.std[obj] == 0.0:
                self.std[obj] = 1e-6

    def fit(self, x):
        """Compute the per-column mean and standard deviation of x."""
        self.mu = np.mean(x, axis=0)
        self.std = np.std(x, axis=0)
        # Guard here as well, so inverse transforms called right after
        # fit() see the same std as after transform().
        self._replace_zero_std()

    def transform(self, x):
        """Standardize x with the fitted mean and standard deviation."""
        # Kept for the case where std was assigned without calling fit().
        self._replace_zero_std()
        return (x-[self.mu])/[self.std]

    def fit_transform(self, x):
        """Fit the scaler on x and return the standardized x."""
        self.fit(x)
        return self.transform(x)

    def inverse_transform(self, x):
        """Map standardized values back to the original scale."""
        return x * [self.std] + [self.mu]

    def inverse_transform_var(self, x):
        """Map variances of standardized values back to the original scale.

        Since y = z * std + mu, Var(y) = std ** 2 * Var(z); a variance is
        therefore rescaled by the squared standard deviation (not by std).
        """
        return x * np.square([self.std])
36 |
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/0_data_preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "bfc9fc54",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np\n",
11 | "import pandas as pd"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "id": "54a71f91",
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/plain": [
23 | "Index(['Ligand_inchi', 'Base_inchi', 'Solvent_inchi', 'Product_inchi',\n",
24 | " 'Electrophile_inchi', 'Nucleophile_inchi', 'Precatalyst_inchi', 'Base',\n",
25 | " 'Electrophile', 'Electrophile_PCI_Name', 'Ligand', 'Nucleophile',\n",
26 | " 'Nucleophile_PCI_Name', 'Precatalyst', 'Product', 'Solvent',\n",
27 | " 'Screen_ID', 'umol_Screen', 'Entry', 'Well', 'Row', 'Column',\n",
28 | " 'Base_Equiv', 'Electrophile_Equiv', 'Ligand_Equiv', 'Nucleophile_Equiv',\n",
29 | " 'Precatalyst_Equiv', 'Concentration', 'Time_h', 'Temp_C', 'SampleName',\n",
30 | " 'Vial', 'AP_ISO', 'AP_PDT', 'AP_STD', 'Mean_AP', 'Max_AP', 'SD_AP',\n",
31 | " 'Z_Score_AP', 'RelYield_PDT', 'Mean_RY', 'Max_RY', 'SD_RY',\n",
32 | " 'Z_Score_RY', 'Yield', 'Mean_Yield', 'Max_Yield', 'SD_Yield',\n",
33 | " 'Z_Score_Yield', 'Product_MW', 'Solvent_density', 'Solvent_mass',\n",
34 | " 'Product_mg', 'Base_Cost', 'Base_amt', 'Base_MW', 'Base_price.mol',\n",
35 | " 'Solvent_Cost', 'Solvent_amt', 'Solvent_MW', 'Solvent_price.mol',\n",
36 | " 'Ligand_Cost', 'Ligand_amt', 'Ligand_MW', 'Ligand_price.mol',\n",
37 | " 'Ligand_dol', 'Base_dol', 'Solvent_dol', 'reagent_cost',\n",
38 | " 'Nucleophile_MW', 'Electrophile_MW', 'Precatalyst_MW', 'Nucleophile_mg',\n",
39 | " 'Electrophile_mg', 'Precatalyst_mg', 'Ligand_mg', 'Base_mg', 'Total_mg',\n",
40 | " 'PMI', 'solvent mg', 'ligand_dol_will', 'base_dol_will',\n",
41 | " 'solvent_dol_will', 'total_cost_update'],\n",
42 | " dtype='object')"
43 | ]
44 | },
45 | "execution_count": 2,
46 | "metadata": {},
47 | "output_type": "execute_result"
48 | }
49 | ],
50 | "source": [
51 | "df = pd.read_csv('./data/PCI_PMI_cost_full_update.csv')\n",
52 | "df.columns"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "id": "d4d208a8",
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stdout",
63 | "output_type": "stream",
64 | "text": [
65 | "Ligand_inchi\n",
66 | "Base_inchi\n",
67 | "Solvent_inchi\n",
68 | "Base\n",
69 | "Ligand\n",
70 | "Solvent\n",
71 | "Screen_ID\n",
72 | "Entry\n",
73 | "Well\n",
74 | "Row\n",
75 | "Column\n",
76 | "Concentration\n",
77 | "Temp_C\n",
78 | "SampleName\n",
79 | "Vial\n",
80 | "AP_ISO\n",
81 | "AP_PDT\n",
82 | "AP_STD\n",
83 | "Mean_AP\n",
84 | "Max_AP\n",
85 | "SD_AP\n",
86 | "Z_Score_AP\n",
87 | "RelYield_PDT\n",
88 | "Mean_RY\n",
89 | "Max_RY\n",
90 | "SD_RY\n",
91 | "Z_Score_RY\n",
92 | "Yield\n",
93 | "Mean_Yield\n",
94 | "Max_Yield\n",
95 | "SD_Yield\n",
96 | "Z_Score_Yield\n",
97 | "Solvent_density\n",
98 | "Solvent_mass\n",
99 | "Product_mg\n",
100 | "Base_Cost\n",
101 | "Base_amt\n",
102 | "Base_MW\n",
103 | "Base_price.mol\n",
104 | "Solvent_Cost\n",
105 | "Solvent_amt\n",
106 | "Solvent_MW\n",
107 | "Solvent_price.mol\n",
108 | "Ligand_Cost\n",
109 | "Ligand_amt\n",
110 | "Ligand_MW\n",
111 | "Ligand_price.mol\n",
112 | "Ligand_dol\n",
113 | "Base_dol\n",
114 | "Solvent_dol\n",
115 | "reagent_cost\n",
116 | "Ligand_mg\n",
117 | "Base_mg\n",
118 | "Total_mg\n",
119 | "PMI\n",
120 | "solvent mg\n",
121 | "ligand_dol_will\n",
122 | "base_dol_will\n",
123 | "solvent_dol_will\n",
124 | "total_cost_update\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "for i in range(0, len(df.columns)):\n",
130 | " if len(np.unique(df[df.columns[i]])) > 1:\n",
131 | " print(df.columns[i])\n",
132 | "\n",
133 | "# np.unique(df['Concentration'].values)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 4,
139 | "id": "d4a6a824",
140 | "metadata": {},
141 | "outputs": [
142 | {
143 | "data": {
144 | "text/html": [
145 | "
\n",
146 | "\n",
159 | "
\n",
160 | " \n",
161 | " \n",
162 | " | \n",
163 | " base | \n",
164 | " ligand | \n",
165 | " solvent | \n",
166 | " concentration | \n",
167 | " temperature | \n",
168 | " yield | \n",
169 | " cost | \n",
170 | " PMI | \n",
171 | "
\n",
172 | " \n",
173 | " \n",
174 | " \n",
175 | " 0 | \n",
176 | " KOAc | \n",
177 | " BrettPhos | \n",
178 | " DMAc | \n",
179 | " 0.100 | \n",
180 | " 105 | \n",
181 | " 5.47 | \n",
182 | " 0.145775 | \n",
183 | " 917.668323 | \n",
184 | "
\n",
185 | " \n",
186 | " 1 | \n",
187 | " KOAc | \n",
188 | " PPhtBu2 | \n",
189 | " DMAc | \n",
190 | " 0.100 | \n",
191 | " 105 | \n",
192 | " 0.00 | \n",
193 | " 0.043201 | \n",
194 | " inf | \n",
195 | "
\n",
196 | " \n",
197 | " 2 | \n",
198 | " KOAc | \n",
199 | " tBPh-CPhos | \n",
200 | " DMAc | \n",
201 | " 0.100 | \n",
202 | " 105 | \n",
203 | " 78.95 | \n",
204 | " 0.269140 | \n",
205 | " 64.469151 | \n",
206 | "
\n",
207 | " \n",
208 | " 3 | \n",
209 | " KOAc | \n",
210 | " PCy3 HBF4 | \n",
211 | " DMAc | \n",
212 | " 0.100 | \n",
213 | " 105 | \n",
214 | " 7.26 | \n",
215 | " 0.032181 | \n",
216 | " 691.080949 | \n",
217 | "
\n",
218 | " \n",
219 | " 4 | \n",
220 | " KOAc | \n",
221 | " PPh3 | \n",
222 | " DMAc | \n",
223 | " 0.100 | \n",
224 | " 105 | \n",
225 | " 28.15 | \n",
226 | " 0.026373 | \n",
227 | " 178.881165 | \n",
228 | "
\n",
229 | " \n",
230 | " ... | \n",
231 | " ... | \n",
232 | " ... | \n",
233 | " ... | \n",
234 | " ... | \n",
235 | " ... | \n",
236 | " ... | \n",
237 | " ... | \n",
238 | " ... | \n",
239 | "
\n",
240 | " \n",
241 | " 1723 | \n",
242 | " CsOPiv | \n",
243 | " PPh2Me | \n",
244 | " p-Xylene | \n",
245 | " 0.153 | \n",
246 | " 120 | \n",
247 | " 1.60 | \n",
248 | " 0.110653 | \n",
249 | " 2091.688946 | \n",
250 | "
\n",
251 | " \n",
252 | " 1724 | \n",
253 | " CsOPiv | \n",
254 | " GorlosPhos HBF4 | \n",
255 | " p-Xylene | \n",
256 | " 0.153 | \n",
257 | " 120 | \n",
258 | " 8.39 | \n",
259 | " 0.121732 | \n",
260 | " 400.447659 | \n",
261 | "
\n",
262 | " \n",
263 | " 1725 | \n",
264 | " CsOPiv | \n",
265 | " JackiePhos | \n",
266 | " p-Xylene | \n",
267 | " 0.153 | \n",
268 | " 120 | \n",
269 | " 13.34 | \n",
270 | " 0.439356 | \n",
271 | " 252.868372 | \n",
272 | "
\n",
273 | " \n",
274 | " 1726 | \n",
275 | " CsOPiv | \n",
276 | " CgMe-PPh | \n",
277 | " p-Xylene | \n",
278 | " 0.153 | \n",
279 | " 120 | \n",
280 | " 19.13 | \n",
281 | " 0.141130 | \n",
282 | " 175.981223 | \n",
283 | "
\n",
284 | " \n",
285 | " 1727 | \n",
286 | " CsOPiv | \n",
287 | " PPhMe2 | \n",
288 | " p-Xylene | \n",
289 | " 0.153 | \n",
290 | " 120 | \n",
291 | " 0.00 | \n",
292 | " 0.111903 | \n",
293 | " inf | \n",
294 | "
\n",
295 | " \n",
296 | "
\n",
297 | "
1728 rows × 8 columns
\n",
298 | "
"
299 | ],
300 | "text/plain": [
301 | " base ligand solvent concentration temperature yield \\\n",
302 | "0 KOAc BrettPhos DMAc 0.100 105 5.47 \n",
303 | "1 KOAc PPhtBu2 DMAc 0.100 105 0.00 \n",
304 | "2 KOAc tBPh-CPhos DMAc 0.100 105 78.95 \n",
305 | "3 KOAc PCy3 HBF4 DMAc 0.100 105 7.26 \n",
306 | "4 KOAc PPh3 DMAc 0.100 105 28.15 \n",
307 | "... ... ... ... ... ... ... \n",
308 | "1723 CsOPiv PPh2Me p-Xylene 0.153 120 1.60 \n",
309 | "1724 CsOPiv GorlosPhos HBF4 p-Xylene 0.153 120 8.39 \n",
310 | "1725 CsOPiv JackiePhos p-Xylene 0.153 120 13.34 \n",
311 | "1726 CsOPiv CgMe-PPh p-Xylene 0.153 120 19.13 \n",
312 | "1727 CsOPiv PPhMe2 p-Xylene 0.153 120 0.00 \n",
313 | "\n",
314 | " cost PMI \n",
315 | "0 0.145775 917.668323 \n",
316 | "1 0.043201 inf \n",
317 | "2 0.269140 64.469151 \n",
318 | "3 0.032181 691.080949 \n",
319 | "4 0.026373 178.881165 \n",
320 | "... ... ... \n",
321 | "1723 0.110653 2091.688946 \n",
322 | "1724 0.121732 400.447659 \n",
323 | "1725 0.439356 252.868372 \n",
324 | "1726 0.141130 175.981223 \n",
325 | "1727 0.111903 inf \n",
326 | "\n",
327 | "[1728 rows x 8 columns]"
328 | ]
329 | },
330 | "execution_count": 4,
331 | "metadata": {},
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": [
336 | "df_sel = df[['Base', 'Ligand', 'Solvent', 'Concentration', 'Temp_C', 'Yield', 'total_cost_update', 'PMI']]\n",
337 | "\n",
338 | "df_all_exp_index = df_sel.rename(columns={'Base': 'base', 'Solvent': 'solvent',\n",
339 | " 'Ligand': 'ligand', 'Concentration': 'concentration',\n",
340 | " 'Temp_C': 'temperature', 'Yield': 'yield',\n",
341 | " 'total_cost_update': 'cost'\n",
342 | " })\n",
343 | "\n",
344 | "# df_all_exp_index.to_csv('./data/experiment_index.csv', index=False)\n",
345 | "df_all_exp_index"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 5,
351 | "id": "c81b98e3",
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "df_yield_cost = df_all_exp_index.drop(columns=['PMI'])\n",
356 | "# df_yield_cost['new_index'] = np.arange(0, len(df_yield_cost))\n",
357 | "df_yield_cost.to_csv('./data/experiments_yield_and_cost.csv')\n",
358 | "\n"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 6,
364 | "id": "3d0e53be",
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "df = pd.read_csv('./data/experiments_yield_and_cost.csv')\n",
369 | "df.rename(columns={'Unnamed: 0': 'new_index'}, inplace=True)\n",
370 | "df.to_csv('./data/experiments_yield_and_cost.csv', index=False)\n"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 18,
376 | "id": "5469a317",
377 | "metadata": {},
378 | "outputs": [],
379 | "source": []
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 12,
384 | "metadata": {
385 | "collapsed": false,
386 | "pycharm": {
387 | "name": "#%%\n"
388 | }
389 | },
390 | "outputs": [],
391 | "source": []
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": 12,
396 | "metadata": {
397 | "collapsed": false,
398 | "pycharm": {
399 | "name": "#%%\n"
400 | }
401 | },
402 | "outputs": [],
403 | "source": []
404 | }
405 | ],
406 | "metadata": {
407 | "kernelspec": {
408 | "display_name": "Python 3.7.5 ('edboplus')",
409 | "language": "python",
410 | "name": "python3"
411 | },
412 | "language_info": {
413 | "codemirror_mode": {
414 | "name": "ipython",
415 | "version": 3
416 | },
417 | "file_extension": ".py",
418 | "mimetype": "text/x-python",
419 | "name": "python",
420 | "nbconvert_exporter": "python",
421 | "pygments_lexer": "ipython3",
422 | "version": "3.7.5"
423 | },
424 | "vscode": {
425 | "interpreter": {
426 | "hash": "f6b50c482b94d49566f339c9bbaa80fe4f4c53d65f91d29ce8fa084769027490"
427 | }
428 | }
429 | },
430 | "nbformat": 4,
431 | "nbformat_minor": 5
432 | }
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/1_preprocess_data.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import pandas as pd
4 |
# Experiments with their measured objectives (yield and cost).
df_exp = pd.read_csv('./data/experiments_yield_and_cost.csv')


# Attach the DFT descriptors of every component. Each descriptor file
# identifies its entries in the column `<component>_file_name`, which is
# renamed so that it can be merged on the component column.
for i in ['base', 'ligand', 'solvent']:
    df_i = pd.read_csv(f"data/{i}_dft.csv")
    df_i.rename(columns={f"{i}_file_name": i}, inplace=True)
    df_exp = pd.merge(df_exp, df_i, on=i)

df_edbo = df_exp.copy(deep=True)
# Remove correlated features. The correlation is computed on the numeric
# columns only: the frame still holds the component names as strings and
# DataFrame.corr() raises on those in pandas >= 2.0.
corr_matrix = df_edbo.select_dtypes(include=np.number).corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find features with correlation greater than 0.95.
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop features
df_edbo.drop(to_drop, axis=1, inplace=True)

# Remove constant columns (a single unique value).
extra_columns_to_remove = [
    column for column in df_edbo.columns.values
    if len(np.unique(df_edbo[column].values)) <= 1
]
df_edbo.drop(extra_columns_to_remove, axis=1, inplace=True)

# Remove non numerical.
df_edbo_numeric = df_edbo.select_dtypes(include=np.number)
df_edbo_numeric.to_csv('./data/clean_dft.csv', index=False)
print(df_edbo_numeric)
35 |
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/2_plot_ground_truth.py:
--------------------------------------------------------------------------------
1 | import os.path
2 |
3 | import pandas as pd
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | import seaborn as sns
7 | sns.set_style("ticks")
8 | sns.despine()
9 | import matplotlib as mpl
10 | mpl.rcParams['grid.linestyle'] = ':'
11 | mpl.rcParams['grid.linewidth'] = 0.1
12 | plt.rcParams['font.family'] = 'Helvetica'
13 | plt.rcParams['font.size'] = 10
14 | import pareto
15 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto
16 | from pymoo.mcdm.high_tradeoff import HighTradeoffPoints
17 | from sklearn.preprocessing import MinMaxScaler
18 | import seaborn as sns
19 |
20 |
def get_pareto_points(objective_values):
    """Return the Pareto front of the ground truth objectives.

    NOTE: Assumes maximization of both objectives.

    Returns a tuple: the non-dominated points found by `pareto.eps_sort`
    (as a numpy array) and the result of `is_pareto` evaluated on the
    negated objectives (presumably a mask/index of the Pareto-optimal
    rows -- confirm against `is_pareto`).
    """
    front = pareto.eps_sort(tables=objective_values,
                            objectives=np.arange(2),
                            maximize_all=True)
    front_array = np.array(front)
    pareto_index = is_pareto(objectives=-objective_values)
    return front_array, pareto_index
29 |
30 |
def get_high_tradeoff_points(pareto_points):
    """ Pass a numpy array with the pareto points and returns a numpy
    array with the high tradeoff points.

    The points are min-max scaled before the search. If the search
    fails, an empty list is returned instead (best effort).
    """

    scaler_pareto = MinMaxScaler()
    pareto_scaled = scaler_pareto.fit_transform(pareto_points)
    try:
        tradeoff = HighTradeoffPoints()

        tradeoff_args = tradeoff.do(-pareto_scaled)  # Always minimizing.
        tradeoff_points = pareto_points[tradeoff_args]
    except Exception:
        # Catch ordinary failures only; a bare `except:` would also
        # swallow KeyboardInterrupt and SystemExit.
        tradeoff_points = []
    return tradeoff_points
46 |
47 |
df_exp = pd.read_csv('./data/experiments_yield_and_cost.csv')

# Cost is negated so that both objectives are maximized, as expected by
# get_pareto_points.
df_exp['cost'] = -df_exp['cost']
objective_vals = df_exp[['yield', 'cost']].values
pareto_points, idx_pareto = get_pareto_points(objective_vals)
high_tradeoff_points = get_high_tradeoff_points(pareto_points)

print(np.unique(df_exp['base'].values))

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))

# All experiments: color encodes the ligand, marker style the solvent.
sns.scatterplot(x=df_exp['cost'], y=df_exp['yield'],
                hue=df_exp['ligand'], s=80,
                lw=0.01, edgecolor='black',
                ax=ax, palette='Spectral',
                style=df_exp['solvent'],
                )
# Pareto front (column 0 is yield, column 1 is the negated cost).
sns.lineplot(x=pareto_points[:, 1], y=pareto_points[:, 0],
             linewidth=2, color='grey', ls='dotted', ax=ax)

# Create the output folder if needed (no check-then-create race).
os.makedirs('results_plots', exist_ok=True)
plt.savefig('./results_plots/dataset.svg', format='svg', dpi=500)
plt.show()
74 |
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/3_run_edbo_cost_yield_performance.py:
--------------------------------------------------------------------------------
1 |
2 | import shutil
3 | import pandas as pd
4 | import numpy as np
5 | import os
6 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark
7 | import os
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | import seaborn as sns
11 | import pandas as pd
12 | sns.set_style("darkgrid")
13 | sns.set_context("talk")
14 |
15 | # Benchmark filename
# Run one benchmark per (batch size, acquisition function, initial
# sampling method); configurations with results already stored in
# ./results are skipped.
for batch in [1, 2, 3, 5]:
    for acq_i in ['EHVI']:
        for sampling_method in ['seed', 'lhs', 'cvtsampling']:
            budget, acq, seed = 60, acq_i, 1
            sort_column = 'new_index'

            df_exp = pd.read_csv('./data/clean_dft.csv')

            objectives = ['yield', 'cost']
            objective_modes = ['max', 'min']
            objective_thresholds = [None, None]

            # Every column except the index and the objectives is a feature.
            columns_regression = df_exp.columns.drop(
                [sort_column, 'yield', 'cost']).tolist()
            print(f"Columns for regression: {columns_regression}")

            label_benchmark = f"benchmark_dft_acq_{acq}_batch_{batch}_seed_{seed}_init_sampling_{sampling_method}.csv"

            if os.path.exists(f"./results/{label_benchmark}"):
                continue

            # Files handled for each run, in the working directory.
            output_files = [label_benchmark,
                            f'pred_{label_benchmark}',
                            f'results_{label_benchmark}']

            # Remove previous files
            for stale_file in output_files:
                if os.path.exists(stale_file):
                    os.remove(stale_file)

            bench = Benchmark(df_ground=df_exp,
                              features_regression=columns_regression,
                              objective_names=objectives,
                              objective_modes=objective_modes,
                              objective_thresholds=objective_thresholds,
                              filename=label_benchmark,
                              filename_results=f'results_{label_benchmark}',
                              index_column=sort_column,
                              acquisition_function=acq)

            bench.run(steps=int(budget/batch), batch=batch, seed=seed,
                      plot_ground=False,
                      plot_predictions=False, plot_train=False,
                      init_method=sampling_method)

            # Move results.
            if not os.path.exists('results'):
                os.mkdir('results')
            for output_file in output_files:
                shutil.move(output_file, f'results/{output_file}')
68 |
69 |
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/4_plot_performance_hypervol.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import seaborn as sns
5 | import pandas as pd
6 | import os
7 |
8 |
9 | # sns.set_style("ticks")
10 | # sns.set_context("paper")
11 | import matplotlib as mpl
12 | mpl.rcParams['grid.linestyle'] = ':'
13 | mpl.rcParams['grid.linewidth'] = 0.1
14 |
plt.rcParams['font.family'] = 'Helvetica'

# Only results up to this number of experiments are plotted.
n_experiments = 60
acq = 'EHVI'

os.makedirs('results_plots', exist_ok=True)

# One panel per initial sampling method.
fig, ax = plt.subplots(figsize=(7., 2.5), dpi=500, nrows=1, ncols=3)

# One color/alpha per batch size.
colors_sampling = ['#DC143C', '#0343DF', '#FAC205', '#15B01A']
alphas = [0.4, 0.6, 0.7, 1.0]

for i, sampling_method in enumerate(['seed', 'lhs', 'cvtsampling']):
    for j, batch in enumerate([1, 2, 3, 5]):
        df_i = pd.read_csv(f'./results/results_benchmark_dft_acq_{acq}_batch_{batch}_seed_1_init_sampling_{sampling_method}.csv')
        df_i = df_i[df_i['n_experiments'] <= n_experiments]

        # Hypervolume.
        hypervol = df_i['hypervolume completed (%)'].values[:]

        # Plot performance for each batch size.
        n_exp = df_i['n_experiments'].values[:]

        ax[i].plot(n_exp, hypervol, color=colors_sampling[j], lw=2.5,
                   label=f"{batch}",
                   alpha=alphas[j])

    ax[i].set_title(f"{sampling_method}")
    ax[i].set_xlabel('Samples')
    ax[i].set_ylabel('Hypervolume (%)')
    ax[i].set_ylim(0, 100)

    # NOTE(review): per-panel legend assumed; the nesting of this call is
    # not recoverable from the flattened source -- confirm.
    ax[i].legend()

plt.tight_layout()
plt.savefig("results_plots/benchmark_hypervol.svg")

plt.show()
68 |
69 |
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/5_plot_MAE_and_RMSE.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import seaborn as sns
5 | import pareto
6 | from pymoo.mcdm.high_tradeoff import HighTradeoffPoints
7 | from sklearn.preprocessing import MinMaxScaler
8 |
9 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto
10 |
11 | sns.set_style("ticks")
12 | import matplotlib as mpl
13 | # mpl.rcParams['grid.linestyle'] = ':'
14 | # mpl.rcParams['grid.linewidth'] = 0.1
15 | plt.rcParams['font.family'] = 'Helvetica'
16 | import joypy
17 | from matplotlib import cm
18 |
19 | ##############
20 |
def get_pareto_points(objective_values):
    """Return the Pareto front of the ground truth objectives.

    NOTE: Assumes maximization of both objectives.

    Returns a tuple: the non-dominated points found by `pareto.eps_sort`
    (as a numpy array) and the result of `is_pareto` evaluated on the
    negated objectives (presumably a mask/index of the Pareto-optimal
    rows -- confirm against `is_pareto`).
    """
    front = pareto.eps_sort(tables=objective_values,
                            objectives=np.arange(2),
                            maximize_all=True)
    front_array = np.array(front)
    pareto_index = is_pareto(objectives=-objective_values)
    return front_array, pareto_index
29 |
def get_high_tradeoff_points(pareto_points):
    """ Pass a numpy array with the pareto points and returns a numpy
    array with the high tradeoff points.

    The points are min-max scaled before the search. If the search
    fails, an empty list is returned instead (best effort).
    """

    scaler_pareto = MinMaxScaler()
    pareto_scaled = scaler_pareto.fit_transform(pareto_points)
    try:
        tradeoff = HighTradeoffPoints()

        tradeoff_args = tradeoff.do(-pareto_scaled)  # Always minimizing.
        tradeoff_points = pareto_points[tradeoff_args]
    except Exception:
        # Catch ordinary failures only; a bare `except:` would also
        # swallow KeyboardInterrupt and SystemExit.
        tradeoff_points = []
    return tradeoff_points
45 |
46 |
df_exp = pd.read_csv('./data/experiments_yield_and_cost.csv')
# Cost is negated so that both objectives are maximized.
df_exp['cost'] = -df_exp['cost']

objective_vals = df_exp[['yield', 'cost']].values
pareto_points, idx_pareto = get_pareto_points(objective_vals)
high_tradeoff_points = get_high_tradeoff_points(pareto_points)

######


samplings = ['seed', 'lhs', 'cvtsampling']
batch_sizes = [1, 2, 3, 5]
max_number_experiments = 45
objective_1 = 'yield'
objective_2 = 'cost'

colors = ['blue', 'green', 'red']

# Collect the results of every (batch size, sampling) run, truncated to the
# same experiment budget. pd.concat replaces DataFrame.append (removed in
# pandas 2.0). The frame is no longer seeded with an untruncated copy of the
# first file, which gave that configuration extra rows in the final window.
frames = []
for i in batch_sizes:
    for j in samplings:
        df_i = pd.read_csv(f'./results/results_benchmark_dft_acq_EHVI_batch_{i}_seed_1_init_sampling_{j}.csv')
        frames.append(df_i[df_i['n_experiments'] <= max_number_experiments])
df_all = pd.concat(frames, ignore_index=True)

df_all.drop_duplicates(inplace=True)

# Rows within one experiment of the budget (batches do not all land
# exactly on max_number_experiments).
df_finish = df_all[(df_all['n_experiments'] < max_number_experiments+2) & (df_all['n_experiments'] > max_number_experiments-2)]

fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(14, 2.2))

# One panel per error metric; blue for yield, red for cost.
panels = [('MAE_yield', 'Blues'), ('MAE_cost', 'Reds'),
          ('RMSE_yield', 'Blues'), ('RMSE_cost', 'Reds')]
for axis, (metric, palette) in zip(ax, panels):
    sns.barplot(data=df_finish, x='init_method', y=metric,
                hue='batch', ax=axis, palette=palette,
                lw=0.7, edgecolor='black', ci=None)

# The layout must be adjusted before saving to affect the saved file.
plt.tight_layout()
plt.savefig('./results_plots/fig2c.svg', format='svg', dpi=500)
plt.show()
105 |
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/6_distrib_plots.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import seaborn as sns
5 | sns.set_style("ticks")
6 | import matplotlib as mpl
7 | # mpl.rcParams['grid.linestyle'] = ':'
8 | # mpl.rcParams['grid.linewidth'] = 0.1
9 | plt.rcParams['font.family'] = 'Helvetica'
10 | import joypy
11 | from matplotlib import cm
12 |
samplings = ['seed', 'lhs', 'cvtsampling']
objective_1 = 'yield'
objective_2 = 'cost'
max_num_experiments = 46

# Results of the batch-3 EHVI run for every initial sampling method, with
# steps shifted to start one higher and truncated to the experiment budget.
frames = []
for sampling in samplings:
    df_sampling = pd.read_csv(f'./results/results_benchmark_dft_acq_EHVI_batch_3_seed_1_init_sampling_{sampling}.csv')
    df_sampling['step'] += 1
    frames.append(df_sampling[df_sampling['n_experiments'] < max_num_experiments])

colormaps_obj_1 = [cm.Blues] * 3
colormaps_obj_2 = [cm.Reds] * 3

# One ridge plot per (sampling method, objective): the distribution of the
# collected objective values at each step. Frames are indexed directly
# instead of being looked up with eval().
for i, df_sampling in enumerate(frames):
    for objective, colormap, x_range in (
            (objective_1, colormaps_obj_1[i], (0, 100)),
            (objective_2, colormaps_obj_2[i], (0, 0.4))):
        column = f"{objective}_collected_values"
        # joyplot creates its own figure, so no plt.figure() is needed.
        fig, axes = joypy.joyplot(
            data=df_sampling[['step', column]],
            by='step',
            linecolor='black',
            linewidth=0.7,
            ylim='own',
            column=[column],
            colormap=colormap,
            legend=False,
            alpha=0.95,
            normalize=False,
            grid=False,
            figsize=(3, 3),
            x_range=x_range
        )
        plt.savefig(f'./results_plots/subplot_{samplings[i]}_{objective}.svg', format='svg', dpi=500)
        plt.show()
77 |
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/7_plot_scope_expansion.py:
--------------------------------------------------------------------------------
1 | import os.path
2 |
3 | import pandas as pd
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | import seaborn as sns
7 | sns.set_style("ticks")
8 | sns.despine()
9 | import matplotlib as mpl
10 | mpl.rcParams['grid.linestyle'] = ':'
11 | mpl.rcParams['grid.linewidth'] = 0.1
12 | plt.rcParams['font.family'] = 'Helvetica'
13 | plt.rcParams['font.size'] = 10
14 | import pareto
15 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto
16 | from pymoo.mcdm.high_tradeoff import HighTradeoffPoints
17 | from sklearn.preprocessing import MinMaxScaler
18 | import seaborn as sns
19 |
20 |
def get_pareto_points(objective_values):
    """Return the Pareto front of the ground truth objectives.

    NOTE: Assumes maximization of both objectives.

    Returns a tuple: the non-dominated points found by `pareto.eps_sort`
    (as a numpy array) and the result of `is_pareto` evaluated on the
    negated objectives (presumably a mask/index of the Pareto-optimal
    rows -- confirm against `is_pareto`).
    """
    front = pareto.eps_sort(tables=objective_values,
                            objectives=np.arange(2),
                            maximize_all=True)
    front_array = np.array(front)
    pareto_index = is_pareto(objectives=-objective_values)
    return front_array, pareto_index
29 |
30 |
def get_high_tradeoff_points(pareto_points):
    """ Pass a numpy array with the pareto points and returns a numpy
    array with the high tradeoff points.

    The points are min-max scaled before the search. If the search
    fails, an empty list is returned instead (best effort).
    """

    scaler_pareto = MinMaxScaler()
    pareto_scaled = scaler_pareto.fit_transform(pareto_points)
    try:
        tradeoff = HighTradeoffPoints()

        tradeoff_args = tradeoff.do(-pareto_scaled)  # Always minimizing.
        tradeoff_points = pareto_points[tradeoff_args]
    except Exception:
        # Catch ordinary failures only; a bare `except:` would also
        # swallow KeyboardInterrupt and SystemExit.
        tradeoff_points = []
    return tradeoff_points
46 |
47 |
def _plot_scope(df_scope, output_file):
    """Plot yield against negated cost for one scope and save it as SVG.

    Args:
        df_scope: DataFrame with at least the columns 'yield', 'cost',
            'ligand', 'base' and 'solvent'. It is not modified.
        output_file: path of the SVG file to write.
    """
    # Negate the cost so both objectives are maximized. ``assign`` returns
    # a new frame, so neither the caller's data nor a filtered slice is
    # written to.
    df_scope = df_scope.assign(cost=-df_scope['cost'])
    objective_vals = df_scope[['yield', 'cost']].values
    pareto_points, _ = get_pareto_points(objective_vals)

    print(np.unique(df_scope['base'].values))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
    sns.scatterplot(x=df_scope['cost'], y=df_scope['yield'],
                    hue=df_scope['ligand'], s=80,
                    lw=0.01, edgecolor='black',
                    ax=ax, palette='Spectral',
                    style=df_scope['solvent'],
                    )
    # Pareto front: column 1 is the (negated) cost, column 0 the yield.
    sns.lineplot(x=pareto_points[:, 1], y=pareto_points[:, 0],
                 linewidth=2, color='grey', ls='dotted', ax=ax)
    ax.set_xlim(-0.5, 0.02)
    ax.set_ylim(-10, 110)

    os.makedirs('results_plots', exist_ok=True)
    plt.savefig(output_file, format='svg', dpi=500)
    # plt.show()


# Full space.
df_exp = pd.read_csv('./data/experiments_yield_and_cost.csv')
_plot_scope(df_exp, './results_plots/dataset.svg')

# Reduced space: remove two ligands. Rows whose ligand is missing are
# dropped as well (na=True), matching the previous `== False` filter.
df_reduced = df_exp
for ligand in ("CgMe-PPh", "PPh3"):
    df_reduced = df_reduced[~df_reduced["ligand"].str.contains(ligand, na=True)]
_plot_scope(df_reduced, './results_plots/dataset_reduced.svg')
113 |
114 |
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/8_optimization_expanding_scope.py:
--------------------------------------------------------------------------------
1 | from edbo.plus.optimizer_botorch import EDBOplus
2 | import pandas as pd
3 | import numpy as np
4 | import seaborn as sns
5 | import matplotlib.pyplot as plt
6 |
df_lookup = pd.read_csv('./data/experiments_yield_and_cost.csv')
df_large = pd.read_csv('./data/experiments_yield_and_cost.csv')

# The two ligands below are left out of the small scope and added back when
# the scope is expanded. ``eq`` keeps rows with a missing ligand out of both
# subsets, exactly like the former `== False` / `== True` comparisons.
has_cgme = df_large["ligand"].str.contains("CgMe-PPh")
has_pph3 = df_large["ligand"].str.contains("PPh3")
in_small_scope = has_cgme.eq(False) & has_pph3.eq(False)
in_expansion = has_cgme.eq(True) | has_pph3.eq(True)

df_small = df_large[in_small_scope]

# References for plots.
ref_best_yield_small_scope = np.max(df_small['yield'])
ref_best_cost_small_scope = np.min(df_small['cost'])

ref_best_yield_large_scope = np.max(df_large['yield'])
ref_best_cost_large_scope = np.min(df_large['cost'])

df_small.to_csv('./data/small_scope_lookup.csv', index=False)
df_large.to_csv('./data/large_scope_lookup.csv', index=False)

# The scope files handed to the optimizer do not contain the objectives.
# ``drop`` without ``inplace`` avoids mutating a slice of ``df_large``.
df_small = df_small.drop(columns=['yield', 'cost'])
df_large = df_large.drop(columns=['yield', 'cost'])

df_small.to_csv('./small_scope.csv', index=False)
df_large.to_csv('./large_scope.csv', index=False)

# Expand scope: the entries that are missing from the small scope, marked
# as not yet measured.
df_expand = df_large[in_expansion].copy()
df_expand['priority'] = np.zeros(len(df_expand))
df_expand['yield'] = ['PENDING'] * len(df_expand)
df_expand['cost'] = ['PENDING'] * len(df_expand)

print('References:')
print('Small scope (best yield / best cost):', ref_best_yield_small_scope, ref_best_cost_small_scope)
print('Large scope (best yield / best cost):',ref_best_yield_large_scope, ref_best_cost_large_scope)

# Run optimization loops.
n_rounds_small = 6
n_round_large = 5
batch_size = 3
columns_regression = df_small.drop(columns=['new_index']).columns.tolist()

n_experiments = 0

track_results_dict = {
    'n_experiments': [],
    'best_yield': [],
    'best_cost': [],
    'max_ei_yield': [],
    'max_ei_cost': [],
    'max_uncertainty_yield': [],
    'max_uncertainty_cost': [],
    'avg_uncertainty_yield': [],
    'avg_uncertainty_cost': [],
}

collected_yields = []
collected_costs = []


def _run_rounds(scope_file, pred_file, n_rounds, n_experiments):
    """Run ``n_rounds`` optimization rounds on ``scope_file``.

    Each round calls ``EDBOplus().run`` on the scope file, fills the first
    ``batch_size`` rows of that file with the values looked up in
    ``df_lookup`` and, from the second round on, records statistics read
    from ``pred_file`` in ``track_results_dict``.

    Args:
        scope_file: CSV file with the search space (updated in place).
        pred_file: CSV file with the predictions for ``scope_file``.
        n_rounds: number of rounds to run.
        n_experiments: number of experiments counted so far.

    Returns:
        The updated number of experiments.
    """
    for round_idx in range(n_rounds):
        EDBOplus().run(
            filename=scope_file,  # Previously generated scope.
            objectives=['yield', 'cost'],  # Objectives to be optimized.
            objective_mode=['max', 'min'],  # Maximize yield but minimize cost.
            batch=batch_size,  # Number of experiments in parallel that we want to perform in this round.
            columns_features=columns_regression,  # features to be included in the model.
            init_sampling_method='cvtsampling'  # initialization method.
        )

        n_experiments += batch_size
        # Update with experimental values (observations).
        df_results = pd.read_csv(scope_file)
        arg_lookup = df_results.loc[0:batch_size - 1]['new_index'].values

        for row, lookup_idx in enumerate(arg_lookup):
            observed = df_lookup.loc[lookup_idx]
            df_results.at[row, 'yield'] = observed['yield']
            df_results.at[row, 'cost'] = observed['cost']
            collected_yields.append(observed['yield'])
            collected_costs.append(observed['cost'])

        df_results.to_csv(scope_file, index=False)

        if round_idx > 0:
            # Save all predicted values.
            df_pred = pd.read_csv(pred_file)
            round_stats = {
                'n_experiments': n_experiments,
                'best_yield': np.max(collected_yields),
                'best_cost': np.min(collected_costs),
                'max_ei_yield': np.max(df_pred['yield_expected_improvement']),
                'max_ei_cost': np.max(df_pred['cost_expected_improvement']),
                'max_uncertainty_yield': np.max(df_pred['yield_predicted_variance']),
                'max_uncertainty_cost': np.max(df_pred['cost_predicted_variance']),
                'avg_uncertainty_yield': np.average(df_pred['yield_predicted_variance']),
                'avg_uncertainty_cost': np.average(df_pred['cost_predicted_variance']),
            }
            # Every tracked list grows by exactly one entry per round, so
            # all of them keep the same length.
            for key, value in round_stats.items():
                track_results_dict[key].append(value)
    return n_experiments


def _plot_progress(axes, color, size, zorder):
    """Scatter the tracked metrics against the number of experiments."""
    panels = (
        (axes[0][0], 'max_ei_yield'),
        (axes[0][1], 'max_ei_cost'),
        (axes[1][0], 'best_yield'),
        (axes[1][1], 'best_cost'),
    )
    for panel, key in panels:
        sns.scatterplot(
            x=np.array(track_results_dict['n_experiments']),
            y=np.array(track_results_dict[key]),
            ax=panel, color=color, s=size, zorder=zorder,
        )


n_experiments = _run_rounds('small_scope.csv', 'pred_small_scope.csv',
                            n_rounds_small, n_experiments)

# Plot before expanding:
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(7, 7))
_plot_progress(ax, color='C1', size=100, zorder=100)

y_labels = (
    (ax[0][0], 'Max EI (yield)'),
    (ax[0][1], 'Max EI (cost)'),
    (ax[1][0], 'Highest yield found'),
    (ax[1][1], 'Lowest cost found'),
)
for panel, y_label in y_labels:
    panel.set_xlabel('Number of experiments')
    panel.set_ylabel(y_label)

# Expand scope:
df_small = pd.read_csv('small_scope.csv')
# DataFrame.append was removed in pandas 2.0; concat is the equivalent.
df_expand = pd.concat([df_expand, df_small])
df_expand.sort_values(by=['priority'], ascending=False, inplace=True)
df_expand.to_csv('expanded_scope.csv', index=False)

n_experiments -= batch_size

# Keep optimizing after expanding.
n_experiments = _run_rounds('expanded_scope.csv', 'pred_expanded_scope.csv',
                            n_round_large, n_experiments)

# Drawn below the pre-expansion points (lower zorder, slightly smaller).
_plot_progress(ax, color='C0', size=95, zorder=10)

plt.tight_layout()
plt.savefig('./results_plots/expand_scope.svg', format='svg')
plt.show()
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/9_optimization_constraints.py:
--------------------------------------------------------------------------------
1 |
2 | from edbo.plus.optimizer_botorch import EDBOplus
3 | import pandas as pd
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | import pareto
7 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto
8 | import torch
9 | from botorch.utils.multi_objective.hypervolume import Hypervolume
10 | import copy
11 | import numpy as np
12 | import matplotlib.pyplot as plt
13 | import pandas as pd
14 | import matplotlib as mpl
15 |
16 | # # Metrics.
17 | # def get_pareto_points(objective_values):
18 | # """ Get pareto for the ground truth function.
19 | # NOTE: Assumes maximization."""
20 | # pareto_ground = pareto.eps_sort(tables=objective_values,
21 | # objectives=np.arange(2),
22 | # maximize_all=True)
23 | # idx_pareto = is_pareto(objectives=-objective_values)
24 | # return np.array(pareto_ground), idx_pareto
25 |
26 | # def get_hypervolume(pareto_points, ref_mins):
27 | # """
28 | # Calculate hypervolume.
29 | # """
30 | # pareto_torch = torch.Tensor(pareto_points)
31 | # hv = Hypervolume(ref_point=torch.Tensor(ref_mins))
32 | # hypervolume = hv.compute(pareto_Y=pareto_torch)
33 | # return hypervolume
34 |
35 |
# Combinations of constraints tested in this example. Each entry lists the
# columns that remain constant after EDBO suggests the best sample using
# batch=1.
set_constraints = [
    ['ligand'],
    ['ligand', 'base'],
    ['solvent', 'concentration', 'temperature'],
]
43 |
44 | # df_results = pd.DataFrame(columns=['seed', 'constraints',
45 | # 'n_exp', 'hypervolume'])
46 |
47 | # for columns_to_constrain in set_constraints:
48 | # # Parameters.
49 | # batch_size = 5
50 | # # columns_to_constrain = ['solvent', 'concentration', 'temperature']
51 | # n_rounds = 7
52 | # n_seeds = 5
53 | # # Load lookup tables.
54 | # df_hte = pd.read_csv('./data/experiments_yield_and_cost.csv')
55 | # # Get targets for hypervolume indicator.
56 | # targets_hte = np.zeros((len(df_hte), 2))
57 | # targets_hte[:, 0] = df_hte['yield'].to_numpy()
58 | # targets_hte[:, 1] = -df_hte['cost'].to_numpy()
59 | # worst_targets = np.min(targets_hte, axis=0)
60 | # pareto_ref = get_pareto_points(objective_values=targets_hte)[0]
61 | # hypervolume_ref = get_hypervolume(pareto_points=pareto_ref, ref_mins=worst_targets)
62 |
63 | # # Get columns names for regression and search space.
64 | # columns_search_space = df_hte.drop(columns=['yield', 'cost']).columns.tolist()
65 | # columns_regression = df_hte.drop(columns=['new_index', 'yield', 'cost']).columns.tolist()
66 | # df_full_space = df_hte[columns_search_space]
67 |
68 | # # Initialize optimization campaign.
69 | # for seed in range(0, n_seeds):
70 | # n_exp = 0
71 | # df_full_space.to_csv('optimization.csv', index=False)
72 | # for round in range(0, n_rounds):
73 | # EDBOplus().run(
74 | # filename='optimization.csv',
75 | # seed=seed,
76 | # objectives=['yield', 'cost'],
77 | # objective_mode=['max', 'min'], # Maximize yield but minimize cost.
78 | # batch=1,
79 | # columns_features=columns_regression, # features to be included in the model.
80 | # init_sampling_method='cvtsampling' # initialization method.
81 | # )
82 |
83 | # df_opt = pd.read_csv('optimization.csv')
84 |
85 | # # Initial optimization to obtain the best sample in the entire search space.
86 | # best_suggested_sample = df_opt.loc[0]
87 | # df_reduced_space = df_opt.copy()
88 | # for col in columns_to_constrain:
89 | # df_reduced_space = df_reduced_space[df_reduced_space[col] == best_suggested_sample[col]]
90 |
91 | # df_reduced_space.drop(columns=['yield', 'cost', 'priority'], inplace=True)
92 | # df_reduced_space.to_csv('optimization_reduced.csv', index=False)
93 |
94 | # EDBOplus().run(
95 | # filename='optimization_reduced.csv', # Previously generated scope.
96 | # objectives=['yield', 'cost'], # Objectives to be optimized.
97 | # objective_mode=['max', 'min'], # Maximize yield but minimize cost.
98 | # batch=batch_size,
99 | # seed=seed,
100 | # columns_features=columns_regression, # features to be included in the model.
101 | # init_sampling_method='cvtsampling' # initialization method.
102 | # )
103 |
104 | # df_opt_reduced = pd.read_csv('optimization_reduced.csv')
105 |
106 | # idx_best_samples = df_opt_reduced['new_index'].values.tolist()[:batch_size]
107 | # print('Index best samples:', idx_best_samples)
108 | # df_opt = df_opt.sort_values(by='new_index')
109 | # df_opt.reset_index(inplace=True)
110 | # df_opt.drop(columns=['index'], inplace=True)
111 |
112 | # for a in range(len(idx_best_samples)):
113 | # df_opt.at[idx_best_samples[a],'yield'] = df_hte.loc[idx_best_samples[a]]['yield']
114 | # df_opt.at[idx_best_samples[a],'cost'] = df_hte.loc[idx_best_samples[a]]['cost']
115 | # df_opt.at[idx_best_samples[a],'priority'] = 1
116 |
117 | # df_opt = df_opt.sort_values(by='priority', ascending=False)
118 | # df_opt.to_csv('optimization.csv', index=False)
119 |
120 | # # Monitoring hypervolume.
121 | # df_train = df_opt[df_opt['yield'] != 'PENDING']
122 | # df_train['yield'] = copy.deepcopy(pd.to_numeric(df_train['yield']))
123 | # df_train['cost'] = copy.deepcopy(pd.to_numeric(df_train['cost']))
124 |
125 | # targets_train = np.zeros((len(df_train), 2))
126 | # targets_train[:, 0] = df_train['yield'].to_numpy()
127 | # targets_train[:, 1] = -df_train['cost'].to_numpy()
128 | # pareto_train = get_pareto_points(objective_values=targets_train)[0]
129 | # hypervolume_train = get_hypervolume(pareto_points=pareto_train,
130 | # ref_mins=worst_targets)
131 | # hypervolume_explored = (hypervolume_train/hypervolume_ref) * 100
132 |
133 | # n_exp += batch_size
134 | # print(f"Number of samples: {n_exp}")
135 | # print(f"Hypervolume: {hypervolume_explored}")
136 |
137 | # dict_results = {'seed': seed,
138 | # 'constraints': columns_to_constrain,
139 | # 'n_exp': n_exp,
140 | # 'hypervolume': hypervolume_explored}
141 | # df_results = df_results.append(dict_results, ignore_index=True)
142 | # df_results.to_csv('constraint_optimization_results.csv')
143 |
144 |
# Plot results: explored hypervolume (%) against the number of experiments,
# one curve per constraint set, with a min/max band around the average.
df_results = pd.read_csv('constraint_optimization_results.csv')
colors = ['#0343DF', '#FAC205', '#DC143C']

mpl.rcParams['grid.linestyle'] = ':'
mpl.rcParams['grid.linewidth'] = 0.1
plt.rcParams['font.family'] = 'Helvetica'

fig, ax = plt.subplots(figsize=(4., 4.0), dpi=500, nrows=1, ncols=1)

for constraints, color in zip(set_constraints, colors):
    # The 'constraints' column is compared against the string form of the
    # list, e.g. "['ligand', 'base']".
    label = str(constraints)
    df_constraint = df_results[df_results['constraints'] == label]

    # Average, max and min hypervolume explored at each step. Only the
    # 'hypervolume' column is aggregated, so the string 'constraints'
    # column is never passed to a numeric reduction.
    stats = df_constraint.groupby('n_exp')['hypervolume'].agg(['mean', 'max', 'min'])

    # Group index = sorted n_exp values of this constraint set, so x and y
    # always have matching lengths.
    n_exp = stats.index.to_numpy()
    hypervol_avg = stats['mean'].to_numpy()
    hypervol_max = stats['max'].to_numpy()
    hypervol_min = stats['min'].to_numpy()

    ax.plot(n_exp, hypervol_avg, color=color, lw=2.5,
            label=label)
    ax.fill_between(x=n_exp,
                    y1=hypervol_avg,
                    y2=hypervol_max, color=color, alpha=0.3, lw=0.)
    ax.fill_between(x=n_exp,
                    y1=hypervol_min,
                    y2=hypervol_avg, color=color, alpha=0.3, lw=0.)
    ax.plot(n_exp, hypervol_min, color=color, alpha=1., lw=1., ls='--')
    ax.plot(n_exp, hypervol_max, color=color, alpha=1., lw=1., ls='--')
    # Reference line at 100 % of the hypervolume.
    ax.plot(n_exp, np.ones_like(n_exp)*100,
            dashes=[8, 4], color='black', linewidth=0.8)
    ax.scatter(n_exp, hypervol_avg, marker='o', s=0., color=color)

ax.set_xticks(np.arange(0, 120, 10))
# The x axis ends at the second-to-last n_exp value of the last curve.
ax.set_xlim(0, np.max(n_exp[:-1]))
ax.set_ylim(0, 100)
ax.set_xlabel('Number of experiments')
ax.set_ylabel('Hypervolume (%)')
plt.legend()
plt.savefig('./results_plots/optimization_constraints.svg', format='svg')
196 |
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/data/base_dft.csv:
--------------------------------------------------------------------------------
1 | base_file_name,base_SMILES,base_stoichiometry,base_number_of_atoms,base_charge,base_multiplicity,base_convergence_criteria,base_dipole,base_molar_mass,base_molar_volume,base_electronic_spatial_extent,base_homo_energy,base_lumo_energy,base_electronegativity,base_hardness,base_electrophilicity,base_E_scf,base_zero_point_correction,base_E_thermal_correction,base_H_thermal_correction,base_G_thermal_correction,base_E_zpe,base_E,base_H,base_G,base_ES_root_dipole,base_ES_root_molar_volume,base_ES_root_electronic_spatial_extent,base_ES1_transition,base_ES1_osc_strength,base_ES1_,base_ES2_transition,base_ES2_osc_strength,base_ES2_,base_ES3_transition,base_ES3_osc_strength,base_ES3_,base_ES4_transition,base_ES4_osc_strength,base_ES4_,base_ES5_transition,base_ES5_osc_strength,base_ES5_,base_ES6_transition,base_ES6_osc_strength,base_ES6_,base_ES7_transition,base_ES7_osc_strength,base_ES7_,base_ES8_transition,base_ES8_osc_strength,base_ES8_,base_ES9_transition,base_ES9_osc_strength,base_ES9_,base_ES10_transition,base_ES10_osc_strength,base_ES10_,base_atom1_atom,base_atom1_Mulliken_charge,base_atom1_APT_charge,base_atom1_NPA_charge,base_atom1_NPA_core,base_atom1_NPA_valence,base_atom1_NPA_Rydberg,base_atom1_NPA_total,base_atom1_NMR_shift,base_atom1_NMR_anisotropy,base_atom1_ES_root_Mulliken_charge,base_atom1_ES_root_NPA_charge,base_atom1_ES_root_NPA_core,base_atom1_ES_root_NPA_valence,base_atom1_ES_root_NPA_Rydberg,base_atom1_ES_root_NPA_total,base_atom2_atom,base_atom2_Mulliken_charge,base_atom2_APT_charge,base_atom2_NPA_charge,base_atom2_NPA_core,base_atom2_NPA_valence,base_atom2_NPA_Rydberg,base_atom2_NPA_total,base_atom2_NMR_shift,base_atom2_NMR_anisotropy,base_atom2_ES_root_Mulliken_charge,base_atom2_ES_root_NPA_charge,base_atom2_ES_root_NPA_core,base_atom2_ES_root_NPA_valence,base_atom2_ES_root_NPA_Rydberg,base_atom2_ES_root_NPA_total,base_atom3_atom,base_atom3_Mulliken_charge,base_atom3_APT_charge,base_atom3_NPA_charge,base_atom3_NPA_core,base_atom3_NPA_valence,base_at
om3_NPA_Rydberg,base_atom3_NPA_total,base_atom3_NMR_shift,base_atom3_NMR_anisotropy,base_atom3_ES_root_Mulliken_charge,base_atom3_ES_root_NPA_charge,base_atom3_ES_root_NPA_core,base_atom3_ES_root_NPA_valence,base_atom3_ES_root_NPA_Rydberg,base_atom3_ES_root_NPA_total,base_c_min_atom_number,base_c_min_atom,base_c_min_atom=O,base_c_min_Mulliken_charge,base_c_min_APT_charge,base_c_min_NPA_charge,base_c_min_NPA_core,base_c_min_NPA_valence,base_c_min_NPA_Rydberg,base_c_min_NPA_total,base_c_min_NMR_shift,base_c_min_NMR_anisotropy,base_c_min_ES_root_Mulliken_charge,base_c_min_ES_root_NPA_charge,base_c_min_ES_root_NPA_core,base_c_min_ES_root_NPA_valence,base_c_min_ES_root_NPA_Rydberg,base_c_min_ES_root_NPA_total,base_c_min+1_atom_number,base_c_min+1_atom,base_c_min+1_atom=O,base_c_min+1_Mulliken_charge,base_c_min+1_APT_charge,base_c_min+1_NPA_charge,base_c_min+1_NPA_core,base_c_min+1_NPA_valence,base_c_min+1_NPA_Rydberg,base_c_min+1_NPA_total,base_c_min+1_NMR_shift,base_c_min+1_NMR_anisotropy,base_c_min+1_ES_root_Mulliken_charge,base_c_min+1_ES_root_NPA_charge,base_c_min+1_ES_root_NPA_core,base_c_min+1_ES_root_NPA_valence,base_c_min+1_ES_root_NPA_Rydberg,base_c_min+1_ES_root_NPA_total,base_c_max_atom_number,base_c_max_atom,base_c_max_atom=Cs,base_c_max_atom=K,base_c_max_Mulliken_charge,base_c_max_APT_charge,base_c_max_NPA_charge,base_c_max_NPA_core,base_c_max_NPA_valence,base_c_max_NPA_Rydberg,base_c_max_NPA_total,base_c_max_NMR_shift,base_c_max_NMR_anisotropy,base_c_max_ES_root_Mulliken_charge,base_c_max_ES_root_NPA_charge,base_c_max_ES_root_NPA_core,base_c_max_ES_root_NPA_valence,base_c_max_ES_root_NPA_Rydberg,base_c_max_ES_root_NPA_total,base_c_max-1_atom_number,base_c_max-1_atom,base_c_max-1_atom=C,base_c_max-1_Mulliken_charge,base_c_max-1_APT_charge,base_c_max-1_NPA_charge,base_c_max-1_NPA_core,base_c_max-1_NPA_valence,base_c_max-1_NPA_Rydberg,base_c_max-1_NPA_total,base_c_max-1_NMR_shift,base_c_max-1_NMR_anisotropy,base_c_max-1_ES_root_Mulliken_charge,base_c_max-1_ES_
root_NPA_charge,base_c_max-1_ES_root_NPA_core,base_c_max-1_ES_root_NPA_valence,base_c_max-1_ES_root_NPA_Rydberg,base_c_max-1_ES_root_NPA_total,base_vib_1_vibration,base_vib_1_standard_vibration,base_vib_1_correlation,base_vib_1_frequency,base_vib_1_reduced_mass,base_vib_1_frc_const,base_vib_1_IR_intensity,base_vib_1_dip_strength,base_vib_1_rot_strength,base_vib_1_E-M_angle,base_vib_1_standard_frequency,base_vib_1_standard_reduced_mass,base_vib_1_standard_frc_const,base_vib_1_standard_IR_intensity,base_vib_1_standard_dip_strength,base_vib_1_standard_rot_strength,base_vib_1_standard_E-M_angle,base_vib_2_vibration,base_vib_2_standard_vibration,base_vib_2_correlation,base_vib_2_frequency,base_vib_2_reduced_mass,base_vib_2_frc_const,base_vib_2_IR_intensity,base_vib_2_dip_strength,base_vib_2_rot_strength,base_vib_2_E-M_angle,base_vib_2_standard_frequency,base_vib_2_standard_reduced_mass,base_vib_2_standard_frc_const,base_vib_2_standard_IR_intensity,base_vib_2_standard_dip_strength,base_vib_2_standard_rot_strength,base_vib_2_standard_E-M_angle,base_vib_3_vibration,base_vib_3_standard_vibration,base_vib_3_correlation,base_vib_3_frequency,base_vib_3_reduced_mass,base_vib_3_frc_const,base_vib_3_IR_intensity,base_vib_3_dip_strength,base_vib_3_rot_strength,base_vib_3_E-M_angle,base_vib_3_standard_frequency,base_vib_3_standard_reduced_mass,base_vib_3_standard_frc_const,base_vib_3_standard_IR_intensity,base_vib_3_standard_dip_strength,base_vib_3_standard_rot_strength,base_vib_3_standard_E-M_angle,base_vib_4_vibration,base_vib_4_standard_vibration,base_vib_4_correlation,base_vib_4_frequency,base_vib_4_reduced_mass,base_vib_4_frc_const,base_vib_4_IR_intensity,base_vib_4_dip_strength,base_vib_4_rot_strength,base_vib_4_E-M_angle,base_vib_4_standard_frequency,base_vib_4_standard_reduced_mass,base_vib_4_standard_frc_const,base_vib_4_standard_IR_intensity,base_vib_4_standard_dip_strength,base_vib_4_standard_rot_strength,base_vib_4_standard_E-M_angle,base_vib_5_vibration,base_vib_5_stand
ard_vibration,base_vib_5_correlation,base_vib_5_frequency,base_vib_5_reduced_mass,base_vib_5_frc_const,base_vib_5_IR_intensity,base_vib_5_dip_strength,base_vib_5_rot_strength,base_vib_5_E-M_angle,base_vib_5_standard_frequency,base_vib_5_standard_reduced_mass,base_vib_5_standard_frc_const,base_vib_5_standard_IR_intensity,base_vib_5_standard_dip_strength,base_vib_5_standard_rot_strength,base_vib_5_standard_E-M_angle,base_vib_6_vibration,base_vib_6_standard_vibration,base_vib_6_correlation,base_vib_6_frequency,base_vib_6_reduced_mass,base_vib_6_frc_const,base_vib_6_IR_intensity,base_vib_6_dip_strength,base_vib_6_rot_strength,base_vib_6_E-M_angle,base_vib_6_standard_frequency,base_vib_6_standard_reduced_mass,base_vib_6_standard_frc_const,base_vib_6_standard_IR_intensity,base_vib_6_standard_dip_strength,base_vib_6_standard_rot_strength,base_vib_6_standard_E-M_angle,base_vib_7_vibration,base_vib_7_standard_vibration,base_vib_7_correlation,base_vib_7_frequency,base_vib_7_reduced_mass,base_vib_7_frc_const,base_vib_7_IR_intensity,base_vib_7_dip_strength,base_vib_7_rot_strength,base_vib_7_E-M_angle,base_vib_7_standard_frequency,base_vib_7_standard_reduced_mass,base_vib_7_standard_frc_const,base_vib_7_standard_IR_intensity,base_vib_7_standard_dip_strength,base_vib_7_standard_rot_strength,base_vib_7_standard_E-M_angle,base_vib_8_vibration,base_vib_8_standard_vibration,base_vib_8_correlation,base_vib_8_frequency,base_vib_8_reduced_mass,base_vib_8_frc_const,base_vib_8_IR_intensity,base_vib_8_dip_strength,base_vib_8_rot_strength,base_vib_8_E-M_angle,base_vib_8_standard_frequency,base_vib_8_standard_reduced_mass,base_vib_8_standard_frc_const,base_vib_8_standard_IR_intensity,base_vib_8_standard_dip_strength,base_vib_8_standard_rot_strength,base_vib_8_standard_E-M_angle,base_atom1_%VBur,base_atom2_%VBur,base_atom3_%VBur,base_c_min_%VBur,base_c_min+1_%VBur,base_c_max_%VBur,base_c_max-1_%VBur
2 | CsOAc,O=C([O-])C.[Cs+],C2H3CsO2,8,0,1,met,10.1478,191.9499,406.01,876.2473,-0.16646,-0.05377,0.11011499999999999,0.056345,0.056345,-248.406264045,0.049939,0.056406,0.05735,0.015986,-248.356325,-248.349858,-248.348914,-248.390278,8.2546,1380.434,909.9558,498.95,0.0037,0.000,432.49,0.1696,0.000,429.03,0.0000,0.000,324.62,0.0000,0.000,312.85,0.0122,0.000,308.18,0.0425,0.000,299.13,0.0286,0.000,290.79,0.0138,0.000,290.15,0.0014,0.000,281.64,0.0007,0.000,O1,-0.630819,-0.948476,-0.82325,1.99975,6.80840,0.01510,8.82325,37.0240,325.6241,-0.367329,-0.45721,1.99977,6.44316,0.01428,8.45721,O2,-0.627311,-0.929421,-0.81310,1.99975,6.79835,0.01500,8.81310,12.2909,315.3062,-0.370284,-0.45523,1.99977,6.44127,0.01419,8.45523,Cs3,0.845450,0.946019,0.93442,53.99587,0.05739,0.01232,54.06558,77.7746,4.3447,0.102280,0.12092,54.00014,0.75463,0.12431,54.87908,1,O,1,-0.630819,-0.948476,-0.82325,1.99975,6.80840,0.01510,8.82325,37.0240,325.6241,-0.367329,-0.45721,1.99977,6.44316,0.01428,8.45721,3,O,1,-0.627311,-0.929421,-0.81310,1.99975,6.79835,0.01500,8.81310,12.2909,315.3062,-0.370284,-0.45523,1.99977,6.44127,0.01419,8.45523,5,Cs,1,0,0.845450,0.946019,0.93442,53.99587,0.05739,0.01232,54.06558,77.7746,4.3447,0.102280,0.12092,54.00014,0.75463,0.12431,54.87908,2,C,1,0.503244,1.052880,0.79419,1.99945,3.15120,0.05516,5.20581,23.9298,101.6692,0.597363,0.80166,1.99946,3.14820,0.05067,5.19834,6,6,1.,618.7872,2.451,0.5529,7.6617,49.3959,0.0056,89.9964,618.7872,2.451,0.5529,7.6617,49.3959,0.0056,89.9964,7,7,1.,646.437,6.0553,1.4909,18.1353,111.9193,-0.0052,90.08,646.437,6.0553,1.4909,18.1353,111.9193,-0.0052,90.08,8,8,1.0000000000000002,916.3218,7.2493,3.5863,16.7246,72.8142,0.0002,89.9953,916.3218,7.2493,3.5863,16.7246,72.8142,0.0002,89.9953,9,9,1.0000000000000002,1027.1507,1.4579,0.9063,6.8743,26.6993,0.0013,89.997,1027.1507,1.4579,0.9063,6.8743,26.6993,0.0013,89.997,10,10,1.,1063.8484,1.8673,1.2451,6.1805,23.1767,-0.0021,90.0022,1063.8484,1.8673,1.2451,6.1805,23.1767,-0.0021,90.0022,11,11,1.00
00000000000002,1377.1441,1.4156,1.5818,25.4311,73.6705,0.0003,89.9992,1377.1441,1.4156,1.5818,25.4311,73.6705,0.0003,89.9992,12,12,1.,1428.8053,5.1995,6.254,180.9276,505.1719,0.0002,89.9995,1428.8053,5.1995,6.254,180.9276,505.1719,0.0002,89.9995,15,15,0.9999999999999999,1677.2563,8.6498,14.3368,430.3053,1023.4925,0.,90.,1677.2563,8.6498,14.3368,430.3053,1023.4925,0.,90.,0.4621717912637346,0.4583096314481602,0.4355558216819856,0.4621717912637346,0.4583096314481602,0.4355558216819856,0.5387263854375617
3 | CsOPiv,O=C([O-])C(C)(C)C.[Cs+],C5H9CsO2,17,0,1,met,11.0112,234.0303,1542.838,1831.5572,-0.17387,-0.05538,0.114625,0.059245,0.059245,-366.348293098,0.135133,0.145329,0.146273,0.096898,-366.21316,-366.202964,-366.20202,-366.251395,8.9444,1218.03,1885.5803,469.32,0.0031,0.000,434.42,0.1679,0.000,409.02,0.0000,0.000,311.54,0.0012,0.000,307.21,0.0401,0.000,301.54,0.0076,0.000,297.70,0.0363,0.000,281.05,0.0009,0.000,280.56,0.0077,0.000,275.55,0.0018,0.000,O1,-0.626594,-0.907646,-0.80886,1.99975,6.79434,0.01477,8.80886,36.4140,345.8849,-0.363461,-0.43354,1.99977,6.41981,0.01396,8.43354,O2,-0.640045,-0.928433,-0.82255,1.99975,6.80799,0.01481,8.82255,56.9329,339.6416,-0.390175,-0.47421,1.99977,6.46010,0.01434,8.47421,Cs3,0.848316,0.958216,0.93820,53.99606,0.05620,0.00954,54.06180,77.7495,5.0811,0.075126,0.10338,53.99940,0.78274,0.11448,54.89662,3,O,1,-0.640045,-0.928433,-0.82255,1.99975,6.80799,0.01481,8.82255,56.9329,339.6416,-0.390175,-0.47421,1.99977,6.46010,0.01434,8.47421,1,O,1,-0.626594,-0.907646,-0.80886,1.99975,6.79434,0.01477,8.80886,36.4140,345.8849,-0.363461,-0.43354,1.99977,6.41981,0.01396,8.43354,8,Cs,1,0,0.848316,0.958216,0.93820,53.99606,0.05620,0.00954,54.06180,77.7495,5.0811,0.075126,0.10338,53.99940,0.78274,0.11448,54.89662,2,C,1,0.513009,0.941821,0.81589,1.99929,3.12499,0.05983,5.18411,16.0679,110.6588,0.611765,0.82753,1.99928,3.11710,0.05610,5.17247,15,6,0.9959074339135122,794.1344,6.9049,2.5656,5.4827,27.5427,0.0024,89.9968,618.7872,2.451,0.5529,7.6617,49.3959,0.0056,89.9964,14,7,0.9816918876799305,588.297,3.5036,0.7144,10.1352,68.7297,0.0015,89.864,646.437,6.0553,1.4909,18.1353,111.9193,-0.0052,90.08,17,8,-0.9966831868163546,888.0265,3.386,1.5732,30.4925,136.9854,0.009,89.8991,916.3218,7.2493,3.5863,16.7246,72.8142,0.0002,89.9953,22,9,0.9443745773600293,1059.5097,1.4077,0.9311,0.7714,2.9047,-0.0667,90.3188,1027.1507,1.4579,0.9063,6.8743,26.6993,0.0013,89.997,23,10,-0.9999999970969979,1243.8906,2.7116,2.4719,4.866,15.6061,-0.0039,90.0381,1063.8484,1.
8673,1.2451,6.1805,23.1767,-0.0021,90.0022,29,11,-0.999939389883036,1456.51,1.5233,1.904,45.7368,125.2737,0.0018,89.9899,1377.1441,1.4156,1.5818,25.4311,73.6705,0.0003,89.9992,26,12,0.9995646393653587,1395.7311,2.9236,3.3556,87.7367,250.7766,-0.0087,90.0441,1428.8053,5.1995,6.254,180.9276,505.1719,0.0002,89.9995,36,15,0.9971974621484632,1661.3087,11.2165,18.2392,378.225,908.2539,-0.0008,90.0001,1677.2563,8.6498,14.3368,430.3053,1023.4925,0.,90.,0.5230981108350049,0.539564683692105,0.43534624711834974,0.539564683692105,0.5230981108350049,0.43534624711834974,0.664291488278794
4 | KOAc,O=C([O-])C.[K+],C2H3KO2,8,0,1,met,7.1686,98.1428,545.524,641.7535,-0.19743,-0.04683,0.12212999999999999,0.0753,0.0753,-828.445320274,0.050381,0.056594,0.057539,0.018561,-828.394939,-828.388726,-828.387782,-828.426759,8.5245,606.138,685.0531,363.32,0.0031,0.000,327.15,0.0000,0.000,321.58,0.1037,0.000,261.01,0.0000,0.000,251.07,0.0078,0.000,244.35,0.0107,0.000,241.63,0.0001,0.000,239.66,0.0025,0.000,234.54,0.0124,0.000,232.06,0.0018,0.000,O1,-0.627463,-0.943989,-0.82067,1.99980,6.81192,0.00895,8.82067,30.0644,310.1108,-0.363511,-0.43458,1.99983,6.41968,0.01507,8.43458,O2,-0.627534,-0.927130,-0.81465,1.99980,6.79875,0.01610,8.81465,6.9248,303.5098,-0.370326,-0.43362,1.99983,6.41864,0.01515,8.43362,K3,0.725768,0.862762,0.91455,17.99508,0.06358,0.02679,18.08545,1286.0105,40.8048,0.001831,0.05032,18.00078,0.85275,0.09615,18.94968,1,O,1,-0.627463,-0.943989,-0.82067,1.99980,6.81192,0.00895,8.82067,30.0644,310.1108,-0.363511,-0.43458,1.99983,6.41968,0.01507,8.43458,3,O,1,-0.627534,-0.927130,-0.81465,1.99980,6.79875,0.01610,8.81465,6.9248,303.5098,-0.370326,-0.43362,1.99983,6.41864,0.01515,8.43362,5,K,0,1,0.725768,0.862762,0.91455,17.99508,0.06358,0.02679,18.08545,1286.0105,40.8048,0.001831,0.05032,18.00078,0.85275,0.09615,18.94968,2,C,1,0.582472,1.091107,0.78874,1.99958,3.15451,0.05718,5.21126,19.4377,106.5248,0.659809,0.78899,1.99960,3.15956,0.05185,5.21101,6,6,0.9999890370004872,616.9335,2.5135,0.5637,7.6394,49.4004,-0.0238,90.0144,618.7872,2.451,0.5529,7.6617,49.3959,0.0056,89.9964,7,7,0.9987724746696157,659.7903,6.1973,1.5895,29.1201,176.0738,0.0207,89.6665,646.437,6.0553,1.4909,18.1353,111.9193,-0.0052,90.08,8,8,-0.9998826675540469,925.2324,7.2377,3.6505,12.1374,52.3339,-0.0009,90.0186,916.3218,7.2493,3.5863,16.7246,72.8142,0.0002,89.9953,9,9,0.9999989809690724,1031.7256,1.4605,0.916,7.6401,29.5422,-0.0049,90.009,1027.1507,1.4579,0.9063,6.8743,26.6993,0.0013,89.997,10,10,0.9999999998768896,1069.0842,1.8415,1.2401,5.4846,20.4663,0.0076,89.9904,1063.8484,1.8673,1
.2451,6.1805,23.1767,-0.0021,90.0022,11,11,0.9999990327735793,1383.254,1.412,1.5918,21.8732,63.0837,-0.0014,90.0059,1377.1441,1.4156,1.5818,25.4311,73.6705,0.0003,89.9992,12,12,0.9997085947517004,1439.3398,4.9649,6.0602,223.0332,618.178,-0.0014,90.0038,1428.8053,5.1995,6.254,180.9276,505.1719,0.0002,89.9995,15,15,-0.9997468732478542,1644.7032,7.706,12.2816,409.2433,992.662,0.0008,89.9999,1677.2563,8.6498,14.3368,430.3053,1023.4925,0.,90.,0.5728870393102002,0.5692344540582618,0.8437172539744319,0.5728870393102002,0.5692344540582618,0.8437172539744319,0.6196521062243645
5 | KOPiv,O=C([O-])C(C)(C)C.[K+],C5H9KO2,17,0,1,met,7.7731,140.2232,1394.117,1500.593,-0.2019,-0.04847,0.125185,0.076715,0.076715,-946.387800763,0.135595,0.145534,0.146478,0.099461,-946.252206,-946.242267,-946.241322,-946.28834,8.9776,909.207,1563.3278,355.25,0.0024,0.000,330.15,0.1106,0.000,321.67,0.0000,0.000,256.20,0.0000,0.000,248.54,0.0117,0.000,247.09,0.0045,0.000,240.58,0.0065,0.000,236.56,0.0018,0.000,233.63,0.0109,0.000,229.18,0.0018,0.000,O1,-0.640102,-0.902338,-0.81130,1.99979,6.79620,0.01531,8.81130,27.8484,331.9497,-0.379796,-0.42010,1.99983,6.40559,0.01469,8.42010,O2,-0.649551,-0.924868,-0.82499,1.99979,6.81009,0.01510,8.82499,46.0592,327.5974,-0.397948,-0.45369,1.99983,6.43914,0.01472,8.45369,K3,0.731425,0.873667,0.91906,17.99520,0.06186,0.02389,18.08094,1287.9448,39.1010,-0.016351,0.04662,18.00055,0.86198,0.09086,18.95338,3,O,1,-0.649551,-0.924868,-0.82499,1.99979,6.81009,0.01510,8.82499,46.0592,327.5974,-0.397948,-0.45369,1.99983,6.43914,0.01472,8.45369,1,O,1,-0.640102,-0.902338,-0.81130,1.99979,6.79620,0.01531,8.81130,27.8484,331.9497,-0.379796,-0.42010,1.99983,6.40559,0.01469,8.42010,8,K,0,1,0.731425,0.873667,0.91906,17.99520,0.06186,0.02389,18.08094,1287.9448,39.1010,-0.016351,0.04662,18.00055,0.86198,0.09086,18.95338,2,C,1,0.614239,0.982954,0.81254,1.99942,3.12952,0.05852,5.18746,11.3401,114.6019,0.697336,0.81335,1.99941,3.12961,0.05763,5.18665,15,6,0.9985454480831925,791.4536,6.8855,2.5412,5.0505,25.4575,0.0014,89.9982,618.7872,2.451,0.5529,7.6617,49.3959,0.0056,89.9964,14,7,-0.9599107742386958,595.6456,3.6165,0.756,15.9905,107.098,0.0007,89.7512,646.437,6.0553,1.4909,18.1353,111.9193,-0.0052,90.08,17,8,0.9954281640750704,893.1298,3.5163,1.6526,27.9244,124.7315,0.0022,89.9738,916.3218,7.2493,3.5863,16.7246,72.8142,0.0002,89.9953,22,9,-0.9145365457535707,1060.0869,1.4056,0.9307,0.9054,3.4074,-0.0275,90.1271,1027.1507,1.4579,0.9063,6.8743,26.6993,0.0013,89.997,23,10,-0.9999999942993534,1242.9392,2.7008,2.4583,3.9264,12.6023,-0.0019,90.0937,1063.8
484,1.8673,1.2451,6.1805,23.1767,-0.0021,90.0022,29,11,0.9999328641835897,1460.0476,1.5532,1.9507,61.3992,167.7656,0.0017,89.9895,1377.1441,1.4156,1.5818,25.4311,73.6705,0.0003,89.9992,29,12,-0.999479732422525,1460.0476,1.5532,1.9507,61.3992,167.7656,0.0017,89.9895,1428.8053,5.1995,6.254,180.9276,505.1719,0.0002,89.9995,36,15,-0.9973077260375155,1625.8494,10.5985,16.5065,357.3476,876.8352,-0.0087,90.0138,1677.2563,8.6498,14.3368,430.3053,1023.4925,0.,90.,0.6338732373282236,0.6495613903775336,0.8435376186341725,0.6495613903775336,0.6338732373282236,0.8435376186341725,0.7457261758629982
--------------------------------------------------------------------------------
/examples/publication/BMS_yield_cost/data/solvent_dft.csv:
--------------------------------------------------------------------------------
1 | solvent_file_name,solvent_SMILES,solvent_stoichiometry,solvent_number_of_atoms,solvent_charge,solvent_multiplicity,solvent_convergence_criteria,solvent_dipole,solvent_molar_mass,solvent_molar_volume,solvent_electronic_spatial_extent,solvent_homo_energy,solvent_lumo_energy,solvent_electronegativity,solvent_hardness,solvent_electrophilicity,solvent_E_scf,solvent_zero_point_correction,solvent_E_thermal_correction,solvent_H_thermal_correction,solvent_G_thermal_correction,solvent_E_zpe,solvent_E,solvent_H,solvent_G,solvent_ES_root_dipole,solvent_ES_root_molar_volume,solvent_ES_root_electronic_spatial_extent,solvent_ES1_transition,solvent_ES1_osc_strength,solvent_ES1_,solvent_ES2_transition,solvent_ES2_osc_strength,solvent_ES2_,solvent_ES3_transition,solvent_ES3_osc_strength,solvent_ES3_,solvent_ES4_transition,solvent_ES4_osc_strength,solvent_ES4_,solvent_ES5_transition,solvent_ES5_osc_strength,solvent_ES5_,solvent_ES6_transition,solvent_ES6_osc_strength,solvent_ES6_,solvent_ES7_transition,solvent_ES7_osc_strength,solvent_ES7_,solvent_ES8_transition,solvent_ES8_osc_strength,solvent_ES8_,solvent_ES9_transition,solvent_ES9_osc_strength,solvent_ES9_,solvent_ES10_transition,solvent_ES10_osc_strength,solvent_ES10_,solvent_c_min_atom_number,solvent_c_min_atom,solvent_c_min_atom=N,solvent_c_min_atom=O,solvent_c_min_atom=C,solvent_c_min_Mulliken_charge,solvent_c_min_APT_charge,solvent_c_min_NPA_charge,solvent_c_min_NPA_core,solvent_c_min_NPA_valence,solvent_c_min_NPA_Rydberg,solvent_c_min_NPA_total,solvent_c_min_NMR_shift,solvent_c_min_NMR_anisotropy,solvent_c_min_ES_root_Mulliken_charge,solvent_c_min_ES_root_NPA_charge,solvent_c_min_ES_root_NPA_core,solvent_c_min_ES_root_NPA_valence,solvent_c_min_ES_root_NPA_Rydberg,solvent_c_min_ES_root_NPA_total,solvent_c_min+1_atom_number,solvent_c_min+1_atom,solvent_c_min+1_atom=C,solvent_c_min+1_atom=O,solvent_c_min+1_atom=N,solvent_c_min+1_Mulliken_charge,solvent_c_min+1_APT_charge,solvent_c_min+1_NPA_charge,solvent_c_min+1_NPA_core,so
lvent_c_min+1_NPA_valence,solvent_c_min+1_NPA_Rydberg,solvent_c_min+1_NPA_total,solvent_c_min+1_NMR_shift,solvent_c_min+1_NMR_anisotropy,solvent_c_min+1_ES_root_Mulliken_charge,solvent_c_min+1_ES_root_NPA_charge,solvent_c_min+1_ES_root_NPA_core,solvent_c_min+1_ES_root_NPA_valence,solvent_c_min+1_ES_root_NPA_Rydberg,solvent_c_min+1_ES_root_NPA_total,solvent_c_max_atom_number,solvent_c_max_atom,solvent_c_max_atom=C,solvent_c_max_atom=H,solvent_c_max_Mulliken_charge,solvent_c_max_APT_charge,solvent_c_max_NPA_charge,solvent_c_max_NPA_core,solvent_c_max_NPA_valence,solvent_c_max_NPA_Rydberg,solvent_c_max_NPA_total,solvent_c_max_NMR_shift,solvent_c_max_NMR_anisotropy,solvent_c_max_ES_root_Mulliken_charge,solvent_c_max_ES_root_NPA_charge,solvent_c_max_ES_root_NPA_core,solvent_c_max_ES_root_NPA_valence,solvent_c_max_ES_root_NPA_Rydberg,solvent_c_max_ES_root_NPA_total,solvent_c_max-1_atom_number,solvent_c_max-1_atom,solvent_c_max-1_atom=H,solvent_c_max-1_Mulliken_charge,solvent_c_max-1_APT_charge,solvent_c_max-1_NPA_charge,solvent_c_max-1_NPA_core,solvent_c_max-1_NPA_valence,solvent_c_max-1_NPA_Rydberg,solvent_c_max-1_NPA_total,solvent_c_max-1_NMR_shift,solvent_c_max-1_NMR_anisotropy,solvent_c_max-1_ES_root_Mulliken_charge,solvent_c_max-1_ES_root_NPA_charge,solvent_c_max-1_ES_root_NPA_core,solvent_c_max-1_ES_root_NPA_valence,solvent_c_max-1_ES_root_NPA_Rydberg,solvent_c_max-1_ES_root_NPA_total,solvent_c_min_%VBur,solvent_c_min+1_%VBur,solvent_c_max_%VBur,solvent_c_max-1_%VBur
2 | BuCN,CCCC#N,C4H7N,12,0,1,met,4.0491,69.106,914.079,571.8195,-0.3186,0.03549,0.141555,0.177045,0.177045,-211.38290967,0.103466,0.109411,0.110356,0.074482,-211.279443,-211.273498,-211.272554,-211.308428,3.124,796.555,571.8803,161.05,0.0001,0.000,154.99,0.0089,0.000,154.72,0.0015,0.000,134.54,0.0145,0.000,132.51,0.0177,0.000,127.49,0.0395,0.000,127.44,0.0007,0.000,125.59,0.0002,0.000,123.78,0.0156,0.000,122.66,0.0268,0.000,5,N,1,0,0,-0.472503,-0.316553,-0.32895,1.99965,5.30799,0.02132,7.32895,0.2592,455.8112,-0.443011,-0.26674,1.99970,5.23124,0.03580,7.26674,1,C,1,0,0,-0.446562,0.081423,-0.68089,1.99946,4.67343,0.00800,6.68089,174.3062,23.2239,-0.452535,-0.68587,1.99946,4.67726,0.00916,6.68587,4,C,1,0,0.347150,0.093010,0.28826,1.99942,3.67740,0.03492,5.71174,83.2576,312.0115,0.357169,0.29020,1.99955,3.66181,0.04844,5.70980,12,H,1,0.190629,-0.001691,0.27199,0.00000,0.72663,0.00138,0.72801,30.2477,6.3175,0.177540,0.26144,0.00000,0.73154,0.00702,0.73856,0.2941528696745606,0.42073590611059547,0.44292087063261576,0.41396964162749617
3 | BuOAc,CCCCOC(C)=O,C6H12O2,20,0,1,met,1.732,116.1596,1374.801,1567.8891,-0.26727,0.01633,0.12547,0.1418,0.1418,-386.334640247,0.176341,0.186348,0.187292,0.139915,-386.158299,-386.148293,-386.147349,-386.194725,1.1576,933.507,1569.2596,213.62,0.0012,0.000,159.76,0.1002,0.000,150.56,0.0020,0.000,142.27,0.0048,0.000,137.73,0.0011,0.000,137.18,0.0008,0.000,134.14,0.1183,0.000,133.21,0.0135,0.000,130.96,0.0016,0.000,128.95,0.0377,0.000,5,O,0,1,0,-0.456115,-0.901813,-0.56061,1.99975,6.54945,0.01141,8.56061,134.3177,142.8309,-0.443534,-0.51973,1.99977,6.50828,0.01167,8.51973,8,O,0,1,0,-0.470030,-0.684614,-0.60235,1.99978,6.58292,0.01964,8.60235,-71.7431,570.4859,-0.303557,-0.28504,1.99981,6.26508,0.02015,8.28504,6,C,1,0,0.600089,1.124279,0.82420,1.99949,3.13174,0.04457,5.17580,29.0632,84.0999,0.462388,0.48792,1.99952,3.47223,0.04032,5.51208,18,H,1,0.180845,0.019608,0.25616,0.00000,0.74297,0.00086,0.74384,30.2399,6.1567,0.160979,0.23070,0.00000,0.76480,0.00451,0.76930,0.5591748750037423,0.43049609293134933,0.5580371845154337,0.3342714289991317
4 | DMAc,CC(N(C)C)=O,C4H9NO,15,0,1,met,3.6595,87.1212,854.473,624.6972,-0.2338,0.03388,0.09996000000000001,0.13384000000000001,0.13384000000000001,-287.830205604,0.131005,0.138714,0.139658,0.099133,-287.699201,-287.691491,-287.690547,-287.731073,2.0521,881.599,624.4849,220.76,0.0009,0.000,178.41,0.2223,0.000,162.71,0.0264,0.000,156.10,0.0121,0.000,147.68,0.0172,0.000,142.30,0.0073,0.000,135.59,0.0005,0.000,132.75,0.0026,0.000,132.39,0.0150,0.000,130.70,0.0003,0.000,6,O,0,1,0,-0.508595,-0.777915,-0.63034,1.99980,6.61277,0.01778,8.63034,-65.8556,569.4492,-0.338968,-0.28910,1.99983,6.27046,0.01882,8.28910,3,N,0,0,1,-0.392215,-0.743329,-0.47855,1.99933,5.46881,0.01041,7.47855,155.0284,111.8179,-0.395470,-0.46828,1.99935,5.45833,0.01061,7.46828,2,C,1,0,0.577639,1.037174,0.69314,1.99938,3.26821,0.03928,5.30686,33.7389,96.6573,0.462343,0.37787,1.99940,3.58626,0.03647,5.62213,10,H,1,0.206649,0.071311,0.26442,0.00000,0.73335,0.00223,0.73558,27.7382,6.7158,0.180961,0.24084,0.00000,0.75824,0.00092,0.75916,0.4078321008353043,0.6535133678632375,0.5860303583725037,0.3953474446872848
5 | p-Xylene,CC1=CC=C(C)C=C1,C8H10,18,0,1,met,0.0011,106.167,1034.845,1072.0641,-0.22568,0.0068,0.10944,0.11624,0.11624,-310.884514007,0.15581,0.163925,0.164869,0.121251,-310.728704,-310.720589,-310.719645,-310.763263,0.0004,862.371,1072.3756,232.87,0.0037,0.000,206.12,0.0838,0.000,177.12,0.3578,0.000,175.96,0.7943,0.000,159.63,0.0002,0.000,159.63,0.0034,0.000,157.51,0.0000,0.000,156.96,0.0015,0.000,154.99,0.0042,0.000,150.37,0.0000,0.000,1,C,0,0,1,-0.529145,0.099551,-0.68665,1.99939,4.68003,0.00723,6.68665,167.9050,34.2593,-0.530333,-0.69452,1.99940,4.68738,0.00774,6.69452,6,C,1,0,0,-0.529145,0.099552,-0.68665,1.99939,4.68003,0.00723,6.68665,167.9049,34.2594,-0.530332,-0.69452,1.99940,4.68738,0.00774,6.69452,9,H,0,1,0.154870,-0.023372,0.23741,0.00000,0.76170,0.00089,0.76259,30.1558,7.2488,0.157494,0.23910,0.00000,0.75928,0.00162,0.76090,14,H,1,0.154871,-0.023375,0.23741,0.00000,0.76170,0.00089,0.76259,30.1556,7.2489,0.157495,0.23911,0.00000,0.75928,0.00162,0.76089,0.4245681267027933,0.4245980659261699,0.3513068471003862,0.3512469686536331
--------------------------------------------------------------------------------
/examples/publication/Crosscoupling/1_run_experiments.py:
--------------------------------------------------------------------------------
1 |
2 | # Cross-coupling photoredox.
3 | import pandas as pd
4 | from edbo.plus.optimizer_botorch import EDBOplus
5 |
6 | filename = 'edbo_crosscoupling_photoredox_yield_ee.csv'
7 |
8 | df_to_opt = pd.read_csv(filename)
9 | regression_columns = df_to_opt.columns.drop(['Ligand', 'priority']).values.tolist()
10 |
11 | opt = EDBOplus()
12 | opt.run(
13 | filename=filename,
14 | objectives=['yield', 'ee'],
15 | objective_mode=['max', 'max'],
16 | objective_thresholds=[None, None],
17 | batch=3,
18 | init_sampling_method='cvtsampling',
19 | columns_features=regression_columns
20 | )
21 |
--------------------------------------------------------------------------------
/examples/publication/Crosscoupling/campaigns/0_recalculate_predictions.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import shutil
7 |
8 | for campaign in ['challenging_campaign_cvt', 'challenging_campaign_random', 'easy_campaign']:
9 | for round in range(1, 8):
10 | df = pd.read_csv(f"{campaign}/edbo_crosscoupling_photoredox_yield_ee_round{round}.csv")
11 | df.to_csv('optimization.csv', index=False)
12 |
13 | from edbo.plus.optimizer_botorch import EDBOplus
14 |
15 | filename = 'optimization.csv'
16 |
17 | regression_columns = df.columns.drop(['Ligand', 'priority']).values.tolist()
18 |
19 | opt = EDBOplus()
20 | opt.run(
21 | filename=filename,
22 | objectives=['yield', 'ee'],
23 | objective_mode=['max', 'max'],
24 | objective_thresholds=[None, None],
25 | batch=3,
26 | init_sampling_method='cvtsampling',
27 | columns_features=regression_columns
28 | )
29 |
30 | shutil.copy('pred_optimization.csv', f"{campaign}/predictions_{round}.csv")
--------------------------------------------------------------------------------
/examples/publication/Crosscoupling/campaigns/1_analysis.py:
--------------------------------------------------------------------------------
1 |
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import pandas as pd
5 | import shutil
6 | import seaborn as sns
7 |
8 | for campaign in ['challenging_campaign_cvt', 'challenging_campaign_random', 'easy_campaign']:
9 |
10 | fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(12, 4))
11 |
12 | av_uncertainties_yield = []
13 | max_uncertainties_yield = []
14 | av_uncertainties_ee = []
15 | max_uncertainties_ee = []
16 |
17 | for round in range(1, 8):
18 | df = pd.read_csv(f"{campaign}/predictions_{round}.csv")
19 |
20 |
21 | max_uncertainties_yield.append(df['yield_predicted_variance'].max())
22 | max_uncertainties_ee.append(df['ee_predicted_variance'].max())
23 |
24 | av_uncertainties_yield.append(df['yield_predicted_variance'].mean())
25 | av_uncertainties_ee.append(df['ee_predicted_variance'].mean())
26 |
27 | max_uncertainties_yield = np.sqrt(max_uncertainties_yield)
28 | max_uncertainties_ee = np.sqrt(max_uncertainties_ee)
29 | av_uncertainties_yield = np.sqrt(av_uncertainties_yield)
30 | av_uncertainties_ee = np.sqrt(av_uncertainties_ee)
31 | plt.title(f"{campaign}", loc='center')
32 | sns.scatterplot(x=np.arange(1, 8), y=av_uncertainties_yield, ax=ax[0], label='average_uncertainty_yield')
33 | sns.scatterplot(x=np.arange(1, 8), y=av_uncertainties_ee, ax=ax[0], label='average_uncertainty_ee')
34 | plt.title(f"{campaign}", loc='center')
35 | sns.scatterplot(x=np.arange(1, 8), y=max_uncertainties_yield, ax=ax[1], label='max_uncertainty_yield')
36 | sns.scatterplot(x=np.arange(1, 8), y=max_uncertainties_ee, ax=ax[1], label='max_uncertainty_ee')
37 |
38 | ax[0].set_xlabel('Round')
39 | ax[0].set_ylabel('Uncertainty')
40 | ax[1].set_xlabel('Round')
41 | ax[1].set_ylabel('Uncertainty')
42 | ax[0].set_xticks(np.arange(1, 8))
43 | ax[1].set_xticks(np.arange(1, 8))
44 | ax[0].set_ylim(0, 15)
45 | ax[1].set_ylim(0, 15)
46 |
47 | ax[0].legend(loc='upper center', bbox_to_anchor=(0.5, -0.2),
48 | fancybox=True, shadow=True)
49 | ax[1].legend(loc='upper center', bbox_to_anchor=(0.5, -0.2),
50 | fancybox=True, shadow=True)
51 | plt.tight_layout()
52 |
53 |
54 | # Expected improvement.
55 | av_eis_yield = []
56 | max_eis_yield = []
57 | av_eis_ee = []
58 | max_eis_ee = []
59 |
60 | for round in range(1, 8):
61 | df = pd.read_csv(f"{campaign}/predictions_{round}.csv")
62 |
63 | max_eis_yield.append(df['yield_expected_improvement'].max())
64 |
65 | max_eis_ee.append(df['ee_expected_improvement'].max())
66 |
67 | av_eis_yield.append(df['yield_expected_improvement'].mean())
68 | av_eis_ee.append(df['ee_expected_improvement'].mean())
69 |
70 |
71 | plt.title(f"{campaign}", loc='center')
72 | sns.scatterplot(x=np.arange(1, 8), y=av_eis_yield, ax=ax[2], label='average_EI_yield')
73 | sns.scatterplot(x=np.arange(1, 8), y=av_eis_ee, ax=ax[2], label='average_EI_ee')
74 | plt.title(f"{campaign}", loc='center')
75 | sns.scatterplot(x=np.arange(1, 8), y=max_eis_yield, ax=ax[3], label='max_EI_yield')
76 | sns.scatterplot(x=np.arange(1, 8), y=max_eis_ee, ax=ax[3], label='max_EI_ee')
77 |
78 | ax[2].set_xlabel('Round')
79 | ax[2].set_ylabel('EI')
80 | ax[3].set_xlabel('Round')
81 | ax[3].set_ylabel('EI')
82 | ax[2].set_xticks(np.arange(1, 8))
83 | ax[3].set_xticks(np.arange(1, 8))
84 | ax[2].set_ylim(0, 100)
85 | ax[3].set_ylim(0, 100)
86 | ax[2].legend(loc='upper center', bbox_to_anchor=(0.5, -0.2),
87 | fancybox=True, shadow=True)
88 | ax[3].legend(loc='upper center', bbox_to_anchor=(0.5, -0.2),
89 | fancybox=True, shadow=True)
90 | plt.tight_layout()
91 | plt.savefig(f"./plots/{campaign}.svg", format='svg')
92 |
93 | # Save results in csv file.
94 | df = pd.DataFrame([],
95 | columns=['max_uncertainty_yield', 'avg_uncertainty_yield', 'max_EI_yield', 'avg_EI_yield',
96 | 'max_uncertainty_ee', 'avg_uncertainty_ee', 'max_EI_ee', 'avg_EI_ee'])
97 | df['max_uncertainty_yield'] = max_uncertainties_yield
98 | df['max_uncertainty_ee'] = max_uncertainties_ee
99 | df['avg_uncertainty_yield'] = av_uncertainties_yield
100 | df['avg_uncertainty_yield'] = av_uncertainties_ee
101 | df['max_EI_yield'] = max_eis_yield
102 | df['max_EI_ee'] = max_eis_ee
103 | df['avg_EI_yield'] = av_eis_yield
104 | df['avg_EI_ee'] = av_eis_ee
105 |
106 | df.to_csv(f'crosscoupling_results_{campaign}.csv')
107 | plt.show()
108 |
109 |
--------------------------------------------------------------------------------
/examples/publication/Crosscoupling/campaigns/crosscoupling_results_challenging_campaign_cvt.csv:
--------------------------------------------------------------------------------
1 | ,max_uncertainty_yield,avg_uncertainty_yield,max_EI_yield,avg_EI_yield,max_uncertainty_ee,avg_uncertainty_ee,max_EI_ee,avg_EI_ee
2 | 0,6.827581272767426,2.9324467709552584,32.40782689891654,25.602683007714496,3.276152995193164,,7.60101466180589,5.928642147112224
3 | 1,8.210461679254594,5.773323515784001,46.574447681313785,11.421841208309262,9.962907192629936,,74.7088879188402,18.83098803650586
4 | 2,5.128887161645351,3.81806514719726,13.744451293043053,2.709265489045186,5.803851647525387,,21.650273058529116,5.9165346528746054
5 | 3,4.541302003122084,3.2123354371983703,5.162726577974803,0.40422385279901685,5.227188408641633,,16.10033337598427,2.30352295288218
6 | 4,3.859516872139973,2.863183469922444,0.5196149081752943,0.020525537970990253,4.08837032148018,,7.477211599451265,1.2544773840895274
7 | 5,3.795156763474525,2.436198804520339,0.2496501348370021,0.00964674815969636,3.8062975956252423,,6.226783841040771,0.6123865268781998
8 | 6,5.306888155590136,2.6604791781908226,2.778079942380365,0.1453910230805698,6.401147787936088,,12.269211324706786,1.6390130724559002
9 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/0_clean_dft.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 |
7 | df_dft = pd.read_csv('data/dataset_B2.csv')
8 |
9 | # # Remove correlated features.
10 | corr_matrix = df_dft.corr().abs()
11 | # Select upper triangle of correlation matrix.
12 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
13 | # Find features with correlation greater than 0.95.
14 | to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
15 | # Drop features
16 | df_dft.drop(to_drop, axis=1, inplace=True)
17 |
18 | # Remove columns that have only one or two unique values.
19 | extra_columns_to_remove = []
20 | for column in df_dft.columns.values:
21 | if len(np.unique(df_dft[column].values)) <= 1:
22 | extra_columns_to_remove.append(column)
23 | df_dft.drop(extra_columns_to_remove, axis=1, inplace=True)
24 |
25 | # Store SMILES.
26 | solvent_ohe = df_dft['solvent'].values
27 | base_ohe = df_dft['base'].values
28 | ligand_ohe = df_dft['ligand'].values
29 |
30 | # Remove non numerical.
31 | df_edbo_numeric = df_dft.select_dtypes(include=np.number)
32 |
33 | # Add back OHE features.
34 | df_edbo_numeric.insert(1, "solvent", solvent_ohe, False)
35 | df_edbo_numeric.insert(1, "base", base_ohe, False)
36 | df_edbo_numeric.insert(1, "ligand", ligand_ohe, False)
37 |
38 | df_edbo_numeric.to_csv('./data/dataset_B2_DFT_clean.csv', index=0)
39 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/0_clean_mordred.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 |
7 | df_dft = pd.read_csv('data/dataset_B3.csv')
8 |
9 | # # Remove correlated features.
10 | corr_matrix = df_dft.corr().abs()
11 | # Select upper triangle of correlation matrix.
12 | upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
13 | # Find features with correlation greater than 0.95.
14 | to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
15 | # Drop features
16 | df_dft.drop(to_drop, axis=1, inplace=True)
17 |
18 | # Remove columns that have only one or two unique values.
19 | extra_columns_to_remove = []
20 | for column in df_dft.columns.values:
21 | if len(np.unique(df_dft[column].values)) <= 1:
22 | extra_columns_to_remove.append(column)
23 | df_dft.drop(extra_columns_to_remove, axis=1, inplace=True)
24 |
25 | # Store SMILES.
26 | solvent_ohe = df_dft['solvent'].values
27 | base_ohe = df_dft['base'].values
28 | ligand_ohe = df_dft['ligand'].values
29 |
30 | # Remove non numerical.
31 | df_edbo_numeric = df_dft.select_dtypes(include=np.number)
32 |
33 | # Add back OHE features.
34 | df_edbo_numeric.insert(1, "solvent", solvent_ohe, False)
35 | df_edbo_numeric.insert(1, "base", base_ohe, False)
36 | df_edbo_numeric.insert(1, "ligand", ligand_ohe, False)
37 |
38 | df_edbo_numeric.to_csv('./data/dataset_B3_Mordred_clean.csv', index=0)
39 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/1_run_ohe.py:
--------------------------------------------------------------------------------
1 |
2 | import shutil
3 | import pandas as pd
4 | import numpy as np
5 | import os
6 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark
7 | import os
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | import seaborn as sns
11 | import pandas as pd
12 | sns.set_style("darkgrid")
13 | sns.set_context("talk")
14 | # Run one benchmark per acquisition function and seed (0-4) on dataset_B1.
15 | for acq_i in [
16 | 'EHVI',
17 | 'MOUCB',
18 | 'MOGreedy'
19 | ]:
20 | for seed_i in np.arange(0, 5):
21 | budget = 30
22 | acq = acq_i
23 | batch = 1
24 | seed = seed_i
25 | # The dataset is re-read for every run and gets a running integer index column.
26 | df_exp = pd.read_csv('./data/dataset_B1.csv')
27 | df_exp['new_index'] = np.arange(0, len(df_exp.values))
28 | sort_column = 'new_index'
29 |
30 | # Select the features for the model: every column except the index column and the two objectives.
31 | columns_regression = df_exp.columns
32 | columns_regression = columns_regression.drop([sort_column, 'objective_conversion', 'objective_selectivity']).tolist()
33 | objectives = ['objective_conversion', 'objective_selectivity']
34 | objective_modes = ['max', 'max']
35 | objective_thresholds = [None, None]
36 | print(f"Columns for regression: {columns_regression}")
37 | ######################
38 |
39 | label_benchmark = f"benchmark_ohe_acq_{acq}_batch_{batch}_seed_{seed}.csv"
40 | # Skip the run when its benchmark file already exists in results_ohe.
41 | if not os.path.exists(f"./results_ohe/{label_benchmark}"):
42 | # Remove leftovers of a previous run from the working directory.
43 | if os.path.exists(label_benchmark):
44 | os.remove(label_benchmark)
45 |
46 | if os.path.exists(f'pred_{label_benchmark}'):
47 | os.remove(f'pred_{label_benchmark}')
48 |
49 | if os.path.exists(f'results_{label_benchmark}'):
50 | os.remove(f'results_{label_benchmark}')
51 |
52 | bench = Benchmark(df_ground=df_exp,
53 | features_regression=columns_regression,
54 | objective_names=objectives,
55 | objective_modes=objective_modes,
56 | objective_thresholds=objective_thresholds,
57 | filename=label_benchmark,
58 | filename_results=f'results_{label_benchmark}',
59 | index_column=sort_column,
60 | acquisition_function=acq)
61 | bench.run(steps=int(budget/batch), batch=batch, seed=seed,
62 | plot_predictions=False,
63 | plot_ground=False,
64 | plot_train=False,
65 | init_method='seed')
66 |
67 | # Move the benchmark, pred_ and results_ files into results_ohe (created if missing).
68 | if not os.path.exists('results_ohe'):
69 | os.mkdir('results_ohe')
70 | shutil.move(label_benchmark, f'results_ohe/{label_benchmark}')
71 | shutil.move(f'pred_{label_benchmark}', f'results_ohe/pred_{label_benchmark}')
72 | shutil.move(f'results_{label_benchmark}', f'results_ohe/results_{label_benchmark}')
73 |
74 | # Clean: delete the 'results' directory if it exists.
75 | if os.path.exists('results'):
76 | shutil.rmtree('results')
77 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/2_run_dft.py:
--------------------------------------------------------------------------------
1 |
2 | import shutil
3 | import pandas as pd
4 | import numpy as np
5 | import os
6 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark
7 | import os
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | import seaborn as sns
11 | import pandas as pd
12 | sns.set_style("darkgrid")
13 | sns.set_context("talk")
14 |
15 |
16 | for batch in [1, 2, 3, 5]:
17 | for acq_i in [
18 | 'EHVI',
19 | 'MOUCB',
20 | 'MOGreedy'
21 | ]:
22 | for seed_i in np.arange(0, 5):
23 | budget = 30
24 | acq = acq_i
25 | seed = seed_i
26 |
27 | df_exp = pd.read_csv('./data/dataset_B2_DFT_clean.csv')
28 | df_exp['new_index'] = np.arange(0, len(df_exp.values))
29 | sort_column = 'new_index'
30 |
31 | # Select the features for the model.
32 | columns_regression = df_exp.columns
33 | columns_regression = columns_regression.drop('solvent')
34 | columns_regression = columns_regression.drop('ligand')
35 |
36 | columns_regression = columns_regression.drop([sort_column, 'objective_conversion', 'objective_selectivity']).tolist()
37 | objectives = ['objective_conversion', 'objective_selectivity']
38 | objective_modes = ['max', 'max']
39 | objective_thresholds = [None, None]
40 | print(f"Columns for regression: {columns_regression}")
41 | ######################
42 |
43 | label_benchmark = f"benchmark_dft_acq_{acq}_batch_{batch}_seed_{seed}.csv"
44 |
45 | if not os.path.exists(f"./results_dft/{label_benchmark}"):
46 | # Remove previous files
47 | if os.path.exists(label_benchmark):
48 | os.remove(label_benchmark)
49 |
50 | if os.path.exists(f'pred_{label_benchmark}'):
51 | os.remove(f'pred_{label_benchmark}')
52 |
53 | if os.path.exists(f'results_{label_benchmark}'):
54 | os.remove(f'results_{label_benchmark}')
55 |
56 | bench = Benchmark(df_ground=df_exp,
57 | features_regression=columns_regression,
58 | objective_names=objectives,
59 | objective_modes=objective_modes,
60 | objective_thresholds=objective_thresholds,
61 | filename=label_benchmark,
62 | filename_results=f'results_{label_benchmark}',
63 | index_column=sort_column,
64 | acquisition_function=acq)
65 | bench.run(steps=int(budget/batch), batch=batch, seed=seed,
66 | plot_predictions=False,
67 | plot_ground=False,
68 | plot_train=False)
69 |
70 | # Move results.
71 | if not os.path.exists('results_dft'):
72 | os.mkdir('results_dft')
73 | shutil.move(label_benchmark, f'results_dft/{label_benchmark}')
74 | shutil.move(f'pred_{label_benchmark}', f'results_dft/pred_{label_benchmark}')
75 | shutil.move(f'results_{label_benchmark}', f'results_dft/results_{label_benchmark}')
76 |
77 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/3_run_mordred.py:
--------------------------------------------------------------------------------
1 |
2 | import shutil
3 | import pandas as pd
4 | import numpy as np
5 | import os
6 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark
7 | import os
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | import seaborn as sns
11 | import pandas as pd
sns.set_style("darkgrid")
sns.set_context("talk")

# Sweep every acquisition function over five seeds on the Mordred dataset.
# A (acquisition, seed) pair is skipped when its benchmark file is already
# present in ./results_mordred, so the sweep can be re-launched safely.
for acq in ('EHVI', 'MOUCB', 'MOGreedy'):
    for seed in np.arange(0, 5):
        budget, batch = 30, 1

        # The ground truth is re-read for every run and each row is tagged
        # with a positional index that is handed over as `index_column`.
        sort_column = 'new_index'
        df_exp = pd.read_csv('./data/dataset_B3_Mordred_clean.csv')
        df_exp[sort_column] = np.arange(len(df_exp))

        objectives = ['objective_conversion', 'objective_selectivity']
        objective_modes = ['max', 'max']
        objective_thresholds = [None, None]

        # Regression features: every column except 'solvent', 'ligand',
        # the index column and the two objectives.
        excluded = ['solvent', 'ligand', sort_column] + objectives
        columns_regression = df_exp.columns.drop(excluded).tolist()
        print(f"Columns for regression: {columns_regression}")

        label_benchmark = f"benchmark_mordred_acq_{acq}_batch_{batch}_seed_{seed}.csv"
        if os.path.exists(f"./results_mordred/{label_benchmark}"):
            continue

        # The three files that are moved to the results folder after the run.
        run_files = (label_benchmark,
                     f'pred_{label_benchmark}',
                     f'results_{label_benchmark}')

        # Remove leftovers of a previous (unfinished) run.
        for stale in run_files:
            if os.path.exists(stale):
                os.remove(stale)

        bench = Benchmark(df_ground=df_exp,
                          features_regression=columns_regression,
                          objective_names=objectives,
                          objective_modes=objective_modes,
                          objective_thresholds=objective_thresholds,
                          filename=label_benchmark,
                          filename_results=f'results_{label_benchmark}',
                          index_column=sort_column,
                          acquisition_function=acq)
        bench.run(steps=int(budget / batch), batch=batch, seed=seed,
                  plot_predictions=False,
                  plot_ground=False,
                  plot_train=False)

        # Move results.
        if not os.path.exists('results_mordred'):
            os.mkdir('results_mordred')
        for produced in run_files:
            shutil.move(produced, f'results_mordred/{produced}')
76 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/4_random_features.py:
--------------------------------------------------------------------------------
1 |
2 | import shutil
3 | import pandas as pd
4 | import numpy as np
5 | import os
6 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark
7 | import os
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | import seaborn as sns
11 | import pandas as pd
sns.set_style("darkgrid")
sns.set_context("talk")

# Sweep every acquisition function over five seeds on dataset_B1 with
# `random_sampling=True` forwarded to Benchmark.run. A (acquisition, seed)
# pair is skipped when its benchmark file already sits in ./results_random.
for acq in ('EHVI', 'MOUCB', 'MOGreedy'):
    for seed in np.arange(0, 5):
        budget, batch = 30, 1

        # The ground truth is re-read for every run and each row is tagged
        # with a positional index that is handed over as `index_column`.
        sort_column = 'new_index'
        df_exp = pd.read_csv('./data/dataset_B1.csv')
        df_exp[sort_column] = np.arange(len(df_exp))

        objectives = ['objective_conversion', 'objective_selectivity']
        objective_modes = ['max', 'max']
        objective_thresholds = [None, None]

        # Regression features: every column except the index column and
        # the two objectives.
        columns_regression = df_exp.columns.drop([sort_column] + objectives).tolist()
        print(f"Columns for regression: {columns_regression}")

        label_benchmark = f"benchmark_random_acq_{acq}_batch_{batch}_seed_{seed}.csv"
        if os.path.exists(f"./results_random/{label_benchmark}"):
            continue

        # The three files that are moved to the results folder after the run.
        run_files = (label_benchmark,
                     f'pred_{label_benchmark}',
                     f'results_{label_benchmark}')

        # Remove leftovers of a previous (unfinished) run.
        for stale in run_files:
            if os.path.exists(stale):
                os.remove(stale)

        bench = Benchmark(df_ground=df_exp,
                          features_regression=columns_regression,
                          objective_names=objectives,
                          objective_modes=objective_modes,
                          objective_thresholds=objective_thresholds,
                          filename=label_benchmark,
                          filename_results=f'results_{label_benchmark}',
                          index_column=sort_column,
                          acquisition_function=acq)
        bench.run(steps=int(budget / batch), batch=batch, seed=seed,
                  plot_predictions=False,
                  plot_ground=False,
                  plot_train=False,
                  random_sampling=True)

        # Move results.
        if not os.path.exists('results_random'):
            os.mkdir('results_random')
        for produced in run_files:
            shutil.move(produced, f'results_random/{produced}')
73 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/data/dataset_B1.csv:
--------------------------------------------------------------------------------
1 | ligand,base,solvent,ligand_equivalent,objective_conversion,objective_selectivity
2 | P(tBu)3,NaOH(aq.),MeOH,0.125,39.6,67.17171717171718
3 | P(tBu)3,s. NaHCO3(aq.),MeOH,0.125,52.3,74.37858508604207
4 | P(tBu)3,CsF(aq.),MeOH,0.125,50.8,74.01574803149606
5 | P(tBu)3,1M K3PO4(aq.),MeOH,0.125,50.3,73.55864811133202
6 | P(tBu)3,KOH(aq.),MeOH,0.125,61.5,78.21138211382113
7 | P(tBu)3,Cs2CO3(aq.),MeOH,0.125,61.00000000000001,79.01639344262294
8 | P(tBu)3,KOAc,MeOH,0.125,67.5,52.8888888888889
9 | P(tBu)3,None,MeOH,0.125,74.9,53.271028037383175
10 | P(Ph)3,NaOH(aq.),MeOH,0.125,99.2,73.79032258064517
11 | P(Ph)3,s. NaHCO3(aq.),MeOH,0.125,86.60000000000001,85.10392609699768
12 | P(Ph)3,CsF(aq.),MeOH,0.125,83.3,89.07563025210085
13 | P(Ph)3,1M K3PO4(aq.),MeOH,0.125,81.5,92.14723926380368
14 | P(Ph)3,KOH(aq.),MeOH,0.125,82.99999999999999,91.92771084337352
15 | P(Ph)3,Cs2CO3(aq.),MeOH,0.125,82.2,93.06569343065692
16 | P(Ph)3,KOAc,MeOH,0.125,81.4,95.0859950859951
17 | P(Ph)3,None,MeOH,0.125,80.30000000000001,94.89414694894144
18 | AmPhos,NaOH(aq.),MeOH,0.125,75.2,89.49468085106382
19 | AmPhos,s. NaHCO3(aq.),MeOH,0.125,75.39999999999999,90.18567639257296
20 | AmPhos,CsF(aq.),MeOH,0.125,77.3,90.03880983182407
21 | AmPhos,1M K3PO4(aq.),MeOH,0.125,74.3,88.42530282637955
22 | AmPhos,KOH(aq.),MeOH,0.125,56.900000000000006,78.55887521968366
23 | AmPhos,Cs2CO3(aq.),MeOH,0.125,60.1,78.70216306156405
24 | AmPhos,KOAc,MeOH,0.125,43.7,72.31121281464532
25 | AmPhos,None,MeOH,0.125,39.3,70.22900763358778
26 | P(Cy)3,NaOH(aq.),MeOH,0.125,46.5,71.82795698924731
27 | P(Cy)3,s. NaHCO3(aq.),MeOH,0.125,33.9,70.50147492625368
28 | P(Cy)3,CsF(aq.),MeOH,0.125,55.2,75.90579710144928
29 | P(Cy)3,1M K3PO4(aq.),MeOH,0.125,46.7,73.23340471092077
30 | P(Cy)3,KOH(aq.),MeOH,0.125,59.8,77.59197324414716
31 | P(Cy)3,Cs2CO3(aq.),MeOH,0.125,84.39999999999999,92.53554502369668
32 | P(Cy)3,KOAc,MeOH,0.125,80.60000000000001,94.04466501240694
33 | P(Cy)3,None,MeOH,0.125,76.7,92.4380704041721
34 | P(o-Tol)3,NaOH(aq.),MeOH,0.125,83.99999999999999,83.45238095238095
35 | P(o-Tol)3,s. NaHCO3(aq.),MeOH,0.125,76.5,84.70588235294117
36 | P(o-Tol)3,CsF(aq.),MeOH,0.125,83.79999999999998,82.69689737470168
37 | P(o-Tol)3,1M K3PO4(aq.),MeOH,0.125,76.5,80.65359477124183
38 | P(o-Tol)3,KOH(aq.),MeOH,0.125,74.5,75.16778523489933
39 | P(o-Tol)3,Cs2CO3(aq.),MeOH,0.125,79.5,66.41509433962264
40 | P(o-Tol)3,KOAc,MeOH,0.125,67.8,74.63126843657818
41 | P(o-Tol)3,None,MeOH,0.125,59.39999999999999,76.26262626262627
42 | CataCXium A,NaOH(aq.),MeOH,0.125,56.400000000000006,78.0141843971631
43 | CataCXium A,s. NaHCO3(aq.),MeOH,0.125,66.3,81.14630467571644
44 | CataCXium A,CsF(aq.),MeOH,0.125,47.7,74.8427672955975
45 | CataCXium A,1M K3PO4(aq.),MeOH,0.125,60.3,79.93366500829188
46 | CataCXium A,KOH(aq.),MeOH,0.125,63.8,80.87774294670847
47 | CataCXium A,Cs2CO3(aq.),MeOH,0.125,45.99999999999999,73.47826086956523
48 | CataCXium A,KOAc,MeOH,0.125,38.7,69.50904392764858
49 | CataCXium A,None,MeOH,0.125,47.39999999999999,73.41772151898735
50 | SPhos,NaOH(aq.),MeOH,0.0625,45.2,72.34513274336285
51 | SPhos,s. NaHCO3(aq.),MeOH,0.0625,28.0,58.57142857142858
52 | SPhos,CsF(aq.),MeOH,0.0625,38.39999999999999,67.44791666666667
53 | SPhos,1M K3PO4(aq.),MeOH,0.0625,39.3,67.68447837150127
54 | SPhos,KOH(aq.),MeOH,0.0625,36.9,66.66666666666667
55 | SPhos,Cs2CO3(aq.),MeOH,0.0625,74.6,71.58176943699732
56 | SPhos,KOAc,MeOH,0.0625,54.5,67.33944954128441
57 | SPhos,None,MeOH,0.0625,49.7,70.4225352112676
58 | dtbpf,NaOH(aq.),MeOH,0.0625,38.6,47.15025906735752
59 | dtbpf,s. NaHCO3(aq.),MeOH,0.0625,19.6,21.428571428571427
60 | dtbpf,CsF(aq.),MeOH,0.0625,20.7,25.120772946859905
61 | dtbpf,1M K3PO4(aq.),MeOH,0.0625,19.7,21.82741116751269
62 | dtbpf,KOH(aq.),MeOH,0.0625,19.8,24.24242424242425
63 | dtbpf,Cs2CO3(aq.),MeOH,0.0625,15.8,21.51898734177215
64 | dtbpf,KOAc,MeOH,0.0625,16.1,19.875776397515526
65 | dtbpf,None,MeOH,0.0625,13.0,20.0
66 | XPhos,NaOH(aq.),MeOH,0.0625,77.9,83.31193838254171
67 | XPhos,s. NaHCO3(aq.),MeOH,0.0625,79.39999999999999,83.50125944584383
68 | XPhos,CsF(aq.),MeOH,0.0625,72.7,82.80605226960111
69 | XPhos,1M K3PO4(aq.),MeOH,0.0625,53.8,77.32342007434944
70 | XPhos,KOH(aq.),MeOH,0.0625,46.0,72.6086956521739
71 | XPhos,Cs2CO3(aq.),MeOH,0.0625,41.0,70.73170731707317
72 | XPhos,KOAc,MeOH,0.0625,51.4,74.12451361867704
73 | XPhos,None,MeOH,0.0625,33.5,63.28358208955224
74 | dppf,NaOH(aq.),MeOH,0.0625,40.5,52.8395061728395
75 | dppf,s. NaHCO3(aq.),MeOH,0.0625,36.3,67.49311294765839
76 | dppf,CsF(aq.),MeOH,0.0625,35.3,65.43909348441927
77 | dppf,1M K3PO4(aq.),MeOH,0.0625,36.0,62.77777777777778
78 | dppf,KOH(aq.),MeOH,0.0625,28.3,49.1166077738516
79 | dppf,Cs2CO3(aq.),MeOH,0.0625,35.4,40.96045197740113
80 | dppf,KOAc,MeOH,0.0625,25.5,53.333333333333336
81 | dppf,None,MeOH,0.0625,20.0,54.50000000000001
82 | Xanthphos,NaOH(aq.),MeOH,0.0625,12.2,41.80327868852459
83 | Xanthphos,s. NaHCO3(aq.),MeOH,0.0625,7.8,32.05128205128205
84 | Xanthphos,CsF(aq.),MeOH,0.0625,9.7,32.98969072164949
85 | Xanthphos,1M K3PO4(aq.),MeOH,0.0625,8.5,31.764705882352946
86 | Xanthphos,KOH(aq.),MeOH,0.0625,10.2,40.19607843137255
87 | Xanthphos,Cs2CO3(aq.),MeOH,0.0625,12.0,35.833333333333336
88 | Xanthphos,KOAc,MeOH,0.0625,7.6,23.68421052631579
89 | Xanthphos,None,MeOH,0.0625,7.399999999999999,24.324324324324326
90 | P(tBu)3,NaOH(aq.),MeCN,0.125,38.2,69.63350785340315
91 | P(tBu)3,s. NaHCO3(aq.),MeCN,0.125,42.8,52.10280373831775
92 | P(tBu)3,CsF(aq.),MeCN,0.125,21.3,24.413145539906104
93 | P(tBu)3,1M K3PO4(aq.),MeCN,0.125,29.8,54.0268456375839
94 | P(tBu)3,KOH(aq.),MeCN,0.125,24.0,34.583333333333336
95 | P(tBu)3,Cs2CO3(aq.),MeCN,0.125,20.1,54.72636815920397
96 | P(tBu)3,KOAc,MeCN,0.125,18.4,10.326086956521738
97 | P(tBu)3,None,MeCN,0.125,22.1,12.21719457013575
98 | P(Ph)3,NaOH(aq.),MeCN,0.125,16.7,36.52694610778443
99 | P(Ph)3,s. NaHCO3(aq.),MeCN,0.125,34.6,67.05202312138728
100 | P(Ph)3,CsF(aq.),MeCN,0.125,37.9,76.2532981530343
101 | P(Ph)3,1M K3PO4(aq.),MeCN,0.125,25.1,79.6812749003984
102 | P(Ph)3,KOH(aq.),MeCN,0.125,13.3,75.18796992481204
103 | P(Ph)3,Cs2CO3(aq.),MeCN,0.125,22.9,74.67248908296943
104 | P(Ph)3,KOAc,MeCN,0.125,12.1,60.33057851239669
105 | P(Ph)3,None,MeCN,0.125,27.1,79.33579335793357
106 | AmPhos,NaOH(aq.),MeCN,0.125,13.3,31.57894736842105
107 | AmPhos,s. NaHCO3(aq.),MeCN,0.125,31.8,62.8930817610063
108 | AmPhos,CsF(aq.),MeCN,0.125,31.6,63.29113924050633
109 | AmPhos,1M K3PO4(aq.),MeCN,0.125,30.8,62.66233766233766
110 | AmPhos,KOH(aq.),MeCN,0.125,29.4,62.24489795918368
111 | AmPhos,Cs2CO3(aq.),MeCN,0.125,25.3,58.49802371541502
112 | AmPhos,KOAc,MeCN,0.125,21.2,50.943396226415096
113 | AmPhos,None,MeCN,0.125,26.7,55.0561797752809
114 | P(Cy)3,NaOH(aq.),MeCN,0.125,33.2,67.46987951807229
115 | P(Cy)3,s. NaHCO3(aq.),MeCN,0.125,32.2,67.3913043478261
116 | P(Cy)3,CsF(aq.),MeCN,0.125,15.999999999999998,60.62500000000001
117 | P(Cy)3,1M K3PO4(aq.),MeCN,0.125,10.3,66.99029126213593
118 | P(Cy)3,KOH(aq.),MeCN,0.125,7.800000000000001,55.12820512820512
119 | P(Cy)3,Cs2CO3(aq.),MeCN,0.125,7.300000000000001,43.83561643835616
120 | P(Cy)3,KOAc,MeCN,0.125,3.8,23.68421052631579
121 | P(Cy)3,None,MeCN,0.125,7.0,30.0
122 | P(o-Tol)3,NaOH(aq.),MeCN,0.125,11.5,4.3478260869565215
123 | P(o-Tol)3,s. NaHCO3(aq.),MeCN,0.125,13.7,2.18978102189781
124 | P(o-Tol)3,CsF(aq.),MeCN,0.125,12.0,3.333333333333333
125 | P(o-Tol)3,1M K3PO4(aq.),MeCN,0.125,9.7,2.061855670103093
126 | P(o-Tol)3,KOH(aq.),MeCN,0.125,10.5,1.9047619047619049
127 | P(o-Tol)3,Cs2CO3(aq.),MeCN,0.125,10.3,1.9417475728155336
128 | P(o-Tol)3,KOAc,MeCN,0.125,8.6,3.488372093023256
129 | P(o-Tol)3,None,MeCN,0.125,9.1,1.0989010989010988
130 | CataCXium A,NaOH(aq.),MeCN,0.125,9.1,5.494505494505495
131 | CataCXium A,s. NaHCO3(aq.),MeCN,0.125,7.199999999999999,8.333333333333334
132 | CataCXium A,CsF(aq.),MeCN,0.125,7.1,11.267605633802818
133 | CataCXium A,1M K3PO4(aq.),MeCN,0.125,12.9,36.43410852713178
134 | CataCXium A,KOH(aq.),MeCN,0.125,12.4,35.483870967741936
135 | CataCXium A,Cs2CO3(aq.),MeCN,0.125,12.7,33.85826771653544
136 | CataCXium A,KOAc,MeCN,0.125,14.1,36.87943262411348
137 | CataCXium A,None,MeCN,0.125,0.0,0.0
138 | SPhos,NaOH(aq.),MeCN,0.0625,13.3,3.007518796992482
139 | SPhos,s. NaHCO3(aq.),MeCN,0.0625,10.5,3.8095238095238098
140 | SPhos,CsF(aq.),MeCN,0.0625,10.9,5.504587155963303
141 | SPhos,1M K3PO4(aq.),MeCN,0.0625,9.2,4.347826086956522
142 | SPhos,KOH(aq.),MeCN,0.0625,8.0,6.25
143 | SPhos,Cs2CO3(aq.),MeCN,0.0625,10.3,2.912621359223301
144 | SPhos,KOAc,MeCN,0.0625,7.8,7.6923076923076925
145 | SPhos,None,MeCN,0.0625,7.2,6.944444444444445
146 | dtbpf,NaOH(aq.),MeCN,0.0625,9.3,8.602150537634408
147 | dtbpf,s. NaHCO3(aq.),MeCN,0.0625,7.1,7.042253521126761
148 | dtbpf,CsF(aq.),MeCN,0.0625,7.7,9.09090909090909
149 | dtbpf,1M K3PO4(aq.),MeCN,0.0625,6.4,6.25
150 | dtbpf,KOH(aq.),MeCN,0.0625,4.2,11.904761904761903
151 | dtbpf,Cs2CO3(aq.),MeCN,0.0625,7.9,10.126582278481012
152 | dtbpf,KOAc,MeCN,0.0625,5.4,5.555555555555556
153 | dtbpf,None,MeCN,0.0625,4.1,12.195121951219514
154 | XPhos,NaOH(aq.),MeCN,0.0625,9.5,3.1578947368421053
155 | XPhos,s. NaHCO3(aq.),MeCN,0.0625,11.2,1.785714285714286
156 | XPhos,CsF(aq.),MeCN,0.0625,8.7,0.0
157 | XPhos,1M K3PO4(aq.),MeCN,0.0625,9.7,16.494845360824744
158 | XPhos,KOH(aq.),MeCN,0.0625,9.7,11.34020618556701
159 | XPhos,Cs2CO3(aq.),MeCN,0.0625,10.6,24.528301886792452
160 | XPhos,KOAc,MeCN,0.0625,9.2,13.043478260869565
161 | XPhos,None,MeCN,0.0625,9.2,17.39130434782609
162 | dppf,NaOH(aq.),MeCN,0.0625,4.9,10.20408163265306
163 | dppf,s. NaHCO3(aq.),MeCN,0.0625,5.6,17.857142857142858
164 | dppf,CsF(aq.),MeCN,0.0625,5.9,16.949152542372882
165 | dppf,1M K3PO4(aq.),MeCN,0.0625,4.8,20.833333333333336
166 | dppf,KOH(aq.),MeCN,0.0625,4.6,15.217391304347828
167 | dppf,Cs2CO3(aq.),MeCN,0.0625,6.0,15.0
168 | dppf,KOAc,MeCN,0.0625,4.5,15.555555555555555
169 | dppf,None,MeCN,0.0625,4.9,18.367346938775515
170 | Xanthphos,NaOH(aq.),MeCN,0.0625,4.8,0.0
171 | Xanthphos,s. NaHCO3(aq.),MeCN,0.0625,4.4,2.272727272727273
172 | Xanthphos,CsF(aq.),MeCN,0.0625,4.1,0.0
173 | Xanthphos,1M K3PO4(aq.),MeCN,0.0625,6.4,0.0
174 | Xanthphos,KOH(aq.),MeCN,0.0625,4.0,0.0
175 | Xanthphos,Cs2CO3(aq.),MeCN,0.0625,6.2,0.0
176 | Xanthphos,KOAc,MeCN,0.0625,5.4,0.0
177 | Xanthphos,None,MeCN,0.0625,3.4000000000000004,0.0
178 | P(tBu)3,NaOH(aq.),THF,0.125,10.8,9.25925925925926
179 | P(tBu)3,s. NaHCO3(aq.),THF,0.125,33.7,1.1869436201780417
180 | P(tBu)3,CsF(aq.),THF,0.125,6.800000000000001,5.88235294117647
181 | P(tBu)3,1M K3PO4(aq.),THF,0.125,5.5,9.090909090909092
182 | P(tBu)3,KOH(aq.),THF,0.125,7.6,5.2631578947368425
183 | P(tBu)3,Cs2CO3(aq.),THF,0.125,9.6,3.125
184 | P(tBu)3,KOAc,THF,0.125,10.1,1.98019801980198
185 | P(tBu)3,None,THF,0.125,15.3,1.3071895424836604
186 | P(Ph)3,NaOH(aq.),THF,0.125,13.7,15.328467153284672
187 | P(Ph)3,s. NaHCO3(aq.),THF,0.125,11.6,22.41379310344828
188 | P(Ph)3,CsF(aq.),THF,0.125,10.4,34.61538461538461
189 | P(Ph)3,1M K3PO4(aq.),THF,0.125,9.1,27.47252747252747
190 | P(Ph)3,KOH(aq.),THF,0.125,10.0,22.000000000000004
191 | P(Ph)3,Cs2CO3(aq.),THF,0.125,8.4,21.428571428571427
192 | P(Ph)3,KOAc,THF,0.125,4.5,26.666666666666668
193 | P(Ph)3,None,THF,0.125,8.0,15.0
194 | AmPhos,NaOH(aq.),THF,0.125,11.2,1.785714285714286
195 | AmPhos,s. NaHCO3(aq.),THF,0.125,11.1,0.9009009009009008
196 | AmPhos,CsF(aq.),THF,0.125,8.5,0.0
197 | AmPhos,1M K3PO4(aq.),THF,0.125,5.6,0.0
198 | AmPhos,KOH(aq.),THF,0.125,1.7,0.0
199 | AmPhos,Cs2CO3(aq.),THF,0.125,1.8,0.0
200 | AmPhos,KOAc,THF,0.125,3.9,5.128205128205129
201 | AmPhos,None,THF,0.125,6.7,1.4925373134328357
202 | P(Cy)3,NaOH(aq.),THF,0.125,7.6,3.947368421052632
203 | P(Cy)3,s. NaHCO3(aq.),THF,0.125,7.9,0.0
204 | P(Cy)3,CsF(aq.),THF,0.125,7.4,0.0
205 | P(Cy)3,1M K3PO4(aq.),THF,0.125,5.8,0.0
206 | P(Cy)3,KOH(aq.),THF,0.125,4.800000000000001,4.166666666666666
207 | P(Cy)3,Cs2CO3(aq.),THF,0.125,4.6,2.173913043478261
208 | P(Cy)3,KOAc,THF,0.125,6.5,0.0
209 | P(Cy)3,None,THF,0.125,4.5,0.0
210 | P(o-Tol)3,NaOH(aq.),THF,0.125,8.0,5.0
211 | P(o-Tol)3,s. NaHCO3(aq.),THF,0.125,7.6,6.578947368421052
212 | P(o-Tol)3,CsF(aq.),THF,0.125,7.700000000000001,6.493506493506493
213 | P(o-Tol)3,1M K3PO4(aq.),THF,0.125,6.6,6.060606060606061
214 | P(o-Tol)3,KOH(aq.),THF,0.125,7.1,5.633802816901409
215 | P(o-Tol)3,Cs2CO3(aq.),THF,0.125,2.5,16.0
216 | P(o-Tol)3,KOAc,THF,0.125,5.0,8.0
217 | P(o-Tol)3,None,THF,0.125,2.3000000000000003,8.695652173913043
218 | CataCXium A,NaOH(aq.),THF,0.125,4.6,0.0
219 | CataCXium A,s. NaHCO3(aq.),THF,0.125,4.5,0.0
220 | CataCXium A,CsF(aq.),THF,0.125,4.6,0.0
221 | CataCXium A,1M K3PO4(aq.),THF,0.125,2.0,0.0
222 | CataCXium A,KOH(aq.),THF,0.125,1.8,0.0
223 | CataCXium A,Cs2CO3(aq.),THF,0.125,1.8,0.0
224 | CataCXium A,KOAc,THF,0.125,1.8,0.0
225 | CataCXium A,None,THF,0.125,4.0,0.0
226 | SPhos,NaOH(aq.),THF,0.0625,9.5,0.0
227 | SPhos,s. NaHCO3(aq.),THF,0.0625,4.7,0.0
228 | SPhos,CsF(aq.),THF,0.0625,4.2,0.0
229 | SPhos,1M K3PO4(aq.),THF,0.0625,4.6,0.0
230 | SPhos,KOH(aq.),THF,0.0625,4.300000000000001,0.0
231 | SPhos,Cs2CO3(aq.),THF,0.0625,4.8,0.0
232 | SPhos,KOAc,THF,0.0625,2.0,0.0
233 | SPhos,None,THF,0.0625,4.6,0.0
234 | dtbpf,NaOH(aq.),THF,0.0625,5.300000000000001,11.32075471698113
235 | dtbpf,s. NaHCO3(aq.),THF,0.0625,5.4,9.25925925925926
236 | dtbpf,CsF(aq.),THF,0.0625,4.7,14.893617021276595
237 | dtbpf,1M K3PO4(aq.),THF,0.0625,4.9,20.40816326530612
238 | dtbpf,KOH(aq.),THF,0.0625,5.1,13.725490196078432
239 | dtbpf,Cs2CO3(aq.),THF,0.0625,2.0,0.0
240 | dtbpf,KOAc,THF,0.0625,2.4,12.5
241 | dtbpf,None,THF,0.0625,4.800000000000001,4.166666666666666
242 | XPhos,NaOH(aq.),THF,0.0625,5.0,6.0
243 | XPhos,s. NaHCO3(aq.),THF,0.0625,5.0,0.0
244 | XPhos,CsF(aq.),THF,0.0625,4.7,0.0
245 | XPhos,1M K3PO4(aq.),THF,0.0625,2.0,0.0
246 | XPhos,KOH(aq.),THF,0.0625,1.8,0.0
247 | XPhos,Cs2CO3(aq.),THF,0.0625,1.7,0.0
248 | XPhos,KOAc,THF,0.0625,1.7,0.0
249 | XPhos,None,THF,0.0625,1.9,0.0
250 | dppf,NaOH(aq.),THF,0.0625,8.0,2.5
251 | dppf,s. NaHCO3(aq.),THF,0.0625,8.2,0.0
252 | dppf,CsF(aq.),THF,0.0625,15.1,0.0
253 | dppf,1M K3PO4(aq.),THF,0.0625,11.8,0.0
254 | dppf,KOH(aq.),THF,0.0625,4.199999999999999,9.523809523809526
255 | dppf,Cs2CO3(aq.),THF,0.0625,4.9,10.20408163265306
256 | dppf,KOAc,THF,0.0625,3.4,14.705882352941178
257 | dppf,None,THF,0.0625,4.4,13.636363636363637
258 | Xanthphos,NaOH(aq.),THF,0.0625,3.7,5.405405405405405
259 | Xanthphos,s. NaHCO3(aq.),THF,0.0625,2.4,20.833333333333336
260 | Xanthphos,CsF(aq.),THF,0.0625,1.9,0.0
261 | Xanthphos,1M K3PO4(aq.),THF,0.0625,2.7,29.629629629629623
262 | Xanthphos,KOH(aq.),THF,0.0625,3.0,40.0
263 | Xanthphos,Cs2CO3(aq.),THF,0.0625,1.8,0.0
264 | Xanthphos,KOAc,THF,0.0625,1.8,0.0
265 | Xanthphos,None,THF,0.0625,2.6,0.0
266 | P(tBu)3,NaOH(aq.),DMF,0.125,24.0,0.0
267 | P(tBu)3,s. NaHCO3(aq.),DMF,0.125,21.0,0.0
268 | P(tBu)3,CsF(aq.),DMF,0.125,14.7,0.0
269 | P(tBu)3,1M K3PO4(aq.),DMF,0.125,13.7,0.0
270 | P(tBu)3,KOH(aq.),DMF,0.125,18.4,0.0
271 | P(tBu)3,Cs2CO3(aq.),DMF,0.125,19.2,0.0
272 | P(tBu)3,KOAc,DMF,0.125,19.6,0.0
273 | P(tBu)3,None,DMF,0.125,21.9,0.0
274 | P(Ph)3,NaOH(aq.),DMF,0.125,23.8,0.0
275 | P(Ph)3,s. NaHCO3(aq.),DMF,0.125,10.2,0.0
276 | P(Ph)3,CsF(aq.),DMF,0.125,3.4000000000000004,0.0
277 | P(Ph)3,1M K3PO4(aq.),DMF,0.125,3.3,0.0
278 | P(Ph)3,KOH(aq.),DMF,0.125,4.1,0.0
279 | P(Ph)3,Cs2CO3(aq.),DMF,0.125,4.1,0.0
280 | P(Ph)3,KOAc,DMF,0.125,5.3,32.075471698113205
281 | P(Ph)3,None,DMF,0.125,3.8,0.0
282 | AmPhos,NaOH(aq.),DMF,0.125,22.2,0.0
283 | AmPhos,s. NaHCO3(aq.),DMF,0.125,16.3,0.0
284 | AmPhos,CsF(aq.),DMF,0.125,14.9,0.0
285 | AmPhos,1M K3PO4(aq.),DMF,0.125,17.8,0.0
286 | AmPhos,KOH(aq.),DMF,0.125,14.8,0.0
287 | AmPhos,Cs2CO3(aq.),DMF,0.125,15.6,0.0
288 | AmPhos,KOAc,DMF,0.125,4.3,0.0
289 | AmPhos,None,DMF,0.125,67.10000000000001,0.0
290 | P(Cy)3,NaOH(aq.),DMF,0.125,33.9,0.0
291 | P(Cy)3,s. NaHCO3(aq.),DMF,0.125,15.3,0.0
292 | P(Cy)3,CsF(aq.),DMF,0.125,9.7,0.0
293 | P(Cy)3,1M K3PO4(aq.),DMF,0.125,5.4,0.0
294 | P(Cy)3,KOH(aq.),DMF,0.125,3.8,0.0
295 | P(Cy)3,Cs2CO3(aq.),DMF,0.125,11.8,0.0
296 | P(Cy)3,KOAc,DMF,0.125,15.8,0.0
297 | P(Cy)3,None,DMF,0.125,11.3,0.0
298 | P(o-Tol)3,NaOH(aq.),DMF,0.125,8.7,0.0
299 | P(o-Tol)3,s. NaHCO3(aq.),DMF,0.125,0.0,0.0
300 | P(o-Tol)3,CsF(aq.),DMF,0.125,0.0,0.0
301 | P(o-Tol)3,1M K3PO4(aq.),DMF,0.125,0.0,0.0
302 | P(o-Tol)3,KOH(aq.),DMF,0.125,0.0,0.0
303 | P(o-Tol)3,Cs2CO3(aq.),DMF,0.125,0.0,0.0
304 | P(o-Tol)3,KOAc,DMF,0.125,0.0,0.0
305 | P(o-Tol)3,None,DMF,0.125,0.0,0.0
306 | CataCXium A,NaOH(aq.),DMF,0.125,0.0,0.0
307 | CataCXium A,s. NaHCO3(aq.),DMF,0.125,0.0,0.0
308 | CataCXium A,CsF(aq.),DMF,0.125,0.0,0.0
309 | CataCXium A,1M K3PO4(aq.),DMF,0.125,0.0,0.0
310 | CataCXium A,KOH(aq.),DMF,0.125,0.0,0.0
311 | CataCXium A,Cs2CO3(aq.),DMF,0.125,12.6,0.0
312 | CataCXium A,KOAc,DMF,0.125,6.8,0.0
313 | CataCXium A,None,DMF,0.125,0.0,0.0
314 | SPhos,NaOH(aq.),DMF,0.0625,9.2,100.0
315 | SPhos,s. NaHCO3(aq.),DMF,0.0625,0.0,0.0
316 | SPhos,CsF(aq.),DMF,0.0625,0.0,0.0
317 | SPhos,1M K3PO4(aq.),DMF,0.0625,0.0,0.0
318 | SPhos,KOH(aq.),DMF,0.0625,0.0,0.0
319 | SPhos,Cs2CO3(aq.),DMF,0.0625,14.5,0.0
320 | SPhos,KOAc,DMF,0.0625,0.0,0.0
321 | SPhos,None,DMF,0.0625,0.0,0.0
322 | dtbpf,NaOH(aq.),DMF,0.0625,3.8,0.0
323 | dtbpf,s. NaHCO3(aq.),DMF,0.0625,0.0,0.0
324 | dtbpf,CsF(aq.),DMF,0.0625,0.0,0.0
325 | dtbpf,1M K3PO4(aq.),DMF,0.0625,3.1,0.0
326 | dtbpf,KOH(aq.),DMF,0.0625,3.4,0.0
327 | dtbpf,Cs2CO3(aq.),DMF,0.0625,0.0,0.0
328 | dtbpf,KOAc,DMF,0.0625,0.0,0.0
329 | dtbpf,None,DMF,0.0625,0.0,0.0
330 | XPhos,NaOH(aq.),DMF,0.0625,12.1,0.0
331 | XPhos,s. NaHCO3(aq.),DMF,0.0625,21.200000000000003,0.0
332 | XPhos,CsF(aq.),DMF,0.0625,24.2,0.0
333 | XPhos,1M K3PO4(aq.),DMF,0.0625,29.2,0.0
334 | XPhos,KOH(aq.),DMF,0.0625,27.3,0.0
335 | XPhos,Cs2CO3(aq.),DMF,0.0625,22.8,0.0
336 | XPhos,KOAc,DMF,0.0625,18.5,0.0
337 | XPhos,None,DMF,0.0625,22.3,0.0
338 | dppf,NaOH(aq.),DMF,0.0625,19.0,68.94736842105263
339 | dppf,s. NaHCO3(aq.),DMF,0.0625,7.7,55.84415584415584
340 | dppf,CsF(aq.),DMF,0.0625,2.1,0.0
341 | dppf,1M K3PO4(aq.),DMF,0.0625,1.9,0.0
342 | dppf,KOH(aq.),DMF,0.0625,14.4,70.83333333333333
343 | dppf,Cs2CO3(aq.),DMF,0.0625,15.9,72.95597484276729
344 | dppf,KOAc,DMF,0.0625,2.7,33.33333333333333
345 | dppf,None,DMF,0.0625,1.9,0.0
346 | Xanthphos,NaOH(aq.),DMF,0.0625,2.2,0.0
347 | Xanthphos,s. NaHCO3(aq.),DMF,0.0625,1.9,0.0
348 | Xanthphos,CsF(aq.),DMF,0.0625,1.9,0.0
349 | Xanthphos,1M K3PO4(aq.),DMF,0.0625,1.8,0.0
350 | Xanthphos,KOH(aq.),DMF,0.0625,2.1,0.0
351 | Xanthphos,Cs2CO3(aq.),DMF,0.0625,2.1,0.0
352 | Xanthphos,KOAc,DMF,0.0625,2.6,34.61538461538461
353 | Xanthphos,None,DMF,0.0625,1.7,0.0
354 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/performance/1_merge_all.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
batch = 1

objective_1 = 'objective_conversion'
objective_2 = 'objective_selectivity'

# Columns retained from every per-seed results file; all others are dropped.
columns_to_keep = ['step', 'n_experiments',
                   'dmaximin_tradeoff', 'hypervolume completed (%)',
                   f'MAE_{objective_1}', f"MAE_{objective_2}",
                   f'RMSE_{objective_1}', f'RMSE_{objective_2}',
                   f'R2_{objective_1}', f'R2_{objective_2}',
                   f'{objective_1}_best', f'{objective_2}_best'
                   ]

# Number of seeds (0 .. n_seeds - 1) merged per feature set / acquisition.
n_seeds = 5


def merge_seed_frames(frames, keep=None):
    """Stack per-seed result frames, keeping only the columns in *keep*.

    Every frame contributes exactly once. The previous implementation used
    the seed-0 frame as the starting point and then appended seeds 0..4, so
    seed 0 was counted twice in the merged file and in the averages.

    Parameters
    ----------
    frames : iterable of pandas.DataFrame
        One results frame per seed.
    keep : list of str, optional
        Column names to retain. Defaults to the module-level
        ``columns_to_keep``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the trimmed frames (original row index of
        each frame is preserved, as ``DataFrame.append`` used to do).
    """
    keep = columns_to_keep if keep is None else keep
    trimmed = []
    for frame in frames:
        extra = [column for column in frame.columns if column not in keep]
        trimmed.append(frame.drop(columns=extra))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat(trimmed)


def summarise_by_step(df_all, reducer):
    """Reduce *df_all* across seeds for every (step, n_experiments) pair.

    ``agg([reducer])`` is deliberately called with a list so the result
    keeps the two-level column header of the files written so far. The
    grouping keys are re-attached as plain columns because the summaries
    are written with ``index=False``.
    """
    summary = df_all.groupby(['step', 'n_experiments']).agg([reducer])
    summary['step'] = np.unique(df_all.step.values)
    summary['n_experiments'] = np.unique(df_all.n_experiments.values)
    return summary


def main():
    """Write the merged and the avg/min/max summary CSV of every sweep."""
    for feat in ['ohe', 'dft', 'mordred', 'random']:
        for acq in ['EHVI', 'MOUCB', 'MOGreedy']:
            frames = [
                pd.read_csv(f"../results_{feat}/results_benchmark_{feat}_acq_{acq}_batch_{batch}_seed_{seed_i}.csv")
                for seed_i in range(n_seeds)
            ]
            df_all = merge_seed_frames(frames)
            df_all.to_csv(f"./{feat}_{acq}_all.csv", index=False)

            for suffix, reducer in (('avg', np.average),
                                    ('min', np.min),
                                    ('max', np.max)):
                summarise_by_step(df_all, reducer).to_csv(
                    f"./{feat}_{acq}_{suffix}.csv", index=False)


if __name__ == "__main__":
    main()
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/performance/2_plot_ground_truth.py:
--------------------------------------------------------------------------------
1 |
2 | import pandas as pd
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import seaborn as sns
6 | sns.set_style("ticks")
7 | sns.despine()
8 | import matplotlib as mpl
9 | mpl.rcParams['grid.linestyle'] = ':'
10 | mpl.rcParams['grid.linewidth'] = 0.1
11 | plt.rcParams['font.family'] = 'Helvetica'
12 | plt.rcParams['font.size'] = 10
13 | import pareto
14 | from edbo.plus.benchmark.multiobjective_benchmark import is_pareto
15 | from pymoo.mcdm.high_tradeoff import HighTradeoffPoints
16 | from sklearn.preprocessing import MinMaxScaler
17 |
18 | # Hue: Color (ligand), shape (base), filling (solvent), alpha (ligand_eq).
19 |
20 | import seaborn as sns
21 |
# Run selection and sweep settings for this script.
dataset, acq = 'dft', 'EHVI'
batch, seed = 1, 0
total_restarts, n_steps = 5, 30
28 |
29 |
def get_pareto_points(objective_values):
    """Compute the Pareto front of the ground-truth objectives.

    NOTE: Assumes maximization (``maximize_all=True``), and only the first
    two columns of *objective_values* are treated as objectives.

    Returns a tuple ``(front, idx_pareto)``: the result of
    ``pareto.eps_sort`` converted to a numpy array, and whatever
    ``is_pareto`` returns for the negated objectives (presumably a
    per-row Pareto mask - confirm against ``is_pareto``).
    """
    objective_columns = np.arange(2)
    front = pareto.eps_sort(tables=objective_values,
                            objectives=objective_columns,
                            maximize_all=True)
    # is_pareto is fed the negated values, mirroring the maximization above.
    idx_pareto = is_pareto(objectives=-objective_values)
    return np.array(front), idx_pareto
38 |
39 |
def get_high_tradeoff_points(pareto_points):
    """Return the high-tradeoff points of a Pareto front.

    Parameters
    ----------
    pareto_points : numpy.ndarray
        Array with the Pareto points (one row per point).

    Returns
    -------
    numpy.ndarray or list
        ``pareto_points`` indexed by the result of
        ``HighTradeoffPoints().do``, or an empty list when that search
        raises (best-effort behaviour kept from the original).
    """
    pareto_scaled = MinMaxScaler().fit_transform(pareto_points)
    try:
        # The scaled front is negated before the search: always minimizing.
        tradeoff_args = HighTradeoffPoints().do(-pareto_scaled)
        return pareto_points[tradeoff_args]
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still propagate.
        return []
55 |
56 |
# Ground truth, its Pareto front and the high-tradeoff points.
df_exp = pd.read_csv('../data/dataset_B1.csv')
objective_vals = df_exp[['objective_conversion', 'objective_selectivity']].values
pareto_points, idx_pareto = get_pareto_points(objective_vals)
high_tradeoff_points = get_high_tradeoff_points(pareto_points)

# Loaded for the selected run; not referenced by the figures below.
df_benchmark = pd.read_csv(f'../results_{dataset}/results_benchmark_{dataset}_acq_{acq}_batch_{batch}_seed_{seed}.csv')


def _scatter_with_front(axis, hue_column, palette, marker_size):
    """Draw the objective scatter coloured by *hue_column* plus the front."""
    sns.scatterplot(x=df_exp['objective_conversion'], y=df_exp['objective_selectivity'],
                    hue=df_exp[hue_column], s=marker_size, lw=1., edgecolor='black',
                    ax=axis, palette=palette)
    sns.lineplot(x=pareto_points[:, 0], y=pareto_points[:, 1],
                 linewidth=1.2, color='grey', ls='dotted', ax=axis)
    axis.set_xlim(-5, 105)
    axis.set_ylim(-5, 105)
    axis.legend(loc=4)
    axis.set_title(hue_column)


# Figure 1: 2x2 grid, one panel per reaction variable (shown, not saved).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(9, 15))
grid_hues = ('ligand', 'base', 'solvent', 'ligand_equivalent')
grid_palettes = ('tab10', 'viridis', None, 'Blues')
for axis, hue_column, palette in zip(ax.flat, grid_hues, grid_palettes):
    _scatter_with_front(axis, hue_column, palette, 40)

plt.tight_layout()
plt.show()

# Figure 2: ligand and solvent panels side by side, saved as Fig2_scope.svg.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 5))
for axis, hue_column, palette in zip(ax, ('ligand', 'solvent'), ('tab10', None)):
    _scatter_with_front(axis, hue_column, palette, 50)

# Replace the per-panel legends with compact two-column ones.
for axis in ax:
    axis.legend(scatterpoints=1, loc='best', ncol=2, markerscale=1, fontsize=9)

plt.tight_layout()
plt.savefig('Fig2_scope.svg', dpi=500, format='svg')
plt.show()
104 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/performance/3_plot_decision_pathways_objectives.py:
--------------------------------------------------------------------------------
1 |
2 | import pandas as pd
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import seaborn as sns
6 | sns.set_style("ticks")
7 | sns.despine()
8 | import matplotlib as mpl
9 | mpl.rcParams['grid.linestyle'] = ':'
10 | mpl.rcParams['grid.linewidth'] = 0.1
11 | plt.rcParams['font.family'] = 'Helvetica'
12 |
13 |
datasets = ['ohe', 'dft', 'mordred', 'random']
acq = 'EHVI'
batch = 1
total_restarts = 5
n_steps = 30

# One sequential palette per entry of `datasets`; within a palette every
# seed gets its own shade.
color_paletes = [sns.color_palette(palette_name, n_colors=total_restarts)
                 for palette_name in ("Blues", "Reds", "Greens", "Oranges")]

objectives = ['objective_conversion', 'objective_selectivity']

# One figure per feature set: for each objective a trace panel (collected
# value vs. step) followed by a narrow panel with the value distribution.
for cp, dataset in enumerate(datasets):
    dict_ratios_plot = {'width_ratios': [0.5, 0.2, 0.5, 0.2], 'wspace': 0.4}
    fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(10, 3),
                           gridspec_kw=dict_ratios_plot)

    # Each seed's results file serves both objectives, so it is read once
    # per seed instead of once per (objective, seed).
    seed_runs = [
        pd.read_csv(f'../results_{dataset}/results_benchmark_{dataset}_acq_{acq}_batch_{batch}_seed_{seed}.csv')
        for seed in range(total_restarts)
    ]

    # Objectives occupy axes (0, 1) and (2, 3) respectively.
    for obj_counter, obj in zip(range(0, 4, 2), objectives):
        column = f"{obj}_collected_values"

        for seed, df_benchmark in enumerate(seed_runs):
            # (step, collected value) pairs for the first n_steps rows.
            # -1 is the documented "infer this dimension" value of reshape.
            trace_xy = np.reshape(
                [[df_benchmark['step'][i], df_benchmark[column][i]]
                 for i in range(n_steps)],
                (n_steps, -1))
            ax[0+obj_counter].scatter(trace_xy[:, 0], trace_xy[:, 1],
                                      facecolor='white', s=50,
                                      edgecolors=color_paletes[cp][seed],
                                      zorder=100)
            ax[0+obj_counter].plot(trace_xy[:, 0], trace_xy[:, 1],
                                   linestyle='dotted', c=color_paletes[cp][seed],
                                   lw=1.1, alpha=1.)
            ax[0+obj_counter].set_xlim(-1, n_steps+1)
            ax[0+obj_counter].set_ylim(-5, 100+10)
            sns.despine(trim=True, offset=2, ax=ax[0+obj_counter])
            # NOTE(review): sns.distplot is deprecated in recent seaborn
            # releases; kept as-is so the rendered figure does not change.
            sns.distplot(a=df_benchmark, x=df_benchmark[column],
                         ax=ax[1+obj_counter], vertical=True,
                         hist=False,
                         kde_kws={'shade': True,
                                  'color': color_paletes[cp][seed],
                                  'alpha': 0.1},
                         color='black'
                         )

            ax[1+obj_counter].set_xlim(0, 0.025)
            ax[1+obj_counter].set_ylim(-5, 100+10)
            ax[1+obj_counter].axvline(x=0.015, color='black', ls='dotted', alpha=0.5)

        ax[0+obj_counter].set_title(dataset)
        ax[0+obj_counter].set_xlabel('Number of samples collected')
        ax[0+obj_counter].set_ylabel(f"{obj} (in %)")

    plt.savefig(f"fig_3_{cp}.svg", format='svg', dpi=500)
    plt.show()
    # NOTE(review): tight_layout runs after savefig/show, so it does not
    # affect the saved SVG. The order is kept because moving it earlier
    # would override the 'wspace' set above - confirm the intent.
    plt.tight_layout()
84 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/performance/4_plot_performance.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import seaborn as sns
5 | import pandas as pd
6 | import os
7 |
8 |
9 | # sns.set_style("ticks")
10 | # sns.set_context("paper")
11 | import matplotlib as mpl
mpl.rcParams['grid.linestyle'] = ':'
mpl.rcParams['grid.linewidth'] = 0.1

objective_1 = 'conversion'
objective_2 = 'selectivity'

plt.rcParams['font.family'] = 'Helvetica'
# mpl.rc('font', **{'family':'sans-serif', 'sans-serif':['HelveticaLight']})

# Best objectives.
best_conversion_in_scope = 100.
best_selectivity_in_scope = 100.
n_steps = 30
feat_iter = 0

if not os.path.exists('./figures'):
    os.mkdir('figures')

# One colour per featurization, paired in order.
colors = ['#DC143C', '#0343DF', '#FAC205', '#15B01A']
featurizations = ['ohe', 'dft', 'mordred', 'random']


def _read_numeric_csv(path):
    """Read a summary CSV and coerce every column to numbers.

    Cells that cannot be parsed become NaN (``errors='coerce'``).
    """
    return pd.read_csv(path).apply(pd.to_numeric, errors='coerce')


def _first_hit(x_values, y_values, mask):
    """Return ``([x], [y])`` at the first index where *mask* is true.

    Returns ``([], [])`` when *mask* is never true. This replaces the former
    ``try: np.argwhere(...)[0] / except:`` blocks, whose bare ``except`` hid
    every error and, for the hypervolume case, filled in the wrong variables.
    """
    hits = np.argwhere(mask)
    if len(hits) == 0:
        return [], []
    first = hits[0]
    return [x_values[first]], [y_values[first]]


def _plot_band(axis, x, avg, low, high, color, label, reference,
               band_kwargs=None):
    """Draw one summary curve with its min/max envelope on *axis*.

    Draws the average (thick line), the min and max (thin dashed lines), the
    shaded band between them, and a dashed black horizontal line at
    *reference*. Only the average line carries *label*, so a legend built from
    this axis shows one entry per curve instead of three.
    *band_kwargs* is forwarded to both ``fill_between`` calls.
    """
    band_kwargs = {} if band_kwargs is None else band_kwargs
    axis.plot(x, avg, color=color, lw=2.5, label=label)
    axis.plot(x, low, color=color, lw=1., ls='--')
    axis.plot(x, high, color=color, lw=1., ls='--')
    axis.fill_between(x=x, y1=avg, y2=high, color=color, alpha=0.3,
                      **band_kwargs)
    axis.fill_between(x=x, y1=low, y2=avg, color=color, alpha=0.3,
                      **band_kwargs)
    axis.plot(x, np.ones_like(x) * reference,
              dashes=[8, 4], color='black', linewidth=0.8)
    # Zero-size markers: kept from the original script, they draw nothing.
    axis.scatter(x, avg, marker='o', s=0., color=color)


def _format_axis(axis, ylabel, ylim):
    """Apply the shared tick/limit/label settings to one panel."""
    axis.set_xticks(np.arange(0, 120, 10))
    axis.set_xlim(0, n_steps)
    axis.set_ylim(*ylim)
    axis.set_xlabel('Samples')
    axis.set_ylabel(ylabel)


# One 2x2 figure per acquisition function, one curve per featurization.
for acq in ['EHVI', 'MOUCB', 'MOGreedy']:
    fig, ax = plt.subplots(figsize=(8., 8.0), dpi=500, nrows=2, ncols=2)

    for color, feat in zip(colors, featurizations):
        avg = _read_numeric_csv(f"./{feat}_{acq}_avg.csv")
        df_max = _read_numeric_csv(f"./{feat}_{acq}_max.csv")
        df_min = _read_numeric_csv(f"./{feat}_{acq}_min.csv")

        # The first parsed row is skipped in every series below.
        # NOTE(review): presumably that row is a second header line written by
        # the merge step -- confirm against the script that produces the CSVs.
        n_exp = avg['n_experiments'].values[1:]

        # Hypervolume.
        hypervol_max = df_max['hypervolume completed (%)'].values[1:]
        hypervol_min = df_min['hypervolume completed (%)'].values[1:]
        hypervol_avg = avg['hypervolume completed (%)'].values[1:]

        # Where hypervolume is 99% completed (computed but not drawn below).
        hyper_complete_x, hyper_complete_y = _first_hit(
            n_exp, hypervol_avg, hypervol_avg > 99.0)

        # Distance pareto.
        dtradeoff_max = df_max['dmaximin_tradeoff'].values[1:]
        dtradeoff_min = df_min['dmaximin_tradeoff'].values[1:]
        dtradeoff_avg = avg['dmaximin_tradeoff'].values[1:]

        # Best samples at each run.
        bestconversion_max = df_max['objective_conversion_best'].values[1:]
        bestselectivity_max = df_max['objective_selectivity_best'].values[1:]
        bestconversion_min = df_min['objective_conversion_best'].values[1:]
        bestselectivity_min = df_min['objective_selectivity_best'].values[1:]
        bestconversion_avg = avg['objective_conversion_best'].values[1:]
        bestselectivity_avg = avg['objective_selectivity_best'].values[1:]

        # Where best conversion is sampled (computed but not drawn below).
        conversion_complete_x, conversion_complete_y = _first_hit(
            n_exp, bestconversion_max,
            bestconversion_max == best_conversion_in_scope)

        # Where best selectivity is sampled (computed but not drawn below).
        selectivity_complete_x, selectivity_complete_y = _first_hit(
            n_exp, bestselectivity_min,
            bestselectivity_min == best_selectivity_in_scope)

        # Hypervolume panel: reference line at 100 %, band drawn without edge.
        _plot_band(ax[0][0], n_exp, hypervol_avg, hypervol_min, hypervol_max,
                   color, feat.upper(), reference=100,
                   band_kwargs={'lw': 0.})

        # Distance trade-off panel: reference line at 0.
        _plot_band(ax[0][1], n_exp, dtradeoff_avg, dtradeoff_min,
                   dtradeoff_max, color, feat.upper(), reference=0)

        # Best conversion / best selectivity panels.
        # NOTE(review): the reference line is drawn at 0 as in the original,
        # which lies below the (20, 100) y-range of the conversion panel; it
        # may have been meant to sit at best_*_in_scope -- confirm.
        _plot_band(ax[1][0], n_exp, bestconversion_avg, bestconversion_min,
                   bestconversion_max, color, feat.upper(), reference=0)
        _plot_band(ax[1][1], n_exp, bestselectivity_avg, bestselectivity_min,
                   bestselectivity_max, color, feat.upper(), reference=0)

    _format_axis(ax[0][0], 'Hypervolume (%)', (0, 100))
    _format_axis(ax[0][1], r'$d_{(trade-off)}$', (0, 80))
    _format_axis(ax[1][0], 'Best conversion', (20, 100))
    _format_axis(ax[1][1], 'Best selectivity', (0, 100.))

    # The original plt.legend() acted on the current axes, i.e. the last
    # subplot created; name that panel explicitly.
    ax[1][1].legend()
    plt.tight_layout()
    plt.savefig(f"figures/benchmark_{acq}.svg")
    plt.show()
204 |
205 |
206 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/performance/5_find_entry.py:
--------------------------------------------------------------------------------
1 |
2 | import pandas as pd
3 |
4 |
df = pd.read_csv('../data/dataset_B1.csv')

# Candidate entries as (ligand, base, ligand equivalents, solvent).
# Only the LAST entry is looked up; the earlier ones are kept for reference,
# exactly as the successive reassignments in the original script did.
candidate_entries = [
    ('SPhos', 'NaOH(aq.)', 0.0625, 'DMF'),
    ('P(Ph)3', 'KOAc', 0.125, 'MeOH'),
    ('P(Cy)3', 'Cs2CO3(aq.)', 0.125, 'MeOH'),
    ('P(Ph)3', 'NaOH(aq.)', 0.125, 'MeOH'),
    ('P(Ph)3', 'CsF(aq.)', 0.125, 'MeCN'),
]
c_ligand, c_base, c_leq, c_solvent = candidate_entries[-1]

# Row filter on ligand, base and solvent.
# NOTE(review): c_leq is unpacked but not part of the filter -- confirm whether
# the ligand-equivalents column should also be matched.
same_ligand = df['ligand'] == c_ligand
same_base = df['base'] == c_base
same_solvent = df['solvent'] == c_solvent
df_new = df[same_ligand & same_base & same_solvent]

print(df_new)
20 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/performance/7_plot_performance_acquisition_function.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import seaborn as sns
5 | import pandas as pd
6 | import os
7 |
8 |
9 | # sns.set_style("ticks")
10 | # sns.set_context("paper")
11 | import matplotlib as mpl
mpl.rcParams['grid.linestyle'] = ':'
mpl.rcParams['grid.linewidth'] = 0.1

objective_1 = 'conversion'
objective_2 = 'selectivity'

plt.rcParams['font.family'] = 'Helvetica'
# mpl.rc('font', **{'family':'sans-serif', 'sans-serif':['HelveticaLight']})

# Best objectives.
best_conversion_in_scope = 100.
best_selectivity_in_scope = 100.
n_steps = 30
feat_iter = 0

if not os.path.exists('./figures'):
    os.mkdir('figures')


def _read_numeric_csv(path):
    """Read a summary CSV and coerce every column to numbers.

    Cells that cannot be parsed become NaN (``errors='coerce'``).
    """
    return pd.read_csv(path).apply(pd.to_numeric, errors='coerce')


def _first_hit(x_values, y_values, mask):
    """Return ``([x], [y])`` at the first index where *mask* is true.

    Returns ``([], [])`` when *mask* is never true. This replaces the former
    ``try: np.argwhere(...)[0] / except:`` blocks, whose bare ``except`` hid
    every error and, for the hypervolume case, filled in the wrong variables.
    """
    hits = np.argwhere(mask)
    if len(hits) == 0:
        return [], []
    first = hits[0]
    return [x_values[first]], [y_values[first]]


def _plot_band(axis, x, avg, low, high, color, label, reference,
               band_kwargs=None):
    """Draw one summary curve with its min/max envelope on *axis*.

    Draws the average (thick line), the min and max (thin dashed lines), the
    shaded band between them, and a dashed black horizontal line at
    *reference*. Only the average line carries *label*, so a legend built from
    this axis shows one entry per curve instead of three.
    *band_kwargs* is forwarded to both ``fill_between`` calls.
    """
    band_kwargs = {} if band_kwargs is None else band_kwargs
    axis.plot(x, avg, color=color, lw=2.5, label=label)
    axis.plot(x, low, color=color, lw=1., ls='--')
    axis.plot(x, high, color=color, lw=1., ls='--')
    axis.fill_between(x=x, y1=avg, y2=high, color=color, alpha=0.3,
                      **band_kwargs)
    axis.fill_between(x=x, y1=low, y2=avg, color=color, alpha=0.3,
                      **band_kwargs)
    axis.plot(x, np.ones_like(x) * reference,
              dashes=[8, 4], color='black', linewidth=0.8)
    # Zero-size markers: kept from the original script, they draw nothing.
    axis.scatter(x, avg, marker='o', s=0., color=color)


def _format_axis(axis, ylabel, ylim):
    """Apply the shared tick/limit/label settings to one panel."""
    axis.set_xticks(np.arange(0, 120, 10))
    axis.set_xlim(0, n_steps)
    axis.set_ylim(*ylim)
    axis.set_xlabel('Samples')
    axis.set_ylabel(ylabel)


# One colour per acquisition function, paired in order.
colors = ['#DC143C', '#0343DF', '#FAC205']
# NOTE(review): the upper-case prefix yields file names such as
# "./DFT_EHVI_avg.csv"; the sibling featurization plot reads a lower-case
# "dft" prefix. On a case-sensitive file system these are different files --
# confirm which spelling the merge step writes.
feat = 'DFT'
fig, ax = plt.subplots(figsize=(8., 8.), dpi=500, nrows=2, ncols=2)

# A single 2x2 figure with one curve per acquisition function.
for color, acq in zip(colors, ['EHVI', 'MOUCB', 'MOGreedy']):
    avg = _read_numeric_csv(f"./{feat}_{acq}_avg.csv")
    df_max = _read_numeric_csv(f"./{feat}_{acq}_max.csv")
    df_min = _read_numeric_csv(f"./{feat}_{acq}_min.csv")

    # The first parsed row is skipped in every series below.
    # NOTE(review): presumably that row is a second header line written by the
    # merge step -- confirm against the script that produces the CSVs.
    n_exp = avg['n_experiments'].values[1:]

    # Hypervolume.
    hypervol_max = df_max['hypervolume completed (%)'].values[1:]
    hypervol_min = df_min['hypervolume completed (%)'].values[1:]
    hypervol_avg = avg['hypervolume completed (%)'].values[1:]

    # Where hypervolume is 99% completed (computed but not drawn below).
    hyper_complete_x, hyper_complete_y = _first_hit(
        n_exp, hypervol_avg, hypervol_avg > 99.0)

    # Distance pareto.
    dtradeoff_max = df_max['dmaximin_tradeoff'].values[1:]
    dtradeoff_min = df_min['dmaximin_tradeoff'].values[1:]
    dtradeoff_avg = avg['dmaximin_tradeoff'].values[1:]

    # Best samples at each run.
    bestconversion_max = df_max['objective_conversion_best'].values[1:]
    bestselectivity_max = df_max['objective_selectivity_best'].values[1:]
    bestconversion_min = df_min['objective_conversion_best'].values[1:]
    bestselectivity_min = df_min['objective_selectivity_best'].values[1:]
    bestconversion_avg = avg['objective_conversion_best'].values[1:]
    bestselectivity_avg = avg['objective_selectivity_best'].values[1:]

    # Where best conversion is sampled (computed but not drawn below).
    conversion_complete_x, conversion_complete_y = _first_hit(
        n_exp, bestconversion_max,
        bestconversion_max == best_conversion_in_scope)

    # Where best selectivity is sampled (computed but not drawn below).
    selectivity_complete_x, selectivity_complete_y = _first_hit(
        n_exp, bestselectivity_min,
        bestselectivity_min == best_selectivity_in_scope)

    # Hypervolume panel: reference line at 100 %, band drawn without edge.
    _plot_band(ax[0][0], n_exp, hypervol_avg, hypervol_min, hypervol_max,
               color, acq.upper(), reference=100, band_kwargs={'lw': 0.})

    # Distance trade-off panel: reference line at 0.
    _plot_band(ax[0][1], n_exp, dtradeoff_avg, dtradeoff_min, dtradeoff_max,
               color, acq.upper(), reference=0)

    # Best conversion / best selectivity panels.
    # NOTE(review): the reference line is drawn at 0 as in the original, which
    # lies below the (20, 100) y-range of the conversion panel; it may have
    # been meant to sit at best_*_in_scope -- confirm.
    _plot_band(ax[1][0], n_exp, bestconversion_avg, bestconversion_min,
               bestconversion_max, color, acq.upper(), reference=0)
    _plot_band(ax[1][1], n_exp, bestselectivity_avg, bestselectivity_min,
               bestselectivity_max, color, acq.upper(), reference=0)

_format_axis(ax[0][0], 'Hypervolume (%)', (0, 100))
# plt.tick_params(axis="x", direction="in")
# plt.tick_params(axis="y", direction="in")
_format_axis(ax[0][1], r'$d_{(trade-off)}$', (0, 80))
_format_axis(ax[1][0], 'Best conversion', (20, 100))
_format_axis(ax[1][1], 'Best selectivity', (0, 100.))

ax[0][1].legend()
plt.tight_layout()
# plt.savefig(f"figures/benchmark_acquisition_functions.svg")
plt.show()
178 |
179 |
180 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/performance_acq/1_merge_all.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 |
objective_1 = 'objective_conversion'
objective_2 = 'objective_selectivity'
columns_to_keep = ['step', 'n_experiments', 'hypervolume completed (%)']

# Number of benchmark restarts (seeds 0 .. n_seeds - 1) merged per setting.
n_seeds = 5

# For every (batch size, acquisition function) pair: stack the per-seed result
# tables, then write the stacked table and its per-step average / min / max.
for batch in [1, 2, 3, 5]:
    for acq in ['EHVI', 'MOUCB', 'MOGreedy']:

        # Each seed is read exactly once. The previous version loaded seed 0
        # before the loop and again inside it, so that run was counted twice
        # in the average. DataFrame.append (removed in pandas 2.0) is replaced
        # by a single pd.concat.
        per_seed = []
        for seed_i in range(n_seeds):
            df_j = pd.read_csv(f"../results/results_benchmark_dft_acq_{acq}_batch_{batch}_seed_{seed_i}.csv")
            columns_to_drop = list(set(df_j.columns.values) - set(columns_to_keep))
            per_seed.append(df_j.drop(columns=columns_to_drop))
        df_i = pd.concat(per_seed)

        df_i.to_csv(f"./dft_{acq}_{batch}_all.csv", index=False)

        # The group keys become the index, which index=False leaves out of the
        # CSV, so they are written back as ordinary columns.
        # NOTE(review): this assumes one (step, n_experiments) group per
        # unique step, listed in the same sorted order -- confirm.
        steps = np.unique(df_i.step.values)
        n_experiments = np.unique(df_i.n_experiments.values)

        summaries = (
            (np.average, f"./dft_{acq}_{batch}_avg.csv"),
            (np.min, f"./dft_{acq}_{batch}_min.csv"),
            (np.max, f"./dft_{acq}_{batch}_max.csv"),
        )
        for reducer, target in summaries:
            # The reducer is passed as a one-element list on purpose: that
            # yields two-level column labels and therefore an extra header
            # row in the CSV, which the plotting script in this folder skips
            # (it drops the first parsed row).
            df_stat = df_i.groupby(['step', 'n_experiments']).agg([reducer])
            df_stat['step'] = steps
            df_stat['n_experiments'] = n_experiments
            df_stat.to_csv(target, index=False)
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/examples/publication/Suzuki/performance_acq/2_plot_acq_batch.py:
--------------------------------------------------------------------------------
1 | import os.path
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 |
7 |
n_steps = 30
colors = ['#DC143C', '#0343DF', '#FAC205']
feat = 'dft'
fig, ax = plt.subplots(figsize=(15., 4.), dpi=500, nrows=1, ncols=4)

# One panel per batch size; within a panel, one coloured curve per
# acquisition function.
for panel, batch in zip(ax, [1, 2, 3, 5]):
    for shade, acq in zip(colors, ['EHVI', 'MOUCB', 'MOGreedy']):
        avg_table = pd.read_csv(f"./{feat}_{acq}_{batch}_avg.csv").apply(
            pd.to_numeric, errors='coerce')
        max_table = pd.read_csv(f"./{feat}_{acq}_{batch}_max.csv").apply(
            pd.to_numeric, errors='coerce')
        min_table = pd.read_csv(f"./{feat}_{acq}_{batch}_min.csv").apply(
            pd.to_numeric, errors='coerce')

        # The first parsed row of every series is dropped.
        n_exp = avg_table['n_experiments'].values[1:]
        hypervol_max = max_table['hypervolume completed (%)'].values[1:]
        hypervol_min = min_table['hypervolume completed (%)'].values[1:]
        hypervol_avg = avg_table['hypervolume completed (%)'].values[1:]

        # Average curve, shaded min/max band, dashed envelopes, and a dashed
        # black line at 100 %.
        panel.plot(n_exp, hypervol_avg, color=shade, lw=2.5,
                   label=acq.upper())
        panel.fill_between(x=n_exp, y1=hypervol_avg, y2=hypervol_max,
                           color=shade, alpha=0.3, lw=0.)
        panel.fill_between(x=n_exp, y1=hypervol_min, y2=hypervol_avg,
                           color=shade, alpha=0.3, lw=0.)
        for envelope in (hypervol_min, hypervol_max):
            panel.plot(n_exp, envelope, color=shade, alpha=1., lw=1., ls='--')
        panel.plot(n_exp, np.ones_like(n_exp) * 100, dashes=[8, 4],
                   color='black', linewidth=0.8)
        # Zero-size markers: nothing is drawn.
        panel.scatter(n_exp, hypervol_avg, marker='o', s=0., color=shade)

    # Shared axis cosmetics for this panel.
    panel.set_xticks(np.arange(0, 120, 5))
    panel.set_xlim(0, n_steps)
    panel.set_ylim(0, 100)
    panel.set_xlabel('Samples')
    panel.set_ylabel('Hypervolume (%)')

# Legend on the current axes (the last panel created by plt.subplots).
plt.legend()

if not os.path.exists('figures'):
    os.mkdir('figures')

plt.tight_layout()
plt.savefig(f"figures/benchmark_acquisition_functions_batch.svg")
plt.show()
53 |
54 |
--------------------------------------------------------------------------------
/examples/publication/Virtual-experimentation/1_benchmark.py:
--------------------------------------------------------------------------------
1 |
2 | import shutil
3 | from edbo.plus.benchmark.multiobjective_benchmark import Benchmark
4 | import os
5 | import numpy as np
6 | import pandas as pd
7 |
8 |
#######################
# Benchmark inputs
budget = 30

acq = 'EHVI'
seed = 1

# Run one benchmark per (initial sampling method, batch size) pair and collect
# the three files each run leaves behind into ./results.
for sampling_method in ['seed', 'lhs', 'cvtsampling']:
    for batch in [1, 2, 3, 5]:
        # Reload the ground-truth table for every run and add a running row
        # number that is handed to the benchmark as its index column.
        df_exp = pd.read_csv('./data/data.csv')
        df_exp['new_index'] = np.arange(len(df_exp))
        sort_column = 'new_index'

        # Select the features for the model.
        columns_regression = [
            'Temperature', 'Volume', 'D', 'SM2', 'W', 'Mixing', 'Time', 'WB',
        ]

        # Select objectives.
        objectives = ['P', 'I1']
        objective_modes = ['max', 'min']
        objective_thresholds = [None, None]
        print(f"Columns for regression: {columns_regression}")

        label_benchmark = f"benchmark_acq_{acq}_batch_{batch}_{sampling_method}.csv"

        # (file in the working directory, destination under results/)
        outputs = [
            (label_benchmark, f'results/{label_benchmark}'),
            (f'pred_{label_benchmark}', f'results/pred_{label_benchmark}'),
            (f'results_{label_benchmark}', f'results/results_{label_benchmark}'),
        ]

        # Remove previous files.
        for leftover, _ in outputs:
            if os.path.exists(leftover):
                os.remove(leftover)

        bench = Benchmark(
            df_ground=df_exp,
            features_regression=columns_regression,
            objective_names=objectives,
            objective_modes=objective_modes,
            objective_thresholds=objective_thresholds,
            filename=label_benchmark,
            filename_results=f'results_{label_benchmark}',
            index_column=sort_column,
            acquisition_function=acq,
        )

        # The number of steps is the experiment budget divided by the batch
        # size (whole number of steps).
        bench.run(
            steps=budget // batch,
            batch=batch,
            seed=seed,
            init_method=sampling_method,
            plot_train=False,
            plot_predictions=False,
        )

        if not os.path.exists('results'):
            os.mkdir('results')

        for produced, destination in outputs:
            shutil.move(produced, destination)
71 |
--------------------------------------------------------------------------------
/examples/publication/Virtual-experimentation/performance/1_merge_all.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
batch = 1

objective_1 = 'P'
objective_2 = 'I1'

# Columns carried over from the raw benchmark results.
columns_to_keep = [
    'step', 'n_experiments',
    'dmaximin_tradeoff', 'hypervolume completed (%)',
    f'MAE_{objective_1}', f"MAE_{objective_2}",
    f'RMSE_{objective_1}', f'RMSE_{objective_2}',
    f'R2_{objective_1}', f'R2_{objective_2}',
    f'{objective_1}_best', f'{objective_2}_best',
]

acq = 'EHVI'

# For each initial sampling method: trim the results table to the columns
# above, then write it out together with its per-step average / min / max.
for sampling in ['seed', 'lhs', 'cvtsampling']:
    raw = pd.read_csv(f"../results/results_benchmark_acq_{acq}_batch_{batch}_{sampling}.csv")
    unwanted = [name for name in raw.columns.values
                if name not in columns_to_keep]
    df_i = raw.drop(columns=unwanted)

    df_i.to_csv(f"./{sampling}_all.csv", index=False)

    # The group keys end up in the index, which index=False omits from the
    # CSV, so they are re-attached as regular columns.
    step_values = np.unique(df_i.step.values)
    n_experiment_values = np.unique(df_i.n_experiments.values)

    outputs = (
        (np.average, f"./{sampling}_avg.csv"),
        (np.min, f"./{sampling}_min.csv"),
        (np.max, f"./{sampling}_max.csv"),
    )
    for reducer, target in outputs:
        # A one-element list of reducers is kept: it produces two-level
        # column labels, hence a second header row in the written CSV.
        summary = df_i.groupby(['step', 'n_experiments']).agg([reducer])
        summary['step'] = step_values
        summary['n_experiments'] = n_experiment_values
        summary.to_csv(target, index=False)
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/examples/publication/Virtual-experimentation/performance/2_plot_ground_truth.py:
--------------------------------------------------------------------------------
1 | import os.path
2 |
3 | import pandas as pd
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | import seaborn as sns
# Global plot styling. seaborn's "ticks" style is applied first, then the
# individual matplotlib settings below override it.
sns.set_style("ticks")
# NOTE(review): despine() is called before any figure is created by this
# script, so it cannot act on the 3x3 grid built further down -- confirm
# whether it was meant to run after plotting.
sns.despine()
import matplotlib as mpl
mpl.rcParams.update({
    'grid.linestyle': ':',
    'grid.linewidth': 0.1,
    'font.family': 'Helvetica',
    'font.size': 10,
})
import pareto
from edbo.plus.benchmark.multiobjective_benchmark import is_pareto
from pymoo.mcdm.high_tradeoff import HighTradeoffPoints
from sklearn.preprocessing import MinMaxScaler


# NOTE(review): legacy note, apparently describing another dataset's plot --
# Hue: Color (ligand), shape (base), filling (solvent), alpha (ligand_eq).

import seaborn as sns

# Run descriptors; none of them is referenced by the plotting code below.
dataset = 'dft'
acq = 'EHVI'
batch = 1
total_restarts = 5
n_steps = 30
seed = 0
30 |
31 |
def get_pareto_points(objective_values):
    """Compute the Pareto front of the ground-truth objectives.

    NOTE: both objectives are treated as quantities to maximize.

    Parameters
    ----------
    objective_values : numpy array whose first two columns are the objectives
        (it is negated with unary minus, so it must support that).

    Returns
    -------
    tuple
        ``(front, idx)`` where ``front`` is the output of ``pareto.eps_sort``
        converted to a numpy array and ``idx`` is whatever ``is_pareto``
        returns for the negated objectives.
    """
    # Columns 0 and 1 are the objectives, all of them maximized.
    objective_columns = np.arange(2)
    front_rows = pareto.eps_sort(tables=objective_values,
                                 objectives=objective_columns,
                                 maximize_all=True)

    # is_pareto is given the sign-flipped values.
    # NOTE(review): presumably because it treats objectives as minimized --
    # confirm against its definition.
    pareto_index = is_pareto(objectives=-objective_values)

    return np.array(front_rows), pareto_index
40 |
def get_high_tradeoff_points(pareto_points):
    """Select the high trade-off points of a Pareto front.

    Parameters
    ----------
    pareto_points : numpy array with one Pareto point per row.

    Returns
    -------
    The rows of ``pareto_points`` picked by pymoo's ``HighTradeoffPoints``,
    or an empty list when that selection raises an error.
    """
    # Bring every objective to the [0, 1] range before the selection.
    scaler_pareto = MinMaxScaler()
    pareto_scaled = scaler_pareto.fit_transform(pareto_points)
    try:
        tradeoff = HighTradeoffPoints()

        tradeoff_args = tradeoff.do(-pareto_scaled)  # Always minimizing.
        tradeoff_points = pareto_points[tradeoff_args]
    except Exception:
        # Best effort: a failed selection yields "no trade-off points".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        tradeoff_points = []
    return tradeoff_points
56 |
57 |
# Load the ground truth and flip the sign of I1, so that both objective
# columns are handled as maximized by get_pareto_points.
df_exp = pd.read_csv('../data/data.csv')
df_exp['I1'] = -df_exp['I1'].to_numpy()
objective_vals = df_exp[['P', 'I1']].to_numpy()
pareto_points, idx_pareto = get_pareto_points(objective_vals)
high_tradeoff_points = get_high_tradeoff_points(pareto_points)

fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(12, 12))

print(df_exp.columns)

# Colour map and hue column for each panel of the 3x3 grid (row-major).
palettes = [
    ['Reds', 'Reds', 'Blues'],
    ['Greens', 'Oranges', 'Reds'],
    ['Blues', 'Greens', 'Oranges'],
]

hues = [
    ['Temperature', 'Temperature', 'Volume'],
    ['D', 'SM2', 'W'],
    ['Mixing', 'Time', 'WB'],
]

# Every panel shows the same P vs. I1 scatter, coloured by a different input
# variable, with the Pareto front overlaid as a dotted grey line.
for panel_row, hue_row, palette_row in zip(ax, hues, palettes):
    for panel, hue_name, palette_name in zip(panel_row, hue_row, palette_row):
        sns.scatterplot(x=df_exp['P'], y=df_exp['I1'],
                        hue=df_exp[hue_name], s=40, lw=1., edgecolor='black',
                        ax=panel, palette=palette_name)
        sns.lineplot(x=pareto_points[:, 0], y=pareto_points[:, 1],
                     linewidth=1.2, color='grey', ls='dotted', ax=panel)
        # panel.set_xlim(-5, 105)
        # panel.set_ylim(-5, 105)
        panel.legend(loc=3)
        panel.set_title(hue_name)

# The top-left panel duplicates the 'Temperature' panel next to it and is
# removed from the figure.
fig.delaxes(ax[0][0])
plt.tight_layout()

if not os.path.exists('../plots'):
    os.mkdir('../plots')
plt.savefig('../plots/SI_ground_truth.svg', dpi=500, format='svg')
plt.show()
95 |
--------------------------------------------------------------------------------
/examples/publication/Virtual-experimentation/performance/3_plot_performance_acquisition_function.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import seaborn as sns
5 | import pandas as pd
6 | import os
7 |
8 |
9 | # sns.set_style("ticks")
10 | # sns.set_context("paper")
11 | import matplotlib as mpl
12 | mpl.rcParams['grid.linestyle'] = ':'
13 | mpl.rcParams['grid.linewidth'] = 0.1
14 |
15 | objective_1 = 'P'
16 | objective_2 = 'I1'
17 |
18 | plt.rcParams['font.family'] = 'Helvetica'
19 | # mpl.rc('font', **{'family':'sans-serif', 'sans-serif':['HelveticaLight']})
20 |
21 | # Best objectives.
22 | best_P_in_scope = 100.
23 | best_I1_in_scope = 100.
24 | n_steps = 30
25 |
26 | if not os.path.exists('./figures'):
27 | os.mkdir('figures')
28 |
29 |
30 | colors = ['#DC143C', '#0343DF', '#FAC205']
31 | color_i = 0
32 | fig, ax = plt.subplots(figsize=(8., 8.), dpi=500, nrows=2, ncols=2)
33 |
34 | acq = 'EHVI'
35 | for sampling in ['seed', 'lhs', 'cvtsampling']:
36 |
37 | avg = pd.read_csv(f"./{sampling}_avg.csv")
38 |
39 | avg = avg.apply(pd.to_numeric, errors='coerce')
40 | max = pd.read_csv(f"./{sampling}_max.csv")
41 | max = max.apply(pd.to_numeric, errors='coerce')
42 | min = pd.read_csv(f"./{sampling}_min.csv")
43 | min = min.apply(pd.to_numeric, errors='coerce')
44 |
45 | n_exp = avg['n_experiments'].values[1:]
46 |
47 | # Hypervolume.
48 | hypervol_max = max['hypervolume completed (%)'].values[1:]
49 | hypervol_min = min['hypervolume completed (%)'].values[1:]
50 | hypervol_avg = avg['hypervolume completed (%)'].values[1:]
51 |
52 | # Where hypervolume is 99% completed.
53 | try:
54 | hyper_complete_arg = np.argwhere(hypervol_avg > 99.0)[0]
55 | hyper_complete_y = [hypervol_avg[hyper_complete_arg]]
56 | hyper_complete_x = [n_exp[hyper_complete_arg]]
57 | except:
58 | P_complete_x = []
59 | P_complete_y = []
60 |
61 | # Distance pareto.
62 | dtradeoff_max = max['dmaximin_tradeoff'].values[1:]
63 | dtradeoff_min = min['dmaximin_tradeoff'].values[1:]
64 | dtradeoff_avg = avg['dmaximin_tradeoff'].values[1:]
65 |
66 |
67 | # Best samples at each run.
68 | bestP_max = max[f'{objective_1}_best'].values[1:]
69 | bestI1_max = max[f'{objective_2}_best'].values[1:]
70 | bestP_min = min[f'{objective_1}_best'].values[1:]
71 | bestI1_min = min[f'{objective_2}_best'].values[1:]
72 | bestP_avg = avg[f'{objective_1}_best'].values[1:]
73 | bestI1_avg = avg[f'{objective_2}_best'].values[1:]
74 |
75 | # Where best P is sampled.
76 | try:
77 | P_complete_arg = np.argwhere(bestP_max == best_P_in_scope)[0]
78 | P_complete_y = [bestP_max[P_complete_arg]]
79 | P_complete_x = [n_exp[P_complete_arg]]
80 | except:
81 | P_complete_x = []
82 | P_complete_y = []
83 |
84 | # Where best I1 is sampled.
85 | try:
86 | I1_complete_arg = np.argwhere(bestI1_min == best_I1_in_scope)[0]
87 | I1_complete_y = [bestI1_min[I1_complete_arg]]
88 | I1_complete_x = [n_exp[I1_complete_arg]]
89 | except:
90 | I1_complete_x = []
91 | I1_complete_y = []
92 |
93 | # Plot performance for each acquisition function.
94 | ax[0][0].plot(n_exp, hypervol_avg, color=colors[color_i], lw=2.5, label=sampling.upper())
95 | ax[0][0].fill_between(x=n_exp, y1=hypervol_avg, y2=hypervol_max, color=colors[color_i], alpha=0.3, lw=0.)
96 | ax[0][0].fill_between(x=n_exp, y1=hypervol_min, y2=hypervol_avg, color=colors[color_i], alpha=0.3, lw=0.)
97 | ax[0][0].plot(n_exp, hypervol_min, color=colors[color_i], alpha=1., lw=1., ls='--')
98 | ax[0][0].plot(n_exp, hypervol_max, color=colors[color_i], alpha=1., lw=1., ls='--')
99 | # ax[0][0].plot(n_exp, np.ones_like(n_exp)*100, dashes=[8, 4], color='black', linewidth=0.8)
100 | ax[0][0].scatter(n_exp, hypervol_avg, marker='o', s=0., color=colors[color_i])
101 |
102 | ax[0][0].set_xticks(np.arange(0, 120, 5))
103 | ax[0][0].set_xlim(0, n_steps)
104 |
105 | # ax[0][0].set_ylim(40, 100)
106 | ax[0][0].set_xlabel('Samples')
107 | ax[0][0].set_ylabel('Hypervolume (%)')
108 | # plt.tick_params(axis="x", direction="in")
109 | # plt.tick_params(axis="y", direction="in")
110 |
111 | # Plot distance tradeoff.
112 | ax[0][1].plot(n_exp, dtradeoff_avg, color=colors[color_i], lw=2.5, label=sampling.upper())
113 | ax[0][1].plot(n_exp, dtradeoff_min, color=colors[color_i], lw=1., ls='--', label=sampling.upper())
114 | ax[0][1].plot(n_exp, dtradeoff_max, color=colors[color_i], lw=1., ls='--', label=sampling.upper())
115 | ax[0][1].fill_between(x=n_exp, y1=dtradeoff_avg, y2=dtradeoff_max, color=colors[color_i], alpha=0.3)
116 | ax[0][1].fill_between(x=n_exp, y1=dtradeoff_min, y2=dtradeoff_avg, color=colors[color_i], alpha=0.3)
117 | # ax[0][1].plot(n_exp, np.ones_like(n_exp) * 0, dashes=[8, 4], color='black', linewidth=0.8)
118 | ax[0][1].scatter(n_exp, dtradeoff_avg, marker='o', s=0., color=colors[color_i])
119 |
120 | ax[0][1].set_xticks(np.arange(0, 120, 5))
121 | ax[0][1].set_xlim(0, n_steps)
122 | # ax[0][1].set_ylim(0, 80)
123 | ax[0][1].set_xlabel('Samples')
124 | ax[0][1].set_ylabel(r'$d_{(trade-off)}$')
125 |
126 | # Plot best P.
127 | ax[1][0].plot(n_exp, bestP_avg, color=colors[color_i], lw=2.5, label=sampling)
128 | ax[1][0].plot(n_exp, bestP_min, color=colors[color_i], lw=1, ls='--', label=sampling, alpha=1.)
129 | ax[1][0].plot(n_exp, bestP_max, color=colors[color_i], lw=1, ls='--', label=sampling, alpha=1.)
130 | # ax[1][0].fill_between(x=n_exp, y1=bestP_avg, y2=bestP_max, color=colors[color_i], alpha=0.3)
131 | # ax[1][0].fill_between(x=n_exp, y1=bestP_min, y2=bestP_avg, color=colors[color_i], alpha=0.3)
132 |
133 | # ax[1][0].plot(n_exp, np.ones_like(n_exp) * 0,
134 | # dashes=[8, 4], color='black', linewidth=0.8)
135 | ax[1][0].scatter(n_exp, bestP_avg, marker='o', s=0.,
136 | color=colors[color_i])
137 |
138 | ax[1][0].set_xticks(np.arange(0, 120, 5))
139 | ax[1][0].set_xlim(0, n_steps)
140 | # ax[1][0].set_ylim(0.8, 1.1)
141 | ax[1][0].set_xlabel('Samples')
142 | ax[1][0].set_ylabel('Best P')
143 |
144 | # Plot best I1.
145 | ax[1][1].plot(n_exp, bestI1_avg, color=colors[color_i], lw=2.5,
146 | label=sampling.upper())
147 |
148 | ax[1][1].plot(n_exp, bestI1_min, color=colors[color_i], lw=1.0, ls='--',
149 | label=sampling.upper())
150 | ax[1][1].plot(n_exp, bestI1_max, color=colors[color_i], lw=1.0, ls='--',
151 | label=sampling.upper())
152 |
153 | ax[1][1].fill_between(x=n_exp,
154 | y1=bestI1_avg,
155 | y2=bestI1_max, color=colors[color_i], alpha=0.3,
156 | )
157 | ax[1][1].fill_between(x=n_exp,
158 | y1=bestI1_min,
159 | y2=bestI1_avg, color=colors[color_i], alpha=0.3,
160 | )
161 | # ax[1][1].plot(n_exp, np.ones_like(n_exp) * 0,
162 | # dashes=[8, 4], color='black', linewidth=0.8)
163 | ax[1][1].scatter(n_exp, bestI1_avg, marker='o', s=0.,
164 | color=colors[color_i])
165 |
166 |
167 | ax[1][1].set_xticks(np.arange(0, 120, 5))
168 | ax[1][1].set_xlim(0, n_steps)
169 | ax[1][1].set_ylim(0.0, 0.005)
170 | ax[1][1].set_xlabel('Samples')
171 | ax[1][1].set_ylabel('Best I1')
172 |
173 | color_i += 1
174 |
175 | ax[0][1].legend()
176 | plt.tight_layout()
177 | plt.savefig(f"figures/benchmark_sampling.svg")
178 | plt.show()
179 |
180 |
181 |
--------------------------------------------------------------------------------
/examples/publication/Virtual-experimentation/performance/4_hypervol_sampling.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import seaborn as sns
5 | import pandas as pd
6 | import os
7 |
8 |
9 | # sns.set_style("ticks")
10 | # sns.set_context("paper")
11 | import matplotlib as mpl
# Plot the completed hypervolume (%) against the number of samples for each
# initial-sampling strategy (one panel per strategy) and each batch size
# (one line per batch size), then save the figure as an SVG.

# Draw grid lines as thin dotted strokes.
mpl.rcParams['grid.linestyle'] = ':'
mpl.rcParams['grid.linewidth'] = 0.1

# NOTE(review): the names below are not read anywhere in this script; they
# are kept so the module-level namespace matches the other benchmark scripts.
objective_1 = 'P'
objective_2 = 'I1'

plt.rcParams['font.family'] = 'Helvetica'

# Best objectives.
best_conversion_in_scope = 100.
best_selectivity_in_scope = 100.

# Only rows with at most this many experiments are plotted.
n_experiments = 30
feat_iter = 0

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs('figures', exist_ok=True)

fig, ax = plt.subplots(figsize=(7., 4.0), dpi=500, nrows=1, ncols=3)

# One colour / opacity per batch size, in the same order as `batch_sizes`.
colors_sampling = ['#DC143C', '#0343DF', '#FAC205', '#15B01A']
alphas = [0.4, 0.6, 0.7, 1.0]

acq = 'EHVI'  # acquisition function whose results are loaded
sampling_methods = ['seed', 'lhs', 'cvtsampling']
batch_sizes = [1, 2, 3, 5]

for panel, sampling_method in zip(ax, sampling_methods):
    for batch, color, alpha in zip(batch_sizes, colors_sampling, alphas):
        df_i = pd.read_csv(
            f'../results/results_benchmark_acq_{acq}_batch_{batch}_{sampling_method}.csv'
        )
        df_i = df_i[df_i['n_experiments'] <= n_experiments]

        # Hypervolume as a function of the number of experiments.
        n_exp = df_i['n_experiments'].values
        hypervol = df_i['hypervolume completed (%)'].values

        panel.plot(n_exp, hypervol, color=color, lw=2.5,
                   label=f"{batch}", alpha=alpha)

    # Panel decoration does not depend on the batch size, so set it once.
    panel.set_title(sampling_method)
    panel.set_xlabel('Samples')
    panel.set_ylabel('Hypervolume (%)')
    panel.set_ylim(80, 100)

# NOTE(review): the legend is drawn on the last panel only (the axis the
# original `ax[i]` referred to once the loops had finished) -- confirm this
# is the intended placement rather than one legend per panel.
ax[-1].legend()
plt.tight_layout()
plt.savefig("figures/benchmark_hypervol.svg")

plt.show()
68 |
69 |
--------------------------------------------------------------------------------
/examples/tutorials/2_EDBO_WebApp_Tutorial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doyle-lab-ucla/edboplus/f5d1687e834d33c77598767c5383a919f9a58034/examples/tutorials/2_EDBO_WebApp_Tutorial.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | botorch==0.5.0
2 | gpytorch==1.5.1
3 | idaes-pse==1.5.1
4 | ipykernel==6.5.1
5 | ipython==7.29.0
6 | ipywidgets==7.6.5
7 | Jinja2==3.0.3
8 | joypy==0.2.6
9 | lxml==4.6.4
10 | mordred==1.2.0
11 | numpy==1.21.5
12 | ordered-set==4.0.2
13 | pandas==1.3.4
14 | pareto==1.1.1.post3
15 | pymoo==0.5.0
16 | scikit-learn==1.0.1
17 | scipy==1.7.2
18 | seaborn
19 | matplotlib
20 | sympy==1.9
21 | torch==1.10.0
22 | tqdm
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
# Use README.md as the project's long description. The previous
# dash-separated `description-file` key is a legacy spelling that setuptools
# does not map to any metadata field and warns about.
long_description = file: README.md
long_description_content_type = text/markdown
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""Packaging script for the ``edbo`` distribution."""
from setuptools import find_namespace_packages, find_packages, setup

setup(
    name='edbo',
    # Discover `edbo` and every package nested under it. Listing only
    # ['edbo'] would leave the nested packages (the repository tree shows
    # edbo/plus and edbo/plus/benchmark) out of a regular, non-editable
    # install. The namespace-aware finder is used so discovery works whether
    # or not the top-level `edbo` directory has its own __init__.py.
    packages=find_namespace_packages(include=['edbo', 'edbo.*']),
    version='0.2.0',
    author='Jose A. Garrido Torres & Abigail Gutmann Doyle',
    author_email='josegarridotorres@me.com',
    url='https://github.com/doyle-lab-ucla/edboplus',
    keywords=['Bayesian Optimization', 'Chemical Reaction Optimization'],
    license='MIT',
    description='Bayesian reaction optimization as a tool for chemical synthesis.',
    # Same entries as requirements.txt.
    install_requires=[
        'botorch==0.5.0',
        'gpytorch==1.5.1',
        'idaes-pse==1.5.1',
        'ipykernel==6.5.1',
        'ipython==7.29.0',
        'ipywidgets==7.6.5',
        'Jinja2==3.0.3',
        'joypy==0.2.6',
        'lxml==4.6.4',
        'mordred==1.2.0',
        'numpy==1.21.5',
        'ordered-set==4.0.2',
        'pandas==1.3.4',
        'pareto==1.1.1.post3',
        'pymoo==0.5.0',
        'scikit-learn==1.0.1',
        'scipy==1.7.2',
        'seaborn',
        'matplotlib',
        'sympy==1.9',
        'torch==1.10.0',
        'tqdm',
    ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering :: Chemistry',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.8',
    ],
)
--------------------------------------------------------------------------------