├── .gitmodules ├── LICENSE ├── README.md ├── Run_PREFER.ipynb ├── __init__.py ├── analysis_notebooks ├── Best_PREFER_model_VS_RF.ipynb ├── Plot_performance_distributions_for_representation.ipynb ├── README.txt └── TestSet_Bootstrapping.ipynb ├── api_version.txt ├── cddd-environment-light.yml ├── cddd-environment.yml ├── compute_model_based_representations.py ├── config_files ├── config_PREFER_logD.yaml ├── config_PREFER_smalldata.yaml ├── config_PREFER_solub.yaml └── config_model_based_representations.yaml ├── moler-environment-light.yml ├── moler-environment.yml ├── prefer-environment.yml ├── prefer ├── __init__.py ├── azure_ml │ ├── README.md │ ├── aml_config.py │ ├── aml_configuration │ │ └── aml_config.json │ ├── config_logD_azure.yaml │ ├── exceptions.py │ ├── get_model_utils.py │ ├── included_prefixes.json │ ├── model_registration_prefer.py │ ├── model_registration_utils.py │ ├── reproducibility.py │ ├── schedule_global_model_pipeline.py │ ├── telemetry_utils.py │ └── utils.py ├── docs │ ├── PREFER_scheme.png │ └── SaltsMod.txt ├── model_based_representations │ ├── __init__.py │ ├── cddd_wrapper.py │ ├── interface.py │ ├── model_based_representations_factory.py │ ├── models │ │ └── __init__.py │ └── moler_wrapper.py ├── molecule_representations │ ├── __init__.py │ ├── descriptors2D_representations_builder.py │ ├── fingerprints_representations_builder.py │ └── model_representations_builder.py ├── schema │ ├── __init__.py │ └── config.py ├── scripts │ ├── __init__.py │ ├── aml_context.py │ ├── combine_results.py │ ├── get_representations.py │ ├── model_wrapper.py │ ├── run_PREFER.py │ └── utils.py ├── src │ ├── __init__.py │ ├── benchmarking.py │ ├── molecule_representations.py │ ├── molecule_representations_builder.py │ ├── prefer_model_wrapper.py │ └── vector_molecule_representations.py ├── tests │ ├── __init__.py │ ├── data_for_test │ │ └── logDPublic.csv │ ├── file_for_test │ │ ├── config_PREFER_test_custom_autosklearn.yaml │ │ └── 
logD_desirability_scores.yaml │ ├── test_autosklearn_customization.py │ ├── test_check_input_dataframe.py │ ├── test_data_preparation.py │ ├── test_filtering.py │ ├── test_helpers.py │ ├── test_prefer_model_wrapper.py │ └── test_scripts.py └── utils │ ├── __init__.py │ ├── automation.py │ ├── check_input_dataframe.py │ ├── data_preparation.py │ ├── data_utils.py │ ├── features_scaling.py │ ├── filtering.py │ ├── mapping.py │ ├── models_evaluation.py │ ├── models_utils.py │ ├── post_processing_and_optimization_helpers.py │ ├── random_utils.py │ ├── run_automl.py │ ├── save_load.py │ └── splitting_strategies.py ├── pyproject.toml ├── run_prefer_automation.py ├── setup.py └── small_data_experiments ├── README_smalldata.txt ├── analysis_smalldata_example.ipynb ├── extract_zipped_files.ipynb ├── run_PREFER_smalldata_example.ipynb └── run_prefer_automation_smalldata.py /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "prefer/model_based_representations/models/cddd"] 2 | path = prefer/model_based_representations/models/cddd 3 | url = https://github.com/jrwnter/cddd 4 | [submodule "prefer/model_based_representations/models/molecule-generation"] 5 | path = prefer/model_based_representations/models/molecule-generation 6 | url = https://github.com/microsoft/molecule-generation 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, RDKit 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. 
Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking and Property Prediction Framework (PREFER) 2 | 3 | The PREFER framework automatizes the evaluation of different combinations of molecular representations and machine learning models for predicting molecular properties. 4 | It covers different molecular representation from classical, e.g. Fingerprints and 2D Descriptors, to data-driven representations, e.g. Continuous and Data Driven representations (CDDD) [1] or MoLeR[2]. 5 | PREFER uses AutoSklearn [3] to implement the ML model selection and the hyperparameter tuning. 
6 | 7 | ![caption](prefer/docs/PREFER_scheme.png) 8 | 9 | *General overview of the PREFER framework where the Model Selection part is based on [3].* 10 | 11 | ## Getting Started 12 | 13 | ### Installation 14 | 15 | #### Python Environment 16 | The main conda environment for using PREFER can be installed from `prefer-environment.yml`, as follows: 17 | 18 | ``` 19 | conda env create -f prefer-environment.yml 20 | ``` 21 | 22 | Depending on the models employed to generate model-based molecular representations, other environments need to be installed (one for each model). The supported models in the current PREFER code are CDDD [1] and MoLeR [2]. The corresponding environments can be found in `moler-environment-light.yml` and `cddd-environment-light.yml` and can be installed as follows: 23 | 24 | ``` 25 | conda env create -f moler-environment-light.yml 26 | OR 27 | conda env create -f cddd-environment-light.yml 28 | ``` 29 | 30 | Before running any experiments, relevant paths need to be set (including cddd and moler folders which are integrated in PREFER as git submodules), as follows: 31 | 32 | ``` 33 | PYTHONPATH="path_to/PREFER/prefer/model_based_representations/models/cddd/:path_to/PREFER/prefer/model_based_representations/models/molecule-generation/:path_to/PREFER/:$PYTHONPATH" 34 | export PYTHONPATH 35 | ``` 36 | 37 | New models should be included as git submodules and added to the PYTHONPATH. 38 | 39 | #### Conda Environments in Jupyter 40 | To use the PREFER conda environment in a Jupyter notebook, the environment needs to be added to Jupyter's kernelspec: 41 | 42 | ``` 43 | conda activate prefer-env 44 | python -m ipykernel install --user --name prefer-env --display-name "Python (prefer-env)" 45 | ``` 46 | 47 | Check that Jupyter has access to this environment by running 48 | 49 | ``` 50 | jupyter kernelspec list 51 | ``` 52 | 53 | The recently added env `Python (prefer-env)` should be available now in Jupyter.
54 | 55 | 56 | 57 | ## Prerequisites 58 | 59 | In order to run PREFER, we provide one notebook (Run_PREFER.ipynb) and one python script (run_prefer_automation.py) 60 | 61 | Main steps are as follows: 62 | 63 | ### STEP 0: clone the repository and unpack the git submodules 64 | Once you have cloned this repository, please go into your cloned folder and run the following commands: 65 | 66 | ``` 67 | git submodule update --init --recursive 68 | ``` 69 | 70 | This is needed to unpack the git submodules used to connect PREFER to the models used to compute the model-based representations. 71 | 72 | ### STEP 1: download public test datasets 73 | Two public datasets can be used to test the code: 74 | - [logD](https://www.ebi.ac.uk/chembl/g/#browse/activities/filter/document_chembl_id%3ACHEMBL3301361%20AND%20standard_type%3A(%22LogD7.4%22)) from ChEMBL 75 | - [solubility](https://pubchem.ncbi.nlm.nih.gov/bioassay/1996) from PubChem 76 | 77 | ### STEP 2: download models for calculating data-based molecular representations 78 | Two models are currently supported as submodules in PREFER: CDDD and MOLER. 79 | Pre-trained models can be downloaded from: 80 | 81 | - CDDD: [here](https://drive.google.com/open?id=1oyknOulq_j0w9kzOKKIHdTLo5HphT99h) 82 | - MOLER: [here](https://figshare.com/ndownloader/files/34642724) 83 | 84 | Save these trained models locally, since they will be used afterwards. 85 | 86 | 87 | ### STEP 3: set the configuration files 88 | For each PREFER job a yaml config file needs to be prepared as follows: 89 | 90 | 1. 
Main settings: 91 | ``` 92 | path_to_df: 'path_to_df' 93 | experiment_name: 'experiment_name' 94 | id_column_name: 'id_column_name' 95 | smiles_column_name: 'smiles_column_name' 96 | properties_column_name_list: 97 | - 'property_1_col_name' 98 | - 'property_2_col_name' 99 | problem_type: 'regression' # or 'classification' 100 | splitting_strategy: 'random' # or 'cluster' or 'temporal' 101 | temporal_info_column_name: 'temporal_info_column_name' 102 | ``` 103 | 104 | Examples are provided in ./config_files. 105 | 106 | 2. Settings for model based representations: 107 | ``` 108 | model_based_representations: 109 | 'model_name': 110 | 'path_to_model': 'path to model folder'(see STEP2) 111 | 'conda_env': 'name of the conda env installed for this model' 112 | 'submodule_path': 'path to the submodule folder included in PREFER for running the model'(e.g. path_to/prefer/model_based_representations/models/cddd/) 113 | 114 | prefer_path: 'path_to_/PREFER/' 115 | ``` 116 | 117 | An example configuration file for the representations is provided in ./config_files/config_model_based_representations.yaml. 118 | 119 | 120 | 121 | ### STEP 4: run the Run_PREFER.ipynb notebook 122 | To run the notebook `Run_PREFER.ipynb`, first of all select the correct kernel (Python (prefer-env)) and then change the needed paths, in particular: 123 | 124 | - sys.path.append('path_to/PREFER/') 125 | - sys.path.append('path_to/models/cddd/') # to connect CDDD model 126 | - sys.path.append('path_to/models/molecule-generation/') # to connect MOLER model 127 | 128 | By running the notebook a folder (PREFER_results) will be created with the main results (benchmarking object and models). 129 | Moreover different folders with structure {model_name}_representations_{experiment_name} will be created containing the model_based representations. 130 | 131 | In the notebook one can also find an example of how to use the stored PREFER-model-wrapper to predict new samples.
This way the best model found for each molecular representation can be used later to predict the property under analysis. 132 | 133 | An automated version of the notebook can be found in `run_prefer_automation.py`. You can run it from the terminal with the following commands: 134 | 135 | ``` 136 | conda activate prefer-env 137 | 138 | PYTHONPATH="path_to/PREFER/prefer/model_based_representations/models/cddd/:path_to/PREFER/prefer/model_based_representations/models/molecule-generation/:path_to/PREFER/:$PYTHONPATH" 139 | export PYTHONPATH 140 | 141 | python run_prefer_automation.py --prefer_args path_to_yaml_configuration_file(see STEP3) --model_based_representations_args path_to_yaml_configuration_file_for_models_used_to_compute_the_representations(see STEP3) 142 | ``` 143 | 144 | 145 | ## WARNING: 146 | Please make sure that you select the right model type according to the dataset used (e.g. for a classification model binary labels should be provided in the dataset). 147 | 148 | ## Authors 149 | 150 | * **Jessica Lanini** 151 | 152 | With the contribution of 153 | - Nadine Schneider 154 | - Gianluca Santarossa 155 | - Sarah Lewis 156 | - Krzysztof Maziarz 157 | - Marwin Segler 158 | - Hubert Misztela 159 | 160 | 161 | ## References 162 | [1] Winter, Robin, et al. "Learning continuous and data-driven molecular descriptors by translating equivalent chemical representations." Chemical science 10.6 (2019): 1692-1701. 163 | 164 | [2] Maziarz, Krzysztof, et al. "Learning to extend molecular scaffolds with structural motifs." arXiv preprint arXiv:2103.03864 (2021). 165 | 166 | [3] Feurer, Matthias, et al. "Efficient and robust automated machine learning." Advances in neural information processing systems 28 (2015).
167 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/__init__.py -------------------------------------------------------------------------------- /analysis_notebooks/Plot_performance_distributions_for_representation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Plot performances for each molecular representation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "After running TestSet_Bootstrapping.ipynb a .pkl file containing the final performances for each molecular representation should have been created. This notebook will then plot the results and perform a statistical analysis. " 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Imports" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "tags": [] 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import sys\n", 33 | "%load_ext autoreload\n", 34 | "# path to the main directory\n", 35 | "path_to_PREFER = 'path_to/PREFER/'\n", 36 | "# path to submodules\n", 37 | "path_to_cddd = 'path_to/PREFER/prefer/model_based_representations/models/cddd/'\n", 38 | "path_to_moler = 'path_to/PREFER/prefer/model_based_representations/models/molecule-generation/'\n", 39 | "sys.path.append(path_to_PREFER)\n", 40 | "sys.path.append(path_to_cddd)\n", 41 | "sys.path.append(path_to_moler)\n", 42 | "import warnings\n", 43 | "warnings.filterwarnings('ignore')\n", 44 | "from prefer.utils.filtering import *\n", 45 | "import sys" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "tags": [] 53 | 
}, 54 | "outputs": [], 55 | "source": [ 56 | "from prefer.utils.post_processing_and_optimization_helpers import create_heat_map\n", 57 | "from prefer.utils.automation import merge_table_metrics, data_preparation, generate_molecular_representations, run, create_comparison_table" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "### Folders where to find models" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "import pickle \n", 74 | "name = \"final_dict_['publicSolubility', 'publicLogD'].pickle\"\n", 75 | "with open(name, 'rb') as handle:\n", 76 | " dict1 = pickle.load(handle)\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import pandas as pd\n", 86 | "df1 = pd.DataFrame()\n", 87 | "repr_vect = []\n", 88 | "value_vect = []\n", 89 | "for repr_ in dict1['autosklearn']['publicSolubility'].keys():\n", 90 | " for elem in dict1['autosklearn']['publicSolubility'][repr_]:\n", 91 | " repr_vect.append(repr_)\n", 92 | " value_vect.append(elem)\n", 93 | "df1['Representation'] = repr_vect\n", 94 | "df1['∆AUPRC'] = value_vect" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "import pandas as pd\n", 104 | "df2 = pd.DataFrame()\n", 105 | "repr_vect = []\n", 106 | "value_vect = []\n", 107 | "for repr_ in dict1['autosklearn']['publicLogD'].keys():\n", 108 | " for elem in dict1['autosklearn']['publicLogD'][repr_]:\n", 109 | " repr_vect.append(repr_)\n", 110 | " value_vect.append(elem)\n", 111 | "df2['Representation'] = repr_vect\n", 112 | "df2['R2'] = value_vect" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "tags": [] 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "from scipy import 
stats\n", 124 | "collect_stats = []\n", 125 | "for exper in dict1['autosklearn'].keys():\n", 126 | " print(exper)\n", 127 | " collect_group = []\n", 128 | " for repr_ in dict1['autosklearn'][exper].keys():\n", 129 | " collect_group.append(dict1['autosklearn'][exper][repr_])\n", 130 | " #perform Friedman Test\n", 131 | " collect_stats.append(stats.friedmanchisquare(collect_group[0], collect_group[1], collect_group[2], collect_group[3]))\n", 132 | "\n", 133 | "collect_stats" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "tags": [] 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "import seaborn as sns\n", 145 | "import matplotlib.pyplot as plt\n", 146 | "plt.style.use('fivethirtyeight')\n", 147 | "fig, axes = plt.subplots(1, 3, figsize=(10, 5), sharey=True)\n", 148 | "fig.suptitle('Performances', size = 15)\n", 149 | "sns.set(font_scale=0.8)\n", 150 | "sns.violinplot(ax=axes[0], x='Representation', y='∆AUPRC', data=df1.sort_values('Representation'));\n", 151 | "axes[0].set_title('LE-MDCK', size = 15)\n", 152 | "axes[0].tick_params(axis='x', rotation=45)\n", 153 | "\n", 154 | "\n", 155 | "plt.savefig(f'classification_results_with_mean_and_std.png', bbox_inches='tight', transparent=True)\n", 156 | "\n", 157 | "plt.show()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "import seaborn as sns\n", 167 | "import matplotlib.pyplot as plt\n", 168 | "plt.style.use('fivethirtyeight')\n", 169 | "fig, axes = plt.subplots(1, 3, figsize=(10, 5), sharey=True)\n", 170 | "fig.suptitle('Performances', size = 15)\n", 171 | "sns.set(font_scale=0.8)\n", 172 | "sns.violinplot(ax=axes[0], x='Representation', y='R2', data=df2.sort_values('Representation'));\n", 173 | "axes[0].set_title('logD', size = 15)\n", 174 | "axes[0].tick_params(axis='x', rotation=45)\n", 175 | "\n", 176 | "\n", 177 | 
"plt.savefig(f'regression_results_with_mean_and_std.png', bbox_inches='tight', transparent=True)\n", 178 | "\n", 179 | "plt.show()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "income_groups = [df1.loc[df1['Representation']==repr_, '∆AUPRC'].values for repr_ in df1['Representation'].dropna().unique()]\n", 189 | "stat, p_value = f_oneway(*income_groups)\n", 190 | "print(f\"F Test: statistic={stat:.4f}, p-value={p_value:.4f}\")" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "income_groups = [df2.loc[df2['Representation']==repr_, 'R2'].values for repr_ in df2['Representation'].dropna().unique()]\n", 200 | "stat, p_value = f_oneway(*income_groups)\n", 201 | "print(f\"F Test: statistic={stat:.4f}, p-value={p_value:.4f}\")" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python (prefer-env-released2)", 208 | "language": "python", 209 | "name": "prefer-env-released2" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.7.7" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 4 226 | } 227 | -------------------------------------------------------------------------------- /analysis_notebooks/README.txt: -------------------------------------------------------------------------------- 1 | The analysis_notebooks folder contains three notebooks to compute the performances of the PREFER models for each molecular representation and/or the performances of the best model against a RandomForest baseline. 2 | In order to run the notebook, one should first run a PREFER job (e.g. 
through the Run_PREFER.ipynb notebook) in order to train PREFER models for each molecular representation. After that one can run the notebooks in the analysis_notebooks folder in the following order: 3 | 4 | 1. TestSet_Bootstrapping.ipynb 5 | 2. Plot_performance_distributions_for_representation.ipynb AND/OR Best_PREFER_model_VS_RF.ipynb -------------------------------------------------------------------------------- /api_version.txt: -------------------------------------------------------------------------------- 1 | 0.0.0 -------------------------------------------------------------------------------- /cddd-environment-light.yml: -------------------------------------------------------------------------------- 1 | name: cddd-env-prefer-light 2 | 3 | channels: 4 | - rdkit 5 | - defaults 6 | 7 | dependencies: 8 | - python==3.7.7 9 | - numpy 10 | - rdkit==2020.09.1.0 11 | - scikit-learn==0.24.1 12 | - tensorflow-gpu==1.13.1 13 | - pip: 14 | - dirhash 15 | - pandas 16 | - pyyaml 17 | - matplotlib 18 | - seaborn -------------------------------------------------------------------------------- /cddd-environment.yml: -------------------------------------------------------------------------------- 1 | name: cddd-env-prefer 2 | channels: 3 | - rdkit 4 | - defaults 5 | - plotly 6 | - pytorch 7 | dependencies: 8 | - pip<22 9 | - python==3.7.7 10 | - matplotlib 11 | - numpy 12 | - rdkit==2020.09.1.0 13 | - scikit-learn==0.24.1 14 | - seaborn 15 | - tqdm 16 | # For AML Hyperdrive 17 | - plotly 18 | - plotly-orca 19 | - psutil 20 | - openssl 21 | - h5py==2.10.0 22 | - tensorflow-gpu==1.13.1 23 | - pytorch 24 | - cpuonly 25 | - pip: 26 | - docopt 27 | - dpu-utils>=0.2.13 28 | - zmq 29 | - pytest 30 | - azureml-pipeline 31 | # For AML hyperdrive 32 | - azureml-widgets 33 | - azure-keyvault-secrets 34 | - ipywidgets 35 | - pandas 36 | - cairosvg 37 | - fcd_torch 38 | - pydantic 39 | - pyyaml 40 | - wrapt 41 | - dirhash 42 | - scandir 43 | - semver 44 | - py-repo-root==1.1.1 45 | - 
keras==2.1.2 46 | - fcd 47 | - json2html 48 | - ghostml 49 | - auto-sklearn==0.14.7 50 | - protobuf==3.20.1 51 | - opencensus-ext-azure 52 | - opencensus-ext-requests 53 | 54 | -------------------------------------------------------------------------------- /compute_model_based_representations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
def compute_representations_from_args(
    args,
    path_to_model,
    model_name = None,
):
    """
    Compute the model-based representations of a dataset and save them to disk.

    The dataset is read from ``args.path_to_df`` (a .csv file, or a folder
    whose first entry is the .csv file), prepared with ``prepare_data``,
    embedded with ``ModelRepresentationsBuilder`` and finally saved in
    ``./{model_name}_representations_{experiment_name}``.

    NB: the dataset is read with the default (comma) separator first; a
    semicolon separator is only tried if that first read raises.

    Parameters
    ----------
    args:
        Namespace-like object exposing ``path_to_df``, ``id_column_name``,
        ``smiles_column_name``, ``splitting_strategy``,
        ``temporal_info_column_name``, ``properties_column_name``,
        ``limit_def`` and ``experiment_name``.
    path_to_model:
        Path to the previously downloaded model used to compute the
        representations.
    model_name:
        Name used to label the representation and the output folder
        (e.g. 'CDDD' or 'MOLER'). Defaults to 'MODELBASED' when None.

    Notes
    -----
    The process exits with status 1 (``sys.exit``) if ``prepare_data`` fails.
    """
    # Function-scope import kept from the original, but moved to the top.
    # It used to sit near the end of the function, which made `os` a local
    # name for the whole body: the earlier `os.listdir` call always raised
    # UnboundLocalError, silently swallowed by a broad `except`, so folder
    # inputs were never resolved.
    import os

    path_to_df = args.path_to_df
    id_column_name = args.id_column_name
    smiles_column_name = args.smiles_column_name
    split_type = args.splitting_strategy
    temporal_info_column_name = args.temporal_info_column_name

    supported_models = ['CDDD', 'MOLER']
    if model_name is None:
        model_name = 'MODELBASED'
        print(f'WARNING: PREFER supports only {supported_models}, but other models can be used')
    elif model_name not in supported_models:
        print(f'WARNING: PREFER supports only {supported_models}, but other models can be used')

    # The property names may arrive either as a one-element list holding a
    # JSON-encoded list, or as a plain python list (e.g. parsed from yaml).
    try:
        properties_column_name_list = json.loads(args.properties_column_name[0])
    except Exception:
        # Not JSON: round-trip the plain value so the result is a JSON-compatible copy.
        properties_column_name_list = json.loads(json.dumps(args.properties_column_name))

    # Read your .csv files
    if path_to_df.endswith("/"):  # Normalise away trailing slashes
        path_to_df = path_to_df[:-1]

    if os.path.isdir(path_to_df):
        # A folder was given: use its first entry (sorted, so the choice is deterministic).
        entries = sorted(os.listdir(path_to_df))
        if entries:
            path_to_df = path_to_df + "/" + entries[0]
    else:
        logging.info("Already a file")

    try:
        df = pd.read_csv(path_to_df)
    except Exception:
        df = pd.read_csv(path_to_df, sep=";")

    # in prepare_data now the dataset is both prepared and filtered
    try:
        # Manipulate dataframe such that it is in the right shape for being used as input of the DataStorage class
        # | ID | Smiles | Property_1 | Property_2 | ... | Property_N |
        # -------------------------------------------------------------
        # This is done by specifying the name of the columns where the ID information and the SMILES representation
        # of each sample are stored, and the list of the columns' names of the properties to model.
        df = prepare_data(
            df=df,
            id_column_name=id_column_name,
            smiles_column_name=smiles_column_name,
            properties_column_name_list=properties_column_name_list,
            temporal_info_column_name=temporal_info_column_name,
        )
    except Exception:
        # exc_info keeps the real cause visible in the log before exiting.
        logging.error(
            "ERROR in preparing data. One of id_column_name, smiles_column_name, properties_column_name_list may be wrong.",
            exc_info=True,
        )
        sys.exit(1)

    # For model based representations
    model_based_representations = ModelRepresentationsBuilder(path_to_model = path_to_model, limit_def = args.limit_def)
    model_based = model_based_representations.build_representations(df, split_type = split_type)

    # save representations: define the name of the directory to be created
    experiment_name = args.experiment_name
    path = f"./{model_name}_representations_{experiment_name}"

    try:
        os.mkdir(path)
    except OSError:
        # Best effort: the folder may already exist, saving is still attempted.
        print ("Creation of the directory %s failed" % path)
    else:
        print ("Successfully created the directory %s " % path)

    model_based.representation_name = model_name
    model_based.save(path)

    print(f'{model_name} representation correctly saved in {path}')


if __name__ == "__main__":
    # Script to compute the model_based representations of a set of molecules in a dataframe.
    parser = argparse.ArgumentParser(
        description="Compute model_based-representations",
    )
    parser.add_argument(
        "--prefer_args",
        type=str,
        help="path to the .yaml file where configuration parameters are stored.",
    )

    parser.add_argument(
        "--path_to_model",
        type=str,
        help="path to model_based model that has been previously downloaded",
    )

    parser.add_argument(
        "--model_name",
        type=str,
        help="string of model_name, e.g. CDDD or MOLER",
    )

    args = parser.parse_args()

    # Context manager so the configuration file handle is always closed.
    with open(args.prefer_args) as a_yaml_file:
        parsed_yaml_file = yaml.load(a_yaml_file, Loader=yaml.FullLoader)

    # Mandatory settings: a missing key raises KeyError.
    args.path_to_df = parsed_yaml_file["path_to_df"]
    args.experiment_name = parsed_yaml_file["experiment_name"]
    args.id_column_name = parsed_yaml_file["id_column_name"]
    args.smiles_column_name = parsed_yaml_file["smiles_column_name"]
    args.properties_column_name = parsed_yaml_file["properties_column_name_list"]
    args.problem_type = parsed_yaml_file["problem_type"]
    args.splitting_strategy = parsed_yaml_file["splitting_strategy"]

    # Optional settings: None when absent from the configuration file.
    args.limit_def = parsed_yaml_file.get("limit_def")
    args.temporal_info_column_name = parsed_yaml_file.get("temporal_info_column_name")

    compute_representations_from_args(args, args.path_to_model, args.model_name)
- Property 8 | smiles_column_name: SMILES 9 | splitting_strategy: random 10 | -------------------------------------------------------------------------------- /config_files/config_PREFER_solub.yaml: -------------------------------------------------------------------------------- 1 | path_to_df: 'path_to_solubility.csv' 2 | experiment_name: 'solubility' 3 | id_column_name: 'SID' 4 | smiles_column_name: 'Smiles' 5 | properties_column_name_list: 6 | - 'PUBCHEM_ACTIVITY_OUTCOME' 7 | problem_type: 'classification' 8 | splitting_strategy: 'random' 9 | # temporal_info_column_name: 10 | 11 | -------------------------------------------------------------------------------- /config_files/config_model_based_representations.yaml: -------------------------------------------------------------------------------- 1 | model_based_representations: 2 | 'MOLER': 3 | 'path_to_model': 'path_to/moler' 4 | 'conda_env': 'moler-env-prefer-light' 5 | 'submodule_path': 'path_to/PREFER/prefer/model_based_representations/models/molecule-generation/' 6 | 'CDDD': 7 | 'path_to_model': 'path_to/cddd/default_model' 8 | 'conda_env': 'cddd-env-prefer-light' 9 | 'submodule_path': 'path_to/PREFER/prefer/model_based_representations/models/cddd/' 10 | 11 | 12 | #python path needed to connect together all the relevant folders for the project, in particular, all the submodules related to the models used for models based representations (e.g. 
cddd and moler submodules) and the main PREFER folder 13 | prefer_path: 'path_to/PREFER/' -------------------------------------------------------------------------------- /moler-environment-light.yml: -------------------------------------------------------------------------------- 1 | name: moler-env-prefer-light 2 | channels: 3 | - rdkit 4 | - defaults 5 | - plotly 6 | - pytorch 7 | - conda-forge 8 | dependencies: 9 | - pip<22 10 | - python==3.7.7 11 | - matplotlib 12 | - numpy==1.21.5 13 | - rdkit==2020.09.1.0 14 | - scikit-learn==0.24.1 15 | - seaborn 16 | - tqdm 17 | - tensorflow==2.9.1 18 | - pip: 19 | - pandas>=1.2.4 20 | - tf2-gnn~=2.12.0 21 | - pyyaml 22 | - dirhash 23 | - matplotlib-inline==0.1.3 24 | - ipywidgets 25 | - more-itertools -------------------------------------------------------------------------------- /moler-environment.yml: -------------------------------------------------------------------------------- 1 | name: moler-env-prefer 2 | channels: 3 | - rdkit 4 | - defaults 5 | - plotly 6 | - pytorch 7 | - conda-forge 8 | dependencies: 9 | - pip<22 10 | - python==3.7.7 11 | - matplotlib 12 | - numpy==1.21.5 13 | - rdkit==2020.09.1.0 14 | - scikit-learn==0.24.1 15 | - seaborn 16 | - tqdm 17 | # For AML Hyperdrive 18 | - plotly 19 | - plotly-orca 20 | - psutil 21 | - openssl 22 | - h5py==2.10.0 23 | - tensorflow==2.9.1 24 | - typing-extensions 25 | - pytorch 26 | - cpuonly 27 | - pip: 28 | - azure-identity==1.7.0 29 | - docopt 30 | - dpu-utils>=0.2.13 31 | - zmq 32 | - pytest 33 | - coverage 34 | - azureml-pipeline 35 | - azure-keyvault-secrets 36 | # For AML hyperdrive 37 | - azureml-widgets 38 | - ipywidgets 39 | - pandas>=1.2.4 40 | - cairosvg 41 | - tf2-gnn>=2.12.0 42 | - more-itertools 43 | - fcd_torch 44 | - pydantic 45 | - pyyaml 46 | - dirhash 47 | - scandir 48 | - semver 49 | - py-repo-root==1.1.1 50 | - json2html 51 | - mysql-connector-python==8.0.17 52 | - tokenizers==0.9.2 53 | - transformers==3.4.0 54 | - opencensus-ext-azure 55 | - 
opencensus-ext-requests -------------------------------------------------------------------------------- /prefer-environment.yml: -------------------------------------------------------------------------------- 1 | name: prefer-env 2 | channels: 3 | - rdkit 4 | - defaults 5 | - plotly 6 | dependencies: 7 | - pip<22 8 | - python==3.7.7 9 | - matplotlib 10 | - numpy==1.19.2 11 | - rdkit==2020.09.1.0 12 | - scikit-learn==0.24.1 13 | - seaborn 14 | - plotly 15 | - tensorflow-gpu==2.1.0 16 | - typing-extensions 17 | - pip: 18 | - dpu-utils>=0.2.13 19 | - ipywidgets 20 | - pandas>=1.2.4 21 | - sklearn 22 | - more-itertools 23 | - pyyaml 24 | - py-repo-root>=1.1.1 25 | - tf2-gnn~=2.12.0 26 | - ghostml 27 | - dirhash 28 | - auto-sklearn ==0.14.7 29 | - ipykernel 30 | - nbformat 31 | 32 | -------------------------------------------------------------------------------- /prefer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/prefer/__init__.py -------------------------------------------------------------------------------- /prefer/azure_ml/README.md: -------------------------------------------------------------------------------- 1 | # Working in AzureML 2 | PREFER can also run in AzureML. 
Main scripts regarding AzureML are collected in PREFER/prefer/azure_ml/. 3 | The following steps should be implemented: 4 | 5 | 6 | ## STEP 1: install the full cddd and moler environments 7 | As follows: 8 | 9 | ``` 10 | conda env create -f moler-environment-light.yml 11 | OR 12 | conda env create -f cddd-environment-light.yml 13 | ``` 14 | 15 | ## STEP 2: import datasets and models in your Azure Storage 16 | Import the dataset you want to use as well as the cddd and moler models in your Azure storage 17 | 18 | ## STEP 3: prepare the config file 19 | Prepare a yaml file with the following information: 20 | 21 | ``` 22 | path_to_df: 'path_in_Azure_to_the_saved_df' 23 | experiment_name: 'name of the experiment' 24 | id_column_name: '' 25 | smiles_column_name: '' 26 | properties_column_name_list: 27 | - 'property1_columns_name' # NB: listing more than one property makes it a multitask problem 28 | problem_type: 'regression' # Can be regression or classification 29 | splitting_strategy: 'random' 30 | representations: 31 | 'DESCRIPTORS2D' : '' 32 | 'FINGERPRINTS': '' 33 | 'CDDD': 'path_in_azure_to_stored_CDDD_model' 34 | ``` 35 | 36 | An example is provided in PREFER/prefer/azure_ml/config_logD_azure.yaml 37 | 38 | 39 | 40 | ## STEP 4: run the pipeline 41 | Go to PREFER/prefer/azure_ml and run the following command: 42 | 43 | ``` 44 | python schedule_global_model_pipeline.py --prefer_args config_logD_azure.yaml --prefer_env cddd-environment.yml 45 | ``` 46 | 47 | One can also use moler-environment.yml to run experiments with the moler environment. 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /prefer/azure_ml/aml_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation 2 | # All rights reserved. 
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ENV_PREFIX = "AML_CONFIG_"
# Maps every AmlConfig field name to the environment variable it is read from.
ENV_PATH_NAMES = {
    "subscription_id": f"{ENV_PREFIX}SUBSCRIPTION_ID",
    "resource_group": f"{ENV_PREFIX}RESOURCE_GROUP",
    "workspace_name": f"{ENV_PREFIX}WORKSPACE_NAME",
    "compute_target_name": f"{ENV_PREFIX}COMPUTE_TARGET_NAME",
    "cpu_compute_target_name": f"{ENV_PREFIX}CPU_COMPUTE_TARGET_NAME",
    "datastore_name": f"{ENV_PREFIX}DATASTORE_NAME",
    "result_store_name": f"{ENV_PREFIX}RESULT_STORE_NAME",
    "keyvault_name": f"{ENV_PREFIX}KEYVAULT_NAME",
}


@dataclass
class AmlConfig:
    """String settings describing an AzureML setup.

    Instances can be built from ``AML_CONFIG_*`` environment variables
    (see ``ENV_PATH_NAMES``) or from a JSON file.
    """

    subscription_id: str
    resource_group: str
    workspace_name: str
    compute_target_name: str
    cpu_compute_target_name: str
    datastore_name: str
    result_store_name: str
    keyvault_name: str

    @classmethod
    def can_load_from_environment_variables(cls) -> bool:
        """Return True when every field's environment variable is set."""
        return all(os.getenv(ENV_PATH_NAMES[field.name]) is not None for field in fields(cls))

    @classmethod
    def from_environment_variables(cls) -> "AmlConfig":
        """Instantiate an AmlConfig from environment variables.

        Raises:
            MissingEnvironmentVariable: if at least one of the variables listed
                in ``ENV_PATH_NAMES`` is not set. The message names the missing
                variables.
        """
        # Read the environment once so the check and the constructor see the
        # same snapshot.
        aml_config = {field.name: os.getenv(ENV_PATH_NAMES[field.name]) for field in fields(cls)}
        missing = [ENV_PATH_NAMES[name] for name, value in aml_config.items() if value is None]
        if missing:
            raise MissingEnvironmentVariable(
                "Missing environment variable(s): " + ", ".join(missing)
            )
        return cls(**aml_config)

    @classmethod
    def from_file(cls, filename: str) -> "AmlConfig":
        """Instantiate an AmlConfig from a JSON file.

        Keys that do not correspond to a field of this class are ignored, so a
        configuration file carrying additional entries can still be loaded.

        Raises:
            TypeError: if the file does not provide a value for every field.
        """
        with open(filename, "rt") as fh:
            raw_config = json.load(fh)
        known_fields = {field.name for field in fields(cls)}
        aml_config = {key: value for key, value in raw_config.items() if key in known_fields}
        return cls(**aml_config)
class MissingEnvironmentVariable(Exception):
    """Signal that a required environment variable is not set."""
"prefer/molecule_representations/", 51 | "prefer/model_based_representations/", 52 | "prefer/model_based_representations/models/cddd/", 53 | "prefer/model_based_representations/models/molecule-generation/", 54 | "prefer/scripts/" 55 | ] 56 | } 57 | } -------------------------------------------------------------------------------- /prefer/azure_ml/model_registration_prefer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
logger = logging.getLogger(__name__)
set_telemetry_handlers(logger, "Training")


@function_span_metrics_decorator("training.model_registration_prefer.run_from_args", "Training")
def run_from_args(args):
    """Upload the wrapped PREFER model with its metadata and register it on the AML run.

    Args:
        args: Parsed command-line namespace as built by ``run``. The attributes
            read here are ``MODEL_PREFER_WRAPPED_DIR``, ``local_or_global_model``,
            ``git_status``, ``repo_hash``, ``api_version``, ``timestamp``,
            ``user_name``, ``run_name`` and ``conda_env_name``.

    Raises:
        ValueError: if ``MODEL_PREFER_WRAPPED_DIR`` does not contain exactly one
            ``.pkl`` file.
    """
    aml_run = Run.get_context()

    model_dir = args.MODEL_PREFER_WRAPPED_DIR
    pkls = [name for name in os.listdir(model_dir) if name.endswith(".pkl")]
    # Explicit check instead of `assert`: assertions are stripped under `python -O`,
    # which would let an empty or ambiguous directory slip through.
    if len(pkls) != 1:
        raise ValueError(
            f"Expected exactly one .pkl file in {model_dir!r}, found {len(pkls)}: {pkls}"
        )
    wrapped_model_name = pkls[0]
    wrapped_model_path = os.path.join(model_dir, wrapped_model_name)

    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # script at model directories produced by a trusted pipeline step.
    with open(wrapped_model_path, "rb") as in_file:
        wrapped_model = pickle.load(in_file)
    rep_model_id = wrapped_model.rep_model_id

    problem_type = wrapped_model.problem_type
    local_or_global_model = args.local_or_global_model

    # A "local" model is tagged as best; a "global" one is not.
    is_best = local_or_global_model == "local"
    representation_name = wrapped_model.model_representation
    registration_name = name_property_model_for_registration(
        wrapped_model.friendly_model_name, representation_name
    )

    cloud_target_dir = "packaged_model"
    # (1) Save the wrapped model:
    aml_run.upload_file(
        path_or_stream=wrapped_model_path,
        name=os.path.join(cloud_target_dir, wrapped_model_name),
    )

    # (2) Save metadata
    metadata = {
        "git_status": args.git_status,
        "repo_hash": args.repo_hash,
        "api_version": args.api_version,
    }

    # Written to the current working directory before being uploaded.
    metadata_filename = "metadata.json"

    with open(metadata_filename, "wt") as fh:
        json.dump(metadata, fh)

    aml_run.upload_file(
        path_or_stream=metadata_filename, name=os.path.join(cloud_target_dir, metadata_filename),
    )

    # (3) Register the model with tags and properties.
    tags = {
        "is_best": is_best,
        # All property prediction models are active by default (i.e. not marked for future deletion) because:
        # - only the best local model is registered, therefore it makes sense that it will not be deleted by default.
        # - all global models are registered, and the data scientist could potentially pick any of them for production.
        "is_active": True,
    }

    # TODO why do we need to store the desirability curve with the property model?

    properties = {
        "generative_or_property_model": "property_model",
        "property_model_friendly_name": wrapped_model.friendly_model_name,
        "problem_type": problem_type,
        "representation_name": representation_name,
        "generative_model_id": rep_model_id,
        "local_or_global_model": args.local_or_global_model,
        "timestamp": args.timestamp,
        "user_name": args.user_name,
        "run_name": args.run_name,
        "git_status": args.git_status,
        "repo_hash": args.repo_hash,
        "api_version": args.api_version,
        "project_code": wrapped_model.project_code,
        "desirabilities": wrapped_model.desirability_scores,
        "env": args.conda_env_name,
    }

    aml_run.register_model(
        model_name=registration_name, model_path=cloud_target_dir, tags=tags, properties=properties,
    )

    add_tags_to_aml_run(aml_run, tags)
    aml_run.add_properties(properties)


def run():
    """Parse the command line and hand the resulting namespace to ``run_from_args``."""
    import argparse

    parser = argparse.ArgumentParser(description="Model registration script (PREFER).")
    parser.add_argument(
        "MODEL_PREFER_WRAPPED_DIR",
        type=str,
        help="Directory with the pickled PREFER model with metadata.",
    )
    parser.add_argument(
        "--timestamp",
        type=str,
        default=time.strftime("%Y-%m-%d_%H-%M-%S"),
        help="Timestamp of the run (used for model tagging).",
    )
    parser.add_argument(
        "--user-name",
        type=str,
        default="unknown_user",
        help="Name of the scheduling user (used for model tagging).",
    )
    parser.add_argument(
        "--run-name",
        type=str,
        default="unknown_run_name",
        help="Name of the pipeline run (used for model tagging).",
    )
    parser.add_argument(
        "--git_status",
        type=str,
        required=True,
        help="Git status (branch, commit, etc) at the time of training.",
    )
    parser.add_argument(
        "--repo_hash",
        type=str,
        required=True,
        help="Hash of the repo root at the time of training.",
    )
    parser.add_argument(
        "--api_version", type=str, required=True, help="API version supported by the model.",
    )
    parser.add_argument(
        "--local_or_global_model",
        type=str,
        choices=["local", "global"],
        help="local/global",
        required=True,
    )
    parser.add_argument(
        "--conda_env_name",
        type=str,
        default="moler-environment",
        help="Name of the conda environment used to build the model.",
    )
    args = parser.parse_args()

    run_from_args(args)


if __name__ == "__main__":
    run()
def name_property_model_for_registration(experiment_name: str, representation_name: str) -> str:
    """Build the registration name ``<experiment_name>_<representation_name>``."""
    return "_".join((experiment_name, representation_name))


def add_tags_to_aml_run(aml_run: Run, tags: Dict[str, Any]) -> None:
    """Apply every tag to the AML run and, when it has one, to its parent run."""
    tag_items = list(tags.items())

    for tag_name, tag_value in tag_items:
        aml_run.tag(tag_name, tag_value)

    # Pipeline step runs are expected to have a parent; stand-alone runs stop here.
    if aml_run.parent is None:
        return
    for tag_name, tag_value in tag_items:
        aml_run.parent.tag(tag_name, tag_value)


def _shared_registration_arguments(user_name: str, run_name: Optional[str]) -> List[Any]:
    """Collect the command-line flags common to both registration steps.

    Falls back to ``<timestamp>-<user_name>`` when no run name is given.
    """
    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
    run_name = run_name or f"{timestamp}-{user_name}"

    git_status = get_git_status()
    repo_hash = _dir_hash(str(project_root(Path(__file__))))
    api_version = get_current_api_version()

    return [
        "--timestamp",
        timestamp,
        "--user-name",
        user_name,
        "--run-name",
        run_name,
        "--git_status",
        git_status,
        "--repo_hash",
        repo_hash,
        "--api_version",
        api_version,
    ]


def create_registration_step(
    aml_config: AmlConfig,
    workspace: Workspace,
    model_name: str,
    user_name: str,
    run_name: Optional[str],
    model_training_output: PipelineData,
    eval_outputs: List[PipelineData],
    run_config: RunConfiguration,
    tmpdir_to_use: str,
) -> PipelineStep:
    """Create the "Model Registration" step on the CPU compute target.

    The step receives the training output and all evaluation outputs as inputs.
    """
    # NOTE(review): model_registration_prefer.py appears to take one positional
    # argument and to require --local_or_global_model, while this step passes
    # extra positionals and omits that flag - confirm whether it is still used.
    step_arguments = (
        [model_name]
        + _shared_registration_arguments(user_name, run_name)
        + [model_training_output]
        + eval_outputs
    )
    step_inputs = [model_training_output] + eval_outputs

    return create_step(
        tmpdir_to_use=tmpdir_to_use,
        name="Model Registration",
        script_name="prefer/azure_ml/model_registration_prefer.py",
        arguments=step_arguments,
        inputs=step_inputs,
        outputs=[],
        compute_target=ComputeTarget(workspace, aml_config.cpu_compute_target_name),
        runconfig=run_config,
    )


def create_registration_prefer_step(
    aml_config: AmlConfig,
    workspace: Workspace,
    user_name: str,
    run_name: Optional[str],
    model_prefer_wrapped: PipelineData,
    run_config: RunConfiguration,
    tmpdir_to_use: str,
    local_or_global_model: str,
    conda_env_name: str,
) -> PipelineStep:
    """Create the "Model Registration" step for a wrapped PREFER model.

    The step runs on ``aml_config.compute_target_name`` and forwards the
    local/global flag and the conda environment name to the script.
    """
    step_arguments = (
        [model_prefer_wrapped]
        + _shared_registration_arguments(user_name, run_name)
        + [
            "--local_or_global_model",
            local_or_global_model,
            "--conda_env_name",
            conda_env_name,
        ]
    )

    return create_step(
        tmpdir_to_use=tmpdir_to_use,
        name="Model Registration",
        script_name="prefer/azure_ml/model_registration_prefer.py",
        arguments=step_arguments,
        inputs=[model_prefer_wrapped],
        outputs=[],
        compute_target=ComputeTarget(workspace, aml_config.compute_target_name),
        runconfig=run_config,
    )


def _dir_hash(folder_name: str, **kwargs) -> str:
    """
    Calculate the SHA256 hash of an entire folder tree, recursively.
    Note: The multi-threaded version seems to be throwing an error currently, but the single threaded version
    is fast enough given how infrequently we expect this to be used
    """
    algorithm = "sha256"
    return dirhash(folder_name, algorithm, **kwargs)
logger = logging.getLogger(__name__)

GenChemVersion = VersionInfo


def get_current_api_version() -> GenChemVersion:
    """Returns the current version of the APIs, read from ``api_version.txt`` at the project root."""
    version_file = project_root(Path(__file__)).joinpath("api_version.txt")
    with open(str(version_file), "rt") as f:
        return GenChemVersion.parse(f.read().rstrip())


def timeit(fn):
    """
    Decorator that logs how long each call to ``fn`` takes.

    *args and **kwargs are to support positional and named arguments of fn
    Use this as a decorator for the function you wish to time
    @timeit
    def my_func(args):
        ....
        return

    This produces output of the form "Time taken in my_func: 1.11111111s". The time is reported in seconds.
    """
    # Local import: keeps this block self-contained without touching the module imports.
    from functools import wraps

    @wraps(fn)  # preserve fn.__name__ / __doc__ on the wrapper
    def get_time(*args, **kwargs):
        start = time()
        output = fn(*args, **kwargs)
        logger.info(f"Time taken in {fn.__name__}: {time() - start:.7f}s")
        return output  # make sure that the decorator returns the output of fn

    return get_time


def file_hash(filename: str) -> str:
    """
    Calculate the SHA256 hash of a file and return it as a hex string.
    """
    sha256_hash = sha256()
    with open(filename, "rb") as f:
        # Read and update hash string value in blocks of 16K
        for byte_block in iter(lambda: f.read(16384), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()


def get_git_short_hash() -> str:
    """Return the abbreviated hash of HEAD as reported by ``git rev-parse --short``."""
    return subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("ASCII").strip()


def get_git_status() -> str:
    """
    Queries the git tree for current git revision hash, status of local changes, status of untracked files
    Output is of form:
        FULL_GIT_HASH@BRANCH_NAME+local_changes+untracked_files
    The git hash is always returned; the branch name is added when HEAD points to a branch.
    '+local_changes' tag is added if any local changes to tracked files are found
    '+untracked_files' tag is added if any untracked files are found
    Returns:
        (str) String describing the status of the git tree, or "UNKNOWN GIT REVISION"
        when git is unavailable or a git command fails.
    """

    def run_command(command):
        # Decode leniently: branch names may contain non-ASCII characters and
        # a decoding error must not abort the status query.
        return (
            subprocess.check_output(command, cwd=project_root(Path(__file__)))
            .decode("utf-8", errors="replace")
            .strip()
        )

    try:
        # pylint: disable=unexpected-keyword-arg  # For some reason, pylint doesn't like "cwd"
        head_ref_names = run_command(["git", "log", "--format=%D", "-1"])
        # head_ref_names returns output of the form
        #   HEAD -> user_branch_name, origin/master, origin/HEAD, master
        # Parse this to recover the branch name if possible, else leave empty
        if " -> " in head_ref_names:
            branch = "@" + head_ref_names.split(" -> ")[1].split(",")[0]
        else:
            branch = ""

        # Recover the change hash, keep the full version
        change_hash: str = run_command(["git", "rev-parse", "HEAD"])

        # Identify if there are any uncommitted local changes
        local_changes: str = run_command(["git", "diff-index", "HEAD", "--"])
        change_status = "" if local_changes == "" else "+local_changes"

        # Identify untracked files only: `git status --short` also lists modified
        # tracked files, so keep just the entries marked "??".
        short_status: str = run_command(["git", "status", "--short"])
        has_untracked = any(line.startswith("??") for line in short_status.splitlines())
        untracked_status = "+untracked_files" if has_untracked else ""

        # This only works when the remote is called origin, but we can't guarantee that
        # Find out if there is a way to query the name of the current remote tree
        # unpushed_changes: str = run_command(["git", "log", "origin.."])
        # unpushed_status = "" if unpushed_changes == "" else "+unpushed_changes"

        return "{}{}{}{}".format(change_hash, branch, change_status, untracked_status)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return "UNKNOWN GIT REVISION"
-------------------------------------------------------------------------------- /prefer/azure_ml/telemetry_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def _callback_add_cloudrole(envelope):
    """Telemetry processor that tags the envelope with the "Azure ML" cloud role."""
    envelope.tags["ai.cloud.role"] = "Azure ML"
    return True


class AppFilter(logging.Filter):
    """Logging filter that attaches run-context custom dimensions to every record."""

    def __init__(self, run_type: str):
        # Initialise the base Filter so its own attributes (name, nlen) exist.
        super().__init__()
        self.run_type = run_type

    def filter(self, record):
        """Set `record.custom_dimensions` and always let the record through."""
        custom_dimensions = {}
        _set_run_context(custom_dimensions, self.run_type)
        record.custom_dimensions = custom_dimensions
        return True


def set_telemetry_handlers(logger, run_type: str):
    """
    Configure `logger` for INFO-level output to the console and, when possible, Azure.

    A `StreamHandler` is always added. Setting up the `AzureLogHandler` and the
    `requests` trace integration is best effort: any failure leaves the logger
    with console output only.

    Args:
        logger: the logger to configure (modified in place).
        run_type: label stored under "run_type" in the custom dimensions.
    """
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.StreamHandler())
    try:
        handler = AzureLogHandler()
        handler.add_telemetry_processor(_callback_add_cloudrole)
        handler.addFilter(AppFilter(run_type))
        logger.addHandler(handler)

        # configure tracing for app insights integrations
        config_integration.trace_integrations(["requests"])
        _get_tracer(run_type)
    except Exception:
        # application insights connection string is not set up in environment.
        # Probably this doesn't run in AML, skipping setting up Application Insights for local run.
        # Kept best effort, but leave a debug trace instead of swallowing the cause silently.
        logging.getLogger(__name__).debug(
            "Application Insights telemetry was not configured", exc_info=True
        )


def _set_run_context(dictionary: dict, run_type: str):
    """
    Add identifiers of the current AzureML run to `dictionary` (modified in place).

    Nothing is added when `Run.get_context` raises `RunEnvironmentException`,
    i.e. when the code is not executing inside an AzureML run.
    """
    try:
        run = Run.get_context(allow_offline=False)
        # NOTE(review): Run.parent is presumably None for a run submitted without a
        # parent - confirm against the azureml-core docs. This function runs for
        # every log record, so an unguarded attribute access would raise
        # AttributeError out of each logging call.
        parent = run.parent
        if parent is not None:
            dictionary["parent_run_id"] = parent.id
        dictionary["step_id"] = run.id
        dictionary["step_name"] = run.name
        dictionary["experiment_name"] = run.experiment.name
        if parent is not None:
            dictionary["run_url"] = parent.get_portal_url()
        dictionary["run_type"] = run_type
    except RunEnvironmentException:
        # Not an AzureML run
        pass


def _callback_add_context(run_type: str, envelope):
    """Telemetry processor that copies the run context into the envelope properties."""
    _set_run_context(envelope.data.baseData.properties, run_type)


def _get_tracer(run_type: str):
    """
    Build a `Tracer` sampling every span (rate 1.0).

    The tracer exports through an `AzureExporter` when one can be created;
    otherwise a tracer without an explicit exporter is returned.
    """
    try:
        app_insights_exporter = AzureExporter()
        app_insights_exporter.add_telemetry_processor(_callback_add_cloudrole)
        app_insights_exporter.add_telemetry_processor(
            lambda envelope: _callback_add_context(run_type, envelope)
        )
        return Tracer(exporter=app_insights_exporter, sampler=ProbabilitySampler(rate=1.0))
    except Exception:
        # application insights connection string is not set up in environment.
        # Probably this doesn't run in AML, returning tracer which reports locally.
        return Tracer(sampler=ProbabilitySampler(rate=1.0))


def function_span_metrics_decorator(span_name: str, run_type: Optional[str]):
    """
    Return a decorator that runs the wrapped function inside a tracer span.

    Args:
        span_name: name given to the span opened around each call.
        run_type: forwarded to `_get_tracer`; a new tracer is built on every call.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            tracer = _get_tracer(run_type)
            with tracer.span(name=span_name):
                return func(*args, **kwargs)

        return wrapper

    return decorator
| -------------------------------------------------------------------------------- /prefer/model_based_representations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/prefer/model_based_representations/__init__.py -------------------------------------------------------------------------------- /prefer/model_based_representations/cddd_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
class CDDDGeneratorModel(LatentSpaceMoleculeGenerator):
    """Latent-space model wrapper backed by a CDDD `InferenceModel`."""

    def __init__(self, dir: str, seed: int = 0, num_workers: int = 6, **kwargs: Any):
        """
        Seed the random generators and load the inference model stored in `dir`.

        Keyword arguments not consumed here are forwarded to the parent initialiser.
        """
        super().__init__(dir, **kwargs)

        # The worker count must be in place before the inference model is built,
        # because it is read there as the CPU thread count.
        self.num_workers = num_workers
        set_random_seed(seed)
        self._set_inference_model(dir)

        self._dir = dir
        self._can_decode_from_scaffold = False
        # The stats live in the model directory by default; set_extra_args can
        # point them somewhere else (e.g. for the sampling or distribution
        # matching benchmarks).
        self.latent_space_stats_file_dir = dir

    def _set_inference_model(self, dir):
        """Create the CDDD inference model and record its embedding size."""
        # Imported lazily so this module can be imported without cddd installed.
        from cddd.inference import InferenceModel

        settings = {
            "model_dir": dir,
            "use_gpu": True,
            "cpu_threads": self.num_workers,
            "gpu_mem_frac": 0.75,
            "batch_size": 4096,
        }
        self._inference_model = InferenceModel(**settings)
        self._latent_size = self._inference_model.hparams.emb_size

    def encode(self, smiles_list: List[str]) -> List[np.ndarray]:
        """See parent class."""
        embeddings = self._inference_model.seq_to_emb(smiles_list)
        return list(embeddings)

    def get_name(self) -> str:
        """Return the fixed model-type name."""
        return "CDDD"

    @classmethod
    def is_valid_dir(cls, model_dir: str) -> bool:
        """
        Tell whether `model_dir` holds a CDDD model.

        True only when `hparams.json` exists there and names the model
        "NoisyGRUSeq2SeqWithFeatures"; any error while checking yields False.
        """
        hparams_path = os.path.join(model_dir, "hparams.json")
        expected_model = "NoisyGRUSeq2SeqWithFeatures"
        try:
            if os.path.exists(hparams_path):
                with open(hparams_path, "rt") as fh:
                    # The file stores a JSON document that is itself a quoted JSON
                    # string, hence the two decoding passes.
                    raw_text = json.load(fh)
                hparams = json.loads(raw_text)
                return hparams["model"] == expected_model
            return False
        except Exception:  # Parse errors, key error, etc.
            return False

    def set_extra_args(self, **kwargs):
        """Override `num_workers` / `latent_space_stats_file_dir` when given and not None."""
        for option in ("num_workers", "latent_space_stats_file_dir"):
            value = kwargs.get(option)
            if value is not None:
                setattr(self, option, value)
Pathlike = Union[str, Path]


class MoleculeSampler(ABC):
    """
    A molecule generator that can sample random molecules.
    """

    def sample(self, num_samples: int) -> List[str]:
        """
        Sample SMILES strings using the wrapped model.

        Args:
            num_samples: Number of results to return.

        Returns:
            List of SMILES strings.
        """
        # The below is a default implementation, can be overwritten for specific models.
        # It relies on the concrete class providing `sample_with_emb`, which is not
        # declared in this module; each item it yields is unpacked as a
        # (smiles, embedding) pair and only the SMILES is kept.
        return [smiles for smiles, _ in self.sample_with_emb(num_samples)]


class AbstractModelRepresentation(ContextManager):
    """
    Base class for all molecule encoders, decoders, and samplers, providing
    - Default implementations for ContextManager
    - A model_id based on hash of files in the directory where the model is saved
    - Model name identifying the type of model
    """

    def __init__(self, dir: Pathlike, model_id_file_patterns: Iterable[str] = ("*",), **kwargs):
        """
        Compute the model id from the files in `dir`.

        Args:
            dir: directory where the model is saved.
            model_id_file_patterns: patterns passed to `dirhash` as `match` to
                select the files that contribute to the hash.
            **kwargs: arguments no model understood; reported and ignored.
        """
        # As `dir_hash` takes a `str`, we need to explicitly cast it
        self._model_id = dirhash(str(dir), "sha256", match=model_id_file_patterns)

        # Any arguments that make their way into here were passed into a model, but not understood
        # by it. We intentionally allow this, since it allows the user to provide preferred choices
        # for arguments without checking if a given model supports them. However, we print a warning
        # to make this more explicit.
        if kwargs:
            print("The following arguments were provided and ignored:", list(kwargs.keys()))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returning None never suppresses an exception raised inside the `with` body.
        return None

    def get_model_id(self) -> str:
        """

        Returns: the model id as a string

        """
        return self._model_id

    @abstractmethod
    def set_extra_args(self, **kwargs) -> None:
        """Apply optional, model-specific settings passed as keyword arguments."""
        pass

    @classmethod
    @abstractmethod
    def is_valid_dir(cls, model_dir: Pathlike) -> bool:
        """Tell whether `model_dir` contains a model of this type."""
        pass

    @abstractmethod
    def get_name(self) -> str:
        """

        Returns: a human-readable string to describe the model type (e.g. 'MoLeR').

        """
        raise NotImplementedError


class LatentSpaceMoleculeGenerator(AbstractModelRepresentation, MoleculeSampler):
    """
    Autoencoder / Latent Space based Generative Model
    """

    @abstractmethod
    def encode(self, smiles_list: List[str]) -> List[np.ndarray]:
        """
        Map input molecules to points in vector space.
        Args:
            smiles_list: List of molecules as SMILES

        Returns: one latent-space vector per input molecule

        """
        raise NotImplementedError
# Pick the wrapper that matches the installed TensorFlow generation. The major
# version is compared as an integer: comparing the version strings directly is
# lexicographic and would, for example, rank "10.0.0" below "2.0.0".
if int(tf.__version__.split(".", 1)[0]) >= 2:
    # MoLeR environment
    from prefer.model_based_representations.moler_wrapper import MoLeRGeneratorModel

    latent_space_models = [MoLeRGeneratorModel]
else:
    # CDDD environment
    from prefer.model_based_representations.cddd_wrapper import CDDDGeneratorModel

    latent_space_models = [CDDDGeneratorModel]

# Add here new model based molecular representation


def load_latent_model_from_directory(model_dir: str, **kwargs: Any) -> LatentSpaceMoleculeGenerator:
    """
    Load a latent-space model from `model_dir`.

    Only the model types in `latent_space_models` are considered (no extra types
    are passed on). `kwargs` are forwarded to the model's `__init__` method.
    """
    model: LatentSpaceMoleculeGenerator = load_model_from_directory(model_dir, [], **kwargs)
    return model


def load_model_from_directory(
    model_dir: str,
    extra_model_types: Optional[List[Type[AbstractModelRepresentation]]] = None,
    **kwargs: Any,
) -> AbstractModelRepresentation:
    """Loads a model from the given directory.

    Note:
        This method will figure out the exact type of model from the data: the
        first candidate class whose `is_valid_dir` accepts the directory is used.
        `kwargs` are passed to the model's `__init__` method.

    Args:
        model_dir: directory containing the saved model.
        extra_model_types: additional model classes tried after `latent_space_models`.

    Returns:
        An object implementing the AbstractModelRepresentation interface.

    Raises:
        ValueError: if `model_dir` is not a directory, or no candidate class
            recognises its content.
    """
    if extra_model_types is None:
        extra_model_types = []
    all_models = latent_space_models + extra_model_types
    if not os.path.isdir(model_dir):
        raise ValueError(f"{model_dir} is not a directory!")

    for cls in all_models:
        if cls.is_valid_dir(model_dir):
            return cls(model_dir, **kwargs)
    raise ValueError(f"{model_dir} does not contain any of the recognised model types.")
nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
Pathlike = Union[str, pathlib.Path]


class MoLeRGeneratorModel(VaeWrapper, LatentSpaceMoleculeGenerator):
    """MoLeR model wrapper combining `VaeWrapper` with the latent-space interface."""

    def __init__(
        self, dir: Pathlike, seed: int = 0, num_workers: int = 6, beam_size: int = 1, **kwargs: Any
    ):
        """
        Initialise both base classes explicitly, since they take different arguments.

        The model id is hashed only from files matching "*_best.pkl" / "*_best.hdf5".
        Remaining `kwargs` go to `LatentSpaceMoleculeGenerator.__init__`.
        """
        VaeWrapper.__init__(self, dir, seed=seed, num_workers=num_workers, beam_size=beam_size)
        LatentSpaceMoleculeGenerator.__init__(
            self, dir, model_id_file_patterns=("*_best.pkl", "*_best.hdf5"), **kwargs
        )

        self._can_decode_from_scaffold = True

    def get_name(self) -> str:
        """Return the fixed model-type name."""
        return "MoLeR"

    @classmethod
    def is_valid_dir(cls, model_dir: str) -> bool:
        """
        Tell whether `model_dir` holds a MoLeR model.

        True when any file name in the directory contains "_MoLeR__" or
        "_MotifMoLeR__". `os.listdir` errors (e.g. missing directory) propagate.
        """
        files_in_dir = os.listdir(model_dir)
        return any(
            "_MoLeR__" in filename or "_MotifMoLeR__" in filename for filename in files_in_dir
        )

    def set_extra_args(self, **kwargs):
        """Override `num_workers` / `beam_size` when given and not None."""
        workers = kwargs.get("num_workers")
        if workers is not None:
            self.num_workers = workers
        beam = kwargs.get("beam_size")
        if beam is not None:
            self.beam_size = beam
/prefer/molecule_representations/descriptors2D_representations_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class Descriptors2DRepresentationsBuilder(MoleculeRepresentationsBuilder):
    """Builder for the "DESCRIPTORS2D" vector representation of molecules."""

    #TODO Set scale_type to None
    def __init__(
        self, limit_def: int = None, scale_type: str = "standardization"
    ):
        self.scale_type = scale_type  # e.g. standardization
        self.limit_def = limit_def

    def build_representations(
        self, molecule_data_orig: DataFrame, split_type: str = "random", seed=1,
    ) -> MoleculeRepresentations:
        """
        Compute 2D descriptors for every molecule and wrap them in a representation object.

        Input:
            - molecule_data_orig: this is a dataframe of the shape
                | ID | Smiles | Property_1 | Property_2 | ... | Property_N |
                ------------------------------------------------------------
              It is copied, so the caller's dataframe is left untouched.
            - split_type: string related to the type of test/train split one want to apply. Possible split_type are random, temporal and cluster. One can add new splitting strategies in utils.splitting_strategies
            - seed: forwarded unchanged to VectorMoleculeRepresentations
        Output:
            - MoleculeRepresentations object
        """

        print(f'Building Descriptors 2D. Warning: current scale_type is set to {self.scale_type}')
        working_df = molecule_data_orig.copy()
        logging.info("Generate 2D Descriptors")
        mols = generate_molecule(working_df)
        # One descriptor entry per molecule; rows flagged as nan are dropped right after.
        working_df["molecule_representation"] = generate2DDesc(mols)
        working_df = self.remove_nan(working_df)

        representation_settings = {
            "df": working_df,
            "representation_name": "DESCRIPTORS2D",
            "split_type": split_type,
            "scale_type": self.scale_type,
            "seed": seed,
            "limit_def": self.limit_def,
        }
        return VectorMoleculeRepresentations(**representation_settings)

    def remove_nan(self, molecule_data: DataFrame):
        """
        Drop the rows whose representation is flagged as nan by `check_if_nan`.

        input: molecule_data is the dataframe holding a "molecule_representation" column
        output: the dataframe as received when nothing is flagged, otherwise a
                dataframe without the flagged rows and with a fresh index
        """
        nan_rows = check_if_nan(molecule_data["molecule_representation"])
        if not nan_rows:
            return molecule_data

        logging.warning(
            f"Found nan in the representation:2D Descriptors"
            f". The following sample/s should be removed from the dataframe:{nan_rows!s}"
        )
        kept = molecule_data.drop(molecule_data.index[nan_rows])
        # Reset indices
        return kept.reset_index(drop=True)
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class FingerprintsRepresentationsBuilder(MoleculeRepresentationsBuilder):
    """Builder for the "FINGERPRINTS" vector representation of molecules."""

    def __init__(
        self, limit_def: int = None,
    ):
        self.limit_def = limit_def

    def build_representations(
        self, molecule_data_orig: DataFrame, split_type: str = "random", seed=1,
    ) -> MoleculeRepresentations:
        """
        Compute fingerprints for every molecule and wrap them in a representation object.

        Input:
            - molecule_data_orig: this is a dataframe of the shape
                | ID | Smiles | Property_1 | Property_2 | ... | Property_N |
                ------------------------------------------------------------
              It is copied, so the caller's dataframe is left untouched.
            - split_type: string related to the type of test/train split one want to apply. Possible split_type are random, temporal and cluster. One can add new splitting strategies in utils.splitting_strategies
            - seed: forwarded unchanged to VectorMoleculeRepresentations
        Output:
            - MoleculeRepresentations object
        """

        working_df = molecule_data_orig.copy()
        logging.info("Generate Morgan Fingerprints")
        mols = generate_molecule(working_df)
        # One fingerprint entry per molecule; rows flagged as nan are dropped right after.
        working_df["molecule_representation"] = generate_fingerprints(mols)
        working_df = self.remove_nan(working_df)

        representation_settings = {
            "df": working_df,
            "representation_name": "FINGERPRINTS",
            "split_type": split_type,
            "seed": seed,
            "limit_def": self.limit_def,
        }
        return VectorMoleculeRepresentations(**representation_settings)

    def remove_nan(self, molecule_data: DataFrame):
        """
        Drop the rows whose representation is flagged as nan by `check_if_nan`.

        input: molecule_data is the dataframe holding a "molecule_representation" column
        output: the dataframe as received when nothing is flagged, otherwise a
                dataframe without the flagged rows and with a fresh index
        """
        nan_rows = check_if_nan(molecule_data["molecule_representation"])
        if not nan_rows:
            return molecule_data

        logging.warning(
            f"Found nan in the representation:fingerprints"
            f". The following sample/s should be removed from the dataframe:{nan_rows!s}"
        )
        kept = molecule_data.drop(molecule_data.index[nan_rows])
        # Reset indices
        return kept.reset_index(drop=True)
10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class ModelRepresentationsBuilder(MoleculeRepresentationsBuilder):
    """Builder that turns SMILES into embeddings with a model loaded from a directory."""

    def __init__(
        self, path_to_model: str = None, representation_name: str = None, limit_def: int = None,
    ):
        self.path_to_model = path_to_model
        # The model is instantiated once here and reused by build_representations.
        self.model_instance = load_model_from_directory(path_to_model)
        self.representation_name = representation_name
        self.limit_def = limit_def

    def build_representations(
        self,
        molecule_data_orig: DataFrame,
        embedding_types: str = "vector",
        split_type: str = "random",
        padding_size: int = 100,
        seed=1,
    ) -> MoleculeRepresentations:
        """
        Generic generator model to convert SMILES to embeddings.

        Input:
            - molecule_data_orig: dataframe of the shape
                | ID | Smiles | Property_1 | Property_2 | ... | Property_N |
                ------------------------------------------------------------
              It is copied first, the caller's dataframe is not modified.
            - embedding_types: only "vector" is accepted.
            - split_type: name of the train/test splitting strategy, forwarded unchanged
              (random, temporal and cluster are the documented values; new strategies
              can be added in utils.splitting_strategies).
            - padding_size: accepted for interface compatibility; it is not used by
              the "vector" embedding implemented here.
            - seed: forwarded unchanged to the returned object.

        Output:
            - VectorMoleculeRepresentations object

        Raises:
            - ValueError if embedding_types is not "vector", or if anything fails
              while encoding (the original exception is chained as the cause).
        """
        if self.representation_name is None:
            self.representation_name = "model_based_representation"

        # Validated once up-front, so no further embedding_types checks are needed below.
        if embedding_types not in ["vector"]:
            raise ValueError("ERROR: embedding_types not known, only vector is possible.")

        molecule_data = molecule_data_orig.copy()
        logging.info("Generate Model based Representation")

        try:
            with self.model_instance as model:
                smiles_embedding = model.encode(molecule_data.Smiles.to_list())
                version_model_ID = model.get_model_id()
                # Materialised while the model context is still open, in case the
                # encoder returns a lazy iterable.
                list_of_smiles_embedding = list(smiles_embedding)
        except Exception as e:
            raise ValueError(
                f"ERROR: the model directory for the model based representation might be incorrect or another error occurred: ValueError exception thrown{e}"
            ) from e

        molecule_data["molecule_representation"] = list_of_smiles_embedding
        molecule_data = self.remove_nan(molecule_data)
        return VectorMoleculeRepresentations(
            df=molecule_data,
            representation_name=self.representation_name,
            split_type=split_type,
            seed=seed,
            model_id=version_model_ID,
            limit_def=self.limit_def,
        )

    def remove_nan(self, molecule_data: DataFrame):
        """
        Drop the rows that check_if_nan reports for the "molecule_representation" column.

        input: molecule_data is the dataframe whose representation column is checked
        output: the same dataframe when nothing is reported, otherwise a dataframe
                without the reported rows and with a fresh 0..n-1 index
        """
        nan_rows = check_if_nan(molecule_data["molecule_representation"])
        if nan_rows:
            logging.warning(
                "Found nan in the representation"
                + self.representation_name
                + ". The following sample/s should be removed from the dataframe:"
                + str(nan_rows)
            )
            # nan_rows are used as positions into the index, not as labels.
            molecule_data = molecule_data.drop(molecule_data.index[nan_rows])
            # Reset indices
            molecule_data = molecule_data.reset_index(drop=True)
        return molecule_data
class PreferConfig(pydantic.BaseModel):
    """Fields shared by every PREFER config YAML file."""

    class Config:
        # Keys that are not declared as fields make validation fail.
        extra = Extra.forbid

    problem_type: str
    experiment_name: str
    smiles_column_name: str
    id_column_name: str
    # Explicit None default: same behaviour as the implicit one of pydantic v1,
    # and the field stays optional under newer pydantic versions as well.
    desirability_scores: Optional[Dict[str, List[Dict[str, float]]]] = None
    splitting_strategy: str = "random"

    @classmethod
    def from_yaml_file(cls, path: str) -> "PreferConfig":
        """
        Build a config object from the YAML file stored at ``path``.

        Validation errors raised by ``parse_obj`` and I/O or YAML errors are
        propagated to the caller unchanged.
        """
        with open(path, encoding="utf-8") as f:
            # NOTE(review): FullLoader is kept for compatibility. If config files can
            # come from untrusted sources, consider switching to yaml.safe_load.
            parsed_yaml = yaml.load(f, Loader=yaml.FullLoader)
        return cls.parse_obj(parsed_yaml)


class LocalConfig(PreferConfig):
    """Config for training a local (project-specific) property model"""

    assay_name: str
    project_code: str
    properties_column_name: str
    # TODO why do we have different field names 'datapath' and 'path_to_df' for local and global models?
    datapath: str


class GlobalConfig(PreferConfig):
    """Config for training a global (not project-specific) property model"""

    properties_column_name_list: List[str]
    path_to_df: str
    representations: Dict[str, str]
    # Explicit None default, see PreferConfig.desirability_scores.
    temporal_info_column_name: Optional[str] = None
def run_aml_context():
    """
    Return the object given by ``Run.get_context(allow_offline=False)``.

    The azureml import happens lazily inside the function. When the import or
    the call raises, the exception is printed and ``None`` is returned instead.
    """
    try:
        from azureml.core.run import Run

        return Run.get_context(allow_offline=False)
    except Exception as err:
        print(err)
    return None
nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def extract_folders(original_flder):
    """
    Return the path of the first usable sub-directory of ``original_flder``.

    Entries are visited in sorted order so the result does not depend on the
    order in which the filesystem lists them. Entries whose own name starts
    with "." or "_" and entries that are not directories are skipped.

    Raises:
        IndexError: when no usable sub-directory exists.
    """
    candidates = []
    for entry in sorted(os.listdir(original_flder)):
        # The filter has to look at the entry name: the first character of the
        # joined path only reflects how original_flder itself was written.
        if entry.startswith((".", "_")):
            continue
        full_path = os.path.join(original_flder, entry)
        if os.path.isdir(full_path):
            candidates.append(full_path)
    if not candidates:
        raise IndexError(f"No usable sub-directory found in: {original_flder}")
    return candidates[0]


def combine_results_from_args(
    store_result_folder,
    problem_type,
    benchs_folder1,
    benchs_folder2,
    benchs_folder3,
    benchs_folder4,
    experiment_name=None,
    save_json=True,
):
    """
    Combine the results stored as separate bench objects in different folders,
    computed from different PREFER runs, into shared comparison tables.

    Input:
        - store_result_folder: folder where the combined tables are written
          (created when missing).
        - problem_type: forwarded to every Benchmarking object.
        - benchs_folder1..4: folders to combine; empty/None entries are skipped.
          When the last path component contains a ".", the path is treated as a
          file and its directory is used. The bench is then loaded from the first
          usable sub-directory (see extract_folders).
        - experiment_name: overrides the experiment name stored in each bench.
        - save_json: when True, PREFER_comparison_table.json is written as well.

    Raises:
        - ValueError when a bench cannot be loaded or summarised (the original
          exception is chained as the cause).
    """
    collect_df = {}
    results = []
    # create final folder
    os.makedirs(store_result_folder, exist_ok=True)

    if store_result_folder.endswith("/"):  # Normalise away trailing slashes
        store_result_folder = store_result_folder[:-1]

    all_folders = [benchs_folder1, benchs_folder2, benchs_folder3, benchs_folder4]

    for folder in all_folders:
        if not folder:
            continue

        if "." in os.path.basename(folder):
            # Looks like a file path: continue with the directory containing it.
            folder = os.path.dirname(folder) or "."
        # folder of interest
        final_dir = extract_folders(folder)
        if not final_dir.endswith("/"):
            final_dir = final_dir + "/"

        tmp = Benchmarking(problem_type=problem_type)
        try:
            tmp.load(final_dir)
            print("bench loaded")
            tmp.create_summary_table()
            print("summary_table computed")

            experiment_name_tmp = experiment_name if experiment_name else tmp.experiment_name
            first_column = tmp.table_metrics.columns[0]
            tmp.table_metrics.rename(
                columns={first_column: experiment_name_tmp + ":" + first_column}, inplace=True,
            )
            collect_df.setdefault(experiment_name_tmp, []).append(tmp.table_metrics)
        except Exception as e:
            raise ValueError(
                f"An error occurred with folder: {final_dir}. Benchmarking object cannot be imported. In particular: {e}",
            ) from e

        # dump metrics for every model:
        metrics_dict = tmp.table_metrics.to_dict()
        print("metrics_dict created")
        # Note: the first key of the metrics dict is re-keyed as
        # "<experiment name>,<original key>" and the problem type is stored in its body.
        experiment_id = next(iter(metrics_dict))
        new_experiment_id = experiment_name_tmp + "," + experiment_id
        metrics_dict[new_experiment_id] = metrics_dict.pop(experiment_id)
        metrics_dict[new_experiment_id]["Problem type"] = tmp.problem_type
        results.append(metrics_dict)
        print("metrics_dict appended")

    # Then save one json with all the experiments (dataset x model type):
    if save_json:
        json_path = store_result_folder + "/" + "PREFER_comparison_table.json"
        with open(json_path, "w", encoding="utf-8") as jsonfile:
            json.dump(results, jsonfile)

    # NOTE(review): every experiment name writes to the same csv/pkl/html paths, so
    # the csv and pkl of a later experiment replace those of an earlier one. Whether
    # save_html appends or overwrites is not visible from here - confirm.
    for key, tables in collect_df.items():
        merged = pd.concat(tables, axis=1)
        save_html(
            merged, df_name=key, path=store_result_folder + "/" + "PREFER_comparison_table.html",
        )
        merged.to_csv(store_result_folder + "/" + "PREFER_comparison_table.csv")
        merged.to_pickle(store_result_folder + "/" + "PREFER_comparison_table.pkl")

    return


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="combine results of different PREFER runs")
    parser.add_argument(
        "-bf1",
        "--benchs_folder1",
        type=str,
        help="path of the folder where results are stored",
        required=True,
    )

    parser.add_argument(
        "-bf2", "--benchs_folder2", type=str, help="path of the folder where results are stored",
    )

    parser.add_argument(
        "-bf3", "--benchs_folder3", type=str, help="path of the folder where results are stored",
    )

    parser.add_argument(
        "-bf4", "--benchs_folder4", type=str, help="path of the folder where results are stored",
    )

    # Required: without it the script always failed in os.makedirs(None).
    parser.add_argument(
        "-srf",
        "--store_result_folder",
        type=str,
        help="path of the folder where results are stored",
        required=True,
    )

    parser.add_argument(
        "-pt", "--problem_type", type=str, help="problem_type: regression or classification",
    )

    args = parser.parse_args()
    combine_results_from_args(
        benchs_folder1=args.benchs_folder1,
        benchs_folder2=args.benchs_folder2,
        benchs_folder3=args.benchs_folder3,
        benchs_folder4=args.benchs_folder4,
        store_result_folder=args.store_result_folder,
        problem_type=args.problem_type,
    )
def _read_dataframe(path_to_df, required_columns=()):
    """Read the csv with "," and fall back to ";" when that parse fails or lacks the required columns."""
    try:
        df = pd.read_csv(path_to_df)
    except pd.errors.ParserError:
        return pd.read_csv(path_to_df, sep=";")

    missing = [col for col in required_columns if col is not None and col not in df.columns]
    if not missing:
        return df

    # A ";"-separated file usually parses without error into one single column,
    # so a parser exception alone does not detect it: retry with ";".
    try:
        df_semicolon = pd.read_csv(path_to_df, sep=";")
    except pd.errors.ParserError:
        return df
    if all(col in df_semicolon.columns for col in missing):
        return df_semicolon
    return df


def _parse_property_columns(raw_values):
    """Turn the repeated ``-pcn`` values (or one JSON list given as first value) into a list of names."""
    try:
        parsed = json.loads(raw_values[0])
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return list(raw_values)
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, str):
        return [parsed]
    # Valid JSON that is not a list (e.g. a column literally called "5"):
    # keep the names exactly as typed.
    return list(raw_values)


def compute_representations_from_args(
    path_to_df,
    representation_to_compute,
    path_to_model,
    output_dir,
    experiment_name,
    id_column_name,
    smiles_column_name,
    splitting_strategy,
    temporal_info_column_name,
    properties_column_name_list,
):
    """
    Compute one molecular representation for a dataset and store it in ``output_dir``.

    Input:
        - path_to_df: csv file, or a directory whose first entry (sorted by name)
          is the csv file. The file is read with "," as separator first and with
          ";" as fallback.
        - representation_to_compute, path_to_model, experiment_name,
          splitting_strategy: forwarded to mapping_representations.
        - output_dir: WARNING - when it already exists it is deleted and re-created.
        - id_column_name, smiles_column_name, temporal_info_column_name,
          properties_column_name_list: forwarded to prepare_data.

    Output:
        - output_dir

    Raises:
        - FileNotFoundError when path_to_df is an empty directory.
    """
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Read your .csv files
    if path_to_df.endswith("/"):  # Normalise away trailing slashes
        path_to_df = path_to_df[:-1]

    if os.path.isdir(path_to_df):
        entries = sorted(os.listdir(path_to_df))
        if not entries:
            raise FileNotFoundError(f"No file found in directory: {path_to_df}")
        path_to_df = path_to_df + "/" + entries[0]
    else:
        logger.info("Already a file")

    df = _read_dataframe(path_to_df, required_columns=(id_column_name, smiles_column_name))

    # in prepare_data now the dataset is both prepared and filtered

    # Manipulate dataframe such that it is in the right shape for being used as input of the DataStorage class
    # | ID | Smiles | Property_1 | Property_2 | ... | Property_N |
    # -------------------------------------------------------------
    # This is done by specifying the name of the columns where the ID information and the SMILES
    # representation of each sample are stored, and the list of the columns' names of the properties to model.
    df = prepare_data(
        df=df,
        id_column_name=id_column_name,
        smiles_column_name=smiles_column_name,
        properties_column_name_list=properties_column_name_list,
        temporal_info_column_name=temporal_info_column_name,
    )

    mapping_representations(
        representation_name=representation_to_compute,
        df=df,
        output_dir=output_dir,
        path_to_model=path_to_model,
        experiment_name=experiment_name,
        path_to_df=path_to_df,
        split_type=splitting_strategy,
    )
    logger.info("Representation Computed")

    return output_dir


if __name__ == "__main__":
    """
    Example of usage:
    %run get_representations.py -ptd "/path/to/dataframe/dataframe.csv" -rtc "FINGERPRINTS"
    -od "/path/to/representation/PREFER_automation_branch/" -en "logD" -icn "Molecule ChEMBL ID"
    -scn "Smiles" -pcn "Standard Value"
    """
    parser = argparse.ArgumentParser(description="compute molecule representation")
    parser.add_argument(
        "-ptd",
        "--path_to_df",
        type=str,
        help="The entire path to the dataframe used for this experiment. The dataframe should be stored as .csv, "
        "should use semicolomn as separator and should contain information about the SMILE representation "
        "of each molecule, an ID of the molecules and the property/ies one want to model.",
        required=True,
    )

    parser.add_argument(
        "-rtc",
        "--representation_to_compute",
        type=str,
        help="name of the rapresentation to compute or path to the generator which is used "
        "to map smiles into embeddings. If a model-based representation is selected then a "
        "path to model should be indicated",
        required=True,
    )

    parser.add_argument(
        "-ptm",
        "--path_to_model",
        type=str,
        help="path to the model that should be used to convert smiles into embeddings",
    )

    parser.add_argument(
        "-od",
        "--output_dir",
        type=str,
        help="path to the directory where to store the molecule representation computed",
        required=True,
    )

    parser.add_argument(
        "-en",
        "--experiment_name",
        type=str,
        help="name of the experiment one would like to perform. E.g. logD",
        required=True,
    )

    parser.add_argument(
        "-icn",
        "--id_column_name",
        type=str,
        help="name of the dataframe column where the id of each molecule is stored",
        required=True,
    )

    parser.add_argument(
        "-scn",
        "--smiles_column_name",
        type=str,
        help="name of the dataframe column where the smile representation of each molecule is stored",
        required=True,
    )

    parser.add_argument(
        "-ss",
        "--splitting_strategy",
        type=str,
        help="name of splitting startegy selected [random, temporal, cluster]",
        required=True,
    )

    parser.add_argument(
        "-ticn",
        "--temporal_info_column_name",
        type=str,
        help="name of the column where the temporal information is stored",
    )

    parser.add_argument(
        "-pcn",
        "--properties_column_name",
        action="append",
        help="list of names of the dataframe columns where the property/ies of each molecule is stored",
        required=True,
    )
    # if multiple tasks -pcn "Task1" -pcn "Task2" -pcn "Task3"

    args = parser.parse_args()
    if (
        args.representation_to_compute not in ["FINGERPRINTS", "DESCRIPTORS2D", "TF2_GNN"]
        and not args.path_to_model
    ):
        raise RuntimeError(
            "Please specify a path_to_model for molecular representations which are not in the default ones "
            "[FINGERPRINTS, DESCRIPTORS2D, TF2_GNN]"
        )

    properties_column_name = _parse_property_columns(args.properties_column_name)

    compute_representations_from_args(
        path_to_df=args.path_to_df,
        representation_to_compute=args.representation_to_compute,
        path_to_model=args.path_to_model,
        output_dir=args.output_dir,
        experiment_name=args.experiment_name,
        id_column_name=args.id_column_name,
        smiles_column_name=args.smiles_column_name,
        splitting_strategy=args.splitting_strategy,
        temporal_info_column_name=args.temporal_info_column_name,
        properties_column_name_list=properties_column_name,
    )
#!/usr/bin/python

import argparse
import os
import sys
import time
import logging


from prefer.src.vector_molecule_representations import VectorMoleculeRepresentations
from prefer.src.benchmarking import Benchmarking
from prefer.utils.save_load import saving_procedure_autosklearn

import tempfile

logger = logging.getLogger(__name__)


def run_PREFER_from_args(
    problem_type, representation_name, repr_dir, final_folder_path, experiment_name=None,
):
    """Benchmark one stored molecular representation and save the outcome.

    Args:
        problem_type: forwarded unchanged to ``Benchmarking(problem_type=...)``.
        representation_name: name of the representation; selects the loader
            class and prefixes the name of the results folder.
        repr_dir: directory holding the stored representation. Only the first
            file (in sorted order) is loaded.
        final_folder_path: directory where results are written. ``None`` means
            the current directory.
        experiment_name: optional experiment label, forwarded to the benchmark
            and appended to the results folder name.

    Raises:
        FileNotFoundError: if ``repr_dir`` does not exist or contains no file.
    """
    # Apply the default BEFORE touching the filesystem: os.makedirs(None)
    # raises TypeError, which made the documented default unusable.
    if final_folder_path is None:
        final_folder_path = "."
    os.makedirs(final_folder_path, exist_ok=True)

    # Import saved representation
    repr_type = retrieve_type_of_molecular_representation(representation_name)
    # Sorted so that the chosen file is deterministic across platforms.
    stored_files = sorted(os.listdir(repr_dir))
    if not stored_files:
        raise FileNotFoundError(f"No stored representation found in {repr_dir}")
    repr_ = repr_type.load(os.path.join(repr_dir, stored_files[0]))

    # Informational only: more than one "Property" column means multi-task data.
    tasks_number = len([col for col in repr_.df.columns if "Property" in col])
    mask = tasks_number != 1
    logger.info("tasks_number=%s, mask=%s", tasks_number, mask)

    with tempfile.TemporaryDirectory() as tmpdirname:
        bench = Benchmarking(problem_type=problem_type, working_directory=tmpdirname,)
        try:
            bench.benchmark([repr_], experiment_name=experiment_name)
        except TypeError as e:
            # Best effort, as before: report the failure and still try to save
            # whatever the benchmark object holds.
            logger.error("EXCEPTION during property model training: %s", e)

        # saving procedure
        timestr = time.strftime("%Y%m%d-%H%M%S")
        name = representation_name
        if experiment_name is not None:
            name = name + "_" + experiment_name
        dir_destination = os.path.join(final_folder_path, name + "_" + timestr)
        logger.info("Saving results in %s", dir_destination)

        # Done while the temporary working directory still exists.
        # NOTE(review): presumably the saved model may reference files in it - confirm.
        saving_procedure_autosklearn(bench, dir_destination)
    return


def retrieve_type_of_molecular_representation(representation_name: str) -> type:
    """Return the class used to load a stored representation.

    Every representation name currently maps to ``VectorMoleculeRepresentations``;
    the argument is accepted but not inspected.
    """
    return VectorMoleculeRepresentations


if __name__ == "__main__":
    # Example of usage:
    #   python run_PREFER.py -pt "regression" -rn "FINGERPRINTS" \
    #       -rd "/path/to/representation/" -ffp "./results" -en "logD"
    parser = argparse.ArgumentParser(description="run PREFER")
    parser.add_argument(
        "-pt",
        "--problem_type",
        type=str,
        help="whether this is a regression or a classification problem",
        required=True,
    )

    # NOTE: a list of representations could be accepted here in the future; if
    # it had length > 1 the first step would be to combine the representations.
    parser.add_argument(
        "-rn",
        "--representation_name",
        type=str,
        help="name of the representation to compute or path to the generator which is used to map smiles into embeddings",
        required=True,
    )

    parser.add_argument(
        "-rd",
        "--repr_dir",
        type=str,
        help="directory where the selected representation is stored",
        required=True,
    )

    parser.add_argument(
        "-ffp",
        "--final_folder_path",
        type=str,
        help="directory where the results will be stored. If not specified results will be stored in the "
        "current directory.",
    )

    parser.add_argument(
        "-en", "--experiment_name", type=str, help="name of the current experiment",
    )

    args = parser.parse_args()
    run_PREFER_from_args(
        problem_type=args.problem_type,
        representation_name=args.representation_name,
        repr_dir=args.repr_dir,
        final_folder_path=args.final_folder_path,
        experiment_name=args.experiment_name,
    )
10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def _has_all_fields(config: dict, required_fields) -> bool:
    """Return True when every name in *required_fields* is a key of *config*."""
    present = config.keys()
    return all(name in present for name in required_fields)


def validate_prefer_config(config: dict) -> bool:
    """Check that a PREFER configuration contains every mandatory key.

    Only the presence of the keys is verified, not their values.
    """
    return _has_all_fields(
        config,
        (
            "path_to_df",
            "experiment_name",
            "id_column_name",
            "smiles_column_name",
            "properties_column_name_list",
            "problem_type",
            "representations",
        ),
    )


def validate_local_model_config(config: dict) -> bool:
    """Check that a local-model configuration contains every mandatory key.

    Only the presence of the keys is verified, not their values.
    """
    return _has_all_fields(
        config,
        (
            "datapath",
            "experiment_name",
            "assay_name",
            "project_code",
            "id_column_name",
            "smiles_column_name",
            "properties_column_name",
            "problem_type",
        ),
    )
14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import logging
import pickle
from abc import abstractmethod, ABC
import time
from dataclasses import dataclass
from typing import Optional
import os

from pandas import DataFrame


@dataclass
class MoleculeRepresentations(ABC):
    """Base container for a dataframe of molecules plus bookkeeping metadata.

    Concrete subclasses must implement :meth:`split`.
    """

    df: DataFrame
    representation_name: str
    split_type: str
    model_path: str = ""
    repr_type: str = ""
    model_id: str = "tmp_id"
    experiment_name: str = "new_experiment"
    path_to_df: str = ""
    # None is the default, so the annotation must allow it.
    limit_def: Optional[int] = None

    @abstractmethod
    def split(self):
        """Split the stored data; the strategy is defined by the subclass."""
        pass

    def save(
        self,
        path: str,
        name: Optional[str] = None,
        experiment_name: Optional[str] = None,
        path_to_df: Optional[str] = None,
    ):
        """Pickle this object's attributes into a timestamped file under *path*.

        The file is named
        ``{experiment_name}_{name or representation_name}_{repr_type}_{timestamp}.pkl``.

        Args:
            path: existing directory in which the file is created.
            name: optional replacement for ``representation_name`` in the file name.
            experiment_name: if given, overwrites ``self.experiment_name`` first.
            path_to_df: if given, overwrites ``self.path_to_df`` first.

        Usage:
            mol_repr.save('../folder/')
        """
        if experiment_name is not None:
            self.experiment_name = experiment_name

        if path_to_df is not None:
            self.path_to_df = path_to_df

        timestr = time.strftime("%Y%m%d-%H%M%S")

        final_path = os.path.join(
            path,
            f"{self.experiment_name}_{name or self.representation_name}_{self.repr_type}_{timestr}.pkl",
        )

        # Only the attribute dict is stored; load() rebuilds the object from it.
        with open(final_path, "wb") as output:
            pickle.dump(self.__dict__, output, pickle.HIGHEST_PROTOCOL)

        logging.info("Representation saved in %s", final_path)

    @classmethod
    def load(cls, path: str):
        """Load MoleculeRepresentations from a .pkl file written by :meth:`save`.

        The pickled attribute dict is passed as keyword arguments to ``cls``.

        SECURITY: unpickling can execute arbitrary code - only load files that
        come from a trusted source.
        """
        with open(path, "rb") as stored:
            attributes = pickle.load(stored)
        return cls(**attributes)
import sys
from abc import abstractmethod

from pandas import DataFrame


from prefer.src.molecule_representations import MoleculeRepresentations


class MoleculeRepresentationsBuilder:
    """Interface for objects that turn molecule data into a representation.

    NOTE: the class does not derive from ``abc.ABC``, so ``@abstractmethod``
    only marks the methods; instantiation is not blocked.
    """

    @abstractmethod
    def build_representations(self, molecule_data: DataFrame) -> MoleculeRepresentations:
        """
        Build the molecular representation of interest.

        Input:
            - molecule_data: a dataframe of the shape
              | ID | Smiles | Property_1 | Property_2 | ... | Property_N |
              ------------------------------------------------------------

        This base implementation does nothing and returns None.
        """
        return None

    @abstractmethod
    def remove_nan(self, molecule_data: DataFrame):
        """Hook for NaN handling on *molecule_data*; no-op in this base class."""
        return None
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import sys
from typing import Optional
from dataclasses import dataclass


import pandas as pd


from prefer.utils.data_utils import convert
from prefer.utils.splitting_strategies import (
    random_split,
    temporal_split,
    cluster_split,
)
from prefer.utils.features_scaling import scale_features
from prefer.src.molecule_representations import MoleculeRepresentations


@dataclass
class VectorMoleculeRepresentations(MoleculeRepresentations):
    """Molecule representation stored as one vector per molecule.

    Adds train/test splitting and optional feature scaling on top of
    ``MoleculeRepresentations``.
    """

    repr_type: str = "vector"
    scale_type: Optional[str] = None
    seed: int = 1
    # Filled in by split() when scale_type is set.
    features_means: Optional[pd.Series] = None
    features_stds: Optional[pd.Series] = None

    def split(self, return_indices: bool = False):
        """Split the dataset into train and test matrices.

        The indices come from :meth:`extract_indices`. When ``scale_type`` is
        set, the features are scaled with ``scale_features`` and the returned
        means and standard deviations are stored on the instance.

        Args:
            return_indices: when True, the train and test indices are appended
                to the returned tuple.

        Returns:
            ``(Xtrain, ytrain, Xtest, ytest)`` or, with ``return_indices``,
            ``(Xtrain, ytrain, Xtest, ytest, index_train, index_test)``.

        Raises:
            ValueError: if no indices are produced, or if the split yields
                something other than exactly two index sets (validation sets
                are not supported yet).
        """
        print("Splitting the dataset according to: " + self.split_type + " split")

        indices = self.extract_indices()

        # In case of One shot one could add another split_type that should
        # split along the tasks and not on the samples.
        if not indices:
            raise ValueError("Empty indices for splitting dataset")
        if len(indices) != 2:
            raise ValueError("Validation set cannot be computed for the moment")

        logging.debug("No Validation Set")
        index_train, index_test = indices[0], indices[1]
        Xtrain, ytrain, Xtest, ytest = self.extract_matrices(index_train, index_test)
        if self.scale_type:
            print("Scaling features according to: " + self.scale_type)
            Xtrain, Xtest, self.features_means, self.features_stds = scale_features(
                Xtrain, Xtest, scaling_type=self.scale_type
            )

        if return_indices:
            return Xtrain, ytrain, Xtest, ytest, index_train, index_test
        return Xtrain, ytrain, Xtest, ytest

    def extract_matrices(self, index_train, index_test):
        """Convert the train and test rows of ``self.df`` into feature/label matrices.

        Label columns are those whose name contains ``"Property"``; if there are
        none, columns containing ``"true_label_"`` are used instead. Features are
        read from the ``molecule_representation`` column when present, otherwise
        from the column named after ``self.representation_name``.

        Args:
            index_train: positional row indices of the training set.
            index_test: positional row indices of the test set.

        Returns:
            ``(Xtrain, ytrain, Xtest, ytest)`` as produced by ``convert``.

        Raises:
            ValueError: if an index set is empty or exceeds the dataframe, or if
                no label column can be found.
        """
        if len(index_train) == 0 or len(index_test) == 0:
            raise ValueError("ERROR with indices: empty train or test index set")
        last_row = self.df.shape[0] - 1
        if max(index_train) > last_row or max(index_test) > last_row:
            raise ValueError("ERROR with indices")

        column_names = self.df.columns
        properties = column_names[["Property" in str(x) for x in column_names.values]].values
        if properties.size == 0:
            properties = column_names[
                ["true_label_" in str(x) for x in column_names.values]
            ].values
        # Must be a second `if`: as an `elif` of the check above it could never
        # run, so a dataframe without labels slipped through silently.
        if properties.size == 0:
            raise ValueError('Columns with either Property or true_label_ cannot be found in the dataset. Cannot understand where labels are stored.')

        df_train = self.df.iloc[index_train].reset_index()
        df_test = self.df.iloc[index_test].reset_index()
        if "molecule_representation" in df_train.columns.values:
            repr_name = "molecule_representation"
        else:
            repr_name = self.representation_name

        Xtrain, ytrain = convert(df_train, repr_name, properties)
        Xtest, ytest = convert(df_test, repr_name, properties)

        return Xtrain, ytrain, Xtest, ytest

    def extract_indices(self):
        """Compute the split indices with the strategy named by ``self.split_type``.

        Returns:
            Whatever the selected splitting function returns for ``self.df``.

        Raises:
            ValueError: if ``split_type`` is not ``random``, ``cluster`` or
                ``temporal``.
        """
        if self.split_type == "random":
            return random_split(self.df, self.seed, limit_def=self.limit_def)
        elif self.split_type == "cluster":
            return cluster_split(df=self.df)
        elif self.split_type == "temporal":
            return temporal_split(df=self.df)
        else:
            raise ValueError(
                f"Split method {self.split_type} is not valid. Allowed options are random, cluster, temporal"
            )
| 14,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15 17 | 15,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5 18 | 16,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66 19 | 17,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72 20 | 18,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15 21 | 19,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5 22 | 20,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66 23 | 21,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72 24 | 22,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15 25 | 23,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5 26 | 24,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66 27 | 25,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72 28 | 26,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15 29 | 27,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5 30 | 28,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66 31 | 29,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72 32 | 30,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15 33 | 31,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5 34 | 32,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66 35 | 33,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72 36 | 34,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15 37 
| 35,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5 38 | 36,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66 39 | 37,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72 40 | 38,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15 41 | 39,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5 42 | 40,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66 43 | 41,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72 44 | 42,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15 45 | 43,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5 46 | -------------------------------------------------------------------------------- /prefer/tests/file_for_test/config_PREFER_test_custom_autosklearn.yaml: -------------------------------------------------------------------------------- 1 | path_to_df: '../data_for_test/logDPublic.csv' 2 | experiment_name: 'logD' 3 | id_column_name: 'Molecule ChEMBL ID' 4 | smiles_column_name: 'Smiles' 5 | properties_column_name_list: 6 | - 'Standard Value' 7 | problem_type: 'regression' 8 | splitting_strategy: 'random' 9 | model_instance: 10 | - 'resampling_strategy="cv"' 11 | - 'per_run_time_limit=30' 12 | - 'metric = "balanced_accuracy"' 13 | - ' n_jobs = 3' 14 | - 'ppppp = "uncorrect"' 15 | 16 | -------------------------------------------------------------------------------- /prefer/tests/file_for_test/logD_desirability_scores.yaml: -------------------------------------------------------------------------------- 1 | desirability_scores: 2 | score1: 3 | - x : -1.0 4 | y : 0.0 5 | - x : 0.0 6 | y : 0.2 7 | - x : 1.0 8 | y : 0.9 9 | - x : 2.0 10 | y : 1.0 11 | - x : 3.0 12 | y : 0.5 13 | - x : 4.0 14 | y : 0.0 15 | 16 | 
-------------------------------------------------------------------------------- /prefer/tests/test_autosklearn_customization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import sys
import unittest
import yaml


from autosklearn.regression import AutoSklearnRegressor
from prefer.utils.models_utils import (
    get_autosklearn_customized_model,
    convert_atype_to_btype,
    convert_list_into_dict,
)


class TestAutosklearnCustomization(unittest.TestCase):
    """Tests for the helpers that build a customised auto-sklearn model."""

    def test_get_autosklearn_customized_model(self):
        """A regression config must yield an ``AutoSklearnRegressor``."""
        import os

        # Resolve the fixture next to this test module so the test does not
        # depend on the directory it is launched from.
        prefer_args = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "file_for_test",
            "config_PREFER_test_custom_autosklearn.yaml",
        )
        # Context manager so the file handle is always closed.
        with open(prefer_args) as a_yaml_file:
            parsed_yaml_file = yaml.load(a_yaml_file, Loader=yaml.FullLoader)

        if "model_instance" in parsed_yaml_file:
            model_instance = parsed_yaml_file["model_instance"]
        else:
            model_instance = None

        ml = get_autosklearn_customized_model(
            model_instance=model_instance, model_type="regression", working_directory="."
        )

        self.assertIsInstance(ml, AutoSklearnRegressor)

    def test_convert_atype_to_btype(self):
        """Converting an int towards a str reference must return a str."""
        a = 1
        b = "test"
        new_a = convert_atype_to_btype(a, b)
        self.assertIsInstance(new_a, str)

    def test_convert_list_into_dict(self):
        """Both ``=`` and ``:`` separated entries must become key/value pairs."""
        list_ = ["key1 = value1", "key2 : value2"]
        dict_test = {"key1": "value1", "key2": "value2"}
        dict_ = convert_list_into_dict(list_)
        self.assertEqual(dict_test, dict_)


if __name__ == "__main__":
    unittest.main()
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import sys
import unittest
import numpy as np
import pandas as pd


from prefer.utils.check_input_dataframe import (
    check_dataframe,
    check_fields,
    check_fields_types,
    check_final_structure,
)


class TestCheckDataStorage(unittest.TestCase):
    """Tests for the input-dataframe validation helpers."""

    def setUp(self):
        """Executed before every test case"""
        mol_representation_df = pd.DataFrame(
            np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD")
        )
        # Cast first: writing NaN into an integer column relies on an implicit
        # upcast that recent pandas versions warn about.
        mol_representation_df = mol_representation_df.astype({"A": float})
        mol_representation_df.iloc[0, 0] = np.nan
        mol_representation_df = mol_representation_df.dropna()
        self.globalvar = mol_representation_df

    def tearDown(self):
        """Executed after every test case"""
        print("\ntearDown executing after the test case. Result:")

    def test_check_dataframe(self):
        """The frame built in setUp must be rejected by check_dataframe."""
        self.assertFalse(check_dataframe(self.globalvar))

    def test_check_fields(self):
        """Property columns named Property_2/Property_3 must be rejected."""
        df = pd.DataFrame(
            np.random.randint(0, 100, size=(100, 4)),
            columns=list(["Smiles", "ID", "Property_2", "Property_3"]),
        )
        self.assertFalse(check_fields(df))

    def test_check_fields_types(self):
        """An unknown split type must make check_fields_types fail."""
        df = pd.DataFrame(
            np.random.randint(0, 100, size=(100, 4)),
            columns=list(["Smiles", "ID", "Property_2", "Property_3"]),
        )
        experiment_name = "experim_1"
        index_of_separation = 55
        split_type = "wrong_split_type"
        mask = False
        mask_value = -1
        problem_type = "regression"
        self.assertFalse(
            check_fields_types(
                df, experiment_name, problem_type, mask, mask_value, split_type, index_of_separation
            )
        )

    def test_check_fields_types_2(self):
        """The same arguments with a ``temporal`` split must be accepted."""
        df = pd.DataFrame(
            np.random.randint(0, 100, size=(100, 4)),
            columns=list(["Smiles", "ID", "Property_2", "Property_3"]),
        )
        experiment_name = "experim_1"
        index_of_separation = 55
        split_type = "temporal"
        mask = False
        mask_value = -1
        problem_type = "regression"
        self.assertTrue(
            check_fields_types(
                df, experiment_name, problem_type, mask, mask_value, split_type, index_of_separation
            )
        )

    def test_check_final_structure(self):
        """A NaN in a property column must make check_final_structure fail."""
        df = pd.DataFrame(
            np.random.randint(0, 100, size=(100, 4)),
            columns=list(["Smiles", "ID", "Property_1", "Property_2"]),
        )
        # Use .loc on the frame itself: chained `df[col][0] = ...` may write to
        # a temporary copy and leave df unchanged (always so with copy-on-write).
        df = df.astype({"Property_1": float})
        df.loc[0, "Property_1"] = np.nan
        self.assertFalse(check_final_structure(df))

    def test_check_final_structure_1(self):
        """A complete frame with Property_1/Property_2 must be accepted."""
        df = pd.DataFrame(
            np.random.randint(0, 100, size=(100, 4)),
            columns=list(["Smiles", "ID", "Property_1", "Property_2"]),
        )
        self.assertTrue(check_final_structure(df))


if __name__ == "__main__":
    unittest.main()
class TestDataPreparation(unittest.TestCase):
    """Error-path test for ``prepare_data``."""

    def test_prepare_data(self):
        """An ID column name absent from the frame raises the documented ``ValueError``."""
        column_names = ["Smiles", "ID", "Property_1", "Property_2"]
        random_values = np.random.randint(0, 100, size=(100, 4))
        df = pd.DataFrame(random_values, columns=column_names)

        call_kwargs = {
            "id_column_name": "invalid_name",
            "smiles_column_name": "Smiles",
            "properties_column_name_list": ["Property_1", "Property_2"],
        }
        with self.assertRaises(ValueError) as context:
            prepare_data(df, **call_kwargs)

        raised_message = str(context.exception)
        self.assertEqual("ERROR: columns name not found in the dataframe", raised_message)
and Microsoft Corporation 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class TestFiltering(unittest.TestCase):
    """Error-path tests for ``find_nan`` and ``filter_and_normalize_mols``."""

    def test_find_nan_1(self):
        """``find_nan`` raises ``ValueError`` when given an empty dataframe."""
        empty_frame = pd.DataFrame()
        requested = ["Fingerprints", "_2DDescriptors", "Embedded_cddd"]
        self.assertRaises(ValueError, find_nan, empty_frame, requested)

    def test_find_nan_2(self):
        """``find_nan`` raises ``ValueError`` for a representation name it does not know."""
        random_values = np.random.randint(0, 100, size=(100, 4))
        frame = pd.DataFrame(random_values, columns=["Smiles", "ID", "Fingerprints", "_2DD"])
        requested = ["invalid"]
        self.assertRaises(ValueError, find_nan, frame, requested)

    def test_filter_salt(self):
        """``filter_and_normalize_mols`` raises ``ValueError`` for a frame without ``Smiles``."""
        random_values = np.random.randint(0, 100, size=(100, 3))
        frame = pd.DataFrame(random_values, columns=["ID", "Fingerprints", "_2DD"])
        self.assertRaises(ValueError, filter_and_normalize_mols, frame)
10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class TestHelpers(unittest.TestCase):
    """Tests for ``output_dataframe_preparation`` (single-task and multi-task)."""

    def _assert_same_frame(self, actual, expected):
        """Assert that both frames have the same column labels and the same values.

        Every column is compared by its own label; comparing a fixed column
        on each iteration would leave the prediction and ``is_train`` columns
        unchecked.
        """
        self.assertListEqual(list(actual.columns), list(expected.columns))
        for col in expected.columns:
            self.assertListEqual(
                list(actual[col].values),
                list(expected[col].values),
                msg=f"column {col!r} differs",
            )

    def test_output_dataframe_preparation_singleTask(self):
        """Predictions for one property are placed on the matching train/test rows."""
        # list of lists
        data = [
            ["a1", "b1", "c1"],
            ["a2", "b2", "c2"],
            ["a3", "b3", "c3"],
            ["a4", "b4", "c4"],
            ["a5", "b5", "c5"],
        ]

        df = pd.DataFrame(data)
        df["Property_1"] = [
            "test_label1",
            "train_label1",
            "test_label2",
            "train_label2",
            "train_label3",
        ]
        index_train = [1, 3, 4]
        index_test = [0, 2]
        predictions_train = ["train_val1", "train_val2", "train_val3"]
        predictions_test = ["test_val1", "test_val2"]
        expected_df = df.copy()
        expected_df["model_predictions_property_1"] = [
            "test_val1",
            "train_val1",
            "test_val2",
            "train_val2",
            "train_val3",
        ]
        expected_df["is_train"] = [False, True, False, True, True]

        output_df = output_dataframe_preparation(
            df,
            index_train=index_train,
            index_test=index_test,
            predictions_train=predictions_train,
            predictions_test=predictions_test,
        )

        self._assert_same_frame(output_df, expected_df)

    def test_output_dataframe_preparation_multiTask(self):
        """Predictions for two properties are placed on the matching train/test rows."""
        # list of lists
        data = [
            ["a1", "b1", "c1"],
            ["a2", "b2", "c2"],
            ["a3", "b3", "c3"],
            ["a4", "b4", "c4"],
            ["a5", "b5", "c5"],
        ]

        df = pd.DataFrame(data)
        df["Property_1"] = [
            "test_label1",
            "train_label1",
            "test_label2",
            "train_label2",
            "train_label3",
        ]

        df["Property_2"] = [
            "test_label1",
            "train_label1",
            "test_label2",
            "train_label2",
            "train_label3",
        ]
        index_train = [1, 3, 4]
        index_test = [0, 2]
        # One row per property, transposed below to one row per sample.
        predictions_train = np.array(
            [["train_val1", "train_val2", "train_val3"], ["train_val1", "train_val2", "train_val3"]]
        )
        predictions_train = predictions_train.T
        predictions_test = np.array([["test_val1", "test_val2"], ["test_val1", "test_val2"]])
        predictions_test = predictions_test.T
        expected_df = df.copy()
        expected_df["model_predictions_property_1"] = np.array(
            ["test_val1", "train_val1", "test_val2", "train_val2", "train_val3"]
        )
        expected_df["model_predictions_property_2"] = np.array(
            ["test_val1", "train_val1", "test_val2", "train_val2", "train_val3"]
        )
        expected_df["is_train"] = [False, True, False, True, True]

        output_df = output_dataframe_preparation(
            df,
            index_train=index_train,
            index_test=index_test,
            predictions_train=predictions_train,
            predictions_test=predictions_test,
        )

        self._assert_same_frame(output_df, expected_df)
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class TestPreferModelWrapper(unittest.TestCase):
    """Smoke test for ``PreferModelWrapper.predict`` on SMILES input."""

    def test_prefer_model_wrapper(self):
        """A constant dummy model wrapped by ``PreferModelWrapper`` predicts 3.0 per input."""
        fingerprint_length = 2048  # Default value in `get_fingerprints`

        # Dummy model predicts 3.0 everywhere
        X = np.zeros((2, fingerprint_length))
        y = np.dot(X, np.ones(fingerprint_length)) + 3
        model = LinearRegression().fit(X, y)

        # NOTE(review): a desirability curve ending at `worst_score` is passed
        # in the metadata, yet the assertion below expects the raw prediction
        # (3.0) even for the unparseable SMILES - confirm this is intended.
        worst_score = 0.32

        wrapper = PreferModelWrapper(
            model=model,
            metadata={
                "problem_type": "regression",
                "best_model_representation": "FINGERPRINTS",
                "friendly_model_name": "jan",
                "desirability_scores": {"junk": [{"x": 0, "y": 1.0}, {"x": worst_score, "y": 0.0}]},
                "rep_model_id": "the_rep_model",
            },
        )
        smiles = ["CC", "CCC", "unparseable SMILES"]
        scores = wrapper.predict(smiles, is_smiles_func=True, rep_model_id=None)

        # Works for both list and ndarray results and tolerates floating-point
        # round-off instead of relying on exact equality.
        self.assertEqual(len(scores), len(smiles))
        np.testing.assert_allclose(np.asarray(scores, dtype=float), np.full(len(smiles), 3.0))
10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class TestScripts(unittest.TestCase):
    """End-to-end smoke tests for the PREFER command-line entry points.

    All paths are relative, so the tests expect to be run from the directory
    that contains ``data_for_test`` and ``file_for_test``.
    NOTE(review): ``test_run_PREFER`` and ``test_store_metadata`` read
    ``./representations_dir/`` (and ``./output_dir/``), which the earlier
    tests appear to populate - confirm the intended execution order.
    """

    @staticmethod
    def _ensure_directory(path):
        """Create *path* if needed and report the outcome on stdout (best effort)."""
        try:
            os.makedirs(path, exist_ok=True)
            print("Directory '%s' created successfully" % path)
        except OSError:
            # The format argument was previously missing, so the literal
            # '%s' was printed instead of the directory name.
            print("Directory '%s' can not be created" % path)

    def test_get_representations(self):
        """Computing fingerprints returns the output directory that was passed in."""
        path_to_df = "./data_for_test/logDPublic.csv"
        representation_to_compute = "FINGERPRINTS"
        path_to_model = None
        output_dir = "./representations_dir/"
        experiment_name = "test_logDPublic"
        id_column_name = "Molecule ChEMBL ID"
        smiles_column_name = "Smiles"
        splitting_strategy = "random"
        temporal_info_column_name = None
        properties_column_name_list = ["Standard Value"]
        self._ensure_directory(output_dir)

        output_dir_new = compute_representations_from_args(
            path_to_df,
            representation_to_compute,
            path_to_model,
            output_dir,
            experiment_name,
            id_column_name,
            smiles_column_name,
            splitting_strategy,
            temporal_info_column_name,
            properties_column_name_list,
        )
        self.assertEqual(output_dir, output_dir_new)

    def test_run_PREFER(self):
        """Running PREFER on the stored fingerprints completes without raising."""
        problem_type = "regression"
        representation_name = "FINGERPRINTS"
        repr_dir = "./representations_dir/"
        final_folder_path = "./output_dir/"
        experiment_name = "test_logDPublic"
        self._ensure_directory(final_folder_path)

        run_PREFER_from_args(
            problem_type, representation_name, repr_dir, final_folder_path, experiment_name,
        )

    def test_store_metadata(self):
        """Storing the model wrapper metadata completes without raising."""
        path_to_df = "./data_for_test/logDPublic.csv"
        path_to_model = None
        problem_type = "regression"
        experiment_name = "test_logDPublic_wrapper"
        id_column_name = "Molecule ChEMBL ID"
        smiles_column_name = "Smiles"
        properties_column_name_list = ["Standard Value"]
        representation_name = "FINGERPRINTS"
        final_folder_path = "./wrappers_dir/"
        self._ensure_directory(final_folder_path)

        property_model_folder_path = "./output_dir/"
        repr_dir = "./representations_dir/"
        # A malformed YAML file now fails the test with the real parser error
        # instead of a NameError on an unbound variable further down.
        with open("./file_for_test/logD_desirability_scores.yaml") as file:
            parsed_yaml_file = yaml.safe_load(file)

        # JSON round trip keeps the original normalisation (plain dicts/lists,
        # string keys); json.loads also accepts true/false/null, which
        # ast.literal_eval cannot parse.
        desirability_scores = json.loads(json.dumps(parsed_yaml_file["desirability_scores"]))
        store_metadata(
            path_to_df=path_to_df,
            path_to_model=path_to_model,
            problem_type=problem_type,
            experiment_name=experiment_name,
            id_column_name=id_column_name,
            smiles_column_name=smiles_column_name,
            properties_column_name_list=properties_column_name_list,
            representation_name=representation_name,
            final_folder_path=final_folder_path,
            property_model_folder_path=property_model_folder_path,
            repr_dir=repr_dir,
            desirability_scores=desirability_scores,
        )
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/prefer/utils/__init__.py -------------------------------------------------------------------------------- /prefer/utils/check_input_dataframe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
def check_dataframe(df):
    """
    Check whether the row labels of a dataframe end where a fresh 0..n-1 index would.

    Only the last row label is compared with ``n_rows - 1``; a message is
    printed when they differ.

    Returns True when the last label equals ``n_rows - 1`` and False
    otherwise. An empty dataframe has no label to compare and is reported
    as consistent.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        # df.index[-1] would raise IndexError on an empty frame.
        return True
    is_consistent = bool(df.index[-1] == (n_rows - 1))
    if not is_consistent:
        print("Indices of the dataframe are not correct")
    return is_consistent


def check_fields(df):
    """
    Check that the dataframe exposes the columns PREFER expects.

    With k columns whose name contains "Property", the expected columns are
    "Smiles", "ID" and "Property_1" ... "Property_k".

    Returns True when every expected column is present, False otherwise.
    """
    n_properties = sum("Property" in str(name) for name in df.columns.values)
    expected_cols = ["Smiles", "ID"] + [
        "Property_" + str(position) for position in range(1, n_properties + 1)
    ]
    return all(name in df.columns for name in expected_cols)


def check_fields_types(
    df, experiment_name, problem_type, mask, mask_value, split_type, index_of_separation
):
    """
    Check that the experiment settings have the expected types and values.

    Returns True only if ``df`` is a pandas DataFrame, ``experiment_name``,
    ``problem_type`` and ``split_type`` are strings, ``split_type`` is one of
    "random", "cluster" or "temporal", ``problem_type`` is "regression" or
    "classification", ``index_of_separation`` is an int, ``mask`` is a bool
    and ``mask_value`` is an int, float or complex number.

    Note that ``bool`` is a subclass of ``int``, so boolean values pass the
    ``index_of_separation`` and ``mask_value`` checks.
    """
    checks = (
        isinstance(df, pd.DataFrame),
        isinstance(experiment_name, str),
        isinstance(problem_type, str),
        isinstance(split_type, str),
        split_type in ["random", "cluster", "temporal"],
        problem_type in ["regression", "classification"],
        isinstance(index_of_separation, int),
        isinstance(mask, bool),
        isinstance(mask_value, (int, float, complex)),
    )
    return all(checks)


def check_final_structure(df):
    """
    Check that no property column of the dataframe contains missing labels.

    Every column whose name contains "Property" is inspected; the first one
    holding a NaN is logged as an error.

    Returns False as soon as a property column with NaN is found, True
    otherwise.
    """
    property_to_eval = [name for name in df.columns.values if "Property" in str(name)]
    for prop in property_to_eval:
        if df[prop].isnull().sum() > 0:
            logging.error(
                "ERROR --> some labels are NaN. Please check your dataframes before running eval_.BenchMoleProp()"
            )
            return False
    return True
def prepare_data(
    df,
    id_column_name: str,
    smiles_column_name: str,
    properties_column_name_list: list,
    temporal_info_column_name: str = None,
    filter_flag: bool = True,
):
    """
    Function to prepare datasets.

    The selected columns are renamed to the names used inside PREFER
    ("ID", "Smiles", optionally "Time", and "Property_1" ... "Property_k" in
    the order given), rows with a missing label in any selected property are
    dropped and the row index is reset. The dataframe passed in is left
    unchanged.

    The inputs are:
        df: dataframe to be manipulated
        id_column_name: string of the name of the column where the ID is stored
        smiles_column_name: string of the name of the column where the smile representation is stored
        properties_column_name_list: list of the strings/names of the property/ies to evaluate
        temporal_info_column_name: optional name of the column holding temporal
            information; it is converted with ``pd.to_datetime`` and returned as "Time"
        filter_flag: when True the prepared dataframe is passed through
            ``filter_and_normalize_mols`` before being returned

    Returns the prepared dataframe restricted to the renamed columns.

    Raises ValueError if ``properties_column_name_list`` is not a list, holds
    duplicates, or if any requested column is missing from ``df``.
    """
    if not isinstance(properties_column_name_list, list):
        raise ValueError('properties_column_name_list should be a list of names of the selected labels')
    # Evaluate whether unique labels
    if len(properties_column_name_list) > len(set(properties_column_name_list)):
        raise ValueError('Duplicates in the labels list cannot be handled by PREFER - please provide unique labels names')

    # Check if consistent
    required = [id_column_name, smiles_column_name, *properties_column_name_list]
    if temporal_info_column_name:
        required.append(temporal_info_column_name)
    available = df.columns.values
    if not all(name in available for name in required):
        raise ValueError("ERROR: columns name not found in the dataframe")

    property_labels = [
        "Property_" + str(position)
        for position in range(1, len(properties_column_name_list) + 1)
    ]

    renaming = {id_column_name: "ID", smiles_column_name: "Smiles"}
    cols = ["ID", "Smiles"]
    if temporal_info_column_name:
        renaming[temporal_info_column_name] = "Time"
        cols.append("Time")
    renaming.update(zip(properties_column_name_list, property_labels))
    cols.extend(property_labels)

    # A single, non in-place rename keeps the caller's dataframe intact and
    # applies all renames at once, so an input column that is already called
    # e.g. "Property_2" cannot collide with a freshly renamed one.
    prepared = df.rename(columns=renaming)

    if temporal_info_column_name:
        prepared["Time"] = pd.to_datetime(prepared["Time"])

    # TO DO extend AutoSklearn in the case of sparsity of the label matrx. For now we need to remove nans
    print(
        "WARNING: Autosklearn does not handle for now label matrix sparsity, thus nan values will be removed both for single task and multitasking cases"
    )
    if property_labels:
        prepared = prepared.dropna(subset=property_labels)
    prepared = prepared.reset_index(drop=True)

    prepared = prepared[cols]
    if filter_flag:
        return filter_and_normalize_mols(prepared)
    return prepared
def _scale_rows(frame, scaling_type, means, stds):
    """Scale every row of *frame* with ``apply_scaling`` and stack the results."""
    return np.array(
        [
            apply_scaling(row, scaling_type=scaling_type, means=means, stds=stds)
            for row in frame.to_numpy()
        ]
    )


def scale_features(Xtrain, Xtest, scaling_type="standardization"):
    """
    Scale train and test features using statistics of the training set only.

    Inputs:
        - Xtrain, Xtest: 2D array-likes (one row per sample, one column per feature)
        - scaling_type: string, either "standardization" ((x - mean) / std) or
          "normalization" (x - mean)
    Output:
        - Xtrain_scaled, Xtest_scaled: numpy arrays of the scaled rows. With
          "standardization", features whose training std is zero are removed by
          apply_scaling, so these arrays can have fewer columns than the inputs.
        - means_, stds_: pandas Series with the per-feature mean and std of Xtrain
    Raises:
        - ValueError if scaling_type is unknown, if a row contains non-finite values,
          or if the scaled training set fails the sanity checks below.
    """
    print("Scaling features")
    # Reject an unknown scaling type up front; otherwise it would only be noticed
    # while scaling the first row (and never for inputs without rows).
    if scaling_type not in ("standardization", "normalization"):
        raise ValueError("ERROR: only standardization or normalization are possible scaling_type")

    Xtrain_df = pd.DataFrame(Xtrain)
    Xtest_df = pd.DataFrame(Xtest)
    # statistics are computed on the training set only, to avoid leaking test data
    means_ = Xtrain_df.mean()
    stds_ = Xtrain_df.std()

    Xtrain_scaled = _scale_rows(Xtrain_df, scaling_type, means_, stds_)
    Xtest_scaled = _scale_rows(Xtest_df, scaling_type, means_, stds_)

    # Sanity checks on the scaled training set (rounded to one decimal place).
    scaled_train_df = pd.DataFrame(Xtrain_scaled)
    means_sc = np.round(scaled_train_df.mean().to_numpy(dtype=float), 1)

    if scaling_type == "standardization":
        stds_sc = np.round(scaled_train_df.std().to_numpy(dtype=float), 1)
        if not (stds_sc == 1.0).all():
            raise ValueError(
                "ERROR: when standardizing matrix; not all stds of the scaled matrix are 1.0"
            )
        if not (means_sc == 0.0).all():
            raise ValueError(
                "ERROR: when standardizing matrix; not all means of the scaled matrix are 0.0"
            )
    elif not (means_sc == 0.0).all():
        raise ValueError(
            "ERROR: when normalizing matrix; not all means of the scaled matrix are 0.0"
        )

    return Xtrain_scaled, Xtest_scaled, means_, stds_


def apply_scaling(features_vect, scaling_type="standardization", means=None, stds=None):
    """
    Apply a specific scaling to the features of a single sample, given means and stds.

    Inputs:
        - features_vect: 1D array-like with the features of a single sample
        - scaling_type: string, either "standardization" or "normalization"
        - means: 1D array-like of the means (one for each feature); mandatory
        - stds: 1D array-like of the standard deviations (one for each feature);
          mandatory for "standardization"
    Output:
        - list of scaled features. With "standardization", features whose std is
          zero are dropped from the output.
    Raises:
        - ValueError if features_vect contains non-finite values, if means is missing,
          if dimensions do not match, if scaling_type is unknown, if stds is missing
          for "standardization", or if the NaNs produced by the division do not
          coincide exactly with the zeros in stds.
    """
    features_vect = np.array(features_vect)
    # Check element-wise: summing first (as done previously) overflows to inf for
    # large but perfectly finite inputs and rejected them by mistake.
    if not np.isfinite(np.asarray(features_vect, dtype=float)).all():
        raise ValueError("features_vect provided to the apply_scaling contains nan - cannot scale.")

    if means is None:
        raise ValueError("ERROR: please provide a means vector (one value for each feature)")

    means = np.array(means)
    if means.shape[0] != features_vect.shape[0]:
        raise ValueError(
            f"ERROR: features_vect dimension ({features_vect.shape[0]}) does not match with means dimension ({means.shape[0]})"
        )

    if stds is not None:
        stds = np.array(stds)
        if stds.shape[0] != features_vect.shape[0]:
            raise ValueError("ERROR: features_vect dimension does not match with stds dimension")

    if scaling_type == "normalization":
        return list(features_vect - means)
    if scaling_type != "standardization":
        raise ValueError("ERROR: only standardization or normalization are possible scaling_type")
    if stds is None:
        raise ValueError("ERROR: only normalization is possible since stds vector is not provided")

    # Division by a zero std is expected here; silence numpy's warnings and turn
    # every resulting inf into nan so zero-variance features can be detected.
    with np.errstate(divide="ignore", invalid="ignore"):
        scaled = (features_vect - means) / stds
    scaled[np.isinf(scaled)] = np.nan

    nan_mask = np.isnan(scaled)
    zero_std_mask = stds == 0
    if nan_mask.any() != zero_std_mask.any():
        raise ValueError(
            "ERROR: found nans in standardize features_vect but no zeros in stds or viceversa"
        )
    # zeros in stds must correspond exactly to the nans in the standardized vector
    if not np.array_equal(nan_mask, zero_std_mask):
        raise ValueError(
            "ERROR: there is a problem with the standardization: no match between zeros in the stds vector and nans in the standardize features vector"
        )
    # drop the zero-variance features from the output
    return list(scaled[~nan_mask])
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above 11 | # copyright notice, this list of conditions and the following 12 | # disclaimer in the documentation and/or other materials provided 13 | # with the distribution. 14 | # * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class MissingSmiles(ValueError):
    """Raised when the dataframe to filter has no ``Smiles`` column."""


class EmptyDataframe(ValueError):
    """Raised when an empty dataframe is passed to ``find_nan``."""


class MissingRepresentation(ValueError):
    """Raised when a requested representation column is not in the dataframe."""


def filter_and_normalize_smiles(smiles):
    """
    Filter and normalize a single SMILES string.

    Returns the non-isomeric SMILES of the uncharged molecule, or None when the
    molecule is rejected: unparsable SMILES, more than 100 atoms, salt fragments
    found by the salt remover, failed sanitization, presence of one of the
    forbidden elements, or a resulting SMILES that is shorter than two characters
    or contains "*" or "R".
    """
    uncharger = rdMolStandardize.Uncharger()
    # format is smarts, then flag to allow one to pick and choose which to include
    forbidden_elements = Chem.MolFromSmarts(
        "[Cu,Sb,As,Sn,Pt,Te,Pd,Lu,Ge,Zn,Cu,Co,Ni,Fe,Hg,Zr,Mn,Ag,Bi,Cd,Cr,Ti,Al,Au,Mo,V,Mg,In,Ga,Pb,Ca,W]"
    )
    remover = SaltRemover.SaltRemover(defnFilename=directory_path / SALTS_FILE)

    try:
        mol = Chem.MolFromSmiles(str(smiles))
    except Exception as e:
        logging.error(f"ERROR: Invalid SMILES {smiles}.{e}")
        # Without this return, ``mol`` would be unbound below and the function
        # would crash with UnboundLocalError instead of rejecting the SMILES.
        return None

    if mol is None or mol.GetNumAtoms() > 100:
        return None

    # Only the list of deleted fragments is needed: molecules for which the
    # remover deleted anything are rejected rather than returned stripped.
    _, deleted = remover.StripMolWithDeleted(mol)
    if len(deleted) != 0:
        return None

    mol = uncharger.uncharge(mol)
    if mol is None:
        return None

    # With catchErrors=True a truthy return value signals a sanitization failure
    # (the molecule may have changed after uncharging).
    if Chem.SanitizeMol(mol, catchErrors=True):
        return None

    if mol.HasSubstructMatch(forbidden_elements):
        return None

    smi = Chem.MolToSmiles(mol, isomericSmiles=False)
    if len(smi) < 2 or "*" in smi or "R" in smi:
        return None

    return smi
def _normalized_smiles_or_none(smile, uncharger, remover, forbidden_elements):
    """Return the normalized SMILES for *smile*, or None when the molecule must be dropped."""
    mol = Chem.MolFromSmiles(str(smile))
    if mol is None:
        logging.warning("WARNING: mol is None for smile: " + str(smile))
        return None
    if mol.GetNumAtoms() > 100:
        return None

    # molecules for which the salt remover deleted any fragment are dropped
    _, deleted = remover.StripMolWithDeleted(mol)
    if len(deleted) != 0:
        return None

    mol = uncharger.uncharge(mol)
    if mol is None:
        return None

    # truthy return value = sanitization failed (the molecule may have changed)
    if Chem.SanitizeMol(mol, catchErrors=True):
        return None

    if mol.HasSubstructMatch(forbidden_elements):
        return None

    smi = Chem.MolToSmiles(mol, isomericSmiles=False)
    if len(smi) < 2 or "*" in smi or "R" in smi:
        return None
    return smi


def filter_and_normalize_mols(df):
    """
    Filter the raw dataset at the beginning of the benchmarking pipeline.

    Every entry of the ``Smiles`` column is parsed and normalized; rows whose
    molecule is rejected are dropped and the surviving rows get their normalized
    SMILES. The input dataframe is not modified: a new dataframe with a fresh
    0..n-1 index is returned.

    Raises:
        - MissingSmiles if df has no "Smiles" column
        - ValueError if check_dataframe rejects the filtered dataframe
    """
    # Turn off the RDKit warnings
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.CRITICAL)

    # Check if 'Smiles' columns is in the current dataframe
    if "Smiles" not in df.columns.values:
        raise MissingSmiles("ERROR: Smiles column not in the dataframe")

    uncharger = rdMolStandardize.Uncharger()
    # format is smarts, then flag to allow one to pick and choose which to include
    forbidden_elements = Chem.MolFromSmarts(
        "[Cu,Sb,As,Sn,Pt,Te,Pd,Lu,Ge,Zn,Cu,Co,Ni,Fe,Hg,Zr,Mn,Ag,Bi,Cd,Cr,Ti,Al,Au,Mo,V,Mg,In,Ga,Pb,Ca,W]"
    )
    # built once instead of once per molecule
    remover = SaltRemover.SaltRemover(defnFilename=directory_path / SALTS_FILE)

    normalized = [
        _normalized_smiles_or_none(smile, uncharger, remover, forbidden_elements)
        for smile in df["Smiles"]
    ]
    # Boolean mask over row positions: independent of the dataframe's index labels,
    # so dataframes without a default RangeIndex are filtered correctly too.
    keep = np.array([smi is not None for smi in normalized], dtype=bool)

    n_rows = df.shape[0]
    n_dropped = n_rows - int(keep.sum())
    percentage_dropped = (n_dropped * 100) / n_rows if n_rows else 0.0
    print("Percentage of dropped molecule: " + str(percentage_dropped))

    filtered_df = df.loc[keep].copy()
    filtered_df["Smiles"] = [smi for smi in normalized if smi is not None]
    filtered_df = filtered_df.reset_index(drop=True)

    # check whether the indices are all correct or something went wrong
    if not check_dataframe(filtered_df):
        raise ValueError(
            "ERROR: Problem with inidices. Maybe a reset_index() is needed. The dataset will not be updated."
        )
    return filtered_df


def find_nan(df, representation_to_evaluate=None, drop=False):
    """
    Look for rows whose molecular representation contains nan values.

    For every representation column to evaluate, the rows whose value contains at
    least one nan are located. If drop is True those rows are removed from df
    in place and the index is reset to 0..n-1 (also in place).

    Inputs:
        - df: dataframe with one column per representation
        - representation_to_evaluate: list of column names to check; when empty or
          None the names are obtained from extract_representations(df)
        - drop: whether rows containing nan values must be removed
    Output:
        - the (possibly modified) input dataframe
    Raises:
        - EmptyDataframe if df is empty
        - MissingRepresentation if a requested representation is not a column of df
    """
    logging.info("filter nan values in the molecular representations")

    # Check if empty
    if df.empty:
        raise EmptyDataframe("ERROR: df is empty")

    # Extract representation_to_evaluate if empty
    if not representation_to_evaluate:
        representation_to_evaluate = extract_representations(df)
    elif not all(repr_ in df.columns.values for repr_ in representation_to_evaluate):
        # Check if the representation is in the dataframe
        raise MissingRepresentation(
            "ERROR: One or more representations are not in the dataset stored. HINT: Run Molecules_Representations to compute the representations needed"
        )

    for representation in representation_to_evaluate:
        logging.info("For the representation " + representation)
        has_nan = np.array([bool(np.isnan(x).any()) for x in df[representation]], dtype=bool)
        if not has_nan.any():
            logging.info(
                "No molecules need to be dropped for " + representation + " representation"
            )
            continue

        indx_nan = np.flatnonzero(has_nan).tolist()
        if not drop:
            logging.info("Found nan values (molecules not dropped) at positions:" + str(indx_nan))
            continue

        logging.info("Drop Molecules at positions:" + str(indx_nan))
        # Translate positions to index labels, then modify the caller's dataframe
        # in place; rebinding the local name would lose the reset index and hide
        # the drops made for the following representations.
        df.drop(index=df.index[has_nan], inplace=True)
        df.reset_index(drop=True, inplace=True)

    return df
def mapping_representations(
    representation_name: str,
    df: DataFrame,
    output_dir: str,
    path_to_model: str = "",
    path_to_df: str = "",
    experiment_name: str = "",
    split_type: str = "random",
):
    """
    Build the molecule representation called *representation_name* and save it.

    "DESCRIPTORS2D" and "FINGERPRINTS" are handled by their dedicated builders;
    every other name is handed to ModelRepresentationsBuilder together with
    *path_to_model*. The representations are built from *df* with the given
    *split_type* and then stored through their ``save`` method, which receives
    *output_dir*, *representation_name*, *experiment_name* and *path_to_df*.

    Nothing is returned.
    """
    model_free_builders = {
        "DESCRIPTORS2D": Descriptors2DRepresentationsBuilder,
        "FINGERPRINTS": FingerprintsRepresentationsBuilder,
    }
    builder_cls = model_free_builders.get(representation_name)

    if builder_cls is None:
        # any other name is treated as a model-based representation
        builder = ModelRepresentationsBuilder(
            path_to_model=path_to_model, representation_name=representation_name
        )
    else:
        builder = builder_cls()

    built = builder.build_representations(molecule_data_orig=df, split_type=split_type)
    built.save(output_dir, representation_name, experiment_name, path_to_df)


def representations_supported():
    """
    Return the names of the representations currently supported by PREFER.
    """
    supported = ["CDDD", "DESCRIPTORS2D", "MOLER", "FINGERPRINTS"]
    return supported
nor Microsoft Corporation 15 | # nor the names of its contributors may be used to endorse or promote 16 | # products derived from this software without specific prior written permission. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def set_random_seed(seed: int = 42):
    """
    Set the random seed for random, numpy, tensorflow and pytorch.

    ``random`` and ``numpy`` are always seeded. TensorFlow is imported and seeded
    when it can be imported; otherwise a message is printed and it is skipped.
    PyTorch is seeded only when it has already been imported by the running
    program (it is never imported from here).

    Args:
        seed: the magic hyperparameter

    Returns: None

    """
    import sys

    np.random.seed(seed)
    random.seed(seed)

    try:
        import tensorflow as tf
    except ImportError:
        # ImportError also covers installations that exist but cannot be loaded,
        # so seeding the other libraries is not aborted by a broken TensorFlow.
        print("Tensorflow not found; skipping: tf.random.set_seed(...)")
    else:
        # ``tf.__version__`` is used because ``tf.version`` is missing in old 1.x releases
        tf_version = int(tf.__version__.split(".")[0])
        if tf_version <= 1:
            tf.set_random_seed(seed)
        else:
            tf.random.set_seed(seed)

    torch = sys.modules.get("torch")
    if torch is not None:
        torch.manual_seed(seed)
GitHub location 18 | setup_requires=["setuptools_scm"], 19 | python_requires="==3.7.7", 20 | install_requires=[ 21 | "dpu-utils>=0.2.13", 22 | "scikit-learn==0.24.1", 23 | "numpy==1.19.2", 24 | "pandas>=1.2.4", 25 | "auto-sklearn==0.14.7", 26 | ], 27 | packages=setuptools.find_packages(), 28 | entry_points={"console_scripts": ["prefer = prefer.run_prefer_automation:run_PREFER"]}, 29 | ) 30 | -------------------------------------------------------------------------------- /small_data_experiments/README_smalldata.txt: -------------------------------------------------------------------------------- 1 | The set of files stored in this folder can be used to 2 | 1. Download the FS-Mol test sets from the main FS-Mol repo (extract_zipped_files.ipynb) [you need to unzip the main fsmol.tar file to run the script] 3 | 2. Run PREFER on the FS-Mol test sub-sets (run_PREFER_smalldata_example.ipynb) 4 | 3. Analyze the results and compare them with respect to the FS-Mol results (analysis_smalldata_example.ipynb) -------------------------------------------------------------------------------- /small_data_experiments/extract_zipped_files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "48c532a6-753c-4eaa-a68f-a87a8d91d139", 6 | "metadata": {}, 7 | "source": [ 8 | "## Notebook to extract zip files downloaded from [FS-Mol repo](https://figshare.com/ndownloader/files/31345321)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "6402e340-f77c-4a05-82dc-88adebd54d6d", 14 | "metadata": {}, 15 | "source": [ 16 | "This notebook is provided to easily extract and convert zipped files from the FS-Mol repository to .csv files. 
So before running the cells please download the zip files from [here](https://figshare.com/ndownloader/files/31345321)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "e1a35cd3-16bb-4a4d-b77e-f7ac9cd3e0ab", 22 | "metadata": {}, 23 | "source": [ 24 | "### To run the notebook please extract the fsmol.tar" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "3cd989e0-c560-4845-80bc-90a4411cb38c", 31 | "metadata": { 32 | "tags": [] 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "# Add the path where you have saved the zip files\n", 37 | "path_to_zip_files = None\n", 38 | "# Add path where you would like to store the converted csv files to be used in PREFER\n", 39 | "path_where_to_store_csv_files = None" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "673ef336-c98b-4e6b-8641-a1d7af0dd5e0", 46 | "metadata": { 47 | "tags": [] 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "# unzip files\n", 52 | "import os\n", 53 | "import json, gzip\n", 54 | "\n", 55 | "# only test sets will be used for training and testing the PREFER model\n", 56 | "sets = ['test']\n", 57 | "index = 0\n", 58 | "for set_ in sets:\n", 59 | " path_to_files = f'{path_to_zip_files}/{set_}/'\n", 60 | " zipped_file_names = os.listdir(path_to_files)\n", 61 | " for zipped_file in zipped_file_names:\n", 62 | " print(f'current file is: {zipped_file} - iteration number: {index}')\n", 63 | " index=index+1\n", 64 | " run = f'gunzip {path_to_files}{zipped_file}'\n", 65 | " !{run}" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "c09e06bd-6d78-4315-bd90-328628d7d7fb", 72 | "metadata": { 73 | "tags": [] 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "# covert data for PREFER\n", 78 | "\n", 79 | "import json\n", 80 | "import pandas as pd\n", 81 | "file_names = os.listdir(path_to_files)\n", 82 | "\n", 83 | "for file_name in file_names:\n", 84 | " print(f'Current file is 
{file_name}')\n", 85 | " data = []\n", 86 | " df = pd.DataFrame()\n", 87 | " collect_smiles = []\n", 88 | " collect_ids = []\n", 89 | " collect_labels = []\n", 90 | " with open(path_to_files+file_name) as f:\n", 91 | " for line in f:\n", 92 | " data.append(json.loads(line))\n", 93 | " for elem in data:\n", 94 | " collect_smiles.append(elem['SMILES'])\n", 95 | " collect_ids.append(elem['Assay_ID'])\n", 96 | " collect_labels.append(elem['Property'])\n", 97 | " df = pd.DataFrame({'SMILES': collect_smiles, 'Assay_ID': collect_ids, 'Property': collect_labels})\n", 98 | " file_name = file_name.replace('.jsonl', '')\n", 99 | " df.to_csv(f'{path_where_to_store_csv_files}/{file_name}.csv', index = False)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "35a91a35-0bd2-4b87-b206-d4138a366b6a", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [] 109 | } 110 | ], 111 | "metadata": { 112 | "interpreter": { 113 | "hash": "e06fcc65451699fab52210cecc89ce74d347871d8379f3a65371b5502fcda228" 114 | }, 115 | "kernelspec": { 116 | "display_name": "Python (prefer-env-released2)", 117 | "language": "python", 118 | "name": "prefer-env-released2" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.7" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 5 135 | } 136 | --------------------------------------------------------------------------------