├── .gitmodules
├── LICENSE
├── README.md
├── Run_PREFER.ipynb
├── __init__.py
├── analysis_notebooks
    ├── Best_PREFER_model_VS_RF.ipynb
    ├── Plot_performance_distributions_for_representation.ipynb
    ├── README.txt
    └── TestSet_Bootstrapping.ipynb
├── api_version.txt
├── cddd-environment-light.yml
├── cddd-environment.yml
├── compute_model_based_representations.py
├── config_files
    ├── config_PREFER_logD.yaml
    ├── config_PREFER_smalldata.yaml
    ├── config_PREFER_solub.yaml
    └── config_model_based_representations.yaml
├── moler-environment-light.yml
├── moler-environment.yml
├── prefer-environment.yml
├── prefer
    ├── __init__.py
    ├── azure_ml
    │   ├── README.md
    │   ├── aml_config.py
    │   ├── aml_configuration
    │   │   └── aml_config.json
    │   ├── config_logD_azure.yaml
    │   ├── exceptions.py
    │   ├── get_model_utils.py
    │   ├── included_prefixes.json
    │   ├── model_registration_prefer.py
    │   ├── model_registration_utils.py
    │   ├── reproducibility.py
    │   ├── schedule_global_model_pipeline.py
    │   ├── telemetry_utils.py
    │   └── utils.py
    ├── docs
    │   ├── PREFER_scheme.png
    │   └── SaltsMod.txt
    ├── model_based_representations
    │   ├── __init__.py
    │   ├── cddd_wrapper.py
    │   ├── interface.py
    │   ├── model_based_representations_factory.py
    │   ├── models
    │   │   └── __init__.py
    │   └── moler_wrapper.py
    ├── molecule_representations
    │   ├── __init__.py
    │   ├── descriptors2D_representations_builder.py
    │   ├── fingerprints_representations_builder.py
    │   └── model_representations_builder.py
    ├── schema
    │   ├── __init__.py
    │   └── config.py
    ├── scripts
    │   ├── __init__.py
    │   ├── aml_context.py
    │   ├── combine_results.py
    │   ├── get_representations.py
    │   ├── model_wrapper.py
    │   ├── run_PREFER.py
    │   └── utils.py
    ├── src
    │   ├── __init__.py
    │   ├── benchmarking.py
    │   ├── molecule_representations.py
    │   ├── molecule_representations_builder.py
    │   ├── prefer_model_wrapper.py
    │   └── vector_molecule_representations.py
    ├── tests
    │   ├── __init__.py
    │   ├── data_for_test
    │   │   └── logDPublic.csv
    │   ├── file_for_test
    │   │   ├── config_PREFER_test_custom_autosklearn.yaml
    │   │   └── logD_desirability_scores.yaml
    │   ├── test_autosklearn_customization.py
    │   ├── test_check_input_dataframe.py
    │   ├── test_data_preparation.py
    │   ├── test_filtering.py
    │   ├── test_helpers.py
    │   ├── test_prefer_model_wrapper.py
    │   └── test_scripts.py
    └── utils
    │   ├── __init__.py
    │   ├── automation.py
    │   ├── check_input_dataframe.py
    │   ├── data_preparation.py
    │   ├── data_utils.py
    │   ├── features_scaling.py
    │   ├── filtering.py
    │   ├── mapping.py
    │   ├── models_evaluation.py
    │   ├── models_utils.py
    │   ├── post_processing_and_optimization_helpers.py
    │   ├── random_utils.py
    │   ├── run_automl.py
    │   ├── save_load.py
    │   └── splitting_strategies.py
├── pyproject.toml
├── run_prefer_automation.py
├── setup.py
└── small_data_experiments
    ├── README_smalldata.txt
    ├── analysis_smalldata_example.ipynb
    ├── extract_zipped_files.ipynb
    ├── run_PREFER_smalldata_example.ipynb
    └── run_prefer_automation_smalldata.py


/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "prefer/model_based_representations/models/cddd"]
2 | 	path = prefer/model_based_representations/models/cddd
3 | 	url = https://github.com/jrwnter/cddd
4 | [submodule "prefer/model_based_representations/models/molecule-generation"]
5 | 	path = prefer/model_based_representations/models/molecule-generation
6 | 	url = https://github.com/microsoft/molecule-generation
7 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2023, RDKit
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice, this
 9 |    list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 |    this list of conditions and the following disclaimer in the documentation
13 |    and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its
16 |    contributors may be used to endorse or promote products derived from
17 |    this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Benchmarking and Property Prediction Framework (PREFER)
  2 | 
  3 | The PREFER framework automates the evaluation of different combinations of molecular representations and machine learning models for predicting molecular properties. 
  4 | It covers different molecular representations, from classical ones, e.g. Fingerprints and 2D Descriptors, to data-driven ones, e.g. Continuous and Data-Driven Descriptors (CDDD) [1] or MoLeR [2].
  5 | PREFER uses AutoSklearn [3] to implement the ML model selection and the hyperparameter tuning.
  6 | 
  7 | ![caption](prefer/docs/PREFER_scheme.png)
  8 | 
  9 | *General overview of the PREFER framework where the Model Selection part is based on [3].*
 10 | 
 11 | ## Getting Started
 12 | 
 13 | ### Installation
 14 | 
 15 | #### Python Environment
 16 | The main conda environment for using PREFER can be installed from `prefer-environment.yml`, as follows:
 17 | 
 18 | ```
 19 | conda env create -f prefer-environment.yml
 20 | ```
 21 | 
 22 | Depending on the models employed to generate model-based molecular representations, other environments need to be installed (one for each model). The supported models in the current PREFER code are CDDD [1] and MoLeR [2]. The corresponding environments can be found in `moler-environment-light.yml` and `cddd-environment-light.yml` and can be installed as follows:
 23 | 
 24 | ```
 25 | conda env create -f moler-environment-light.yml
 26 | OR
 27 | conda env create -f cddd-environment-light.yml
 28 | ```
 29 | 
 30 | Before running any experiments, relevant paths need to be set (including cddd and moler folders which are integrated in PREFER as git submodules), as follows:
 31 | 
 32 | ```
 33 | PYTHONPATH="path_to/PREFER/prefer/model_based_representations/models/cddd/:path_to/PREFER/prefer/model_based_representations/models/molecule-generation/:path_to/PREFER/:$PYTHONPATH"
 34 | export PYTHONPATH
 35 | ```
 36 | 
 37 | New models should be included as git submodules and added to the PYTHONPATH.
 38 | 
 39 | #### Conda Environments in Jupyter 
 40 | To use the PREFER conda environment in a Jupyter notebook, the environment needs to be added to Jupyter's kernelspec:
 41 | 
 42 | ```
 43 | conda activate prefer-env
 44 | python -m ipykernel install --user --name prefer-env --display-name "Python (prefer-env)"
 45 | ```
 46 | 
 47 | Check that Jupyter has access to this environment by running 
 48 | 
 49 | ```
 50 | jupyter kernelspec list
 51 | ```
 52 | 
 53 | The recently added env `Python (prefer-env)` should be available now in Jupyter. 
 54 | 
 55 | 
 56 | 
 57 | ## Prerequisites
 58 | 
 59 | In order to run PREFER, we provide one notebook (Run_PREFER.ipynb) and one Python script (run_prefer_automation.py).
 60 | 
 61 | Main steps are as follows:
 62 | 
 63 | ### STEP 0: clone the repository and unpack the git submodules
 64 | Once you have cloned this repository, please go into your cloned folder and run the following commands:
 65 | 
 66 | ```
 67 | git submodule update --init --recursive
 68 | ```
 69 | 
 70 | This is needed to unpack the git submodules used to connect PREFER to the models used to compute the model-based representations.
 71 | 
 72 | ### STEP 1: download public test datasets
 73 | Two public datasets can be used to test the code:
 74 | - [logD](https://www.ebi.ac.uk/chembl/g/#browse/activities/filter/document_chembl_id%3ACHEMBL3301361%20AND%20standard_type%3A(%22LogD7.4%22)) from ChEMBL
 75 | - [solubility](https://pubchem.ncbi.nlm.nih.gov/bioassay/1996) from PubChem
 76 | 
 77 | ### STEP 2: download models for calculating data-based molecular representations
 78 | Two models are currently supported as submodules in PREFER: CDDD and MOLER. 
 79 | Pre-trained models can be downloaded from:
 80 | 
 81 | - CDDD: [here](https://drive.google.com/open?id=1oyknOulq_j0w9kzOKKIHdTLo5HphT99h)
 82 | - MOLER: [here](https://figshare.com/ndownloader/files/34642724)
 83 | 
 84 | Save these trained models locally, since they will be used afterwards. 
 85 | 
 86 | 
 87 | ### STEP 3: set the configuration files
 88 | For each PREFER job a yaml config file needs to be prepared as follows:
 89 | 
 90 | 1. Main settings:
 91 | ```
 92 | path_to_df: 'path_to_df'
 93 | experiment_name: 'experiment_name'
 94 | id_column_name:  'id_column_name'
 95 | smiles_column_name:  'smiles_column_name'
 96 | properties_column_name_list: 
 97 |       - 'property_1_col_name'
 98 |       - 'property_2_col_name'
 99 | problem_type: 'regression' # or 'classification'
100 | splitting_strategy: 'random' # or 'cluster' or 'temporal'
101 | temporal_info_column_name: 'temporal_info_column_name'
102 | ```
103 | 
104 | Examples are provided in ./config_files.
105 | 
106 | 2. Settings for model based representations:
107 | ```
108 | model_based_representations:
109 |     'model_name': 
110 |         'path_to_model': 'path to model folder'(see STEP2)
111 |         'conda_env': 'name of the conda env installed for this model'
112 |         'submodule_path': 'path to the submodule folder included in PREFER for running the model'(e.g. path_to/prefer/model_based_representations/models/cddd/)
113 |     
114 | prefer_path: 'path_to_/PREFER/'
115 | ```
116 | 
117 | An example configuration file for the representations is provided in ./config_files/config_model_based_representations.yaml.
118 | 
119 | 
120 | 
121 | ### STEP 4: run the Run_PREFER.ipynb notebook
122 | To run the notebook `Run_PREFER.ipynb`, first of all select the correct kernel (Python (prefer-env)) and then change the needed paths, in particular:
123 | 
124 | - sys.path.append('path_to/PREFER/')
125 | - sys.path.append('path_to/models/cddd/') # to connect CDDD model
126 | - sys.path.append('path_to/models/molecule-generation/') # to connect MOLER model
127 | 
128 | By running the notebook a folder (PREFER_results) will be created with the main results (benchmarking object and models). 
129 | Moreover different folders with structure {model_name}_representations_{experiment_name} will be created containing the model_based representations.
130 | 
131 | In the notebook one can also find an example of how to use the stored PREFER-model-wrapper to predict new samples. This way the best model found for each molecular representation can be used later to predict the property under analysis. 
132 | 
133 | An automatized version of the notebook can be found in `run_prefer_automation.py`. You can run it from the terminal with the following commands:
134 | 
135 | ```
136 | conda activate prefer-env
137 | 
138 | PYTHONPATH="path_to/PREFER/prefer/model_based_representations/models/cddd/:path_to/PREFER/prefer/model_based_representations/models/molecule-generation/:path_to/PREFER/:$PYTHONPATH"
139 | export PYTHONPATH
140 | 
141 | python run_prefer_automation.py --prefer_args path_to_yaml_configuration_file(see STEP3) --model_based_representations_args path_to_yaml_configuration_file_for_models_used_to_compute_the_representations(see STEP3)
142 | ```
143 | 
144 | 
145 | ## WARNING: 
146 | Please make sure that you select the right model type according to the dataset used (e.g. for a classification model, binary labels should be provided in the dataset). 
147 | 
148 | ## Authors
149 | 
150 | * **Jessica Lanini** 
151 | 
152 | With the contribution of
153 | - Nadine Schneider
154 | - Gianluca Santarossa
155 | - Sarah Lewis
156 | - Krzysztof Maziarz
157 | - Marwin Segler
158 | - Hubert Misztela
159 | 
160 | 
161 | ## References
162 | [1] Winter, Robin, et al. "Learning continuous and data-driven molecular descriptors by translating equivalent chemical representations." Chemical science 10.6 (2019): 1692-1701.
163 | 
164 | [2] Maziarz, Krzysztof, et al. "Learning to extend molecular scaffolds with structural motifs." arXiv preprint arXiv:2103.03864 (2021).
165 | 
166 | [3] Feurer, Matthias, et al. "Efficient and robust automated machine learning." Advances in neural information processing systems 28 (2015).
167 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/__init__.py


--------------------------------------------------------------------------------
/analysis_notebooks/Plot_performance_distributions_for_representation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Plot performances for each molecular representation"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "After running TestSet_Bootstrapping.ipynb a .pkl file containing the final performances for each molecular representation should have been created. This notebook will then plot the results and perform a statistical analysis. "
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "## Imports"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "metadata": {
 28 |     "tags": []
 29 |    },
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "import sys\n",
 33 |     "%load_ext autoreload\n",
 34 |     "# path to the main directory\n",
 35 |     "path_to_PREFER = 'path_to/PREFER/'\n",
 36 |     "# path to submodules\n",
 37 |     "path_to_cddd = 'path_to/PREFER/prefer/model_based_representations/models/cddd/'\n",
 38 |     "path_to_moler = 'path_to/PREFER/prefer/model_based_representations/models/molecule-generation/'\n",
 39 |     "sys.path.append(path_to_PREFER)\n",
 40 |     "sys.path.append(path_to_cddd)\n",
 41 |     "sys.path.append(path_to_moler)\n",
 42 |     "import warnings\n",
 43 |     "warnings.filterwarnings('ignore')\n",
 44 |     "from prefer.utils.filtering import *\n",
 45 |     "import sys"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {
 52 |     "tags": []
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "from prefer.utils.post_processing_and_optimization_helpers import create_heat_map\n",
 57 |     "from prefer.utils.automation import merge_table_metrics, data_preparation, generate_molecular_representations, run, create_comparison_table"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "### Folders where to find models"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "import pickle \n",
 74 |     "name = \"final_dict_['publicSolubility', 'publicLogD'].pickle\"\n",
 75 |     "with open(name, 'rb') as handle:\n",
 76 |     "    dict1 = pickle.load(handle)\n"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "import pandas as pd\n",
 86 |     "df1 = pd.DataFrame()\n",
 87 |     "repr_vect = []\n",
 88 |     "value_vect = []\n",
 89 |     "for repr_ in dict1['autosklearn']['publicSolubility'].keys():\n",
 90 |     "    for elem in dict1['autosklearn']['publicSolubility'][repr_]:\n",
 91 |     "        repr_vect.append(repr_)\n",
 92 |     "        value_vect.append(elem)\n",
 93 |     "df1['Representation'] = repr_vect\n",
 94 |     "df1['∆AUPRC'] = value_vect"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "import pandas as pd\n",
104 |     "df2 = pd.DataFrame()\n",
105 |     "repr_vect = []\n",
106 |     "value_vect = []\n",
107 |     "for repr_ in dict1['autosklearn']['publicLogD'].keys():\n",
108 |     "    for elem in dict1['autosklearn']['publicLogD'][repr_]:\n",
109 |     "        repr_vect.append(repr_)\n",
110 |     "        value_vect.append(elem)\n",
111 |     "df2['Representation'] = repr_vect\n",
112 |     "df2['R2'] = value_vect"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "metadata": {
119 |     "tags": []
120 |    },
121 |    "outputs": [],
122 |    "source": [
123 |     "from scipy import stats\n",
124 |     "collect_stats = []\n",
125 |     "for exper in dict1['autosklearn'].keys():\n",
126 |     "    print(exper)\n",
127 |     "    collect_group = []\n",
128 |     "    for repr_ in dict1['autosklearn'][exper].keys():\n",
129 |     "        collect_group.append(dict1['autosklearn'][exper][repr_])\n",
130 |     "    #perform Friedman Test\n",
131 |     "    collect_stats.append(stats.friedmanchisquare(collect_group[0], collect_group[1], collect_group[2], collect_group[3]))\n",
132 |     "\n",
133 |     "collect_stats"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": null,
139 |    "metadata": {
140 |     "tags": []
141 |    },
142 |    "outputs": [],
143 |    "source": [
144 |     "import seaborn as sns\n",
145 |     "import matplotlib.pyplot as plt\n",
146 |     "plt.style.use('fivethirtyeight')\n",
147 |     "fig, axes = plt.subplots(1, 3, figsize=(10, 5), sharey=True)\n",
148 |     "fig.suptitle('Performances', size = 15)\n",
149 |     "sns.set(font_scale=0.8)\n",
150 |     "sns.violinplot(ax=axes[0], x='Representation', y='∆AUPRC', data=df1.sort_values('Representation'));\n",
151 |     "axes[0].set_title('LE-MDCK', size = 15)\n",
152 |     "axes[0].tick_params(axis='x', rotation=45)\n",
153 |     "\n",
154 |     "\n",
155 |     "plt.savefig(f'classification_results_with_mean_and_std.png', bbox_inches='tight', transparent=True)\n",
156 |     "\n",
157 |     "plt.show()"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "import seaborn as sns\n",
167 |     "import matplotlib.pyplot as plt\n",
168 |     "plt.style.use('fivethirtyeight')\n",
169 |     "fig, axes = plt.subplots(1, 3, figsize=(10, 5), sharey=True)\n",
170 |     "fig.suptitle('Performances', size = 15)\n",
171 |     "sns.set(font_scale=0.8)\n",
172 |     "sns.violinplot(ax=axes[0], x='Representation', y='R2', data=df2.sort_values('Representation'));\n",
173 |     "axes[0].set_title('logD', size = 15)\n",
174 |     "axes[0].tick_params(axis='x', rotation=45)\n",
175 |     "\n",
176 |     "\n",
177 |     "plt.savefig(f'regression_results_with_mean_and_std.png', bbox_inches='tight', transparent=True)\n",
178 |     "\n",
179 |     "plt.show()"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": null,
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "income_groups = [df1.loc[df1['Representation']==repr_, '∆AUPRC'].values for repr_ in df1['Representation'].dropna().unique()]\n",
189 |     "stat, p_value = f_oneway(*income_groups)\n",
190 |     "print(f\"F Test: statistic={stat:.4f}, p-value={p_value:.4f}\")"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {},
197 |    "outputs": [],
198 |    "source": [
199 |     "income_groups = [df2.loc[df2['Representation']==repr_, 'R2'].values for repr_ in df2['Representation'].dropna().unique()]\n",
200 |     "stat, p_value = f_oneway(*income_groups)\n",
201 |     "print(f\"F Test: statistic={stat:.4f}, p-value={p_value:.4f}\")"
202 |    ]
203 |   }
204 |  ],
205 |  "metadata": {
206 |   "kernelspec": {
207 |    "display_name": "Python (prefer-env-released2)",
208 |    "language": "python",
209 |    "name": "prefer-env-released2"
210 |   },
211 |   "language_info": {
212 |    "codemirror_mode": {
213 |     "name": "ipython",
214 |     "version": 3
215 |    },
216 |    "file_extension": ".py",
217 |    "mimetype": "text/x-python",
218 |    "name": "python",
219 |    "nbconvert_exporter": "python",
220 |    "pygments_lexer": "ipython3",
221 |    "version": "3.7.7"
222 |   }
223 |  },
224 |  "nbformat": 4,
225 |  "nbformat_minor": 4
226 | }
227 | 


--------------------------------------------------------------------------------
/analysis_notebooks/README.txt:
--------------------------------------------------------------------------------
1 | The analysis_notebooks folder contains three notebooks to compute the performances of the PREFER models for each molecular representation and/or the performances of the best model against a RandomForest baseline.
2 | In order to run the notebooks, one should first run a PREFER job (e.g. through the Run_PREFER.ipynb notebook) in order to train PREFER models for each molecular representation. After that one can run the notebooks in the analysis_notebooks folder in the following order:
3 | 
4 | 1. TestSet_Bootstrapping.ipynb
5 | 2. Plot_performance_distributions_for_representation.ipynb AND/OR Best_PREFER_model_VS_RF.ipynb 


--------------------------------------------------------------------------------
/api_version.txt:
--------------------------------------------------------------------------------
1 | 0.0.0


--------------------------------------------------------------------------------
/cddd-environment-light.yml:
--------------------------------------------------------------------------------
 1 | name: cddd-env-prefer-light
 2 | 
 3 | channels:
 4 |   - rdkit
 5 |   - defaults
 6 | 
 7 | dependencies:
 8 |   - python==3.7.7
 9 |   - numpy
10 |   - rdkit==2020.09.1.0
11 |   - scikit-learn==0.24.1
12 |   - tensorflow-gpu==1.13.1
13 |   - pip:
14 |       - dirhash
15 |       - pandas
16 |       - pyyaml
17 |       - matplotlib
18 |       - seaborn


--------------------------------------------------------------------------------
/cddd-environment.yml:
--------------------------------------------------------------------------------
 1 | name: cddd-env-prefer
 2 | channels:
 3 |   - rdkit
 4 |   - defaults
 5 |   - plotly
 6 |   - pytorch
 7 | dependencies:
 8 |   - pip<22
 9 |   - python==3.7.7
10 |   - matplotlib
11 |   - numpy
12 |   - rdkit==2020.09.1.0
13 |   - scikit-learn==0.24.1
14 |   - seaborn
15 |   - tqdm
16 |   # For AML Hyperdrive
17 |   - plotly
18 |   - plotly-orca
19 |   - psutil
20 |   - openssl
21 |   - h5py==2.10.0
22 |   - tensorflow-gpu==1.13.1
23 |   - pytorch
24 |   - cpuonly
25 |   - pip:
26 |     - docopt
27 |     - dpu-utils>=0.2.13
28 |     - zmq
29 |     - pytest
30 |     - azureml-pipeline
31 |     # For AML hyperdrive
32 |     - azureml-widgets
33 |     - azure-keyvault-secrets
34 |     - ipywidgets
35 |     - pandas
36 |     - cairosvg
37 |     - fcd_torch
38 |     - pydantic
39 |     - pyyaml
40 |     - wrapt
41 |     - dirhash
42 |     - scandir
43 |     - semver
44 |     - py-repo-root==1.1.1
45 |     - keras==2.1.2
46 |     - fcd
47 |     - json2html
48 |     - ghostml
49 |     - auto-sklearn==0.14.7
50 |     - protobuf==3.20.1
51 |     - opencensus-ext-azure
52 |     - opencensus-ext-requests
53 | 
54 | 


--------------------------------------------------------------------------------
/compute_model_based_representations.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc.
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
import sys

import warnings
import pandas as pd
# Suppress every Python warning for the rest of the process.
# NOTE(review): this runs before the `prefer` imports below, presumably so that
# warnings raised while importing them are hidden as well — confirm before reordering.
warnings.filterwarnings('ignore')
from prefer.utils.data_preparation import prepare_data
import logging
import argparse
import yaml

# Lower the root logger threshold to DEBUG so that records of all standard levels
# reach whatever handlers are attached to it.
root = logging.getLogger()
root.setLevel(logging.DEBUG)

from prefer.molecule_representations.model_representations_builder import ModelRepresentationsBuilder
import json
 48 | 
 49 | 
 50 | def compute_representations_from_args(
 51 |     args,
 52 |     path_to_model,
 53 |     model_name = None,
 54 | ):
 55 |     """
 56 |     NB: we assume that the dataset has the semicolomn as separator and that the dataset
 57 |     """
 58 |     path_to_df=args.path_to_df
 59 |     id_column_name=args.id_column_name
 60 |     smiles_column_name=args.smiles_column_name
 61 |     split_type=args.splitting_strategy
 62 |     temporal_info_column_name=args.temporal_info_column_name
 63 |     
 64 |     supported_models = ['CDDD', 'MOLER']
 65 |     if model_name == None:
 66 |         model_name = 'MODELBASED'
 67 |         print(f'WARNING: PREFER supports only {supported_models}, but other models can be used')
 68 |     elif(model_name not in supported_models):
 69 |         print(f'WARNING: PREFER supports only {supported_models}, but other models can be used')
 70 |     
 71 |     try:
 72 |         properties_column_name = json.loads(args.properties_column_name[0])
 73 | 
 74 |     except Exception:
 75 |         properties_column_name_json_format = json.dumps(args.properties_column_name)
 76 |         properties_column_name = json.loads(properties_column_name_json_format)
 77 |         
 78 |     properties_column_name_list=properties_column_name
 79 |     
 80 |     # Read your .csv files
 81 |     if path_to_df.endswith("/"):  # Normalise away trailing slashes
 82 |         path_to_df = path_to_df[:-1]
 83 | 
 84 |     try:
 85 |         arr = os.listdir(path_to_df)
 86 |         path_to_df = path_to_df + "/" + arr[0]
 87 |     except Exception:
 88 |         logging.info("Already a file")
 89 | 
 90 |     try:
 91 |         df = pd.read_csv(path_to_df)
 92 |     except Exception:
 93 |         df = pd.read_csv(path_to_df, sep=";")
 94 | 
 95 |     # in prepare_data now the dataset is both prepared and filtered
 96 |     try:
 97 |         # Manipulate dataframe such that it is in the right shape fo being used as input of the DataStorage class
 98 |         # ¦ ID ¦ Smiles ¦ Property_1 ¦ Property_2 ¦ ... ¦ Property_N ¦
 99 |         # -------------------------------------------------------------
100 |         # This is done by specifying the experiment_name, the name of column where the ID information and SMILES representation of each sample is stored, and finally
101 |         # the list of the columns' names of the properties to model.
102 |         df = prepare_data(
103 |             df=df,
104 |             id_column_name=id_column_name,
105 |             smiles_column_name=smiles_column_name,
106 |             properties_column_name_list=properties_column_name_list,
107 |             temporal_info_column_name=temporal_info_column_name,
108 |         )
109 | 
110 |     except Exception:
111 |         logging.error(
112 |             "ERROR in preparing data. One of id_column_name, smiles_column_name, properties_column_name_list may be wrong."
113 |         )
114 |         sys.exit(1)
115 | 
116 |     #For model based representations
117 |     model_based_representations = ModelRepresentationsBuilder(path_to_model = path_to_model, limit_def = args.limit_def)
118 |     model_based = model_based_representations.build_representations(df, split_type = split_type)
119 |     
120 |     # save representations
121 |     import os
122 | 
123 |     # define the name of the directory to be created
124 |     experiment_name = args.experiment_name
125 |     path = f"./{model_name}_representations_{experiment_name}"
126 | 
127 |     try:
128 |         os.mkdir(path)
129 |     except OSError:
130 |         print ("Creation of the directory %s failed" % path)
131 |     else:
132 |         print ("Successfully created the directory %s " % path)
133 |         
134 |     model_based.representation_name = model_name
135 |     model_based.save(path)
136 |     
137 |     print(f'{model_name} representation correctly saved in {path}')
138 |     return 
139 | 
140 | 
141 | 
if __name__ == "__main__":
    # Script to compute the model_based representations of a set of molecules in a dataframe.
    # Command-line options select the YAML configuration, the model folder and the model name;
    # every dataset-related setting is then read from the YAML file and attached to `args`.
    parser = argparse.ArgumentParser(
        description="Compute model_based-representations",
    )
    parser.add_argument(
        "--prefer_args",
        type=str,
        required=True,  # the configuration file is opened unconditionally below
        help="path to the .yaml file where configuration parameters are stored.",
    )

    parser.add_argument(
        "--path_to_model",
        type=str,
        help="path to model_based model that has been previously downloaded",
    )

    parser.add_argument(
        "--model_name",
        type=str,
        help="string of model_name, e.g. CDDD or MOLER",
    )

    args = parser.parse_args()

    # Use a context manager so the configuration file handle is always closed.
    with open(args.prefer_args) as a_yaml_file:
        parsed_yaml_file = yaml.load(a_yaml_file, Loader=yaml.FullLoader)

    # Mandatory settings: a missing key raises KeyError.
    args.path_to_df = parsed_yaml_file["path_to_df"]
    args.experiment_name = parsed_yaml_file["experiment_name"]
    args.id_column_name = parsed_yaml_file["id_column_name"]
    args.smiles_column_name = parsed_yaml_file["smiles_column_name"]
    args.properties_column_name = parsed_yaml_file["properties_column_name_list"]
    args.problem_type = parsed_yaml_file["problem_type"]
    args.splitting_strategy = parsed_yaml_file["splitting_strategy"]

    # Optional settings: default to None when absent from the configuration.
    args.limit_def = parsed_yaml_file.get("limit_def")
    args.temporal_info_column_name = parsed_yaml_file.get("temporal_info_column_name")

    compute_representations_from_args(args, args.path_to_model, args.model_name)
191 | 


--------------------------------------------------------------------------------
/config_files/config_PREFER_logD.yaml:
--------------------------------------------------------------------------------
1 | path_to_df: 'path_to_CHEMBL25-chembl_activity-kmMh6jUABmv9Qdf9kIxKtg7qNt9dhyK1PXvuKt6OTGc=.csv'
2 | experiment_name: 'logD'
3 | id_column_name:  'Molecule ChEMBL ID'
4 | smiles_column_name:  'Smiles'
5 | properties_column_name_list: 
6 |       - 'Standard Value' 
7 | problem_type: 'regression'
8 | splitting_strategy: 'random'
9 | 


--------------------------------------------------------------------------------
/config_files/config_PREFER_smalldata.yaml:
--------------------------------------------------------------------------------
 1 | experiment_name: small_data
 2 | id_column_name: Assay_ID
 3 | limit_def: 0
 4 | path_to_df: none
 5 | problem_type: classification
 6 | properties_column_name_list:
 7 | - Property
 8 | smiles_column_name: SMILES
 9 | splitting_strategy: random
10 | 


--------------------------------------------------------------------------------
/config_files/config_PREFER_solub.yaml:
--------------------------------------------------------------------------------
 1 | path_to_df: 'path_to_solubility.csv'
 2 | experiment_name: 'solubility'
 3 | id_column_name:  'SID'
 4 | smiles_column_name:  'Smiles'
 5 | properties_column_name_list: 
 6 |       - 'PUBCHEM_ACTIVITY_OUTCOME'
 7 | problem_type: 'classification'
 8 | splitting_strategy: 'random'
 9 | # temporal_info_column_name: 
10 | 
11 | 


--------------------------------------------------------------------------------
/config_files/config_model_based_representations.yaml:
--------------------------------------------------------------------------------
 1 | model_based_representations:
 2 |     'MOLER': 
 3 |         'path_to_model': 'path_to/moler'
 4 |         'conda_env': 'moler-env-prefer-light'
 5 |         'submodule_path': 'path_to/PREFER/prefer/model_based_representations/models/molecule-generation/'
 6 |     'CDDD': 
 7 |         'path_to_model': 'path_to/cddd/default_model'
 8 |         'conda_env': 'cddd-env-prefer-light'
 9 |         'submodule_path': 'path_to/PREFER/prefer/model_based_representations/models/cddd/'
10 |     
11 |     
12 | # Python path that ties together all the folders relevant to the project: in particular, the submodules of the models used for the model-based representations (e.g. the cddd and moler submodules) and the main PREFER folder.
13 | prefer_path: 'path_to/PREFER/'


--------------------------------------------------------------------------------
/moler-environment-light.yml:
--------------------------------------------------------------------------------
 1 | name: moler-env-prefer-light
 2 | channels:
 3 |   - rdkit
 4 |   - defaults
 5 |   - plotly
 6 |   - pytorch
 7 |   - conda-forge
 8 | dependencies:
 9 |   - pip<22
10 |   - python==3.7.7
11 |   - matplotlib
12 |   - numpy==1.21.5
13 |   - rdkit==2020.09.1.0
14 |   - scikit-learn==0.24.1
15 |   - seaborn
16 |   - tqdm
17 |   - tensorflow==2.9.1
18 |   - pip:
19 |     - pandas>=1.2.4
20 |     - tf2-gnn~=2.12.0
21 |     - pyyaml
22 |     - dirhash
23 |     - matplotlib-inline==0.1.3
24 |     - ipywidgets
25 |     - more-itertools


--------------------------------------------------------------------------------
/moler-environment.yml:
--------------------------------------------------------------------------------
 1 | name: moler-env-prefer
 2 | channels:
 3 |   - rdkit
 4 |   - defaults
 5 |   - plotly
 6 |   - pytorch
 7 |   - conda-forge
 8 | dependencies:
 9 |   - pip<22
10 |   - python==3.7.7
11 |   - matplotlib
12 |   - numpy==1.21.5
13 |   - rdkit==2020.09.1.0
14 |   - scikit-learn==0.24.1
15 |   - seaborn
16 |   - tqdm
17 |   # For AML Hyperdrive
18 |   - plotly
19 |   - plotly-orca
20 |   - psutil
21 |   - openssl
22 |   - h5py==2.10.0
23 |   - tensorflow==2.9.1
24 |   - typing-extensions
25 |   - pytorch
26 |   - cpuonly
27 |   - pip:
28 |     - azure-identity==1.7.0
29 |     - docopt
30 |     - dpu-utils>=0.2.13
31 |     - zmq
32 |     - pytest
33 |     - coverage
34 |     - azureml-pipeline
35 |     - azure-keyvault-secrets
36 |     # For AML hyperdrive
37 |     - azureml-widgets
38 |     - ipywidgets
39 |     - pandas>=1.2.4
40 |     - cairosvg
41 |     - tf2-gnn>=2.12.0
42 |     - more-itertools
43 |     - fcd_torch
44 |     - pydantic
45 |     - pyyaml
46 |     - dirhash
47 |     - scandir
48 |     - semver
49 |     - py-repo-root==1.1.1
50 |     - json2html
51 |     - mysql-connector-python==8.0.17
52 |     - tokenizers==0.9.2
53 |     - transformers==3.4.0
54 |     - opencensus-ext-azure
55 |     - opencensus-ext-requests


--------------------------------------------------------------------------------
/prefer-environment.yml:
--------------------------------------------------------------------------------
 1 | name: prefer-env
 2 | channels:
 3 |   - rdkit
 4 |   - defaults
 5 |   - plotly
 6 | dependencies:
 7 |   - pip<22
 8 |   - python==3.7.7
 9 |   - matplotlib
10 |   - numpy==1.19.2
11 |   - rdkit==2020.09.1.0
12 |   - scikit-learn==0.24.1
13 |   - seaborn
14 |   - plotly
15 |   - tensorflow-gpu==2.1.0
16 |   - typing-extensions
17 |   - pip:
18 |     - dpu-utils>=0.2.13
19 |     - ipywidgets
20 |     - pandas>=1.2.4
21 |     - sklearn
22 |     - more-itertools
23 |     - pyyaml
24 |     - py-repo-root>=1.1.1
25 |     - tf2-gnn~=2.12.0
26 |     - ghostml
27 |     - dirhash
28 |     - auto-sklearn ==0.14.7
29 |     - ipykernel
30 |     - nbformat
31 | 
32 | 


--------------------------------------------------------------------------------
/prefer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/prefer/__init__.py


--------------------------------------------------------------------------------
/prefer/azure_ml/README.md:
--------------------------------------------------------------------------------
 1 | # Working in AzureML
 2 | PREFER can also run in AzureML. Main scripts regarding AzureML are collected in PREFER/prefer/azure_ml/
 3 | The following steps should be implemented:
 4 | 
 5 | 
 6 | ## STEP 1: install the full cddd and moler environments
 7 | As follows:
 8 | 
 9 | ```
10 | conda env create -f moler-environment.yml
11 | OR
12 | conda env create -f cddd-environment.yml
13 | ```
14 | 
15 | ## STEP 2: import datasets and models in your Azure Storage
16 | Import the dataset you want to use as well as the cddd and moler models in your Azure storage
17 | 
18 | ## STEP 3: prepare the config file
19 | Prepare a yaml file with the following information:
20 | 
21 | ```
22 | path_to_df: 'path_in_Azure_to_the_saved_df'
23 | experiment_name: 'name of the experiment'
24 | id_column_name:  ''
25 | smiles_column_name:  ''
26 | properties_column_name_list: 
27 |       - 'property1_columns_name' # NB if more than one then it is a multitasking
28 | problem_type: 'regression' # Can be regression or classification
29 | splitting_strategy: 'random'
30 | representations:
31 |     'DESCRIPTORS2D' : '' 
32 |     'FINGERPRINTS': ''
33 |     'CDDD': 'path_in_azure_to_stored_CDDD_model'
34 | ```
35 | 
36 | An example is provided in PREFER/prefer/azure_ml/config_logD_azure.yaml
37 | 
38 | 
39 | 
40 | ## STEP 4: schedule the PREFER pipeline in AzureML
41 | Go to PREFER/prefer/azure_ml and run the following command:
42 | 
43 | ```
44 | python schedule_global_model_pipeline.py --prefer_args config_logD_azure.yaml --prefer_env cddd-environment.yml
45 | ```
46 | 
47 | One can also use the moler-environment.yml for running experiments with moler environment. 
48 | 
49 | 
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/prefer/azure_ml/aml_config.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | from __future__ import annotations
34 | import os
35 | import json
36 | from dataclasses import dataclass, fields
37 | from prefer.azure_ml.exceptions import MissingEnvironmentVariable
38 | 
# Common prefix of every environment variable read by AmlConfig.
ENV_PREFIX = "AML_CONFIG_"
# Maps each AmlConfig field name to the environment variable it is read from.
ENV_PATH_NAMES = {
    "subscription_id": f"{ENV_PREFIX}SUBSCRIPTION_ID",
    "resource_group": f"{ENV_PREFIX}RESOURCE_GROUP",
    "workspace_name": f"{ENV_PREFIX}WORKSPACE_NAME",
    "compute_target_name": f"{ENV_PREFIX}COMPUTE_TARGET_NAME",
    "cpu_compute_target_name": f"{ENV_PREFIX}CPU_COMPUTE_TARGET_NAME",
    "datastore_name": f"{ENV_PREFIX}DATASTORE_NAME",
    "result_store_name": f"{ENV_PREFIX}RESULT_STORE_NAME",
    "keyvault_name": f"{ENV_PREFIX}KEYVAULT_NAME",
}


@dataclass
class AmlConfig:
    """AzureML settings, loadable from environment variables or a JSON file."""

    subscription_id: str
    resource_group: str
    workspace_name: str
    compute_target_name: str
    cpu_compute_target_name: str
    datastore_name: str
    result_store_name: str
    keyvault_name: str

    @classmethod
    def _read_environment(cls) -> dict:
        """Return ``{field name: environment value or None}`` for every field."""
        return {f.name: os.getenv(ENV_PATH_NAMES[f.name]) for f in fields(cls)}

    @classmethod
    def can_load_from_environment_variables(cls) -> bool:
        """Return True when every variable listed in ENV_PATH_NAMES is set."""
        return all(value is not None for value in cls._read_environment().values())

    @classmethod
    def from_environment_variables(cls) -> "AmlConfig":
        """Instantiate an AmlConfig from environment variables.

        Raises:
            MissingEnvironmentVariable: if any of the relevant environment
                variables is not set; the message names the missing ones.
        """
        # Read the environment once and reuse the snapshot for both the
        # completeness check and the construction.
        aml_config = cls._read_environment()
        missing = [ENV_PATH_NAMES[name] for name, value in aml_config.items() if value is None]
        if missing:
            raise MissingEnvironmentVariable(
                "Missing environment variable(s): " + ", ".join(missing)
            )
        return cls(**aml_config)

    @classmethod
    def from_file(cls, filename: str) -> "AmlConfig":
        """Instantiate an AmlConfig from a JSON file.

        Keys that are not fields of this class are ignored, so a configuration
        file carrying extra entries can still be loaded. A missing field still
        raises TypeError from the dataclass constructor.
        """
        with open(filename, "rt") as fh:
            raw_config = json.load(fh)
        known_fields = {f.name for f in fields(cls)}
        return cls(**{key: value for key, value in raw_config.items() if key in known_fields})
87 | 


--------------------------------------------------------------------------------
/prefer/azure_ml/aml_configuration/aml_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "subscription_id": "",
 3 |     "resource_group": "",
 4 |     "workspace_name": "",
 5 |     "compute_target_name": "",
 6 |     "cpu_compute_target_name": "",
 7 |     "datastore_name": "",
 8 |     "trained_models_datastore_name": "",
 9 |     "trained_property_models_datastore_name": "",
10 |     "trained_generative_models_datastore_name": "",
11 |     "result_store_name": "",
12 |     "keyvault_name": ""
13 | }
14 | 


--------------------------------------------------------------------------------
/prefer/azure_ml/config_logD_azure.yaml:
--------------------------------------------------------------------------------
 1 | path_to_df: 'path_in_Azure_to_the_saved_df'
 2 | experiment_name: 'LogD_public_random'
 3 | id_column_name:  'Molecule ChEMBL ID'
 4 | smiles_column_name:  'Smiles'
 5 | properties_column_name_list: 
 6 |       - 'Standard Value' # listing more than one property turns this into a multitask problem
 7 | problem_type: 'regression' # can be 'regression' or 'classification'
 8 | splitting_strategy: 'random'
 9 | representations:
10 |     'DESCRIPTORS2D' : '' 
11 |     'FINGERPRINTS': ''
12 |     'CDDD': 'path_in_azure_to_stored_CDDD_model'
13 | desirability_scores: # please leave this field as it is
14 |       score1:
15 |             - x : -1.0
16 |               y : 0.0
17 |             - x : 0.0
18 |               y : 0.2
19 |             - x : 1.0
20 |               y : 0.9
21 |             - x : 2.0
22 |               y : 1.0
23 |             - x : 3.0
24 |               y : 0.5
25 |             - x : 4.0
26 |               y : 0.0


--------------------------------------------------------------------------------
/prefer/azure_ml/exceptions.py:
--------------------------------------------------------------------------------
class MissingEnvironmentVariable(Exception):
    """Signals that a required environment variable is not set."""
3 | 


--------------------------------------------------------------------------------
/prefer/azure_ml/included_prefixes.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 
 3 |     "prefer/azure_ml/model_registration_prefer.py": {
 4 |         "project_root": [
 5 |             "prefer/azure_ml/", 
 6 |             "prefer/src/",
 7 |             "prefer/utils/", 
 8 |             "prefer/model_based_representations/"
 9 |         ]
10 |     },
11 |     "prefer/scripts/get_representations.py": {
12 |         "project_root": [
13 |             "prefer/src/",
14 |             "prefer/docs/",
15 |             "prefer/utils/",
16 |             "prefer/molecule_representations/",
17 |             "prefer/model_based_representations/",
18 |             "prefer/model_based_representations/models/cddd/", 
19 |             "prefer/model_based_representations/models/molecule-generation/",
20 |             "prefer/azure_ml/",
21 |             "prefer/scripts/"
22 |         ]
23 |     },
24 |     "prefer/scripts/model_wrapper.py": {
25 |         "project_root": [
26 |             "prefer/src/",
27 |             "prefer/utils/",
28 |             "prefer/molecule_representations/",
29 |             "prefer/model_based_representations/",
30 |             "prefer/model_based_representations/models/cddd/", 
31 |             "prefer/model_based_representations/models/molecule-generation/",
32 |             "prefer/scripts/"
33 |         ]
34 |     },
35 |     "prefer/scripts/combine_results.py": {
36 |         "project_root": [
37 |             "prefer/src/",
38 |             "prefer/utils/",
39 |             "prefer/molecule_representations/",
40 |             "prefer/model_based_representations/",
41 |             "prefer/model_based_representations/models/cddd/", 
42 |             "prefer/model_based_representations/models/molecule-generation/",
43 |             "prefer/scripts/"
44 |         ]
45 |     },
46 |     "prefer/scripts/run_PREFER.py": {
47 |         "project_root": [
48 |             "prefer/src/",
49 |             "prefer/utils/",
50 |             "prefer/molecule_representations/",
51 |             "prefer/model_based_representations/",
52 |             "prefer/model_based_representations/models/cddd/", 
53 |             "prefer/model_based_representations/models/molecule-generation/",
54 |             "prefer/scripts/"
55 |         ]
56 |     }
57 | }


--------------------------------------------------------------------------------
/prefer/azure_ml/model_registration_prefer.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | #!/usr/bin/env python
 34 | import json
 35 | import os
 36 | import pickle
 37 | import time
 38 | import logging
 39 | 
 40 | from azureml.core.run import Run
 41 | 
 42 | from prefer.azure_ml.model_registration_utils import (
 43 |     add_tags_to_aml_run,
 44 |     name_property_model_for_registration,
 45 | )
 46 | from prefer.azure_ml.telemetry_utils import (
 47 |     set_telemetry_handlers,
 48 |     function_span_metrics_decorator,
 49 | )
 50 | 
 51 | logger = logging.getLogger(__name__)
 52 | set_telemetry_handlers(logger, "Training")
 53 | 
 54 | 
 55 | @function_span_metrics_decorator("training.model_registration_prefer.run_from_args", "Training")
 56 | def run_from_args(args):
 57 |     aml_run = Run.get_context()
 58 | 
 59 |     model_dir = args.MODEL_PREFER_WRAPPED_DIR
 60 |     pkls = [name for name in os.listdir(model_dir) if name.endswith(".pkl")]
 61 |     assert len(pkls) == 1
 62 |     wrapped_model_name = pkls[0]
 63 | 
 64 |     with open(os.path.join(model_dir, wrapped_model_name), "rb") as in_file:
 65 |         wrapped_model = pickle.load(in_file)
 66 |     rep_model_id = wrapped_model.rep_model_id
 67 | 
 68 |     problem_type = wrapped_model.problem_type
 69 |     local_or_global_model = args.local_or_global_model
 70 | 
 71 |     is_best = local_or_global_model == "local"
 72 |     representation_name = wrapped_model.model_representation
 73 |     registration_name = name_property_model_for_registration(
 74 |         wrapped_model.friendly_model_name, representation_name
 75 |     )
 76 | 
 77 |     cloud_target_dir = "packaged_model"
 78 |     # # (1) Save the wrapped model:
 79 |     aml_run.upload_file(
 80 |         path_or_stream=os.path.join(model_dir, wrapped_model_name),
 81 |         name=os.path.join(cloud_target_dir, wrapped_model_name),
 82 |     )
 83 | 
 84 |     # (2) Save metadata
 85 |     metadata = {
 86 |         "git_status": args.git_status,
 87 |         "repo_hash": args.repo_hash,
 88 |         "api_version": args.api_version,
 89 |     }
 90 | 
 91 |     metadata_filename = "metadata.json"
 92 | 
 93 |     with open(metadata_filename, "wt") as fh:
 94 |         json.dump(metadata, fh)
 95 | 
 96 |     aml_run.upload_file(
 97 |         path_or_stream=metadata_filename, name=os.path.join(cloud_target_dir, metadata_filename),
 98 |     )
 99 | 
100 |     # (3) Register the model with tags and properties.
101 |     tags = {
102 |         "is_best": is_best,
103 |         # All property prediction models are active by default (i.e. not marked for future deletion) because:
104 |         # - only the best local model is registered, therefore it makes sense that it will not be deleted by default.
105 |         # - all global models are registered, and the data scientist could potentially pick any of them for production.
106 |         "is_active": True,
107 |     }
108 | 
109 |     # TODO why do we need to store the desirability curve with the property model?
110 | 
111 |     properties = {
112 |         "generative_or_property_model": "property_model",
113 |         "property_model_friendly_name": wrapped_model.friendly_model_name,
114 |         "problem_type": problem_type,
115 |         "representation_name": representation_name,
116 |         "generative_model_id": rep_model_id,
117 |         "local_or_global_model": args.local_or_global_model,
118 |         "timestamp": args.timestamp,
119 |         "user_name": args.user_name,
120 |         "run_name": args.run_name,
121 |         "git_status": args.git_status,
122 |         "repo_hash": args.repo_hash,
123 |         "api_version": args.api_version,
124 |         "project_code": wrapped_model.project_code,
125 |         "desirabilities": wrapped_model.desirability_scores,
126 |         "env": args.conda_env_name,
127 |     }
128 | 
129 |     aml_run.register_model(
130 |         model_name=registration_name, model_path=cloud_target_dir, tags=tags, properties=properties,
131 |     )
132 | 
133 |     add_tags_to_aml_run(aml_run, tags)
134 |     aml_run.add_properties(properties)
135 | 
136 | 
def run():
    """Parse the command line of the registration script and register the model."""
    import argparse

    # One (flag, add_argument keyword arguments) pair per option, in the order
    # in which the options are declared on the parser.
    option_specs = [
        (
            "MODEL_PREFER_WRAPPED_DIR",
            dict(type=str, help="Directory with the pickled PREFER model with metadata."),
        ),
        (
            "--timestamp",
            dict(
                type=str,
                default=time.strftime("%Y-%m-%d_%H-%M-%S"),
                help="Timestamp of the run (used for model tagging).",
            ),
        ),
        (
            "--user-name",
            dict(
                type=str,
                default="unknown_user",
                help="Name of the scheduling user (used for model tagging).",
            ),
        ),
        (
            "--run-name",
            dict(
                type=str,
                default="unknown_run_name",
                help="Name of the pipeline run (used for model tagging).",
            ),
        ),
        (
            "--git_status",
            dict(
                type=str,
                required=True,
                help="Git status (branch, commit, etc) at the time of training.",
            ),
        ),
        (
            "--repo_hash",
            dict(
                type=str,
                required=True,
                help="Hash of the repo root at the time of training.",
            ),
        ),
        (
            "--api_version",
            dict(type=str, required=True, help="API version supported by the model."),
        ),
        (
            "--local_or_global_model",
            dict(type=str, choices=["local", "global"], help="local/global", required=True),
        ),
        (
            "--conda_env_name",
            dict(
                type=str,
                default="moler-environment",
                help="Name of the conda environment used to build the model.",
            ),
        ),
    ]

    cli = argparse.ArgumentParser(description="Model registration script (PREFER).")
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    run_from_args(cli.parse_args())


if __name__ == "__main__":
    run()
199 | 


--------------------------------------------------------------------------------
/prefer/azure_ml/model_registration_utils.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | #! /usr/bin/env python3
 34 | import time
 35 | from typing import List, Optional, Any, Dict
 36 | 
 37 | from azureml.core import Workspace, RunConfiguration, Run, ComputeTarget
 38 | from azureml.pipeline.core import PipelineData, PipelineStep
 39 | from pathlib import Path
 40 | 
 41 | from dirhash import dirhash
 42 | 
 43 | from prefer.azure_ml.reproducibility import get_current_api_version, get_git_status
 44 | from prefer.azure_ml.utils import create_step
 45 | from prefer.azure_ml.aml_config import AmlConfig
 46 | 
 47 | from pyreporoot import project_root
 48 | 
 49 | 
 50 | def name_property_model_for_registration(experiment_name: str, representation_name: str) -> str:
 51 |     return experiment_name + "_" + representation_name
 52 | 
 53 | 
 54 | def add_tags_to_aml_run(aml_run: Run, tags: Dict[str, Any]) -> None:
 55 |     # Add the tags to the AML run as well.
 56 |     for key, value in tags.items():
 57 |         aml_run.tag(key, value)
 58 | 
 59 |     # If the run has a parent (which should be the case for pipelines), also add the tags there.
 60 |     if aml_run.parent is not None:
 61 |         for key, value in tags.items():
 62 |             aml_run.parent.tag(key, value)
 63 | 
 64 | 
 65 | def create_registration_step(
 66 |     aml_config: AmlConfig,
 67 |     workspace: Workspace,
 68 |     model_name: str,
 69 |     user_name: str,
 70 |     run_name: Optional[str],
 71 |     model_training_output: PipelineData,
 72 |     eval_outputs: List[PipelineData],
 73 |     run_config: RunConfiguration,
 74 |     tmpdir_to_use: str,
 75 | ) -> PipelineStep:
 76 |     timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
 77 |     run_name = run_name or f"{timestamp}-{user_name}"
 78 | 
 79 |     git_status = get_git_status()
 80 |     repo_hash = _dir_hash(str(project_root(Path(__file__))))
 81 |     api_version = get_current_api_version()
 82 | 
 83 |     return create_step(
 84 |         tmpdir_to_use=tmpdir_to_use,
 85 |         name="Model Registration",
 86 |         script_name="prefer/azure_ml/model_registration_prefer.py",
 87 |         arguments=[
 88 |             model_name,
 89 |             "--timestamp",
 90 |             timestamp,
 91 |             "--user-name",
 92 |             user_name,
 93 |             "--run-name",
 94 |             run_name,
 95 |             "--git_status",
 96 |             git_status,
 97 |             "--repo_hash",
 98 |             repo_hash,
 99 |             "--api_version",
100 |             api_version,
101 |             model_training_output,
102 |         ]
103 |         + eval_outputs,
104 |         inputs=[model_training_output] + eval_outputs,
105 |         outputs=[],
106 |         compute_target=ComputeTarget(workspace, aml_config.cpu_compute_target_name),
107 |         runconfig=run_config,
108 |     )
109 | 
110 | 
def create_registration_prefer_step(
    aml_config: AmlConfig,
    workspace: Workspace,
    user_name: str,
    run_name: Optional[str],
    model_prefer_wrapped: PipelineData,
    run_config: RunConfiguration,
    tmpdir_to_use: str,
    local_or_global_model: str,
    conda_env_name: str,
) -> PipelineStep:
    """Build the "Model Registration" pipeline step for a wrapped PREFER model.

    The step runs ``prefer/azure_ml/model_registration_prefer.py`` on the
    compute target named by ``aml_config.compute_target_name``, with the
    wrapped-model pipeline data as its only input. When ``run_name`` is empty
    it defaults to ``"<timestamp>-<user_name>"``.
    """
    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
    if not run_name:
        run_name = f"{timestamp}-{user_name}"

    # Reproducibility metadata forwarded to the registration script.
    git_status = get_git_status()
    repo_hash = _dir_hash(str(project_root(Path(__file__))))
    api_version = get_current_api_version()

    script_arguments = [
        model_prefer_wrapped,
        "--timestamp",
        timestamp,
        "--user-name",
        user_name,
        "--run-name",
        run_name,
        "--git_status",
        git_status,
        "--repo_hash",
        repo_hash,
        "--api_version",
        api_version,
        "--local_or_global_model",
        local_or_global_model,
        "--conda_env_name",
        conda_env_name,
    ]

    return create_step(
        tmpdir_to_use=tmpdir_to_use,
        name="Model Registration",
        script_name="prefer/azure_ml/model_registration_prefer.py",
        arguments=script_arguments,
        inputs=[model_prefer_wrapped],
        outputs=[],
        compute_target=ComputeTarget(workspace, aml_config.compute_target_name),
        runconfig=run_config,
    )
157 | 
158 | 
def _dir_hash(folder_name: str, **kwargs) -> str:
    """
    Calculate the SHA256 hash of an entire folder tree, recursively.

    Any extra keyword arguments are passed through unchanged to `dirhash`.

    Note: The multi-threaded version seems to be throwing an error currently, but the single
          threaded version is fast enough given how infrequently we expect this to be used.
    """
    hash_algorithm = "sha256"
    return dirhash(folder_name, hash_algorithm, **kwargs)
166 | 


--------------------------------------------------------------------------------
/prefer/azure_ml/reproducibility.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | import subprocess
 34 | from hashlib import sha256
 35 | from time import time
 36 | import logging
 37 | 
 38 | from pathlib import Path
 39 | 
 40 | from pyreporoot import project_root
 41 | from semver import VersionInfo
 42 | 
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Alias: API versions are handled as semver `VersionInfo` objects.
GenChemVersion = VersionInfo
 46 | 
 47 | 
 48 | def get_current_api_version() -> GenChemVersion:
 49 |     """Returns the current version of the APIs."""
 50 |     with open(str(project_root(Path(__file__)).joinpath("api_version.txt")), "rt") as f:
 51 |         return GenChemVersion.parse(f.read().rstrip())
 52 | 
 53 | 
 54 | def timeit(fn):
 55 |     """
 56 |     *args and **kwargs are to support positional and named arguments of fn
 57 |     Use this as a decorator for the function you wish to time
 58 |         @timeit
 59 |         def my_func(args):
 60 |             ....
 61 |             return
 62 | 
 63 |         This produces output of the form "Time taken in my_func: 1.11111111s". The time is returned in seconds.
 64 |     """
 65 | 
 66 |     def get_time(*args, **kwargs):
 67 |         start = time()
 68 |         output = fn(*args, **kwargs)
 69 |         logger.info(f"Time taken in {fn.__name__}: {time() - start:.7f}s")
 70 |         return output  # make sure that the decorator returns the output of fn
 71 | 
 72 |     return get_time
 73 | 
 74 | 
 75 | def file_hash(filename: str) -> str:
 76 |     """
 77 |     Calculate SHA256 hash of a file
 78 |     """
 79 |     sha256_hash = sha256()
 80 |     with open(filename, "rb") as f:
 81 |         # Read and update hash string value in blocks of 16K
 82 |         for byte_block in iter(lambda: f.read(16384), b""):
 83 |             sha256_hash.update(byte_block)
 84 |     return sha256_hash.hexdigest()
 85 | 
 86 | 
 87 | def get_git_short_hash() -> str:
 88 |     return subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("ASCII").strip()
 89 | 
 90 | 
 91 | def get_git_status() -> str:
 92 |     """
 93 |     Queries the git tree for current git revision hash, status of local changes, status of untracked files
 94 |     Output is of form:
 95 |         FULL_GIT_HASH@BRANCH_NAME+local_changes+untracked_files
 96 |         The git hash and branch name are always returned.
 97 |         '+local_changes' tag is added if any local changes are found
 98 |         '+untracked_files' tag is added if any untracked files are found
 99 |     Returns:
100 |         (str) String describing the status of the git tree
101 |     """
102 |     try:
103 | 
104 |         def run_command(command):
105 |             return (
106 |                 subprocess.check_output(command, cwd=project_root(Path(__file__)))
107 |                 .decode("ASCII")
108 |                 .strip()
109 |             )
110 | 
111 |         # pylint: disable=unexpected-keyword-arg # For some reason, pylint doesn't like "cwd"
112 |         head_ref_names = run_command(["git", "log", "--format=%D", "-1"])
113 |         # head_ref_names returns output of the form
114 |         # HEAD -> user_branch_name, origin/master, origin/HEAD, master
115 |         # Parse this to recover the branch name if possible, else leave empty
116 |         if " -> " in head_ref_names:
117 |             branch = "@" + head_ref_names.split(" -> ")[1].split(",")[0]
118 |         else:
119 |             branch = ""
120 | 
121 |         # Recover the change hash, keep the full version
122 |         change_hash: str = run_command(["git", "rev-parse", "HEAD"])
123 | 
124 |         # Identify if there are any uncommitted local changes
125 |         local_changes: str = run_command(["git", "diff-index", "HEAD", "--"])
126 |         change_status = "" if local_changes == "" else "+local_changes"
127 | 
128 |         # Identify if there are any untracked local changes
129 |         untracked_files: str = run_command(["git", "status", "--short"])
130 |         untracked_status = "" if untracked_files == "" else "+untracked_files"
131 | 
132 |         # This ony works when the remote is called origin, but we can't guarantee that
133 |         # Find out if there is a way to query the name of the current remote tree
134 |         # unpushed_changes: str = run_command(["git", "log", "origin.."])
135 |         # unpushed_status = "" if unpushed_changes == "" else "+unpushed_changes"
136 | 
137 |         return "{}{}{}{}".format(change_hash, branch, change_status, untracked_status)
138 |     except (subprocess.CalledProcessError, FileNotFoundError):
139 |         return "UNKNOWN GIT REVISION"
140 | 


--------------------------------------------------------------------------------
/prefer/azure_ml/telemetry_utils.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | import logging
 34 | from functools import wraps
 35 | from typing import Optional
 36 | from opencensus.ext.azure.log_exporter import AzureLogHandler
 37 | from opencensus.trace import config_integration
 38 | from opencensus.ext.azure.trace_exporter import AzureExporter
 39 | from opencensus.trace.samplers import ProbabilitySampler
 40 | from opencensus.trace.tracer import Tracer
 41 | from azureml.core import Run
 42 | from azureml.exceptions import RunEnvironmentException
 43 | 
 44 | 
 45 | def _callback_add_cloudrole(envelope):
 46 |     envelope.tags["ai.cloud.role"] = "Azure ML"
 47 |     return True
 48 | 
 49 | 
 50 | class AppFilter(logging.Filter):
 51 |     def __init__(self, run_type: str):
 52 |         self.run_type = run_type
 53 | 
 54 |     def filter(self, record):
 55 |         custom_dimensions = {}
 56 |         _set_run_context(custom_dimensions, self.run_type)
 57 |         record.custom_dimensions = custom_dimensions
 58 |         return True
 59 | 
 60 | 
 61 | def set_telemetry_handlers(logger, run_type: str):
 62 |     logger.setLevel(logging.INFO)
 63 |     logger.addHandler(logging.StreamHandler())
 64 |     try:
 65 |         handler = AzureLogHandler()
 66 |         handler.add_telemetry_processor(_callback_add_cloudrole)
 67 |         handler.addFilter(AppFilter(run_type))
 68 |         logger.addHandler(handler)
 69 | 
 70 |         # configure tracing for app insights integrations
 71 |         config_integration.trace_integrations(["requests"])
 72 |         _get_tracer(run_type)
 73 |     except Exception:
 74 |         # application insights connection string is not set up in environment.
 75 |         # Probably this doesn't run in AML, skipping setting up Application Insights for local run.
 76 |         pass
 77 | 
 78 | 
 79 | def _set_run_context(dictionary: dict, run_type: str):
 80 |     try:
 81 |         run = Run.get_context(allow_offline=False)
 82 |         dictionary["parent_run_id"] = run.parent.id
 83 |         dictionary["step_id"] = run.id
 84 |         dictionary["step_name"] = run.name
 85 |         dictionary["experiment_name"] = run.experiment.name
 86 |         dictionary["run_url"] = run.parent.get_portal_url()
 87 |         dictionary["run_type"] = run_type
 88 |     except RunEnvironmentException:
 89 |         # Not an AzureML run
 90 |         pass
 91 | 
 92 | 
 93 | def _callback_add_context(run_type: str, envelope):
 94 |     _set_run_context(envelope.data.baseData.properties, run_type)
 95 | 
 96 | 
 97 | def _get_tracer(run_type: str):
 98 |     try:
 99 |         app_insights_exporter = AzureExporter()
100 |         app_insights_exporter.add_telemetry_processor(_callback_add_cloudrole)
101 |         app_insights_exporter.add_telemetry_processor(
102 |             lambda envelope: _callback_add_context(run_type, envelope)
103 |         )
104 |         return Tracer(exporter=app_insights_exporter, sampler=ProbabilitySampler(rate=1.0))
105 |     except Exception:
106 |         # application insights connection string is not set up in environment.
107 |         # Probably this doesn't run in AML, returning tracer which reports locally.
108 |         return Tracer(sampler=ProbabilitySampler(rate=1.0))
109 | 
110 | 
def function_span_metrics_decorator(span_name: str, run_type: Optional[str]):
    """
    Build a decorator that runs the decorated function inside a tracing span.

    A new tracer is obtained from `_get_tracer` on every call of the decorated function.

    Args:
        span_name: Name given to the span opened around each call.
        run_type: Label forwarded to `_get_tracer`.

    Returns:
        A decorator; the function it returns keeps the metadata of the decorated function.
    """

    def decorator(func):
        @wraps(func)
        def traced_call(*args, **kwargs):
            span = _get_tracer(run_type).span(name=span_name)
            with span:
                result = func(*args, **kwargs)
            return result

        return traced_call

    return decorator
122 | 


--------------------------------------------------------------------------------
/prefer/docs/PREFER_scheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/prefer/docs/PREFER_scheme.png


--------------------------------------------------------------------------------
/prefer/docs/SaltsMod.txt:
--------------------------------------------------------------------------------
 1 | // Notes:
 2 | //  1) don't include charges
 3 | //  2) The search for salts is a substructure search where the substructure
 4 | //     must match the entire fragment, so we don't need to be choosy about bond 
 5 | //     types
 6 | //  3) The matching is done in order, so if you put the more complex stuff at the
 7 | //     bottom the "don't remove the last fragment" algorithm has a chance of
 8 | //     returning something sensible
 9 | 
10 | // start with simple inorganics:
11 | [Cl,Br,I,F]
12 | [Li,Na,K,Ca,Mg,Zn]
13 | [O,N]
14 | 
15 | // "complex" inorganics
16 | [N](=O)(O)O
17 | [P](=O)(O)(O)O
18 | [P](F)(F)(F)(F)(F)F
19 | [S](=O)(=O)(O)O
20 | [CH3][S](=O)(=O)(O)
21 | c1cc([CH3])ccc1[S](=O)(=O)(O)	p-Toluene sulfonate
22 | F[B](F)F
23 | // organics
24 | [CH3]C(=O)O	  Acetic acid
25 | FC(F)(F)C(=O)O	  TFA
26 | OC(=O)C=CC(=O)O	  Fumarate/Maleate
27 | OC(=O)C(=O)O	  Oxalate
28 | OC(=O)C(O)C(O)C(=O)O	  Tartrate
29 | C1CCCCC1[NH]C1CCCCC1	  Dicyclohexylammonium
30 | 
31 | // added for Patent Stuff
32 | OC(=O)C=CC(O)=O succinate
33 | OC(=O)CCC(O)=O
34 | CC(O)=O
35 | [In]
36 | [Cu]
37 | [Zn]
38 | [Y]
39 | C
40 | 


--------------------------------------------------------------------------------
/prefer/model_based_representations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/prefer/model_based_representations/__init__.py


--------------------------------------------------------------------------------
/prefer/model_based_representations/cddd_wrapper.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | import json
 34 | import os
 35 | from typing import Any, List, Tuple, Optional
 36 | 
 37 | import numpy as np
 38 | 
 39 | 
 40 | from prefer.utils.random_utils import set_random_seed
 41 | from prefer.model_based_representations.interface import LatentSpaceMoleculeGenerator
 42 | 
 43 | 
 44 | class CDDDGeneratorModel(LatentSpaceMoleculeGenerator):
 45 |     def __init__(self, dir: str, seed: int = 0, num_workers: int = 6, **kwargs: Any):
 46 |         super().__init__(dir, **kwargs)
 47 | 
 48 |         self.num_workers = num_workers
 49 | 
 50 |         set_random_seed(seed)
 51 | 
 52 |         self._set_inference_model(dir)
 53 | 
 54 |         self._dir = dir
 55 | 
 56 |         self._can_decode_from_scaffold = False
 57 |         # By default we will store the stats in the model dir (but a different loc might be passed to
 58 |         # sampling benchmark or to distribution matching benchmark, which will change it):
 59 |         self.latent_space_stats_file_dir = dir
 60 | 
 61 |     def _set_inference_model(self, dir):
 62 |         from cddd.inference import InferenceModel
 63 | 
 64 |         self._inference_model = InferenceModel(
 65 |             model_dir=dir,
 66 |             use_gpu=True,
 67 |             cpu_threads=self.num_workers,
 68 |             gpu_mem_frac=0.75,
 69 |             batch_size=4096,
 70 |         )
 71 |         self._latent_size = self._inference_model.hparams.emb_size
 72 | 
 73 |     def encode(self, smiles_list: List[str]) -> List[np.ndarray]:
 74 |         """See parent class."""
 75 |         return list(self._inference_model.seq_to_emb(smiles_list))
 76 | 
 77 |     def get_name(self) -> str:
 78 |         return "CDDD"
 79 | 
 80 |     @classmethod
 81 |     def is_valid_dir(cls, model_dir: str) -> bool:
 82 |         file_name = os.path.join(model_dir, "hparams.json")
 83 |         try:
 84 |             if not os.path.exists(file_name):
 85 |                 return False
 86 | 
 87 |             with open(file_name, "rt") as fh:
 88 |                 # Bizarrely, the file contains a quoted JSON string, so we need a double-load here:
 89 |                 hparams = json.loads(json.load(fh))
 90 | 
 91 |             return hparams["model"] == "NoisyGRUSeq2SeqWithFeatures"
 92 |         except Exception:  # Parse errors, key error, etc.
 93 |             return False
 94 | 
 95 |     def set_extra_args(self, **kwargs):
 96 |         workers = kwargs.get("num_workers")
 97 |         if workers is not None:
 98 |             self.num_workers = kwargs.get("num_workers")
 99 |         latent_space_stats = kwargs.get("latent_space_stats_file_dir")
100 |         if latent_space_stats is not None:
101 |             self.latent_space_stats_file_dir = kwargs.get("latent_space_stats_file_dir")
102 | 


--------------------------------------------------------------------------------
/prefer/model_based_representations/interface.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | """Abstract base classes for molecule encoders, generators, etc."""
 34 | from abc import ABC, abstractmethod
 35 | 
 36 | from typing import ContextManager, List, Optional, Tuple, Iterable, Union
 37 | 
 38 | import numpy as np
 39 | from dirhash import dirhash
 40 | from pathlib import Path
 41 | 
# Type alias for arguments accepted either as a string path or as a `pathlib.Path`.
Pathlike = Union[str, Path]
 43 | 
 44 | 
 45 | class MoleculeSampler(ABC):
 46 |     """
 47 |     A molecule generator that can sample random molecules.
 48 |     """
 49 | 
 50 |     def sample(self, num_samples: int) -> List[str]:
 51 |         """
 52 |         Sample SMILES strings using the wrapped model.
 53 | 
 54 |         Args:
 55 |             num_samples: Number of results to return.
 56 | 
 57 |         Returns:
 58 |             List of SMILES strings.
 59 |         """
 60 |         # The below is a default implementation, can be overwritten for specific models.
 61 |         return [smiles for smiles, _ in self.sample_with_emb(num_samples)]
 62 | 
 63 | 
 64 | class AbstractModelRepresentation(ContextManager):
 65 |     """
 66 |     Base class for all molecule encoders, decoders, and samplers, providing
 67 |     - Default implementations for ContextManager
 68 |     - A model_id based on hash of files in the directory where the model is saved
 69 |     - Model name identifying the type of model
 70 |     """
 71 | 
 72 |     def __init__(self, dir: Pathlike, model_id_file_patterns: Iterable[str] = ("*",), **kwargs):
 73 |         # As `dir_hash` takes a `str`, we need to explicitly cast it
 74 |         self._model_id = dirhash(str(dir), "sha256", match=model_id_file_patterns)
 75 | 
 76 |         # Any arguments that make their way into here were passed into a model, but not understood
 77 |         # by it. We intentionally allow this, since it allows the user to provide preferred choices
 78 |         # for arguments without checking if a given model supports them. However, we print a warning
 79 |         # to make this more explicit.
 80 |         if kwargs:
 81 |             print("The following arguments were provided and ignored:", list(kwargs.keys()))
 82 | 
 83 |     def __enter__(self):
 84 |         return self
 85 | 
 86 |     def __exit__(self, exc_type, exc_value, traceback):
 87 |         return None
 88 | 
 89 |     def get_model_id(self) -> str:
 90 |         """
 91 | 
 92 |         Returns: the model id as a string
 93 | 
 94 |         """
 95 |         return self._model_id
 96 | 
 97 |     @abstractmethod
 98 |     def set_extra_args(self, **kwargs) -> None:
 99 |         pass
100 | 
101 |     @classmethod
102 |     @abstractmethod
103 |     def is_valid_dir(cls, model_dir):
104 |         pass
105 | 
106 |     @abstractmethod
107 |     def get_name(self) -> str:
108 |         """
109 | 
110 |         Returns: a human-readable string to describe the model type (e.g. 'MoLeR').
111 | 
112 |         """
113 |         raise NotImplementedError
114 | 
115 | 
class LatentSpaceMoleculeGenerator(AbstractModelRepresentation, MoleculeSampler):
    """
    Autoencoder / Latent Space based Generative Model
    """

    @abstractmethod
    def encode(self, smiles_list: List[str]) -> List[np.ndarray]:
        """
        Map input molecules to points in vector space.

        Args:
            smiles_list: List of molecules as SMILES

        Returns:
            List with the latent space vectors of the molecules, each as a `np.ndarray`.
            The annotation uses `np.ndarray` (the array type) rather than `np.array`
            (the function that creates arrays).
        """
        raise NotImplementedError
133 | 


--------------------------------------------------------------------------------
/prefer/model_based_representations/model_based_representations_factory.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | import os
34 | from typing import Any, List, Optional, Type
35 | 
36 | from prefer.model_based_representations.interface import (
37 |     LatentSpaceMoleculeGenerator,
38 |     AbstractModelRepresentation,
39 | )
40 | 
41 | import tensorflow as tf
42 | 
# Select the wrapper matching the installed TensorFlow major version. The major version is
# compared as an integer: comparing the version strings directly is lexicographic and would
# order e.g. "10.0.0" before "2.0.0".
if int(tf.__version__.split(".")[0]) >= 2:
    # MoLeR environment
    from prefer.model_based_representations.moler_wrapper import MoLeRGeneratorModel

    latent_space_models = [MoLeRGeneratorModel]
else:
    # CDDD environment
    from prefer.model_based_representations.cddd_wrapper import CDDDGeneratorModel

    latent_space_models = [CDDDGeneratorModel]
53 |     
54 | # Add here new model based molecular representation
55 | 
56 | 
def load_latent_model_from_directory(model_dir: str, **kwargs: Any) -> LatentSpaceMoleculeGenerator:
    """
    Load a latent space model from `model_dir`.

    Thin wrapper around `load_model_from_directory` that passes no extra model types; all
    keyword arguments are forwarded unchanged.
    """
    return load_model_from_directory(model_dir, [], **kwargs)
60 | 
61 | 
def load_model_from_directory(
    model_dir: str,
    extra_model_types: Optional[List[Type[AbstractModelRepresentation]]] = None,
    **kwargs: Any,
) -> AbstractModelRepresentation:
    """Loads a model from the given directory.

    Note:
        This method will figure out the exact type of model from the data: the first model
        class whose `is_valid_dir` accepts the directory is instantiated. The module-level
        `latent_space_models` are tried first, then `extra_model_types`.
        All `kwargs` are passed to the model's `__init__` method.

    Args:
        model_dir: Directory containing the saved model.
        extra_model_types: Additional model classes to consider; None means no extras.

    Returns:
        An object implementing the AbstractModelRepresentation interface.

    Raises:
        ValueError: If `model_dir` is not a directory, or no model class recognises it.
    """
    extras = [] if extra_model_types is None else extra_model_types
    candidate_types = latent_space_models + extras
    if not os.path.isdir(model_dir):
        raise ValueError(f"{model_dir} is not a directory!")

    for model_type in candidate_types:
        if not model_type.is_valid_dir(model_dir):
            continue
        return model_type(model_dir, **kwargs)
    raise ValueError(f"{model_dir} does not contain any of the recognised model types.")
86 | 


--------------------------------------------------------------------------------
/prefer/model_based_representations/models/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 


--------------------------------------------------------------------------------
/prefer/model_based_representations/moler_wrapper.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | import os
34 | import pathlib
35 | from typing import Any, List, Tuple, Union
36 | 
37 | import numpy as np
38 | 
39 | from prefer.model_based_representations.interface import LatentSpaceMoleculeGenerator
40 | from molecule_generation.wrapper import VaeWrapper
41 | 
# Type alias for path arguments: either a plain string or a pathlib.Path.
Pathlike = Union[str, pathlib.Path]
43 | 
44 | 
class MoLeRGeneratorModel(VaeWrapper, LatentSpaceMoleculeGenerator):
    """MoLeR model exposed through the ``LatentSpaceMoleculeGenerator`` interface.

    The class inherits from both ``VaeWrapper`` (``molecule_generation``) and
    the PREFER ``LatentSpaceMoleculeGenerator`` base class.
    """

    def __init__(
        self, dir: Pathlike, seed: int = 0, num_workers: int = 6, beam_size: int = 1, **kwargs: Any
    ):
        """Initialise both base classes from the same model directory.

        Args:
            dir: directory holding the trained model files.
            seed: forwarded unchanged to ``VaeWrapper.__init__``.
            num_workers: forwarded unchanged to ``VaeWrapper.__init__``.
            beam_size: forwarded unchanged to ``VaeWrapper.__init__``.
            **kwargs: forwarded to ``LatentSpaceMoleculeGenerator.__init__``.
        """
        # The two bases take different arguments, so each __init__ is invoked
        # explicitly instead of going through super().
        VaeWrapper.__init__(self, dir, seed=seed, num_workers=num_workers, beam_size=beam_size)
        LatentSpaceMoleculeGenerator.__init__(
            self, dir, model_id_file_patterns=("*_best.pkl", "*_best.hdf5"), **kwargs
        )

        self._can_decode_from_scaffold = True

    def get_name(self) -> str:
        """Return the fixed display name of this model type."""
        return "MoLeR"

    @classmethod
    def is_valid_dir(cls, model_dir: str) -> bool:
        """Tell whether ``model_dir`` contains a MoLeR model file.

        A directory qualifies when at least one file name contains the marker
        ``_MoLeR__`` or ``_MotifMoLeR__``.

        Raises:
            OSError: propagated from ``os.listdir`` when the directory cannot be read.
        """
        files_in_dir = os.listdir(model_dir)
        return any(
            "_MoLeR__" in filename or "_MotifMoLeR__" in filename for filename in files_in_dir
        )

    def set_extra_args(self, **kwargs):
        """Override ``num_workers`` and/or ``beam_size`` after construction.

        Only the keys ``num_workers`` and ``beam_size`` are read; a key that is
        missing or set to ``None`` leaves the current value untouched.
        """
        workers = kwargs.get("num_workers")
        if workers is not None:
            self.num_workers = workers
        beam = kwargs.get("beam_size")
        if beam is not None:
            self.beam_size = beam
74 | 


--------------------------------------------------------------------------------
/prefer/molecule_representations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/prefer/molecule_representations/__init__.py


--------------------------------------------------------------------------------
/prefer/molecule_representations/descriptors2D_representations_builder.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | #!/usr/bin/env python
 34 | # -*- coding: utf-8 -*-
 35 | 
 36 | import logging
 37 | import sys
 38 | 
 39 | from pandas import DataFrame
 40 | 
 41 | 
 42 | from prefer.utils.data_utils import check_if_nan, generate_molecule, generate2DDesc
 43 | from prefer.src.molecule_representations import MoleculeRepresentations
 44 | from prefer.src.molecule_representations_builder import MoleculeRepresentationsBuilder
 45 | from prefer.src.vector_molecule_representations import VectorMoleculeRepresentations
 46 | 
 47 | 
 48 | class Descriptors2DRepresentationsBuilder(MoleculeRepresentationsBuilder):
 49 |     #TODO Set scale_type to None
 50 |     def __init__(
 51 |         self, limit_def: int = None, scale_type: str = "standardization"
 52 |     ):
 53 |         self.limit_def = limit_def
 54 |         self.scale_type = scale_type # e.g. standardization
 55 |         
 56 |     def build_representations(
 57 |         self, molecule_data_orig: DataFrame, split_type: str = "random", seed=1,
 58 |     ) -> MoleculeRepresentations:
 59 |         """
 60 |         method to compute Morgan Fingerprints as implemented in RDKit
 61 | 
 62 |         Input:
 63 |         - molecule_data_orig: this is a dataframe of the shape
 64 |         | ID | Smiles | Property_1 | Property_2 | ... | Property_N |
 65 |         ------------------------------------------------------------
 66 |         - split_type: string related to the type of test/train split one want to apply. Possible split_type are random, temporal and cluster. One can add new splitting strategies in utils.splitting_strategies
 67 |         Output:
 68 |         - MoleculeRepresentations object
 69 |         """
 70 | 
 71 |         print(f'Building Descriptors 2D. Warning: current scale_type is set to {self.scale_type}')
 72 |         molecule_data = molecule_data_orig.copy()
 73 |         logging.info("Generate 2D Descriptors")
 74 |         molecules = generate_molecule(molecule_data)
 75 |         # Generate _2DDescriptors
 76 |         molecule_data["molecule_representation"] = generate2DDesc(molecules)
 77 |         molecule_data = self.remove_nan(molecule_data)
 78 |         vector_molecule_representation = VectorMoleculeRepresentations(
 79 |             df=molecule_data,
 80 |             representation_name="DESCRIPTORS2D",
 81 |             split_type=split_type,
 82 |             scale_type=self.scale_type,
 83 |             seed=seed,
 84 |             limit_def = self.limit_def,
 85 |         )
 86 | 
 87 |         return vector_molecule_representation
 88 | 
 89 |     def remove_nan(self, molecule_data: DataFrame):
 90 |         """
 91 |         method use to check whetehr a representation has nan values and in case remove the corresponding row.
 92 | 
 93 |         input: representation_to_add is the representation to check
 94 |         """
 95 |         nan_rows = check_if_nan(molecule_data["molecule_representation"])
 96 |         if nan_rows:
 97 |             logging.warning(
 98 |                 "Found nan in the representation:"
 99 |                 + "2D Descriptors"
100 |                 + ". The following sample/s should be removed from the dataframe:"
101 |                 + str(nan_rows)
102 |             )
103 |             molecule_data = molecule_data.drop(molecule_data.index[nan_rows])
104 |             # Reset indices
105 |             molecule_data = molecule_data.reset_index(drop=True)
106 |         return molecule_data
107 | 


--------------------------------------------------------------------------------
/prefer/molecule_representations/fingerprints_representations_builder.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | #!/usr/bin/env python
34 | # -*- coding: utf-8 -*-
35 | 
36 | import logging
37 | import sys
38 | 
39 | from pandas import DataFrame
40 | 
41 | 
42 | from prefer.utils.data_utils import check_if_nan, generate_fingerprints, generate_molecule
43 | from prefer.src.molecule_representations_builder import MoleculeRepresentationsBuilder
44 | from prefer.src.molecule_representations import MoleculeRepresentations
45 | from prefer.src.vector_molecule_representations import VectorMoleculeRepresentations
46 | 
47 | 
class FingerprintsRepresentationsBuilder(MoleculeRepresentationsBuilder):
    """Builder that turns a molecule dataframe into fingerprint representations."""

    def __init__(
        self, limit_def: int = None,
    ):
        """
        Store the setting that is later handed to VectorMoleculeRepresentations.

        Input:
        - limit_def: passed through unchanged as `limit_def`
        """
        self.limit_def = limit_def

    def build_representations(
        self, molecule_data_orig: DataFrame, split_type: str = "random", seed=1,
    ) -> MoleculeRepresentations:
        """
        method to compute the fingerprints (logged as Morgan Fingerprints) of every molecule in the dataframe

        Input:
        - molecule_data_orig: this is a dataframe of the shape
        | ID | Smiles | Property_1 | Property_2 | ... | Property_N |
        ------------------------------------------------------------
          The dataframe is copied, the caller's object is left untouched.
        - split_type: string related to the type of test/train split one wants to apply. Possible split_type are random, temporal and cluster. One can add new splitting strategies in utils.splitting_strategies
        - seed: forwarded to VectorMoleculeRepresentations
        Output:
        - MoleculeRepresentations object
        """
        working_df = molecule_data_orig.copy()
        logging.info("Generate Morgan Fingerprints")
        # One fingerprint per molecule, stored in a dedicated column
        working_df["molecule_representation"] = generate_fingerprints(
            generate_molecule(working_df)
        )
        return VectorMoleculeRepresentations(
            df=self.remove_nan(working_df),
            representation_name="FINGERPRINTS",
            split_type=split_type,
            seed=seed,
            limit_def=self.limit_def,
        )

    def remove_nan(self, molecule_data: DataFrame):
        """
        method used to check whether the computed representation has nan values and, if so, to remove the corresponding rows.

        input: molecule_data is the dataframe holding the "molecule_representation" column to check
        output: the same dataframe when nothing was flagged, otherwise a copy without the flagged rows and with a fresh index
        """
        nan_rows = check_if_nan(molecule_data["molecule_representation"])
        if not nan_rows:
            return molecule_data

        logging.warning(
            "Found nan in the representation:"
            + "fingerprints"
            + ". The following sample/s should be removed from the dataframe:"
            + str(nan_rows)
        )
        # Translate the flagged positions into index labels, drop them, renumber
        flagged_labels = molecule_data.index[nan_rows]
        return molecule_data.drop(flagged_labels).reset_index(drop=True)
98 | 


--------------------------------------------------------------------------------
/prefer/molecule_representations/model_representations_builder.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | #!/usr/bin/env python
 34 | # -*- coding: utf-8 -*-
 35 | 
 36 | import logging
 37 | import sys
 38 | 
 39 | from pandas import DataFrame
 40 | 
 41 | 
 42 | from prefer.utils.data_utils import check_if_nan
 43 | from prefer.src.molecule_representations_builder import MoleculeRepresentationsBuilder
 44 | from prefer.src.molecule_representations import MoleculeRepresentations
 45 | from prefer.src.vector_molecule_representations import VectorMoleculeRepresentations
 46 | from prefer.model_based_representations.model_based_representations_factory import (
 47 |     load_model_from_directory,
 48 | )
 49 | 
 50 | 
 51 | class ModelRepresentationsBuilder(MoleculeRepresentationsBuilder):
 52 |     def __init__(
 53 |         self, path_to_model: str = None, representation_name: str = None, limit_def: int = None,
 54 |     ):
 55 |         self.path_to_model = path_to_model
 56 |         # here instance models
 57 |         self.model_instance = load_model_from_directory(path_to_model)
 58 |         self.representation_name = representation_name
 59 |         self.limit_def = limit_def
 60 | 
 61 |     def build_representations(
 62 |         self,
 63 |         molecule_data_orig: DataFrame,
 64 |         embedding_types: str = "vector",
 65 |         split_type: str = "random",
 66 |         padding_size: int = 100,
 67 |         seed=1,
 68 |     ) -> MoleculeRepresentations:
 69 |         """
 70 |         generic generator model to convert smile to embeddings
 71 |         Input:
 72 |         - molecule_data: this is a dataframe of the shape
 73 |         | ID | Smiles | Property_1 | Property_2 | ... | Property_N |
 74 |         ------------------------------------------------------------
 75 |         - split_type: string related to the type of test/train split one want to apply.
 76 |             Possible split_type are random, temporal and cluster. One can add new splitting strategies
 77 |             in utils.splitting_strategies
 78 |         - padding_size: max dimension of the final list of vectors (max number of atoms per molecule)
 79 | 
 80 |         Output:
 81 |         - MoleculeRepresentations object
 82 |         """
 83 |     
 84 |             
 85 |             
 86 |         if self.representation_name is None:
 87 |             self.representation_name = "model_based_representation"
 88 | 
 89 |         if embedding_types not in ["vector"]:
 90 |             raise ValueError("ERROR: embedding_types not known, only vector is possible.")
 91 | 
 92 |         molecule_data = molecule_data_orig.copy()
 93 |         logging.info("Generate Model based Representation")
 94 | 
 95 |         try:
 96 | 
 97 |             if embedding_types == "vector":
 98 |                 with self.model_instance as model:
 99 |                     smiles_embedding = model.encode(molecule_data.Smiles.to_list())
100 |                     version_model_ID = model.get_model_id()
101 |                 list_of_smiles_embedding = [x for x in smiles_embedding]
102 |             else:
103 |                 raise ValueError(f"{embedding_types} not known. Only vector is possible.")
104 |         except Exception as e:
105 |             raise ValueError(
106 |                 f"ERROR: the model directory for the model based representation might be incorrect or another error occurred: ValueError exception thrown{e}"
107 |             )
108 | 
109 |         if embedding_types == "vector":
110 |             molecule_data["molecule_representation"] = list_of_smiles_embedding
111 |             molecule_data = self.remove_nan(molecule_data)
112 |             return VectorMoleculeRepresentations(
113 |                 df=molecule_data,
114 |                 representation_name=self.representation_name,
115 |                 split_type=split_type,
116 |                 seed=seed,
117 |                 model_id=version_model_ID,
118 |                 limit_def = self.limit_def,
119 |             )
120 |         else:
121 |             raise ValueError(
122 |                 f"embedding_types: {embedding_types} not known. Only vector is supported"
123 |             )
124 | 
125 |     def remove_nan(self, molecule_data: DataFrame):
126 |         """
127 |         method use to check whetehr a representation has nan values and in case remove the corresponding row.
128 | 
129 |         input: representation_to_add is the representation to check
130 |         """
131 |         nan_rows = check_if_nan(molecule_data["molecule_representation"])
132 |         if nan_rows:
133 |             logging.warning(
134 |                 "Found nan in the representation"
135 |                 + self.representation_name
136 |                 + ". The following sample/s should be removed from the dataframe:"
137 |                 + str(nan_rows)
138 |             )
139 |             molecule_data = molecule_data.drop(molecule_data.index[nan_rows])
140 |             # Reset indices
141 |             molecule_data = molecule_data.reset_index(drop=True)
142 |         return molecule_data
143 | 


--------------------------------------------------------------------------------
/prefer/schema/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/prefer/schema/__init__.py


--------------------------------------------------------------------------------
/prefer/schema/config.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | from __future__ import annotations
34 | from typing import List, Dict, Optional
35 | import yaml
36 | import pydantic
37 | from pydantic import Extra
38 | 
39 | """pydantic Models defining structure of a PREFER config YAML file"""
40 | 
41 | 
class PreferConfig(pydantic.BaseModel):
    # Fields shared by the local and global PREFER configuration files.
    # (Kept as a comment rather than a docstring so the model's generated
    # schema description does not change.)

    class Config:
        # Keys that are not declared as fields are rejected instead of ignored.
        extra = Extra.forbid

    problem_type: str
    experiment_name: str
    smiles_column_name: str
    id_column_name: str
    desirability_scores: Optional[Dict[str, List[Dict[str, float]]]]
    splitting_strategy: str = "random"

    @classmethod
    def from_yaml_file(cls, path: str) -> PreferConfig:
        """Build a config object from the YAML file at ``path``.

        Raises:
            OSError: if the file cannot be opened.
            yaml.YAMLError: if the content is not valid YAML.
            pydantic.ValidationError: if the parsed content does not match the fields.
        """
        # Binary mode lets PyYAML detect the encoding itself (BOM, UTF-8 by
        # default) instead of decoding with the platform's locale encoding.
        # NOTE(review): FullLoader accepts more than plain data tags; only load
        # trusted config files, or consider yaml.safe_load if configs are plain data.
        with open(path, "rb") as f:
            parsed_yaml = yaml.load(f, Loader=yaml.FullLoader)
        return cls.parse_obj(parsed_yaml)
58 | 
59 | 
class LocalConfig(PreferConfig):
    """Config for training a local (project-specific) property model"""

    # Fields added on top of those declared in PreferConfig; all four are plain
    # strings declared without a default value.
    assay_name: str
    project_code: str
    properties_column_name: str
    # TODO why do we have different field names 'datapath' and 'path_to_df' for local and global models?
    # NOTE(review): presumably the location of the input data, mirroring
    # GlobalConfig.path_to_df; confirm against the code that reads it.
    datapath: str
68 | 
69 | 
class GlobalConfig(PreferConfig):
    """Config for training a global (not project-specific) property model"""

    # Fields added on top of those declared in PreferConfig.
    # NOTE(review): presumably the names of the property columns to model; confirm.
    properties_column_name_list: List[str]
    # NOTE(review): presumably the location of the input dataframe; confirm.
    path_to_df: str
    # String-to-string mapping.
    # NOTE(review): looks like representation name -> model location; confirm.
    representations: Dict[str, str]
    temporal_info_column_name: Optional[str]
77 | 


--------------------------------------------------------------------------------
/prefer/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 


--------------------------------------------------------------------------------
/prefer/scripts/aml_context.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
def run_aml_context():
    """Return the current Azure ML run context, or None when it is unavailable.

    The azureml import happens inside the function, so a missing package is
    handled like any other failure: the exception is printed and None is returned.
    """
    try:
        from azureml.core.run import Run

        return Run.get_context(allow_offline=False)
    except Exception as error:
        # Best effort: report the reason and signal "no context" to the caller.
        print(error)
        return None
44 | 


--------------------------------------------------------------------------------
/prefer/scripts/combine_results.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | #!/usr/bin/python
 34 | 
 35 | import shutil
 36 | 
 37 | from prefer.src.benchmarking import Benchmarking
 38 | from prefer.utils.models_evaluation import save_html
 39 | from prefer.utils.save_load import save_combined_results
 40 | import pandas as pd
 41 | import os
 42 | import argparse
 43 | import json
 44 | 
 45 | 
 46 | def extract_folders(original_flder):
 47 | 
 48 |     dirfiles = os.listdir(original_flder)
 49 |     fullpaths = map(lambda name: os.path.join(original_flder, name), dirfiles)
 50 |     dirs = []
 51 |     for file in fullpaths:
 52 |         if (os.path.isdir(file)) and (file[0] not in [".", "_"]):
 53 |             dirs.append(file)
 54 |     return dirs[0]
 55 | 
 56 | 
 57 | def combine_results_from_args(
 58 |     store_result_folder,
 59 |     problem_type,
 60 |     benchs_folder1,
 61 |     benchs_folder2,
 62 |     benchs_folder3,
 63 |     benchs_folder4,
 64 |     experiment_name=None,
 65 |     save_json=True,
 66 | ):
 67 | 
 68 |     """
 69 |     Script to combine all the results stored as separate bench objects in different folders, colelcted in the input list, computed from different PREFER runs.
 70 |     """
 71 | 
 72 |     collect_df = {}
 73 |     results = []
 74 |     collect_bench = []
 75 |     # create final folder
 76 |     os.makedirs(store_result_folder, exist_ok=True)
 77 | 
 78 |     if store_result_folder.endswith("/"):  # Normalise away trailing slashes
 79 |         store_result_folder = store_result_folder[:-1]
 80 | 
 81 |     all_folders = [benchs_folder1, benchs_folder2, benchs_folder3, benchs_folder4]
 82 | 
 83 |     for folder in all_folders:
 84 |         if folder:
 85 |             end = folder.split("/")[-1]
 86 |             if "." in end:
 87 |                 folder = folder.split("/")[:-1]
 88 |                 folder = "/".join(folder)
 89 |             folder = extract_folders(folder)
 90 |             # folder of interests
 91 |             final_dir = folder
 92 | 
 93 |             if not final_dir.endswith("/"):  # Normalise away trailing slashes
 94 |                 final_dir = final_dir + "/"
 95 | 
 96 |             tmp = Benchmarking(problem_type=problem_type)
 97 |             try:
 98 |                 tmp.load(final_dir)
 99 |                 print("bench loaded")
100 |                 tmp.create_summary_table()
101 |                 print("summary_table computed")
102 | 
103 |                 if experiment_name:
104 |                     experiment_name_tmp = experiment_name
105 |                 else:
106 |                     experiment_name_tmp = tmp.experiment_name
107 |                 tmp.table_metrics.rename(
108 |                     columns={
109 |                         tmp.table_metrics.columns[0]: experiment_name_tmp
110 |                         + ":"
111 |                         + tmp.table_metrics.columns[0]
112 |                     },
113 |                     inplace=True,
114 |                 )
115 | 
116 |                 if experiment_name_tmp not in collect_df:
117 |                     collect_df[experiment_name_tmp] = [tmp.table_metrics]
118 |                 else:
119 |                     collect_df[experiment_name_tmp].append(tmp.table_metrics)
120 |                 collect_bench.append(tmp)
121 | 
122 |             except Exception as e:
123 |                 # WARNING?
124 |                 raise ValueError(
125 |                     f"An error occurred with folder: {final_dir}. Benchmarking object cannot be imported. In particular: {e}",
126 |                 )
127 | 
128 |             # dump metrics for every model:
129 |             metrics_dict = tmp.table_metrics.to_dict()
130 |             print("metrics_dict created")
131 |             # Note: As the df with the metric has only <representation>+<model type> as identificator
132 |             # so we add there experiment name and in the body attach problem type.
133 |             experiment_id = next(iter(metrics_dict))
134 |             new_experiment_id = experiment_name_tmp + "," + experiment_id
135 |             metrics_dict[new_experiment_id] = metrics_dict[experiment_id]
136 |             metrics_dict[new_experiment_id]["Problem type"] = tmp.problem_type
137 |             del metrics_dict[experiment_id]
138 |             results.append(metrics_dict)
139 |             print("metrics_dict appended")
140 |         else:
141 |             continue
142 | 
143 |     # Then save one json with all the experiments (dataset x model type):
144 |     if save_json:
145 |         with open(store_result_folder + "/" + "PREFER_comparison_table.json", "w") as jsonfile:
146 |             json.dump(results, jsonfile)
147 | 
148 |     for key in collect_df.keys():
149 |         merged = pd.concat(collect_df[key], axis=1)
150 |         save_html(
151 |             merged, df_name=key, path=store_result_folder + "/" + "PREFER_comparison_table.html",
152 |         )
153 |         merged.to_csv(store_result_folder + "/" + "PREFER_comparison_table.csv")
154 |         merged.to_pickle(store_result_folder + "/" + "PREFER_comparison_table.pkl")
155 | 
156 |     return
157 | 
158 | 
if __name__ == "__main__":
    # Command line entry point: parse the folders to combine and delegate to
    # combine_results_from_args.
    parser = argparse.ArgumentParser(description="combine results of different PREFER runs")
    parser.add_argument(
        "-bf1",
        "--benchs_folder1",
        type=str,
        help="path of the folder where results are stored",
        required=True,
    )

    # Up to three further result folders can be combined; these are optional.
    for index in (2, 3, 4):
        parser.add_argument(
            f"-bf{index}",
            f"--benchs_folder{index}",
            type=str,
            help="path of the folder where results are stored",
        )

    parser.add_argument(
        "-srf",
        "--store_result_folder",
        type=str,
        help="path of the folder where results are stored",
        # Without a destination the combine step fails in os.makedirs(None),
        # so reject the invocation up front with a clear argparse error.
        required=True,
    )

    parser.add_argument(
        "-pt", "--problem_type", type=str, help="problem_type: regression or classification",
    )

    args = parser.parse_args()
    combine_results_from_args(
        benchs_folder1=args.benchs_folder1,
        benchs_folder2=args.benchs_folder2,
        benchs_folder3=args.benchs_folder3,
        benchs_folder4=args.benchs_folder4,
        store_result_folder=args.store_result_folder,
        problem_type=args.problem_type,
    )
201 | 


--------------------------------------------------------------------------------
/prefer/scripts/get_representations.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | #!/usr/bin/python
 34 | import argparse
 35 | import json
 36 | import logging
 37 | import os
 38 | import sys
 39 | 
 40 | import pandas as pd
 41 | import shutil
 42 | 
 43 | 
 44 | from prefer.utils.data_preparation import prepare_data
 45 | from prefer.utils.mapping import mapping_representations
 46 | 
 47 | logger = logging.getLogger(__name__)
 48 | 
 49 | 
 50 | def compute_representations_from_args(
 51 |     path_to_df,
 52 |     representation_to_compute,
 53 |     path_to_model,
 54 |     output_dir,
 55 |     experiment_name,
 56 |     id_column_name,
 57 |     smiles_column_name,
 58 |     splitting_strategy,
 59 |     temporal_info_column_name,
 60 |     properties_column_name_list,
 61 | ):
 62 |     """
 63 |     we assume that the dataset has the semicolomn as separator and that the dataset
 64 |     """
 65 |     if os.path.exists(output_dir):
 66 |         shutil.rmtree(output_dir)
 67 |     os.makedirs(output_dir, exist_ok=True)
 68 | 
 69 |     # Read your .csv files
 70 |     if path_to_df.endswith("/"):  # Normalise away trailing slashes
 71 |         path_to_df = path_to_df[:-1]
 72 | 
 73 |     try:
 74 |         arr = os.listdir(path_to_df)
 75 |         path_to_df = path_to_df + "/" + arr[0]
 76 |     except Exception:
 77 |         logger.info("Already a file")
 78 | 
 79 |     try:
 80 |         df = pd.read_csv(path_to_df)
 81 |     except Exception:
 82 |         df = pd.read_csv(path_to_df, sep=";")
 83 | 
 84 |     # in prepare_data now the dataset is both prepared and filtered
 85 | 
 86 |     # Manipulate dataframe such that it is in the right shape fo being used as input of the DataStorage class
 87 |     # ¦ ID ¦ Smiles ¦ Property_1 ¦ Property_2 ¦ ... ¦ Property_N ¦
 88 |     # -------------------------------------------------------------
 89 |     # This is done by specifying the experiment_name, the name of column where the ID information and SMILES representation of each sample is stored, and finally
 90 |     # the list of the columns' names of the properties to model.
 91 |     df = prepare_data(
 92 |         df=df,
 93 |         id_column_name=id_column_name,
 94 |         smiles_column_name=smiles_column_name,
 95 |         properties_column_name_list=properties_column_name_list,
 96 |         temporal_info_column_name=temporal_info_column_name,
 97 |     )
 98 | 
 99 |     mapping_representations(
100 |         representation_name=representation_to_compute,
101 |         df=df,
102 |         output_dir=output_dir,
103 |         path_to_model=path_to_model,
104 |         experiment_name=experiment_name,
105 |         path_to_df=path_to_df,
106 |         split_type=splitting_strategy,
107 |     )
108 |     logger.info("Representation Computed")
109 |     
110 |     return output_dir
111 | 
112 | 
if __name__ == "__main__":
    """
    Example of usage:
    %run get_representations.py -ptd "/path/to/dataframe/dataframe.csv" -rtc "FINGERPRINTS"
    -od "/path/to/representation/PREFER_automation_branch/" -en "logD" -icn "Molecule ChEMBL ID"
    -scn "Smiles" -ss "random" -pcn "Standard Value"
    """
    parser = argparse.ArgumentParser(description="compute molecule representation")
    parser.add_argument(
        "-ptd",
        "--path_to_df",
        type=str,
        help="The entire path to the dataframe used for this experiment. The dataframe should be stored as .csv, "
        "should use semicolomn as separator and should contain information about the SMILE representation "
        "of each molecule, an ID of the molecules and the property/ies one want to model.",
        required=True,
    )

    parser.add_argument(
        "-rtc",
        "--representation_to_compute",
        type=str,
        help="name of the rapresentation to compute or path to the generator which is used "
        "to map smiles into embeddings. If a model-based representation is selected then a "
        "path to model should be indicated",
        required=True,
    )

    parser.add_argument(
        "-ptm",
        "--path_to_model",
        type=str,
        help="path to the model that should be used to convert smiles into embeddings",
    )

    parser.add_argument(
        "-od",
        "--output_dir",
        type=str,
        help="path to the directory where to store the molecule representation computed",
        required=True,
    )

    parser.add_argument(
        "-en",
        "--experiment_name",
        type=str,
        help="name of the experiment one would like to perform. E.g. logD",
        required=True,
    )

    parser.add_argument(
        "-icn",
        "--id_column_name",
        type=str,
        help="name of the dataframe column where the id of each molecule is stored",
        required=True,
    )

    parser.add_argument(
        "-scn",
        "--smiles_column_name",
        type=str,
        help="name of the dataframe column where the smile representation of each molecule is stored",
        required=True,
    )

    parser.add_argument(
        "-ss",
        "--splitting_strategy",
        type=str,
        help="name of splitting startegy selected [random, temporal, cluster]",
        required=True,
    )

    parser.add_argument(
        "-ticn",
        "--temporal_info_column_name",
        type=str,
        help="name of the column where the temporal information is stored",
    )

    parser.add_argument(
        "-pcn",
        "--properties_column_name",
        action="append",
        help="list of names of the dataframe columns where the property/ies of each molecule is stored",
        required=True,
    )
    # if multiple tasks -pcn "Task1" -pcn "Task2" -pcn "Task3"

    args = parser.parse_args()
    if (
        args.representation_to_compute not in ["FINGERPRINTS", "DESCRIPTORS2D", "TF2_GNN"]
        and not args.path_to_model
    ):
        raise RuntimeError(
            "Please specify a path_to_model for molecular representations which are not in the default ones "
            "[FINGERPRINTS, DESCRIPTORS2D, TF2_GNN]"
        )

    # The properties can be given either as one JSON list (-pcn '["A", "B"]')
    # or as repeated flags (-pcn "A" -pcn "B"). Only a decoded JSON *list* is
    # accepted: a column literally named "1", "true" or "null" is valid JSON
    # too, and must stay a column name rather than become a scalar.
    raw_properties = args.properties_column_name
    try:
        decoded = json.loads(raw_properties[0])
    except ValueError:  # json.JSONDecodeError is a ValueError
        decoded = None
    properties_column_name = decoded if isinstance(decoded, list) else list(raw_properties)

    compute_representations_from_args(
        path_to_df=args.path_to_df,
        representation_to_compute=args.representation_to_compute,
        path_to_model=args.path_to_model,
        output_dir=args.output_dir,
        experiment_name=args.experiment_name,
        id_column_name=args.id_column_name,
        smiles_column_name=args.smiles_column_name,
        splitting_strategy=args.splitting_strategy,
        temporal_info_column_name=args.temporal_info_column_name,
        properties_column_name_list=properties_column_name,
    )
233 | 


--------------------------------------------------------------------------------
/prefer/scripts/run_PREFER.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | #!/usr/bin/python
 34 | 
 35 | import argparse
 36 | import os
 37 | import sys
 38 | import time
 39 | import logging
 40 | 
 41 | 
 42 | from prefer.src.vector_molecule_representations import VectorMoleculeRepresentations
 43 | from prefer.src.benchmarking import Benchmarking
 44 | from prefer.utils.save_load import saving_procedure_autosklearn
 45 | 
 46 | import tempfile
 47 | 
 48 | logger = logging.getLogger(__name__)
 49 | 
 50 | 
 51 | def run_PREFER_from_args(
 52 |     problem_type, representation_name, repr_dir, final_folder_path, experiment_name=None,
 53 | ):
 54 | 
 55 |     os.makedirs(final_folder_path, exist_ok=True)
 56 | 
 57 |     if final_folder_path is None:
 58 |         final_folder_path = "."
 59 | 
 60 |     # Import saved representation
 61 |     repr_type = retrieve_type_of_molecular_representation(representation_name)
 62 |     list_if_files = os.listdir(repr_dir)
 63 |     if not repr_dir.endswith("/"):  # Normalise away trailing slashes
 64 |         repr_dir = repr_dir + "/"
 65 | 
 66 |     repr_ = repr_type.load(repr_dir + list_if_files[0])
 67 | 
 68 |     tasks_number = len([col for col in repr_.df.columns if "Property" in col])
 69 |     if tasks_number == 1:
 70 |         mask = False
 71 |     else:
 72 |         mask = True
 73 |     logger.info(tasks_number, mask)
 74 | 
 75 |     with tempfile.TemporaryDirectory() as tmpdirname:
 76 |         bench = Benchmarking(problem_type=problem_type, working_directory=tmpdirname,)
 77 |         try:
 78 |             bench.benchmark([repr_], experiment_name=experiment_name)
 79 |         except TypeError as e:
 80 |             logger.error("EXCEPTION during property model training: ", e)
 81 |             pass
 82 | 
 83 |         # saving procedure
 84 |         timestr = time.strftime("%Y%m%d-%H%M%S")
 85 |         name = representation_name
 86 |         if experiment_name is not None:
 87 |             name = name + "_" + experiment_name
 88 |         try:
 89 |             if not os.path.exists(final_folder_path):
 90 |                 os.mkdir(final_folder_path)
 91 |         except OSError as e:
 92 |             logger.error("Creation of the directory %s failed", final_folder_path, e)
 93 |         else:
 94 |             logger.info("Successfully created the directory %s ", final_folder_path)
 95 |         dir_destination = final_folder_path + "/" + name + "_" + timestr
 96 | 
 97 |         saving_procedure_autosklearn(bench, dir_destination)
 98 |     return
 99 | 
100 | 
def retrieve_type_of_molecular_representation(representation_name: str) -> type:
    """
    Return the class used to load a stored molecular representation.

    ``representation_name`` is currently not inspected: every representation
    is handled by ``VectorMoleculeRepresentations``.
    """
    representation_cls = VectorMoleculeRepresentations
    return representation_cls
103 | 
104 | 
if __name__ == "__main__":
    """
    Example of usage:
    %run run_PREFER.py -pt "regression" -rn "FINGERPRINTS" -rd "/path/to/representation/PREFER_automation_branch/" -ffp "/path/to/results/" -en "logD"
    """
    parser = argparse.ArgumentParser(description="run PREFER")
    parser.add_argument(
        "-pt",
        "--problem_type",
        type=str,
        help="whether this is a <regression> or a <classification> problem",
        required=True,
    )

    # NOTE(review): the original comment suggested that a list of representations
    # could be passed and combined first; as written a single name is accepted.
    parser.add_argument(
        "-rn",
        "--representation_name",
        type=str,
        help="name of the rapresentation to compute or path to the generator which is used to map smiles into embeddings",
        required=True,
    )

    parser.add_argument(
        "-rd",
        "--repr_dir",
        type=str,
        help="directory where the selected representation is stored",
        required=True,
    )

    parser.add_argument(
        "-ffp",
        "--final_folder_path",
        type=str,
        help="directory where the results will be stored. If not specified results will be store in the "
        "current directory.",
    )

    parser.add_argument(
        "-en", "--experiment_name", type=str, help="name of the current experiment",
    )

    args = parser.parse_args()
    run_PREFER_from_args(
        problem_type=args.problem_type,
        representation_name=args.representation_name,
        repr_dir=args.repr_dir,
        final_folder_path=args.final_folder_path,
        experiment_name=args.experiment_name,
    )
155 | 


--------------------------------------------------------------------------------
/prefer/scripts/utils.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
def validate_prefer_config(config: dict) -> bool:
    """
    Tell whether ``config`` contains every key required for a PREFER run.

    Only the presence of the keys is checked, not their values.
    """
    mandatory_keys = (
        "path_to_df",
        "experiment_name",
        "id_column_name",
        "smiles_column_name",
        "properties_column_name_list",
        "problem_type",
        "representations",
    )
    available = config.keys()
    return all(key in available for key in mandatory_keys)
48 | 
49 | 
def validate_local_model_config(config: dict) -> bool:
    """
    Tell whether ``config`` contains every key required for a local model.

    Only the presence of the keys is checked, not their values.
    """
    mandatory_keys = (
        "datapath",
        "experiment_name",
        "assay_name",
        "project_code",
        "id_column_name",
        "smiles_column_name",
        "properties_column_name",
        "problem_type",
    )
    available = config.keys()
    return all(key in available for key in mandatory_keys)
66 | 


--------------------------------------------------------------------------------
/prefer/src/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 


--------------------------------------------------------------------------------
/prefer/src/molecule_representations.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | import logging
 34 | import pickle
 35 | from abc import abstractmethod, ABC
 36 | import time
 37 | from dataclasses import dataclass
 38 | from typing import Optional
 39 | import os
 40 | 
 41 | from pandas import DataFrame
 42 | 
 43 | 
 44 | @dataclass
 45 | class MoleculeRepresentations(ABC):
 46 |     df: DataFrame
 47 |     representation_name: str
 48 |     split_type: str
 49 |     model_path: str = ""
 50 |     repr_type: str = ""
 51 |     model_id: str = "tmp_id"
 52 |     experiment_name: str = "new_experiment"
 53 |     path_to_df: str = ""
 54 |     limit_def: int = None
 55 | 
 56 |     @abstractmethod
 57 |     def split(self):
 58 |         pass
 59 | 
 60 |     def save(
 61 |         self,
 62 |         path: str,
 63 |         name: Optional[str] = None,
 64 |         experiment_name: Optional[str] = None,
 65 |         path_to_df: Optional[str] = None,
 66 |     ):
 67 |         """
 68 | 
 69 |         method to save the MoleculeRepresentations object in the location specified by path
 70 | 
 71 |         Usage:
 72 |         mol_repr.save('../folder/')
 73 |         """
 74 | 
 75 |         if experiment_name is not None:
 76 |             self.experiment_name = experiment_name
 77 | 
 78 |         if path_to_df is not None:
 79 |             self.path_to_df = path_to_df
 80 | 
 81 |         timestr = time.strftime("%Y%m%d-%H%M%S")
 82 | 
 83 |         final_path = os.path.join(
 84 |             path,
 85 |             f"{self.experiment_name}_{name or self.representation_name}_{self.repr_type}_{timestr}.pkl",
 86 |         )
 87 | 
 88 |         with open(final_path, "wb",) as output:
 89 |             pickle.dump(self.__dict__, output, pickle.HIGHEST_PROTOCOL)
 90 | 
 91 |         logging.info(f"Representation saved in {final_path}")
 92 | 
 93 |     @classmethod
 94 |     def load(cls, path: str):
 95 |         """
 96 |         Load MoleculeRepresentations from a .pkl file.
 97 |         """
 98 | 
 99 |         with open(path, "rb") as input:
100 |             tmp = pickle.load(input)
101 |         return cls(**tmp)
102 | 


--------------------------------------------------------------------------------
/prefer/src/molecule_representations_builder.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | import sys
34 | from abc import abstractmethod
35 | 
36 | from pandas import DataFrame
37 | 
38 | 
39 | from prefer.src.molecule_representations import MoleculeRepresentations
40 | 
41 | 
class MoleculeRepresentationsBuilder:
    """
    Interface for objects that turn molecule data into a molecular representation.

    NOTE(review): the class does not derive from ``abc.ABC``, so the
    ``@abstractmethod`` markers below are not enforced at instantiation time.
    """

    @abstractmethod
    def build_representations(self, molecule_data: DataFrame) -> MoleculeRepresentations:
        """
        Build the molecular representation of interest.

        Input:
        - molecule_data: dataframe of the shape
        | ID | Smiles | Property_1 | Property_2 | ... | Property_N |
        ------------------------------------------------------------
        """
        return None

    @abstractmethod
    def remove_nan(self, molecule_data: DataFrame):
        """Hook for handling NaN entries of ``molecule_data``; no default behaviour."""
        return None
57 | 


--------------------------------------------------------------------------------
/prefer/src/vector_molecule_representations.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | #!/usr/bin/env python
 34 | # -*- coding: utf-8 -*-
 35 | import logging
 36 | import sys
 37 | from typing import Optional
 38 | from dataclasses import dataclass
 39 | 
 40 | 
 41 | import pandas as pd
 42 | 
 43 | 
 44 | from prefer.utils.data_utils import convert
 45 | from prefer.utils.splitting_strategies import (
 46 |     random_split,
 47 |     temporal_split,
 48 |     cluster_split,
 49 | )
 50 | from prefer.utils.features_scaling import scale_features
 51 | from prefer.src.molecule_representations import MoleculeRepresentations
 52 | 
 53 | 
 54 | @dataclass
 55 | class VectorMoleculeRepresentations(MoleculeRepresentations):
 56 |     repr_type: str = "vector"
 57 |     scale_type: Optional[str] = None
 58 |     seed: int = 1
 59 |     features_means: Optional[pd.Series] = None
 60 |     features_stds: Optional[pd.Series] = None
 61 | 
 62 |     def split(self, return_indices: bool = False):
 63 |         """
 64 |         method to extract the indices used to split the original dataset and obtain the final dataframes
 65 |         """
 66 |         print("Splitting the dataset according to: " + self.split_type + " split")
 67 | 
 68 |         indices = self.extract_indices()
 69 | 
 70 |         # In case of One shot one could add another split_type that should split alond the tasks and not on the samples
 71 |         if not indices:
 72 |             raise ValueError("Empty indices for splitting dataset")
 73 |         else:
 74 |             if len(indices) == 2:
 75 |                 logging.debug("No Validation Set")
 76 |                 index_train = indices[0]
 77 |                 index_test = indices[1]
 78 |                 Xtrain, ytrain, Xtest, ytest = self.extract_matrices(index_train, index_test)
 79 |                 if self.scale_type:
 80 |                     print("Scaling features according to: " + self.scale_type)
 81 |                     Xtrain, Xtest, self.features_means, self.features_stds = scale_features(
 82 |                         Xtrain, Xtest, scaling_type=self.scale_type
 83 |                     )
 84 |             else:
 85 |                 raise ValueError("Validation set cannot be computed for the moment")
 86 | 
 87 |             if return_indices:
 88 |                 return Xtrain, ytrain, Xtest, ytest, index_train, index_test
 89 |             else:
 90 |                 return Xtrain, ytrain, Xtest, ytest
 91 | 
 92 |     def extract_matrices(self, index_train, index_test):
 93 |         """
 94 |         method used to convert the test/train datasets, obtained by splitting the original dataset, into numpy arrays and store them into Xtrain and Xtest.
 95 |         """
 96 |         if max(index_train) > (self.df.shape[0] - 1) or max(index_test) > (self.df.shape[0] - 1):
 97 |             raise ValueError("ERROR with indices")
 98 | 
 99 |         properties = self.df.columns[["Property" in str(x) for x in self.df.columns.values]].values
100 |         if properties.size == 0:
101 |             properties = self.df.columns[["true_label_" in str(x) for x in self.df.columns.values]].values 
102 |         elif properties.size == 0:
103 |             raise ValueError('Columns with either Property or true_label_ cannot be found in the dataset. Cannot understand where labels are stored.')
104 |         df_train = self.df.iloc[index_train]
105 |         df_train = df_train.reset_index()
106 |         df_test = self.df.iloc[index_test]
107 |         df_test = df_test.reset_index()
108 |         if "molecule_representation" in df_train.columns.values:
109 |             repr_name = "molecule_representation"
110 |         else:
111 |             repr_name = self.representation_name
112 | 
113 |         Xtrain, ytrain = convert(df_train, repr_name, properties)
114 |         Xtest, ytest = convert(df_test, repr_name, properties)
115 | 
116 |         return Xtrain, ytrain, Xtest, ytest
117 | 
118 |     def extract_indices(self):
119 |         """
120 |         method to extract the indices used to split the original dataset. They are computed according to the strategy required by the user.
121 |         """
122 |         if self.split_type == "random":
123 |             return random_split(self.df, self.seed, limit_def=self.limit_def)
124 |         elif self.split_type == "cluster":
125 |             return cluster_split(df=self.df)
126 |         elif self.split_type == "temporal":
127 |             return temporal_split(df=self.df)
128 |         else:
129 |             raise ValueError(
130 |                 f"Split method {self.split_type} is not valid. Allowed options are random, cluster, temporal"
131 |             )
132 | 


--------------------------------------------------------------------------------
/prefer/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/prefer/tests/__init__.py


--------------------------------------------------------------------------------
/prefer/tests/data_for_test/logDPublic.csv:
--------------------------------------------------------------------------------
 1 | ,Unnamed: 0,Unnamed: 0.1,index,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Molecule ChEMBL ID,Smiles,Standard Value
 2 | 0,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
 3 | 1,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
 4 | 2,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
 5 | 3,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
 6 | 4,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
 7 | 5,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
 8 | 6,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
 9 | 7,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
10 | 8,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
11 | 9,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
12 | 10,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
13 | 11,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
14 | 12,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
15 | 13,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
16 | 14,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
17 | 15,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
18 | 16,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
19 | 17,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
20 | 18,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
21 | 19,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
22 | 20,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
23 | 21,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
24 | 22,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
25 | 23,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
26 | 24,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
27 | 25,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
28 | 26,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
29 | 27,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
30 | 28,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
31 | 29,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
32 | 30,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
33 | 31,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
34 | 32,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
35 | 33,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
36 | 34,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
37 | 35,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
38 | 36,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
39 | 37,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
40 | 38,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
41 | 39,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
42 | 40,0,0,0,0,0,CHEMBL1682950,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccsc3)c4ccccc4,2.66
43 | 41,1,1,1,1,1,CHEMBL1682948,CC[C@H](NC(=O)c1c([S+](C)[O-])c(nc2ccccc12)c3ccc(F)cc3)c4ccccc4,3.72
44 | 42,2,2,2,2,2,CHEMBL1682945,CC[C@H](NC(=O)c1c(CC[S+](C)[O-])c(nc2ccccc12)c3ccccc3)c4ccccc4,3.15
45 | 43,4,4,4,4,1,CHEMBL1364723,CC1=C(C)C[S+]([O-])N(S(=O)(=O)c2ccc(Cl)cc2)C1,3.5
46 | 


--------------------------------------------------------------------------------
/prefer/tests/file_for_test/config_PREFER_test_custom_autosklearn.yaml:
--------------------------------------------------------------------------------
 1 | path_to_df: '../data_for_test/logDPublic.csv'
 2 | experiment_name: 'logD'
 3 | id_column_name:  'Molecule ChEMBL ID'
 4 | smiles_column_name:  'Smiles'
 5 | properties_column_name_list: 
 6 |       - 'Standard Value' 
 7 | problem_type: 'regression'
 8 | splitting_strategy: 'random'
 9 | model_instance: 
10 |       - 'resampling_strategy="cv"'
11 |       - 'per_run_time_limit=30'
12 |       - 'metric = "balanced_accuracy"'
13 |       - ' n_jobs = 3'
14 |       - 'ppppp = "uncorrect"' 
15 | 
16 | 


--------------------------------------------------------------------------------
/prefer/tests/file_for_test/logD_desirability_scores.yaml:
--------------------------------------------------------------------------------
 1 | desirability_scores:
 2 |       score1:
 3 |             - x : -1.0
 4 |               y : 0.0
 5 |             - x : 0.0
 6 |               y : 0.2
 7 |             - x : 1.0
 8 |               y : 0.9
 9 |             - x : 2.0
10 |               y : 1.0
11 |             - x : 3.0
12 |               y : 0.5
13 |             - x : 4.0
14 |               y : 0.0
15 | 
16 | 


--------------------------------------------------------------------------------
/prefer/tests/test_autosklearn_customization.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | import sys
34 | import unittest
35 | import yaml
36 | 
37 | 
38 | from autosklearn.regression import AutoSklearnRegressor
39 | from prefer.utils.models_utils import (
40 |     get_autosklearn_customized_model,
41 |     convert_atype_to_btype,
42 |     convert_list_into_dict,
43 | )
44 | 
45 | 
class TestAutosklearnCustomization(unittest.TestCase):
    """Tests for the helpers used to build a customised auto-sklearn model."""

    def test_get_autosklearn_customized_model(self):
        """A regression model built from the YAML test config is an AutoSklearnRegressor."""
        # The path is relative, so it is resolved against the current working directory.
        prefer_args = "./file_for_test/config_PREFER_test_custom_autosklearn.yaml"
        # The context manager closes the file handle even if YAML parsing fails.
        with open(prefer_args) as a_yaml_file:
            parsed_yaml_file = yaml.load(a_yaml_file, Loader=yaml.FullLoader)

        if "model_instance" in parsed_yaml_file:
            model_instance = parsed_yaml_file["model_instance"]
        else:
            model_instance = None

        ml = get_autosklearn_customized_model(
            model_instance=model_instance, model_type="regression", working_directory="."
        )

        self.assertIsInstance(ml, AutoSklearnRegressor)

    def test_convert_atype_to_btype(self):
        """Converting an int using a str as the reference value yields a str."""
        a = 1
        b = "test"
        new_a = convert_atype_to_btype(a, b)
        self.assertIsInstance(new_a, str)

    def test_convert_list_into_dict(self):
        """Entries written with either '=' or ':' as separator become key/value pairs."""
        list_ = ["key1 = value1", "key2 : value2"]
        dict_test = {"key1": "value1", "key2": "value2"}
        dict_ = convert_list_into_dict(list_)
        self.assertEqual(dict_test, dict_)
75 | 
76 | 
# Run the tests of this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
79 | 


--------------------------------------------------------------------------------
/prefer/tests/test_check_input_dataframe.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | import sys
 34 | import unittest
 35 | import numpy as np
 36 | import pandas as pd
 37 | 
 38 | 
 39 | from prefer.utils.check_input_dataframe import (
 40 |     check_dataframe,
 41 |     check_fields,
 42 |     check_fields_types,
 43 |     check_final_structure,
 44 | )
 45 | 
 46 | 
 47 | class TestCheckDataStorage(unittest.TestCase):
 48 |     def setUp(self):
 49 |         """Executed before every test case"""
 50 |         mol_representation_df = pd.DataFrame(
 51 |             np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD")
 52 |         )
 53 |         mol_representation_df.iloc[0, 0] = np.nan
 54 |         mol_representation_df = mol_representation_df.dropna()
 55 |         self.globalvar = mol_representation_df
 56 | 
 57 |     def tearDown(self):
 58 |         """Executed after every test case"""
 59 |         print("\ntearDown executing after the test case. Result:")
 60 | 
 61 |     def test_check_dataframe(self):
 62 |         self.assertFalse(check_dataframe(self.globalvar))
 63 | 
 64 |     def test_check_fields(self):
 65 |         df = pd.DataFrame(
 66 |             np.random.randint(0, 100, size=(100, 4)),
 67 |             columns=list(["Smiles", "ID", "Property_2", "Property_3"]),
 68 |         )
 69 |         self.assertFalse(check_fields(df))
 70 | 
 71 |     def test_check_fields_types(self):
 72 |         df = pd.DataFrame(
 73 |             np.random.randint(0, 100, size=(100, 4)),
 74 |             columns=list(["Smiles", "ID", "Property_2", "Property_3"]),
 75 |         )
 76 |         experiment_name = "experim_1"
 77 |         index_of_separation = 55
 78 |         split_type = "wrong_split_type"
 79 |         mask = False
 80 |         mask_value = -1
 81 |         problem_type = "regression"
 82 |         self.assertFalse(
 83 |             check_fields_types(
 84 |                 df, experiment_name, problem_type, mask, mask_value, split_type, index_of_separation
 85 |             )
 86 |         )
 87 | 
 88 |     def test_check_fields_types_2(self):
 89 |         df = pd.DataFrame(
 90 |             np.random.randint(0, 100, size=(100, 4)),
 91 |             columns=list(["Smiles", "ID", "Property_2", "Property_3"]),
 92 |         )
 93 |         experiment_name = "experim_1"
 94 |         index_of_separation = 55
 95 |         split_type = "temporal"
 96 |         mask = False
 97 |         mask_value = -1
 98 |         problem_type = "regression"
 99 |         self.assertTrue(
100 |             check_fields_types(
101 |                 df, experiment_name, problem_type, mask, mask_value, split_type, index_of_separation
102 |             )
103 |         )
104 | 
105 |     def test_check_final_structure(self):
106 |         df = pd.DataFrame(
107 |             np.random.randint(0, 100, size=(100, 4)),
108 |             columns=list(["Smiles", "ID", "Property_1", "Property_2"]),
109 |         )
110 |         df["Property_1"][0] = np.nan
111 |         self.assertFalse(check_final_structure(df))
112 | 
113 |     def test_check_final_structure_1(self):
114 |         df = pd.DataFrame(
115 |             np.random.randint(0, 100, size=(100, 4)),
116 |             columns=list(["Smiles", "ID", "Property_1", "Property_2"]),
117 |         )
118 |         self.assertTrue(check_final_structure(df))
119 | 
120 | 
# Run the tests of this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
123 | 


--------------------------------------------------------------------------------
/prefer/tests/test_data_preparation.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | import sys
34 | import unittest
35 | 
36 | import numpy as np
37 | import pandas as pd
38 | 
39 | 
40 | from prefer.utils.data_preparation import prepare_data
41 | 
42 | 
class TestDataPreparation(unittest.TestCase):
    """Checks on the argument validation performed by prepare_data."""

    def test_prepare_data(self):
        """An ID column name that is absent from the dataframe raises ValueError with the expected message."""
        column_names = ["Smiles", "ID", "Property_1", "Property_2"]
        random_values = np.random.randint(0, 100, size=(100, 4))
        frame = pd.DataFrame(random_values, columns=column_names)

        with self.assertRaises(ValueError) as raised:
            prepare_data(
                frame,
                id_column_name="invalid_name",
                smiles_column_name="Smiles",
                properties_column_name_list=["Property_1", "Property_2"],
            )

        expected_message = "ERROR: columns name not found in the dataframe"
        self.assertEqual(expected_message, str(raised.exception))
57 | 
58 | 
# Run the tests of this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
61 | 


--------------------------------------------------------------------------------
/prefer/tests/test_filtering.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | import sys
34 | import unittest
35 | 
36 | import numpy as np
37 | import pandas as pd
38 | 
39 | 
40 | from prefer.utils.filtering import filter_and_normalize_mols, find_nan
41 | 
42 | 
class TestFiltering(unittest.TestCase):
    """Error-path tests for find_nan and filter_and_normalize_mols."""

    def test_find_nan_1(self):
        """find_nan raises ValueError when given an empty dataframe."""
        empty_frame = pd.DataFrame()
        requested = ["Fingerprints", "_2DDescriptors", "Embedded_cddd"]
        self.assertRaises(ValueError, find_nan, empty_frame, requested)

    def test_find_nan_2(self):
        """find_nan raises ValueError when the requested representation is "invalid"."""
        column_names = ["Smiles", "ID", "Fingerprints", "_2DD"]
        frame = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=column_names)
        requested = ["invalid"]
        self.assertRaises(ValueError, find_nan, frame, requested)

    def test_filter_salt(self):
        """
        filter_and_normalize_mols raises ValueError for a dataframe with columns
        ID, Fingerprints and _2DD (presumably because no Smiles column exists - TODO confirm).
        """
        column_names = ["ID", "Fingerprints", "_2DD"]
        frame = pd.DataFrame(np.random.randint(0, 100, size=(100, 3)), columns=column_names)
        self.assertRaises(ValueError, filter_and_normalize_mols, frame)
67 | 
68 | 
# Run the tests of this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
71 | 


--------------------------------------------------------------------------------
/prefer/tests/test_helpers.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | import sys
 34 | import unittest
 35 | import pandas as pd
 36 | import numpy as np
 37 | 
 38 | 
 39 | from prefer.utils.models_utils import output_dataframe_preparation
 40 | 
 41 | 
 42 | class TestHelpers(unittest.TestCase):
 43 |     def test_output_dataframe_preparation_singleTask(self):
 44 |         # list of lists
 45 |         data = [
 46 |             ["a1", "b1", "c1"],
 47 |             ["a2", "b2", "c2"],
 48 |             ["a3", "b3", "c3"],
 49 |             ["a4", "b4", "c4"],
 50 |             ["a5", "b5", "c5"],
 51 |         ]
 52 | 
 53 |         df = pd.DataFrame(data)
 54 |         df["Property_1"] = [
 55 |             "test_label1",
 56 |             "train_label1",
 57 |             "test_label2",
 58 |             "train_label2",
 59 |             "train_label3",
 60 |         ]
 61 |         index_train = [1, 3, 4]
 62 |         index_test = [0, 2]
 63 |         predictions_train = ["train_val1", "train_val2", "train_val3"]
 64 |         predictions_test = ["test_val1", "test_val2"]
 65 |         expected_df = df.copy()
 66 |         expected_df["model_predictions_property_1"] = [
 67 |             "test_val1",
 68 |             "train_val1",
 69 |             "test_val2",
 70 |             "train_val2",
 71 |             "train_val3",
 72 |         ]
 73 |         expected_df["is_train"] = [False, True, False, True, True]
 74 | 
 75 |         output_df = output_dataframe_preparation(
 76 |             df,
 77 |             index_train=index_train,
 78 |             index_test=index_test,
 79 |             predictions_train=predictions_train,
 80 |             predictions_test=predictions_test,
 81 |         )
 82 | 
 83 |         all_collect = []
 84 |         for col in output_df.columns:
 85 |             all_collect.append(all(output_df[0].values == expected_df[0].values))
 86 |         all_collect.append(all(output_df.columns.values == expected_df.columns.values))
 87 | 
 88 |         self.assertTrue(all(all_collect))
 89 | 
 90 |     def test_output_dataframe_preparation_multiTask(self):
 91 |         # list of lists
 92 |         data = [
 93 |             ["a1", "b1", "c1"],
 94 |             ["a2", "b2", "c2"],
 95 |             ["a3", "b3", "c3"],
 96 |             ["a4", "b4", "c4"],
 97 |             ["a5", "b5", "c5"],
 98 |         ]
 99 | 
100 |         df = pd.DataFrame(data)
101 |         df["Property_1"] = [
102 |             "test_label1",
103 |             "train_label1",
104 |             "test_label2",
105 |             "train_label2",
106 |             "train_label3",
107 |         ]
108 | 
109 |         df["Property_2"] = [
110 |             "test_label1",
111 |             "train_label1",
112 |             "test_label2",
113 |             "train_label2",
114 |             "train_label3",
115 |         ]
116 |         index_train = [1, 3, 4]
117 |         index_test = [0, 2]
118 |         predictions_train = np.array(
119 |             [["train_val1", "train_val2", "train_val3"], ["train_val1", "train_val2", "train_val3"]]
120 |         )
121 |         predictions_train = predictions_train.T
122 |         predictions_test = np.array([["test_val1", "test_val2"], ["test_val1", "test_val2"]])
123 |         predictions_test = predictions_test.T
124 |         expected_df = df.copy()
125 |         expected_df["model_predictions_property_1"] = np.array(
126 |             ["test_val1", "train_val1", "test_val2", "train_val2", "train_val3"]
127 |         )
128 |         expected_df["model_predictions_property_2"] = np.array(
129 |             ["test_val1", "train_val1", "test_val2", "train_val2", "train_val3"]
130 |         )
131 |         expected_df["is_train"] = [False, True, False, True, True]
132 | 
133 |         output_df = output_dataframe_preparation(
134 |             df,
135 |             index_train=index_train,
136 |             index_test=index_test,
137 |             predictions_train=predictions_train,
138 |             predictions_test=predictions_test,
139 |         )
140 | 
141 |         all_collect = []
142 |         for col in output_df.columns:
143 |             all_collect.append(all(output_df[0].values == expected_df[0].values))
144 |         all_collect.append(all(output_df.columns.values == expected_df.columns.values))
145 | 
146 |         self.assertTrue(all(all_collect))
147 | 
148 | 
149 | if __name__ == "__main__":
150 |     unittest.main()
151 | 


--------------------------------------------------------------------------------
/prefer/tests/test_prefer_model_wrapper.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | import sys
34 | import unittest
35 | 
36 | 
37 | from prefer.src.prefer_model_wrapper import PreferModelWrapper
38 | from sklearn.linear_model import LinearRegression
39 | import numpy as np
40 | 
41 | 
class TestPreferModelWrapper(unittest.TestCase):
    """Smoke test for ``PreferModelWrapper.predict`` wrapped around a constant dummy regressor."""

    def test_prefer_model_wrapper(self):
        fingerprint_length = 2048  # Default value in `get_fingerprints`

        # Dummy model predicts 3.0 everywhere: all-zero inputs fitted against a constant 3.
        X = np.zeros((2, fingerprint_length))
        y = np.dot(X, np.ones(fingerprint_length)) + 3
        model = LinearRegression().fit(X, y)

        # Second point of the desirability curve passed in the metadata below.
        # NOTE(review): the original comment stated that un-scoreable molecules receive
        # the worst possible score, yet the expectation below is 3.0 for the unparseable
        # SMILES as well - confirm which behaviour is intended.
        worst_score = 0.32

        wrapper = PreferModelWrapper(
            model=model,
            metadata={
                "problem_type": "regression",
                "best_model_representation": "FINGERPRINTS",
                "friendly_model_name": "jan",
                "desirability_scores": {"junk": [{"x": 0, "y": 1.0}, {"x": worst_score, "y": 0.0}]},
                "rep_model_id": "the_rep_model",
            },
        )
        scores = wrapper.predict(
            ["CC", "CCC", "unparseable SMILES"], is_smiles_func=True, rep_model_id=None
        )

        # Convert explicitly so the check works for list and array return values alike,
        # and compare with a tolerance instead of exact float equality.
        scores = np.asarray(scores, dtype=float)
        self.assertEqual(scores.shape, (3,))
        np.testing.assert_allclose(scores, [3.0, 3.0, 3.0])


if __name__ == "__main__":
    unittest.main()
74 | 


--------------------------------------------------------------------------------
/prefer/tests/test_scripts.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | import sys
 34 | import unittest
 35 | import os
 36 | import ast
 37 | 
 38 | import numpy as np
 39 | import json
 40 | import yaml
 41 | import pandas as pd
 42 | 
 43 | 
 44 | from prefer.scripts.get_representations import compute_representations_from_args
 45 | from prefer.scripts.run_PREFER import run_PREFER_from_args
 46 | from prefer.scripts.model_wrapper import store_metadata
 47 | 
 48 | 
 49 | class TestScripts(unittest.TestCase):
 50 |     def test_get_representations(self):
 51 |         path_to_df = "./data_for_test/logDPublic.csv"
 52 |         representation_to_compute = "FINGERPRINTS"
 53 |         path_to_model = None
 54 |         output_dir = "./representations_dir/"
 55 |         experiment_name = "test_logDPublic"
 56 |         id_column_name = "Molecule ChEMBL ID"
 57 |         smiles_column_name = "Smiles"
 58 |         splitting_strategy = "random"
 59 |         temporal_info_column_name = None
 60 |         properties_column_name_list = ["Standard Value"]
 61 |         try:
 62 |             os.makedirs(output_dir, exist_ok=True)
 63 |             print("Directory '%s' created successfully" % output_dir)
 64 |         except OSError as error:
 65 |             print("Directory '%s' can not be created")
 66 | 
 67 |         output_dir_new = compute_representations_from_args(
 68 |             path_to_df,
 69 |             representation_to_compute,
 70 |             path_to_model,
 71 |             output_dir,
 72 |             experiment_name,
 73 |             id_column_name,
 74 |             smiles_column_name,
 75 |             splitting_strategy,
 76 |             temporal_info_column_name,
 77 |             properties_column_name_list,
 78 |         )
 79 |         self.assertTrue(output_dir == output_dir_new)
 80 | 
 81 |     def test_run_PREFER(self):
 82 |         problem_type = "regression"
 83 |         representation_name = "FINGERPRINTS"
 84 |         repr_dir = "./representations_dir/"
 85 |         final_folder_path = "./output_dir/"
 86 |         experiment_name = "test_logDPublic"
 87 |         try:
 88 |             os.makedirs(final_folder_path, exist_ok=True)
 89 |             print("Directory '%s' created successfully" % final_folder_path)
 90 |         except OSError as error:
 91 |             print("Directory '%s' can not be created")
 92 | 
 93 |         run_PREFER_from_args(
 94 |             problem_type, representation_name, repr_dir, final_folder_path, experiment_name,
 95 |         )
 96 | 
 97 |     def test_store_metadata(self):
 98 |         path_to_df = "./data_for_test/logDPublic.csv"
 99 |         path_to_model = None
100 |         problem_type = "regression"
101 |         experiment_name = "test_logDPublic_wrapper"
102 |         id_column_name = "Molecule ChEMBL ID"
103 |         smiles_column_name = "Smiles"
104 |         properties_column_name_list = ["Standard Value"]
105 |         representation_name = "FINGERPRINTS"
106 |         final_folder_path = "./wrappers_dir/"
107 |         try:
108 |             os.makedirs(final_folder_path, exist_ok=True)
109 |             print("Directory '%s' created successfully" % final_folder_path)
110 |         except OSError as error:
111 |             print("Directory '%s' can not be created")
112 | 
113 |         property_model_folder_path = "./output_dir/"
114 |         repr_dir = "./representations_dir/"
115 |         with open("./file_for_test/logD_desirability_scores.yaml") as file:
116 |             try:
117 |                 parsed_yaml_file = yaml.safe_load(file)
118 |             except yaml.YAMLError as exception:
119 |                 print(exception)
120 | 
121 |             desirability_scores = json.dumps(parsed_yaml_file["desirability_scores"])
122 |             is_str = isinstance(desirability_scores, str)
123 |             if is_str:
124 |                 desirability_scores = ast.literal_eval(desirability_scores)
125 |             store_metadata(
126 |                 path_to_df=path_to_df,
127 |                 path_to_model=path_to_model,
128 |                 problem_type=problem_type,
129 |                 experiment_name=experiment_name,
130 |                 id_column_name=id_column_name,
131 |                 smiles_column_name=smiles_column_name,
132 |                 properties_column_name_list=properties_column_name_list,
133 |                 representation_name=representation_name,
134 |                 final_folder_path=final_folder_path,
135 |                 property_model_folder_path=property_model_folder_path,
136 |                 repr_dir=repr_dir,
137 |                 desirability_scores=desirability_scores,
138 |             )
139 | 
140 | 
141 | if __name__ == "__main__":
142 |     unittest.main()
143 | 


--------------------------------------------------------------------------------
/prefer/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rdkit/PREFER/16986ca183b38b94f9b3b62b441ec40671b8cf8a/prefer/utils/__init__.py


--------------------------------------------------------------------------------
/prefer/utils/check_input_dataframe.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | """
34 | Helper functions to check the validity of the input dataframe.
35 | """
36 | import logging
37 | 
38 | import pandas as pd
39 | 
40 | 
def check_dataframe(df):
    """
    Check whether the last row label of the dataframe equals the number of rows minus one.

    A message is printed when the check fails. Only the last label is inspected.
    Returns the outcome of the comparison.
    """
    expected_last_label = df.shape[0] - 1
    indices_ok = df.index[-1] == expected_last_label
    if not indices_ok:
        print("Indices of the dataframe are not correct")
    return indices_ok
48 | 
49 | 
def check_fields(df):
    """
    Check that the dataframe exposes the expected column names.

    The expected columns are "Smiles", "ID" and one "Property_<i>" column
    (numbered from 1) for every column whose name contains "Property".
    Returns True when all of them are present, False otherwise.
    """
    column_names = list(df.columns.values)
    property_count = sum("Property" in str(name) for name in column_names)

    required = ["Smiles", "ID"]
    for position in range(1, property_count + 1):
        required.append("Property_" + str(position))

    for name in required:
        if name not in column_names:
            return False
    return True
57 | 
58 | 
def check_fields_types(
    df, experiment_name, problem_type, mask, mask_value, split_type, index_of_separation
):
    """
    Check that every argument has the expected type and, where relevant, an allowed value.

    Expected: ``df`` a pandas DataFrame; ``experiment_name``, ``problem_type`` and
    ``split_type`` strings; ``split_type`` one of "random", "cluster", "temporal";
    ``problem_type`` one of "regression", "classification"; ``index_of_separation``
    an int; ``mask`` a bool; ``mask_value`` an int, float or complex number.
    Returns True only when all the conditions hold.
    """
    allowed_split_types = ["random", "cluster", "temporal"]
    allowed_problem_types = ["regression", "classification"]

    # All the checks are evaluated up front, then combined.
    df_ok = isinstance(df, pd.DataFrame)
    experiment_name_ok = isinstance(experiment_name, str)
    problem_type_is_str = isinstance(problem_type, str)
    split_type_is_str = isinstance(split_type, str)
    split_type_allowed = split_type in allowed_split_types
    problem_type_allowed = problem_type in allowed_problem_types
    index_ok = isinstance(index_of_separation, int)
    mask_ok = isinstance(mask, bool)
    mask_value_ok = isinstance(mask_value, (int, float, complex))

    return all(
        (
            df_ok,
            experiment_name_ok,
            problem_type_is_str,
            split_type_is_str,
            split_type_allowed,
            problem_type_allowed,
            index_ok,
            mask_ok,
            mask_value_ok,
        )
    )
78 | 
79 | 
def check_final_structure(df):
    """
    Check that no label column of the dataframe contains missing values.

    Label columns are those whose name contains "Property". An error is logged and
    False is returned as soon as one of them has a null entry; True otherwise.
    """
    label_columns = [name for name in df.columns.values if "Property" in str(name)]
    if any(df[name].isnull().sum() > 0 for name in label_columns):
        logging.error(
            "ERROR --> some labels are NaN. Please check your dataframes before running eval_.BenchMoleProp()"
        )
        return False
    return True
92 | 


--------------------------------------------------------------------------------
/prefer/utils/data_preparation.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | #!/usr/bin/env python
 34 | # -*- coding: utf-8 -*-
 35 | 
 36 | import sys
 37 | 
 38 | 
 39 | from prefer.utils.filtering import filter_and_normalize_mols
 40 | import pandas as pd
 41 | 
 42 | 
 43 | def prepare_data(
 44 |     df,
 45 |     id_column_name: str,
 46 |     smiles_column_name: str,
 47 |     properties_column_name_list: list,
 48 |     temporal_info_column_name: str = None,
 49 |     filter_flag: bool = True,
 50 | ):
 51 | 
 52 |     """
 53 |     Function to prepare datasets.
 54 |     The inputs are:
 55 |     df: dataframe to be manipulated
 56 |     id_column_name: string of the name of the column where the ID is stored
 57 |     smiles_column_name: string of the name of the column where the smile representation is stored
 58 |     properties_column_name_list: list of the strings/names of the property/ies to evaluate
 59 | 
 60 |     """
 61 |     if not isinstance(properties_column_name_list, list):
 62 |         raise ValueError('properties_column_name_list should be a list of names of the selected labels')
 63 |     # Evaluate whether unique labels
 64 |     if len(properties_column_name_list) > len(set(properties_column_name_list)):
 65 |         raise ValueError('Duplicates in the labels list cannot be handled by PREFER - please provide unique labels names')
 66 |         
 67 |     # Check if consistent
 68 |     check = list()
 69 |     check.append(all([x in df.columns.values for x in properties_column_name_list]))
 70 |     check.append(id_column_name in df.columns.values)
 71 |     check.append(smiles_column_name in df.columns.values)
 72 |     if temporal_info_column_name:
 73 |         check.append(temporal_info_column_name in df.columns.values)
 74 |     if not all(check):
 75 |         raise ValueError("ERROR: columns name not found in the dataframe")
 76 | 
 77 |     cols = list()
 78 |     df.rename(columns={id_column_name: "ID"}, inplace=True)
 79 |     cols.append("ID")
 80 |     df.rename(columns={smiles_column_name: "Smiles"}, inplace=True)
 81 |     cols.append("Smiles")
 82 | 
 83 |     if temporal_info_column_name:
 84 |         df[temporal_info_column_name] = pd.to_datetime(df[temporal_info_column_name])
 85 |         df.rename(columns={temporal_info_column_name: "Time"}, inplace=True)
 86 |         cols.append("Time")
 87 | 
 88 |     # TO DO extend AutoSklearn in the case of sparsity of the label matrx. For now we need to remove nans
 89 |     print(
 90 |         "WARNING: Autosklearn does not handle for now label matrix sparsity, thus nan values will be removed both for single task and multitasking cases"
 91 |     )
 92 |     for index, _ in enumerate(properties_column_name_list):
 93 |         df = df[df[properties_column_name_list[index]].notna()]
 94 |         df = df.reset_index(drop=True)
 95 | 
 96 |     for index, properties_column_name in enumerate(properties_column_name_list):
 97 |         df.rename(columns={properties_column_name: "Property_" + str(index + 1)}, inplace=True)
 98 |         cols.append("Property_" + str(index + 1))
 99 |     if filter_flag:
100 |         return filter_and_normalize_mols(df[cols])
101 |     else:
102 |         return df[cols]
103 | 


--------------------------------------------------------------------------------
/prefer/utils/features_scaling.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | #!/usr/bin/env python
 34 | # -*- coding: utf-8 -*-
 35 | import pandas as pd
 36 | import numpy as np
 37 | 
 38 | 
 39 | def scale_features(Xtrain, Xtest, scaling_type="standardization"):
 40 |     """
 41 |     method used to normalize features
 42 |     Xtrain and Xtest are 2D numpy arrays
 43 |     scaling_type is a tring that can be standardization or normalization
 44 |     """
 45 |     print("Scaling features")
 46 |     Xtrain_df = pd.DataFrame(Xtrain)
 47 |     Xtest_df = pd.DataFrame(Xtest)
 48 |     # compute mean and std of the train set
 49 |     means_ = Xtrain_df.mean()  # only from the training set
 50 |     stds_ = Xtrain_df.std()
 51 | 
 52 |     Xtrain_scaled = []
 53 |     for index, row in Xtrain_df.iterrows():
 54 |         scaled_row = apply_scaling(
 55 |             np.array(row), scaling_type=scaling_type, means=means_, stds=stds_
 56 |         )
 57 |         Xtrain_scaled.append(scaled_row)
 58 |     Xtrain_scaled = np.array(Xtrain_scaled)
 59 |     Xtrain_scaled_df = pd.DataFrame(Xtrain_scaled)
 60 | 
 61 |     Xtest_scaled = []
 62 |     for index, row in Xtest_df.iterrows():
 63 |         scaled_row = apply_scaling(
 64 |             np.array(row), scaling_type=scaling_type, means=means_, stds=stds_
 65 |         )
 66 |         Xtest_scaled.append(scaled_row)
 67 |     Xtest_scaled = np.array(Xtest_scaled)
 68 | 
 69 |     if scaling_type == "standardization":
 70 |         means_sc = Xtrain_scaled_df.mean()  # only from the training set
 71 |         stds_sc = Xtrain_scaled_df.std()
 72 | 
 73 |         if not (np.array(round(stds_sc, 1)) == 1.0).all():
 74 |             raise ValueError(
 75 |                 "ERROR: when standardizing matrix; not all stds of the scaled matrix are 1.0"
 76 |             )
 77 |         if not (np.array(round(means_sc, 1)) == 0.0).all():
 78 |             raise ValueError(
 79 |                 "ERROR: when standardizing matrix; not all means of the scaled matrix are 0.0"
 80 |             )
 81 | 
 82 |     if scaling_type == "normalization":
 83 |         means_sc = Xtrain_scaled_df.mean()  # only from the training set
 84 | 
 85 |         if not (np.array(round(means_sc, 1)) == 0.0).all():
 86 |             raise ValueError(
 87 |                 "ERROR: when normalizing matrix; not all means of the scaled matrix are 0.0"
 88 |             )
 89 | 
 90 |     return Xtrain_scaled, Xtest_scaled, means_, stds_
 91 | 
 92 | 
 93 | def apply_scaling(features_vect, scaling_type="standardization", means=None, stds=None):
 94 |     """
 95 |     function to apply a specific scaling given means and stds.
 96 |     Inputs:
 97 |         - features_vect, must be a numpy array of the features related to a single sample
 98 |         - scaling_type, is a string that can be standardization or normalization
 99 |         - means, is a numpy array of the means (one for each feature)
100 |         - stds, is a numpy array of the standard deviation values (one for each feature)
101 |     Output:
102 |         - numpy array of scaled features
103 |     """
104 | 
105 |     # making sure the features_vect is numpy arrays:
106 |     features_vect = np.array(features_vect)
107 |     array_sum = np.sum(features_vect)
108 |     array_has_nan = not np.isfinite(array_sum)
109 |     # check if features_vect contains nan
110 |     if array_has_nan:
111 |         raise ValueError("features_vect provided to the apply_scaling contains nan - cannot scale.")
112 | 
113 |     if means is not None:
114 |         # making sure the means is numpy arrays:
115 |         means = np.array(means)
116 | 
117 |         # check the dimension
118 |         if means.shape[0] != features_vect.shape[0]:
119 |             raise ValueError(
120 |                 f"ERROR: features_vect dimension ({features_vect.shape[0]}) does not match with means dimension ({means.shape[0]})"
121 |             )
122 | 
123 |         if stds is not None:
124 |             # making sure the stds is numpy arrays:
125 |             stds = np.array(stds)
126 | 
127 |             # check the dimension
128 |             if stds.shape[0] != features_vect.shape[0]:
129 |                 raise ValueError(
130 |                     "ERROR: features_vect dimension does not match with stds dimension"
131 |                 )
132 | 
133 |             if scaling_type == "standardization":
134 | 
135 |                 features_vect = (features_vect - means) / stds
136 |                 # Replace possible inf with nans
137 |                 features_vect[features_vect == -np.inf] = np.nan
138 |                 features_vect[features_vect == np.inf] = np.nan
139 |                 # if zeros in stds we should also have nans in  features_vect
140 |                 if any(np.isnan(features_vect)) == (0 in stds):
141 |                     if any(np.isnan(features_vect)):
142 |                         # important check in case of zeros in stds
143 |                         (stds_zeros,) = np.where(
144 |                             stds == 0
145 |                         )  # zeros in stds should correspond to nans in standardize array
146 |                         features_vect_nans = [x[0] for x in np.argwhere(np.isnan(features_vect))]
147 | 
148 |                         if list(stds_zeros) == list(features_vect_nans):
149 |                             return list(features_vect[~np.isnan(features_vect)])
150 |                         else:
151 |                             raise ValueError(
152 |                                 "ERROR: there is a problem with the standardization: no match between zeros in the stds vector and nans in the standardize features vector"
153 |                             )
154 |                     else:  # no zeros in stds and no nans in feature_vect
155 |                         return list(features_vect)
156 |                 else:
157 |                     raise ValueError(
158 |                         "ERROR: found nans in standardize features_vect but no zeros in stds or viceversa"
159 |                     )
160 | 
161 |             elif scaling_type == "normalization":
162 |                 features_vect = features_vect - means
163 |                 return list(features_vect)
164 |             else:
165 |                 raise ValueError(
166 |                     "ERROR: only standardization or normalization are possible scaling_type"
167 |                 )
168 |         else:
169 |             if scaling_type == "standardization":
170 |                 raise ValueError(
171 |                     "ERROR: only normalization is possible since stds vector is not provided"
172 |                 )
173 | 
174 |             elif scaling_type == "normalization":
175 |                 features_vect = features_vect - means
176 |                 return list(features_vect)
177 |             else:
178 |                 raise ValueError(
179 |                     "ERROR: only standardization or normalization are possible scaling_type"
180 |                 )
181 | 
182 |     else:
183 |         raise ValueError("ERROR: please provide a means vector (one value for each feature)")
184 | 


--------------------------------------------------------------------------------
/prefer/utils/filtering.py:
--------------------------------------------------------------------------------
  1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
  2 | #  All rights reserved.
  3 | # 
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions are
  6 | # met: 
  7 | #
  8 | #     * Redistributions of source code must retain the above copyright 
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above
 11 | #       copyright notice, this list of conditions and the following 
 12 | #       disclaimer in the documentation and/or other materials provided 
 13 | #       with the distribution.
 14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
 15 | #       nor the names of its contributors may be used to endorse or promote 
 16 | #       products derived from this software without specific prior written permission.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #
 30 | # Created by Jessica Lanini, January 2023
 31 | 
 32 | 
 33 | import logging
 34 | import sys
 35 | from pathlib import Path
 36 | 
 37 | import numpy as np
 38 | from rdkit import Chem
 39 | from rdkit import RDLogger
 40 | from rdkit.Chem import SaltRemover
 41 | from rdkit.Chem.MolStandardize import rdMolStandardize
 42 | 
 43 | directory_path = Path(__file__).parents[1]
 44 | SALTS_FILE = "docs/SaltsMod.txt"
 45 | 
 46 | 
 47 | from prefer.utils.check_input_dataframe import check_dataframe
 48 | from prefer.utils.data_utils import extract_representations
 49 | 
 50 | 
 51 | class MissingSmiles(ValueError):
 52 |     pass
 53 | 
 54 | 
 55 | class EmptyDataframe(ValueError):
 56 |     pass
 57 | 
 58 | 
 59 | class MissingRepresentation(ValueError):
 60 |     pass
 61 | 
 62 | 
 63 | def filter_and_normalize_smiles(smiles):
 64 | 
 65 |     uncharger = rdMolStandardize.Uncharger()
 66 |     # format is smarts, then flag to allow one to pick and choose which to include
 67 |     forbidden_elements = Chem.MolFromSmarts(
 68 |         "[Cu,Sb,As,Sn,Pt,Te,Pd,Lu,Ge,Zn,Cu,Co,Ni,Fe,Hg,Zr,Mn,Ag,Bi,Cd,Cr,Ti,Al,Au,Mo,V,Mg,In,Ga,Pb,Ca,W]"
 69 |     )
 70 |     remover = SaltRemover.SaltRemover(defnFilename=directory_path / SALTS_FILE)
 71 | 
 72 |     try:
 73 |         mol = Chem.MolFromSmiles(str(smiles))
 74 |     except Exception as e:
 75 |         logging.error(f"ERROR: Invalid SMILES {smiles}.{e}")
 76 | 
 77 |     if mol is None or mol.GetNumAtoms() > 100:
 78 |         return None
 79 | 
 80 |     res, deleted = remover.StripMolWithDeleted(mol)
 81 |     # add a flag in case you want to remove or keep the salts - Default we keep it without the salt
 82 |     if len(deleted) != 0:
 83 |         return None
 84 | 
 85 |     mol = uncharger.uncharge(mol)
 86 |     if mol is None:
 87 |         return None
 88 | 
 89 |     if Chem.SanitizeMol(mol, catchErrors=True):  # maybe the molecule has changed
 90 |         return None
 91 | 
 92 |     if mol.HasSubstructMatch(forbidden_elements):
 93 |         return None
 94 | 
 95 |     smi = Chem.MolToSmiles(mol, isomericSmiles=False)
 96 |     if len(smi) < 2 or "*" in smi or "R" in smi:
 97 |         return None
 98 | 
 99 |     return smi
100 | 
101 | 
def filter_and_normalize_mols(df):
    """
    Filter the raw dataset at the beginning of the benchmarking pipeline.

    Every entry of the ``Smiles`` column is parsed with RDKit. A row is dropped when
    the SMILES cannot be parsed, ``GetNumAtoms()`` reports more than 100 atoms, the
    salt remover strips at least one fragment, uncharging/sanitization fails, a
    forbidden element is present, or the normalized SMILES is shorter than 2
    characters or contains ``*`` or ``R``. For the rows that are kept, the
    ``Smiles`` value is replaced (in the input dataframe) by the normalized,
    non-isomeric SMILES.

    Rows are addressed by position, so the function also works when ``df`` does not
    have a default ``0..n-1`` index.

    Args:
        df: dataframe with a ``Smiles`` column. Its ``Smiles`` values are updated in place.

    Returns:
        The filtered dataframe with a fresh ``0..n-1`` index.

    Raises:
        MissingSmiles: if ``df`` has no ``Smiles`` column.
        ValueError: if ``check_dataframe`` rejects the filtered dataframe.
    """

    # Turn off the warning
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.CRITICAL)

    # Check if 'Smiles' columns is in the current dataframe
    if "Smiles" not in df.columns.values:
        raise MissingSmiles("ERROR: Smiles column not in the dataframe")

    uncharger = rdMolStandardize.Uncharger()
    # format is smarts, then flag to allow one to pick and choose which to include
    forbidden_elements = Chem.MolFromSmarts(
        "[Cu,Sb,As,Sn,Pt,Te,Pd,Lu,Ge,Zn,Cu,Co,Ni,Fe,Hg,Zr,Mn,Ag,Bi,Cd,Cr,Ti,Al,Au,Mo,V,Mg,In,Ga,Pb,Ca,W]"
    )
    # The salt definitions do not change between molecules: build the remover once
    remover = SaltRemover.SaltRemover(defnFilename=directory_path / SALTS_FILE)

    # Column position used for positional writes (avoids chained assignment)
    smiles_col = df.columns.get_loc("Smiles")
    # Positions (not index labels) of the rows to be dropped
    rows_to_drop = []

    for position, smile in enumerate(df["Smiles"]):

        mol = Chem.MolFromSmiles(str(smile))

        if mol is None:
            logging.warning("WARNING: mol is None for smile: " + str(smile))
            rows_to_drop.append(position)
            continue
        if mol.GetNumAtoms() > 100:
            rows_to_drop.append(position)
            continue

        # Molecules containing a known salt are discarded, so only the list of
        # deleted fragments is inspected.
        _, deleted = remover.StripMolWithDeleted(mol)
        if len(deleted) != 0:
            rows_to_drop.append(position)
            continue

        mol = uncharger.uncharge(mol)
        if mol is None:
            rows_to_drop.append(position)
            continue

        # With catchErrors=True a truthy result signals a sanitization failure
        if Chem.SanitizeMol(mol, catchErrors=True):  # maybe the molecule has changed
            rows_to_drop.append(position)
            continue

        if mol.HasSubstructMatch(forbidden_elements):
            rows_to_drop.append(position)
            continue

        smi = Chem.MolToSmiles(mol, isomericSmiles=False)
        if len(smi) < 2 or "*" in smi or "R" in smi:
            rows_to_drop.append(position)
            continue

        # Update the smile of the current row with its normalized form
        df.iloc[position, smiles_col] = smi

    n_rows = df.shape[0]
    # Guard against an empty input, which would otherwise divide by zero
    dropped_percentage = (len(rows_to_drop) * 100) / n_rows if n_rows else 0.0
    print("Percentage of dropped molecule: " + str(dropped_percentage))

    dropped_positions = set(rows_to_drop)
    kept_positions = [pos for pos in range(n_rows) if pos not in dropped_positions]
    filtered_df = df.iloc[kept_positions].reset_index(drop=True)
    if check_dataframe(
        filtered_df
    ):  # check whether the indices are all correct or something went wrong
        df = filtered_df
    else:
        raise ValueError(
            "ERROR: Problem with inidices. Maybe a reset_index() is needed. The dataset will not be updated."
        )
    return df
174 | 
175 | 
def find_nan(df, representation_to_evaluate=None, drop=False):
    """
    Look for nan values in the molecular representations stored in ``df``.

    For every representation column, each cell is checked with ``np.isnan``. If
    ``drop`` is True, the rows containing nan values are removed from ``df`` in
    place and its index is reset to ``0..n-1`` (the old index is discarded), so
    that the rows dropped for one representation are also gone when the next
    representation is inspected. If ``drop`` is False the positions are only logged.

    Args:
        df: dataframe holding one column per representation.
        representation_to_evaluate: names of the representation columns to check.
            When empty or None they are obtained from ``extract_representations(df)``.
        drop: whether rows with nan values should be removed from ``df``.

    Returns:
        None. ``df`` is modified in place when ``drop`` is True.

    Raises:
        EmptyDataframe: if ``df`` is empty.
        MissingRepresentation: if a requested representation is not a column of ``df``.
    """

    logging.info("filter nan values in the molecular representations")

    # Check if empty
    if df.empty:
        raise EmptyDataframe("ERROR: df is empty")

    # Extract representation_to_evaluate if empty
    if not representation_to_evaluate:
        representation_to_evaluate = extract_representations(df)
    elif not all(repr_ in df.columns.values for repr_ in representation_to_evaluate):
        # Check if the representation is in the dataframe
        raise MissingRepresentation(
            "ERROR: One or more representations are not in the dataset stored. HINT: Run Molecules_Representations to compute the representations needed"
        )

    # find nan
    for representation in representation_to_evaluate:
        logging.info("For the representation " + representation)
        # Positions (not index labels) of the rows whose representation contains nan
        nan_positions = [
            position
            for position, value in enumerate(df[representation])
            if bool(np.isnan(value).any())
        ]

        if not nan_positions:
            logging.info(
                "No molecules need to be dropped for " + representation + " representation"
            )
        elif drop:
            logging.info("Drop Molecules at positions:" + str(nan_positions))
            # Make labels equal to positions first, so the positional list can be
            # dropped safely whatever index the caller's dataframe carries.
            df.reset_index(drop=True, inplace=True)
            df.drop(index=nan_positions, inplace=True)
            # Keep working on the caller's object (no rebinding) and restore a
            # contiguous index for the next representation.
            df.reset_index(drop=True, inplace=True)
        else:
            logging.info(
                "Found nan values at positions:"
                + str(nan_positions)
                + " for "
                + representation
                + " representation (drop is False, nothing removed)"
            )
211 | 


--------------------------------------------------------------------------------
/prefer/utils/mapping.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | import sys
34 | 
35 | 
36 | from pandas import DataFrame
37 | 
38 | 
39 | from prefer.molecule_representations.descriptors2D_representations_builder import (
40 |     Descriptors2DRepresentationsBuilder,
41 | )
42 | from prefer.molecule_representations.fingerprints_representations_builder import (
43 |     FingerprintsRepresentationsBuilder,
44 | )
45 | from prefer.molecule_representations.model_representations_builder import (
46 |     ModelRepresentationsBuilder,
47 | )
48 | 
49 | 
def mapping_representations(
    representation_name: str,
    df: DataFrame,
    output_dir: str,
    path_to_model: str = "",
    path_to_df: str = "",
    experiment_name: str = "",
    split_type: str = "random",
):  # obj should be the object of the class for generic model
    """
    Build the molecular representation called ``representation_name`` and save it.

    ``"DESCRIPTORS2D"`` and ``"FINGERPRINTS"`` are handled by their dedicated
    builders; every other name is delegated to ``ModelRepresentationsBuilder``,
    which receives ``path_to_model`` and the representation name.

    The representations are built from ``df`` using ``split_type`` and then saved
    through their ``save`` method, which receives ``output_dir``,
    ``representation_name``, ``experiment_name`` and ``path_to_df``.

    The function does not return anything.
    """

    if representation_name not in ("DESCRIPTORS2D", "FINGERPRINTS"):
        # Any other name is treated as a model-based representation
        builder = ModelRepresentationsBuilder(
            path_to_model=path_to_model, representation_name=representation_name
        )
    elif representation_name == "FINGERPRINTS":
        builder = FingerprintsRepresentationsBuilder()
    else:
        builder = Descriptors2DRepresentationsBuilder()

    built = builder.build_representations(molecule_data_orig=df, split_type=split_type)
    built.save(output_dir, representation_name, experiment_name, path_to_df)
76 | 
77 | 
def representations_supported():
    """
    Return the names of the molecular representations currently supported by PREFER.

    A new list is created on every call.
    """

    supported_names = ("CDDD", "DESCRIPTORS2D", "MOLER", "FINGERPRINTS")
    return list(supported_names)
84 | 


--------------------------------------------------------------------------------
/prefer/utils/random_utils.py:
--------------------------------------------------------------------------------
 1 | #  Copyright (c) 2023, Novartis Institutes for BioMedical Research Inc. and Microsoft Corporation
 2 | #  All rights reserved.
 3 | # 
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions are
 6 | # met: 
 7 | #
 8 | #     * Redistributions of source code must retain the above copyright 
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above
11 | #       copyright notice, this list of conditions and the following 
12 | #       disclaimer in the documentation and/or other materials provided 
13 | #       with the distribution.
14 | #     * Neither the name of Novartis Institutes for BioMedical Research Inc. nor Microsoft Corporation 
15 | #       nor the names of its contributors may be used to endorse or promote 
16 | #       products derived from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Created by Jessica Lanini, January 2023
31 | 
32 | 
33 | import random
34 | import numpy as np
35 | 
36 | 
def set_random_seed(seed: int = 42):
    """
    Set the random seed for random, numpy and (when it can be imported) tensorflow.

    Args:
        seed: the value used to seed every generator.

    Returns: None

    """

    np.random.seed(seed)
    random.seed(seed)

    # Tensorflow is optional. ImportError (the parent of ModuleNotFoundError) is
    # caught so that an installed-but-unloadable tensorflow does not abort the
    # seeding of the other generators.
    try:
        import tensorflow as tf
    except ImportError:
        print("Tensorflow not found; skipping: tf.random.set_seed(...)")
        return

    # The seeding function lives in a different place in tensorflow 1.x
    tf_version = int(tf.version.VERSION.split(".")[0])
    if tf_version <= 1:
        tf.set_random_seed(seed)
    else:
        tf.random.set_seed(seed)
63 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.black]
 2 | line-length = 100
 3 | target-version = ['py37']
 4 | exclude = '''
 5 | /(
 6 |     \.git
 7 |   | \.hg
 8 |   | \.mypy_cache
 9 |   | \.tox
10 |   | \.venv
11 |   | _build
12 |   | buck-out
13 |   | build
14 |   | dist
15 | )/
16 | '''
17 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import setuptools
 3 | 
 4 | this_directory = os.path.abspath(os.path.dirname(__file__))
 5 | with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f:
 6 |     long_description = f.read()
 7 | 
 8 | setuptools.setup(
 9 |     name="prefer",
10 |     use_scm_version=True,
11 |     license="MIT",
12 |     author="Jessica Lanini",
13 |     author_email="jessica.lanini@novartis.com",
14 |     description="benchmarking and Property pREdiction FramEwoRk",
15 |     long_description=long_description,
16 |     long_description_content_type="text/markdown",
17 |     url="https://dev.azure.com/MSAI-DevOps-Org/FormulaOne%20Azure%20-%20AI%20Exploration%20-%20Gen%20Chem/_git/PREFER", #TO DO change it accprding to the final GitHub location
18 |     setup_requires=["setuptools_scm"],
19 |     python_requires="==3.7.7",
20 |     install_requires=[
21 |         "dpu-utils>=0.2.13",
22 |         "scikit-learn==0.24.1",
23 |         "numpy==1.19.2",
24 |         "pandas>=1.2.4",
25 |         "auto-sklearn==0.14.7",
26 |     ],
27 |     packages=setuptools.find_packages(),
28 |     entry_points={"console_scripts": ["prefer = prefer.run_prefer_automation:run_PREFER"]},
29 | )
30 | 


--------------------------------------------------------------------------------
/small_data_experiments/README_smalldata.txt:
--------------------------------------------------------------------------------
1 | The set of files stored in this folder can be used to 
2 | 1. Download the FS-Mol test sets from the main FS-Mol repo (extract_zipped_files.ipynb) [you need to unzip the main fsmol.tar file to run the script]
3 | 2. Run PREFER on the FS-Mol test sub-sets (run_PREFER_smalldata_example.ipynb)
4 | 3. Analyze the results and compare them with respect to the FS-Mol results (analysis_smalldata_example.ipynb)


--------------------------------------------------------------------------------
/small_data_experiments/extract_zipped_files.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "48c532a6-753c-4eaa-a68f-a87a8d91d139",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "## Notebook to extract zip files downloaded from [FS-Mol repo](https://figshare.com/ndownloader/files/31345321)"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "6402e340-f77c-4a05-82dc-88adebd54d6d",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "This notebook is provided to easily extract and convert zipped files from the FS-Mol repository to .csv files. So before running the cells please download the zip files from [here](https://figshare.com/ndownloader/files/31345321)"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "id": "e1a35cd3-16bb-4a4d-b77e-f7ac9cd3e0ab",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "### To run the notebook please extract the fsmol.tar"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "id": "3cd989e0-c560-4845-80bc-90a4411cb38c",
 31 |    "metadata": {
 32 |     "tags": []
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "# Add the path where you have saved the zip files\n",
 37 |     "path_to_zip_files = None\n",
 38 |     "# Add path where you would like to store the converted csv files to be used in PREFER\n",
 39 |     "path_where_to_store_csv_files = None"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "id": "673ef336-c98b-4e6b-8641-a1d7af0dd5e0",
 46 |    "metadata": {
 47 |     "tags": []
 48 |    },
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "# unzip files\n",
 52 |     "import os\n",
 53 |     "import json, gzip\n",
 54 |     "\n",
 55 |     "# only test sets will be used for training and testing the PREFER model\n",
 56 |     "sets = ['test']\n",
 57 |     "index = 0\n",
 58 |     "for set_ in sets:\n",
 59 |     "    path_to_files = f'{path_to_zip_files}/{set_}/'\n",
 60 |     "    zipped_file_names = os.listdir(path_to_files)\n",
 61 |     "    for zipped_file in zipped_file_names:\n",
 62 |     "        print(f'current file is: {zipped_file} - iteration number: {index}')\n",
 63 |     "        index=index+1\n",
 64 |     "        run = f'gunzip {path_to_files}{zipped_file}'\n",
 65 |     "        !{run}"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "id": "c09e06bd-6d78-4315-bd90-328628d7d7fb",
 72 |    "metadata": {
 73 |     "tags": []
 74 |    },
 75 |    "outputs": [],
 76 |    "source": [
  77 |     "# convert data for PREFER\n",
 78 |     "\n",
 79 |     "import json\n",
 80 |     "import pandas as pd\n",
 81 |     "file_names = os.listdir(path_to_files)\n",
 82 |     "\n",
 83 |     "for file_name in file_names:\n",
 84 |     "    print(f'Current file is {file_name}')\n",
 85 |     "    data = []\n",
 86 |     "    df = pd.DataFrame()\n",
 87 |     "    collect_smiles = []\n",
 88 |     "    collect_ids = []\n",
 89 |     "    collect_labels = []\n",
 90 |     "    with open(path_to_files+file_name) as f:\n",
 91 |     "        for line in f:\n",
 92 |     "            data.append(json.loads(line))\n",
 93 |     "        for elem in data:\n",
 94 |     "            collect_smiles.append(elem['SMILES'])\n",
 95 |     "            collect_ids.append(elem['Assay_ID'])\n",
 96 |     "            collect_labels.append(elem['Property'])\n",
 97 |     "        df = pd.DataFrame({'SMILES': collect_smiles, 'Assay_ID': collect_ids, 'Property': collect_labels})\n",
 98 |     "        file_name = file_name.replace('.jsonl', '')\n",
 99 |     "        df.to_csv(f'{path_where_to_store_csv_files}/{file_name}.csv', index = False)"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "id": "35a91a35-0bd2-4b87-b206-d4138a366b6a",
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": []
109 |   }
110 |  ],
111 |  "metadata": {
112 |   "interpreter": {
113 |    "hash": "e06fcc65451699fab52210cecc89ce74d347871d8379f3a65371b5502fcda228"
114 |   },
115 |   "kernelspec": {
116 |    "display_name": "Python (prefer-env-released2)",
117 |    "language": "python",
118 |    "name": "prefer-env-released2"
119 |   },
120 |   "language_info": {
121 |    "codemirror_mode": {
122 |     "name": "ipython",
123 |     "version": 3
124 |    },
125 |    "file_extension": ".py",
126 |    "mimetype": "text/x-python",
127 |    "name": "python",
128 |    "nbconvert_exporter": "python",
129 |    "pygments_lexer": "ipython3",
130 |    "version": "3.7.7"
131 |   }
132 |  },
133 |  "nbformat": 4,
134 |  "nbformat_minor": 5
135 | }
136 | 


--------------------------------------------------------------------------------