├── .gitignore ├── LICENSE ├── README.md ├── mt_pack ├── __init__.py ├── multi_task.py ├── tasks │ ├── gpr.py │ ├── nn_mt.py │ ├── nn_mt2.py │ ├── nn_st.py │ └── transform.py └── utils.py ├── poetry.lock └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | test/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | .vscode 133 | .venv 134 | 135 | *.lprof 136 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GENERAL PUBLIC USE LICENSE AGREEMENT 2 | 3 | PLEASE READ THIS DOCUMENT CAREFULLY BEFORE UTILIZING THE PROGRAM 4 | 5 | BY UTILIZING THIS PROGRAM, YOU AGREE TO BECOME BOUND BY THE TERMS OF THIS LICENSE. IF YOU DO NOT AGREE TO THE TERMS OF THIS LICENSE, DO NOT USE THIS PROGRAM OR ANY PORTION THEREOF IN ANY FORM OR MANNER. 
6 | 7 | This Program is licensed, not sold to you by GEORGIA TECH RESEARCH CORPORATION ("GTRC"), owner of all code, data and accompanying documentation (hereinafter “Program”), for use only under the terms of this License, and GTRC reserves any rights not expressly granted to you. 8 | 9 | 1. In accordance with the terms and conditions set forth herein, this License allows you to: 10 | 11 | (a) make copies and distribute copies of the Program’s source code provide that any such copy clearly displays any and all appropriate copyright notices and disclaimer of warranty as set forth in Article 5 and 6 of this License. All notices that refer to this License, the developers of this Program, and to the absence of any warranty must be kept intact at all times. A copy of this License must accompany any and all copies of the Program distributed to third parties. 12 | 13 | Notwithstanding anything to the contrary contained herein, a fee may be charged to cover the actual cost of the physical act of transferring a copy to a third party. At no time shall the program be sold for commercial gain either alone or incorporated with other program(s) without entering into a separate agreement with GTRC. 14 | 15 | (b) modify the original copy or copies of the Program or any portion thereof (“Modification(s)”). Modifications may be copied and distributed under the terms and conditions as set forth above, provided the following conditions are met: 16 | 17 | i) any and all modified files must be affixed with prominent notices that you have changed the files and the date that the changes occurred. 18 | 19 | ii) any work that you distribute, publish, or make available, that in whole or in part contains portions of the Program or derivative work thereof, must be licensed at no charge to all third parties under the terms of this License. 20 | 21 | iii) if the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to display and/or print an announcement with all appropriate copyright notices and disclaimer of warranty as set forth in Article 5 and 6 of this License to be clearly displayed. In addition, you must provide reasonable access to this License to the user. 22 | 23 | Any portion of a Modification that can be reasonably considered independent of the Program and separate work in and of itself is not subject to the terms and conditions set forth in this License as long as it is not distributed with the Program or any portion thereof. 24 | 25 | 2. This License further allows you to copy and distribute the Program or a work based on it, as set forth in Article 1 Section b in object code or executable form under the terms of Article 1 above provided that you also either: 26 | 27 | i) accompany it with complete corresponding machine-readable source code, which must be distributed under the terms of Article 1, on a medium customarily used for software interchange; or, 28 | 29 | ii) accompany it with a written offer, valid for no less than three (3) years from the time of distribution, to give any third party, for no consideration greater than the cost of physical transfer, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Article 1 on a medium customarily used for software interchange; or, 30 | 31 | 3. Export Law Assurance. 
32 | 33 | You agree that the Software will not be shipped, transferred or exported, directly into any country prohibited by the United States Export Administration Act and the regulations thereunder nor will be used for any purpose prohibited by the Act. 34 | 35 | 4. Termination. 36 | 37 | If at anytime you are unable to comply with any portion of this License you must immediately cease use of the Program and all distribution activities involving the Program or any portion thereof. 38 | 39 | 5. Disclaimer of Warranties and Limitation on Liability. 40 | 41 | YOU ACCEPT THE PROGRAM ON AN "AS IS" BASIS. GTRC MAKES NO WARRANTY THAT ALL ERRORS CAN BE OR HAVE BEEN ELIMINATED FROM PROGRAM. GTRC SHALL NOT BE RESPONSIBLE FOR LOSSES OF ANY KIND RESULTING FROM THE USE OF PROGRAM AND ITS ACCOMPANYING DOCUMENT(S), AND CAN IN NO WAY PROVIDE COMPENSATION FOR ANY LOSSES SUSTAINED, INCLUDING BUT NOT LIMITED TO ANY OBLIGATION, LIABILITY, RIGHT, CLAIM OR REMEDY FOR TORT, OR FOR ANY ACTUAL OR ALLEGED INFRINGEMENT OF PATENTS, COPYRIGHTS, TRADE SECRETS, OR SIMILAR RIGHTS OF THIRD PARTIES, NOR ANY BUSINESS EXPENSE, MACHINE DOWNTIME OR DAMAGES CAUSED TO YOU BY ANY DEFICIENCY, DEFECT OR ERROR IN PROGRAM OR MALFUNCTION THEREOF, NOR ANY INCIDENTAL OR CONSEQUENTIAL DAMAGES, HOWEVER CAUSED. GTRC DISCLAIMS ALL WARRANTIES, BOTH EXPRESS AND IMPLIED RESPECTING THE USE AND OPERATION OF PROGRAM AND ITS ACCOMPANYING DOCUMENTATION, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR PARTICULAR PURPOSE AND ANY IMPLIED WARRANTY ARISING FROM COURSE OF PERFORMANCE, COURSE OF DEALING OR USAGE OF TRADE. GTRC MAKES NO WARRANTY THAT PROGRAM IS ADEQUATELY OR COMPLETELY DESCRIBED IN, OR BEHAVES IN ACCORDANCE WITH ANY ACCOMPANYING DOCUMENTATION. THE USER OF PROGRAM IS EXPECTED TO MAKE THE FINAL EVALUATION OF PROGRAM'S USEFULNESS IN USER'S OWN ENVIRONMENT. 42 | 43 | GTRC represents that, to the best of its knowledge, the software furnished hereunder does not infringe any copyright or patent. 44 | 45 | GTRC shall have no obligation for support or maintenance of Program. 46 | 47 | 6. Copyright Notice. 48 | 49 | THE SOFTWARE AND ACCOMPANYING DOCUMENTATION ARE COPYRIGHTED WITH ALL RIGHTS RESERVED BY GTRC. UNDER UNITED STATES COPYRIGHT LAWS, THE SOFTWARE AND ITS ACCOMPANYING DOCUMENTATION MAY NOT BE COPIED EXCEPT AS GRANTED HEREIN. 50 | 51 | You acknowledge that GTRC is the sole owner of Program, including all copyrights subsisting therein. Any and all copies or partial copies of Program made by you shall bear the copyright notice set forth below and affixed to the original version or such other notice as GTRC shall designate. Such notice shall also be affixed to all improvements or enhancements of Program made by you or portions thereof in such a manner and location as to give reasonable notice of GTRC's copyright as set forth in Article 1. 52 | 53 | Said copyright notice shall read as follows: 54 | 55 | Copyright 2022 56 | Georgia Tech Research Corporation 57 | Atlanta, Georgia 30332-4024 58 | All Rights Reserved -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Polymer Informatics with Multi-Task Learning 2 | 3 | # IMPORTANT NOTE: The code and data shared here are available for academic non-commercial use only 4 | 5 | This repository contains the code for the [paper](https://www.cell.com/patterns/fulltext/S2666-3899(21)00058-1).
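Training runs are configured through a `config.toml` file read from the working directory (see the Use section below). A minimal sketch is shown here; the key names are inferred from `mt_pack/multi_task.py` and `mt_pack/tasks/transform.py`, while the dataset path and the property entries are placeholders that must match your own data.

```toml
# Hypothetical config.toml sketch -- key names inferred from the code, values are placeholders.
dataset = "dataset/polymer_data.pkl"  # pickled pandas DataFrame with "fingerprints*" and "properties.*" columns
kfolds = 5                            # number of cross-validation folds
batchsize = 32                        # batch size for the neural-network models

# One table per property; the raw column "properties.<key>" is renamed to "name" internally.
[properties.Eat]
name = "Eat"

[properties.sol]
name = "sol"
```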
The code trains four different machine learning models for the prediction of polymer properties. Two of the models are single-task (`nn_st.py` and `gpr.py`) and two are multi-task (`nn_mt.py` and `nn_mt2.py`). Please see the paper for more details. 6 | 7 | ## Prerequisites 8 | 9 | - [Poetry](https://python-poetry.org/docs/) must be installed. See https://python-poetry.org/docs/#installation 10 | 11 | ## Install 12 | 13 | 1. Clone the repo 14 | ```bash 15 | git clone https://gitlab.com/ramprasad-group/multi-task-learning && cd multi-task-learning 16 | ``` 17 | 18 | 2. Install the dependencies with Poetry and create a test directory 19 | ```bash 20 | poetry install 21 | poetry shell 22 | cd .. && mkdir test && cd test 23 | ``` 24 | 25 | 3. Run 26 | 27 | ```bash 28 | mtask -h 29 | ``` 30 | 31 | ## Use 32 | 33 | 1. In the test directory, create a `dataset/` folder 34 | 35 | ```bash 36 | mkdir dataset && cd dataset 37 | ``` 38 | 39 | 2. Place the dataset in the `dataset/` directory and create a `config.toml` in the test directory (a sketch of the expected keys is given in the introduction above) 40 | 41 | 3. Run 42 | 43 | ```bash 44 | mtask nn-st train 45 | or 46 | mtask nn-mt train 47 | or 48 | mtask nn-mt2 train 49 | ``` 50 | 51 | 52 | -------------------------------------------------------------------------------- /mt_pack/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | -------------------------------------------------------------------------------- /mt_pack/multi_task.py: -------------------------------------------------------------------------------- 1 | """Main routine""" 2 | 3 | import click 4 | import os 5 | import pandas as pd 6 | import tensorflow as tf 7 | from pathlib import Path 8 | from pathlib import Path 9 | import toml 10 | import pandas as pd 11 | 12 | 13 | def read_config_file(): 14 | config_file = Path('config.toml') 15 | if not config_file.exists(): 16 | raise UserWarning('config.toml does not exist.') 17 | fl = toml.load(config_file.open()) 18 | return fl 19 | 20 | @click.group( 21 | context_settings=dict(help_option_names=["-h", "--help"]), 22 | invoke_without_command=True, 23 | ) 24 | @click.version_option(version="0.1.0") 25 | @click.option("--cpu", is_flag=True, help="Use only CPUs") 26 | @click.pass_context 27 | def cli(ctx, cpu): 28 | """A Multi-Task Toolkit by Christopher Kuenneth @ Georgia Tech in the Ramprasad Research Group""" 29 | ctx.ensure_object(dict) 30 | conf = ctx.obj 31 | 32 | if cpu: 33 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 34 | tf.config.set_visible_devices([], "GPU") 35 | 36 | if not cpu: 37 | # Get all GPUs that are selected in CUDA_VISIBLE_DEVICES 38 | gpus = tf.config.experimental.list_physical_devices("GPU") 39 | 40 | # Allow growth 41 | for gpu in gpus: 42 | tf.config.experimental.set_memory_growth(gpu, True) 43 | 44 | 45 | # check_config_file() 46 | config_read = read_config_file() 47 | conf.update(config_read) 48 | 49 | conf["checkpoint_path"] = Path('./checkpoints') 50 | conf["checkpoint_path"].mkdir(exist_ok=True) 51 | 52 | conf["results"] = Path('./results') 53 | conf["results"].mkdir(exist_ok=True) 54 | 55 | from mt_pack.tasks.gpr import gpr 56 | cli.add_command(gpr) 57 | 58 | from mt_pack.tasks.nn_st import nn_st 59 | cli.add_command(nn_st) 60 | 61 | from mt_pack.tasks.nn_mt import nn_mt 62 | cli.add_command(nn_mt) 63 | 64 | from mt_pack.tasks.nn_mt2 import nn_mt2 65 | cli.add_command(nn_mt2) 66 | 67 | 68 | def script(): 69 | cli(obj={}) 70 | -------------------------------------------------------------------------------- /mt_pack/tasks/gpr.py: -------------------------------------------------------------------------------- 1 | """Trains the 
GRP-ST models""" 2 | import logging 3 | import pickle 4 | 5 | import click 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.gaussian_process import GaussianProcessRegressor 9 | from sklearn.gaussian_process.kernels import RBF, WhiteKernel 10 | from sklearn.metrics import (explained_variance_score, max_error, 11 | mean_absolute_error, mean_squared_error, r2_score) 12 | 13 | from pathlib import Path 14 | from mt_pack.tasks.transform import get_dataset_st 15 | 16 | @click.group(help="Train GPR models") 17 | @click.pass_context 18 | def gpr(ctx): 19 | logging.info("GPR models") 20 | conf = ctx.obj 21 | 22 | @gpr.command(name="train", help="Train model") 23 | @click.pass_context 24 | def train(ctx): 25 | conf = ctx.obj 26 | 27 | dds = [] 28 | for prop, prop_dict in conf["properties"].items(): 29 | datasets = get_dataset_st(conf, prop_dict['name']) 30 | for fold, dataset in enumerate(datasets): 31 | logging.info(f"GPR: {prop_dict['name']}, fold: {fold}") 32 | 33 | # init GPR 34 | kernel = RBF(length_scale=1) + WhiteKernel(1e-4) 35 | gpr = GaussianProcessRegressor(kernel=kernel) 36 | 37 | gpr.fit(dataset["train"].drop(columns=prop_dict['name']), dataset["train"][prop_dict['name']]) 38 | 39 | # Inverse scaling 40 | sc = dataset['prop_scaler'].inverse_transform 41 | pred = gpr.predict(dataset["val"].drop(columns=prop_dict['name'])) 42 | pred = sc(pred.reshape(-1, 1)) 43 | truth = sc(dataset["val"][[prop_dict['name']]]) 44 | 45 | _d = dict( 46 | rmse=mean_squared_error(pred, truth, squared=False), 47 | r2=r2_score(pred, truth), 48 | me=max_error(pred, truth), 49 | mae=mean_absolute_error(pred, truth), 50 | evs=explained_variance_score(pred, truth), 51 | prop=prop_dict['name'], 52 | task="GPR-ST", 53 | type='val', 54 | fold=fold 55 | ) 56 | 57 | print(_d) 58 | 59 | # save model 60 | fl : Path = conf["checkpoint_path"].joinpath(f"gpr/{prop_dict['name']}/{fold}") 61 | fl.mkdir(exist_ok=True, parents=True) 62 | fl.joinpath('gpr.pkl').write_bytes(pickle.dumps(gpr)) 63 | # save scalers 64 | fl.joinpath('fp_scaler.pkl').write_bytes(pickle.dumps(dataset['fp_scaler'])) 65 | fl.joinpath('prop_scaler.pkl').write_bytes(pickle.dumps(dataset['prop_scaler'])) 66 | 67 | dds.append(_d) 68 | 69 | df = pd.DataFrame(dds) 70 | df.to_csv(conf["results"].joinpath(f"k_{conf['kfolds']}_gpr.csv")) 71 | print(df) 72 | 73 | -------------------------------------------------------------------------------- /mt_pack/tasks/nn_mt.py: -------------------------------------------------------------------------------- 1 | """Trains the NN-MT models""" 2 | import logging 3 | import pickle 4 | from pathlib import Path 5 | 6 | import click 7 | import numpy as np 8 | import pandas as pd 9 | import tensorflow as tf 10 | from sklearn.metrics import (explained_variance_score, max_error, 11 | mean_absolute_error, mean_squared_error, r2_score) 12 | from tensorflow import keras 13 | 14 | from mt_pack.tasks.transform import get_dataset_mt, to_tf_datasets 15 | from mt_pack.utils import get_checkpoint_log 16 | from kerastuner.tuners import Hyperband 17 | from kerastuner import HyperModel 18 | 19 | @click.group(help="MT NN") 20 | @click.pass_context 21 | def nn_mt(ctx): 22 | logging.info("NN MT models") 23 | conf = ctx.obj 24 | 25 | 26 | @nn_mt.command(name="train", help="Train model") 27 | @click.pass_context 28 | def train(ctx): 29 | conf = ctx.obj 30 | props = [v['name'] for k, v in conf['properties'].items()] 31 | 32 | 33 | datasets = get_dataset_mt(conf) 34 | datasets = to_tf_datasets(datasets, conf) 35 | 36 | dds = [] 37 | for 
fold, dataset in enumerate(datasets): 38 | logging.info(f"NN-MT: {props}, fold: {fold}") 39 | ckpt_path, logdir = get_checkpoint_log(conf, f"multi_task/{fold}/") 40 | 41 | tuner = search_hps(conf, dataset) 42 | model = tuner.get_best_models(num_models=1)[0] 43 | 44 | # predictions 45 | sc = dataset['prop_scaler'].inverse_transform 46 | pred = model.predict(dataset['val']) 47 | pred_df = sc(pd.DataFrame(pred)) 48 | 49 | truth_df = pd.concat([pd.DataFrame(i[1]) for i in dataset['val'].as_numpy_iterator()], ignore_index=True) 50 | truth_df: pd.DataFrame = sc(truth_df) 51 | 52 | for name, truth_col in truth_df.iteritems(): 53 | # import ipdb; ipdb.set_trace() 54 | nans = truth_col.isnull() 55 | pred = pred_df[name][~nans] 56 | truth = truth_col[~nans] 57 | 58 | _d = dict( 59 | rmse=mean_squared_error(pred, truth, squared=False), 60 | r2=r2_score(pred, truth), 61 | me=max_error(pred, truth), 62 | mae=mean_absolute_error(pred, truth), 63 | evs=explained_variance_score(pred, truth), 64 | prop=name, 65 | task="NN-MT", 66 | type='val', 67 | fold=fold 68 | ) 69 | 70 | print(_d) 71 | dds.append(_d) 72 | 73 | Path(ckpt_path).parent.joinpath('fp_scaler.pkl').write_bytes(pickle.dumps(dataset['fp_scaler'])) 74 | Path(ckpt_path).parent.joinpath('prop_scaler.pkl').write_bytes(pickle.dumps(dataset['prop_scaler'])) 75 | 76 | df = pd.DataFrame(dds) 77 | df.to_csv(conf["results"].joinpath(f"k_{conf['kfolds']}_nn_mt.csv")) 78 | print(df) 79 | 80 | 81 | 82 | ##### Loss function 83 | def remove_nan(y_true, y_pred): 84 | y_pred = tf.convert_to_tensor(y_pred) 85 | y_true = tf.cast(y_true, y_pred.dtype) 86 | 87 | nans = tf.math.is_nan(y_true) 88 | zero = tf.constant(0.0, dtype=y_pred.dtype) 89 | 90 | y_true = tf.where(nans, zero, y_true) 91 | y_pred = tf.where(nans, zero, y_pred) 92 | return y_true, y_pred 93 | 94 | def loss_mse(y_true, y_pred, sample_weight=None): 95 | y_true, y_pred = remove_nan(y_true, y_pred) 96 | return tf.keras.losses.mse(y_pred, y_true) 97 | 98 | ################ 99 | #### ### 100 | ## HyperParameter Search 101 | 102 | def search_hps(conf, dataset): 103 | 104 | tuner = Hyperband( 105 | MyHyperModel(conf), 106 | objective='val_loss', 107 | max_epochs=100, 108 | seed=10, 109 | directory='hyperparamter_search', 110 | project_name='nn-mt') 111 | 112 | print(tuner.search_space_summary()) 113 | 114 | tuner.search(dataset['train'], 115 | epochs=100, 116 | validation_data=dataset['val']) 117 | 118 | print(tuner.results_summary()) 119 | return tuner 120 | 121 | class MyHyperModel(HyperModel): 122 | 123 | def __init__(self, conf): 124 | self.conf = conf 125 | 126 | def build(self, hp): 127 | 128 | model = NNMT_HP(self.conf, hp) 129 | 130 | model.compile( 131 | optimizer=keras.optimizers.Adam( 132 | hp.Choice('learning_rate', 133 | values=[1e-2, 1e-3, 1e-4])), 134 | loss=loss_mse) 135 | 136 | return model 137 | 138 | class NNMT_HP(keras.Model): 139 | def __init__(self, conf, hp): 140 | super().__init__() 141 | self.properties = [v['name'] for k, v in conf['properties'].items()] 142 | self.my_layers = [] 143 | for i in range(hp.Int('num_layers', 1, 2)): 144 | self.my_layers.append(tf.keras.layers.Dense(units=hp.Int('units_' + str(i), 145 | min_value=32, 146 | max_value=512, 147 | step=32),)) 148 | 149 | self.my_layers.append(tf.keras.layers.ReLU()) 150 | self.my_layers.append(tf.keras.layers.Dropout(0.5)) 151 | 152 | self.last_layer = tf.keras.layers.Dense(len(self.properties)) 153 | 154 | 155 | def call(self, inputs): 156 | out = inputs 157 | for layer in self.my_layers: 158 | out = layer(out) 159 | out 
= self.last_layer(out) 160 | 161 | return_dict = {} 162 | for num, prop in enumerate(self.properties): 163 | return_dict[prop] = out[..., num] 164 | return return_dict 165 | -------------------------------------------------------------------------------- /mt_pack/tasks/nn_mt2.py: -------------------------------------------------------------------------------- 1 | """Trains the NN-MT2 models""" 2 | import logging 3 | import pickle 4 | from pathlib import Path 5 | from typing import Dict 6 | 7 | import click 8 | import numpy as np 9 | import pandas as pd 10 | import tensorflow as tf 11 | from kerastuner import HyperModel 12 | from kerastuner.tuners import Hyperband, RandomSearch 13 | from sklearn.metrics import (explained_variance_score, max_error, 14 | mean_absolute_error, mean_squared_error, r2_score) 15 | from tensorflow import keras 16 | from tensorflow.keras import callbacks 17 | import tensorflow_addons as tfa 18 | 19 | from mt_pack.tasks.transform import get_dataset_mt2, to_tf_datasets 20 | from mt_pack.utils import get_checkpoint_log 21 | from sklearn.preprocessing import MinMaxScaler, RobustScaler 22 | 23 | np.random.seed(10) 24 | tf.random.set_seed(10) 25 | 26 | @click.group(help="MT2 NN") 27 | @click.pass_context 28 | def nn_mt2(ctx): 29 | logging.info("NN MT2 models") 30 | 31 | @nn_mt2.command(name="train", help="Train model") 32 | @click.pass_context 33 | def train(ctx): 34 | conf = ctx.obj 35 | props = [v['name'] for k, v in conf['properties'].items()] 36 | 37 | datasets = get_dataset_mt2(conf) 38 | 39 | dds = [] 40 | for fold, dataset in enumerate(datasets): 41 | 42 | logging.info(f"NN-MT2: {props}, fold: {fold}") 43 | ckpt_path, logdir = get_checkpoint_log(conf, f"multi_task_mt2/{fold}/") 44 | 45 | tuner = search_hps(dataset, fold) 46 | model = tuner.get_best_models(num_models=1)[0] 47 | 48 | 49 | # predictions 50 | pred = model.predict(dataset['val']) 51 | prop_names = np.concatenate([i[0][2] for i in dataset['val'].as_numpy_iterator()], 0).flatten().astype(np.str) 52 | pred_df = pd.DataFrame(np.concatenate([pred, prop_names[:,np.newaxis]], 1), columns=['value', 'variable']) 53 | pred_df.value = pred_df.value.astype('float32') 54 | 55 | truth_df = pd.concat([pd.DataFrame(i[1]) for i in dataset['val'].as_numpy_iterator()], ignore_index=True) 56 | truth_df = pd.DataFrame(np.concatenate([truth_df, prop_names[:,np.newaxis]], 1), columns=['value', 'variable']) 57 | truth_df.value = truth_df.value.astype('float32') 58 | 59 | props = pred_df.variable.unique().tolist() 60 | for prop in props: 61 | sc = dataset['prop_scaler'][prop].inverse_transform 62 | cond = pred_df.variable == prop 63 | pred_df.loc[cond, ['value']] = sc(pred_df.loc[cond, ['value']].values) 64 | truth_df.loc[cond, ['value']] = sc(truth_df.loc[cond, ['value']].values) 65 | 66 | pred = pred_df.loc[cond, ['value']] 67 | truth = truth_df.loc[cond, ['value']] 68 | 69 | _d = dict( 70 | rmse=mean_squared_error(pred, truth, squared=False), 71 | r2=r2_score(pred, truth), 72 | me=max_error(pred, truth), 73 | mae=mean_absolute_error(pred, truth), 74 | evs=explained_variance_score(pred, truth), 75 | prop=prop, 76 | task="NN-MT2", 77 | type='val', 78 | fold=fold 79 | ) 80 | 81 | print(_d) 82 | dds.append(_d) 83 | 84 | Path(ckpt_path).parent.joinpath('fp_scaler.pkl').write_bytes(pickle.dumps(dataset['fp_scaler'])) 85 | Path(ckpt_path).parent.joinpath('prop_scaler.pkl').write_bytes(pickle.dumps(dataset['prop_scaler'])) 86 | 87 | df = pd.DataFrame(dds) 88 | df.to_csv(conf["results"].joinpath(f"k_{conf['kfolds']}_nn_mt2.csv")) 89 
| print(df) 90 | 91 | 92 | 93 | 94 | ################ 95 | #### ### 96 | ## HyperParameter Search 97 | 98 | def search_hps(dataset, fold, sp = None): 99 | 100 | tuner = Hyperband( 101 | MyHyperModel(scaler_path=sp if sp else None), 102 | objective='val_loss', 103 | max_epochs=400, 104 | seed=10, 105 | directory='hyperparamter_search', 106 | project_name='nn-mt2_' + str(fold), 107 | ) 108 | 109 | print(tuner.search_space_summary()) 110 | 111 | earlystop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=200) 112 | 113 | 114 | reduce_lr = tf.keras.callbacks.ReduceLROnPlateau( 115 | monitor="val_loss", 116 | factor=0.9, 117 | patience=20, 118 | cooldown=5, 119 | verbose=True, 120 | ) 121 | 122 | tuner.search(dataset['train'], 123 | epochs=100, 124 | validation_data=dataset['val'], 125 | callbacks=[earlystop, reduce_lr]) 126 | 127 | # print(tuner.results_summary(num_trials=1)) 128 | return tuner 129 | 130 | class MyHyperModel(HyperModel): 131 | 132 | def __init__(self, scaler_path=None): 133 | if scaler_path: 134 | self.scaler_path = scaler_path 135 | else: 136 | self.scaler_path = None 137 | 138 | 139 | def build(self, hp): 140 | 141 | model = NNMT2_HP(hp, scaler_path=self.scaler_path if self.scaler_path else None) 142 | opt = keras.optimizers.Adam( 143 | hp.Choice('learning_rate', 144 | values=[1e-2, 1e-3])) 145 | 146 | stocastic_avg_sgd = tfa.optimizers.SWA(opt) 147 | # loss_weights = {'sol': 100, 'Eat': 100} 148 | 149 | model.compile( 150 | optimizer=stocastic_avg_sgd, 151 | loss='mse') 152 | 153 | return model 154 | 155 | class NNMT2_HP(keras.Model): 156 | def __init__(self, hp, scaler_path=None): 157 | super().__init__() 158 | 159 | if scaler_path: 160 | self.fp_scaler: MinMaxScaler = pickle.loads(scaler_path.joinpath('fp_scaler.pkl').read_bytes()) 161 | self.prop_scalers: Dict[RobustScaler] = pickle.loads(scaler_path.joinpath('prop_scaler.pkl').read_bytes()) 162 | 163 | self.my_layers = [] 164 | # self.bn1 = tf.keras.layers.BatchNormalization() 165 | for i in range(hp.Int('num_layers', 2, 2)): 166 | self.my_layers.append(tf.keras.layers.Dense(units=hp.Int('units_' + str(i), 167 | min_value=128, 168 | max_value=512, 169 | step=32),)) 170 | 171 | self.my_layers.append(tf.keras.layers.ReLU()) 172 | self.my_layers.append(tf.keras.layers.Dropout(hp.Float( 173 | 'dropout_' + str(i), 174 | min_value=0.0, 175 | max_value=0.7, 176 | default=0.25, 177 | step=0.1, 178 | ))) 179 | 180 | self.last_layer = tf.keras.layers.Dense(1) 181 | 182 | 183 | def call(self, inputs): 184 | out = tf.concat(inputs[:2], 1) 185 | # out = self.bn1(out) 186 | for layer in self.my_layers: 187 | out = layer(out) 188 | out = self.last_layer(out) 189 | return out 190 | 191 | 192 | -------------------------------------------------------------------------------- /mt_pack/tasks/nn_st.py: -------------------------------------------------------------------------------- 1 | """Trains the NN-ST models""" 2 | import logging 3 | import pickle 4 | from pathlib import Path 5 | 6 | import click 7 | import numpy as np 8 | import pandas as pd 9 | import tensorflow as tf 10 | from sklearn.metrics import (explained_variance_score, max_error, 11 | mean_absolute_error, mean_squared_error, r2_score) 12 | from tensorflow import keras 13 | 14 | from mt_pack.tasks.transform import get_dataset_st, to_tf_datasets 15 | from mt_pack.utils import get_checkpoint_log 16 | 17 | import matplotlib.pyplot as plt 18 | from kerastuner.tuners import Hyperband 19 | from kerastuner import HyperModel 20 | 21 | 22 | @click.group(help="ST NN") 23 | 
@click.pass_context 24 | def nn_st(ctx): 25 | logging.info("NN ST models") 26 | conf = ctx.obj 27 | 28 | @nn_st.command(name="train", help="Train model") 29 | @click.pass_context 30 | def train(ctx): 31 | conf = ctx.obj 32 | 33 | dds = [] 34 | for prop, prop_dict in conf["properties"].items(): 35 | datasets = get_dataset_st(conf, prop_dict['name']) 36 | datasets = to_tf_datasets(datasets, conf) 37 | 38 | for fold, dataset in enumerate(datasets): 39 | logging.info(f"NN-ST: {prop_dict['name']}, fold: {fold}") 40 | 41 | ckpt_path, logdir = get_checkpoint_log(conf, f"single_task/{prop_dict['name']}/{fold}/") 42 | 43 | # Hyperparamter search 44 | tuner = search_hps(prop_dict, dataset) 45 | model = tuner.get_best_models(num_models=1)[0] 46 | 47 | # predictions 48 | sc = dataset['prop_scaler'].inverse_transform 49 | pred = model.predict(dataset['val'])[prop_dict['name']] 50 | pred = sc(pred.flatten().reshape(-1, 1)) 51 | 52 | truth = [i[1][prop_dict['name']] for i in dataset['val'].as_numpy_iterator()] 53 | truth = np.concatenate(truth, axis=0) 54 | truth = sc(truth.flatten().reshape(-1, 1)) 55 | 56 | _d = dict( 57 | rmse=mean_squared_error(pred, truth, squared=False), 58 | r2=r2_score(pred, truth), 59 | me=max_error(pred, truth), 60 | mae=mean_absolute_error(pred, truth), 61 | evs=explained_variance_score(pred, truth), 62 | prop=prop_dict['name'], 63 | task="NN-ST", 64 | type='val', 65 | fold=fold 66 | ) 67 | 68 | print(_d) 69 | 70 | # save scalers 71 | Path(ckpt_path).parent.joinpath('fp_scaler.pkl').write_bytes(pickle.dumps(dataset['fp_scaler'])) 72 | Path(ckpt_path).parent.joinpath('prop_scaler.pkl').write_bytes(pickle.dumps(dataset['prop_scaler'])) 73 | dds.append(_d) 74 | 75 | df = pd.DataFrame(dds) 76 | df.to_csv(conf["results"].joinpath(f"k_{conf['kfolds']}_nn_st.csv")) 77 | print(df) 78 | 79 | 80 | ################ 81 | #### ### 82 | ## HyperParameter Search 83 | 84 | def search_hps(prop_dict, dataset): 85 | 86 | tuner = Hyperband( 87 | MyHyperModel(prop_dict['name']), 88 | objective='val_loss', 89 | max_epochs=100, 90 | seed=10, 91 | directory='hyperparamter_search', 92 | project_name='nn-st') 93 | 94 | print(tuner.search_space_summary()) 95 | 96 | tuner.search(dataset['train'], 97 | epochs=100, 98 | validation_data=dataset['val']) 99 | 100 | print(tuner.results_summary()) 101 | return tuner 102 | 103 | class MyHyperModel(HyperModel): 104 | 105 | def __init__(self, prop): 106 | self.prop = prop 107 | 108 | def build(self, hp): 109 | 110 | model = NNST_HP(self.prop, hp) 111 | 112 | model.compile( 113 | optimizer=keras.optimizers.Adam( 114 | hp.Choice('learning_rate', 115 | values=[1e-2, 1e-3, 1e-4])), 116 | loss='mse') 117 | 118 | return model 119 | 120 | class NNST_HP(keras.Model): 121 | def __init__(self, prop_name, hp): 122 | super().__init__() 123 | self.prop = prop_name 124 | self.my_layers = [] 125 | for i in range(hp.Int('num_layers', 1, 2)): 126 | self.my_layers.append(tf.keras.layers.Dense(units=hp.Int('units_' + str(i), 127 | min_value=32, 128 | max_value=512, 129 | step=32),)) 130 | 131 | self.my_layers.append(tf.keras.layers.ReLU()) 132 | self.my_layers.append(tf.keras.layers.Dropout(0.5)) 133 | 134 | self.last_layer = tf.keras.layers.Dense(1, name=prop_name) 135 | 136 | def call(self, inputs): 137 | out = inputs 138 | for layer in self.my_layers: 139 | out = layer(out) 140 | return {self.prop: self.last_layer(out)} 141 | -------------------------------------------------------------------------------- /mt_pack/tasks/transform.py: 
-------------------------------------------------------------------------------- 1 | """Hold preprocessing functions for the data set""" 2 | from sklearn.model_selection import KFold, StratifiedKFold 3 | from sklearn.preprocessing import MinMaxScaler, RobustScaler 4 | import pandas as pd 5 | from typing import List, Dict 6 | import numpy as np 7 | import tensorflow as tf 8 | from pathlib import Path 9 | 10 | def kfold_split(df, conf): 11 | df = df.sample(frac=1, random_state=1).reset_index(drop=True) 12 | kf = KFold(n_splits=conf['kfolds'], shuffle=False, random_state=0) 13 | datasets = [] 14 | for idx_train, idx_val in kf.split(df): 15 | train, val = df.iloc[idx_train].copy(), df.iloc[idx_val].copy() 16 | datasets.append(dict(train=train, val=val)) 17 | return datasets 18 | 19 | 20 | def scale(datasets): 21 | for num, data in enumerate(datasets): 22 | # fps 23 | datasets[num]['fp_scaler'] = MinMaxScaler() 24 | fps_names = data['train'].loc[:, data['train'].columns.str.startswith('fingerprints')].columns.values.tolist() 25 | datasets[num]['train'].loc[:, fps_names] = datasets[num]['fp_scaler'].fit_transform(data['train'][fps_names]) 26 | datasets[num]['val'].loc[:, fps_names] = datasets[num]['fp_scaler'].transform(data['val'][fps_names]) 27 | 28 | # props 29 | datasets[num]['prop_scaler'] = RobustScaler() 30 | props_name = data['train'].columns[~data['train'].columns.str.startswith('fingerprints')].values.tolist() 31 | 32 | datasets[num]['train'].loc[:, props_name] = datasets[num]['prop_scaler'].fit_transform(data['train'][props_name]) 33 | datasets[num]['val'].loc[:, props_name] = datasets[num]['prop_scaler'].transform(data['val'][props_name]) 34 | 35 | return datasets 36 | 37 | def prepare_and_read_dataframe(conf: dict): 38 | df = pd.read_pickle(conf['dataset']) 39 | 40 | # Change names, remove 'properties.' 
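    # NOTE (assumption, inferred from the rename below and from the "fingerprints" column
    # selection used throughout this module): the pickled dataset is expected to hold one
    # column per property named "properties.<config key>" plus fingerprint columns whose
    # names start with "fingerprints"; each "properties.<key>" column is renamed to the
    # short name given under [properties.<key>] in config.toml.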
41 | map_dict = {} 42 | for prop, prop_dict in conf['properties'].items(): 43 | map_dict[f"properties.{prop}"] = f"{prop_dict['name']}" 44 | df = df.rename(columns=map_dict) 45 | 46 | return df 47 | 48 | def get_dataset_mt2(conf: dict) -> List[Dict]: 49 | 50 | df = prepare_and_read_dataframe(conf) 51 | fps = df.loc[:, df.columns.str.startswith('fingerprints')].columns.values.tolist() 52 | 53 | props = [v['name'] for k, v in conf['properties'].items()] 54 | df = df[props + fps] 55 | 56 | 57 | # Takes time 58 | df = df.melt(id_vars=fps).dropna().reset_index(drop=True) 59 | dum = pd.get_dummies(df['variable'])[props] 60 | df = pd.concat([dum, df], 1, keys=['dummy','fps']) 61 | 62 | new_index = df.columns.to_list() 63 | new_index[-2:] = [('data', 'variable'), ('data', 'value')] 64 | df.columns = pd.MultiIndex.from_tuples(new_index) 65 | 66 | # # Split 67 | # 68 | 69 | kf = StratifiedKFold(n_splits=conf['kfolds'], shuffle=True, random_state=20) 70 | datasets = [] 71 | for idx_train, idx_val in kf.split(df, df.data.variable): 72 | train, val = df.iloc[idx_train].copy(), df.iloc[idx_val].copy() 73 | datasets.append(dict(train=train, val=val)) 74 | 75 | # # Scale 76 | # 77 | for num, data in enumerate(datasets): 78 | # fps 79 | datasets[num]['fp_scaler'] = MinMaxScaler() 80 | datasets[num]['train'].fps = datasets[num]['fp_scaler'].fit_transform(data['train'].fps) 81 | datasets[num]['val'].fps = datasets[num]['fp_scaler'].transform(data['val'].fps) 82 | 83 | # scale property values 84 | sc = RobustScaler 85 | 86 | datasets[num]['prop_scaler'] = {} 87 | 88 | for prop in datasets[num]['train'].dummy.columns.to_list(): 89 | _sc = sc() 90 | 91 | # Train 92 | cond = datasets[num]['train'].data.variable == prop 93 | datasets[num]['train'].loc[cond, ('data', ['value'])] = _sc.fit_transform(datasets[num]['train'].loc[cond, ('data', ['value'])] ) 94 | 95 | # val 96 | cond = datasets[num]['val'].data.variable == prop 97 | datasets[num]['val'].loc[cond, ('data', ['value'])] = _sc.transform(datasets[num]['val'].loc[cond, ('data', ['value'])] ) 98 | 99 | datasets[num]['prop_scaler'][prop] = _sc 100 | 101 | for num, dataset in enumerate(datasets): 102 | for set_name in ['train', 'val']: 103 | # fps 104 | fps = dataset[set_name].fps.astype('float32') 105 | value = dataset[set_name].data.value.astype('float32') 106 | selector = dataset[set_name].dummy.astype('float32') 107 | var = dataset[set_name].data.variable 108 | 109 | data = tf.data.Dataset.from_tensor_slices( 110 | ((fps.values, selector.values, var.values), value.values) 111 | ) 112 | 113 | data = data.cache().batch(conf['batchsize']).prefetch(tf.data.experimental.AUTOTUNE) 114 | datasets[num][set_name] = data 115 | 116 | return datasets 117 | 118 | def get_dataset_mt(conf: dict) -> List[Dict]: 119 | df = prepare_and_read_dataframe(conf) 120 | fps = df.loc[:, df.columns.str.startswith('fingerprints')].columns.values.tolist() 121 | 122 | props = [v['name'] for k, v in conf['properties'].items()] 123 | df = df[props + fps] 124 | 125 | # process 126 | datasets = kfold_split(df, conf) 127 | datasets = scale(datasets) 128 | return datasets 129 | 130 | def get_dataset_st(conf: dict, prop_name: str) -> List[Dict]: 131 | df = prepare_and_read_dataframe(conf) 132 | 133 | # select and cut 134 | fps = df.loc[:, df.columns.str.startswith('fingerprints')].columns.values.tolist() 135 | df = df[[prop_name] + fps] 136 | df.dropna(subset=[prop_name], inplace=True) 137 | 138 | # process 139 | datasets = kfold_split(df, conf) 140 | datasets = scale(datasets) 141 | 142 | 
return datasets 143 | 144 | def to_tf_datasets( 145 | datasets: dict, conf: dict) -> dict: 146 | 147 | for num, dataset in enumerate(datasets): 148 | 149 | for set_name in ['train', 'val']: 150 | dataset[set_name] = dataset[set_name].astype('float32') 151 | 152 | # train 153 | fps_names = dataset[set_name].loc[:, dataset[set_name].columns.str.startswith('fingerprints')].columns.values.tolist() 154 | prop_names = dataset[set_name].loc[:, ~dataset[set_name].columns.str.startswith('fingerprints')].columns.values.tolist() 155 | 156 | fingerprints = dataset[set_name][fps_names] 157 | properties = dataset[set_name][prop_names] 158 | 159 | data = tf.data.Dataset.from_tensor_slices( 160 | (fingerprints.values, properties.to_dict("list")) 161 | ) 162 | data = data.cache().batch(conf['batchsize']).prefetch(tf.data.experimental.AUTOTUNE) 163 | datasets[num][set_name] = data 164 | 165 | return datasets -------------------------------------------------------------------------------- /mt_pack/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from pathlib import Path 3 | import shutil 4 | 5 | def get_checkpoint_log(conf, name): 6 | name = str(name) 7 | ckpt_path = str(conf["checkpoint_path"].joinpath(name, "single.checkpoint")) 8 | logdir = f"./logs/{name}/" 9 | 10 | Path(logdir).exists() and shutil.rmtree(logdir) 11 | return ckpt_path, logdir 12 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "mt_pack" 3 | version = "1.0.0" 4 | description = "Multi-Task Learner" 5 | authors = ["Christopher Kuenneth "] 6 | 7 | [tool.poetry.dependencies] 8 | python = ">=3.7,<3.8" 9 | pandas = "1.0.0" 10 | matplotlib = "3.1.2" 11 | numpy = "1.18.1" 12 | seaborn = "0.10.0" 13 | click = "7.0" 14 | tabulate = "0.8.6" 15 | tqdm = "4.42.1" 16 | tensorflow-addons = "0.10" 17 | pillow = "7.0.0" 18 | toml = "0.10.0" 19 | plotly = "4.5.2" 20 | nbformat = "4.4.0" 21 | deepdiff = "4.2.0" 22 | psutil = "5.7.0" 23 | notebook = "6.0.3" 24 | xgboost = "1.0.2" 25 | keras-tuner = "1.0.1" 26 | shap = "0.35.0" 27 | tensorflow = "2.3.1" 28 | scikit-learn = "0.24" 29 | protobuf = "3.20" 30 | 31 | [tool.poetry.dev-dependencies] 32 | pytest = "^4.6" 33 | black = "^19.10b0" 34 | line_profiler = "^3.0.2" 35 | pylint = "^2.4.4" 36 | pydot = "^1.4.1" 37 | graphviz = "^0.13.2" 38 | 39 | [tool.poetry.scripts] 40 | mtask = "mt_pack.multi_task:script" 41 | [build-system] 42 | requires = ["poetry>=0.12"] 43 | build-backend = "poetry.masonry.api" 44 | 45 | [virtualenvs] 46 | create = true 47 | in-project = true 48 | --------------------------------------------------------------------------------
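The GPR training command pickles the fitted model and both scalers under `checkpoints/gpr/<property>/<fold>/`, but no inference entry point is defined in the code shown here. A minimal sketch of reloading such a checkpoint for prediction is given below; it assumes the directory layout written by `gpr.py`, and the property name, fold index, and fingerprint values are placeholders.

```python
"""Minimal sketch: reuse a pickled GPR-ST checkpoint (assumes the layout written by gpr.py)."""
import pickle
from pathlib import Path

import numpy as np

# Placeholders: adjust the property name and fold index to match an actual training run.
ckpt = Path("checkpoints/gpr/Eat/0")

gpr = pickle.loads(ckpt.joinpath("gpr.pkl").read_bytes())                  # GaussianProcessRegressor
fp_scaler = pickle.loads(ckpt.joinpath("fp_scaler.pkl").read_bytes())      # MinMaxScaler fitted on fingerprints
prop_scaler = pickle.loads(ckpt.joinpath("prop_scaler.pkl").read_bytes())  # RobustScaler fitted on the property

# One polymer fingerprint row (placeholder values; columns must match the training fingerprints).
fingerprint = np.random.rand(1, fp_scaler.n_features_in_)

x = fp_scaler.transform(fingerprint)
y_scaled, y_std = gpr.predict(x, return_std=True)            # prediction in scaled property units
y = prop_scaler.inverse_transform(y_scaled.reshape(-1, 1))   # back to physical units
print(f"predicted property: {y.ravel()[0]:.3f} (std in scaled units: {y_std[0]:.3f})")
```

The neural-network training commands write the same two scaler pickles next to their checkpoints (for `nn_mt2.py` the property scaler is a per-property dict), so the scaling and inverse-scaling steps above carry over.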