├── mdml ├── mdml │ ├── __init__.py │ ├── __pycache__ │ │ ├── base.cpython-39.pyc │ │ ├── cli.cpython-39.pyc │ │ ├── model.cpython-39.pyc │ │ ├── plot.cpython-39.pyc │ │ └── __init__.cpython-39.pyc │ ├── plot.py │ ├── base.py │ └── cli.py ├── examples │ └── PD-L1 │ │ ├── lasso │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── linear │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── STDERR │ │ └── importance.csv │ │ ├── ridge │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── lasso_mean │ │ ├── STDOUT │ │ ├── model.pkl │ │ └── importance.csv │ │ ├── linear_mean │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── STDERR │ │ └── importance.csv │ │ ├── ridge_mean │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── random_forest │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── gradient_boosting │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── random_forest_mean │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── gradient_boosting_mean │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ └── README.md ├── setup.py ├── .gitignore ├── bin │ ├── polynomial_features │ ├── mdml_predict │ └── mdml_train └── README.md ├── bin ├── __pycache__ │ ├── mdfit_ligprep.cpython-38.pyc │ ├── mdfit_run_md.cpython-38.pyc │ ├── mdfit_build_box.cpython-38.pyc │ ├── mdfit_desmond_md.cpython-38.pyc │ ├── mdfit_ffbuilder.cpython-38.pyc │ ├── mdfit_get_charge.cpython-38.pyc │ ├── mdfit_initiate.cpython-38.pyc │ ├── mdfit_parseargs.cpython-38.pyc │ ├── mdfit_slicetrj.cpython-38.pyc │ ├── mdfit_cluster_traj.cpython-38.pyc │ ├── mdfit_combine_csvs.cpython-38.pyc │ ├── mdfit_extract_dat.cpython-38.pyc │ ├── mdfit_prep_complex.cpython-38.pyc │ ├── mdfit_read_params.cpython-38.pyc │ ├── mdfit_event_analysis.cpython-38.pyc │ ├── mdfit_glide_docking.cpython-38.pyc │ ├── mdfit_desmond_analysis.cpython-38.pyc │ └── 
mdfit_run_minimization.cpython-38.pyc ├── mdfit_run_md.py ├── mdfit_run_minimization.py ├── mdfit_read_params.py ├── mdfit_build_box.py ├── mdfit_get_charge.py ├── mdfit_slicetrj.py ├── mdfit_combine_csvs.py ├── mdfit_prep_complex.py ├── mdfit_initiate.py ├── mdfit_event_analysis.py ├── mdfit_parseargs.py ├── mdfit_cluster_traj.py ├── mdfit_ffbuilder.py ├── mdfit_desmond_analysis.py ├── mdfit_extract_dat.py └── mdfit_desmond_md.py ├── templates ├── neutral_template.msj ├── negative_template.msj ├── positive_template.msj ├── bmin_template.com ├── desmond_md_job_template.cfg └── desmond_md_job_template.msj ├── Examples └── PDL1 │ └── MDFit_PDL1_Commands.sh ├── LICENSE ├── README.md ├── .gitignore └── MDFit.py /mdml/mdml/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso_mean/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear_mean/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge_mean/STDOUT: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest_mean/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting_mean/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/lasso/model.pkl -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/linear/model.pkl -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/ridge/model.pkl -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso_mean/model.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/lasso_mean/model.pkl -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear_mean/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/linear_mean/model.pkl -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge_mean/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/ridge_mean/model.pkl -------------------------------------------------------------------------------- /mdml/mdml/__pycache__/base.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/mdml/__pycache__/base.cpython-39.pyc -------------------------------------------------------------------------------- /mdml/mdml/__pycache__/cli.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/mdml/__pycache__/cli.cpython-39.pyc -------------------------------------------------------------------------------- /mdml/mdml/__pycache__/model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/mdml/__pycache__/model.cpython-39.pyc -------------------------------------------------------------------------------- /mdml/mdml/__pycache__/plot.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/mdml/__pycache__/plot.cpython-39.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_ligprep.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_ligprep.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_run_md.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_run_md.cpython-38.pyc -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/random_forest/model.pkl -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_build_box.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_build_box.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_desmond_md.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_desmond_md.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_ffbuilder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_ffbuilder.cpython-38.pyc 
-------------------------------------------------------------------------------- /bin/__pycache__/mdfit_get_charge.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_get_charge.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_initiate.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_initiate.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_parseargs.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_parseargs.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_slicetrj.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_slicetrj.cpython-38.pyc -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/gradient_boosting/model.pkl -------------------------------------------------------------------------------- /mdml/mdml/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/mdml/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- 
/bin/__pycache__/mdfit_cluster_traj.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_cluster_traj.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_combine_csvs.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_combine_csvs.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_extract_dat.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_extract_dat.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_prep_complex.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_prep_complex.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_read_params.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_read_params.cpython-38.pyc -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest_mean/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/random_forest_mean/model.pkl -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_event_analysis.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_event_analysis.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_glide_docking.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_glide_docking.cpython-38.pyc -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting_mean/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/gradient_boosting_mean/model.pkl -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_desmond_analysis.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_desmond_analysis.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_run_minimization.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_run_minimization.cpython-38.pyc -------------------------------------------------------------------------------- /templates/neutral_template.msj: -------------------------------------------------------------------------------- 1 | task { 2 | task = "desmond:auto" 3 | } 4 | 5 | build_geometry { 6 | box = { 7 | shape = orthorhombic 8 | size = [10.0 10.0 10.0] 9 | size_type = buffer 10 | } 11 | rezero_system = true 12 | solvent = 13 | } 14 | 15 | assign_forcefield { 16 | forcefield = OPLS4 17 | water = 18 | } 19 | 
-------------------------------------------------------------------------------- /templates/negative_template.msj: -------------------------------------------------------------------------------- 1 | task { 2 | task = "desmond:auto" 3 | } 4 | 5 | build_geometry { 6 | add_counterion = { 7 | ion = Na 8 | number = neutralize_system 9 | } 10 | box = { 11 | shape = orthorhombic 12 | size = [10.0 10.0 10.0] 13 | size_type = buffer 14 | } 15 | rezero_system = true 16 | solvent = 17 | } 18 | 19 | assign_forcefield { 20 | forcefield = OPLS4 21 | water = 22 | } 23 | -------------------------------------------------------------------------------- /templates/positive_template.msj: -------------------------------------------------------------------------------- 1 | task { 2 | task = "desmond:auto" 3 | } 4 | 5 | build_geometry { 6 | add_counterion = { 7 | ion = Cl 8 | number = neutralize_system 9 | } 10 | box = { 11 | shape = orthorhombic 12 | size = [10.0 10.0 10.0] 13 | size_type = buffer 14 | } 15 | rezero_system = true 16 | solvent = 17 | } 18 | 19 | assign_forcefield { 20 | forcefield = OPLS4 21 | water = 22 | } 23 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear_mean/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 61 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to mean_linear/model.pkl 6 | DEBUG:__main__:Saving feature importance to mean_linear/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 19it [00:00, 172.20it/s] Molecule CV: 37it [00:00, 170.46it/s] Molecule CV: 55it [00:00, 98.05it/s] Molecule CV: 61it [00:00, 90.47it/s] 9 | DEBUG:__main__:Saving cross-validation results to mean_linear/cross_validation.json and mean_linear/cross_validation.svg 10 | 
-------------------------------------------------------------------------------- /Examples/PDL1/MDFit_PDL1_Commands.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ###Example MDFit command for 4 PD-L1 peptides 4 | ###Flag details: 5 | #Prepared protein file (-p 6PV9_PDL1.mae) 6 | #Prepared ligand file (-l MDFit_PDL1_Example_Ligands.mae) 7 | #Custom OPLS directory (-o "/Examples/PDL1/PDL1_oplsdir") 8 | #100 ns MD (-t 100000) 9 | #Three repetitions (-r 3) 10 | #Remove first 100 frames before analysis (--slice_start = 100) 11 | #Require interaction to exist for 30% of simulation (--analysis_cutoff 0.3) 12 | #Run with debug (-d) 13 | 14 | MDFit -p 6PV9_PDL1.mae -l MDFit_PDL1_Example_Ligands.mae -o "/Examples/PDL1/PDL1_oplsdir" -t 100000 -r 3 --slice_start 100 --analysis_cutoff 0.3 -d 15 | -------------------------------------------------------------------------------- /mdml/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='mdml', 5 | packages=['mdml'], 6 | version='0.1.0', 7 | author='Benjamin J. 
Shields', 8 | author_email='shields.benjamin.j@gmail.com', 9 | keywords=['MD', 'Machine Learning'], 10 | description='Simulation Fingerprint Machine Learning Models.', 11 | install_requires=[ 12 | 'pandas', 13 | 'numpy', 14 | 'scikit-learn', 15 | 'matplotlib', 16 | 'dill', 17 | 'pyarrow', 18 | 'tqdm' 19 | ], 20 | classifiers=[ 21 | 'Development Status 3 - Alpha', 22 | 'Intended Audience ScienceResearch', 23 | 'Topic ScientificEngineering Chemistry', 24 | 'Programming Language Python 3', 25 | ], 26 | scripts=[ 27 | 'bin/mdml_train', 28 | 'bin/mdml_predict', 29 | 'bin/polynomial_features' 30 | ] 31 | ) 32 | -------------------------------------------------------------------------------- /templates/bmin_template.com: -------------------------------------------------------------------------------- 1 | IN_NAME 2 | OUT_NAME 3 | MMOD 0 1 0 0 0.0000 0.0000 0.0000 0.0000 4 | DEBG 55 0 0 0 0.0000 0.0000 0.0000 0.0000 5 | FFLD 16 1 0 0 1.0000 0.0000 0.0000 0.0000 6 | SOLV 3 1 0 0 0.0000 0.0000 0.0000 0.0000 7 | EXNB 0 0 0 0 0.0000 0.0000 0.0000 0.0000 8 | BDCO 0 0 0 0 89.4427 99999.0000 0.0000 0.0000 9 | CRMS 0 0 0 0 0.0000 0.5000 0.0000 0.0000 10 | BGIN 0 0 0 0 0.0000 0.0000 0.0000 0.0000 11 | READ 0 0 0 0 0.0000 0.0000 0.0000 0.0000 12 | CONV 2 0 0 0 0.3000 0.0000 0.0000 0.0000 13 | MINI 1 0 500 0 0.0000 0.0000 0.0000 0.0000 14 | END 0 0 0 0 0.0000 0.0000 0.0000 0.0000 15 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 183 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to linear/model.pkl 6 | DEBUG:__main__:Saving feature importance to linear/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:00, 
5.50it/s] Molecule CV: 2it [00:00, 5.62it/s] Molecule CV: 4it [00:00, 9.64it/s] Molecule CV: 6it [00:00, 12.63it/s] Molecule CV: 9it [00:00, 13.57it/s] Molecule CV: 11it [00:01, 10.67it/s] Molecule CV: 13it [00:01, 12.44it/s] Molecule CV: 19it [00:01, 21.93it/s] Molecule CV: 22it [00:01, 22.04it/s] Molecule CV: 28it [00:01, 30.90it/s] Molecule CV: 36it [00:01, 37.93it/s] Molecule CV: 43it [00:01, 45.15it/s] Molecule CV: 48it [00:01, 39.26it/s] Molecule CV: 53it [00:02, 36.20it/s] Molecule CV: 57it [00:02, 32.07it/s] Molecule CV: 61it [00:02, 25.82it/s] 9 | DEBUG:__main__:Saving cross-validation results to linear/cross_validation.json and linear/cross_validation.svg 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Alex Brueckner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:76:WaterBridge,0.6355123047559382 3 | A:115:Hydrophobic,0.5916237096568172 4 | A:123:Hydrophobic,0.4833926972942506 5 | A:117:WaterBridge,0.22377663129471045 6 | A:168:HBond,0.0 7 | A:168:WaterBridge,0.0 8 | A:172:HBond,-0.0 9 | A:147:Hydrophobic,0.0 10 | A:223:HBond,0.0 11 | A:71:HBond,-0.0 12 | A:68:WaterBridge,-0.0 13 | A:72:WaterBridge,-0.0 14 | A:171:HBond,0.0 15 | A:170:HBond,0.0 16 | A:78:HBond,0.0 17 | A:170:WaterBridge,0.0 18 | A:171:WaterBridge,0.0 19 | A:75:WaterBridge,-0.0 20 | A:76:HBond,-0.0 21 | A:212:WaterBridge,0.0 22 | A:217:WaterBridge,0.0 23 | A:172:WaterBridge,0.0 24 | A:63:HBond,0.0 25 | A:66:HBond,-0.0 26 | A:223:WaterBridge,0.0 27 | A:113:Pi-Cation,-0.0 28 | A:56:Pi-Pi,0.0 29 | A:58:WaterBridge,0.0 30 | A:73:WaterBridge,0.0 31 | A:208:WaterBridge,0.0 32 | A:61:WaterBridge,-0.0 33 | A:71:WaterBridge,0.0 34 | A:69:WaterBridge,-0.0 35 | A:54:Hydrophobic,-0.052067423994442155 36 | A:76:Hydrophobic,-0.18879417402790047 37 | A:73:HBond,-0.325042493705559 38 | A:66:WaterBridge,-0.38101046942183686 39 | A:56:HBond,-0.41278607679517754 40 | A:68:Hydrophobic,-0.4245199811667292 41 | A:123:Pi-Pi,-0.7121283349718759 42 | A:56:Hydrophobic,-0.7219538291217276 43 | A:63:WaterBridge,-0.9572379222516725 44 | Strain_perHeavyAtom,-2.831340128684278 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso_mean/importance.csv: -------------------------------------------------------------------------------- 
1 | ,importance 2 | A:115:Hydrophobic,0.6458663480750456 3 | A:76:WaterBridge,0.590712865608462 4 | A:168:WaterBridge,0.49198032032786115 5 | A:123:Hydrophobic,0.24037176236872668 6 | A:117:WaterBridge,0.17455220407263283 7 | A:170:HBond,0.0 8 | A:147:Hydrophobic,0.0 9 | A:223:HBond,-0.0 10 | A:71:HBond,-0.0 11 | A:68:WaterBridge,-0.0 12 | A:72:WaterBridge,-0.0 13 | A:168:HBond,-0.0 14 | A:78:HBond,0.0 15 | A:171:HBond,0.0 16 | A:170:WaterBridge,0.0 17 | A:171:WaterBridge,0.0 18 | A:75:WaterBridge,-0.0 19 | A:76:HBond,0.0 20 | A:56:HBond,-0.0 21 | A:212:WaterBridge,0.0 22 | A:217:WaterBridge,0.0 23 | A:172:WaterBridge,0.0 24 | A:63:HBond,0.0 25 | A:66:HBond,-0.0 26 | A:73:WaterBridge,0.0 27 | A:69:WaterBridge,-0.0 28 | A:54:Hydrophobic,-0.0 29 | A:71:WaterBridge,0.0 30 | A:56:Pi-Pi,0.0 31 | A:223:WaterBridge,-0.0 32 | A:208:WaterBridge,0.0 33 | A:58:WaterBridge,0.0 34 | A:61:WaterBridge,0.0 35 | A:113:Pi-Cation,-0.15423218596949798 36 | A:172:HBond,-0.19804062054386898 37 | A:76:Hydrophobic,-0.27325988226858033 38 | A:73:HBond,-0.3051078974656791 39 | A:68:Hydrophobic,-0.6111311180996669 40 | A:56:Hydrophobic,-0.7426770159246496 41 | A:66:WaterBridge,-0.8482320040521754 42 | A:123:Pi-Pi,-0.8757172308977965 43 | A:63:WaterBridge,-0.9636928160030664 44 | Strain_perHeavyAtom,-2.781332741051147 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting_mean/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:123:Hydrophobic,0.2821060193868918 3 | Strain_perHeavyAtom,0.26318741427451153 4 | A:66:HBond,0.16461120969484933 5 | A:56:Hydrophobic,0.10060308280735433 6 | A:68:Hydrophobic,0.050545628596954655 7 | A:73:HBond,0.030331473020238076 8 | A:66:WaterBridge,0.023423991379755458 9 | A:63:HBond,0.02145866961207075 10 | A:115:Hydrophobic,0.01646546599911657 11 | A:73:WaterBridge,0.012748523805944661 12 | A:76:WaterBridge,0.010465799420052452 
13 | A:113:Pi-Cation,0.007126375620531979 14 | A:58:WaterBridge,0.006149712760844277 15 | A:63:WaterBridge,0.0046874728242335285 16 | A:54:Hydrophobic,0.0033729440743602364 17 | A:56:Pi-Pi,0.0023913282266896 18 | A:71:WaterBridge,0.00032488849560064777 19 | A:208:WaterBridge,0.0 20 | A:171:WaterBridge,0.0 21 | A:170:HBond,0.0 22 | A:171:HBond,0.0 23 | A:78:HBond,0.0 24 | A:170:WaterBridge,0.0 25 | A:56:HBond,0.0 26 | A:75:WaterBridge,0.0 27 | A:76:HBond,0.0 28 | A:72:WaterBridge,0.0 29 | A:212:WaterBridge,0.0 30 | A:217:WaterBridge,0.0 31 | A:168:HBond,0.0 32 | A:71:HBond,0.0 33 | A:68:WaterBridge,0.0 34 | A:223:WaterBridge,0.0 35 | A:223:HBond,0.0 36 | A:147:Hydrophobic,0.0 37 | A:76:Hydrophobic,0.0 38 | A:168:WaterBridge,0.0 39 | A:172:WaterBridge,0.0 40 | A:117:WaterBridge,0.0 41 | A:61:WaterBridge,0.0 42 | A:123:Pi-Pi,0.0 43 | A:69:WaterBridge,0.0 44 | A:172:HBond,0.0 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | Strain_perHeavyAtom,0.3431680590709321 3 | A:68:Hydrophobic,0.08498890033830075 4 | A:123:Hydrophobic,0.08306809724470618 5 | A:58:WaterBridge,0.06850514383039635 6 | A:56:Hydrophobic,0.05882407959669033 7 | A:113:Pi-Cation,0.0578694494148487 8 | A:73:WaterBridge,0.05579569415986692 9 | A:56:Pi-Pi,0.0475337961549079 10 | A:66:HBond,0.03530337434479803 11 | A:73:HBond,0.03458732388013048 12 | A:63:HBond,0.028567385483523607 13 | A:123:Pi-Pi,0.021029313853795566 14 | A:71:WaterBridge,0.016730898997036513 15 | A:61:WaterBridge,0.01577019639100805 16 | A:115:Hydrophobic,0.01423470779211056 17 | A:63:WaterBridge,0.009774057171253639 18 | A:66:WaterBridge,0.007342390451557968 19 | A:76:Hydrophobic,0.005420417705953217 20 | A:54:Hydrophobic,0.004645922697275947 21 | A:76:WaterBridge,0.003878368172745101 22 | A:117:WaterBridge,0.002897545736279258 23 | 
A:223:WaterBridge,6.437443250466106e-05 24 | A:172:WaterBridge,5.030793780147951e-07 25 | A:69:WaterBridge,0.0 26 | A:171:HBond,0.0 27 | A:217:WaterBridge,0.0 28 | A:212:WaterBridge,0.0 29 | A:56:HBond,0.0 30 | A:76:HBond,0.0 31 | A:75:WaterBridge,0.0 32 | A:171:WaterBridge,0.0 33 | A:170:WaterBridge,0.0 34 | A:78:HBond,0.0 35 | A:168:HBond,0.0 36 | A:170:HBond,0.0 37 | A:72:WaterBridge,0.0 38 | A:68:WaterBridge,0.0 39 | A:71:HBond,0.0 40 | A:208:WaterBridge,0.0 41 | A:147:Hydrophobic,0.0 42 | A:172:HBond,0.0 43 | A:168:WaterBridge,0.0 44 | A:223:HBond,0.0 45 | -------------------------------------------------------------------------------- /templates/desmond_md_job_template.cfg: -------------------------------------------------------------------------------- 1 | annealing = false 2 | backend = { 3 | } 4 | bigger_rclone = false 5 | checkpt = { 6 | first = 0.0 7 | interval = 240.06 8 | name = "$JOBNAME.cpt" 9 | write_last_step = true 10 | } 11 | cpu = 1 12 | cutoff_radius = 9.0 13 | elapsed_time = 0.0 14 | energy_group = false 15 | eneseq = { 16 | first = 0.0 17 | interval = 5.0 18 | name = "$JOBNAME$[_replica$REPLICA$].ene" 19 | } 20 | ensemble = { 21 | barostat = { 22 | tau = 2.0 23 | } 24 | class = NPT 25 | method = MTK 26 | thermostat = { 27 | tau = 1.0 28 | } 29 | } 30 | glue = solute 31 | maeff_output = { 32 | first = 0.0 33 | interval = 120.0 34 | name = "$JOBNAME$[_replica$REPLICA$]-out.cms" 35 | periodicfix = true 36 | trjdir = "$JOBNAME$[_replica$REPLICA$]_trj" 37 | } 38 | meta = false 39 | meta_file = ? 
40 | pressure = [1.01325 isotropic ] 41 | randomize_velocity = { 42 | first = 0.0 43 | interval = inf 44 | seed = RSEED 45 | temperature = "@*.temperature" 46 | } 47 | restrain = none 48 | simbox = { 49 | first = 0.0 50 | interval = 1.2 51 | name = "$JOBNAME$[_replica$REPLICA$]_simbox.dat" 52 | } 53 | surface_tension = 0.0 54 | taper = false 55 | temperature = [ 56 | [300.0 0 ] 57 | ] 58 | time = SIMTIME 59 | timestep = [0.002 0.002 0.006 ] 60 | trajectory = { 61 | center = [] 62 | first = 0.0 63 | format = dtr 64 | frames_per_file = 250 65 | interval = WRITEFRQ 66 | name = "$JOBNAME$[_replica$REPLICA$]_trj" 67 | periodicfix = true 68 | write_velocity = false 69 | } 70 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/README.md: -------------------------------------------------------------------------------- 1 | # PD-L1 Example 2 | 3 | The following commands will reproduce the PD-L1 modeling and feature importance from the MDFit paper. 4 | 5 | Evaluate models with duplicate SimFPs included. 6 | ``` 7 | mdml_train data.csv linear -nproc 10 -id_col Molecule -target_col pIC50 -model_type linear 8 | mdml_train data.csv ridge -nproc 10 -id_col Molecule -target_col pIC50 -model_type ridge 9 | mdml_train data.csv lasso -nproc 10 -id_col Molecule -target_col pIC50 -model_type lasso 10 | mdml_train data.csv random_forest -nproc 10 -id_col Molecule -target_col pIC50 -model_type random_forest 11 | mdml_train data.csv gradient_boosting -nproc 10 -id_col Molecule -target_col pIC50 -model_type gradient_boosting 12 | ``` 13 | 14 | Evaluate models with averaged SimFPs. 
15 | ``` 16 | mdml_train data.csv linear_mean -nproc 10 -id_col Molecule -target_col pIC50 -model_type linear -group mean 17 | mdml_train data.csv ridge_mean -nproc 10 -id_col Molecule -target_col pIC50 -model_type ridge -group mean 18 | mdml_train data.csv lasso_mean -nproc 10 -id_col Molecule -target_col pIC50 -model_type lasso -group mean 19 | mdml_train data.csv random_forest_mean -nproc 10 -id_col Molecule -target_col pIC50 -model_type random_forest -group mean 20 | mdml_train data.csv gradient_boosting_mean -nproc 10 -id_col Molecule -target_col pIC50 -model_type gradient_boosting -group mean 21 | ``` 22 | 23 | **Note:** Metrics (e.g., $Q^2$) in `cross_validation.json` are computed using all entries (duplicates included) 24 | while metrics in `cross_validation.svg` are computed using the average of the predictions with SimFPs from 25 | different simulations for each molecule. 26 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear_mean/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:115:Hydrophobic,3.4179049015847265 3 | A:56:HBond,3.0099879314819784 4 | A:208:WaterBridge,2.2566332935566313 5 | A:168:WaterBridge,1.738912227381602 6 | A:66:HBond,1.4748026123484463 7 | A:212:WaterBridge,1.3971231426306736 8 | A:123:Hydrophobic,0.9201853566910351 9 | A:117:WaterBridge,0.892846030163572 10 | A:76:WaterBridge,0.7716865020658565 11 | A:71:WaterBridge,0.7192231009122938 12 | A:63:HBond,0.6651088808964104 13 | A:168:HBond,0.6621585412820343 14 | A:172:WaterBridge,0.38829211645897277 15 | A:61:WaterBridge,0.27053382254314395 16 | A:73:WaterBridge,0.16993983811128135 17 | A:147:Hydrophobic,-0.23603163827478174 18 | A:72:WaterBridge,-0.295817760619532 19 | A:68:WaterBridge,-0.2958177606195367 20 | A:75:WaterBridge,-0.39128820753876475 21 | A:58:WaterBridge,-0.39655979386862417 22 | A:223:WaterBridge,-0.41118191846010677 23 | 
A:56:Hydrophobic,-0.44938163700422307 24 | A:170:HBond,-0.5870451891729009 25 | A:171:WaterBridge,-0.587045189172902 26 | A:170:WaterBridge,-0.5870451891729034 27 | A:171:HBond,-0.5870451891729043 28 | A:78:HBond,-0.5870451891729056 29 | A:56:Pi-Pi,-0.6825689511930271 30 | A:66:WaterBridge,-0.8319188984488853 31 | A:223:HBond,-0.8531626842457735 32 | A:54:Hydrophobic,-0.8567928166605197 33 | A:217:WaterBridge,-0.8639217454395056 34 | A:76:Hydrophobic,-0.9280147159776061 35 | A:172:HBond,-0.9977551305922331 36 | A:73:HBond,-1.0644968001332469 37 | A:69:WaterBridge,-1.1159989479455308 38 | A:123:Pi-Pi,-1.195543598779107 39 | A:76:HBond,-1.4885807597267475 40 | A:63:WaterBridge,-1.5556880962561483 41 | A:71:HBond,-2.079082284457193 42 | A:68:Hydrophobic,-2.861527931795211 43 | Strain_perHeavyAtom,-4.030727120928537 44 | A:113:Pi-Cation,-4.603684193032383 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:115:Hydrophobic,1.2120060469302938 3 | A:212:WaterBridge,1.168176577723823 4 | A:168:WaterBridge,1.0865861471438563 5 | A:72:WaterBridge,0.9529641147783804 6 | A:147:Hydrophobic,0.8651040669830239 7 | A:73:WaterBridge,0.7580069156315616 8 | A:63:HBond,0.6635612489400315 9 | A:123:Hydrophobic,0.6560141197442709 10 | A:117:WaterBridge,0.5119784911486394 11 | A:208:WaterBridge,0.48906566692723774 12 | A:76:WaterBridge,0.4577167199866073 13 | A:168:HBond,0.42643379090330547 14 | A:172:WaterBridge,0.38628544472597476 15 | A:71:WaterBridge,0.22892684138671585 16 | A:223:WaterBridge,0.20984114039285134 17 | A:71:HBond,0.19348395983266614 18 | A:58:WaterBridge,0.1603143364407486 19 | A:171:WaterBridge,0.003791040265698497 20 | A:78:HBond,0.003791040265698275 21 | A:170:HBond,0.0037910402656928348 22 | A:170:WaterBridge,0.0037910402656875326 23 | A:171:HBond,0.003791040265687477 24 | 
A:61:WaterBridge,-0.07425602996985581 25 | A:66:HBond,-0.07585524033968483 26 | A:217:WaterBridge,-0.08138211203458502 27 | A:113:Pi-Cation,-0.1854874569122909 28 | A:223:HBond,-0.21080863429609223 29 | A:54:Hydrophobic,-0.2800521847061306 30 | A:56:Pi-Pi,-0.37513306595199836 31 | A:76:Hydrophobic,-0.41461902537342826 32 | A:66:WaterBridge,-0.4241472684941725 33 | A:75:WaterBridge,-0.4403105143780974 34 | A:56:HBond,-0.4471027457766443 35 | A:172:HBond,-0.4952251317488825 36 | A:69:WaterBridge,-0.5333669660366571 37 | A:76:HBond,-0.5713781299200659 38 | A:68:Hydrophobic,-0.7192994606837416 39 | A:73:HBond,-0.757435393881754 40 | A:63:WaterBridge,-0.8664388895174614 41 | A:123:Pi-Pi,-0.8880929506186971 42 | A:56:Hydrophobic,-1.0181739818333204 43 | A:68:WaterBridge,-1.739478706104431 44 | Strain_perHeavyAtom,-3.3243059445607055 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:115:Hydrophobic,0.7771269646842853 3 | A:123:Hydrophobic,0.6210610895519609 4 | A:76:WaterBridge,0.6089400075223189 5 | A:168:WaterBridge,0.5574300132399377 6 | A:117:WaterBridge,0.48059961567284404 7 | A:212:WaterBridge,0.40212521718288974 8 | A:73:WaterBridge,0.3478662428863717 9 | A:147:Hydrophobic,0.28854630058530073 10 | A:63:HBond,0.2272329456223205 11 | A:58:WaterBridge,0.18269862738738732 12 | A:172:WaterBridge,0.17630182718295317 13 | A:71:WaterBridge,0.15727243436725064 14 | A:208:WaterBridge,0.14223661094999115 15 | A:168:HBond,0.1290173710046051 16 | A:217:WaterBridge,0.12215918755110237 17 | A:223:WaterBridge,0.11630187952201819 18 | A:170:HBond,0.07471694786013153 19 | A:171:WaterBridge,0.07471694786013142 20 | A:170:WaterBridge,0.07471694786013136 21 | A:78:HBond,0.07471694786013133 22 | A:171:HBond,0.07471694786013132 23 | A:56:Pi-Pi,0.06214510366123463 24 | A:223:HBond,0.04517123540098317 25 | 
A:72:WaterBridge,-0.037306096238321006 26 | A:76:HBond,-0.07464996514888496 27 | A:71:HBond,-0.127599967988499 28 | A:75:WaterBridge,-0.12795018988433315 29 | A:61:WaterBridge,-0.13488085583473367 30 | A:113:Pi-Cation,-0.1724924494297132 31 | A:54:Hydrophobic,-0.18216700507201314 32 | A:172:HBond,-0.22790639579740693 33 | A:69:WaterBridge,-0.24592228065556734 34 | A:66:HBond,-0.25496888863217887 35 | A:76:Hydrophobic,-0.2782366980225123 36 | A:68:WaterBridge,-0.2909295771887263 37 | A:66:WaterBridge,-0.47955004808987833 38 | A:56:HBond,-0.48296469382299007 39 | A:73:HBond,-0.5643187359447307 40 | A:56:Hydrophobic,-0.6457807943752559 41 | A:68:Hydrophobic,-0.6552799976029455 42 | A:123:Pi-Pi,-0.7164696676908707 43 | A:63:WaterBridge,-0.8507228121803795 44 | Strain_perHeavyAtom,-2.3851627198597454 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge_mean/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:115:Hydrophobic,0.7299994676201763 3 | A:168:WaterBridge,0.6589317315664275 4 | A:76:WaterBridge,0.5581218314756109 5 | A:123:Hydrophobic,0.4704127905526515 6 | A:117:WaterBridge,0.4629728122352017 7 | A:212:WaterBridge,0.27553742311804563 8 | A:58:WaterBridge,0.2698383136154038 9 | A:56:Pi-Pi,0.23132274040074408 10 | A:63:HBond,0.19101828576556776 11 | A:147:Hydrophobic,0.18365680869013698 12 | A:217:WaterBridge,0.1780485363227274 13 | A:71:WaterBridge,0.1691649988414725 14 | A:73:WaterBridge,0.15895865792798008 15 | A:170:HBond,0.06741611732077547 16 | A:171:WaterBridge,0.06741611732077545 17 | A:170:WaterBridge,0.06741611732077545 18 | A:171:HBond,0.06741611732077533 19 | A:78:HBond,0.06741611732077529 20 | A:76:HBond,0.06338237997265034 21 | A:223:HBond,0.053391809815708555 22 | A:172:WaterBridge,0.03854497117463989 23 | A:61:WaterBridge,0.01668464997454458 24 | A:168:HBond,0.008529850847597782 25 | A:208:WaterBridge,-0.04557491968229436 
26 | A:69:WaterBridge,-0.06956416572147074 27 | A:71:HBond,-0.1303990176825092 28 | A:72:WaterBridge,-0.14170557500168474 29 | A:68:WaterBridge,-0.14170557500168485 30 | A:54:Hydrophobic,-0.1823986543381202 31 | A:75:WaterBridge,-0.19792939804721865 32 | A:223:WaterBridge,-0.2251929776623398 33 | A:172:HBond,-0.27535920536088293 34 | A:66:HBond,-0.33686245854134406 35 | A:56:HBond,-0.35056770066808557 36 | A:76:Hydrophobic,-0.3891756666186929 37 | A:63:WaterBridge,-0.43982142871192126 38 | A:113:Pi-Cation,-0.45948548374961407 39 | A:56:Hydrophobic,-0.5472845686931 40 | A:73:HBond,-0.5987879917005361 41 | A:123:Pi-Pi,-0.7662767999190696 42 | A:68:Hydrophobic,-0.9491591909424791 43 | A:66:WaterBridge,-0.9778540167790951 44 | Strain_perHeavyAtom,-1.8010590595672384 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | Strain_perHeavyAtom,0.30860971827157935 3 | A:68:Hydrophobic,0.08001172921982987 4 | A:123:Hydrophobic,0.07715452869124684 5 | A:113:Pi-Cation,0.07077788988811898 6 | A:56:Hydrophobic,0.06629369560444234 7 | A:58:WaterBridge,0.04896082890640077 8 | A:66:HBond,0.048801772310769966 9 | A:115:Hydrophobic,0.04644891128863812 10 | A:56:Pi-Pi,0.03424488250882641 11 | A:73:HBond,0.03136362199068667 12 | A:73:WaterBridge,0.027876884140904053 13 | A:63:HBond,0.026941986205677525 14 | A:123:Pi-Pi,0.02686597995865185 15 | A:66:WaterBridge,0.026625352074648258 16 | A:71:WaterBridge,0.02566507104440312 17 | A:61:WaterBridge,0.01858699133485243 18 | A:76:Hydrophobic,0.012686436202685678 19 | A:63:WaterBridge,0.006312631543299066 20 | A:54:Hydrophobic,0.005471708581803852 21 | A:76:WaterBridge,0.0036530024544970284 22 | A:56:HBond,0.0032085902816660654 23 | A:117:WaterBridge,0.0010818161281157467 24 | A:223:WaterBridge,0.0006955289819360857 25 | A:172:WaterBridge,0.0002604124350259707 26 
| A:208:WaterBridge,0.00023179598795874113 27 | A:71:HBond,0.0002157967496199828 28 | A:76:HBond,0.00019343692532114285 29 | A:168:HBond,0.0001784188731492803 30 | A:68:WaterBridge,0.00015537415539062873 31 | A:172:HBond,0.0001261767175569754 32 | A:168:WaterBridge,0.00012313980832576857 33 | A:212:WaterBridge,4.9421997946330464e-05 34 | A:217:WaterBridge,4.537083895770855e-05 35 | A:75:WaterBridge,3.787606086354798e-05 36 | A:147:Hydrophobic,2.7658529979795518e-05 37 | A:170:WaterBridge,8.978550407620781e-06 38 | A:170:HBond,4.3284678571984724e-06 39 | A:72:WaterBridge,1.4129122784170004e-06 40 | A:171:HBond,8.137160234610557e-07 41 | A:223:HBond,2.490689120873583e-08 42 | A:69:WaterBridge,4.75276622705658e-09 43 | A:171:WaterBridge,0.0 44 | A:78:HBond,0.0 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest_mean/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | Strain_perHeavyAtom,0.2125836786086668 3 | A:123:Hydrophobic,0.20602398576046763 4 | A:66:HBond,0.10624292160645096 5 | A:56:Hydrophobic,0.09805387490680063 6 | A:68:Hydrophobic,0.06950713857033447 7 | A:66:WaterBridge,0.04686196685383211 8 | A:113:Pi-Cation,0.04533804482304102 9 | A:115:Hydrophobic,0.03594467618337902 10 | A:71:WaterBridge,0.027027930926490154 11 | A:63:HBond,0.02477567376923509 12 | A:123:Pi-Pi,0.02353756293551733 13 | A:56:Pi-Pi,0.017746141510603216 14 | A:58:WaterBridge,0.01765104282246242 15 | A:73:HBond,0.016441771355816245 16 | A:61:WaterBridge,0.014115551294360444 17 | A:73:WaterBridge,0.013948703421524447 18 | A:76:Hydrophobic,0.009416816299792036 19 | A:54:Hydrophobic,0.00623412286252767 20 | A:223:WaterBridge,0.001286130593757802 21 | A:217:WaterBridge,0.0010723045600818048 22 | A:63:WaterBridge,0.0009150821291903253 23 | A:172:WaterBridge,0.0008452712591777811 24 | A:117:WaterBridge,0.0007361624896637547 25 | A:56:HBond,0.0006333045115570916 26 
| A:69:WaterBridge,0.0005595236851902069 27 | A:223:HBond,0.00045089630275727856 28 | A:172:HBond,0.00039886955842963456 29 | A:168:WaterBridge,0.00036978891593451407 30 | A:168:HBond,0.0003242875196668843 31 | A:68:WaterBridge,0.0003199809489816599 32 | A:76:WaterBridge,0.0002786035594121451 33 | A:71:HBond,0.0001881254027270712 34 | A:208:WaterBridge,9.087622577976781e-05 35 | A:76:HBond,2.9612756342381666e-05 36 | A:212:WaterBridge,1.960617059264951e-05 37 | A:147:Hydrophobic,1.5013039259273214e-05 38 | A:72:WaterBridge,3.768469760156854e-06 39 | A:171:WaterBridge,3.501386338798146e-06 40 | A:171:HBond,2.330072877036284e-06 41 | A:170:WaterBridge,2.0384057412636837e-06 42 | A:170:HBond,1.8026448436845405e-06 43 | A:78:HBond,1.2133103595032605e-06 44 | A:75:WaterBridge,3.0157027591189784e-07 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MDFit 2 | Python wrapper for high-throughput molecular dynamics. A workflow overview and application of MDFit to a data set of macrocyclic peptides targetting PD-L1 are discussed in _ChemRxiv_.[^1] 3 | [^1]: [MDFit: Automated molecular simulations workflow enables high throughput assessment of ligands-protein dynamics](https://doi.org/10.26434/chemrxiv-2024-gfcqx) 4 | 5 | MDFit currently uses Schrodinger tools. Implementation of alternatives, including open-source tools, are ongoing. 6 | # Prerequisites 7 | MDFit assumes the `$SCHRODINGER` environmental variable has been set. This should point to the current Schrodinger installation. To check if `$SCHRODINGER` has been set correctly, try running: `$SCHRODINGER/run -h` 8 | 9 | 10 | MDFit attempts to get the current Schrodinger release by reading the `$SCHRODINGER` pathname. For example, if the current release is installed in `/schrodinger/2023-2/`, MDFit will set the release to 2023-2. 
This value can also be hard-coded in MDFit (line 38) if a different directory naming scheme is used. 11 | 12 | The first time MDFit.py is called, a `parameters_TEMPLATE.json` file is generated in the installation directory. Replace `localhost` with your institution's Schrodinger hostnames and rename the file to `parameters.json`. This is required only once and MDFit will always read `parameters.json` to get host information on subsequent runs. General runtime limit guidance: 13 | ``` 14 | FFBUILDER 10 hours 15 | BMIN 2 hours 16 | MULTISIM 2 hours 17 | DESMOND 24 hours 18 | ANALYSIS 8 hours 19 | ``` 20 | # Usage 21 | ``` 22 | $SCHRODINGER/run python3 MDFit.py -h 23 | ``` 24 | Self-contained example available in `MDFit/Examples/PDL1/`. The following command will run FFBuilder, three repetitions of 100 ns Desmond MD, and MD analysis for Pep-01, Pep-41, Pep-52, and Pep-66. The first 100 frames will be removed from the trajectory before analysis (`--slice_start`) and the cutoff for retaining a protein-ligand interaction is 0.3 (`--analysis_cutoff`). 25 | ``` 26 | $SCHRODINGER/run python3 MDFit.py -p 6PV9_PDL1.mae -l MDFit_PDL1_Example_Ligands.mae -o "MDFit/Examples/PDL1/PDL1_oplsdir" -t 100000 -r 3 --slice_start 100 --analysis_cutoff 0.3 -d 27 | ``` 28 | It is strongly encouraged to use the debug flag `-d` for initial MDFit usage. Errors may occur if packages are not where MDFit expects them to be. 29 | 30 | 31 | # Bugs and Known Errors 32 | + Schrodinger release relying on installation pathname. 
33 | -------------------------------------------------------------------------------- /mdml/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Mac 7 | .DS_Store 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /mdml/bin/polynomial_features: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Polynomial Features 4 | ------------------- 5 | A helper script to include include squared and interaction terms (polynomial 6 | features) to SimFPs. 7 | 8 | @author: Benjamin Shields 9 | @email: benjamin.shields@bms.com 10 | """ 11 | 12 | ############################################################################## Imports 13 | 14 | import pandas as pd 15 | from mdml import cli 16 | from sklearn.preprocessing import PolynomialFeatures 17 | 18 | ############################################################################## Interface 19 | 20 | def get_parser(): 21 | description = """Add polynomial features to a SimFP output CSV.""" 22 | parser, groups = cli.parser(description) 23 | opts = parser.add_argument_group("OPTIONS") 24 | opts.add_argument( 25 | '-target_col', 26 | type=str, 27 | default=None, 28 | help="Name of target column in training data CSV.", 29 | ) 30 | opts.add_argument( 31 | '-drop_col', 32 | nargs='+', 33 | type=str, 34 | default=[], 35 | help="Columns that should be removed from the training CSV." 
36 | ) 37 | required = parser.add_argument_group("REQUIRED") 38 | required.add_argument( 39 | 'input', 40 | help="Path to CSV containing features, IDs, and target (optional).", 41 | ) 42 | required.add_argument( 43 | 'output', 44 | help="Path to save output CSV.", 45 | ) 46 | required.add_argument( 47 | '-id_col', 48 | type=str, 49 | required=True, 50 | help="Name of compound ID column.", 51 | ) 52 | 53 | return parser 54 | 55 | ############################################################################## Main 56 | 57 | def main(args): 58 | data = cli.load_data(args.input, args.id_col, drop=args.drop_col) 59 | drop = ['ID'] 60 | if args.target_col is not None: 61 | drop.append(args.target_col) 62 | X = data.copy().drop(drop, axis=1) 63 | poly = PolynomialFeatures( 64 | degree=2, 65 | include_bias=False, 66 | interaction_only=False 67 | ) 68 | X = pd.DataFrame( 69 | poly.fit_transform(X), 70 | columns=poly.get_feature_names_out(), 71 | index=data.index 72 | ) 73 | if args.target_col is not None: 74 | X[args.target_col] = data[args.target_col] 75 | X.insert(0, args.id_col, data['ID'].values) 76 | X.to_csv(args.output, index=False) 77 | 78 | 79 | if __name__ == "__main__": 80 | parser = get_parser() 81 | args = parser.parse_args() 82 | main(args) 83 | 84 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 183 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to lasso/model.pkl 6 | DEBUG:__main__:Saving feature importance to lasso/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:07, 7.34s/it] Molecule CV: 2it [00:14, 7.24s/it] Molecule CV: 3it [00:21, 7.19s/it] Molecule CV: 4it [00:28, 7.19s/it] Molecule 
CV: 5it [00:35, 7.17s/it] Molecule CV: 6it [00:43, 7.18s/it] Molecule CV: 7it [00:50, 7.17s/it] Molecule CV: 8it [00:57, 7.16s/it] Molecule CV: 9it [01:04, 7.17s/it] Molecule CV: 10it [01:11, 7.15s/it] Molecule CV: 11it [01:18, 7.14s/it] Molecule CV: 12it [01:26, 7.23s/it] Molecule CV: 13it [01:33, 7.22s/it] Molecule CV: 14it [01:40, 7.24s/it] Molecule CV: 15it [01:48, 7.24s/it] Molecule CV: 16it [01:55, 7.19s/it] Molecule CV: 17it [02:02, 7.19s/it] Molecule CV: 18it [02:09, 7.17s/it] Molecule CV: 19it [02:16, 7.15s/it] Molecule CV: 20it [02:24, 7.26s/it] Molecule CV: 21it [02:31, 7.34s/it] Molecule CV: 22it [02:38, 7.25s/it] Molecule CV: 23it [02:45, 7.25s/it] Molecule CV: 24it [02:53, 7.25s/it] Molecule CV: 25it [03:00, 7.22s/it] Molecule CV: 26it [03:07, 7.20s/it] Molecule CV: 27it [03:14, 7.17s/it] Molecule CV: 28it [03:21, 7.19s/it] Molecule CV: 29it [03:28, 7.18s/it] Molecule CV: 30it [03:36, 7.21s/it] Molecule CV: 31it [03:43, 7.23s/it] Molecule CV: 32it [03:50, 7.29s/it] Molecule CV: 33it [03:58, 7.24s/it] Molecule CV: 34it [04:05, 7.20s/it] Molecule CV: 35it [04:12, 7.17s/it] Molecule CV: 36it [04:19, 7.11s/it] Molecule CV: 37it [04:26, 7.07s/it] Molecule CV: 38it [04:33, 7.11s/it] Molecule CV: 39it [04:40, 7.10s/it] Molecule CV: 40it [04:47, 7.08s/it] Molecule CV: 41it [04:54, 7.13s/it] Molecule CV: 42it [05:02, 7.17s/it] Molecule CV: 43it [05:08, 7.10s/it] Molecule CV: 44it [05:15, 7.03s/it] Molecule CV: 45it [05:23, 7.15s/it] Molecule CV: 46it [05:30, 7.19s/it] Molecule CV: 47it [05:37, 7.18s/it] Molecule CV: 48it [05:44, 7.19s/it] Molecule CV: 49it [05:52, 7.17s/it] Molecule CV: 50it [05:59, 7.27s/it] Molecule CV: 51it [06:06, 7.24s/it] Molecule CV: 52it [06:13, 7.26s/it] Molecule CV: 53it [06:21, 7.25s/it] Molecule CV: 54it [06:28, 7.24s/it] Molecule CV: 55it [06:35, 7.23s/it] Molecule CV: 56it [06:42, 7.22s/it] Molecule CV: 57it [06:50, 7.23s/it] Molecule CV: 58it [06:57, 7.21s/it] Molecule CV: 59it [07:04, 7.26s/it] Molecule CV: 60it [07:11, 
7.29s/it] Molecule CV: 61it [07:19, 7.31s/it] Molecule CV: 61it [07:19, 7.20s/it] 9 | DEBUG:__main__:Saving cross-validation results to lasso/cross_validation.json and lasso/cross_validation.svg 10 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 183 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to ridge/model.pkl 6 | DEBUG:__main__:Saving feature importance to ridge/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:11, 11.21s/it] Molecule CV: 2it [00:22, 11.16s/it] Molecule CV: 3it [00:33, 11.11s/it] Molecule CV: 4it [00:44, 11.09s/it] Molecule CV: 5it [00:55, 11.06s/it] Molecule CV: 6it [01:06, 11.05s/it] Molecule CV: 7it [01:17, 11.11s/it] Molecule CV: 8it [01:28, 11.12s/it] Molecule CV: 9it [01:40, 11.13s/it] Molecule CV: 10it [01:51, 11.12s/it] Molecule CV: 11it [02:02, 11.32s/it] Molecule CV: 12it [02:13, 11.24s/it] Molecule CV: 13it [02:24, 11.19s/it] Molecule CV: 14it [02:36, 11.15s/it] Molecule CV: 15it [02:47, 11.16s/it] Molecule CV: 16it [02:58, 11.15s/it] Molecule CV: 17it [03:09, 11.14s/it] Molecule CV: 18it [03:20, 11.19s/it] Molecule CV: 19it [03:31, 11.15s/it] Molecule CV: 20it [03:42, 11.14s/it] Molecule CV: 21it [03:54, 11.14s/it] Molecule CV: 22it [04:05, 11.10s/it] Molecule CV: 23it [04:16, 11.09s/it] Molecule CV: 24it [04:27, 11.11s/it] Molecule CV: 25it [04:38, 11.13s/it] Molecule CV: 26it [04:49, 11.14s/it] Molecule CV: 27it [05:00, 11.15s/it] Molecule CV: 28it [05:11, 11.12s/it] Molecule CV: 29it [05:23, 11.14s/it] Molecule CV: 30it [05:34, 11.15s/it] Molecule CV: 31it [05:45, 11.14s/it] Molecule CV: 32it [05:56, 11.28s/it] Molecule CV: 33it [06:08, 11.23s/it] Molecule 
CV: 34it [06:19, 11.40s/it] Molecule CV: 35it [06:30, 11.31s/it] Molecule CV: 36it [06:42, 11.26s/it] Molecule CV: 37it [06:53, 11.25s/it] Molecule CV: 38it [07:04, 11.25s/it] Molecule CV: 39it [07:15, 11.21s/it] Molecule CV: 40it [07:26, 11.16s/it] Molecule CV: 41it [07:37, 11.13s/it] Molecule CV: 42it [07:48, 11.12s/it] Molecule CV: 43it [08:00, 11.12s/it] Molecule CV: 44it [08:11, 11.09s/it] Molecule CV: 45it [08:22, 11.11s/it] Molecule CV: 46it [08:33, 11.11s/it] Molecule CV: 47it [08:44, 11.09s/it] Molecule CV: 48it [08:55, 11.08s/it] Molecule CV: 49it [09:06, 11.08s/it] Molecule CV: 50it [09:17, 11.11s/it] Molecule CV: 51it [09:28, 11.10s/it] Molecule CV: 52it [09:39, 11.12s/it] Molecule CV: 53it [09:51, 11.12s/it] Molecule CV: 54it [10:02, 11.09s/it] Molecule CV: 55it [10:13, 11.09s/it] Molecule CV: 56it [10:24, 11.08s/it] Molecule CV: 57it [10:35, 11.11s/it] Molecule CV: 58it [10:46, 11.11s/it] Molecule CV: 59it [10:57, 11.09s/it] Molecule CV: 60it [11:08, 11.09s/it] Molecule CV: 61it [11:19, 11.09s/it] Molecule CV: 61it [11:19, 11.14s/it] 9 | DEBUG:__main__:Saving cross-validation results to ridge/cross_validation.json and ridge/cross_validation.svg 10 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge_mean/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 61 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to mean_ridge/model.pkl 6 | DEBUG:__main__:Saving feature importance to mean_ridge/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:03, 3.30s/it] Molecule CV: 2it [00:06, 3.22s/it] Molecule CV: 3it [00:09, 3.23s/it] Molecule CV: 4it [00:12, 3.19s/it] Molecule CV: 5it [00:16, 3.26s/it] Molecule CV: 6it [00:19, 3.20s/it] Molecule CV: 7it 
[00:22, 3.11s/it] Molecule CV: 8it [00:25, 3.11s/it] Molecule CV: 9it [00:28, 3.09s/it] Molecule CV: 10it [00:31, 3.16s/it] Molecule CV: 11it [00:34, 3.11s/it] Molecule CV: 12it [00:37, 3.13s/it] Molecule CV: 13it [00:40, 3.08s/it] Molecule CV: 14it [00:43, 3.10s/it] Molecule CV: 15it [00:47, 3.08s/it] Molecule CV: 16it [00:49, 3.02s/it] Molecule CV: 17it [00:53, 3.06s/it] Molecule CV: 18it [00:56, 3.06s/it] Molecule CV: 19it [00:59, 3.16s/it] Molecule CV: 20it [01:02, 3.14s/it] Molecule CV: 21it [01:05, 3.16s/it] Molecule CV: 22it [01:08, 3.12s/it] Molecule CV: 23it [01:12, 3.15s/it] Molecule CV: 24it [01:15, 3.21s/it] Molecule CV: 25it [01:18, 3.17s/it] Molecule CV: 26it [01:21, 3.23s/it] Molecule CV: 27it [01:24, 3.19s/it] Molecule CV: 28it [01:28, 3.22s/it] Molecule CV: 29it [01:31, 3.18s/it] Molecule CV: 30it [01:34, 3.26s/it] Molecule CV: 31it [01:37, 3.20s/it] Molecule CV: 32it [01:40, 3.19s/it] Molecule CV: 33it [01:44, 3.17s/it] Molecule CV: 34it [01:47, 3.12s/it] Molecule CV: 35it [01:50, 3.13s/it] Molecule CV: 36it [01:53, 3.09s/it] Molecule CV: 37it [01:56, 3.09s/it] Molecule CV: 38it [01:59, 2.99s/it] Molecule CV: 39it [02:02, 3.08s/it] Molecule CV: 40it [02:05, 3.09s/it] Molecule CV: 41it [02:08, 3.16s/it] Molecule CV: 42it [02:11, 3.14s/it] Molecule CV: 43it [02:14, 3.09s/it] Molecule CV: 44it [02:18, 3.15s/it] Molecule CV: 45it [02:21, 3.13s/it] Molecule CV: 46it [02:24, 3.15s/it] Molecule CV: 47it [02:27, 3.12s/it] Molecule CV: 48it [02:30, 3.17s/it] Molecule CV: 49it [02:33, 3.14s/it] Molecule CV: 50it [02:36, 3.09s/it] Molecule CV: 51it [02:40, 3.14s/it] Molecule CV: 52it [02:43, 3.14s/it] Molecule CV: 53it [02:46, 3.24s/it] Molecule CV: 54it [02:49, 3.21s/it] Molecule CV: 55it [02:53, 3.26s/it] Molecule CV: 56it [02:56, 3.23s/it] Molecule CV: 57it [02:59, 3.25s/it] Molecule CV: 58it [03:03, 3.28s/it] Molecule CV: 59it [03:06, 3.29s/it] Molecule CV: 60it [03:09, 3.29s/it] Molecule CV: 61it [03:12, 3.27s/it] Molecule CV: 61it [03:12, 3.16s/it] 9 | 
DEBUG:__main__:Saving cross-validation results to mean_ridge/cross_validation.json and mean_ridge/cross_validation.svg 10 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 183 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to random_forest/model.pkl 6 | DEBUG:__main__:Saving feature importance to random_forest/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:54, 54.09s/it] Molecule CV: 2it [01:46, 52.81s/it] Molecule CV: 3it [02:37, 52.30s/it] Molecule CV: 4it [03:29, 52.05s/it] Molecule CV: 5it [04:21, 51.97s/it] Molecule CV: 6it [05:13, 51.95s/it] Molecule CV: 7it [06:04, 51.86s/it] Molecule CV: 8it [06:57, 51.98s/it] Molecule CV: 9it [07:49, 52.02s/it] Molecule CV: 10it [08:41, 52.00s/it] Molecule CV: 11it [09:32, 51.90s/it] Molecule CV: 12it [10:24, 51.92s/it] Molecule CV: 13it [11:17, 52.27s/it] Molecule CV: 14it [12:09, 51.96s/it] Molecule CV: 15it [13:00, 51.86s/it] Molecule CV: 16it [13:52, 51.92s/it] Molecule CV: 17it [14:44, 51.76s/it] Molecule CV: 18it [15:35, 51.78s/it] Molecule CV: 19it [16:27, 51.82s/it] Molecule CV: 20it [17:19, 51.80s/it] Molecule CV: 21it [18:11, 51.81s/it] Molecule CV: 22it [19:02, 51.63s/it] Molecule CV: 23it [19:54, 51.65s/it] Molecule CV: 24it [20:46, 51.69s/it] Molecule CV: 25it [21:39, 52.23s/it] Molecule CV: 26it [22:31, 52.25s/it] Molecule CV: 27it [23:24, 52.47s/it] Molecule CV: 28it [24:17, 52.37s/it] Molecule CV: 29it [25:09, 52.40s/it] Molecule CV: 30it [26:01, 52.27s/it] Molecule CV: 31it [26:54, 52.44s/it] Molecule CV: 32it [27:47, 52.74s/it] Molecule CV: 33it [28:40, 52.79s/it] Molecule CV: 34it [29:33, 52.74s/it] Molecule CV: 35it [30:26, 
52.77s/it] Molecule CV: 36it [31:20, 53.18s/it] Molecule CV: 37it [32:13, 53.29s/it] Molecule CV: 38it [33:06, 53.18s/it] Molecule CV: 39it [34:00, 53.40s/it] Molecule CV: 40it [34:53, 53.14s/it] Molecule CV: 41it [35:45, 52.95s/it] Molecule CV: 42it [36:38, 52.88s/it] Molecule CV: 43it [37:30, 52.67s/it] Molecule CV: 44it [38:23, 52.60s/it] Molecule CV: 45it [39:15, 52.50s/it] Molecule CV: 46it [40:07, 52.38s/it] Molecule CV: 47it [40:59, 52.32s/it] Molecule CV: 48it [41:53, 52.73s/it] Molecule CV: 49it [42:45, 52.68s/it] Molecule CV: 50it [43:38, 52.60s/it] Molecule CV: 51it [44:31, 52.74s/it] Molecule CV: 52it [45:24, 52.95s/it] Molecule CV: 53it [46:17, 52.87s/it] Molecule CV: 54it [47:09, 52.71s/it] Molecule CV: 55it [48:03, 53.12s/it] Molecule CV: 56it [48:56, 52.91s/it] Molecule CV: 57it [49:48, 52.81s/it] Molecule CV: 58it [50:41, 52.77s/it] Molecule CV: 59it [51:36, 53.40s/it] Molecule CV: 60it [52:30, 53.65s/it] Molecule CV: 61it [53:22, 53.24s/it] Molecule CV: 61it [53:22, 52.51s/it] 9 | DEBUG:__main__:Saving cross-validation results to random_forest/cross_validation.json and random_forest/cross_validation.svg 10 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest_mean/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 61 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to mean_random_forest/model.pkl 6 | DEBUG:__main__:Saving feature importance to mean_random_forest/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:09, 9.32s/it] Molecule CV: 2it [00:18, 9.30s/it] Molecule CV: 3it [00:28, 9.35s/it] Molecule CV: 4it [00:37, 9.32s/it] Molecule CV: 5it [00:46, 9.30s/it] Molecule CV: 6it [00:55, 9.28s/it] Molecule CV: 7it [01:05, 
9.27s/it] Molecule CV: 8it [01:14, 9.31s/it] Molecule CV: 9it [01:23, 9.31s/it] Molecule CV: 10it [01:33, 9.30s/it] Molecule CV: 11it [01:42, 9.29s/it] Molecule CV: 12it [01:52, 9.50s/it] Molecule CV: 13it [02:01, 9.41s/it] Molecule CV: 14it [02:10, 9.38s/it] Molecule CV: 15it [02:20, 9.34s/it] Molecule CV: 16it [02:29, 9.31s/it] Molecule CV: 17it [02:38, 9.28s/it] Molecule CV: 18it [02:47, 9.28s/it] Molecule CV: 19it [02:57, 9.29s/it] Molecule CV: 20it [03:06, 9.28s/it] Molecule CV: 21it [03:15, 9.28s/it] Molecule CV: 22it [03:24, 9.28s/it] Molecule CV: 23it [03:34, 9.30s/it] Molecule CV: 24it [03:43, 9.29s/it] Molecule CV: 25it [03:52, 9.27s/it] Molecule CV: 26it [04:02, 9.27s/it] Molecule CV: 27it [04:11, 9.26s/it] Molecule CV: 28it [04:20, 9.28s/it] Molecule CV: 29it [04:29, 9.26s/it] Molecule CV: 30it [04:39, 9.26s/it] Molecule CV: 31it [04:48, 9.26s/it] Molecule CV: 32it [04:57, 9.28s/it] Molecule CV: 33it [05:06, 9.26s/it] Molecule CV: 34it [05:16, 9.25s/it] Molecule CV: 35it [05:25, 9.24s/it] Molecule CV: 36it [05:34, 9.23s/it] Molecule CV: 37it [05:43, 9.23s/it] Molecule CV: 38it [05:52, 9.23s/it] Molecule CV: 39it [06:02, 9.24s/it] Molecule CV: 40it [06:11, 9.23s/it] Molecule CV: 41it [06:20, 9.23s/it] Molecule CV: 42it [06:29, 9.21s/it] Molecule CV: 43it [06:39, 9.22s/it] Molecule CV: 44it [06:48, 9.24s/it] Molecule CV: 45it [06:58, 9.37s/it] Molecule CV: 46it [07:07, 9.49s/it] Molecule CV: 47it [07:17, 9.43s/it] Molecule CV: 48it [07:26, 9.36s/it] Molecule CV: 49it [07:35, 9.32s/it] Molecule CV: 50it [07:44, 9.29s/it] Molecule CV: 51it [07:53, 9.29s/it] Molecule CV: 52it [08:03, 9.27s/it] Molecule CV: 53it [08:12, 9.26s/it] Molecule CV: 54it [08:21, 9.27s/it] Molecule CV: 55it [08:30, 9.26s/it] Molecule CV: 56it [08:40, 9.24s/it] Molecule CV: 57it [08:49, 9.25s/it] Molecule CV: 58it [08:58, 9.24s/it] Molecule CV: 59it [09:07, 9.24s/it] Molecule CV: 60it [09:17, 9.25s/it] Molecule CV: 61it [09:26, 9.25s/it] Molecule CV: 61it [09:26, 9.29s/it] 9 | 
DEBUG:__main__:Saving cross-validation results to mean_random_forest/cross_validation.json and mean_random_forest/cross_validation.svg 10 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting_mean/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 61 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to mean_gradient_boosting/model.pkl 6 | DEBUG:__main__:Saving feature importance to mean_gradient_boosting/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:20, 20.99s/it] Molecule CV: 2it [00:42, 21.03s/it] Molecule CV: 3it [01:03, 21.01s/it] Molecule CV: 4it [01:24, 21.06s/it] Molecule CV: 5it [01:45, 21.05s/it] Molecule CV: 6it [02:06, 21.06s/it] Molecule CV: 7it [02:27, 21.05s/it] Molecule CV: 8it [02:48, 21.04s/it] Molecule CV: 9it [03:09, 21.04s/it] Molecule CV: 10it [03:30, 21.02s/it] Molecule CV: 11it [03:51, 21.05s/it] Molecule CV: 12it [04:12, 21.06s/it] Molecule CV: 13it [04:34, 21.29s/it] Molecule CV: 14it [04:55, 21.16s/it] Molecule CV: 15it [05:16, 21.08s/it] Molecule CV: 16it [05:36, 20.99s/it] Molecule CV: 17it [05:57, 20.92s/it] Molecule CV: 18it [06:18, 20.94s/it] Molecule CV: 19it [06:40, 21.14s/it] Molecule CV: 20it [07:01, 21.06s/it] Molecule CV: 21it [07:22, 21.08s/it] Molecule CV: 22it [07:43, 21.08s/it] Molecule CV: 23it [08:04, 21.11s/it] Molecule CV: 24it [08:25, 21.10s/it] Molecule CV: 25it [08:46, 21.13s/it] Molecule CV: 26it [09:07, 21.11s/it] Molecule CV: 27it [09:29, 21.17s/it] Molecule CV: 28it [09:50, 21.16s/it] Molecule CV: 29it [10:11, 21.12s/it] Molecule CV: 30it [10:32, 21.13s/it] Molecule CV: 31it [10:53, 21.20s/it] Molecule CV: 32it [11:14, 21.16s/it] Molecule CV: 33it [11:35, 21.12s/it] Molecule CV: 34it 
#!/ap/rhel7/bin/python3.6

####################################################################
# Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ###
# Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com #
####################################################################

#Import Python modules
import logging
import os
import subprocess

###Initiate logger###
logger = logging.getLogger(__name__)

def run_job(command):
    """Run a command through the shell and pipe its output to the log.

    Parameters
    ----------
    command : list
        Command and its arguments. The list is joined with spaces and run
        with shell=True because several arguments rely on shell quoting.
    """
    #Run provided command, joining list with space. Pipe stdout and stderr to log file
    process = subprocess.run(' '.join(command), stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, shell=True, text=True)

    #Iterate over combined stdout/stderr, skipping blank lines and ExitStatus noise
    for line in process.stdout.split('\n'):
        if line and "ExitStatus" not in line:
            #Write to log file for debugging
            logger.debug(line)

def main(ligname, args, desmond_host, SCHRODINGER, master_dir):
    """Launch a Desmond MD run for one ligand unless its outputs already exist.

    Parameters
    ----------
    ligname : str
        Ligand (repetition) name, e.g. "lig1_repetition1".
    args : argparse.Namespace
        Parsed arguments; only args.oplsdir is read here.
    desmond_host : str
        Host entry passed to multisim's -HOST flag.
    SCHRODINGER : str
        Schrodinger installation root.
    master_dir : str
        Workflow root containing the desmond_md output tree.

    Returns
    -------
    tuple
        (trajectory file name, trajectory directory name, base ligand name).
    """
    #Generate trajectory file name
    outcms = "%s-out.cms" % ligname

    #Generate trajectory directory name
    outtrj = "%s_trj" % ligname

    #Get base ligand name (strip the "_repetition<N>" suffix, if any)
    lig_basename = ligname.split("_repetition")[0]

    #Directory holding this repetition's outputs (hoisted; used for both checks)
    outdir = os.path.join(master_dir, "desmond_md", lig_basename, ligname)

    #Launch Desmond only when the trajectory file or directory is missing
    if not os.path.isfile(os.path.join(outdir, outcms)) or not os.path.isdir(os.path.join(outdir, outtrj)):
        #Prepare Schrodinger's multisim command ($SCHRODINGER/utilities/multisim)
        run_cmd = os.path.join(SCHRODINGER, "utilities", "multisim")

        #Prepare Desmond MD command
        command = [run_cmd, '-JOBNAME', ligname, '-HOST', desmond_host, '-maxjob', '1', '-cpu', '1', '-m', '%s_md.msj'%ligname, '-c', '%s_md.cfg'%ligname, '-description', '"Molecular Dynamics"', '%s_md.cms'%ligname, '-mode', 'umbrella', '-set', '"stage[1].set_family.md.jlaunch_opt=[\"-gpu\"]"', '-o', outcms, '-OPLSDIR', args.oplsdir, '-lic', 'DESMOND_GPGPU:16', '-ATTACHED', '-WAIT']

        #Capture current step
        logger.info("Running Desmond: %s" % ' '.join(command))

        #Run Desmond MD
        run_job(command)

    #Trajectory file(s) exist
    else:
        #Capture current step
        logger.info("Desmond trajectory found: %s, %s" % (outcms, outtrj))

    #Return trajectory file and directory names and ligand name
    return outcms, outtrj, lig_basename

if __name__ == '__main__':
    main(ligname, args, desmond_host, SCHRODINGER, master_dir)
#!/ap/rhel7/bin/python3.6

####################################################################
# Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ###
# Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com #
####################################################################

#Import Python modules
import logging
import os
import subprocess

###Initiate logger###
logger = logging.getLogger(__name__)

def run_job(command):
    """Run a command through the shell and pipe its output to the log.

    Parameters
    ----------
    command : list
        Command and its arguments; joined with spaces and executed with
        shell=True because several arguments rely on shell quoting.
    """
    #Run provided command, joining list with space. Pipe stdout and stderr to log file
    process = subprocess.run(' '.join(command), stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, shell=True, text=True)

    #Iterate over combined stdout/stderr, skipping blank lines and ExitStatus noise
    for line in process.stdout.split('\n'):
        if line and "ExitStatus" not in line:
            #Write to log file for debugging
            logger.debug(line)

def main(ligname, pvcomplex, args, bmin_host, SCHRODINGER, master_dir, template_dir):
    """Minimize a ligand/protein complex with bmin unless the result exists.

    Parameters
    ----------
    ligname : str
        Ligand name used to derive all job file names.
    pvcomplex : str
        Pose-viewer complex filename (kept for interface compatibility;
        not read in this step).
    args : argparse.Namespace
        Parsed arguments; only args.oplsdir is read here.
    bmin_host : str
        Host entry passed to bmin's -HOST flag.
    SCHRODINGER : str
        Schrodinger installation root.
    master_dir : str
        Workflow root containing the desmond_md output tree.
    template_dir : str
        Directory containing bmin_template.com.

    Returns
    -------
    str
        Minimized complex filename ("<ligname>_out_complex_min.mae").
    """
    #Generate output minimized complex filename
    bmincomplex = "%s_out_complex_min.mae" % ligname

    #Only run the minimization when the output does not already exist
    if not os.path.isfile(os.path.join(master_dir, "desmond_md", ligname, "md_setup", bmincomplex)):
        #Read in minimization template
        with open(os.path.join(template_dir, "bmin_template.com"), "r") as template:
            lines = template.readlines()

        #Write a ligand-specific job file, substituting the IN_NAME/OUT_NAME placeholders
        with open("%s_min.com" % ligname, "w") as ligoutput:
            for line in lines:
                ligoutput.write(line.replace("IN_NAME", "%s_out_complex.mae"%ligname).replace("OUT_NAME", "%s_out_complex_min.mae"%ligname))

        #Prepare Schrodinger's bmin command ($SCHRODINGER/bmin)
        run_cmd = os.path.join(SCHRODINGER, "bmin")

        #Prepare minimization command
        command = [run_cmd, "%s_min"%ligname, "-OPLSDIR", args.oplsdir, "-HOST", bmin_host, "-WAIT"]

        #Capture current step
        logger.info("Running minimization: %s" % ' '.join(command))

        #Run minimization
        run_job(command)

    #Minimized complex exists
    else:
        #Capture current step
        logger.info("Minimized complex found: %s" % bmincomplex)

    #Return minimized complex filename
    return bmincomplex

if __name__ == '__main__':
    main(ligname, pvcomplex, args, bmin_host, SCHRODINGER, master_dir, template_dir)
# -*- coding: utf-8 -*-
"""
Plotting Methods
----------------
Generate basic plots for MDML analysis.

@author: Benjamin Shields
@email: benjamin.shields@bms.com
"""

############################################################################## Imports

import logging
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

############################################################################## Logger

logger = logging.getLogger(__name__)

############################################################################## Functions

def parity_plot(pred: list, obs: list, title: str = 'Fit', export_path: str = None,
                xlabel: str = 'Predicted', ylabel: str = 'Observed',
                color: str = 'black', cod: str = 'R^2') -> "tuple[float, float]":
    """
    Plot predicted versus observed and return the RMSE and coefficient of
    determination.

    Parameters
    ----------
    pred : list
        Predicted values.
    obs : list
        Observed values.
    title : str, optional
        Plot title. The default is 'Fit'.
    export_path : str, optional
        Path to export SVG. Images are exported as export_path.svg. The default
        is None (the figure is shown instead of saved).
    xlabel : str, optional
        Label for x-axis. The default is 'Predicted'.
    ylabel : str, optional
        Label for y-axis. The default is 'Observed'.
    color : str, optional
        Color of plot points and error bars. The default is 'black'.
    cod : str, optional
        Nomenclature used for the computed coefficient of determination. If the
        results are from cross-validation use 'Q^2'. The default is 'R^2'.

    Returns
    -------
    rmse : float
        Root mean squared error for predicted values.
    r2 : float
        Coefficient of determination for predicted values.
    """

    # Silence matplotlib's own debug chatter
    plt.set_loglevel("info")

    # Compute RMSE and R^2 values
    pred = np.array(pred)
    obs = np.array(obs)
    rmse = np.sqrt(np.mean((pred - obs) ** 2))
    r2 = r2_score(obs, pred)

    # Get upper and lower bounds of plot (5% padding around the data range)
    upper = max([max(pred), max(obs)])
    lower = min([min(pred), min(obs)])
    pad = (upper - lower) * 0.05

    # Plot scatter with a y = x parity line behind the points
    plt.figure(figsize=(6,6))
    plt.scatter(pred, obs, color=color, alpha=0.4)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title('{0}($RMSE={1}$, ${2}={3}$)'.format(
        title,
        round(float(rmse),2),
        cod,
        round(r2,2))
        )
    plt.plot([lower,upper], [lower,upper], 'k-', alpha=0.75, zorder=0)
    plt.xlim(lower - pad, upper + pad)
    plt.ylim(lower - pad, upper + pad)

    # Save and/or show
    if export_path is not None:
        plt.savefig(export_path + '.svg', format='svg', dpi=1200, bbox_inches='tight')
        plt.close()
    else:
        plt.show()

    return rmse, r2
#!/ap/rhel7/bin/python3.6

####################################################################
# Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ###
# Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com #
####################################################################

#Import Python modules
import logging
import sys
import os
import json

###Initiate logger###
logger = logging.getLogger(__name__)

def read_json(MDFit_path):
    """Read institution-specific parameters from parameters.json.

    Parameters
    ----------
    MDFit_path : str
        Directory containing parameters.json.

    Returns
    -------
    dict
        Parsed parameters dictionary.
    """
    #Open institution-based parameters json file for reading
    with open(os.path.join(MDFit_path, "parameters.json"), "r") as injson:
        #Parse parameters into a dictionary
        parameters = json.load(injson)

    #Capture current step
    logger.info("json file found. Institution parameters read in.")

    #Write parameters for debugging
    logger.debug("Parameters: %s" % parameters)

    #Return parameters dictionary
    return parameters

def write_json(MDFit_path):
    """Write a template parameters json for the user to edit and rename.

    Parameters
    ----------
    MDFit_path : str
        Directory in which parameters_TEMPLATE.json is created.
    """
    #Template data to write to json file
    dictionary = {
        "hostnames": {
            "FFBUILDER":"localhost",
            "BMIN":"localhost",
            "MULTISIM":"localhost",
            "DESMOND":"localhost-gpu",
            "ANALYSIS":"localhost"
        },
        "parameters": {
            "MAXLIGS":100,
            "FFPROC":32
        }
    }

    #Convert dictionary to json object
    json_object = json.dumps(dictionary, indent=4)

    #Open template json file for writing
    with open(os.path.join(MDFit_path, "parameters_TEMPLATE.json"), "w") as outfile:
        #Write lines to json file
        outfile.write(json_object)

    #Capture current step
    logger.critical("Edit the %s file and rename to %s" % (os.path.join(MDFit_path, "parameters_TEMPLATE.json"), os.path.join(MDFit_path, "parameters.json")))

def main(MDFit_path):
    """Load parameters.json, or emit/point to a template and exit if missing.

    Parameters
    ----------
    MDFit_path : str
        Directory expected to contain parameters.json.

    Returns
    -------
    dict
        Parsed parameters dictionary. Calls sys.exit() when no parameters
        file is present.
    """
    #Check if json file exists
    if os.path.isfile(os.path.join(MDFit_path, "parameters.json")):
        #If it does, read in parameters
        inst_params = read_json(MDFit_path)

    #json file does not exist
    else:
        #Check if template exists
        if os.path.isfile(os.path.join(MDFit_path, "parameters_TEMPLATE.json")):
            #If it does, print path to template for user
            logger.critical("Parameter template file found: %s" % (os.path.join(MDFit_path, "parameters_TEMPLATE.json")))

            #Print path needed for user
            logger.critical("Rename to: %s" % (os.path.join(MDFit_path, "parameters.json")))

        #Template file does not exist
        else:
            #Write out template for user to edit
            write_json(MDFit_path)

        #Print to screen
        print("Parameters not found. Information captured in MDFit.log")

        #Exit
        sys.exit()

    #Return parameters
    return inst_params

if __name__ == '__main__':
    main(MDFit_path)
39 | ) 40 | data.add_argument( 41 | '-model', 42 | type=str, 43 | default=None, 44 | help="Directory containing trained ML model", 45 | required=True 46 | ) 47 | data.add_argument( 48 | '-id_col', 49 | type=str, 50 | default=None, 51 | help="Header of compound ID column.", 52 | required=True 53 | ) 54 | data.add_argument( 55 | '-drop_col', 56 | nargs='+', 57 | type=str, 58 | default=[], 59 | help="Columns that should be removed from the input feature CSV." 60 | ) 61 | data.add_argument( 62 | '-group', 63 | type=str, 64 | default=None, 65 | help="""Grouping method. The options include 'mean', 'min', and 'max'. 66 | Data will be grouped by compound ID (id_col).""" 67 | ) 68 | 69 | return parser 70 | 71 | ############################################################################## Main 72 | 73 | def main(args): 74 | # Load and preprocess data 75 | logger.debug(f'Loading data: {args.input}') 76 | data = cli.load_data( 77 | args.input, args.id_col, drop=args.drop_col, aggregation=args.group 78 | ) 79 | logger.debug(f'Dataset contains {len(data)} entries') 80 | 81 | # Load modeling workflow 82 | logger.debug(f'Loading model: {args.model}') 83 | workflow = model.load_workflow(os.path.join(args.model, 'model.pkl')) 84 | 85 | # Make predictions 86 | logger.debug(f'Making predictions: {workflow.target}') 87 | data[f'Predicted {workflow.target}'] = workflow.predict(data) 88 | 89 | # Save results 90 | logger.debug(f'Saving results to {args.output}') 91 | data.to_csv(args.output) 92 | 93 | 94 | if __name__ == "__main__": 95 | parser = get_parser() 96 | args = parser.parse_args() 97 | if args.debug: 98 | logging.basicConfig(level=logging.DEBUG) 99 | main(args) 100 | 101 | 102 | -------------------------------------------------------------------------------- /templates/desmond_md_job_template.msj: -------------------------------------------------------------------------------- 1 | # Desmond standard NPT relaxation protocol 2 | # All times are in the unit of ps. 
3 | # Energy is in the unit of kcal/mol. 4 | task { 5 | task = "desmond:auto" 6 | set_family = { 7 | desmond = { 8 | checkpt.write_last_step = no 9 | } 10 | } 11 | } 12 | 13 | simulate { 14 | title = "Brownian Dynamics NVT, T = 10 K, small timesteps, and restraints on solute heavy atoms, 100ps" 15 | annealing = off 16 | time = 100 17 | timestep = [0.001 0.001 0.003 ] 18 | temperature = 10.0 19 | ensemble = { 20 | class = "NVT" 21 | method = "Brownie" 22 | brownie = { 23 | delta_max = 0.1 24 | } 25 | } 26 | restrain = { 27 | atom = "solute_heavy_atom" 28 | force_constant = 50.0 29 | } 30 | } 31 | 32 | simulate { 33 | effect_if = [["==" "-gpu" "@*.*.jlaunch_opt[-1]"] 'ensemble.method = Langevin'] 34 | title = "NVT, T = 10 K, small timesteps, and restraints on solute heavy atoms, 12ps" 35 | annealing = off 36 | time = 12 37 | timestep = [0.001 0.001 0.003] 38 | temperature = 10.0 39 | restrain = { atom = solute_heavy_atom force_constant = 50.0 } 40 | ensemble = { 41 | class = NVT 42 | method = Berendsen 43 | thermostat.tau = 0.1 44 | } 45 | 46 | randomize_velocity.interval = 1.0 47 | eneseq.interval = 0.3 48 | trajectory.center = [] 49 | } 50 | 51 | simulate { 52 | title = "NPT, T = 10 K, and restraints on solute heavy atoms, 12ps" 53 | effect_if = [["==" "-gpu" "@*.*.jlaunch_opt[-1]"] 'ensemble.method = Langevin'] 54 | annealing = off 55 | time = 12 56 | temperature = 10.0 57 | restrain = retain 58 | ensemble = { 59 | class = NPT 60 | method = Berendsen 61 | thermostat.tau = 0.1 62 | barostat .tau = 50.0 63 | } 64 | 65 | randomize_velocity.interval = 1.0 66 | eneseq.interval = 0.3 67 | trajectory.center = [] 68 | } 69 | 70 | solvate_pocket { 71 | should_skip = true 72 | ligand_file = ? 
73 | } 74 | 75 | simulate { 76 | title = "NPT and restraints on solute heavy atoms, 12ps" 77 | effect_if = [["@*.*.annealing"] 'annealing = off temperature = "@*.*.temperature[0][0]"' 78 | ["==" "-gpu" "@*.*.jlaunch_opt[-1]"] 'ensemble.method = Langevin'] 79 | time = 12 80 | restrain = retain 81 | ensemble = { 82 | class = NPT 83 | method = Berendsen 84 | thermostat.tau = 0.1 85 | barostat .tau = 50.0 86 | } 87 | 88 | randomize_velocity.interval = 1.0 89 | eneseq.interval = 0.3 90 | trajectory.center = [] 91 | } 92 | 93 | simulate { 94 | title = "NPT and no restraints, 24ps" 95 | effect_if = [["@*.*.annealing"] 'annealing = off temperature = "@*.*.temperature[0][0]"' 96 | ["==" "-gpu" "@*.*.jlaunch_opt[-1]"] 'ensemble.method = Langevin'] 97 | time = 24 98 | ensemble = { 99 | class = NPT 100 | method = Berendsen 101 | thermostat.tau = 0.1 102 | barostat .tau = 2.0 103 | } 104 | 105 | eneseq.interval = 0.3 106 | trajectory.center = solute 107 | } 108 | 109 | simulate { 110 | cfg_file = "CONFIG_NAME" 111 | jobname = "$MASTERJOBNAME" 112 | dir = "." 113 | compress = "" 114 | } 115 | 116 | pl_analysis {} 117 | -------------------------------------------------------------------------------- /mdml/mdml/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Base Classes 4 | ------------ 5 | Helpful base classes for development. 
class Savable:
    """
    The Savable class implements general methods for saving and loading class
    objects. The __save__ method will save the state dictionary (__dict__) or
    any key/value pair as a pickled dictionary. The __load__ method will load
    and automatically name an entire pickled dictionary or any individual key
    value pair. In addition, __load__ will load non-dictionary pickled objects
    into a specified attribute.
    """

    def __save__(self, path: str, attrname: "str | list | None" = None) -> None:
        """
        Save the state dictionary or an individual attribute as a key/value
        pair. If attrname is None then the entire state dictionary is saved.

        Parameters
        ----------
        path : str
            Path to save attribute. Data will be saved as a pickle file using
            dill.
        attrname : str, list, optional
            Name of attribute(s) to save. The default is None. If None, the
            entire state dictionary will be saved.

        Raises
        ------
        AssertionError
            If a requested attribute is not present on the instance.
        """

        if isinstance(attrname, str):
            # Single attribute: save as a one-entry dict keyed by its name
            assert hasattr(self, attrname)
            savedict = {attrname: getattr(self, attrname)}
        elif isinstance(attrname, list):
            # Multiple attributes: collect each requested name
            savedict = {}
            for name in attrname:
                assert hasattr(self, name)
                savedict[name] = getattr(self, name)
        else:
            # No selection: persist the full state dictionary
            savedict = self.__dict__
        with open(path, 'wb') as file:
            dill.dump(savedict, file)

    def __load__(self, path: str, attrname: "str | list | None" = None) -> None:
        """
        Load saved objects to attributes based on dictionary key/value pairs,
        load an individual element from a pickled dictionary, or load objects
        to a specified attribute.

        Parameters
        ----------
        path : str
            Path to pickled object file. For example a scikit-learn model.
        attrname : str, list, optional
            Name of attribute when loaded. For example 'model'. If attrname is
            None or not present in the loaded dict, all key/value pairs will be
            loaded.
        """

        # Load pickle file
        with open(path, 'rb') as file:
            loaded = dill.load(file)

        # Wrap non-dict payloads so they can be assigned to attrname below.
        # NOTE(review): if attrname is None AND the payload is not a dict,
        # this puts a None key into __dict__ — presumably callers always pass
        # attrname for non-dict files; confirm before tightening.
        if not isinstance(loaded, dict):
            loaded = {attrname: loaded}

        # Load specific attributes (missing keys are silently skipped)
        if isinstance(attrname, str):
            attrname = [attrname]
        if isinstance(attrname, list):
            for name in attrname:
                attr = loaded.get(name)
                if attr is not None:
                    setattr(self, name, attr)

        # Load full dict
        else:
            self.__dict__.update(loaded)
#################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | import subprocess 13 | 14 | ###Initiate logger### 15 | logger = logging.getLogger(__name__) 16 | 17 | def run_job(command): 18 | #Run provided command, joining list with space. Pipe stdout and sdterror to log file 19 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 20 | stderr=subprocess.STDOUT, shell=True, text=True) 21 | 22 | #Iterate over sdtout and sdterror 23 | for line in process.stdout.split('\n'): 24 | #Ignore blank lines 25 | if line != "": 26 | #Ignore ExitStatus 27 | if "ExitStatus" not in line: 28 | #Write to log file for debugging 29 | logger.debug(line) 30 | 31 | def main(master_dir, SCHRODINGER, args, charge, ligname, multisim_host, bmincomplex, template_dir): 32 | #Prepare Schrodinger multisim command ($SCHRODINGER/utilities/multisim) 33 | run_cmd = os.path.join(SCHRODINGER, "utilities", "multisim") 34 | 35 | #Generate output filename 36 | simbox="%s_md_setup_out.cms"%ligname 37 | 38 | #Generate setup filename 39 | inputfile="%s_md_setup.msj"%ligname 40 | 41 | #Generate ligand-specific job name 42 | jobname="%s_setup"%ligname 43 | 44 | #Check that simulation box does not exist 45 | if os.path.isfile(os.path.join(master_dir, "desmond_md", ligname, "md_setup", simbox)) == False: 46 | #If not, check system charge 47 | if charge > 0: 48 | #If positive, set template filename to positive 49 | filename = os.path.join(template_dir, "positive_template.msj") 50 | 51 | elif charge == 0: 52 | #If neutral, set template filename to netural 53 | filename = os.path.join(template_dir, "neutral_template.msj") 54 | 55 | #Must be negative 56 | else: 57 | #If negative, set template filename to negative 58 | filename = os.path.join(template_dir, "negative_template.msj") 59 | 60 | #Read in template file 61 | with open(filename, "r") as template: 62 | #Put lines in a variable 63 | lines = 
template.readlines() 64 | 65 | #Open setup filename for writing 66 | with open(inputfile, "w") as ligoutput: 67 | #Iterate through all the template lines 68 | 69 | for line in lines: 70 | #Write line to out file, replacing solvent keyword with desired solvent. NOTE(review): line.replace("", args.solvent) inserts args.solvent between every character of the line rather than substituting a keyword — the placeholder token this comment refers to (e.g. an angle-bracketed SOLVENT tag) appears to have been lost from this copy; confirm against the .msj templates in the templates/ directory 71 | ligoutput.write(line.replace("",args.solvent)) 72 | 73 | #Generate command for building the box 74 | command = [run_cmd, "-maxjob", "1", "-JOBNAME", jobname, "-m", inputfile, bmincomplex, "-o", simbox, "-OPLSDIR", args.oplsdir, "-HOST", multisim_host, "-WAIT"] 75 | 76 | #Document current step 77 | logger.info("Building simulation box: %s"%simbox) 78 | 79 | #Run command 80 | run_job(command) 81 | 82 | #Simulation box exists 83 | else: 84 | #Document current step 85 | logger.info("Simulation box found: %s"%simbox) 86 | 87 | #Return output filename 88 | return simbox 89 | 90 | if __name__ == '__main__': 91 | main(master_dir, SCHRODINGER, args, charge, ligname, multisim_host, bmincomplex, template_dir) -------------------------------------------------------------------------------- /bin/mdfit_get_charge.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import os 11 | import shutil 12 | import subprocess 13 | import re 14 | 15 | ###Initiate logger### 16 | logger = logging.getLogger(__name__) 17 | 18 | def run_job(command): 19 | #Run provided command, joining list with space.
Pipe stdout and sdterror to log file 20 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 21 | stderr=subprocess.STDOUT, shell=True, text=True) 22 | 23 | #Iterate over sdtout and sdterror 24 | for line in process.stdout.split('\n'): 25 | #Ignore blank lines 26 | if line != "": 27 | #Ignore ExitStatus 28 | if "ExitStatus" not in line: 29 | #Write to log file for debugging 30 | logger.debug(line) 31 | 32 | def main(SCHRODINGER, ligname, master_dir, args): 33 | #Prepare Schrodinger proplister command ($SCHRODINGER/utilities/proplister) 34 | run_cmd = os.path.join(SCHRODINGER, "utilities", "proplister") 35 | 36 | #Check if proplister has been run before 37 | if os.path.isfile(os.path.join(master_dir, "desmond_md", ligname, "md_setup", "%s_atoms.csv"%ligname)) == False: 38 | #If not, prepare proplister command 39 | command = [run_cmd, "-atom_bond_props", "%s_out_complex_min.mae"%ligname, "-c", "-o", "%s.csv"%ligname] 40 | 41 | #Run proplister 42 | run_job(command) 43 | 44 | #Proplister has been run before 45 | else: 46 | #Copy proplister output from md_setup directory to scratch space 47 | shutil.copy(os.path.join(master_dir, "desmond_md", ligname, "md_setup", "%s_atoms.csv"%ligname), "%s_atoms.csv"%ligname) 48 | 49 | #Define pattern for matching 50 | fcFinder = re.compile(r'i_m_formal_charge') 51 | 52 | #Initiate variable for counting lines 53 | lineCount = 0 54 | 55 | #Initiate total charge variable 56 | totQ = 0 57 | 58 | #Read in proplister output 59 | with open("%s_atoms.csv"%ligname, "r") as inFile: 60 | #Initiate infinite loop to iterate through file 61 | while 1: 62 | #Read in next line of file 63 | dataLine = inFile.readline() 64 | 65 | #Break loop if end of file 66 | if not dataLine: break 67 | 68 | #Split line into list by comma (csv) 69 | tmpList = dataLine[:-1].split(",") 70 | 71 | #Check if first line 72 | if lineCount == 0: 73 | #Initiate variable for number of columns 74 | columnCount = 0 75 | 76 | #Iterate over all columns in first 
line of file 77 | for column in tmpList: 78 | #Locate the formal charge column 79 | theMatch = fcFinder.match(column) 80 | 81 | #If formal charge column is located 82 | if theMatch: 83 | #Get column number 84 | chargeColumn = columnCount 85 | 86 | #Break loop 87 | break 88 | 89 | #Increment column variable 90 | columnCount += 1 91 | 92 | #Not first line 93 | else: 94 | #Increment by value in formal charge column 95 | totQ += int(tmpList[chargeColumn]) 96 | 97 | #Increment line count variable 98 | lineCount += 1 99 | 100 | #Document current step 101 | logger.info("Total system charge for %s: %s"%(ligname, totQ)) 102 | 103 | #Return total system charge 104 | return totQ 105 | 106 | if __name__ == '__main__': 107 | main(SCHRODINGER, ligname. master_dir, args) -------------------------------------------------------------------------------- /bin/mdfit_slicetrj.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | import subprocess 13 | 14 | #Import Schrodinger modules 15 | from schrodinger.application.desmond.packages import traj 16 | 17 | ###Initiate logger### 18 | logger = logging.getLogger(__name__) 19 | 20 | def run_job(command): 21 | #Run provided command, joining list with space. 
Pipe stdout and sdterror to log file 22 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 23 | stderr=subprocess.STDOUT, shell=True, text=True) 24 | 25 | #Iterate over sdtout and sdterror 26 | for line in process.stdout.split('\n'): 27 | #Ignore blank lines 28 | if line != "": 29 | #Ignore ExitStatus 30 | if "ExitStatus" not in line: 31 | #Write to log file for debugging 32 | logger.debug(line) 33 | 34 | def count_frames(trj_path): 35 | #Read in trajectory with Schrodinger's read_traj utilty 36 | tr = traj.read_traj(trj_path) 37 | 38 | #Return length of trajectory (number of frames) 39 | return len(tr) 40 | 41 | def main(SCHRODINGER, rep, master_dir, args): 42 | #Prepare Schrodinger's run command ($SCHRODINGER/run) 43 | run_cmd = os.path.join(SCHRODINGER, 'run') 44 | 45 | #Generate repetition name _repetition<#> 46 | basename = os.path.basename(rep) 47 | 48 | #Generate ligand name 49 | ligbase = basename.split("_repetition")[0] 50 | 51 | #Generate path to trajectory files in scratch space 52 | md_path = os.path.join(master_dir, "desmond_md", "scratch") 53 | 54 | #Check if trajectory files are in scratch 55 | if os.path.isfile(os.path.join(md_path, "%s-out.cms"%basename)) == True: 56 | #If they are, generate path to trajectory file in scratch space 57 | cms_path = os.path.join(md_path, "%s-out.cms"%basename) 58 | 59 | #If they are, generate path to trajectory directory in scratch space 60 | trj_path = os.path.join(md_path, "%s_trj"%basename) 61 | 62 | #Trajectory files are in permanent directories; allows slice to be done separate from Desmond MD 63 | else: 64 | #Generate path to trajectory file in permanent directory 65 | cms_path = os.path.join(master_dir, "desmond_md", ligbase, basename, "%s-out.cms"%basename) 66 | #Generate path to trajectory directory in permanent directory 67 | trj_path = os.path.join(master_dir, "desmond_md", ligbase, basename, "%s_trj"%basename) 68 | 69 | #Check if the user wants to remove frames 70 | if 
args.slice_start != 0 or args.slice_end != None: 71 | #Slice hasn't been done before 72 | if os.path.isfile(os.path.join(master_dir, "desmond_md", ligbase, basename, "%s_sliced-out.cms"%basename)) == False: 73 | #Get total number of frames in trajectory 74 | total_frames = count_frames(trj_path) 75 | 76 | #Check if user wants to remove frames from end of trajectory 77 | if args.slice_end == None: 78 | #If not, set variable to total number of frames 79 | args.slice_end = total_frames 80 | 81 | #Prepare slice command 82 | trj_slice = [run_cmd, "trj_merge.py", "-s", "%s:%s:1"%(args.slice_start, args.slice_end), "-o", "%s_sliced"%basename, cms_path, trj_path] 83 | 84 | #Capture current step 85 | logger.info("Removing frames from trajectory: %s"%' '.join(trj_slice)) 86 | 87 | #Run trajectory slicing 88 | run_job(trj_slice) 89 | 90 | #Slice has been done before 91 | else: 92 | #Capture current step 93 | logger.info("Sliced trajectory already found. Skipping slice.") 94 | 95 | #No slice desired 96 | else: 97 | #Capture current step 98 | logger.info("Not removing frames from trajectory") 99 | 100 | if __name__ == '__main__': 101 | main(SCHRODINGER, rep, master_dir, args) -------------------------------------------------------------------------------- /mdml/README.md: -------------------------------------------------------------------------------- 1 | # MDML 2 | 3 | Train machine learning models to predict potency from SimFPs and automatically identify 4 | important interactions via feature importance. 5 | 6 | ## Installation 7 | 8 | Build and activate a python 3 environment. 9 | 10 | ``` 11 | conda create -n mdml python=3.10 12 | conda activate mdml 13 | ``` 14 | 15 | Install `mdml`. 16 | 17 | ``` 18 | git clone git@github.com:brueckna2020/MDFit.git 19 | cd MDFit/mdml 20 | pip install . 21 | ``` 22 | 23 | ## Command Line Interface 24 | 25 | After installation the `mdml_train` and `mdml_predict` command line scripts will be executable 26 | in the `mdml` environment. 
27 | 28 | ### Train 29 | 30 | Use pre-computed SimFPs to train regression models, evaluate performance using nested leave-one- 31 | molecule-out cross-validation, and identify important interactions via feature importance. This 32 | script generates a directory with the following results files. 33 | - `model.pkl`: A model trained using all SimFP data. 34 | - `cross_validation.json`: A summary of cross-validation results including predictions, observations, and feature importances from each fold and LOMO-CV metrics (computed using all folds). 35 | - `cross_validation.svg`: A parity plot showing the LOMO-CV performance with metrics computed using the average predictions from SimFPs from different simulations for each molecule. 36 | - `importance.csv`: The feature importance computed using the specified model type (e.g., weights for linear and node impurity for tree-based regressions). 37 | 38 | ``` 39 | usage: mdml_train [-h] [-debug] [-nproc NPROC] -target_col TARGET_COL -id_col ID_COL 40 | [-drop_col DROP_COL [DROP_COL ...]] [-group GROUP] [-model_type MODEL_TYPE] 41 | [-cv CV] [-lofo] [-nested] 42 | input output 43 | 44 | Train regression models, evaluate performance using nested leave-one-molecule-out cross- 45 | validation, and identify important interactions via feature importance. 46 | 47 | HELP: 48 | -h Show this help message and exit. 49 | -debug Print debugging messages. (default: False) 50 | 51 | COMPUTATION: 52 | -nproc NPROC Number of processors to use for parallel computation. (default: 1) 53 | 54 | DATA: 55 | input Path to CSV containing SimFP features, IDs, and target (optional). 56 | output Directory path to save output files. 57 | -target_col TARGET_COL 58 | Header of target column in training data CSV. (default: None) 59 | -id_col ID_COL Header of compound ID column. (default: None) 60 | -drop_col DROP_COL [DROP_COL ...] 61 | Columns that should be removed from the training CSV. (default: []) 62 | -group GROUP Grouping method. 
The options include 'mean', 'min', and 'max'. Data will 63 | be grouped by compound ID (id_col). (default: None) 64 | 65 | MODEL: 66 | -model_type MODEL_TYPE 67 | Regression model type. The options are 'linear', 'ridge', 'lasso', 68 | 'random_forest', and 'gradient_boosting'. (default: linear) 69 | -cv CV Number of cross-validation folds to use in hyperparameter tuning. Specify 70 | -1 for leave-one-out. (default: -1) 71 | -lofo Run leave-one-feature-out cross-validation analysis. (default: False) 72 | -nested Toggle nested cross-validation. Hyperparamters will NOT be optimized in 73 | each fold. (default: True) 74 | ``` 75 | 76 | ### Predict 77 | 78 | ``` 79 | usage: mdml_predict [-h] [-debug] -model MODEL -id_col ID_COL [-drop_col DROP_COL [DROP_COL ...]] 80 | [-group GROUP] 81 | input output 82 | 83 | Make predictions using trained MD simulation fingerprint models. 84 | 85 | HELP: 86 | -h Show this help message and exit. 87 | -debug Print debugging messages. (default: False) 88 | 89 | DATA: 90 | input Path to CSV containing features and molecule IDs. 91 | output Path to save prediction CSV. 92 | -model MODEL Directory containing trained ML model (default: None) 93 | -id_col ID_COL Header of compound ID column. (default: None) 94 | -drop_col DROP_COL [DROP_COL ...] 95 | Columns that should be removed from the input feature CSV. (default: []) 96 | -group GROUP Grouping method. The options include 'mean', 'min', and 'max'. Data will 97 | be grouped by compound ID (id_col). 
(default: None) 98 | ``` 99 | 100 | -------------------------------------------------------------------------------- /bin/mdfit_combine_csvs.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import os 11 | import pandas as pd 12 | import glob 13 | 14 | ###Initiate logger### 15 | logger = logging.getLogger(__name__) 16 | 17 | def generate_master_simfp(master_dir, df_simfp): 18 | #Initiate empty list in case files do not exist 19 | simfp_scratch_files = [] 20 | simfp_files = [] 21 | 22 | #Get paths to individual SimFP files in scratch directory 23 | simfp_scratch_files = glob.glob(os.path.join(master_dir, "desmond_md_analysis", "scratch", "*SimFP.csv")) 24 | 25 | #Get paths to individual SimFP files in permanent directories 26 | simfp_files = glob.glob(os.path.join(master_dir, "desmond_md_analysis", "*", "*repetition*", "*SimFP.csv")) 27 | 28 | #Iterate over files in the scratch directory 29 | for file in simfp_scratch_files: 30 | #Add to list of files in permanent directories 31 | simfp_files.append(file) 32 | 33 | #Iterate over each file 34 | for file in simfp_files: 35 | #Read file into temporary dataframe 36 | df_temp = pd.read_csv(file) 37 | 38 | #Append ligand SimFP to master SimFP dataframe 39 | df_simfp = df_simfp.append(df_temp, ignore_index=True) 40 | 41 | #Sort master SimFP dataframe by molecule name and replace "NaN" values with zeros 42 | df_simfp = df_simfp.sort_values(by=['Molecule', 'Repetition']).fillna("0.0000") 43 | 44 | #Remove duplicate ligand entires (if user re-runs analysis) 45 | no_duplicates = df_simfp.drop_duplicates() 46 | 47 | 
#Return master SimFP dataframe 48 | return no_duplicates 49 | 50 | def generate_master_compat(master_dir, df_compat): 51 | #Initiate empty list in case files do not exist 52 | compat_scratch_files = [] 53 | compat_files = [] 54 | 55 | #Get paths to individual compatibility files in scratch directory 56 | compat_scratch_files = glob.glob(os.path.join(master_dir, "desmond_md_analysis", "scratch", "*compatibility.csv")) 57 | 58 | #Get paths to individual compatibility files in permanent directories 59 | compat_files = glob.glob(os.path.join(master_dir, "desmond_md_analysis", "*", "*repetition*", "*compatibility.csv")) 60 | 61 | #Iterate over files in the scratch directory 62 | for file in compat_scratch_files: 63 | #Add to list of files in permanent directories 64 | compat_files.append(file) 65 | 66 | #Iterate over each file 67 | for file in compat_files: 68 | #Read file into temporary dataframe 69 | df_temp = pd.read_csv(file) 70 | 71 | #Append ligand compatibility to master compatibility dataframe (pd.concat: DataFrame.append was removed in pandas 2.0) 72 | df_compat = pd.concat([df_compat, df_temp], ignore_index=True) 73 | 74 | #Sort master compatibility dataframe by molecule name and replace "NaN" values with zeros 75 | df_compat = df_compat.sort_values(by=['Molecule', 'Repetition']).fillna("0.0000") 76 | 77 | #Remove duplicate ligand entries (if user re-runs analysis) 78 | no_duplicates = df_compat.drop_duplicates() 79 | 80 | #Return master compatibility dataframe 81 | return no_duplicates 82 | 83 | def main(master_dir): 84 | #Initiate master SimFP dataframe 85 | df_simfp = pd.DataFrame() 86 | 87 | #Initiate master compatibility dataframe 88 | df_compat = pd.DataFrame() 89 | 90 | #Generate final SimFP dataframe 91 | df_simfp_final = generate_master_simfp(master_dir, df_simfp) 92 | 93 | #Generate final compatibility dataframe 94 | df_compat_final = generate_master_compat(master_dir, df_compat) 95 | 96 | #Write final SimFP dataframe to "MDFit_SimFPs.csv" file in desmond_md_analysis 97 |
df_simfp_final.to_csv(os.path.join(master_dir, "desmond_md_analysis", "MDFit_SimFPs.csv"), index=False) 98 | 99 | #Write final compatibility dataframe to "MDFit_Compatibility.csv" file in desmond_md_analysis 100 | df_compat_final.to_csv(os.path.join(master_dir, "desmond_md_analysis", "MDFit_Compatibility.csv"), index=False) 101 | 102 | #Document current step 103 | logger.info("SimFPs are captured in %s"%os.path.join(master_dir, "desmond_md_analysis", "MDFit_SimFPs.csv")) 104 | 105 | #Document current step 106 | logger.info("Compatibility metrics are captured in %s"%os.path.join(master_dir, "desmond_md_analysis", "MDFit_Compatibility.csv")) 107 | 108 | if __name__ == '__main__': 109 | main(master_dir) -------------------------------------------------------------------------------- /bin/mdfit_prep_complex.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import os 11 | import shutil 12 | import subprocess 13 | 14 | ###Initiate logger### 15 | logger = logging.getLogger(__name__) 16 | 17 | def run_job(command): 18 | #Run provided command, joining list with space. 
Pipe stdout and sdterror to log file 19 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 20 | stderr=subprocess.STDOUT, shell=True, text=True) 21 | 22 | #Iterate over sdtout and sdterror 23 | for line in process.stdout.split('\n'): 24 | #Ignore blank lines 25 | if line != "": 26 | #Ignore ExitStatus 27 | if "ExitStatus" not in line: 28 | #Write to log file for debugging 29 | logger.debug(line) 30 | 31 | def main(SCHRODINGER, ligpath, ligname, i, master_dir, args): 32 | #Prepare Schrodinger's structure subset command ($SCHRODINGER/utilities/structsubset) 33 | run_cmd = os.path.join(SCHRODINGER, 'utilities', 'structsubset') 34 | 35 | #Prepare Schrodinger's structure concatination command ($SCHRODINGER/utilities/structcat) 36 | structcat = os.path.join(SCHRODINGER, 'utilities', 'structcat') 37 | 38 | #Prepare Schrodinger's structure run command ($SCHRODINGER/run) 39 | schrun = os.path.join(SCHRODINGER, 'run') 40 | 41 | #Generate pose viewer filename 42 | pvcomplex = "%s_pv.mae"%ligname 43 | 44 | #Generate complex output filename 45 | outname = "%s_out_complex.mae"%ligname 46 | 47 | #Check if pose viewer file exists 48 | if os.path.isfile(os.path.join(master_dir, "desmond_md", ligname, "md_setup", pvcomplex)) == False: 49 | #If not, check if ligand sdf file exists and ligands are not precomplexed with protein 50 | if os.path.isfile("%s.sdf"%ligname) == False and not args.precomplex: 51 | #If not, prepare structure subset command (extract ligand from library) 52 | command = [run_cmd, '-n', str(i+1), ligpath, '%s.sdf'%ligname] 53 | 54 | #Document current step 55 | logger.info("Getting ligand: %s"%' '.join(command)) 56 | 57 | #Run structure subset (extract ligand from library) 58 | run_job(command) 59 | 60 | #Ligands are precomplexed with protein 61 | else: 62 | #If not, prepare structure subset command (extract ligand from library) 63 | command = [run_cmd, '-n', str(i+1), ligpath, '%s.mae'%ligname] 64 | 65 | #Document current step 66 | 
logger.info("Getting ligand: %s"%' '.join(command)) 67 | 68 | #Run structure subset (extract ligand from library) 69 | run_job(command) 70 | 71 | 72 | #Check if protein and ligand are pre-complexed 73 | if not args.precomplex: 74 | #If not, generate path to protein file 75 | protein_path = os.path.join(master_dir, args.prot) 76 | 77 | #Prepare structure concatination command (combine protein and ligand files) 78 | command1 = [structcat, "-i", protein_path, "%s.sdf"%ligname, "-o", pvcomplex] 79 | 80 | #Capture current step 81 | logger.info("Complexing protein and ligand: %s"%' '.join(command1)) 82 | 83 | #Prepare pose viewer command 84 | command2 = [schrun, "pv_convert.py", "-mode", "merge", pvcomplex] 85 | 86 | #Capture current step 87 | logger.info("Merging protein and ligand: %s"%' '.join(command2)) 88 | 89 | #Run concatination command 90 | run_job(command1) 91 | 92 | #Run pose viewer command 93 | run_job(command2) 94 | 95 | #Rename auto-generated output complex name to desired filename ("-out" > "_out") 96 | os.rename("%s-out_complex.mae"%ligname,outname) 97 | 98 | #Protein and ligand are pre-complexed 99 | else: 100 | #Prepare structure concatination command (translate sdf to mae) 101 | command1 = [structcat, "-i", "%s.mae"%ligname, "-o", pvcomplex] 102 | 103 | #Capture current step 104 | logger.info("Complexing protein and ligand: %s"%' '.join(command1)) 105 | 106 | #Prepare pose viewer command 107 | command2 = [schrun, "pv_convert.py", "-mode", "merge", pvcomplex] 108 | 109 | #Capture current step 110 | logger.info("Merging protein and ligand: %s"%' '.join(command2)) 111 | 112 | #Run concatination command 113 | run_job(command1) 114 | 115 | #Run pose viewer command 116 | run_job(command2) 117 | 118 | #Copy pose viewer complex to desired filename 119 | shutil.copy(pvcomplex,outname) 120 | 121 | #Pose viewer file exists 122 | else: 123 | #Capture current step 124 | logger.info("Complex file found: %s"%pvcomplex) 125 | 126 | #Return pose viewer filename 127 | 
return pvcomplex 128 | 129 | if __name__ == '__main__': 130 | main(SCHRODINGER, ligpath, ligname, i, master_dir, args) -------------------------------------------------------------------------------- /mdml/bin/mdml_train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Train MD Simulation Fingerprint Models 4 | -------------------------------------- 5 | Train simulation fingerprint models, evaluate performance via nested leave-one 6 | -molecule-out cross-validation, and identify important interactions via 7 | feature importance. 8 | 9 | @author: Benjamin Shields 10 | @email: benjamin.shields@bms.com 11 | """ 12 | 13 | ############################################################################## Imports 14 | 15 | import logging 16 | import os 17 | import pandas as pd 18 | 19 | from mdml import model, cli, plot 20 | 21 | ############################################################################## Setup 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | MODELS = { 26 | 'linear':model.LinearRegression, 27 | 'ridge':model.RidgeRegression, 28 | 'lasso':model.LassoRegression, 29 | 'random_forest':model.RandomForestRegression, 30 | 'gradient_boosting':model.GradientBoostingRegression 31 | } 32 | 33 | ############################################################################## Interface 34 | 35 | def get_parser(): 36 | description = """Train regression models, evaluate performance using 37 | nested leave-one-molecule-out cross-validation, and identify important 38 | interactions via feature importance.""" 39 | parser, groups = cli.parser(description, add_computation=True) 40 | 41 | # Input/Output Data 42 | data = parser.add_argument_group("DATA") 43 | data.add_argument( 44 | 'input', 45 | help="Path to CSV containing SimFP features, IDs, and target (optional)." 46 | ) 47 | data.add_argument( 48 | 'output', 49 | help="Directory path to save output files." 
50 | ) 51 | data.add_argument( 52 | '-target_col', 53 | type=str, 54 | default=None, 55 | help="Header of target column in training data CSV.", 56 | required=True 57 | ) 58 | data.add_argument( 59 | '-id_col', 60 | type=str, 61 | default=None, 62 | help="Header of compound ID column.", 63 | required=True 64 | ) 65 | data.add_argument( 66 | '-drop_col', 67 | nargs='+', 68 | type=str, 69 | default=[], 70 | help="Columns that should be removed from the training CSV." 71 | ) 72 | data.add_argument( 73 | '-group', 74 | type=str, 75 | default=None, 76 | help="""Grouping method. The options include 'mean', 'min', and 'max'. 77 | Data will be grouped by compound ID (id_col).""" 78 | ) 79 | 80 | # Model Details 81 | model = parser.add_argument_group("MODEL") 82 | model.add_argument( 83 | '-model_type', 84 | type=str, 85 | default='linear', 86 | help="""Regression model type. The options are 'linear', 'ridge', 'lasso', 87 | 'random_forest', and 'gradient_boosting'.""" 88 | ) 89 | model.add_argument( 90 | '-cv', 91 | type=int, 92 | default=-1, 93 | help="""Number of cross-validation folds to use in hyperparameter 94 | tuning. Specify -1 for leave-one-out.""" 95 | ) 96 | model.add_argument( 97 | '-lofo', 98 | action='store_true', 99 | help="""Run leave-one-feature-out cross-validation analysis.""" 100 | ) 101 | model.add_argument( 102 | '-nested', 103 | dest='nested', 104 | action='store_false', 105 | help="""Toggle nested cross-validation. 
Hyperparamters will NOT be 106 | optimized in each fold.""" 107 | ) 108 | 109 | return parser 110 | 111 | ############################################################################## Main 112 | 113 | def main(args): 114 | # Details 115 | logger.debug(f'Workflow set to nested={args.nested}') 116 | 117 | # Load and preprocess data 118 | logger.debug(f'Loading data: {args.input}') 119 | data = cli.load_data( 120 | args.input, args.id_col, drop=args.drop_col, aggregation=args.group 121 | ) 122 | logger.debug(f'Dataset contains {len(data)} entries') 123 | 124 | # Output directory 125 | if not os.path.isdir(args.output): 126 | os.mkdir(args.output) 127 | 128 | # Build and fit model 129 | logger.debug('Building initial model') 130 | workflow = MODELS[args.model_type](data, args.target_col) 131 | workflow.fit(n_jobs=args.nproc, cv=args.cv) 132 | imp = workflow.feature_importance() 133 | 134 | # Save model 135 | path = os.path.join(args.output, 'model.pkl') 136 | logger.debug(f'Saving model to {path}') 137 | workflow.__save__(path) 138 | path = os.path.join(args.output, 'importance.csv') 139 | logger.debug(f'Saving feature importance to {path}') 140 | imp.to_csv(path) 141 | 142 | # Run (possibly nested) cross-validation 143 | logger.debug('Running cross-validation') 144 | cv = workflow.cross_validate( 145 | nested=args.nested, cv=args.cv, n_jobs=args.nproc 146 | ) 147 | path = os.path.join(args.output, 'cross_validation') 148 | logger.debug( 149 | f'Saving cross-validation results to {path}.json and {path}.svg' 150 | ) 151 | cli.save_json(cv, f'{path}.json') 152 | cv_df = pd.DataFrame() 153 | cv_df['ID'] = cv['ids'] 154 | cv_df['pred'] = cv['pred'] 155 | cv_df['obs'] = cv['obs'] 156 | cv_df = cv_df.groupby('ID').mean() 157 | plot.parity_plot( 158 | cv_df['pred'], cv_df['obs'], title='Cross-Validation', cod='Q^2', 159 | export_path=path 160 | ) 161 | 162 | # Run leave-one-feature-out CV 163 | if args.lofo: 164 | logger.debug('Running leave-one-feature-out 
cross-validation') 165 | lofo = model.leave_one_feature_out_importance( 166 | workflow, nested=args.nested, cv=args.cv, n_jobs=args.nproc 167 | ) 168 | path = os.path.join(args.output, 'lofo_cross_validation.json') 169 | logger.debug( 170 | f'Saving lofo cross-validation results to {path}' 171 | ) 172 | cli.save_json(lofo, f'{path}.json') 173 | 174 | 175 | if __name__ == "__main__": 176 | parser = get_parser() 177 | args = parser.parse_args() 178 | if args.debug: 179 | logging.basicConfig(level=logging.DEBUG) 180 | main(args) 181 | 182 | -------------------------------------------------------------------------------- /bin/mdfit_initiate.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | import subprocess 13 | 14 | #Import Schrodinger modules 15 | from schrodinger import structure 16 | 17 | ###Initiate logger### 18 | logger = logging.getLogger(__name__) 19 | 20 | def run_job(command): 21 | #Run provided command, joining list with space. 
Pipe stdout and sdterror to log file 22 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 23 | stderr=subprocess.STDOUT, shell=True, text=True) 24 | 25 | #Iterate over sdtout and sdterror 26 | for line in process.stdout.split('\n'): 27 | #Ignore blank lines 28 | if line != "": 29 | #Ignore ExitStatus 30 | if "ExitStatus" not in line: 31 | #Write to log file for debugging 32 | logger.debug(line) 33 | 34 | def filecheck(args, master_dir): 35 | #Document current step 36 | logger.info('Checking working directory for provided files: %s'%master_dir) 37 | 38 | #Check file existance, if provided by user 39 | #Generate list with all possible files 40 | filenames=[args.prot, args.liglib, args.precomplex] 41 | 42 | #Iterate over list of files 43 | for file in filenames: 44 | #Check if filename was provided by user 45 | if file != None: 46 | #If filename provided, check it exists 47 | exists = os.path.exists(file) 48 | 49 | #If it does not exist 50 | if exists == False: 51 | #Log error 52 | logger.critical("%s is not accessible or does not exist; "\ 53 | "cannot proceed"%file) 54 | 55 | #Exit 56 | sys.exit() 57 | 58 | #Document current step 59 | logger.info("All provided filenames were successfully located") 60 | 61 | def set_vars(args, master_dir, SCHRODINGER): 62 | #Get file extension for ligand library (e.g., ".sdf") 63 | if args.liglib and not args.precomplex: 64 | ligfiletype = os.path.splitext(args.liglib)[-1].lower() 65 | elif args.precomplex: 66 | ligfiletype = os.path.splitext(args.precomplex)[-1].lower() 67 | 68 | #Get filename, extension removed (e.g., "ligand_library") 69 | if args.liglib: 70 | ligfileprefix = os.path.splitext(args.liglib)[0] 71 | elif args.precomplex: 72 | ligfileprefix = os.path.splitext(args.precomplex)[0] 73 | 74 | #Get file extension for protein (e.g., ".mae") 75 | if args.prot: 76 | protfiletype = os.path.splitext(args.prot)[-1].lower() 77 | else: 78 | protfiletype = ".mae" 79 | 80 | #Check if ligand library extension is 
mae or sdf 81 | if ligfiletype != ".sdf" and ligfiletype != ".mae": 82 | #If not, log error 83 | logger.critical("%s must be in mae or sdf format; cannot proceed"%args.liglib) 84 | 85 | #Exit 86 | sys.exit(1) 87 | 88 | #Check if protein extension is mae 89 | if protfiletype != ".mae" and not args.precomplex: 90 | #If not, log error 91 | logger.critical("%s must be in mae format; cannot proceed"%args.prot) 92 | 93 | #Exit 94 | sys.exit(1) 95 | 96 | #Check if ligand library extension is mae and convert to SDF for downstream compatibility and non-Schrodinger MD engines 97 | if ligfiletype == ".mae" and not args.precomplex: 98 | #Set up Schrodinger run command ($SCHRODINGER/utilities/structconvert) 99 | run_cmd = os.path.join(SCHRODINGER, 'utilities', 'structconvert') 100 | 101 | #Prepare full command to convert mae file to sdf 102 | command = [run_cmd, args.liglib, "%s.sdf"%ligfileprefix] 103 | 104 | #Run the command using subprocess 105 | run_job(command) 106 | 107 | #Change filetype to sdf 108 | ligfiletype = ".sdf" 109 | 110 | #Return ligand library extension, ligand library name, and protein extension 111 | return ligfiletype, ligfileprefix, protfiletype 112 | 113 | def count_ligs(args, ligfiletype, maxliglimit): 114 | #Initiate variable 115 | nlig = 0 116 | 117 | #Use StructureReader to iterate through ligands 118 | if args.liglib and not args.precomplex: 119 | for s in structure.StructureReader(args.liglib): 120 | #Increment nlig for each ligand 121 | nlig+=1 122 | elif args.precomplex: 123 | for s in structure.StructureReader(args.precomplex): 124 | #Increment nlig for each ligand 125 | nlig+=1 126 | 127 | #If ligands are not found 128 | if nlig == 0: 129 | #Log error 130 | logger.critical("No ligands captured. 
Please check input file.") 131 | 132 | #Exit 133 | sys.exit(1) 134 | 135 | #Ligands must have been found 136 | else: 137 | #Document current step 138 | logger.info("Number of ligands in library = %s"%nlig) 139 | 140 | #Check that the number of ligands is less than the max limit for MD 141 | if nlig > maxliglimit and args.skip_md == False: 142 | #If true, log error 143 | logger.critical("Number of ligands in library (%s) exceeds the "\ 144 | "allowed limit (%s); cannot proceed" % (nlig, maxliglimit)) 145 | 146 | #Exit 147 | sys.exit(1) 148 | 149 | #Return number of ligands 150 | return nlig 151 | 152 | def main(args, master_dir, maxliglimit, SCHRODINGER): 153 | #Check if files exist and dependencies are met 154 | filecheck(args, master_dir) 155 | 156 | #Check file extensions 157 | ligfiletype, ligfileprefix, protfiletype = set_vars(args, master_dir, SCHRODINGER) 158 | 159 | #Count number of ligands in ligand library 160 | nlig = count_ligs(args, ligfiletype, maxliglimit) 161 | 162 | #Return ligand library extension, ligand library name, protein extension, and number of ligands 163 | return ligfiletype, ligfileprefix, protfiletype, nlig 164 | 165 | if __name__ == '__main__': 166 | main(args, master_dir, maxliglimit, SCHRODINGER) 167 | -------------------------------------------------------------------------------- /MDFit.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | 13 | #Get MDFit installation path 14 | MDFit_path = os.path.dirname(__file__) 15 | 16 | #Add MDFit bin directory to path 17 | sys.path.insert(0, 
os.path.join(MDFit_path, 'bin')) 18 | 19 | #Import MDFit modules 20 | import mdfit_read_params 21 | import mdfit_parseargs 22 | import mdfit_initiate 23 | import mdfit_ffbuilder 24 | import mdfit_desmond_md 25 | import mdfit_desmond_analysis 26 | 27 | #Generate path to template directory 28 | template_dir = os.path.join(MDFit_path, 'templates') 29 | 30 | #Get path to user directory 31 | master_dir = os.getcwd() 32 | 33 | #Get Schrodinger environmental variable 34 | SCHRODINGER = os.getenv('SCHRODINGER') 35 | 36 | #Get Schrodinger release version. Assumes pathname has version 37 | #E.g., "/schrodinger/2023-2/" 38 | schrodinger_version = os.path.basename(SCHRODINGER) 39 | 40 | #Get home path environmental variable 41 | homepath = os.getenv('HOME') 42 | 43 | #Initiate logger 44 | logger = logging.getLogger() 45 | 46 | #Point logger to file 47 | fh = logging.FileHandler(os.path.join(master_dir, 'MDFit.log')) 48 | 49 | #Create writing format for logging 50 | fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) 51 | 52 | #Add to logger 53 | logger.addHandler(fh) 54 | 55 | #Set logger default to debug 56 | logger.setLevel(logging.DEBUG) 57 | 58 | def read_json(MDFit_path): 59 | #Read in instiutional parameters ("inst params") 60 | #Hostnames, maximum number of ligands, number of processors for FFBuilder 61 | #Calls mdfit_read_params.py 62 | inst_params = mdfit_read_params.main(MDFit_path) 63 | 64 | #Return parameters 65 | return inst_params 66 | 67 | def parseargs(master_dir, homepath): 68 | #Get user flags and options 69 | #Calls mdfit_parseargs.py 70 | args = mdfit_parseargs.main(master_dir, homepath) 71 | 72 | #Set logger level based on user input 73 | logger.setLevel(args.loglevel) 74 | 75 | #Print arguments for future reference 76 | logger.info('Parsed arguments: %s', args) 77 | 78 | #Return arguments 79 | return args 80 | 81 | def initiate_mdfit(SCHRODINGER, args, master_dir, maxliglimit): 82 | #Document current step 83 | 
logger.info("Initiating MDFit...") 84 | 85 | #Calls mdfit_initiate.py 86 | ligfiletype, ligfileprefix, protfiletype, nlig = mdfit_initiate.main(args, \ 87 | master_dir, maxliglimit, SCHRODINGER) 88 | 89 | #Document current step 90 | logger.info("Completed initiation of MDFit") 91 | 92 | #Return arguments 93 | return ligfiletype, ligfileprefix, protfiletype, nlig 94 | 95 | def run_ffbuilder(args, master_dir, SCHRODINGER, ligpath, ligfileprefix, schrodinger_version, inst_params, homepath): 96 | #Check if user wants FFBuilder 97 | if args.skip_ff == False: 98 | #If they do, document current step 99 | logger.info("Initiating FFBuilder...") 100 | 101 | #Calls mdfit_ffbuilder.py 102 | mdfit_ffbuilder.main(args, master_dir, SCHRODINGER, ligpath, \ 103 | ligfileprefix, schrodinger_version, inst_params, homepath) 104 | 105 | #Document current step 106 | logger.info("Completed FFBuilder") 107 | 108 | #User requests to skip FFBuilder 109 | else: 110 | #Document current step 111 | logger.info("Skipping FFBuilder") 112 | 113 | #Useful to check that ligand file is correctly assigned 114 | logger.debug("Current ligand file = %s"%ligpath) 115 | 116 | def run_md(args, master_dir, ligfileprefix, SCHRODINGER, ligpath, template_dir, inst_params): 117 | #Check if user wants Desmond MD 118 | if args.skip_md == False: 119 | #If they do, document current step 120 | logger.info("Initiating Desmond MD...") 121 | 122 | #Calls mdfit_desmond_md.py 123 | mdfit_desmond_md.main(args, master_dir, ligfileprefix, SCHRODINGER, ligpath, template_dir, inst_params) 124 | 125 | #Document current step 126 | logger.info("Completed Desmond MD") 127 | 128 | #User requests to skip Desmond MD 129 | else: 130 | #Document current step 131 | logger.info("Skipping Desmond MD") 132 | 133 | def run_analysis(args, master_dir, SCHRODINGER, inst_params): 134 | #Check if user wants Desmond MD analysis 135 | if args.skip_analysis == False: 136 | #If they do, document current step 137 | logger.info("Initiating MD 
analysis...") 138 | 139 | #Calls mdfit_desmond_analysis.py 140 | mdfit_desmond_analysis.main(args, master_dir, SCHRODINGER, inst_params) 141 | 142 | #Document current step 143 | logger.info("Completed MD analysis") 144 | 145 | #User requests to skip Desmond MD analysis 146 | else: 147 | #Document current step 148 | logger.info("Skipping MD analysis") 149 | 150 | def main(): 151 | #Get institution parameters from json file (hostnames, max number of ligs, etc.) 152 | inst_params = read_json(MDFit_path) 153 | 154 | #Get user flags and options 155 | args = parseargs(master_dir, homepath) 156 | 157 | #Get maximum number of ligands from json file 158 | maxliglimit = inst_params["parameters"]["MAXLIGS"] 159 | 160 | #Check file viability, flag compatibility, etc. 161 | ligfiletype, ligfileprefix, protfiletype, nlig = initiate_mdfit(SCHRODINGER, args, master_dir, maxliglimit) 162 | 163 | #Generate path to ligand library 164 | if args.liglib and not args.precomplex: 165 | ligpath = os.path.join(master_dir, "%s.sdf"%ligfileprefix) 166 | elif args.precomplex: 167 | ligpath = os.path.join(master_dir, args.precomplex) 168 | 169 | #Run FFBuilder, if requested 170 | run_ffbuilder(args, master_dir, SCHRODINGER, ligpath, \ 171 | ligfileprefix, schrodinger_version, inst_params, homepath) 172 | 173 | #Run Desmond MD, if requested 174 | run_md(args, master_dir, ligfileprefix, SCHRODINGER, ligpath, template_dir, inst_params) 175 | 176 | #Analyze Desmond trajectories, if requested 177 | run_analysis(args, master_dir, SCHRODINGER, inst_params) 178 | 179 | if __name__ == '__main__': 180 | main() -------------------------------------------------------------------------------- /mdml/mdml/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Command Line Interface 4 | ---------------------- 5 | A basic interface for MDML scripts. 
6 | 7 | @author: Benjamin Shields 8 | @email: benjamin.shields@bms.com 9 | """ 10 | 11 | ############################################################################## Imports 12 | 13 | import logging 14 | import pandas as pd 15 | import json 16 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 17 | from pandas.core.frame import DataFrame 18 | 19 | ############################################################################## Logger 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | ############################################################################## Interface 24 | 25 | def parser(description:str, add_computation:bool=False, **kwargs) -> (ArgumentParser, dict): 26 | """ 27 | Generate a formatted argparse parser. 28 | 29 | Parameters 30 | ---------- 31 | description : str 32 | CLI description. 33 | add_computation : bool, optional 34 | Add a computation parameters group including nproc. The default is True. 35 | **kwargs 36 | Keyword arguments passed to argparse.ArgumentParser. 37 | 38 | Returns 39 | ------- 40 | parser : ArgumentParser 41 | Argument parser. 42 | groups : dict 43 | Pre-generated argument groups. 
44 | """ 45 | 46 | # Formatted parser 47 | class ArgparseFormatter(ArgumentDefaultsHelpFormatter): 48 | pass 49 | 50 | parser = ArgumentParser( 51 | description=description, 52 | add_help=False, 53 | formatter_class=lambda prog: ArgparseFormatter(prog, width=98), 54 | **kwargs 55 | ) 56 | groups = {} 57 | 58 | # Help 59 | helper = parser.add_argument_group('HELP') 60 | helper.add_argument( 61 | '-h', 62 | action='help', 63 | help="""Show this help message and exit.""" 64 | ) 65 | helper.add_argument( 66 | '-debug', 67 | action='store_true', 68 | dest='debug', 69 | help="""Print debugging messages.""" 70 | ) 71 | groups['help'] = helper 72 | 73 | # Resource Management 74 | if add_computation: 75 | comp = parser.add_argument_group('COMPUTATION') 76 | comp.add_argument( 77 | '-nproc', 78 | action='store', 79 | dest='nproc', 80 | type=int, 81 | default=1, 82 | help="""Number of processors to use for parallel computation.""" 83 | ) 84 | groups['computation'] = comp 85 | 86 | return parser, groups 87 | 88 | ############################################################################## Data 89 | 90 | def aggregate_data(data:DataFrame, id_col:str, method:str) -> DataFrame: 91 | """ 92 | Aggregate SimFPs by computing the mean, min, or max of duplicate MD runs. 93 | 94 | Parameters 95 | ---------- 96 | features : DataFrame 97 | Feature matrix including duplicate SimFPs. 98 | id_col : str 99 | Molecule ID column. This column is used to identify duplicate SimFPs. 100 | method : str 101 | Aggregation method. The options are 'mean', 'min', and 'max'. 102 | 103 | Raises 104 | ------ 105 | ValueError 106 | The specified method is not supported. 107 | 108 | Returns 109 | ------- 110 | data : DataFrame 111 | Aggregated features. 
112 | """ 113 | 114 | if method == 'min': 115 | data = data.groupby(id_col).min() 116 | elif method == 'max': 117 | data = data.groupby(id_col).max() 118 | elif method == 'mean': 119 | data = data.groupby(id_col).mean() 120 | else: 121 | raise ValueError(f'Grouping method {method} not recognized.') 122 | 123 | data.insert(0, 'ID', data.index.values) 124 | 125 | return data.reset_index(drop=True) 126 | 127 | def load_data(path:str, id_col:str, drop:bool=None, aggregation:str=None) -> DataFrame: 128 | """ 129 | Load SimFP and target data and preprocess it by aggregating SimFPs from 130 | duplicate MD runs. 131 | 132 | Note 133 | ---- 134 | Rows containing SimFPs from duplicate MD runs should all of the same ID and 135 | target value. Any columns not corresponding to ID, features, or target 136 | should be removed via the drop argument. 137 | 138 | Parameters 139 | ---------- 140 | path : str 141 | Path to input CSV file. 142 | id_col : str, optional 143 | Column header containing unique molecule IDs. 144 | drop : bool, optional 145 | Remove . The default is None. 146 | aggregation : str, optional 147 | Type of aggregation to use. The options are None, 'mean', 'min', and 148 | 'max'. The default is None. 149 | 150 | Raises 151 | ------ 152 | ValueError 153 | An ID column is required for aggregation. 154 | 155 | Returns 156 | ------- 157 | data : DataFrame 158 | Loaded and preprocessed data. 159 | """ 160 | 161 | data = pd.read_csv(path) 162 | if drop is not None and len(drop) > 0: 163 | data = data.drop(drop, axis=1) 164 | if aggregation is None: 165 | if id_col != 'ID': 166 | data.insert(0, 'ID', data[id_col].values) 167 | data = data.drop(id_col, axis=1) 168 | else: 169 | if id_col is None: 170 | raise ValueError('An ID column is required for aggregation.') 171 | data = aggregate_data(data, id_col, aggregation) 172 | 173 | return data 174 | 175 | def save_json(data:dict, path:str) -> None: 176 | """ 177 | Save a dictionary as a JSON file. 
178 | 179 | Parameters 180 | ---------- 181 | data : dict 182 | Dictionary to be saved. 183 | path : str 184 | Path to JSON file. 185 | 186 | Returns 187 | ------- 188 | None 189 | """ 190 | 191 | with open(path, 'w', encoding='utf-8') as file: 192 | json.dump(data, file, ensure_ascii=False, indent=4) 193 | 194 | def load_json(path:str) -> dict: 195 | """ 196 | Load a JSON file as a dictionary. 197 | 198 | Parameters 199 | ---------- 200 | path : str 201 | Path to JSON file. 202 | 203 | Returns 204 | ------- 205 | dict 206 | Loaded data. 207 | """ 208 | 209 | with open(path) as file: 210 | data = json.load(file) 211 | 212 | return data 213 | 214 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /bin/mdfit_event_analysis.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | import subprocess 13 | 14 | #Fixes issue with X11 forwarding 15 | os.environ['QT_QPA_PLATFORM']='offscreen' 16 | 17 | ###Initiate logger### 18 | logger = logging.getLogger(__name__) 19 | 20 | def run_job(command): 21 | #Run provided command, joining list with space. 
Pipe stdout and sdterror to log file 22 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 23 | stderr=subprocess.STDOUT, shell=True, text=True) 24 | 25 | #Iterate over sdtout and sdterror 26 | for line in process.stdout.split('\n'): 27 | #Ignore blank lines 28 | if line != "": 29 | #Ignore ExitStatus 30 | if "ExitStatus" not in line: 31 | #Write to log file for debugging 32 | logger.debug(line) 33 | 34 | def dircheck(master_dir, basename): 35 | #Generate directory name in scratch space for each ligand -repetition<#> 36 | newdir = os.path.join(master_dir, "desmond_md_analysis", "scratch", basename) 37 | 38 | #Check if directory exists 39 | if os.path.isdir(newdir) == False: 40 | #If not, make directory (recursive) 41 | os.makedirs(newdir) 42 | 43 | #Capture current step 44 | logger.info("Created directory: %s"%newdir) 45 | 46 | #Directory exists 47 | else: 48 | #Capture current step 49 | logger.info("Directory already exists: %s"%newdir) 50 | 51 | #Return ligand scratch directory path 52 | return newdir 53 | 54 | def trj_pathnames(md_path, basename): 55 | #Check if slice trajectory exists 56 | if os.path.isfile(os.path.join(md_path, "%s_sliced-out.cms"%basename)) == True: 57 | #If it does, set cms path to sliced trajectory file 58 | cms_path = os.path.join(md_path, "%s_sliced-out.cms"%basename) 59 | 60 | #If it does, set trj path to sliced trajectory directory 61 | trj_path = os.path.join(md_path, "%s_sliced_trj"%basename) 62 | 63 | #Check if unsliced trajectory does and sliced trajectory does not exist 64 | elif os.path.isfile(os.path.join(md_path, "%s_sliced-out.cms"%basename)) == False and os.path.isfile(os.path.join(md_path, "%s-out.cms"%basename)) == True: 65 | #If it does, set cms path to unsliced trajectory file 66 | cms_path = os.path.join(md_path, "%s-out.cms"%basename) 67 | 68 | #If it does, set trj path to unsliced trajectory file 69 | trj_path = os.path.join(md_path, "%s_trj"%basename) 70 | 71 | #Could not locate trajectory 72 | 
else: 73 | #Log error 74 | logger.critical("Trajectory could not be located!") 75 | 76 | #Exit 77 | sys.exit() 78 | 79 | #Return paths to trajectory files 80 | return cms_path, trj_path 81 | 82 | def gen_outname(basename): 83 | #Generate input eaf filename 84 | eaf_in = "%s-in.eaf"%basename 85 | 86 | #Generate output eaf filename 87 | eaf_out = "%s-out.eaf"%basename 88 | 89 | #Generate output pdf filename 90 | eaf_pdf = "%s_analysis.pdf"%basename 91 | 92 | #Return generated filenames 93 | return eaf_in, eaf_out, eaf_pdf 94 | 95 | 96 | def main(SCHRODINGER, rep, master_dir, args, inst_params): 97 | #Prepare analysis hostname 98 | analysis_host = inst_params["hostnames"]["ANALYSIS"] 99 | 100 | #Prepare Schrodinger run command ($SCHRODINGER/run) 101 | run_cmd = os.path.join(SCHRODINGER, 'run') 102 | 103 | #Get repetition name -repetition<#> 104 | basename = os.path.basename(rep) 105 | 106 | #Get ligand name 107 | ligbase = basename.split("_repetition")[0] 108 | 109 | #Generate pathname to trajectory files 110 | md_path = os.path.join(master_dir, "desmond_md", ligbase, basename) 111 | 112 | #Generate paths to trajectory files 113 | cms_path, trj_path = trj_pathnames(md_path, basename) 114 | 115 | #Generate filenames for event analysis 116 | eaf_in, eaf_out, eaf_pdf = gen_outname(basename) 117 | 118 | #Check if output eaf file exists in scratch and permanent space 119 | if os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", "scratch", eaf_out)) == False and os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, eaf_out)) == False: 120 | #If not, prepare directory for text data and plot files 121 | data_dir = dircheck(master_dir, basename) 122 | 123 | #Need to wrap ASL in double quotes for Schrodinger to interpret 124 | prot_ASL = '"%s"'%args.prot_ASL 125 | lig_ASL = '"%s"'%args.lig_ASL 126 | 127 | #Prepare event analysis (analyze) command 128 | event_analysis_command1 = [run_cmd, "event_analysis.py", "analyze", cms_path, "-p", 
prot_ASL, "-l", lig_ASL, "-out", basename] 129 | 130 | #Prepare simulation analysis command 131 | analyze_simulation_command=[run_cmd, "analyze_simulation.py", "-HOST", analysis_host, "-OPLSDIR", args.oplsdir, "-JOBNAME", basename, "-WAIT", cms_path, trj_path, eaf_out, eaf_in] 132 | 133 | #Prepare event analysis (report) command 134 | event_analysis_command2=[run_cmd, "event_analysis.py", "report", "-pdf", eaf_pdf, "-data", "-plots", "-data_dir", data_dir, eaf_out] 135 | 136 | #Capture current step 137 | logger.info("Generating eaf file: %s"%' '.join(event_analysis_command1)) 138 | 139 | #Run event analysis (analyze) command 140 | run_job(event_analysis_command1) 141 | 142 | #Capture current step 143 | logger.info("Running simulation analysis: %s"%' '.join(analyze_simulation_command)) 144 | 145 | #Run simulation analysis command 146 | run_job(analyze_simulation_command) 147 | 148 | #Limitation of Schrodinger's code. Cannot control output filenames and asynchronous calls clash. Forced to run serially. 149 | #Return event analysis (report) command 150 | return event_analysis_command2 151 | 152 | #Output eaf file exists 153 | else: 154 | #Check if PDF was generated 155 | if os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", "scratch", eaf_pdf)) == False and os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, eaf_pdf)) == False: 156 | #If not, prepare directory for text data and plot files 157 | data_dir = dircheck(master_dir, basename) 158 | 159 | #Prepare event analysis (report) command 160 | event_analysis_command2=[run_cmd, "event_analysis.py", "report", "-pdf", eaf_pdf, "-data", "-plots", "-data_dir", data_dir, eaf_out] 161 | 162 | #Limitation of Schrodinger's code. Cannot control output filenames and asynchronous calls clash. Forced to run serially. 
163 | #Return event analysis (report) command 164 | return event_analysis_command2 165 | 166 | #PDF was generated 167 | else: 168 | #Capture current step 169 | logger.info("eaf file found. Skipping event analysis.") 170 | 171 | #Return empty list - event analysis (report) not necessary 172 | return [] 173 | 174 | if __name__ == '__main__': 175 | main(SCHRODINGER, rep, master_dir, args, inst_params) -------------------------------------------------------------------------------- /bin/mdfit_parseargs.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import textwrap 11 | import argparse 12 | import sys 13 | import os 14 | 15 | ###Initiate logger### 16 | logger = logging.getLogger(__name__) 17 | 18 | class ArgumentParser(argparse.ArgumentParser): 19 | def error(self, message): 20 | logger.critical(message) 21 | print("An error has occurred. 
Check log file for information.") 22 | run_command(('kill','0')) 23 | 24 | """Disables prefix matching in ArgumentParser.""" 25 | def _get_option_tuples(self, option_string): 26 | """Prevent argument parsing from looking for prefix matches.""" 27 | return [] 28 | 29 | def parse_arguments(master_dir, homepath): 30 | parser = ArgumentParser(prog='MDFit', formatter_class=argparse.RawDescriptionHelpFormatter, usage='%(prog)s [options]', 31 | description=textwrap.dedent('''\ 32 | ---------------------------------------------- 33 | MDFit workflow: 34 | a) Run FFBuilder to optimize ligand parameters 35 | b) Prepare protein-ligand complexes using user-supplied protein mae file 36 | c) Solvate each protein-ligand complex 37 | d) Run Desmond MD with each of the solvated systems 38 | e) Generate SimFPs and compatibility information 39 | f) Cluster the simulation, reporting representative structures 40 | ''')) 41 | 42 | structure = parser.add_argument_group("STRUCTURE INPUT") 43 | ffbuilder = parser.add_argument_group("FFBUILDER") 44 | desmond = parser.add_argument_group("DESMOND MD") 45 | analysis = parser.add_argument_group("MD ANALYSIS") 46 | clustering = parser.add_argument_group("MD CLUSTERING") 47 | misc = parser.add_argument_group("MISCELLANEOUS") 48 | 49 | structure.add_argument('-p', '--prot', dest='prot', default=None, help='protein mae file; must also provide liglib') 50 | structure.add_argument('-l', '--liglib', dest='liglib', default=None, help='ligand library in mae or sdf format; must also provide prot') 51 | structure.add_argument('--precomplex', dest='precomplex', default=None, help='mae file with protein and ligand already complexed (e.g., crystal structure); skips FFBuilder') 52 | 53 | ffbuilder.add_argument('--skip_ff', dest='skip_ff', action='store_true', help='skip FFBuilder; default = false') 54 | ffbuilder.add_argument('-o', '--oplsdir', dest='oplsdir', default='%s/.schrodinger/opls_dir'%homepath, help='path to custom forcefield; default = 
%s/.schrodinger/opls_dir'%homepath) 55 | 56 | desmond.add_argument('--skip_md', dest='skip_md', action='store_true', help='skip MD simulation; default = false') 57 | desmond.add_argument('--solvent', dest='solvent', default='SPC', help='SPC/TIP3P; default = SPC') 58 | desmond.add_argument('-t', '--md_sim_time', dest='md_sim_time', type=float, default='2000', help='in picoseconds; default = 2000') 59 | desmond.add_argument('--md_traj_write_freq', dest='md_traj_write_freq', type=float, default='100', help='in picoseconds; default = 100') 60 | desmond.add_argument('-r', '--md_repetitions', dest='md_repetitions', type=int, default='1', help='number of MD simulations to run for each ligand, each with a different random seed; default = 1') 61 | 62 | analysis.add_argument('--skip_analysis', dest='skip_analysis', action='store_true', help='skip MD simulation analysis; default = false') 63 | analysis.add_argument('--slice_start', dest='slice_start', type=int, default=0, help='frame to start analysis. default: 0') 64 | analysis.add_argument('--slice_end', dest='slice_end', help='frame to end analysis. 
default: last frame') 65 | analysis.add_argument('--analysis_lig', dest='analysis_lig', default='all', help='name of ligand for MD analysis; default = all') 66 | analysis.add_argument('--prot_ASL', dest='prot_ASL', default='"protein"', help='ASL definition for protein; default = "protein"') 67 | analysis.add_argument('--lig_ASL', dest='lig_ASL', default='"auto"', help='ASL definition for ligands; default = "auto"') 68 | analysis.add_argument('--analysis_cutoff', dest='analysis_cutoff', type=float, default='0.0000', help='interactions above this percentage of the simulation will be recorded; default=0.0000') 69 | 70 | clustering.add_argument('--skip_cluster', dest='skip_cluster', action='store_true', help='skip trajectory clustering; default = false') 71 | clustering.add_argument('--n_clusters', dest='n_clusters', type=int, default='5', help='number of clusters to output; default = 5') 72 | clustering.add_argument('--rmsd_ASL', dest='rmsd_ASL', default='"auto"', help='ASL definition for RMSD clustering (e.g., ligand); default = "auto"') 73 | clustering.add_argument('--centering_ASL', dest='centering_ASL', default='"protein"', help='ASL definition for centering the trajectory (e.g., binding site residues); default = protein') 74 | clustering.add_argument('--parch_align_ASL', dest='parch_align_ASL', default='"protein"', help='ASL definition for alignment during parching (e.g., binding site residues); default = protein') 75 | clustering.add_argument('--parch_solv_ASL', dest='parch_solv_ASL', default='"auto"', help='ASL definition for atoms around which solvent is retained; default = "auto"') 76 | clustering.add_argument('--n_solv', dest='n_solv', type=int, default='100', help='number of solvent molecules to keep during parching; default = 100') 77 | 78 | misc.add_argument('-m', '--max_workers', dest='max_workers', type=int, default=0, help='number of workers for multitasking; default = min(32, os.cpu_count() + 4)') 79 | misc.add_argument('-d', '--debug', 
action='store_const', dest='loglevel', const=logging.DEBUG, default=logging.INFO, help='Print all debugging statements to log file') 80 | 81 | #Get all arguments and check for any unknown variables 82 | args,unknowns = parser.parse_known_args() 83 | 84 | #Check if no options passed to script 85 | if len(sys.argv)==1: 86 | #Capture error 87 | logger.critical("provide '-prot' and '-liglib' for basic functionality") 88 | 89 | #Print help statement 90 | parser.print_help() 91 | 92 | #Exit 93 | sys.exit() 94 | 95 | #Check if unknown variables passed to script 96 | if unknowns: 97 | #Document warning and ignore variables 98 | logger.warning('ignoring unrecognized arguments: %s'%unknowns) 99 | 100 | #Check if protein and ligands are pre-complexed 101 | if not args.precomplex: 102 | #If not, check if protein and ligand library files are provided 103 | if not args.prot or not args.liglib: 104 | #If not, capture error 105 | logger.critical("'-prot' and '-liglib' arguments are dependent on each other. 
Alternatively, provide '-precomplex'") 106 | 107 | #Exit 108 | sys.exit() 109 | 110 | #Protein and ligands are pre-complexed 111 | else: 112 | #Get filetype 113 | complexfiletype = os.path.splitext(args.precomplex)[-1].lower() 114 | 115 | #Check it is mae 116 | if complexfiletype == ".mae": 117 | #Skip FFBuilder 118 | args.skip_ff=True 119 | 120 | #Remove protein filename 121 | args.prot=None 122 | 123 | #Remove ligand filename 124 | args.liglib=None 125 | 126 | #Filetype is not mae 127 | else: 128 | #Capture error 129 | logger.critical("Precomplexed systems must be in MAE format") 130 | 131 | #Exit 132 | sys.exit() 133 | 134 | #Return all arguments 135 | return args 136 | 137 | def main(master_dir, homepath): 138 | #Get arguments from user or script defaults 139 | args = parse_arguments(master_dir, homepath) 140 | 141 | #Return all arguemnts 142 | return args 143 | 144 | if __name__ == '__main__': 145 | main(master_dir, homepath) 146 | -------------------------------------------------------------------------------- /bin/mdfit_cluster_traj.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | import subprocess 13 | import glob 14 | 15 | #Import Schrodinger modules 16 | from schrodinger import structure 17 | from schrodinger.structutils import analyze 18 | 19 | ###Initiate logger### 20 | logger = logging.getLogger(__name__) 21 | 22 | def run_job(command): 23 | #Run provided command, joining list with space. 
Pipe stdout and sdterror to log file 24 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 25 | stderr=subprocess.STDOUT, shell=True, text=True) 26 | 27 | #Iterate over sdtout and sdterror 28 | for line in process.stdout.split('\n'): 29 | #Ignore blank lines 30 | if line != "": 31 | #Ignore ExitStatus 32 | if "ExitStatus" not in line: 33 | #Write to log file for debugging 34 | logger.debug(line) 35 | 36 | def center_traj(SCHRODINGER, cms_path, trj_path, run_cmd, basename, args): 37 | #Prepare centering command 38 | command = [run_cmd, "trj_center.py", "-t", trj_path, "-asl", args.centering_ASL, cms_path, "%s_centered"%basename] 39 | 40 | #Capture current step 41 | logger.info("Centering trajectory: %s"%' '.join(command)) 42 | 43 | #Run centering command 44 | run_job(command) 45 | 46 | def lig_identifier(args, ref_path): 47 | #Read in reference structure 48 | struct = structure.StructureReader.read(ref_path) 49 | 50 | #Initiate Schrodinger's ligand searcher utility 51 | ligand_searcher = analyze.AslLigandSearcher() 52 | 53 | #Set minimum atom count 54 | ligand_searcher.min_atom_count = 15 55 | 56 | #Set maximum atom count 57 | ligand_searcher.max_atom_count = 300 58 | 59 | #Excluse ions as ligands 60 | ligand_searcher.exclude_ions = True 61 | 62 | #Include amino acids (peptides) 63 | ligand_searcher.exclude_amino_acids = False 64 | 65 | #Search reference structure given search criteria 66 | ligand_list = ligand_searcher.search(struct) 67 | 68 | #Grab most-likely ligand from all possible ligands 69 | ligand = max(ligand_list, key=lambda lig: len(lig.atom_indexes)) 70 | 71 | #Return ligand ASL 72 | return ligand 73 | 74 | def parch_traj(SCHRODINGER, ligbase, basename, args, run_cmd, center_cms, center_trj, master_dir, ref_path): 75 | #Check if parch ASL is set to default 76 | if args.parch_solv_ASL == '"auto"': 77 | #If it is, identify ligand ASL using Schrodinger's utilities 78 | ligand = lig_identifier(args, ref_path) 79 | 80 | #Set argument to 
identified ligand ASL 81 | args.parch_solv_ASL = "%s"%ligand.ligand_asl 82 | 83 | #Prepare trajectory parching command 84 | command = [run_cmd, "trj_parch.py", "-output-trajectory-format", "auto", "-ref-mae", ref_path, "-align-asl", args.parch_align_ASL, "-dew-asl", '"%s"'%args.parch_solv_ASL, "-n", str(args.n_solv), center_cms, center_trj, "%s_parched"%basename] 85 | 86 | #Capture current step 87 | logger.info("Parching trajectory: %s"%' '.join(command)) 88 | 89 | #Run parching command 90 | run_job(command) 91 | 92 | def cluster_traj(SCHRODINGER, basename, args, run_cmd, ref_path, parch_cms, parch_trj): 93 | #Check if rmsd ASL is set to default 94 | if args.rmsd_ASL == '"auto"': 95 | #If it is, identify ligand ASL using Schrodinger's utilities 96 | ligand = lig_identifier(args, ref_path) 97 | 98 | #Set argument to identified ligand ASL 99 | args.rmsd_ASL = "%s"%ligand.ligand_asl 100 | 101 | #Prepare trajectory clustering command 102 | command = [run_cmd, "trj_cluster.py", parch_cms, parch_trj, "%s_cluster"%basename, "-rmsd-asl", '"%s"'%args.rmsd_ASL, "-n", str(args.n_clusters)] 103 | 104 | #Capture current step 105 | logger.info("Clustering trajectory: %s"%' '.join(command)) 106 | 107 | #Run clustering command 108 | run_job(command) 109 | 110 | def main(SCHRODINGER, rep, master_dir, args): 111 | #Prepare Schrodinger run command ($SCHRODINGER/run) 112 | run_cmd = os.path.join(SCHRODINGER, 'run') 113 | 114 | #Generate basename -repetition<#> 115 | basename = os.path.basename(rep) 116 | 117 | #Get ligand basename 118 | ligbase = basename.split("_repetition")[0] 119 | 120 | #Generate path to MD repetition dir 121 | md_path = os.path.join(master_dir, "desmond_md", ligbase, basename) 122 | 123 | #Generate path to reference (pre-simulation) file 124 | ref_path = "%s_out_complex_min.mae"%os.path.join(master_dir, "desmond_md", ligbase, "md_setup", ligbase) 125 | 126 | #Check if slice trajectory exists 127 | if os.path.isfile(os.path.join(md_path, 
"%s_sliced-out.cms"%basename)) == True: 128 | #If it does, set cms path to slice file 129 | cms_path = os.path.join(md_path, "%s_sliced-out.cms"%basename) 130 | 131 | #If it does, set trj path to slice trajectory 132 | trj_path = os.path.join(md_path, "%s_sliced_trj"%basename) 133 | #Check if unsliced trajectory does and sliced trajectory does not exist 134 | elif os.path.isfile(os.path.join(md_path, "%s_sliced-out.cms"%basename)) == False and os.path.isfile(os.path.join(md_path, "%s-out.cms"%basename)) == True: 135 | #If it does, set cms path to unsliced trajectory 136 | cms_path = os.path.join(md_path, "%s-out.cms"%basename) 137 | 138 | #If it does, set trj path to unsliced trajectory 139 | trj_path = os.path.join(md_path, "%s_trj"%basename) 140 | 141 | #Could not find trajectory 142 | else: 143 | #Log error 144 | logger.critical("Trajectory could not be located!") 145 | 146 | #Exit 147 | sys.exit() 148 | 149 | #Check if centered trajectory exists 150 | if os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_centered-out.cms"%basename)) == False: 151 | #Center trajectory 152 | center_traj(SCHRODINGER, cms_path, trj_path, run_cmd, basename, args) 153 | 154 | #Generate path to centered trajectory file 155 | center_cms = os.path.join(master_dir, "desmond_md_analysis", "scratch", "%s_centered-out.cms"%basename) 156 | 157 | #Generate path to centered trajectory directory 158 | center_trj = os.path.join(master_dir, "desmond_md_analysis", "scratch", "%s_centered_trj"%basename) 159 | 160 | #Centered trajectory exists 161 | else: 162 | #Generate path to centered trajectory file 163 | center_cms = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_centered-out.cms"%basename) 164 | 165 | #Generate path to centered trajectory directory 166 | center_trj = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_centered_trj"%basename) 167 | 168 | #Document current step 169 | logger.info("Centered trajectory 
found: %s"%center_cms) 170 | 171 | #Check if parched trajectory exists 172 | if os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_parched-out.cms"%basename)) == False: 173 | #Parch trajectory (remove excess waters) 174 | parch_traj(SCHRODINGER, ligbase, basename, args, run_cmd, center_cms, center_trj, master_dir, ref_path) 175 | 176 | #Generate path to parched trajectory file 177 | parch_cms = os.path.join(master_dir, "desmond_md_analysis", "scratch", "%s_parched-out.cms"%basename) 178 | 179 | #Generate path to parched trajectory directory 180 | parch_trj = os.path.join(master_dir, "desmond_md_analysis", "scratch", "%s_parched_trj"%basename) 181 | 182 | #Parched trajectory does not exist 183 | else: 184 | #Generate path to parched trajectory file 185 | parch_cms = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_parched-out.cms"%basename) 186 | 187 | #Generate path to parched trajectory directory 188 | parch_trj = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_parched_trj"%basename) 189 | 190 | #Document current step 191 | logger.info("Parched trajectory found: %s"%parch_cms) 192 | 193 | #Generate list of cluster files 194 | cluster_files = glob.glob(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_cluster_0*.cms"%basename)) 195 | 196 | #Check if cluster files exist 197 | if cluster_files == []: 198 | #Cluster trajectory 199 | cluster_traj(SCHRODINGER, basename, args, run_cmd, ref_path, parch_cms, parch_trj) 200 | 201 | #Cluster files exist 202 | else: 203 | logger.info("Cluster files found for %s. 
def run_job(command):
    """Join *command* into one shell line, run it, and stream output to the log.

    stdout and stderr are merged and written line by line at debug level.
    """
    # NOTE(review): shell=True on a joined string is acceptable for these
    # internally built commands, but never feed it user-controlled input.
    result = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \
                            stderr=subprocess.STDOUT, shell=True, text=True)

    for line in result.stdout.split('\n'):
        #Skip blank lines and Schrodinger's ExitStatus chatter
        if line and "ExitStatus" not in line:
            logger.debug(line)

def prep_hostname(args, inst_params):
    """Build the FFBuilder job-server host string '<host>:<nprocessors>'."""
    #Combine configured hostname and processor count
    host = "%s:%s"%(inst_params["hostnames"]["FFBUILDER"], inst_params["parameters"]["FFPROC"])

    #Document current step
    logger.info("FFBuilder hostname is %s"%host)

    return host
def gen_opls(args, schrodinger_version):
    """Derive the custom OPLS filename and its full path for this release."""
    #FFBuilder expects underscores in the release tag (e.g. 2023_4)
    release_tag = schrodinger_version.replace("-", "_")

    #Name and full path of the release-specific force field file
    forcefieldfile = "custom_%s.opls"%release_tag
    forcefieldfilepath = os.path.join(args.oplsdir, forcefieldfile)

    return forcefieldfile, forcefieldfilepath

def ffbuilder(forcefieldfilepath, SCHRODINGER, ligfileprefix, ligpath, newdir, forcefieldfile, args, host):
    """Run Schrodinger's ffbuilder for the ligand set.

    Creates a new custom OPLS file when none exists, otherwise merges new
    parameters into the existing one. Returns the path where FFBuilder
    deposits its opls output.
    """
    #Prepare Schrodinger run command ($SCHRODINGER/ffbuilder)
    run_cmd = os.path.join(SCHRODINGER, 'ffbuilder')

    #Generate jobname using ligand file name
    jobname = "MDFit_%s"%ligfileprefix

    #Where FFBuilder will deposit its opls output
    outopls = os.path.join(newdir, "%s_oplsdir"%jobname, forcefieldfile)

    if not os.path.isfile(forcefieldfilepath):
        #First run for this release: create the custom force field from scratch
        logger.info("Custom force-field does not exist. Creating one.")

        #Make sure the destination oplsdir exists
        if not os.path.isdir(os.path.join(args.oplsdir)):
            os.makedirs(os.path.dirname(forcefieldfilepath))

        command = [run_cmd, '-HOST', host, '-JOBNAME', jobname, ligpath, '-WAIT']
    else:
        #Existing custom force field: merge the new parameters into it
        logger.info("Custom force-field found. Merging new parameters with custom force-field.")
        command = [run_cmd, '-HOST', host, '-JOBNAME', jobname, '-OPLSDIR', forcefieldfilepath, ligpath, '-WAIT']

    #Capture current step, then run FFBuilder
    logger.info("Running FFBuilder: %s"%' '.join(command))
    run_job(command)

    return outopls
If not, copy generated file to new oplsdir 151 | if os.path.isfile(forcefieldfilepath) == False: 152 | #Generate path to home oplsdir 153 | home_oplsdir = os.path.join(homepath, ".schrodinger", "opls_dir") 154 | 155 | #Get any custom opls files 156 | opls_files = glob.glob(os.path.join(home_oplsdir, "custom_*.opls")) 157 | 158 | #Try copying FFBuilder output file to custom force field path 159 | if os.path.isfile(outopls) == True: 160 | #If file was generated, capture current step 161 | logger.info("Copying custom OPLS to oplsdir: %s"%outopls) 162 | logger.info(" : %s"%forcefieldfilepath) 163 | 164 | #Copy custom force field to user-specified oplsdir 165 | shutil.copy(outopls,forcefieldfilepath) 166 | 167 | #This fails if custom force field file does not exist and FFBuilder did not generate new parameters. Try finding custom opls file in $HOME 168 | 169 | #Check if current release force field file exists in default location 170 | elif os.path.isfile(os.path.join(home_oplsdir, forcefieldfile)) == True: 171 | #Capture current step 172 | logger.info("No new parameters generated. 
Copying opls file from %s"%home_oplsdir) 173 | 174 | #Copy file 175 | shutil.copy(os.path.join(home_oplsdir, forcefieldfile),forcefieldfilepath) 176 | 177 | #See if older version exists in $HOME 178 | elif opls_files != []: 179 | #Prepare Schrodinger custom_params utiltiy 180 | custom_params = os.path.join(SCHRODINGER, "utilities", "custom_params") 181 | 182 | #Prepare command for upgrading custom parameters 183 | command = [custom_params, "upgrade", home_oplsdir] 184 | 185 | #Capture current step 186 | logger.info("Upgrading custom parameters: %s"%' '.join(command)) 187 | 188 | #Run upgrade 189 | run_job(command) 190 | 191 | #Capture current step 192 | logger.info("Copying upgraded custom parameters to desired oplsdir") 193 | 194 | #Copy upgraded opls file 195 | shutil.copy(os.path.join(home_oplsdir, forcefieldfile),forcefieldfilepath) 196 | 197 | else: 198 | #Capture error and provide some work-around 199 | logger.critical("Force field file creation failed and default opls could not be located. 
def main(args, master_dir, SCHRODINGER, ligpath, ligfileprefix, schrodinger_version, inst_params, homepath):
    """Orchestrate the FFBuilder stage: build or merge custom OPLS parameters."""
    #Honor the user's request to skip this stage entirely
    if args.skip_ff:
        logger.info("Skipping FFBuilder (--skip_ff provided by user)")
        return

    #Job-server host string, e.g. "host:nproc"
    host = prep_hostname(args, inst_params)

    #Create (or archive and recreate) the ffbuilder working directory
    newdir = dircheck(master_dir)

    #Name and path of the release-specific custom force-field file
    forcefieldfile, forcefieldfilepath = gen_opls(args, schrodinger_version)

    #Run FFBuilder itself
    outopls = ffbuilder(forcefieldfilepath, SCHRODINGER, ligfileprefix, ligpath, newdir, forcefieldfile, args, host)

    #Install the resulting parameters into the user's oplsdir
    FFcleanup(forcefieldfilepath, forcefieldfile, homepath, outopls, SCHRODINGER)

    #Leave the working directory and return to the project root
    logger.info("Changing directory to %s"%master_dir)
    os.chdir(master_dir)

if __name__ == '__main__':
    main(args, master_dir, SCHRODINGER, ligpath, ligfileprefix, schrodinger_version, inst_params, homepath)
def run_job(command):
    """Execute *command* through the shell and forward its output to the debug log."""
    #stdout and stderr are merged into one stream
    output = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \
                            stderr=subprocess.STDOUT, shell=True, text=True).stdout

    for line in output.split('\n'):
        #Drop blank lines and Schrodinger's ExitStatus chatter
        if line and "ExitStatus" not in line:
            logger.debug(line)

def dircheck(master_dir):
    """Ensure the analysis scratch directory exists, chdir into it, return its path."""
    #Generate scratch directory name
    newdir = os.path.join(master_dir, "desmond_md_analysis", "scratch")

    #Create the scratch directory on first use
    if os.path.isdir(newdir):
        logger.info("Directory already exists: %s"%newdir)
    else:
        os.makedirs(newdir)
        logger.info("Created directory: %s"%newdir)

    #Move into the scratch directory
    logger.info("Changing directory to %s"%newdir)
    os.chdir(newdir)

    return newdir
def prep_workers(args):
    """Resolve the thread-pool size for the parallel analysis phases.

    When args.max_workers is 0 the ThreadPoolExecutor-style default
    min(32, cpu_count + 4) is used; otherwise the user's value wins.
    """
    #Check if user provided a number of workers
    if args.max_workers == 0:
        #os.cpu_count() can return None when the count is undeterminable;
        #fall back to 1 so the arithmetic never raises a TypeError
        workers = min(32, (os.cpu_count() or 1) + 4)

    #User provided number of workers
    else:
        workers = args.max_workers

    #Capture current step
    logger.info("Using %s workers for executing calls asynchronously"%workers)

    #Return number of workers
    return workers

def run_analysis(SCHRODINGER, rep, master_dir, args, inst_params):
    """Run event analysis for one repetition via mdfit_event_analysis.py.

    Returns the dat-extraction command that must later be run serially.
    """
    event_analysis_command2 = mdfit_event_analysis.main(SCHRODINGER, rep, master_dir, args, inst_params)
    return event_analysis_command2

def dat_extract(pdf_commands):
    """Serially run the dat/png extraction commands collected during analysis."""
    #Iterate over all dat extract commands
    for command in pdf_commands:
        #Entries can be empty when previous eaf files were found
        if command != []:
            #Capture current step
            logger.info("Generating data files: %s"%' '.join(command))

            #Run each job serially
            run_job(command)
def combine_csvs(master_dir):
    """Merge every ligand-specific SimFP/compatibility CSV into master files.

    Must run serially; delegates to mdfit_combine_csvs.py.
    """
    mdfit_combine_csvs.main(master_dir)

def cleanup(rep, master_dir, args):
    """Move a repetition's analysis output from the shared scratch area into
    its permanent desmond_md_analysis/<lig>/<lig>_repetition<#> directory."""
    #Ligand name with and without the repetition suffix
    basename = os.path.basename(rep)
    ligbase = basename.split("_repetition")[0]

    #Permanent home for this repetition's files
    repdir = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename)

    #Create it on first use
    if not os.path.isdir(repdir):
        os.makedirs(repdir)
        logger.info("Created directory: %s"%repdir)
    else:
        logger.info("Directory already exists: %s"%repdir)

    #Collect everything in scratch that belongs to this repetition
    scratch = os.path.join(master_dir, "desmond_md_analysis", "scratch")
    move_files = []
    for pattern in ("%s_*"%basename, "%s.*"%basename):
        move_files.extend(glob.glob(os.path.join(scratch, pattern)))

    #Relocate each file into the permanent directory
    for path in move_files:
        shutil.move(path, os.path.join(repdir, os.path.basename(path)))
def cluster_traj(SCHRODINGER, rep, master_dir, args):
    """Cluster one repetition's trajectory, then move its files out of scratch.

    Delegates clustering to mdfit_cluster_traj.py.
    """
    mdfit_cluster_traj.main(SCHRODINGER, rep, master_dir, args)

    #Move files from scratch to the repetition directory
    cleanup(rep, master_dir, args)

def main(args, master_dir, SCHRODINGER, inst_params):
    """Drive the MD analysis stage: event analysis, dat extraction, SimFP
    tabulation, CSV merging and (optionally) trajectory clustering."""
    #Make sure the shared scratch directory exists and move into it
    scratch_dir = dircheck(master_dir)

    #Run the analysis pipeline unless the user opted out
    if not args.skip_analysis:
        #Every repetition directory selected for analysis
        reppaths = ligfile_check(master_dir, args)

        #Thread-pool size for the parallel phases
        workers = prep_workers(args)

        #Commands that must later run serially (dat/png extraction)
        pdf_commands = []

        #Phase 1: event analysis, fanned out across worker threads
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            analysis_jobs = {executor.submit(run_analysis, SCHRODINGER, rep, master_dir, args, inst_params): rep for rep in reppaths}

            for future in concurrent.futures.as_completed(analysis_jobs):
                lig = analysis_jobs[future]
                try:
                    pdf_commands.append(future.result())
                except Exception as exc:
                    #A failure in any repetition aborts the whole run
                    logger.critical("%s generated an exception during event analysis: %s"%(lig, exc))
                    sys.exit()
                else:
                    logger.info("Analysis success: %s"%(lig))

        logger.info("Extracting dat and png files serially")

        #Phase 2: Schrodinger's utility cannot control output filenames,
        #so dat/png extraction is forced to run one job at a time
        dat_extract(pdf_commands)

        #Phase 3: tabulate SimFP / compatibility data in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            tabulate_jobs = {executor.submit(tabulate_simfp, SCHRODINGER, rep, master_dir, args): rep for rep in reppaths}

            for future in concurrent.futures.as_completed(tabulate_jobs):
                lig = tabulate_jobs[future]
                try:
                    data = future.result()
                except Exception as exc:
                    logger.critical("%s generated an exception during tabulation: %s"%(lig, exc))
                    sys.exit()
                else:
                    logger.info("Trj extraction success: %s"%(lig))

        #Phase 4: merge the per-repetition CSVs into master files (serial)
        combine_csvs(master_dir)

        #Phase 5: optional trajectory clustering
        if args.skip_cluster:
            logger.info("Skipping trajectory clustering (--skip_cluster provided by user)")
        else:
            with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
                cluster_jobs = {executor.submit(cluster_traj, SCHRODINGER, rep, master_dir, args): rep for rep in reppaths}

                for future in concurrent.futures.as_completed(cluster_jobs):
                    lig = cluster_jobs[future]
                    try:
                        data = future.result()
                    except Exception as exc:
                        logger.critical("%s generated an exception during clustering: %s"%(lig, exc))
                        sys.exit()
                    else:
                        logger.info("Clustering success: %s"%(lig))

    #Return to the master directory before handing control back
    logger.info("Changing directory to %s"%master_dir)
    os.chdir(master_dir)

if __name__ == '__main__':
    main(args, master_dir, SCHRODINGER, inst_params)
def trj_pathnames(md_path, basename):
    """Resolve the cms/trj paths for a repetition, preferring the sliced files.

    Returns (cms_path, trj_path). Exits the program when neither a sliced
    nor an unsliced trajectory exists in md_path.
    """
    sliced_cms = os.path.join(md_path, "%s_sliced-out.cms"%basename)
    full_cms = os.path.join(md_path, "%s-out.cms"%basename)

    #Prefer the sliced trajectory when it exists
    if os.path.isfile(sliced_cms):
        cms_path = sliced_cms
        trj_path = os.path.join(md_path, "%s_sliced_trj"%basename)

    #Fall back to the unsliced trajectory
    elif os.path.isfile(full_cms):
        cms_path = full_cms
        trj_path = os.path.join(md_path, "%s_trj"%basename)

    #Could not locate a trajectory
    else:
        logger.critical("Trajectory could not be located!")
        sys.exit()

    return cms_path, trj_path

def simfp(dat_files, round_int, basename, master_dir, num_frames, args, compat_prep, ligbase, repnum):
    """Tabulate SimFPs (fraction of frames each protein-ligand interaction
    occurs) from the PL-Contacts dat files and write <basename>_SimFP.csv.

    Also appends per-frame average interaction counts to compat_prep
    (mutated in place).
    """
    #Seed the SimFP table with ligand / repetition bookkeeping
    simfp_prep = pd.DataFrame({'Molecule':['Repetition'], ligbase:[repnum]})

    #Iterate over all dat files
    for file in os.listdir(dat_files):
        #Only protein-ligand contact files contribute to SimFPs
        if file.startswith('PL-Contacts') and file.endswith('.dat'):
            #Interaction type is encoded in the filename
            int_type = file.split('_')[1].split('.dat')[0]

            df = pd.read_csv(os.path.join(dat_files, file), sep=r"\s+")

            #Drop the stray leading "#" column to realign headers
            df = df.shift(axis=1).drop('#', axis=1)

            if file.endswith('Metal.dat'):
                #Keep only ligand-metal rows; protein-metal rows are excluded
                df = df[df["Type"].str.contains("prot") == False]

                #Metal SimFP key has no chain: _:<site>:<type>
                df['Sort'] = '_:' + df['MetalSite'] + ':%s'%int_type
                counts = df['Type'].groupby(df['Sort']).count()
            else:
                #Standard SimFP key: <chain>:<resnum>:<type>
                df['Sort'] = df['Chain'] + ':' + df['Residue#'].astype(str) + ':%s'%int_type
                counts = df['Residue#'].groupby(df['Sort']).count()

            #Average number of interactions per frame -> compatibility table
            avg_int = round((len(df.index)/num_frames), round_int)
            compat_prep.loc[len(compat_prep.index)] = ["Average_%s_per_frame"%int_type, avg_int]

            #Fraction of frames each unique interaction is present
            add_res = pd.DataFrame(counts.index.values, columns=['Molecule'])
            add_res[ligbase] = counts.values/num_frames
            add_res[ligbase] = add_res[ligbase].round(round_int)

            #BUG FIX: the cutoff filter result was previously discarded;
            #assign it so SimFPs below args.analysis_cutoff are actually removed
            add_res = add_res[add_res[ligbase] >= args.analysis_cutoff]

            #DataFrame.append was removed in pandas 2.0; concat is the
            #documented equivalent with ignore_index=True
            simfp_prep = pd.concat([simfp_prep, add_res], ignore_index=True)

    #Transpose SimFP dataframe and write to csv file
    simfp_prep.transpose().to_csv(os.path.join(master_dir, "desmond_md_analysis", "scratch", "%s_SimFP.csv"%basename), header=False)
na_values=['']) 151 | 152 | #Drop first column to fix spacing (stray "#") 153 | df = df.shift(axis=1).drop('#', axis=1) 154 | 155 | #"PDBResName" can be empty. Hack to get around alignment issues in pandas 156 | #Check if last column in dataframe is empty (NaN) 157 | if df['wrt_Ligand'].isnull().values.any(): 158 | #If it is, shift the dataframe over and drop the Atom# column 159 | df = df.shift(axis=1).drop('Atom#', axis=1) 160 | 161 | #Rename PDBResName to Frame 162 | df = df.rename(columns={'PDBResName': 'Frame'}) 163 | 164 | #Iterate over all columns in dataframe 165 | for col in df.columns: 166 | #Check if reading property column 167 | if "wrt" in col: 168 | #Calculate the average value for the given property 169 | avg_val = df.loc[:, '%s'%col].mean().round(round_int) 170 | 171 | #Add average to compatibility dataframe 172 | compat_prep.loc[len(compat_prep.index)] = ["Avg_ligRMSF_%s"%col, avg_val] 173 | 174 | #Check if reading protein and ligand RMSD dat file 175 | elif file == "PL_RMSD.dat": 176 | #Initiate dataframe for the dat file 177 | df = pd.read_csv(os.path.join(dat_files, file), sep="\s+") 178 | 179 | #Drop first column to fix spacing issue (stray "#") 180 | df = df.shift(axis=1).drop('#', axis=1) 181 | 182 | #Iterate over all columns in dataframe 183 | for col in df.columns: 184 | #Check if column is not "frame" 185 | if "frame" not in col: 186 | #Calculate average value for the given property 187 | avg_val = df.loc[:, '%s'%col].mean().round(round_int) 188 | 189 | #Add average to compatibility dataframe 190 | compat_prep.loc[len(compat_prep.index)] = ["Avg_PL_RMSD_%s"%col, avg_val] 191 | 192 | #Check if reading protein RMSF dat file 193 | elif file == "P_RMSF.dat": 194 | #Initiate dataframe for the dat file 195 | df = pd.read_csv(os.path.join(dat_files, file), sep="\s+") 196 | 197 | #Drop first column to fix spacing (stray "#") 198 | df = df.shift(axis=1).drop('#', axis=1) 199 | 200 | #Initiate list with columns of interest 201 | columns = ["CA", 
def main(SCHRODINGER, rep, master_dir, args):
    """Extract per-repetition SimFP and compatibility tables from dat files.

    Locates the dat files (scratch first, then the permanent analysis dir),
    resolves the trajectory paths, and writes <basename>_SimFP.csv and
    <basename>_compatibility.csv via simfp()/compatibility().
    """
    #Generate repetition name <lig>_repetition<#> and its components
    basename = os.path.basename(rep)
    ligbase = basename.split("_repetition")[0]
    repnum = basename.split("_repetition")[-1]

    #Check if dat files are in scratch
    if os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", "scratch", basename, "P_RMSF.dat")):
        dat_files = os.path.join(master_dir, "desmond_md_analysis", "scratch", basename)

    #Check if dat files are in permanent space
    elif os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, basename, "P_RMSF.dat")):
        dat_files = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, basename)

    #Cannot find dat files
    else:
        #BUG FIX: repaired the garbled error message ("directory are re-running")
        logger.critical("Dat files could not be located for %s. Try removing desmond_md_analysis directory and re-running analysis."%basename)
        sys.exit()

    #Generate path to MD trajectories and resolve the cms/trj files
    md_path = os.path.join(master_dir, "desmond_md", ligbase, basename)
    cms_path, trj_path = trj_pathnames(md_path, basename)

    #Initiate compatibility dataframe with ligand and repetition info
    compat_prep = pd.DataFrame({'Molecule':['Repetition'], ligbase:[repnum]})

    #Get number of frames. Calls mdfit_slicetrj.py
    num_frames = mdfit_slicetrj.count_frames(trj_path)

    #Number of decimal places for rounding
    round_int = 4

    #Generate repetition-specific SimFPs and compatibility metrics
    simfp(dat_files, round_int, basename, master_dir, num_frames, args, compat_prep, ligbase, repnum)
    compatibility(dat_files, round_int, basename, master_dir, num_frames, args, compat_prep)

if __name__ == '__main__':
    main(SCHRODINGER, rep, master_dir, args)
import mdfit_slicetrj

###Initiate logger###
logger = logging.getLogger(__name__)

def run_job(command):
    """Run a command (list of tokens), logging its combined stdout/stderr.

    Parameters:
        command: list of command tokens; joined with spaces and run through the shell.
    """
    #NOTE(review): shell=True with a joined string — command tokens are built
    #internally from config/paths here; revisit if any token can become untrusted.
    process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \
        stderr=subprocess.STDOUT, shell=True, text=True)

    #Iterate over combined stdout/stderr
    for line in process.stdout.split('\n'):
        #Skip blank lines and job-control "ExitStatus" chatter; log the rest for debugging
        if line and "ExitStatus" not in line:
            logger.debug(line)

def random_seed():
    """Return a random integer in [0, 1000000), used to seed each MD repetition."""
    #Re-seed the module RNG from OS entropy so concurrent workers diverge
    random.seed()

    #Return random number
    return int(random.random()*1000000)

def dircheck(master_dir):
    """Ensure <master_dir>/desmond_md/scratch exists and return its path."""
    #Generate scratch directory name
    newdir = os.path.join(master_dir, "desmond_md", "scratch")

    #Create the scratch directory if it does not exist yet
    if not os.path.isdir(newdir):
        os.makedirs(newdir)
        logger.info("Created directory: %s"%newdir)
    else:
        logger.info("Directory already exists: %s"%newdir)

    #Return scratch directory name
    return newdir

def countligs(ligpath, SCHRODINGER):
    """Write lignames.csv (one ligand title per line) if needed and return the ligand count.

    Parameters:
        ligpath: path to the ligand library file.
        SCHRODINGER: path to the Schrodinger installation.
    """
    #Only generate the name file once
    if not os.path.isfile("lignames.csv"):
        #$SCHRODINGER/utilities/proplister extracts the 'title' property of each ligand
        run_cmd = os.path.join(SCHRODINGER, 'utilities', 'proplister')

        #Set up full command
        command = [run_cmd, '-p', 'title', '-noheader', ligpath, '-c', '-o', 'lignames.csv']

        #Document current step
        logger.info("Getting lignames: %s"%' '.join(command))

        #Run job
        run_job(command)

    #Get the number of ligands by counting lines in the name file
    with open("lignames.csv", 'r') as fp:
        numligs = len(fp.readlines())

    #Document current step
    logger.info("Number of ligands: %s"%numligs)

    #Return the number of ligands for MD
    return numligs

def gen_list(num_ligs):
    """Return [0, 1, ..., num_ligs-1]: explicit ligand indices for parallel dispatch."""
    return list(range(num_ligs))

def prep_workers(args):
    """Return the thread-pool size: user override, or ThreadPoolExecutor's min(32, cpu+4) default."""
    #Check if user provided a number of workers
    if args.max_workers == 0:
        #os.cpu_count() may return None (per stdlib docs); fall back to 1 to avoid TypeError
        workers = min(32, (os.cpu_count() or 1) + 4)
    else:
        workers = args.max_workers

    #Capture current step
    logger.info("Using %s workers for executing calls asynchronously"%workers)

    #Return number of workers
    return workers

def lig_extract(master_dir, i):
    """Return the i-th ligand name from lignames.csv, stripped of trailing whitespace."""
    #Read in ligand name file
    with open("lignames.csv", 'r') as infile:
        all_lines = infile.readlines()

    #Return the desired ligand name with trailing spaces/newline removed
    return all_lines[i].strip()

def cleanup_dirs(master_dir, ligname_base, args, all_md_names):
    """Create the md_setup and per-repetition directories for one ligand.

    Appends each generated repetition name to all_md_names (shared list; caller
    holds a lock) and returns (setup_dir, md_names) where md_names holds only
    this ligand's repetition names.
    """
    #Generate MD setup directory (desmond_md/<lig>/md_setup)
    setup_dir = os.path.join(master_dir, "desmond_md", ligname_base, "md_setup")

    #Create the setup directory if it does not exist yet
    if not os.path.isdir(setup_dir):
        os.makedirs(setup_dir)
        logger.info("Created directory: %s"%setup_dir)
    else:
        logger.info("Directory already exists: %s"%setup_dir)

    #Temporary repetition name list for this ligand only
    md_names = []

    #One directory per requested repetition (for/range replaces the manual while-counter)
    for j in range(args.md_repetitions):
        #Repetition name is 1-based
        repname = "%s_repetition%s"%(ligname_base, j+1)

        #Record in the master list (shared) and the per-ligand list
        all_md_names.append(repname)
        md_names.append(repname)

        #Create the repetition directory if it does not exist yet
        repdir = os.path.join(master_dir, "desmond_md", ligname_base, repname)
        if not os.path.isdir(repdir):
            os.makedirs(repdir)
            logger.info("Created directory: %s"%repdir)
        else:
            logger.info("Directory already exists: %s"%repdir)

    #Return md_setup directory path and this ligand's repetition names
    return setup_dir, md_names
def move_copy_files(master_dir, ligname_base, setup_dir, md_names, args, template_dir):
    """Move a ligand's setup files out of scratch and write per-repetition cfg/msj inputs.

    Parameters:
        master_dir: MDFit master working directory.
        ligname_base: ligand base name.
        setup_dir: permanent md_setup directory for this ligand.
        md_names: repetition names for this ligand.
        args: parsed arguments (md_sim_time, md_traj_write_freq).
        template_dir: directory holding the Desmond cfg/msj templates.
    """
    scratch = os.path.join(master_dir, "desmond_md", "scratch")

    #Collect this ligand's setup files in scratch (<lig>_* and <lig>.*)
    searches = ("%s_*"%ligname_base, "%s.*"%ligname_base)
    move_files = []
    for each_search in searches:
        move_files.extend(glob.glob(os.path.join(scratch, each_search)))

    #Relocate only setup files; repetition files stay in scratch for production
    for file in move_files:
        if "repetition" not in file:
            shutil.move(file, os.path.join(setup_dir, os.path.basename(file)))

    #Prepare per-repetition inputs
    for rep in md_names:
        #Copy prepared input geometry back to scratch, renamed for this repetition
        shutil.copy(os.path.join(setup_dir, "%s_md_setup_out.cms"%ligname_base),
                    os.path.join(scratch, "%s_md.cms"%rep))

        #Fresh random seed so repetitions sample different trajectories
        rseed = random_seed()

        #Write the repetition-specific cfg file if it does not exist yet
        cfg_path = os.path.join(scratch, "%s_md.cfg"%rep)
        if not os.path.isfile(cfg_path):
            with open(os.path.join(template_dir, "desmond_md_job_template.cfg"), "r") as template:
                lines = template.readlines()
            with open(cfg_path, "w") as ligoutput:
                for line in lines:
                    #Fill in simulation time, random seed and trajectory write frequency
                    ligoutput.write(line.replace("SIMTIME",str(args.md_sim_time)).replace("RSEED",str(rseed)).replace("WRITEFRQ",str(args.md_traj_write_freq)))

        #Write the repetition-specific msj file if it does not exist yet
        msj_path = os.path.join(scratch, "%s_md.msj"%rep)
        if not os.path.isfile(msj_path):
            with open(os.path.join(template_dir, "desmond_md_job_template.msj"), "r") as template:
                msjlines = template.readlines()
            with open(msj_path, "w") as ligoutput:
                for msjline in msjlines:
                    #Point the msj at the repetition-specific cfg file
                    ligoutput.write(msjline.replace("CONFIG_NAME","%s_md.cfg"%rep))

#Shared lock for the setup critical section. The original used `with threading.Lock():`
#inside rep_one_setup, which creates a NEW lock per call and therefore serializes
#nothing; a single module-level lock actually provides mutual exclusion.
_SETUP_LOCK = threading.Lock()

def rep_one_setup(SCHRODINGER, ligpath, i, master_dir, args, bmin_host, multisim_host, all_md_names, template_dir):
    """Prepare one ligand for MD: complex, minimize, solvate, and stage repetition inputs.

    Returns the ligand base name on success; exceptions propagate to the caller's
    future handling.
    """
    #Extract specific ligand from ligand library and get ligand base name
    ligname_base = lig_extract(master_dir, i)

    #Complex protein and ligand. Calls mdfit_prep_complex.py
    pvcomplex = mdfit_prep_complex.main(SCHRODINGER, ligpath, ligname_base, i, master_dir, args)

    #Minimize prepared complex. Calls mdfit_run_minimization.py
    bmincomplex = mdfit_run_minimization.main(ligname_base, pvcomplex, args, bmin_host, SCHRODINGER, master_dir, template_dir)

    #Calculate the total charge of the system. Calls mdfit_get_charge.py
    charge = mdfit_get_charge.main(SCHRODINGER, ligname_base, master_dir, args)

    #Neutralize and solvate the minimized complex (side effects only). Calls mdfit_build_box.py
    mdfit_build_box.main(master_dir, SCHRODINGER, args, charge, ligname_base, multisim_host, bmincomplex, template_dir)

    #Serialize the file moves/writes across setup threads (see _SETUP_LOCK note)
    with _SETUP_LOCK:
        #Move MD setup files to the permanent directory and get repetition names
        setup_dir, md_names = cleanup_dirs(master_dir, ligname_base, args, all_md_names)

        #Prepare repetition-specific input files for MD (cfg/msj files)
        move_copy_files(master_dir, ligname_base, setup_dir, md_names, args, template_dir)

    #Return ligand base name
    return ligname_base

def move_trj_files(master_dir, lig, lig_basename):
    """Move a repetition's trajectory files from scratch to their permanent location.

    Parameters:
        master_dir: MDFit master working directory.
        lig: repetition name (<lig>_repetition<#>).
        lig_basename: ligand base name.
    """
    #All scratch entries belonging to this repetition
    move_files = glob.glob(os.path.join(master_dir, "desmond_md", "scratch", "%s*"%lig))

    for file in move_files:
        #Only repetition-specific files are archived
        if "repetition" in file:
            dest = os.path.join(master_dir, "desmond_md", lig_basename, lig, os.path.basename(file))

            #Clear any stale copy (file or directory) at the destination
            if os.path.isfile(dest):
                os.remove(dest)
            elif os.path.isdir(dest):
                shutil.rmtree(dest)

            #Move (not copy — the original comment was wrong) to the permanent location
            shutil.move(file, dest)

def md_production(SCHRODINGER, master_dir, args, desmond_host, lig):
    """Run one Desmond MD repetition, slice its trajectory, and archive the results.

    Returns (outcms, outtrj): the output topology and trajectory paths.
    """
    #Run Desmond MD. Calls mdfit_run_md.py
    outcms, outtrj, lig_basename = mdfit_run_md.main(lig, args, desmond_host, SCHRODINGER, master_dir)

    #Slice trajectory (remove frames); return value unused downstream. Calls mdfit_slicetrj.py
    mdfit_slicetrj.main(SCHRODINGER, lig, master_dir, args)

    #Move Desmond MD trajectory files to permanent directory
    move_trj_files(master_dir, lig, lig_basename)

    #Return output trajectory filenames
    return outcms, outtrj

def main(args, master_dir, ligfileprefix, SCHRODINGER, ligpath, template_dir, inst_params):
    """Set up and run Desmond MD for every ligand, parallelized with thread pools.

    Phase 1 prepares each ligand (one thread per ligand); phase 2 runs production
    MD (one thread per repetition). Exits with status 1 on any failure.
    """
    ###TODO: check if lignames are unique

    #User requested to skip Desmond MD entirely
    if args.skip_md:
        logger.info("Skipping Desmond MD (--skip_md provided by user)")
        return

    #Generate scratch directory for running jobs and work from there
    scratch_dir = dircheck(master_dir)
    logger.info("Changing directory to %s"%scratch_dir)
    os.chdir(scratch_dir)

    #Hostnames for each job type, from the installation parameters
    bmin_host = inst_params["hostnames"]["BMIN"]
    logger.debug("Minimization hostname is %s"%bmin_host)
    multisim_host = inst_params["hostnames"]["MULTISIM"]
    logger.debug("Multisim hostname is %s"%multisim_host)
    desmond_host = inst_params["hostnames"]["DESMOND"]
    logger.debug("Desmond hostname is %s"%desmond_host)

    #Ligand indices [0, 1, ...] for parallel dispatch
    num_ligs = countligs(ligpath, SCHRODINGER)
    lignum = gen_list(num_ligs)

    #Thread-pool size (user override or ThreadPoolExecutor default)
    workers = prep_workers(args)

    #Master list of repetition names, filled by the setup workers
    all_md_names = []

    #Phase 1: MD setup, one thread per ligand
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        setup_jobs = {executor.submit(rep_one_setup, SCHRODINGER, ligpath, lig, master_dir, args, bmin_host, multisim_host, all_md_names, template_dir): lig for lig in lignum}

        for future in concurrent.futures.as_completed(setup_jobs):
            lig = setup_jobs[future]
            try:
                ligname_base = future.result()
            except Exception as exc:
                #Capture error and exit with a non-zero status
                logger.critical("An exception occurred during MD setup: %s"%(exc))
                sys.exit(1)
            else:
                logger.info("Setup success: %s"%(ligname_base))

    logger.info("MD setup complete. Launching %s production jobs."%len(all_md_names))

    #Phase 2: production MD, one thread per repetition
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        prod_jobs = {executor.submit(md_production, SCHRODINGER, master_dir, args, desmond_host, lig): lig for lig in all_md_names}

        for future in concurrent.futures.as_completed(prod_jobs):
            lig = prod_jobs[future]
            try:
                outcms, outtrj = future.result()
            except Exception as exc:
                #BUG FIX: the original logged 'outcms', which is unbound when
                #future.result() raises; log the failing repetition name instead.
                logger.critical("%s generated an exception during production MD: %s"%(lig, exc))
                sys.exit(1)
            else:
                logger.info("Production success: %s, %s"%(outcms, outtrj))

    #Return to the master directory
    logger.info("Changing directory to %s"%master_dir)
    os.chdir(master_dir)

if __name__ == '__main__':
    main(args, master_dir, ligfileprefix, SCHRODINGER, ligpath, template_dir, inst_params)