├── mdml ├── mdml │ ├── __init__.py │ ├── __pycache__ │ │ ├── base.cpython-39.pyc │ │ ├── cli.cpython-39.pyc │ │ ├── model.cpython-39.pyc │ │ ├── plot.cpython-39.pyc │ │ └── __init__.cpython-39.pyc │ ├── plot.py │ ├── base.py │ └── cli.py ├── examples │ └── PD-L1 │ │ ├── lasso │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── linear │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── STDERR │ │ └── importance.csv │ │ ├── ridge │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── lasso_mean │ │ ├── STDOUT │ │ ├── model.pkl │ │ └── importance.csv │ │ ├── linear_mean │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── STDERR │ │ └── importance.csv │ │ ├── ridge_mean │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── random_forest │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── gradient_boosting │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── random_forest_mean │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ ├── gradient_boosting_mean │ │ ├── STDOUT │ │ ├── model.pkl │ │ ├── importance.csv │ │ └── STDERR │ │ └── README.md ├── setup.py ├── .gitignore ├── bin │ ├── polynomial_features │ ├── mdml_predict │ └── mdml_train └── README.md ├── bin ├── __pycache__ │ ├── mdfit_ligprep.cpython-38.pyc │ ├── mdfit_run_md.cpython-38.pyc │ ├── mdfit_build_box.cpython-38.pyc │ ├── mdfit_desmond_md.cpython-38.pyc │ ├── mdfit_ffbuilder.cpython-38.pyc │ ├── mdfit_get_charge.cpython-38.pyc │ ├── mdfit_initiate.cpython-38.pyc │ ├── mdfit_parseargs.cpython-38.pyc │ ├── mdfit_slicetrj.cpython-38.pyc │ ├── mdfit_cluster_traj.cpython-38.pyc │ ├── mdfit_combine_csvs.cpython-38.pyc │ ├── mdfit_extract_dat.cpython-38.pyc │ ├── mdfit_prep_complex.cpython-38.pyc │ ├── mdfit_read_params.cpython-38.pyc │ ├── mdfit_event_analysis.cpython-38.pyc │ ├── mdfit_glide_docking.cpython-38.pyc │ ├── mdfit_desmond_analysis.cpython-38.pyc │ └── 
mdfit_run_minimization.cpython-38.pyc ├── mdfit_run_md.py ├── mdfit_run_minimization.py ├── mdfit_read_params.py ├── mdfit_build_box.py ├── mdfit_get_charge.py ├── mdfit_slicetrj.py ├── mdfit_combine_csvs.py ├── mdfit_prep_complex.py ├── mdfit_initiate.py ├── mdfit_event_analysis.py ├── mdfit_parseargs.py ├── mdfit_cluster_traj.py ├── mdfit_ffbuilder.py ├── mdfit_desmond_analysis.py ├── mdfit_extract_dat.py └── mdfit_desmond_md.py ├── templates ├── neutral_template.msj ├── negative_template.msj ├── positive_template.msj ├── bmin_template.com ├── desmond_md_job_template.cfg └── desmond_md_job_template.msj ├── Examples └── PDL1 │ └── MDFit_PDL1_Commands.sh ├── LICENSE ├── README.md ├── .gitignore └── MDFit.py /mdml/mdml/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso_mean/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear_mean/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge_mean/STDOUT: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest_mean/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting_mean/STDOUT: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/lasso/model.pkl -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/linear/model.pkl -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/ridge/model.pkl -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso_mean/model.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/lasso_mean/model.pkl -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear_mean/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/linear_mean/model.pkl -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge_mean/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/ridge_mean/model.pkl -------------------------------------------------------------------------------- /mdml/mdml/__pycache__/base.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/mdml/__pycache__/base.cpython-39.pyc -------------------------------------------------------------------------------- /mdml/mdml/__pycache__/cli.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/mdml/__pycache__/cli.cpython-39.pyc -------------------------------------------------------------------------------- /mdml/mdml/__pycache__/model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/mdml/__pycache__/model.cpython-39.pyc -------------------------------------------------------------------------------- /mdml/mdml/__pycache__/plot.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/mdml/__pycache__/plot.cpython-39.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_ligprep.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_ligprep.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_run_md.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_run_md.cpython-38.pyc -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/random_forest/model.pkl -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_build_box.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_build_box.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_desmond_md.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_desmond_md.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_ffbuilder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_ffbuilder.cpython-38.pyc 
-------------------------------------------------------------------------------- /bin/__pycache__/mdfit_get_charge.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_get_charge.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_initiate.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_initiate.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_parseargs.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_parseargs.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_slicetrj.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_slicetrj.cpython-38.pyc -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/gradient_boosting/model.pkl -------------------------------------------------------------------------------- /mdml/mdml/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/mdml/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- 
/bin/__pycache__/mdfit_cluster_traj.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_cluster_traj.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_combine_csvs.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_combine_csvs.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_extract_dat.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_extract_dat.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_prep_complex.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_prep_complex.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_read_params.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_read_params.cpython-38.pyc -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest_mean/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/random_forest_mean/model.pkl -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_event_analysis.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_event_analysis.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_glide_docking.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_glide_docking.cpython-38.pyc -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting_mean/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/mdml/examples/PD-L1/gradient_boosting_mean/model.pkl -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_desmond_analysis.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_desmond_analysis.cpython-38.pyc -------------------------------------------------------------------------------- /bin/__pycache__/mdfit_run_minimization.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brueckna2020/MDFit/HEAD/bin/__pycache__/mdfit_run_minimization.cpython-38.pyc -------------------------------------------------------------------------------- /templates/neutral_template.msj: -------------------------------------------------------------------------------- 1 | task { 2 | task = "desmond:auto" 3 | } 4 | 5 | build_geometry { 6 | box = { 7 | shape = orthorhombic 8 | size = [10.0 10.0 10.0] 9 | size_type = buffer 10 | } 11 | rezero_system = true 12 | solvent = 13 | } 14 | 15 | assign_forcefield { 16 | forcefield = OPLS4 17 | water = 18 | } 19 | 
-------------------------------------------------------------------------------- /templates/negative_template.msj: -------------------------------------------------------------------------------- 1 | task { 2 | task = "desmond:auto" 3 | } 4 | 5 | build_geometry { 6 | add_counterion = { 7 | ion = Na 8 | number = neutralize_system 9 | } 10 | box = { 11 | shape = orthorhombic 12 | size = [10.0 10.0 10.0] 13 | size_type = buffer 14 | } 15 | rezero_system = true 16 | solvent = 17 | } 18 | 19 | assign_forcefield { 20 | forcefield = OPLS4 21 | water = 22 | } 23 | -------------------------------------------------------------------------------- /templates/positive_template.msj: -------------------------------------------------------------------------------- 1 | task { 2 | task = "desmond:auto" 3 | } 4 | 5 | build_geometry { 6 | add_counterion = { 7 | ion = Cl 8 | number = neutralize_system 9 | } 10 | box = { 11 | shape = orthorhombic 12 | size = [10.0 10.0 10.0] 13 | size_type = buffer 14 | } 15 | rezero_system = true 16 | solvent = 17 | } 18 | 19 | assign_forcefield { 20 | forcefield = OPLS4 21 | water = 22 | } 23 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear_mean/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 61 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to mean_linear/model.pkl 6 | DEBUG:__main__:Saving feature importance to mean_linear/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 19it [00:00, 172.20it/s] Molecule CV: 37it [00:00, 170.46it/s] Molecule CV: 55it [00:00, 98.05it/s] Molecule CV: 61it [00:00, 90.47it/s] 9 | DEBUG:__main__:Saving cross-validation results to mean_linear/cross_validation.json and mean_linear/cross_validation.svg 10 | 
-------------------------------------------------------------------------------- /Examples/PDL1/MDFit_PDL1_Commands.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ###Example MDFit command for 4 PD-L1 peptides 4 | ###Flag details: 5 | #Prepared protein file (-p 6PV9_PDL1.mae) 6 | #Prepared ligand file (-l MDFit_PDL1_Example_Ligands.mae) 7 | #Custom OPLS directory (-o "/Examples/PDL1/PDL1_oplsdir") 8 | #100 ns MD (-t 100000) 9 | #Three repetitions (-r 3) 10 | #Remove first 100 frames before analysis (--slice_start = 100) 11 | #Require interaction to exist for 30% of simulation (--analysis_cutoff 0.3) 12 | #Run with debug (-d) 13 | 14 | MDFit -p 6PV9_PDL1.mae -l MDFit_PDL1_Example_Ligands.mae -o "/Examples/PDL1/PDL1_oplsdir" -t 100000 -r 3 --slice_start 100 --analysis_cutoff 0.3 -d 15 | -------------------------------------------------------------------------------- /mdml/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='mdml', 5 | packages=['mdml'], 6 | version='0.1.0', 7 | author='Benjamin J. 
Shields', 8 | author_email='shields.benjamin.j@gmail.com', 9 | keywords=['MD', 'Machine Learning'], 10 | description='Simulation Fingerprint Machine Learning Models.', 11 | install_requires=[ 12 | 'pandas', 13 | 'numpy', 14 | 'scikit-learn', 15 | 'matplotlib', 16 | 'dill', 17 | 'pyarrow', 18 | 'tqdm' 19 | ], 20 | classifiers=[ 21 | 'Development Status 3 - Alpha', 22 | 'Intended Audience ScienceResearch', 23 | 'Topic ScientificEngineering Chemistry', 24 | 'Programming Language Python 3', 25 | ], 26 | scripts=[ 27 | 'bin/mdml_train', 28 | 'bin/mdml_predict', 29 | 'bin/polynomial_features' 30 | ] 31 | ) 32 | -------------------------------------------------------------------------------- /templates/bmin_template.com: -------------------------------------------------------------------------------- 1 | IN_NAME 2 | OUT_NAME 3 | MMOD 0 1 0 0 0.0000 0.0000 0.0000 0.0000 4 | DEBG 55 0 0 0 0.0000 0.0000 0.0000 0.0000 5 | FFLD 16 1 0 0 1.0000 0.0000 0.0000 0.0000 6 | SOLV 3 1 0 0 0.0000 0.0000 0.0000 0.0000 7 | EXNB 0 0 0 0 0.0000 0.0000 0.0000 0.0000 8 | BDCO 0 0 0 0 89.4427 99999.0000 0.0000 0.0000 9 | CRMS 0 0 0 0 0.0000 0.5000 0.0000 0.0000 10 | BGIN 0 0 0 0 0.0000 0.0000 0.0000 0.0000 11 | READ 0 0 0 0 0.0000 0.0000 0.0000 0.0000 12 | CONV 2 0 0 0 0.3000 0.0000 0.0000 0.0000 13 | MINI 1 0 500 0 0.0000 0.0000 0.0000 0.0000 14 | END 0 0 0 0 0.0000 0.0000 0.0000 0.0000 15 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 183 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to linear/model.pkl 6 | DEBUG:__main__:Saving feature importance to linear/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:00, 
5.50it/s] Molecule CV: 2it [00:00, 5.62it/s] Molecule CV: 4it [00:00, 9.64it/s] Molecule CV: 6it [00:00, 12.63it/s] Molecule CV: 9it [00:00, 13.57it/s] Molecule CV: 11it [00:01, 10.67it/s] Molecule CV: 13it [00:01, 12.44it/s] Molecule CV: 19it [00:01, 21.93it/s] Molecule CV: 22it [00:01, 22.04it/s] Molecule CV: 28it [00:01, 30.90it/s] Molecule CV: 36it [00:01, 37.93it/s] Molecule CV: 43it [00:01, 45.15it/s] Molecule CV: 48it [00:01, 39.26it/s] Molecule CV: 53it [00:02, 36.20it/s] Molecule CV: 57it [00:02, 32.07it/s] Molecule CV: 61it [00:02, 25.82it/s] 9 | DEBUG:__main__:Saving cross-validation results to linear/cross_validation.json and linear/cross_validation.svg 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Alex Brueckner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:76:WaterBridge,0.6355123047559382 3 | A:115:Hydrophobic,0.5916237096568172 4 | A:123:Hydrophobic,0.4833926972942506 5 | A:117:WaterBridge,0.22377663129471045 6 | A:168:HBond,0.0 7 | A:168:WaterBridge,0.0 8 | A:172:HBond,-0.0 9 | A:147:Hydrophobic,0.0 10 | A:223:HBond,0.0 11 | A:71:HBond,-0.0 12 | A:68:WaterBridge,-0.0 13 | A:72:WaterBridge,-0.0 14 | A:171:HBond,0.0 15 | A:170:HBond,0.0 16 | A:78:HBond,0.0 17 | A:170:WaterBridge,0.0 18 | A:171:WaterBridge,0.0 19 | A:75:WaterBridge,-0.0 20 | A:76:HBond,-0.0 21 | A:212:WaterBridge,0.0 22 | A:217:WaterBridge,0.0 23 | A:172:WaterBridge,0.0 24 | A:63:HBond,0.0 25 | A:66:HBond,-0.0 26 | A:223:WaterBridge,0.0 27 | A:113:Pi-Cation,-0.0 28 | A:56:Pi-Pi,0.0 29 | A:58:WaterBridge,0.0 30 | A:73:WaterBridge,0.0 31 | A:208:WaterBridge,0.0 32 | A:61:WaterBridge,-0.0 33 | A:71:WaterBridge,0.0 34 | A:69:WaterBridge,-0.0 35 | A:54:Hydrophobic,-0.052067423994442155 36 | A:76:Hydrophobic,-0.18879417402790047 37 | A:73:HBond,-0.325042493705559 38 | A:66:WaterBridge,-0.38101046942183686 39 | A:56:HBond,-0.41278607679517754 40 | A:68:Hydrophobic,-0.4245199811667292 41 | A:123:Pi-Pi,-0.7121283349718759 42 | A:56:Hydrophobic,-0.7219538291217276 43 | A:63:WaterBridge,-0.9572379222516725 44 | Strain_perHeavyAtom,-2.831340128684278 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso_mean/importance.csv: -------------------------------------------------------------------------------- 
1 | ,importance 2 | A:115:Hydrophobic,0.6458663480750456 3 | A:76:WaterBridge,0.590712865608462 4 | A:168:WaterBridge,0.49198032032786115 5 | A:123:Hydrophobic,0.24037176236872668 6 | A:117:WaterBridge,0.17455220407263283 7 | A:170:HBond,0.0 8 | A:147:Hydrophobic,0.0 9 | A:223:HBond,-0.0 10 | A:71:HBond,-0.0 11 | A:68:WaterBridge,-0.0 12 | A:72:WaterBridge,-0.0 13 | A:168:HBond,-0.0 14 | A:78:HBond,0.0 15 | A:171:HBond,0.0 16 | A:170:WaterBridge,0.0 17 | A:171:WaterBridge,0.0 18 | A:75:WaterBridge,-0.0 19 | A:76:HBond,0.0 20 | A:56:HBond,-0.0 21 | A:212:WaterBridge,0.0 22 | A:217:WaterBridge,0.0 23 | A:172:WaterBridge,0.0 24 | A:63:HBond,0.0 25 | A:66:HBond,-0.0 26 | A:73:WaterBridge,0.0 27 | A:69:WaterBridge,-0.0 28 | A:54:Hydrophobic,-0.0 29 | A:71:WaterBridge,0.0 30 | A:56:Pi-Pi,0.0 31 | A:223:WaterBridge,-0.0 32 | A:208:WaterBridge,0.0 33 | A:58:WaterBridge,0.0 34 | A:61:WaterBridge,0.0 35 | A:113:Pi-Cation,-0.15423218596949798 36 | A:172:HBond,-0.19804062054386898 37 | A:76:Hydrophobic,-0.27325988226858033 38 | A:73:HBond,-0.3051078974656791 39 | A:68:Hydrophobic,-0.6111311180996669 40 | A:56:Hydrophobic,-0.7426770159246496 41 | A:66:WaterBridge,-0.8482320040521754 42 | A:123:Pi-Pi,-0.8757172308977965 43 | A:63:WaterBridge,-0.9636928160030664 44 | Strain_perHeavyAtom,-2.781332741051147 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting_mean/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:123:Hydrophobic,0.2821060193868918 3 | Strain_perHeavyAtom,0.26318741427451153 4 | A:66:HBond,0.16461120969484933 5 | A:56:Hydrophobic,0.10060308280735433 6 | A:68:Hydrophobic,0.050545628596954655 7 | A:73:HBond,0.030331473020238076 8 | A:66:WaterBridge,0.023423991379755458 9 | A:63:HBond,0.02145866961207075 10 | A:115:Hydrophobic,0.01646546599911657 11 | A:73:WaterBridge,0.012748523805944661 12 | A:76:WaterBridge,0.010465799420052452 
13 | A:113:Pi-Cation,0.007126375620531979 14 | A:58:WaterBridge,0.006149712760844277 15 | A:63:WaterBridge,0.0046874728242335285 16 | A:54:Hydrophobic,0.0033729440743602364 17 | A:56:Pi-Pi,0.0023913282266896 18 | A:71:WaterBridge,0.00032488849560064777 19 | A:208:WaterBridge,0.0 20 | A:171:WaterBridge,0.0 21 | A:170:HBond,0.0 22 | A:171:HBond,0.0 23 | A:78:HBond,0.0 24 | A:170:WaterBridge,0.0 25 | A:56:HBond,0.0 26 | A:75:WaterBridge,0.0 27 | A:76:HBond,0.0 28 | A:72:WaterBridge,0.0 29 | A:212:WaterBridge,0.0 30 | A:217:WaterBridge,0.0 31 | A:168:HBond,0.0 32 | A:71:HBond,0.0 33 | A:68:WaterBridge,0.0 34 | A:223:WaterBridge,0.0 35 | A:223:HBond,0.0 36 | A:147:Hydrophobic,0.0 37 | A:76:Hydrophobic,0.0 38 | A:168:WaterBridge,0.0 39 | A:172:WaterBridge,0.0 40 | A:117:WaterBridge,0.0 41 | A:61:WaterBridge,0.0 42 | A:123:Pi-Pi,0.0 43 | A:69:WaterBridge,0.0 44 | A:172:HBond,0.0 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | Strain_perHeavyAtom,0.3431680590709321 3 | A:68:Hydrophobic,0.08498890033830075 4 | A:123:Hydrophobic,0.08306809724470618 5 | A:58:WaterBridge,0.06850514383039635 6 | A:56:Hydrophobic,0.05882407959669033 7 | A:113:Pi-Cation,0.0578694494148487 8 | A:73:WaterBridge,0.05579569415986692 9 | A:56:Pi-Pi,0.0475337961549079 10 | A:66:HBond,0.03530337434479803 11 | A:73:HBond,0.03458732388013048 12 | A:63:HBond,0.028567385483523607 13 | A:123:Pi-Pi,0.021029313853795566 14 | A:71:WaterBridge,0.016730898997036513 15 | A:61:WaterBridge,0.01577019639100805 16 | A:115:Hydrophobic,0.01423470779211056 17 | A:63:WaterBridge,0.009774057171253639 18 | A:66:WaterBridge,0.007342390451557968 19 | A:76:Hydrophobic,0.005420417705953217 20 | A:54:Hydrophobic,0.004645922697275947 21 | A:76:WaterBridge,0.003878368172745101 22 | A:117:WaterBridge,0.002897545736279258 23 | 
A:223:WaterBridge,6.437443250466106e-05 24 | A:172:WaterBridge,5.030793780147951e-07 25 | A:69:WaterBridge,0.0 26 | A:171:HBond,0.0 27 | A:217:WaterBridge,0.0 28 | A:212:WaterBridge,0.0 29 | A:56:HBond,0.0 30 | A:76:HBond,0.0 31 | A:75:WaterBridge,0.0 32 | A:171:WaterBridge,0.0 33 | A:170:WaterBridge,0.0 34 | A:78:HBond,0.0 35 | A:168:HBond,0.0 36 | A:170:HBond,0.0 37 | A:72:WaterBridge,0.0 38 | A:68:WaterBridge,0.0 39 | A:71:HBond,0.0 40 | A:208:WaterBridge,0.0 41 | A:147:Hydrophobic,0.0 42 | A:172:HBond,0.0 43 | A:168:WaterBridge,0.0 44 | A:223:HBond,0.0 45 | -------------------------------------------------------------------------------- /templates/desmond_md_job_template.cfg: -------------------------------------------------------------------------------- 1 | annealing = false 2 | backend = { 3 | } 4 | bigger_rclone = false 5 | checkpt = { 6 | first = 0.0 7 | interval = 240.06 8 | name = "$JOBNAME.cpt" 9 | write_last_step = true 10 | } 11 | cpu = 1 12 | cutoff_radius = 9.0 13 | elapsed_time = 0.0 14 | energy_group = false 15 | eneseq = { 16 | first = 0.0 17 | interval = 5.0 18 | name = "$JOBNAME$[_replica$REPLICA$].ene" 19 | } 20 | ensemble = { 21 | barostat = { 22 | tau = 2.0 23 | } 24 | class = NPT 25 | method = MTK 26 | thermostat = { 27 | tau = 1.0 28 | } 29 | } 30 | glue = solute 31 | maeff_output = { 32 | first = 0.0 33 | interval = 120.0 34 | name = "$JOBNAME$[_replica$REPLICA$]-out.cms" 35 | periodicfix = true 36 | trjdir = "$JOBNAME$[_replica$REPLICA$]_trj" 37 | } 38 | meta = false 39 | meta_file = ? 
40 | pressure = [1.01325 isotropic ] 41 | randomize_velocity = { 42 | first = 0.0 43 | interval = inf 44 | seed = RSEED 45 | temperature = "@*.temperature" 46 | } 47 | restrain = none 48 | simbox = { 49 | first = 0.0 50 | interval = 1.2 51 | name = "$JOBNAME$[_replica$REPLICA$]_simbox.dat" 52 | } 53 | surface_tension = 0.0 54 | taper = false 55 | temperature = [ 56 | [300.0 0 ] 57 | ] 58 | time = SIMTIME 59 | timestep = [0.002 0.002 0.006 ] 60 | trajectory = { 61 | center = [] 62 | first = 0.0 63 | format = dtr 64 | frames_per_file = 250 65 | interval = WRITEFRQ 66 | name = "$JOBNAME$[_replica$REPLICA$]_trj" 67 | periodicfix = true 68 | write_velocity = false 69 | } 70 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/README.md: -------------------------------------------------------------------------------- 1 | # PD-L1 Example 2 | 3 | The following commands will reproduce the PD-L1 modeling and feature importance from the MDFit paper. 4 | 5 | Evaluate models with duplicate SimFPs included. 6 | ``` 7 | mdml_train data.csv linear -nproc 10 -id_col Molecule -target_col pIC50 -model_type linear 8 | mdml_train data.csv ridge -nproc 10 -id_col Molecule -target_col pIC50 -model_type ridge 9 | mdml_train data.csv lasso -nproc 10 -id_col Molecule -target_col pIC50 -model_type lasso 10 | mdml_train data.csv random_forest -nproc 10 -id_col Molecule -target_col pIC50 -model_type random_forest 11 | mdml_train data.csv gradient_boosting -nproc 10 -id_col Molecule -target_col pIC50 -model_type gradient_boosting 12 | ``` 13 | 14 | Evaluate models with averaged SimFPs. 
15 | ``` 16 | mdml_train data.csv linear_mean -nproc 10 -id_col Molecule -target_col pIC50 -model_type linear -group mean 17 | mdml_train data.csv ridge_mean -nproc 10 -id_col Molecule -target_col pIC50 -model_type ridge -group mean 18 | mdml_train data.csv lasso_mean -nproc 10 -id_col Molecule -target_col pIC50 -model_type lasso -group mean 19 | mdml_train data.csv random_forest_mean -nproc 10 -id_col Molecule -target_col pIC50 -model_type random_forest -group mean 20 | mdml_train data.csv gradient_boosting_mean -nproc 10 -id_col Molecule -target_col pIC50 -model_type gradient_boosting -group mean 21 | ``` 22 | 23 | **Note:** Metrics (e.g., $Q^2$) in `cross_validation.json` are computed using all entries (duplicates included) 24 | while metrics in `cross_validation.svg` are computed using the average of the predictions with SimFPs from 25 | different simulations for each molecule. 26 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear_mean/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:115:Hydrophobic,3.4179049015847265 3 | A:56:HBond,3.0099879314819784 4 | A:208:WaterBridge,2.2566332935566313 5 | A:168:WaterBridge,1.738912227381602 6 | A:66:HBond,1.4748026123484463 7 | A:212:WaterBridge,1.3971231426306736 8 | A:123:Hydrophobic,0.9201853566910351 9 | A:117:WaterBridge,0.892846030163572 10 | A:76:WaterBridge,0.7716865020658565 11 | A:71:WaterBridge,0.7192231009122938 12 | A:63:HBond,0.6651088808964104 13 | A:168:HBond,0.6621585412820343 14 | A:172:WaterBridge,0.38829211645897277 15 | A:61:WaterBridge,0.27053382254314395 16 | A:73:WaterBridge,0.16993983811128135 17 | A:147:Hydrophobic,-0.23603163827478174 18 | A:72:WaterBridge,-0.295817760619532 19 | A:68:WaterBridge,-0.2958177606195367 20 | A:75:WaterBridge,-0.39128820753876475 21 | A:58:WaterBridge,-0.39655979386862417 22 | A:223:WaterBridge,-0.41118191846010677 23 | 
A:56:Hydrophobic,-0.44938163700422307 24 | A:170:HBond,-0.5870451891729009 25 | A:171:WaterBridge,-0.587045189172902 26 | A:170:WaterBridge,-0.5870451891729034 27 | A:171:HBond,-0.5870451891729043 28 | A:78:HBond,-0.5870451891729056 29 | A:56:Pi-Pi,-0.6825689511930271 30 | A:66:WaterBridge,-0.8319188984488853 31 | A:223:HBond,-0.8531626842457735 32 | A:54:Hydrophobic,-0.8567928166605197 33 | A:217:WaterBridge,-0.8639217454395056 34 | A:76:Hydrophobic,-0.9280147159776061 35 | A:172:HBond,-0.9977551305922331 36 | A:73:HBond,-1.0644968001332469 37 | A:69:WaterBridge,-1.1159989479455308 38 | A:123:Pi-Pi,-1.195543598779107 39 | A:76:HBond,-1.4885807597267475 40 | A:63:WaterBridge,-1.5556880962561483 41 | A:71:HBond,-2.079082284457193 42 | A:68:Hydrophobic,-2.861527931795211 43 | Strain_perHeavyAtom,-4.030727120928537 44 | A:113:Pi-Cation,-4.603684193032383 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/linear/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:115:Hydrophobic,1.2120060469302938 3 | A:212:WaterBridge,1.168176577723823 4 | A:168:WaterBridge,1.0865861471438563 5 | A:72:WaterBridge,0.9529641147783804 6 | A:147:Hydrophobic,0.8651040669830239 7 | A:73:WaterBridge,0.7580069156315616 8 | A:63:HBond,0.6635612489400315 9 | A:123:Hydrophobic,0.6560141197442709 10 | A:117:WaterBridge,0.5119784911486394 11 | A:208:WaterBridge,0.48906566692723774 12 | A:76:WaterBridge,0.4577167199866073 13 | A:168:HBond,0.42643379090330547 14 | A:172:WaterBridge,0.38628544472597476 15 | A:71:WaterBridge,0.22892684138671585 16 | A:223:WaterBridge,0.20984114039285134 17 | A:71:HBond,0.19348395983266614 18 | A:58:WaterBridge,0.1603143364407486 19 | A:171:WaterBridge,0.003791040265698497 20 | A:78:HBond,0.003791040265698275 21 | A:170:HBond,0.0037910402656928348 22 | A:170:WaterBridge,0.0037910402656875326 23 | A:171:HBond,0.003791040265687477 24 | 
A:61:WaterBridge,-0.07425602996985581 25 | A:66:HBond,-0.07585524033968483 26 | A:217:WaterBridge,-0.08138211203458502 27 | A:113:Pi-Cation,-0.1854874569122909 28 | A:223:HBond,-0.21080863429609223 29 | A:54:Hydrophobic,-0.2800521847061306 30 | A:56:Pi-Pi,-0.37513306595199836 31 | A:76:Hydrophobic,-0.41461902537342826 32 | A:66:WaterBridge,-0.4241472684941725 33 | A:75:WaterBridge,-0.4403105143780974 34 | A:56:HBond,-0.4471027457766443 35 | A:172:HBond,-0.4952251317488825 36 | A:69:WaterBridge,-0.5333669660366571 37 | A:76:HBond,-0.5713781299200659 38 | A:68:Hydrophobic,-0.7192994606837416 39 | A:73:HBond,-0.757435393881754 40 | A:63:WaterBridge,-0.8664388895174614 41 | A:123:Pi-Pi,-0.8880929506186971 42 | A:56:Hydrophobic,-1.0181739818333204 43 | A:68:WaterBridge,-1.739478706104431 44 | Strain_perHeavyAtom,-3.3243059445607055 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:115:Hydrophobic,0.7771269646842853 3 | A:123:Hydrophobic,0.6210610895519609 4 | A:76:WaterBridge,0.6089400075223189 5 | A:168:WaterBridge,0.5574300132399377 6 | A:117:WaterBridge,0.48059961567284404 7 | A:212:WaterBridge,0.40212521718288974 8 | A:73:WaterBridge,0.3478662428863717 9 | A:147:Hydrophobic,0.28854630058530073 10 | A:63:HBond,0.2272329456223205 11 | A:58:WaterBridge,0.18269862738738732 12 | A:172:WaterBridge,0.17630182718295317 13 | A:71:WaterBridge,0.15727243436725064 14 | A:208:WaterBridge,0.14223661094999115 15 | A:168:HBond,0.1290173710046051 16 | A:217:WaterBridge,0.12215918755110237 17 | A:223:WaterBridge,0.11630187952201819 18 | A:170:HBond,0.07471694786013153 19 | A:171:WaterBridge,0.07471694786013142 20 | A:170:WaterBridge,0.07471694786013136 21 | A:78:HBond,0.07471694786013133 22 | A:171:HBond,0.07471694786013132 23 | A:56:Pi-Pi,0.06214510366123463 24 | A:223:HBond,0.04517123540098317 25 | 
A:72:WaterBridge,-0.037306096238321006 26 | A:76:HBond,-0.07464996514888496 27 | A:71:HBond,-0.127599967988499 28 | A:75:WaterBridge,-0.12795018988433315 29 | A:61:WaterBridge,-0.13488085583473367 30 | A:113:Pi-Cation,-0.1724924494297132 31 | A:54:Hydrophobic,-0.18216700507201314 32 | A:172:HBond,-0.22790639579740693 33 | A:69:WaterBridge,-0.24592228065556734 34 | A:66:HBond,-0.25496888863217887 35 | A:76:Hydrophobic,-0.2782366980225123 36 | A:68:WaterBridge,-0.2909295771887263 37 | A:66:WaterBridge,-0.47955004808987833 38 | A:56:HBond,-0.48296469382299007 39 | A:73:HBond,-0.5643187359447307 40 | A:56:Hydrophobic,-0.6457807943752559 41 | A:68:Hydrophobic,-0.6552799976029455 42 | A:123:Pi-Pi,-0.7164696676908707 43 | A:63:WaterBridge,-0.8507228121803795 44 | Strain_perHeavyAtom,-2.3851627198597454 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge_mean/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | A:115:Hydrophobic,0.7299994676201763 3 | A:168:WaterBridge,0.6589317315664275 4 | A:76:WaterBridge,0.5581218314756109 5 | A:123:Hydrophobic,0.4704127905526515 6 | A:117:WaterBridge,0.4629728122352017 7 | A:212:WaterBridge,0.27553742311804563 8 | A:58:WaterBridge,0.2698383136154038 9 | A:56:Pi-Pi,0.23132274040074408 10 | A:63:HBond,0.19101828576556776 11 | A:147:Hydrophobic,0.18365680869013698 12 | A:217:WaterBridge,0.1780485363227274 13 | A:71:WaterBridge,0.1691649988414725 14 | A:73:WaterBridge,0.15895865792798008 15 | A:170:HBond,0.06741611732077547 16 | A:171:WaterBridge,0.06741611732077545 17 | A:170:WaterBridge,0.06741611732077545 18 | A:171:HBond,0.06741611732077533 19 | A:78:HBond,0.06741611732077529 20 | A:76:HBond,0.06338237997265034 21 | A:223:HBond,0.053391809815708555 22 | A:172:WaterBridge,0.03854497117463989 23 | A:61:WaterBridge,0.01668464997454458 24 | A:168:HBond,0.008529850847597782 25 | A:208:WaterBridge,-0.04557491968229436 
26 | A:69:WaterBridge,-0.06956416572147074 27 | A:71:HBond,-0.1303990176825092 28 | A:72:WaterBridge,-0.14170557500168474 29 | A:68:WaterBridge,-0.14170557500168485 30 | A:54:Hydrophobic,-0.1823986543381202 31 | A:75:WaterBridge,-0.19792939804721865 32 | A:223:WaterBridge,-0.2251929776623398 33 | A:172:HBond,-0.27535920536088293 34 | A:66:HBond,-0.33686245854134406 35 | A:56:HBond,-0.35056770066808557 36 | A:76:Hydrophobic,-0.3891756666186929 37 | A:63:WaterBridge,-0.43982142871192126 38 | A:113:Pi-Cation,-0.45948548374961407 39 | A:56:Hydrophobic,-0.5472845686931 40 | A:73:HBond,-0.5987879917005361 41 | A:123:Pi-Pi,-0.7662767999190696 42 | A:68:Hydrophobic,-0.9491591909424791 43 | A:66:WaterBridge,-0.9778540167790951 44 | Strain_perHeavyAtom,-1.8010590595672384 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | Strain_perHeavyAtom,0.30860971827157935 3 | A:68:Hydrophobic,0.08001172921982987 4 | A:123:Hydrophobic,0.07715452869124684 5 | A:113:Pi-Cation,0.07077788988811898 6 | A:56:Hydrophobic,0.06629369560444234 7 | A:58:WaterBridge,0.04896082890640077 8 | A:66:HBond,0.048801772310769966 9 | A:115:Hydrophobic,0.04644891128863812 10 | A:56:Pi-Pi,0.03424488250882641 11 | A:73:HBond,0.03136362199068667 12 | A:73:WaterBridge,0.027876884140904053 13 | A:63:HBond,0.026941986205677525 14 | A:123:Pi-Pi,0.02686597995865185 15 | A:66:WaterBridge,0.026625352074648258 16 | A:71:WaterBridge,0.02566507104440312 17 | A:61:WaterBridge,0.01858699133485243 18 | A:76:Hydrophobic,0.012686436202685678 19 | A:63:WaterBridge,0.006312631543299066 20 | A:54:Hydrophobic,0.005471708581803852 21 | A:76:WaterBridge,0.0036530024544970284 22 | A:56:HBond,0.0032085902816660654 23 | A:117:WaterBridge,0.0010818161281157467 24 | A:223:WaterBridge,0.0006955289819360857 25 | A:172:WaterBridge,0.0002604124350259707 26 
| A:208:WaterBridge,0.00023179598795874113 27 | A:71:HBond,0.0002157967496199828 28 | A:76:HBond,0.00019343692532114285 29 | A:168:HBond,0.0001784188731492803 30 | A:68:WaterBridge,0.00015537415539062873 31 | A:172:HBond,0.0001261767175569754 32 | A:168:WaterBridge,0.00012313980832576857 33 | A:212:WaterBridge,4.9421997946330464e-05 34 | A:217:WaterBridge,4.537083895770855e-05 35 | A:75:WaterBridge,3.787606086354798e-05 36 | A:147:Hydrophobic,2.7658529979795518e-05 37 | A:170:WaterBridge,8.978550407620781e-06 38 | A:170:HBond,4.3284678571984724e-06 39 | A:72:WaterBridge,1.4129122784170004e-06 40 | A:171:HBond,8.137160234610557e-07 41 | A:223:HBond,2.490689120873583e-08 42 | A:69:WaterBridge,4.75276622705658e-09 43 | A:171:WaterBridge,0.0 44 | A:78:HBond,0.0 45 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest_mean/importance.csv: -------------------------------------------------------------------------------- 1 | ,importance 2 | Strain_perHeavyAtom,0.2125836786086668 3 | A:123:Hydrophobic,0.20602398576046763 4 | A:66:HBond,0.10624292160645096 5 | A:56:Hydrophobic,0.09805387490680063 6 | A:68:Hydrophobic,0.06950713857033447 7 | A:66:WaterBridge,0.04686196685383211 8 | A:113:Pi-Cation,0.04533804482304102 9 | A:115:Hydrophobic,0.03594467618337902 10 | A:71:WaterBridge,0.027027930926490154 11 | A:63:HBond,0.02477567376923509 12 | A:123:Pi-Pi,0.02353756293551733 13 | A:56:Pi-Pi,0.017746141510603216 14 | A:58:WaterBridge,0.01765104282246242 15 | A:73:HBond,0.016441771355816245 16 | A:61:WaterBridge,0.014115551294360444 17 | A:73:WaterBridge,0.013948703421524447 18 | A:76:Hydrophobic,0.009416816299792036 19 | A:54:Hydrophobic,0.00623412286252767 20 | A:223:WaterBridge,0.001286130593757802 21 | A:217:WaterBridge,0.0010723045600818048 22 | A:63:WaterBridge,0.0009150821291903253 23 | A:172:WaterBridge,0.0008452712591777811 24 | A:117:WaterBridge,0.0007361624896637547 25 | A:56:HBond,0.0006333045115570916 26 
| A:69:WaterBridge,0.0005595236851902069 27 | A:223:HBond,0.00045089630275727856 28 | A:172:HBond,0.00039886955842963456 29 | A:168:WaterBridge,0.00036978891593451407 30 | A:168:HBond,0.0003242875196668843 31 | A:68:WaterBridge,0.0003199809489816599 32 | A:76:WaterBridge,0.0002786035594121451 33 | A:71:HBond,0.0001881254027270712 34 | A:208:WaterBridge,9.087622577976781e-05 35 | A:76:HBond,2.9612756342381666e-05 36 | A:212:WaterBridge,1.960617059264951e-05 37 | A:147:Hydrophobic,1.5013039259273214e-05 38 | A:72:WaterBridge,3.768469760156854e-06 39 | A:171:WaterBridge,3.501386338798146e-06 40 | A:171:HBond,2.330072877036284e-06 41 | A:170:WaterBridge,2.0384057412636837e-06 42 | A:170:HBond,1.8026448436845405e-06 43 | A:78:HBond,1.2133103595032605e-06 44 | A:75:WaterBridge,3.0157027591189784e-07 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MDFit 2 | Python wrapper for high-throughput molecular dynamics. A workflow overview and application of MDFit to a data set of macrocyclic peptides targetting PD-L1 are discussed in _ChemRxiv_.[^1] 3 | [^1]: [MDFit: Automated molecular simulations workflow enables high throughput assessment of ligands-protein dynamics](https://doi.org/10.26434/chemrxiv-2024-gfcqx) 4 | 5 | MDFit currently uses Schrodinger tools. Implementation of alternatives, including open-source tools, are ongoing. 6 | # Prerequisites 7 | MDFit assumes the `$SCHRODINGER` environmental variable has been set. This should point to the current Schrodinger installation. To check if `$SCHRODINGER` has been set correctly, try running: `$SCHRODINGER/run -h` 8 | 9 | 10 | MDFit attempts to get the current Schrodinger release by reading the `$SCHRODINGER` pathname. For example, if the current release is installed in `/schrodinger/2023-2/`, MDFit will set the release to 2023-2. 
This value can also be hard-coded in MDFit (line 38) if a different directory naming scheme is used. 11 | 12 | The first time MDFit.py is called, a `parameters_TEMPLATE.json` file is generated in the installation directory. Replace `localhost` with your institution's Schrodinger hostnames and rename the file to `parameters.json`. This is required only once and MDFit will always read `parameters.json` to get host information on subsequent runs. General runtime limit guidance: 13 | ``` 14 | FFBUILDER 10 hours 15 | BMIN 2 hours 16 | MULTISIM 2 hours 17 | DESMOND 24 hours 18 | ANALYSIS 8 hours 19 | ``` 20 | # Usage 21 | ``` 22 | $SCHRODINGER/run python3 MDFit.py -h 23 | ``` 24 | Self-contained example available in `MDFit/Examples/PDL1/`. The following command will run FFBuilder, three repetitions of 100 ns Desmond MD, and MD analysis for Pep-01, Pep-41, Pep-52, and Pep-66. The first 100 frames will be removed from the trajectory before analysis (`--slice_start`) and the cutoff for retaining a protein-ligand interaction is 0.3 (`--analysis_cutoff`). 25 | ``` 26 | $SCHRODINGER/run python3 MDFit.py -p 6PV9_PDL1.mae -l MDFit_PDL1_Example_Ligands.mae -o "MDFit/Examples/PDL1/PDL1_oplsdir" -t 100000 -r 3 --slice_start 100 --analysis_cutoff 0.3 -d 27 | ``` 28 | It is strongly encouraged to use the debug flag `-d` for initial MDFit usage. Errors may occur if packages are not where MDFit expects them to be. 29 | 30 | 31 | # Bugs and Known Errors 32 | + Schrodinger release relying on installation pathname. 
33 | -------------------------------------------------------------------------------- /mdml/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Mac 7 | .DS_Store 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /mdml/bin/polynomial_features: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Polynomial Features 4 | ------------------- 5 | A helper script to include include squared and interaction terms (polynomial 6 | features) to SimFPs. 7 | 8 | @author: Benjamin Shields 9 | @email: benjamin.shields@bms.com 10 | """ 11 | 12 | ############################################################################## Imports 13 | 14 | import pandas as pd 15 | from mdml import cli 16 | from sklearn.preprocessing import PolynomialFeatures 17 | 18 | ############################################################################## Interface 19 | 20 | def get_parser(): 21 | description = """Add polynomial features to a SimFP output CSV.""" 22 | parser, groups = cli.parser(description) 23 | opts = parser.add_argument_group("OPTIONS") 24 | opts.add_argument( 25 | '-target_col', 26 | type=str, 27 | default=None, 28 | help="Name of target column in training data CSV.", 29 | ) 30 | opts.add_argument( 31 | '-drop_col', 32 | nargs='+', 33 | type=str, 34 | default=[], 35 | help="Columns that should be removed from the training CSV." 
36 | ) 37 | required = parser.add_argument_group("REQUIRED") 38 | required.add_argument( 39 | 'input', 40 | help="Path to CSV containing features, IDs, and target (optional).", 41 | ) 42 | required.add_argument( 43 | 'output', 44 | help="Path to save output CSV.", 45 | ) 46 | required.add_argument( 47 | '-id_col', 48 | type=str, 49 | required=True, 50 | help="Name of compound ID column.", 51 | ) 52 | 53 | return parser 54 | 55 | ############################################################################## Main 56 | 57 | def main(args): 58 | data = cli.load_data(args.input, args.id_col, drop=args.drop_col) 59 | drop = ['ID'] 60 | if args.target_col is not None: 61 | drop.append(args.target_col) 62 | X = data.copy().drop(drop, axis=1) 63 | poly = PolynomialFeatures( 64 | degree=2, 65 | include_bias=False, 66 | interaction_only=False 67 | ) 68 | X = pd.DataFrame( 69 | poly.fit_transform(X), 70 | columns=poly.get_feature_names_out(), 71 | index=data.index 72 | ) 73 | if args.target_col is not None: 74 | X[args.target_col] = data[args.target_col] 75 | X.insert(0, args.id_col, data['ID'].values) 76 | X.to_csv(args.output, index=False) 77 | 78 | 79 | if __name__ == "__main__": 80 | parser = get_parser() 81 | args = parser.parse_args() 82 | main(args) 83 | 84 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/lasso/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 183 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to lasso/model.pkl 6 | DEBUG:__main__:Saving feature importance to lasso/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:07, 7.34s/it] Molecule CV: 2it [00:14, 7.24s/it] Molecule CV: 3it [00:21, 7.19s/it] Molecule CV: 4it [00:28, 7.19s/it] Molecule 
CV: 5it [00:35, 7.17s/it] Molecule CV: 6it [00:43, 7.18s/it] Molecule CV: 7it [00:50, 7.17s/it] Molecule CV: 8it [00:57, 7.16s/it] Molecule CV: 9it [01:04, 7.17s/it] Molecule CV: 10it [01:11, 7.15s/it] Molecule CV: 11it [01:18, 7.14s/it] Molecule CV: 12it [01:26, 7.23s/it] Molecule CV: 13it [01:33, 7.22s/it] Molecule CV: 14it [01:40, 7.24s/it] Molecule CV: 15it [01:48, 7.24s/it] Molecule CV: 16it [01:55, 7.19s/it] Molecule CV: 17it [02:02, 7.19s/it] Molecule CV: 18it [02:09, 7.17s/it] Molecule CV: 19it [02:16, 7.15s/it] Molecule CV: 20it [02:24, 7.26s/it] Molecule CV: 21it [02:31, 7.34s/it] Molecule CV: 22it [02:38, 7.25s/it] Molecule CV: 23it [02:45, 7.25s/it] Molecule CV: 24it [02:53, 7.25s/it] Molecule CV: 25it [03:00, 7.22s/it] Molecule CV: 26it [03:07, 7.20s/it] Molecule CV: 27it [03:14, 7.17s/it] Molecule CV: 28it [03:21, 7.19s/it] Molecule CV: 29it [03:28, 7.18s/it] Molecule CV: 30it [03:36, 7.21s/it] Molecule CV: 31it [03:43, 7.23s/it] Molecule CV: 32it [03:50, 7.29s/it] Molecule CV: 33it [03:58, 7.24s/it] Molecule CV: 34it [04:05, 7.20s/it] Molecule CV: 35it [04:12, 7.17s/it] Molecule CV: 36it [04:19, 7.11s/it] Molecule CV: 37it [04:26, 7.07s/it] Molecule CV: 38it [04:33, 7.11s/it] Molecule CV: 39it [04:40, 7.10s/it] Molecule CV: 40it [04:47, 7.08s/it] Molecule CV: 41it [04:54, 7.13s/it] Molecule CV: 42it [05:02, 7.17s/it] Molecule CV: 43it [05:08, 7.10s/it] Molecule CV: 44it [05:15, 7.03s/it] Molecule CV: 45it [05:23, 7.15s/it] Molecule CV: 46it [05:30, 7.19s/it] Molecule CV: 47it [05:37, 7.18s/it] Molecule CV: 48it [05:44, 7.19s/it] Molecule CV: 49it [05:52, 7.17s/it] Molecule CV: 50it [05:59, 7.27s/it] Molecule CV: 51it [06:06, 7.24s/it] Molecule CV: 52it [06:13, 7.26s/it] Molecule CV: 53it [06:21, 7.25s/it] Molecule CV: 54it [06:28, 7.24s/it] Molecule CV: 55it [06:35, 7.23s/it] Molecule CV: 56it [06:42, 7.22s/it] Molecule CV: 57it [06:50, 7.23s/it] Molecule CV: 58it [06:57, 7.21s/it] Molecule CV: 59it [07:04, 7.26s/it] Molecule CV: 60it [07:11, 
7.29s/it] Molecule CV: 61it [07:19, 7.31s/it] Molecule CV: 61it [07:19, 7.20s/it] 9 | DEBUG:__main__:Saving cross-validation results to lasso/cross_validation.json and lasso/cross_validation.svg 10 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 183 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to ridge/model.pkl 6 | DEBUG:__main__:Saving feature importance to ridge/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:11, 11.21s/it] Molecule CV: 2it [00:22, 11.16s/it] Molecule CV: 3it [00:33, 11.11s/it] Molecule CV: 4it [00:44, 11.09s/it] Molecule CV: 5it [00:55, 11.06s/it] Molecule CV: 6it [01:06, 11.05s/it] Molecule CV: 7it [01:17, 11.11s/it] Molecule CV: 8it [01:28, 11.12s/it] Molecule CV: 9it [01:40, 11.13s/it] Molecule CV: 10it [01:51, 11.12s/it] Molecule CV: 11it [02:02, 11.32s/it] Molecule CV: 12it [02:13, 11.24s/it] Molecule CV: 13it [02:24, 11.19s/it] Molecule CV: 14it [02:36, 11.15s/it] Molecule CV: 15it [02:47, 11.16s/it] Molecule CV: 16it [02:58, 11.15s/it] Molecule CV: 17it [03:09, 11.14s/it] Molecule CV: 18it [03:20, 11.19s/it] Molecule CV: 19it [03:31, 11.15s/it] Molecule CV: 20it [03:42, 11.14s/it] Molecule CV: 21it [03:54, 11.14s/it] Molecule CV: 22it [04:05, 11.10s/it] Molecule CV: 23it [04:16, 11.09s/it] Molecule CV: 24it [04:27, 11.11s/it] Molecule CV: 25it [04:38, 11.13s/it] Molecule CV: 26it [04:49, 11.14s/it] Molecule CV: 27it [05:00, 11.15s/it] Molecule CV: 28it [05:11, 11.12s/it] Molecule CV: 29it [05:23, 11.14s/it] Molecule CV: 30it [05:34, 11.15s/it] Molecule CV: 31it [05:45, 11.14s/it] Molecule CV: 32it [05:56, 11.28s/it] Molecule CV: 33it [06:08, 11.23s/it] Molecule 
CV: 34it [06:19, 11.40s/it] Molecule CV: 35it [06:30, 11.31s/it] Molecule CV: 36it [06:42, 11.26s/it] Molecule CV: 37it [06:53, 11.25s/it] Molecule CV: 38it [07:04, 11.25s/it] Molecule CV: 39it [07:15, 11.21s/it] Molecule CV: 40it [07:26, 11.16s/it] Molecule CV: 41it [07:37, 11.13s/it] Molecule CV: 42it [07:48, 11.12s/it] Molecule CV: 43it [08:00, 11.12s/it] Molecule CV: 44it [08:11, 11.09s/it] Molecule CV: 45it [08:22, 11.11s/it] Molecule CV: 46it [08:33, 11.11s/it] Molecule CV: 47it [08:44, 11.09s/it] Molecule CV: 48it [08:55, 11.08s/it] Molecule CV: 49it [09:06, 11.08s/it] Molecule CV: 50it [09:17, 11.11s/it] Molecule CV: 51it [09:28, 11.10s/it] Molecule CV: 52it [09:39, 11.12s/it] Molecule CV: 53it [09:51, 11.12s/it] Molecule CV: 54it [10:02, 11.09s/it] Molecule CV: 55it [10:13, 11.09s/it] Molecule CV: 56it [10:24, 11.08s/it] Molecule CV: 57it [10:35, 11.11s/it] Molecule CV: 58it [10:46, 11.11s/it] Molecule CV: 59it [10:57, 11.09s/it] Molecule CV: 60it [11:08, 11.09s/it] Molecule CV: 61it [11:19, 11.09s/it] Molecule CV: 61it [11:19, 11.14s/it] 9 | DEBUG:__main__:Saving cross-validation results to ridge/cross_validation.json and ridge/cross_validation.svg 10 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/ridge_mean/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 61 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to mean_ridge/model.pkl 6 | DEBUG:__main__:Saving feature importance to mean_ridge/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:03, 3.30s/it] Molecule CV: 2it [00:06, 3.22s/it] Molecule CV: 3it [00:09, 3.23s/it] Molecule CV: 4it [00:12, 3.19s/it] Molecule CV: 5it [00:16, 3.26s/it] Molecule CV: 6it [00:19, 3.20s/it] Molecule CV: 7it 
[00:22, 3.11s/it] Molecule CV: 8it [00:25, 3.11s/it] Molecule CV: 9it [00:28, 3.09s/it] Molecule CV: 10it [00:31, 3.16s/it] Molecule CV: 11it [00:34, 3.11s/it] Molecule CV: 12it [00:37, 3.13s/it] Molecule CV: 13it [00:40, 3.08s/it] Molecule CV: 14it [00:43, 3.10s/it] Molecule CV: 15it [00:47, 3.08s/it] Molecule CV: 16it [00:49, 3.02s/it] Molecule CV: 17it [00:53, 3.06s/it] Molecule CV: 18it [00:56, 3.06s/it] Molecule CV: 19it [00:59, 3.16s/it] Molecule CV: 20it [01:02, 3.14s/it] Molecule CV: 21it [01:05, 3.16s/it] Molecule CV: 22it [01:08, 3.12s/it] Molecule CV: 23it [01:12, 3.15s/it] Molecule CV: 24it [01:15, 3.21s/it] Molecule CV: 25it [01:18, 3.17s/it] Molecule CV: 26it [01:21, 3.23s/it] Molecule CV: 27it [01:24, 3.19s/it] Molecule CV: 28it [01:28, 3.22s/it] Molecule CV: 29it [01:31, 3.18s/it] Molecule CV: 30it [01:34, 3.26s/it] Molecule CV: 31it [01:37, 3.20s/it] Molecule CV: 32it [01:40, 3.19s/it] Molecule CV: 33it [01:44, 3.17s/it] Molecule CV: 34it [01:47, 3.12s/it] Molecule CV: 35it [01:50, 3.13s/it] Molecule CV: 36it [01:53, 3.09s/it] Molecule CV: 37it [01:56, 3.09s/it] Molecule CV: 38it [01:59, 2.99s/it] Molecule CV: 39it [02:02, 3.08s/it] Molecule CV: 40it [02:05, 3.09s/it] Molecule CV: 41it [02:08, 3.16s/it] Molecule CV: 42it [02:11, 3.14s/it] Molecule CV: 43it [02:14, 3.09s/it] Molecule CV: 44it [02:18, 3.15s/it] Molecule CV: 45it [02:21, 3.13s/it] Molecule CV: 46it [02:24, 3.15s/it] Molecule CV: 47it [02:27, 3.12s/it] Molecule CV: 48it [02:30, 3.17s/it] Molecule CV: 49it [02:33, 3.14s/it] Molecule CV: 50it [02:36, 3.09s/it] Molecule CV: 51it [02:40, 3.14s/it] Molecule CV: 52it [02:43, 3.14s/it] Molecule CV: 53it [02:46, 3.24s/it] Molecule CV: 54it [02:49, 3.21s/it] Molecule CV: 55it [02:53, 3.26s/it] Molecule CV: 56it [02:56, 3.23s/it] Molecule CV: 57it [02:59, 3.25s/it] Molecule CV: 58it [03:03, 3.28s/it] Molecule CV: 59it [03:06, 3.29s/it] Molecule CV: 60it [03:09, 3.29s/it] Molecule CV: 61it [03:12, 3.27s/it] Molecule CV: 61it [03:12, 3.16s/it] 9 | 
DEBUG:__main__:Saving cross-validation results to mean_ridge/cross_validation.json and mean_ridge/cross_validation.svg 10 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 183 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to random_forest/model.pkl 6 | DEBUG:__main__:Saving feature importance to random_forest/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:54, 54.09s/it] Molecule CV: 2it [01:46, 52.81s/it] Molecule CV: 3it [02:37, 52.30s/it] Molecule CV: 4it [03:29, 52.05s/it] Molecule CV: 5it [04:21, 51.97s/it] Molecule CV: 6it [05:13, 51.95s/it] Molecule CV: 7it [06:04, 51.86s/it] Molecule CV: 8it [06:57, 51.98s/it] Molecule CV: 9it [07:49, 52.02s/it] Molecule CV: 10it [08:41, 52.00s/it] Molecule CV: 11it [09:32, 51.90s/it] Molecule CV: 12it [10:24, 51.92s/it] Molecule CV: 13it [11:17, 52.27s/it] Molecule CV: 14it [12:09, 51.96s/it] Molecule CV: 15it [13:00, 51.86s/it] Molecule CV: 16it [13:52, 51.92s/it] Molecule CV: 17it [14:44, 51.76s/it] Molecule CV: 18it [15:35, 51.78s/it] Molecule CV: 19it [16:27, 51.82s/it] Molecule CV: 20it [17:19, 51.80s/it] Molecule CV: 21it [18:11, 51.81s/it] Molecule CV: 22it [19:02, 51.63s/it] Molecule CV: 23it [19:54, 51.65s/it] Molecule CV: 24it [20:46, 51.69s/it] Molecule CV: 25it [21:39, 52.23s/it] Molecule CV: 26it [22:31, 52.25s/it] Molecule CV: 27it [23:24, 52.47s/it] Molecule CV: 28it [24:17, 52.37s/it] Molecule CV: 29it [25:09, 52.40s/it] Molecule CV: 30it [26:01, 52.27s/it] Molecule CV: 31it [26:54, 52.44s/it] Molecule CV: 32it [27:47, 52.74s/it] Molecule CV: 33it [28:40, 52.79s/it] Molecule CV: 34it [29:33, 52.74s/it] Molecule CV: 35it [30:26, 
52.77s/it] Molecule CV: 36it [31:20, 53.18s/it] Molecule CV: 37it [32:13, 53.29s/it] Molecule CV: 38it [33:06, 53.18s/it] Molecule CV: 39it [34:00, 53.40s/it] Molecule CV: 40it [34:53, 53.14s/it] Molecule CV: 41it [35:45, 52.95s/it] Molecule CV: 42it [36:38, 52.88s/it] Molecule CV: 43it [37:30, 52.67s/it] Molecule CV: 44it [38:23, 52.60s/it] Molecule CV: 45it [39:15, 52.50s/it] Molecule CV: 46it [40:07, 52.38s/it] Molecule CV: 47it [40:59, 52.32s/it] Molecule CV: 48it [41:53, 52.73s/it] Molecule CV: 49it [42:45, 52.68s/it] Molecule CV: 50it [43:38, 52.60s/it] Molecule CV: 51it [44:31, 52.74s/it] Molecule CV: 52it [45:24, 52.95s/it] Molecule CV: 53it [46:17, 52.87s/it] Molecule CV: 54it [47:09, 52.71s/it] Molecule CV: 55it [48:03, 53.12s/it] Molecule CV: 56it [48:56, 52.91s/it] Molecule CV: 57it [49:48, 52.81s/it] Molecule CV: 58it [50:41, 52.77s/it] Molecule CV: 59it [51:36, 53.40s/it] Molecule CV: 60it [52:30, 53.65s/it] Molecule CV: 61it [53:22, 53.24s/it] Molecule CV: 61it [53:22, 52.51s/it] 9 | DEBUG:__main__:Saving cross-validation results to random_forest/cross_validation.json and random_forest/cross_validation.svg 10 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/random_forest_mean/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 61 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to mean_random_forest/model.pkl 6 | DEBUG:__main__:Saving feature importance to mean_random_forest/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:09, 9.32s/it] Molecule CV: 2it [00:18, 9.30s/it] Molecule CV: 3it [00:28, 9.35s/it] Molecule CV: 4it [00:37, 9.32s/it] Molecule CV: 5it [00:46, 9.30s/it] Molecule CV: 6it [00:55, 9.28s/it] Molecule CV: 7it [01:05, 
9.27s/it] Molecule CV: 8it [01:14, 9.31s/it] Molecule CV: 9it [01:23, 9.31s/it] Molecule CV: 10it [01:33, 9.30s/it] Molecule CV: 11it [01:42, 9.29s/it] Molecule CV: 12it [01:52, 9.50s/it] Molecule CV: 13it [02:01, 9.41s/it] Molecule CV: 14it [02:10, 9.38s/it] Molecule CV: 15it [02:20, 9.34s/it] Molecule CV: 16it [02:29, 9.31s/it] Molecule CV: 17it [02:38, 9.28s/it] Molecule CV: 18it [02:47, 9.28s/it] Molecule CV: 19it [02:57, 9.29s/it] Molecule CV: 20it [03:06, 9.28s/it] Molecule CV: 21it [03:15, 9.28s/it] Molecule CV: 22it [03:24, 9.28s/it] Molecule CV: 23it [03:34, 9.30s/it] Molecule CV: 24it [03:43, 9.29s/it] Molecule CV: 25it [03:52, 9.27s/it] Molecule CV: 26it [04:02, 9.27s/it] Molecule CV: 27it [04:11, 9.26s/it] Molecule CV: 28it [04:20, 9.28s/it] Molecule CV: 29it [04:29, 9.26s/it] Molecule CV: 30it [04:39, 9.26s/it] Molecule CV: 31it [04:48, 9.26s/it] Molecule CV: 32it [04:57, 9.28s/it] Molecule CV: 33it [05:06, 9.26s/it] Molecule CV: 34it [05:16, 9.25s/it] Molecule CV: 35it [05:25, 9.24s/it] Molecule CV: 36it [05:34, 9.23s/it] Molecule CV: 37it [05:43, 9.23s/it] Molecule CV: 38it [05:52, 9.23s/it] Molecule CV: 39it [06:02, 9.24s/it] Molecule CV: 40it [06:11, 9.23s/it] Molecule CV: 41it [06:20, 9.23s/it] Molecule CV: 42it [06:29, 9.21s/it] Molecule CV: 43it [06:39, 9.22s/it] Molecule CV: 44it [06:48, 9.24s/it] Molecule CV: 45it [06:58, 9.37s/it] Molecule CV: 46it [07:07, 9.49s/it] Molecule CV: 47it [07:17, 9.43s/it] Molecule CV: 48it [07:26, 9.36s/it] Molecule CV: 49it [07:35, 9.32s/it] Molecule CV: 50it [07:44, 9.29s/it] Molecule CV: 51it [07:53, 9.29s/it] Molecule CV: 52it [08:03, 9.27s/it] Molecule CV: 53it [08:12, 9.26s/it] Molecule CV: 54it [08:21, 9.27s/it] Molecule CV: 55it [08:30, 9.26s/it] Molecule CV: 56it [08:40, 9.24s/it] Molecule CV: 57it [08:49, 9.25s/it] Molecule CV: 58it [08:58, 9.24s/it] Molecule CV: 59it [09:07, 9.24s/it] Molecule CV: 60it [09:17, 9.25s/it] Molecule CV: 61it [09:26, 9.25s/it] Molecule CV: 61it [09:26, 9.29s/it] 9 | 
DEBUG:__main__:Saving cross-validation results to mean_random_forest/cross_validation.json and mean_random_forest/cross_validation.svg 10 | -------------------------------------------------------------------------------- /mdml/examples/PD-L1/gradient_boosting_mean/STDERR: -------------------------------------------------------------------------------- 1 | DEBUG:__main__:Workflow set to nested=True 2 | DEBUG:__main__:Loading data: data.csv 3 | DEBUG:__main__:Dataset contains 61 entries 4 | DEBUG:__main__:Building initial model 5 | DEBUG:__main__:Saving model to mean_gradient_boosting/model.pkl 6 | DEBUG:__main__:Saving feature importance to mean_gradient_boosting/importance.csv 7 | DEBUG:__main__:Running cross-validation 8 | Molecule CV: 0it [00:00, ?it/s] Molecule CV: 1it [00:20, 20.99s/it] Molecule CV: 2it [00:42, 21.03s/it] Molecule CV: 3it [01:03, 21.01s/it] Molecule CV: 4it [01:24, 21.06s/it] Molecule CV: 5it [01:45, 21.05s/it] Molecule CV: 6it [02:06, 21.06s/it] Molecule CV: 7it [02:27, 21.05s/it] Molecule CV: 8it [02:48, 21.04s/it] Molecule CV: 9it [03:09, 21.04s/it] Molecule CV: 10it [03:30, 21.02s/it] Molecule CV: 11it [03:51, 21.05s/it] Molecule CV: 12it [04:12, 21.06s/it] Molecule CV: 13it [04:34, 21.29s/it] Molecule CV: 14it [04:55, 21.16s/it] Molecule CV: 15it [05:16, 21.08s/it] Molecule CV: 16it [05:36, 20.99s/it] Molecule CV: 17it [05:57, 20.92s/it] Molecule CV: 18it [06:18, 20.94s/it] Molecule CV: 19it [06:40, 21.14s/it] Molecule CV: 20it [07:01, 21.06s/it] Molecule CV: 21it [07:22, 21.08s/it] Molecule CV: 22it [07:43, 21.08s/it] Molecule CV: 23it [08:04, 21.11s/it] Molecule CV: 24it [08:25, 21.10s/it] Molecule CV: 25it [08:46, 21.13s/it] Molecule CV: 26it [09:07, 21.11s/it] Molecule CV: 27it [09:29, 21.17s/it] Molecule CV: 28it [09:50, 21.16s/it] Molecule CV: 29it [10:11, 21.12s/it] Molecule CV: 30it [10:32, 21.13s/it] Molecule CV: 31it [10:53, 21.20s/it] Molecule CV: 32it [11:14, 21.16s/it] Molecule CV: 33it [11:35, 21.12s/it] Molecule CV: 34it 
#!/ap/rhel7/bin/python3.6

####################################################################
# Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ###
# Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com #
####################################################################

#Import Python modules
import logging
import os
import subprocess

###Initiate logger###
logger = logging.getLogger(__name__)

def run_job(command):
    """Run a command through the shell and pipe its output to the log.

    Parameters
    ----------
    command : list
        Command and its arguments. The list is joined with spaces and run
        with shell=True because several arguments rely on shell quoting.
    """
    #Run provided command, joining list with space. Pipe stdout and stderr to log file
    process = subprocess.run(' '.join(command), stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, shell=True, text=True)

    #Iterate over combined stdout/stderr, skipping blank lines and ExitStatus noise
    for line in process.stdout.split('\n'):
        if line and "ExitStatus" not in line:
            #Write to log file for debugging
            logger.debug(line)

def main(ligname, args, desmond_host, SCHRODINGER, master_dir):
    """Launch a Desmond MD run for one ligand unless its outputs already exist.

    Parameters
    ----------
    ligname : str
        Ligand (repetition) name, e.g. "lig1_repetition1".
    args : argparse.Namespace
        Parsed arguments; only args.oplsdir is read here.
    desmond_host : str
        Host entry passed to multisim's -HOST flag.
    SCHRODINGER : str
        Schrodinger installation root.
    master_dir : str
        Workflow root containing the desmond_md output tree.

    Returns
    -------
    tuple
        (trajectory file name, trajectory directory name, base ligand name).
    """
    #Generate trajectory file name
    outcms = "%s-out.cms" % ligname

    #Generate trajectory directory name
    outtrj = "%s_trj" % ligname

    #Get base ligand name (strip the "_repetition<N>" suffix, if any)
    lig_basename = ligname.split("_repetition")[0]

    #Directory holding this repetition's outputs (hoisted; used for both checks)
    outdir = os.path.join(master_dir, "desmond_md", lig_basename, ligname)

    #Launch Desmond only when the trajectory file or directory is missing
    if not os.path.isfile(os.path.join(outdir, outcms)) or not os.path.isdir(os.path.join(outdir, outtrj)):
        #Prepare Schrodinger's multisim command ($SCHRODINGER/utilities/multisim)
        run_cmd = os.path.join(SCHRODINGER, "utilities", "multisim")

        #Prepare Desmond MD command
        command = [run_cmd, '-JOBNAME', ligname, '-HOST', desmond_host, '-maxjob', '1', '-cpu', '1', '-m', '%s_md.msj'%ligname, '-c', '%s_md.cfg'%ligname, '-description', '"Molecular Dynamics"', '%s_md.cms'%ligname, '-mode', 'umbrella', '-set', '"stage[1].set_family.md.jlaunch_opt=[\"-gpu\"]"', '-o', outcms, '-OPLSDIR', args.oplsdir, '-lic', 'DESMOND_GPGPU:16', '-ATTACHED', '-WAIT']

        #Capture current step
        logger.info("Running Desmond: %s" % ' '.join(command))

        #Run Desmond MD
        run_job(command)

    #Trajectory file(s) exist
    else:
        #Capture current step
        logger.info("Desmond trajectory found: %s, %s" % (outcms, outtrj))

    #Return trajectory file and directory names and ligand name
    return outcms, outtrj, lig_basename

if __name__ == '__main__':
    main(ligname, args, desmond_host, SCHRODINGER, master_dir)
#!/ap/rhel7/bin/python3.6

####################################################################
# Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ###
# Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com #
####################################################################

#Import Python modules
import logging
import os
import subprocess

###Initiate logger###
logger = logging.getLogger(__name__)

def run_job(command):
    """Run a command through the shell and pipe its output to the log.

    Parameters
    ----------
    command : list
        Command and its arguments; joined with spaces and executed with
        shell=True because several arguments rely on shell quoting.
    """
    #Run provided command, joining list with space. Pipe stdout and stderr to log file
    process = subprocess.run(' '.join(command), stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, shell=True, text=True)

    #Iterate over combined stdout/stderr, skipping blank lines and ExitStatus noise
    for line in process.stdout.split('\n'):
        if line and "ExitStatus" not in line:
            #Write to log file for debugging
            logger.debug(line)

def main(ligname, pvcomplex, args, bmin_host, SCHRODINGER, master_dir, template_dir):
    """Minimize a ligand/protein complex with bmin unless the result exists.

    Parameters
    ----------
    ligname : str
        Ligand name used to derive all job file names.
    pvcomplex : str
        Pose-viewer complex filename (kept for interface compatibility;
        not read in this step).
    args : argparse.Namespace
        Parsed arguments; only args.oplsdir is read here.
    bmin_host : str
        Host entry passed to bmin's -HOST flag.
    SCHRODINGER : str
        Schrodinger installation root.
    master_dir : str
        Workflow root containing the desmond_md output tree.
    template_dir : str
        Directory containing bmin_template.com.

    Returns
    -------
    str
        Minimized complex filename ("<ligname>_out_complex_min.mae").
    """
    #Generate output minimized complex filename
    bmincomplex = "%s_out_complex_min.mae" % ligname

    #Only run the minimization when the output does not already exist
    if not os.path.isfile(os.path.join(master_dir, "desmond_md", ligname, "md_setup", bmincomplex)):
        #Read in minimization template
        with open(os.path.join(template_dir, "bmin_template.com"), "r") as template:
            lines = template.readlines()

        #Write a ligand-specific job file, substituting the IN_NAME/OUT_NAME placeholders
        with open("%s_min.com" % ligname, "w") as ligoutput:
            for line in lines:
                ligoutput.write(line.replace("IN_NAME", "%s_out_complex.mae"%ligname).replace("OUT_NAME", "%s_out_complex_min.mae"%ligname))

        #Prepare Schrodinger's bmin command ($SCHRODINGER/bmin)
        run_cmd = os.path.join(SCHRODINGER, "bmin")

        #Prepare minimization command
        command = [run_cmd, "%s_min"%ligname, "-OPLSDIR", args.oplsdir, "-HOST", bmin_host, "-WAIT"]

        #Capture current step
        logger.info("Running minimization: %s" % ' '.join(command))

        #Run minimization
        run_job(command)

    #Minimized complex exists
    else:
        #Capture current step
        logger.info("Minimized complex found: %s" % bmincomplex)

    #Return minimized complex filename
    return bmincomplex

if __name__ == '__main__':
    main(ligname, pvcomplex, args, bmin_host, SCHRODINGER, master_dir, template_dir)
# -*- coding: utf-8 -*-
"""
Plotting Methods
----------------
Generate basic plots for MDML analysis.

@author: Benjamin Shields
@email: benjamin.shields@bms.com
"""

############################################################################## Imports

import logging
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

############################################################################## Logger

logger = logging.getLogger(__name__)

############################################################################## Functions

def parity_plot(pred: list, obs: list, title: str = 'Fit', export_path: str = None,
                xlabel: str = 'Predicted', ylabel: str = 'Observed',
                color: str = 'black', cod: str = 'R^2') -> "tuple[float, float]":
    """
    Plot predicted versus observed and return the RMSE and coefficient of
    determination.

    Parameters
    ----------
    pred : list
        Predicted values.
    obs : list
        Observed values.
    title : str, optional
        Plot title. The default is 'Fit'.
    export_path : str, optional
        Path to export SVG. Images are exported as export_path.svg. The default
        is None (the figure is shown instead of saved).
    xlabel : str, optional
        Label for x-axis. The default is 'Predicted'.
    ylabel : str, optional
        Label for y-axis. The default is 'Observed'.
    color : str, optional
        Color of plot points and error bars. The default is 'black'.
    cod : str, optional
        Nomenclature used for the computed coefficient of determination. If the
        results are from cross-validation use 'Q^2'. The default is 'R^2'.

    Returns
    -------
    rmse : float
        Root mean squared error for predicted values.
    r2 : float
        Coefficient of determination for predicted values.
    """

    # Silence matplotlib's own debug chatter
    plt.set_loglevel("info")

    # Compute RMSE and R^2 values
    pred = np.array(pred)
    obs = np.array(obs)
    rmse = np.sqrt(np.mean((pred - obs) ** 2))
    r2 = r2_score(obs, pred)

    # Get upper and lower bounds of plot (5% padding around the data range)
    upper = max([max(pred), max(obs)])
    lower = min([min(pred), min(obs)])
    pad = (upper - lower) * 0.05

    # Plot scatter with a y = x parity line behind the points
    plt.figure(figsize=(6,6))
    plt.scatter(pred, obs, color=color, alpha=0.4)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title('{0}($RMSE={1}$, ${2}={3}$)'.format(
        title,
        round(float(rmse),2),
        cod,
        round(r2,2))
        )
    plt.plot([lower,upper], [lower,upper], 'k-', alpha=0.75, zorder=0)
    plt.xlim(lower - pad, upper + pad)
    plt.ylim(lower - pad, upper + pad)

    # Save and/or show
    if export_path is not None:
        plt.savefig(export_path + '.svg', format='svg', dpi=1200, bbox_inches='tight')
        plt.close()
    else:
        plt.show()

    return rmse, r2
#!/ap/rhel7/bin/python3.6

####################################################################
# Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ###
# Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com #
####################################################################

#Import Python modules
import logging
import sys
import os
import json

###Initiate logger###
logger = logging.getLogger(__name__)

def read_json(MDFit_path):
    """Read institution-specific parameters from parameters.json.

    Parameters
    ----------
    MDFit_path : str
        Directory containing parameters.json.

    Returns
    -------
    dict
        Parsed parameters dictionary.
    """
    #Open institution-based parameters json file for reading
    with open(os.path.join(MDFit_path, "parameters.json"), "r") as injson:
        #Parse parameters into a dictionary
        parameters = json.load(injson)

    #Capture current step
    logger.info("json file found. Institution parameters read in.")

    #Write parameters for debugging
    logger.debug("Parameters: %s" % parameters)

    #Return parameters dictionary
    return parameters

def write_json(MDFit_path):
    """Write a template parameters json for the user to edit and rename.

    Parameters
    ----------
    MDFit_path : str
        Directory in which parameters_TEMPLATE.json is created.
    """
    #Template data to write to json file
    dictionary = {
        "hostnames": {
            "FFBUILDER":"localhost",
            "BMIN":"localhost",
            "MULTISIM":"localhost",
            "DESMOND":"localhost-gpu",
            "ANALYSIS":"localhost"
        },
        "parameters": {
            "MAXLIGS":100,
            "FFPROC":32
        }
    }

    #Convert dictionary to json object
    json_object = json.dumps(dictionary, indent=4)

    #Open template json file for writing
    with open(os.path.join(MDFit_path, "parameters_TEMPLATE.json"), "w") as outfile:
        #Write lines to json file
        outfile.write(json_object)

    #Capture current step
    logger.critical("Edit the %s file and rename to %s" % (os.path.join(MDFit_path, "parameters_TEMPLATE.json"), os.path.join(MDFit_path, "parameters.json")))

def main(MDFit_path):
    """Load parameters.json, or emit/point to a template and exit if missing.

    Parameters
    ----------
    MDFit_path : str
        Directory expected to contain parameters.json.

    Returns
    -------
    dict
        Parsed parameters dictionary. Calls sys.exit() when no parameters
        file is present.
    """
    #Check if json file exists
    if os.path.isfile(os.path.join(MDFit_path, "parameters.json")):
        #If it does, read in parameters
        inst_params = read_json(MDFit_path)

    #json file does not exist
    else:
        #Check if template exists
        if os.path.isfile(os.path.join(MDFit_path, "parameters_TEMPLATE.json")):
            #If it does, print path to template for user
            logger.critical("Parameter template file found: %s" % (os.path.join(MDFit_path, "parameters_TEMPLATE.json")))

            #Print path needed for user
            logger.critical("Rename to: %s" % (os.path.join(MDFit_path, "parameters.json")))

        #Template file does not exist
        else:
            #Write out template for user to edit
            write_json(MDFit_path)

        #Print to screen
        print("Parameters not found. Information captured in MDFit.log")

        #Exit
        sys.exit()

    #Return parameters
    return inst_params

if __name__ == '__main__':
    main(MDFit_path)
39 | ) 40 | data.add_argument( 41 | '-model', 42 | type=str, 43 | default=None, 44 | help="Directory containing trained ML model", 45 | required=True 46 | ) 47 | data.add_argument( 48 | '-id_col', 49 | type=str, 50 | default=None, 51 | help="Header of compound ID column.", 52 | required=True 53 | ) 54 | data.add_argument( 55 | '-drop_col', 56 | nargs='+', 57 | type=str, 58 | default=[], 59 | help="Columns that should be removed from the input feature CSV." 60 | ) 61 | data.add_argument( 62 | '-group', 63 | type=str, 64 | default=None, 65 | help="""Grouping method. The options include 'mean', 'min', and 'max'. 66 | Data will be grouped by compound ID (id_col).""" 67 | ) 68 | 69 | return parser 70 | 71 | ############################################################################## Main 72 | 73 | def main(args): 74 | # Load and preprocess data 75 | logger.debug(f'Loading data: {args.input}') 76 | data = cli.load_data( 77 | args.input, args.id_col, drop=args.drop_col, aggregation=args.group 78 | ) 79 | logger.debug(f'Dataset contains {len(data)} entries') 80 | 81 | # Load modeling workflow 82 | logger.debug(f'Loading model: {args.model}') 83 | workflow = model.load_workflow(os.path.join(args.model, 'model.pkl')) 84 | 85 | # Make predictions 86 | logger.debug(f'Making predictions: {workflow.target}') 87 | data[f'Predicted {workflow.target}'] = workflow.predict(data) 88 | 89 | # Save results 90 | logger.debug(f'Saving results to {args.output}') 91 | data.to_csv(args.output) 92 | 93 | 94 | if __name__ == "__main__": 95 | parser = get_parser() 96 | args = parser.parse_args() 97 | if args.debug: 98 | logging.basicConfig(level=logging.DEBUG) 99 | main(args) 100 | 101 | 102 | -------------------------------------------------------------------------------- /templates/desmond_md_job_template.msj: -------------------------------------------------------------------------------- 1 | # Desmond standard NPT relaxation protocol 2 | # All times are in the unit of ps. 
3 | # Energy is in the unit of kcal/mol. 4 | task { 5 | task = "desmond:auto" 6 | set_family = { 7 | desmond = { 8 | checkpt.write_last_step = no 9 | } 10 | } 11 | } 12 | 13 | simulate { 14 | title = "Brownian Dynamics NVT, T = 10 K, small timesteps, and restraints on solute heavy atoms, 100ps" 15 | annealing = off 16 | time = 100 17 | timestep = [0.001 0.001 0.003 ] 18 | temperature = 10.0 19 | ensemble = { 20 | class = "NVT" 21 | method = "Brownie" 22 | brownie = { 23 | delta_max = 0.1 24 | } 25 | } 26 | restrain = { 27 | atom = "solute_heavy_atom" 28 | force_constant = 50.0 29 | } 30 | } 31 | 32 | simulate { 33 | effect_if = [["==" "-gpu" "@*.*.jlaunch_opt[-1]"] 'ensemble.method = Langevin'] 34 | title = "NVT, T = 10 K, small timesteps, and restraints on solute heavy atoms, 12ps" 35 | annealing = off 36 | time = 12 37 | timestep = [0.001 0.001 0.003] 38 | temperature = 10.0 39 | restrain = { atom = solute_heavy_atom force_constant = 50.0 } 40 | ensemble = { 41 | class = NVT 42 | method = Berendsen 43 | thermostat.tau = 0.1 44 | } 45 | 46 | randomize_velocity.interval = 1.0 47 | eneseq.interval = 0.3 48 | trajectory.center = [] 49 | } 50 | 51 | simulate { 52 | title = "NPT, T = 10 K, and restraints on solute heavy atoms, 12ps" 53 | effect_if = [["==" "-gpu" "@*.*.jlaunch_opt[-1]"] 'ensemble.method = Langevin'] 54 | annealing = off 55 | time = 12 56 | temperature = 10.0 57 | restrain = retain 58 | ensemble = { 59 | class = NPT 60 | method = Berendsen 61 | thermostat.tau = 0.1 62 | barostat .tau = 50.0 63 | } 64 | 65 | randomize_velocity.interval = 1.0 66 | eneseq.interval = 0.3 67 | trajectory.center = [] 68 | } 69 | 70 | solvate_pocket { 71 | should_skip = true 72 | ligand_file = ? 
73 | } 74 | 75 | simulate { 76 | title = "NPT and restraints on solute heavy atoms, 12ps" 77 | effect_if = [["@*.*.annealing"] 'annealing = off temperature = "@*.*.temperature[0][0]"' 78 | ["==" "-gpu" "@*.*.jlaunch_opt[-1]"] 'ensemble.method = Langevin'] 79 | time = 12 80 | restrain = retain 81 | ensemble = { 82 | class = NPT 83 | method = Berendsen 84 | thermostat.tau = 0.1 85 | barostat .tau = 50.0 86 | } 87 | 88 | randomize_velocity.interval = 1.0 89 | eneseq.interval = 0.3 90 | trajectory.center = [] 91 | } 92 | 93 | simulate { 94 | title = "NPT and no restraints, 24ps" 95 | effect_if = [["@*.*.annealing"] 'annealing = off temperature = "@*.*.temperature[0][0]"' 96 | ["==" "-gpu" "@*.*.jlaunch_opt[-1]"] 'ensemble.method = Langevin'] 97 | time = 24 98 | ensemble = { 99 | class = NPT 100 | method = Berendsen 101 | thermostat.tau = 0.1 102 | barostat .tau = 2.0 103 | } 104 | 105 | eneseq.interval = 0.3 106 | trajectory.center = solute 107 | } 108 | 109 | simulate { 110 | cfg_file = "CONFIG_NAME" 111 | jobname = "$MASTERJOBNAME" 112 | dir = "." 113 | compress = "" 114 | } 115 | 116 | pl_analysis {} 117 | -------------------------------------------------------------------------------- /mdml/mdml/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Base Classes 4 | ------------ 5 | Helpful base classes for development. 
class Savable:
    """
    The Savable class implements general methods for saving and loading class
    objects. The __save__ method will save the state dictionary (__dict__) or
    any key/value pair as a pickled dictionary. The __load__ method will load
    and automatically name an entire pickled dictionary or any individual key
    value pair. In addition, __load__ will load non-dictionary pickled objects
    into a specified attribute.
    """

    def __save__(self, path: str, attrname: "str | list | None" = None) -> None:
        """
        Save the state dictionary or an individual attribute as a key/value
        pair. If attrname is None then the entire state dictionary is saved.

        Parameters
        ----------
        path : str
            Path to save attribute. Data will be saved as a pickle file using
            dill.
        attrname : str, list, optional
            Name of attribute(s) to save. The default is None. If None, the
            entire state dictionary will be saved.

        Raises
        ------
        AssertionError
            If a requested attribute is not present on the instance.
        """

        if isinstance(attrname, str):
            # Single attribute: save as a one-entry dict keyed by its name
            assert hasattr(self, attrname)
            savedict = {attrname: getattr(self, attrname)}
        elif isinstance(attrname, list):
            # Multiple attributes: collect each requested name
            savedict = {}
            for name in attrname:
                assert hasattr(self, name)
                savedict[name] = getattr(self, name)
        else:
            # No selection: persist the full state dictionary
            savedict = self.__dict__
        with open(path, 'wb') as file:
            dill.dump(savedict, file)

    def __load__(self, path: str, attrname: "str | list | None" = None) -> None:
        """
        Load saved objects to attributes based on dictionary key/value pairs,
        load an individual element from a pickled dictionary, or load objects
        to a specified attribute.

        Parameters
        ----------
        path : str
            Path to pickled object file. For example a scikit-learn model.
        attrname : str, list, optional
            Name of attribute when loaded. For example 'model'. If attrname is
            None or not present in the loaded dict, all key/value pairs will be
            loaded.
        """

        # Load pickle file
        with open(path, 'rb') as file:
            loaded = dill.load(file)

        # Wrap non-dict payloads so they can be assigned to attrname below.
        # NOTE(review): if attrname is None AND the payload is not a dict,
        # this puts a None key into __dict__ — presumably callers always pass
        # attrname for non-dict files; confirm before tightening.
        if not isinstance(loaded, dict):
            loaded = {attrname: loaded}

        # Load specific attributes (missing keys are silently skipped)
        if isinstance(attrname, str):
            attrname = [attrname]
        if isinstance(attrname, list):
            for name in attrname:
                attr = loaded.get(name)
                if attr is not None:
                    setattr(self, name, attr)

        # Load full dict
        else:
            self.__dict__.update(loaded)
#################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | import subprocess 13 | 14 | ###Initiate logger### 15 | logger = logging.getLogger(__name__) 16 | 17 | def run_job(command): 18 | #Run provided command, joining list with space. Pipe stdout and sdterror to log file 19 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 20 | stderr=subprocess.STDOUT, shell=True, text=True) 21 | 22 | #Iterate over sdtout and sdterror 23 | for line in process.stdout.split('\n'): 24 | #Ignore blank lines 25 | if line != "": 26 | #Ignore ExitStatus 27 | if "ExitStatus" not in line: 28 | #Write to log file for debugging 29 | logger.debug(line) 30 | 31 | def main(master_dir, SCHRODINGER, args, charge, ligname, multisim_host, bmincomplex, template_dir): 32 | #Prepare Schrodinger multisim command ($SCHRODINGER/utilities/multisim) 33 | run_cmd = os.path.join(SCHRODINGER, "utilities", "multisim") 34 | 35 | #Generate output filename 36 | simbox="%s_md_setup_out.cms"%ligname 37 | 38 | #Generate setup filename 39 | inputfile="%s_md_setup.msj"%ligname 40 | 41 | #Generate ligand-specific job name 42 | jobname="%s_setup"%ligname 43 | 44 | #Check that simulation box does not exist 45 | if os.path.isfile(os.path.join(master_dir, "desmond_md", ligname, "md_setup", simbox)) == False: 46 | #If not, check system charge 47 | if charge > 0: 48 | #If positive, set template filename to positive 49 | filename = os.path.join(template_dir, "positive_template.msj") 50 | 51 | elif charge == 0: 52 | #If neutral, set template filename to netural 53 | filename = os.path.join(template_dir, "neutral_template.msj") 54 | 55 | #Must be negative 56 | else: 57 | #If negative, set template filename to negative 58 | filename = os.path.join(template_dir, "negative_template.msj") 59 | 60 | #Read in template file 61 | with open(filename, "r") as template: 62 | #Put lines in a variable 63 | lines = 
template.readlines() 64 | 65 | #Open setup filename for writing 66 | with open(inputfile, "w") as ligoutput: 67 | #Iterate through all the template lines 68 | 69 | for line in lines: 70 | #Write line to out file, replacing solvent keyword with desired solvent. NOTE(review): line.replace("", args.solvent) inserts args.solvent between every character of the line rather than substituting a keyword — the placeholder token this comment refers to (e.g. an angle-bracketed SOLVENT tag) appears to have been lost from this copy; confirm against the .msj templates in the templates/ directory 71 | ligoutput.write(line.replace("",args.solvent)) 72 | 73 | #Generate command for building the box 74 | command = [run_cmd, "-maxjob", "1", "-JOBNAME", jobname, "-m", inputfile, bmincomplex, "-o", simbox, "-OPLSDIR", args.oplsdir, "-HOST", multisim_host, "-WAIT"] 75 | 76 | #Document current step 77 | logger.info("Building simulation box: %s"%simbox) 78 | 79 | #Run command 80 | run_job(command) 81 | 82 | #Simulation box exists 83 | else: 84 | #Document current step 85 | logger.info("Simulation box found: %s"%simbox) 86 | 87 | #Return output filename 88 | return simbox 89 | 90 | if __name__ == '__main__': 91 | main(master_dir, SCHRODINGER, args, charge, ligname, multisim_host, bmincomplex, template_dir) -------------------------------------------------------------------------------- /bin/mdfit_get_charge.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import os 11 | import shutil 12 | import subprocess 13 | import re 14 | 15 | ###Initiate logger### 16 | logger = logging.getLogger(__name__) 17 | 18 | def run_job(command): 19 | #Run provided command, joining list with space.
Pipe stdout and sdterror to log file 20 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 21 | stderr=subprocess.STDOUT, shell=True, text=True) 22 | 23 | #Iterate over sdtout and sdterror 24 | for line in process.stdout.split('\n'): 25 | #Ignore blank lines 26 | if line != "": 27 | #Ignore ExitStatus 28 | if "ExitStatus" not in line: 29 | #Write to log file for debugging 30 | logger.debug(line) 31 | 32 | def main(SCHRODINGER, ligname, master_dir, args): 33 | #Prepare Schrodinger proplister command ($SCHRODINGER/utilities/proplister) 34 | run_cmd = os.path.join(SCHRODINGER, "utilities", "proplister") 35 | 36 | #Check if proplister has been run before 37 | if os.path.isfile(os.path.join(master_dir, "desmond_md", ligname, "md_setup", "%s_atoms.csv"%ligname)) == False: 38 | #If not, prepare proplister command 39 | command = [run_cmd, "-atom_bond_props", "%s_out_complex_min.mae"%ligname, "-c", "-o", "%s.csv"%ligname] 40 | 41 | #Run proplister 42 | run_job(command) 43 | 44 | #Proplister has been run before 45 | else: 46 | #Copy proplister output from md_setup directory to scratch space 47 | shutil.copy(os.path.join(master_dir, "desmond_md", ligname, "md_setup", "%s_atoms.csv"%ligname), "%s_atoms.csv"%ligname) 48 | 49 | #Define pattern for matching 50 | fcFinder = re.compile(r'i_m_formal_charge') 51 | 52 | #Initiate variable for counting lines 53 | lineCount = 0 54 | 55 | #Initiate total charge variable 56 | totQ = 0 57 | 58 | #Read in proplister output 59 | with open("%s_atoms.csv"%ligname, "r") as inFile: 60 | #Initiate infinite loop to iterate through file 61 | while 1: 62 | #Read in next line of file 63 | dataLine = inFile.readline() 64 | 65 | #Break loop if end of file 66 | if not dataLine: break 67 | 68 | #Split line into list by comma (csv) 69 | tmpList = dataLine[:-1].split(",") 70 | 71 | #Check if first line 72 | if lineCount == 0: 73 | #Initiate variable for number of columns 74 | columnCount = 0 75 | 76 | #Iterate over all columns in first 
line of file 77 | for column in tmpList: 78 | #Locate the formal charge column 79 | theMatch = fcFinder.match(column) 80 | 81 | #If formal charge column is located 82 | if theMatch: 83 | #Get column number 84 | chargeColumn = columnCount 85 | 86 | #Break loop 87 | break 88 | 89 | #Increment column variable 90 | columnCount += 1 91 | 92 | #Not first line 93 | else: 94 | #Increment by value in formal charge column 95 | totQ += int(tmpList[chargeColumn]) 96 | 97 | #Increment line count variable 98 | lineCount += 1 99 | 100 | #Document current step 101 | logger.info("Total system charge for %s: %s"%(ligname, totQ)) 102 | 103 | #Return total system charge 104 | return totQ 105 | 106 | if __name__ == '__main__': 107 | main(SCHRODINGER, ligname. master_dir, args) -------------------------------------------------------------------------------- /bin/mdfit_slicetrj.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | import subprocess 13 | 14 | #Import Schrodinger modules 15 | from schrodinger.application.desmond.packages import traj 16 | 17 | ###Initiate logger### 18 | logger = logging.getLogger(__name__) 19 | 20 | def run_job(command): 21 | #Run provided command, joining list with space. 
Pipe stdout and sdterror to log file 22 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 23 | stderr=subprocess.STDOUT, shell=True, text=True) 24 | 25 | #Iterate over sdtout and sdterror 26 | for line in process.stdout.split('\n'): 27 | #Ignore blank lines 28 | if line != "": 29 | #Ignore ExitStatus 30 | if "ExitStatus" not in line: 31 | #Write to log file for debugging 32 | logger.debug(line) 33 | 34 | def count_frames(trj_path): 35 | #Read in trajectory with Schrodinger's read_traj utilty 36 | tr = traj.read_traj(trj_path) 37 | 38 | #Return length of trajectory (number of frames) 39 | return len(tr) 40 | 41 | def main(SCHRODINGER, rep, master_dir, args): 42 | #Prepare Schrodinger's run command ($SCHRODINGER/run) 43 | run_cmd = os.path.join(SCHRODINGER, 'run') 44 | 45 | #Generate repetition name _repetition<#> 46 | basename = os.path.basename(rep) 47 | 48 | #Generate ligand name 49 | ligbase = basename.split("_repetition")[0] 50 | 51 | #Generate path to trajectory files in scratch space 52 | md_path = os.path.join(master_dir, "desmond_md", "scratch") 53 | 54 | #Check if trajectory files are in scratch 55 | if os.path.isfile(os.path.join(md_path, "%s-out.cms"%basename)) == True: 56 | #If they are, generate path to trajectory file in scratch space 57 | cms_path = os.path.join(md_path, "%s-out.cms"%basename) 58 | 59 | #If they are, generate path to trajectory directory in scratch space 60 | trj_path = os.path.join(md_path, "%s_trj"%basename) 61 | 62 | #Trajectory files are in permanent directories; allows slice to be done separate from Desmond MD 63 | else: 64 | #Generate path to trajectory file in permanent directory 65 | cms_path = os.path.join(master_dir, "desmond_md", ligbase, basename, "%s-out.cms"%basename) 66 | #Generate path to trajectory directory in permanent directory 67 | trj_path = os.path.join(master_dir, "desmond_md", ligbase, basename, "%s_trj"%basename) 68 | 69 | #Check if the user wants to remove frames 70 | if 
args.slice_start != 0 or args.slice_end != None: 71 | #Slice hasn't been done before 72 | if os.path.isfile(os.path.join(master_dir, "desmond_md", ligbase, basename, "%s_sliced-out.cms"%basename)) == False: 73 | #Get total number of frames in trajectory 74 | total_frames = count_frames(trj_path) 75 | 76 | #Check if user wants to remove frames from end of trajectory 77 | if args.slice_end == None: 78 | #If not, set variable to total number of frames 79 | args.slice_end = total_frames 80 | 81 | #Prepare slice command 82 | trj_slice = [run_cmd, "trj_merge.py", "-s", "%s:%s:1"%(args.slice_start, args.slice_end), "-o", "%s_sliced"%basename, cms_path, trj_path] 83 | 84 | #Capture current step 85 | logger.info("Removing frames from trajectory: %s"%' '.join(trj_slice)) 86 | 87 | #Run trajectory slicing 88 | run_job(trj_slice) 89 | 90 | #Slice has been done before 91 | else: 92 | #Capture current step 93 | logger.info("Sliced trajectory already found. Skipping slice.") 94 | 95 | #No slice desired 96 | else: 97 | #Capture current step 98 | logger.info("Not removing frames from trajectory") 99 | 100 | if __name__ == '__main__': 101 | main(SCHRODINGER, rep, master_dir, args) -------------------------------------------------------------------------------- /mdml/README.md: -------------------------------------------------------------------------------- 1 | # MDML 2 | 3 | Train machine learning models to predict potency from SimFPs and automatically identify 4 | important interactions via feature importance. 5 | 6 | ## Installation 7 | 8 | Build and activate a python 3 environment. 9 | 10 | ``` 11 | conda create -n mdml python=3.10 12 | conda activate mdml 13 | ``` 14 | 15 | Install `mdml`. 16 | 17 | ``` 18 | git clone git@github.com:brueckna2020/MDFit.git 19 | cd MDFit/mdml 20 | pip install . 21 | ``` 22 | 23 | ## Command Line Interface 24 | 25 | After installation the `mdml_train` and `mdml_predict` command line scripts will be executable 26 | in the `mdml` environment. 
27 | 28 | ### Train 29 | 30 | Use pre-computed SimFPs to train regression models, evaluate performance using nested leave-one- 31 | molecule-out cross-validation, and identify important interactions via feature importance. This 32 | script generates a directory with the following results files. 33 | - `model.pkl`: A model trained using all SimFP data. 34 | - `cross_validation.json`: A summary of cross-validation results including predictions, observations, and feature importances from each fold and LOMO-CV metrics (computed using all folds). 35 | - `cross_validation.svg`: A parity plot showing the LOMO-CV performance with metrics computed using the average predictions from SimFPs from different simulations for each molecule. 36 | - `importance.csv`: The feature importance computed using the specified model type (e.g., weights for linear and node impurity for tree-based regressions). 37 | 38 | ``` 39 | usage: mdml_train [-h] [-debug] [-nproc NPROC] -target_col TARGET_COL -id_col ID_COL 40 | [-drop_col DROP_COL [DROP_COL ...]] [-group GROUP] [-model_type MODEL_TYPE] 41 | [-cv CV] [-lofo] [-nested] 42 | input output 43 | 44 | Train regression models, evaluate performance using nested leave-one-molecule-out cross- 45 | validation, and identify important interactions via feature importance. 46 | 47 | HELP: 48 | -h Show this help message and exit. 49 | -debug Print debugging messages. (default: False) 50 | 51 | COMPUTATION: 52 | -nproc NPROC Number of processors to use for parallel computation. (default: 1) 53 | 54 | DATA: 55 | input Path to CSV containing SimFP features, IDs, and target (optional). 56 | output Directory path to save output files. 57 | -target_col TARGET_COL 58 | Header of target column in training data CSV. (default: None) 59 | -id_col ID_COL Header of compound ID column. (default: None) 60 | -drop_col DROP_COL [DROP_COL ...] 61 | Columns that should be removed from the training CSV. (default: []) 62 | -group GROUP Grouping method. 
The options include 'mean', 'min', and 'max'. Data will 63 | be grouped by compound ID (id_col). (default: None) 64 | 65 | MODEL: 66 | -model_type MODEL_TYPE 67 | Regression model type. The options are 'linear', 'ridge', 'lasso', 68 | 'random_forest', and 'gradient_boosting'. (default: linear) 69 | -cv CV Number of cross-validation folds to use in hyperparameter tuning. Specify 70 | -1 for leave-one-out. (default: -1) 71 | -lofo Run leave-one-feature-out cross-validation analysis. (default: False) 72 | -nested Toggle nested cross-validation. Hyperparamters will NOT be optimized in 73 | each fold. (default: True) 74 | ``` 75 | 76 | ### Predict 77 | 78 | ``` 79 | usage: mdml_predict [-h] [-debug] -model MODEL -id_col ID_COL [-drop_col DROP_COL [DROP_COL ...]] 80 | [-group GROUP] 81 | input output 82 | 83 | Make predictions using trained MD simulation fingerprint models. 84 | 85 | HELP: 86 | -h Show this help message and exit. 87 | -debug Print debugging messages. (default: False) 88 | 89 | DATA: 90 | input Path to CSV containing features and molecule IDs. 91 | output Path to save prediction CSV. 92 | -model MODEL Directory containing trained ML model (default: None) 93 | -id_col ID_COL Header of compound ID column. (default: None) 94 | -drop_col DROP_COL [DROP_COL ...] 95 | Columns that should be removed from the input feature CSV. (default: []) 96 | -group GROUP Grouping method. The options include 'mean', 'min', and 'max'. Data will 97 | be grouped by compound ID (id_col). 
(default: None) 98 | ``` 99 | 100 | -------------------------------------------------------------------------------- /bin/mdfit_combine_csvs.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import os 11 | import pandas as pd 12 | import glob 13 | 14 | ###Initiate logger### 15 | logger = logging.getLogger(__name__) 16 | 17 | def generate_master_simfp(master_dir, df_simfp): 18 | #Initiate empty list in case files do not exist 19 | simfp_scratch_files = [] 20 | simfp_files = [] 21 | 22 | #Get paths to individual SimFP files in scratch directory 23 | simfp_scratch_files = glob.glob(os.path.join(master_dir, "desmond_md_analysis", "scratch", "*SimFP.csv")) 24 | 25 | #Get paths to individual SimFP files in permanent directories 26 | simfp_files = glob.glob(os.path.join(master_dir, "desmond_md_analysis", "*", "*repetition*", "*SimFP.csv")) 27 | 28 | #Iterate over files in the scratch directory 29 | for file in simfp_scratch_files: 30 | #Add to list of files in permanent directories 31 | simfp_files.append(file) 32 | 33 | #Iterate over each file 34 | for file in simfp_files: 35 | #Read file into temporary dataframe 36 | df_temp = pd.read_csv(file) 37 | 38 | #Append ligand SimFP to master SimFP dataframe 39 | df_simfp = df_simfp.append(df_temp, ignore_index=True) 40 | 41 | #Sort master SimFP dataframe by molecule name and replace "NaN" values with zeros 42 | df_simfp = df_simfp.sort_values(by=['Molecule', 'Repetition']).fillna("0.0000") 43 | 44 | #Remove duplicate ligand entires (if user re-runs analysis) 45 | no_duplicates = df_simfp.drop_duplicates() 46 | 47 | 
#Return master SimFP dataframe 48 | return no_duplicates 49 | 50 | def generate_master_compat(master_dir, df_compat): 51 | #Initiate empty list in case files do not exist 52 | compat_scratch_files = [] 53 | compat_files = [] 54 | 55 | #Get paths to individual compatibility files in scratch directory 56 | compat_scratch_files = glob.glob(os.path.join(master_dir, "desmond_md_analysis", "scratch", "*compatibility.csv")) 57 | 58 | #Get paths to individual compatibility files in permanent directories 59 | compat_files = glob.glob(os.path.join(master_dir, "desmond_md_analysis", "*", "*repetition*", "*compatibility.csv")) 60 | 61 | #Iterate over files in the scratch directory 62 | for file in compat_scratch_files: 63 | #Add to list of files in permanent directories 64 | compat_files.append(file) 65 | 66 | #Iterate over each file 67 | for file in compat_files: 68 | #Read file into temporary dataframe 69 | df_temp = pd.read_csv(file) 70 | 71 | #Append ligand compatibility to master compatibility dataframe (pd.concat: DataFrame.append was removed in pandas 2.0) 72 | df_compat = pd.concat([df_compat, df_temp], ignore_index=True) 73 | 74 | #Sort master compatibility dataframe by molecule name and replace "NaN" values with zeros 75 | df_compat = df_compat.sort_values(by=['Molecule', 'Repetition']).fillna("0.0000") 76 | 77 | #Remove duplicate ligand entries (if user re-runs analysis) 78 | no_duplicates = df_compat.drop_duplicates() 79 | 80 | #Return master compatibility dataframe 81 | return no_duplicates 82 | 83 | def main(master_dir): 84 | #Initiate master SimFP dataframe 85 | df_simfp = pd.DataFrame() 86 | 87 | #Initiate master compatibility dataframe 88 | df_compat = pd.DataFrame() 89 | 90 | #Generate final SimFP dataframe 91 | df_simfp_final = generate_master_simfp(master_dir, df_simfp) 92 | 93 | #Generate final compatibility dataframe 94 | df_compat_final = generate_master_compat(master_dir, df_compat) 95 | 96 | #Write final SimFP dataframe to "MDFit_SimFPs.csv" file in desmond_md_analysis 97 |
df_simfp_final.to_csv(os.path.join(master_dir, "desmond_md_analysis", "MDFit_SimFPs.csv"), index=False) 98 | 99 | #Write final compatibility dataframe to "MDFit_Compatibility.csv" file in desmond_md_analysis 100 | df_compat_final.to_csv(os.path.join(master_dir, "desmond_md_analysis", "MDFit_Compatibility.csv"), index=False) 101 | 102 | #Document current step 103 | logger.info("SimFPs are captured in %s"%os.path.join(master_dir, "desmond_md_analysis", "MDFit_SimFPs.csv")) 104 | 105 | #Document current step 106 | logger.info("Compatibility metrics are captured in %s"%os.path.join(master_dir, "desmond_md_analysis", "MDFit_Compatibility.csv")) 107 | 108 | if __name__ == '__main__': 109 | main(master_dir) -------------------------------------------------------------------------------- /bin/mdfit_prep_complex.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import os 11 | import shutil 12 | import subprocess 13 | 14 | ###Initiate logger### 15 | logger = logging.getLogger(__name__) 16 | 17 | def run_job(command): 18 | #Run provided command, joining list with space. 
Pipe stdout and sdterror to log file 19 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 20 | stderr=subprocess.STDOUT, shell=True, text=True) 21 | 22 | #Iterate over sdtout and sdterror 23 | for line in process.stdout.split('\n'): 24 | #Ignore blank lines 25 | if line != "": 26 | #Ignore ExitStatus 27 | if "ExitStatus" not in line: 28 | #Write to log file for debugging 29 | logger.debug(line) 30 | 31 | def main(SCHRODINGER, ligpath, ligname, i, master_dir, args): 32 | #Prepare Schrodinger's structure subset command ($SCHRODINGER/utilities/structsubset) 33 | run_cmd = os.path.join(SCHRODINGER, 'utilities', 'structsubset') 34 | 35 | #Prepare Schrodinger's structure concatination command ($SCHRODINGER/utilities/structcat) 36 | structcat = os.path.join(SCHRODINGER, 'utilities', 'structcat') 37 | 38 | #Prepare Schrodinger's structure run command ($SCHRODINGER/run) 39 | schrun = os.path.join(SCHRODINGER, 'run') 40 | 41 | #Generate pose viewer filename 42 | pvcomplex = "%s_pv.mae"%ligname 43 | 44 | #Generate complex output filename 45 | outname = "%s_out_complex.mae"%ligname 46 | 47 | #Check if pose viewer file exists 48 | if os.path.isfile(os.path.join(master_dir, "desmond_md", ligname, "md_setup", pvcomplex)) == False: 49 | #If not, check if ligand sdf file exists and ligands are not precomplexed with protein 50 | if os.path.isfile("%s.sdf"%ligname) == False and not args.precomplex: 51 | #If not, prepare structure subset command (extract ligand from library) 52 | command = [run_cmd, '-n', str(i+1), ligpath, '%s.sdf'%ligname] 53 | 54 | #Document current step 55 | logger.info("Getting ligand: %s"%' '.join(command)) 56 | 57 | #Run structure subset (extract ligand from library) 58 | run_job(command) 59 | 60 | #Ligands are precomplexed with protein 61 | else: 62 | #If not, prepare structure subset command (extract ligand from library) 63 | command = [run_cmd, '-n', str(i+1), ligpath, '%s.mae'%ligname] 64 | 65 | #Document current step 66 | 
logger.info("Getting ligand: %s"%' '.join(command)) 67 | 68 | #Run structure subset (extract ligand from library) 69 | run_job(command) 70 | 71 | 72 | #Check if protein and ligand are pre-complexed 73 | if not args.precomplex: 74 | #If not, generate path to protein file 75 | protein_path = os.path.join(master_dir, args.prot) 76 | 77 | #Prepare structure concatination command (combine protein and ligand files) 78 | command1 = [structcat, "-i", protein_path, "%s.sdf"%ligname, "-o", pvcomplex] 79 | 80 | #Capture current step 81 | logger.info("Complexing protein and ligand: %s"%' '.join(command1)) 82 | 83 | #Prepare pose viewer command 84 | command2 = [schrun, "pv_convert.py", "-mode", "merge", pvcomplex] 85 | 86 | #Capture current step 87 | logger.info("Merging protein and ligand: %s"%' '.join(command2)) 88 | 89 | #Run concatination command 90 | run_job(command1) 91 | 92 | #Run pose viewer command 93 | run_job(command2) 94 | 95 | #Rename auto-generated output complex name to desired filename ("-out" > "_out") 96 | os.rename("%s-out_complex.mae"%ligname,outname) 97 | 98 | #Protein and ligand are pre-complexed 99 | else: 100 | #Prepare structure concatination command (translate sdf to mae) 101 | command1 = [structcat, "-i", "%s.mae"%ligname, "-o", pvcomplex] 102 | 103 | #Capture current step 104 | logger.info("Complexing protein and ligand: %s"%' '.join(command1)) 105 | 106 | #Prepare pose viewer command 107 | command2 = [schrun, "pv_convert.py", "-mode", "merge", pvcomplex] 108 | 109 | #Capture current step 110 | logger.info("Merging protein and ligand: %s"%' '.join(command2)) 111 | 112 | #Run concatination command 113 | run_job(command1) 114 | 115 | #Run pose viewer command 116 | run_job(command2) 117 | 118 | #Copy pose viewer complex to desired filename 119 | shutil.copy(pvcomplex,outname) 120 | 121 | #Pose viewer file exists 122 | else: 123 | #Capture current step 124 | logger.info("Complex file found: %s"%pvcomplex) 125 | 126 | #Return pose viewer filename 127 | 
return pvcomplex 128 | 129 | if __name__ == '__main__': 130 | main(SCHRODINGER, ligpath, ligname, i, master_dir, args) -------------------------------------------------------------------------------- /mdml/bin/mdml_train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Train MD Simulation Fingerprint Models 4 | -------------------------------------- 5 | Train simulation fingerprint models, evaluate performance via nested leave-one 6 | -molecule-out cross-validation, and identify important interactions via 7 | feature importance. 8 | 9 | @author: Benjamin Shields 10 | @email: benjamin.shields@bms.com 11 | """ 12 | 13 | ############################################################################## Imports 14 | 15 | import logging 16 | import os 17 | import pandas as pd 18 | 19 | from mdml import model, cli, plot 20 | 21 | ############################################################################## Setup 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | MODELS = { 26 | 'linear':model.LinearRegression, 27 | 'ridge':model.RidgeRegression, 28 | 'lasso':model.LassoRegression, 29 | 'random_forest':model.RandomForestRegression, 30 | 'gradient_boosting':model.GradientBoostingRegression 31 | } 32 | 33 | ############################################################################## Interface 34 | 35 | def get_parser(): 36 | description = """Train regression models, evaluate performance using 37 | nested leave-one-molecule-out cross-validation, and identify important 38 | interactions via feature importance.""" 39 | parser, groups = cli.parser(description, add_computation=True) 40 | 41 | # Input/Output Data 42 | data = parser.add_argument_group("DATA") 43 | data.add_argument( 44 | 'input', 45 | help="Path to CSV containing SimFP features, IDs, and target (optional)." 46 | ) 47 | data.add_argument( 48 | 'output', 49 | help="Directory path to save output files." 
50 | ) 51 | data.add_argument( 52 | '-target_col', 53 | type=str, 54 | default=None, 55 | help="Header of target column in training data CSV.", 56 | required=True 57 | ) 58 | data.add_argument( 59 | '-id_col', 60 | type=str, 61 | default=None, 62 | help="Header of compound ID column.", 63 | required=True 64 | ) 65 | data.add_argument( 66 | '-drop_col', 67 | nargs='+', 68 | type=str, 69 | default=[], 70 | help="Columns that should be removed from the training CSV." 71 | ) 72 | data.add_argument( 73 | '-group', 74 | type=str, 75 | default=None, 76 | help="""Grouping method. The options include 'mean', 'min', and 'max'. 77 | Data will be grouped by compound ID (id_col).""" 78 | ) 79 | 80 | # Model Details 81 | model = parser.add_argument_group("MODEL") 82 | model.add_argument( 83 | '-model_type', 84 | type=str, 85 | default='linear', 86 | help="""Regression model type. The options are 'linear', 'ridge', 'lasso', 87 | 'random_forest', and 'gradient_boosting'.""" 88 | ) 89 | model.add_argument( 90 | '-cv', 91 | type=int, 92 | default=-1, 93 | help="""Number of cross-validation folds to use in hyperparameter 94 | tuning. Specify -1 for leave-one-out.""" 95 | ) 96 | model.add_argument( 97 | '-lofo', 98 | action='store_true', 99 | help="""Run leave-one-feature-out cross-validation analysis.""" 100 | ) 101 | model.add_argument( 102 | '-nested', 103 | dest='nested', 104 | action='store_false', 105 | help="""Toggle nested cross-validation. 
Hyperparamters will NOT be 106 | optimized in each fold.""" 107 | ) 108 | 109 | return parser 110 | 111 | ############################################################################## Main 112 | 113 | def main(args): 114 | # Details 115 | logger.debug(f'Workflow set to nested={args.nested}') 116 | 117 | # Load and preprocess data 118 | logger.debug(f'Loading data: {args.input}') 119 | data = cli.load_data( 120 | args.input, args.id_col, drop=args.drop_col, aggregation=args.group 121 | ) 122 | logger.debug(f'Dataset contains {len(data)} entries') 123 | 124 | # Output directory 125 | if not os.path.isdir(args.output): 126 | os.mkdir(args.output) 127 | 128 | # Build and fit model 129 | logger.debug('Building initial model') 130 | workflow = MODELS[args.model_type](data, args.target_col) 131 | workflow.fit(n_jobs=args.nproc, cv=args.cv) 132 | imp = workflow.feature_importance() 133 | 134 | # Save model 135 | path = os.path.join(args.output, 'model.pkl') 136 | logger.debug(f'Saving model to {path}') 137 | workflow.__save__(path) 138 | path = os.path.join(args.output, 'importance.csv') 139 | logger.debug(f'Saving feature importance to {path}') 140 | imp.to_csv(path) 141 | 142 | # Run (possibly nested) cross-validation 143 | logger.debug('Running cross-validation') 144 | cv = workflow.cross_validate( 145 | nested=args.nested, cv=args.cv, n_jobs=args.nproc 146 | ) 147 | path = os.path.join(args.output, 'cross_validation') 148 | logger.debug( 149 | f'Saving cross-validation results to {path}.json and {path}.svg' 150 | ) 151 | cli.save_json(cv, f'{path}.json') 152 | cv_df = pd.DataFrame() 153 | cv_df['ID'] = cv['ids'] 154 | cv_df['pred'] = cv['pred'] 155 | cv_df['obs'] = cv['obs'] 156 | cv_df = cv_df.groupby('ID').mean() 157 | plot.parity_plot( 158 | cv_df['pred'], cv_df['obs'], title='Cross-Validation', cod='Q^2', 159 | export_path=path 160 | ) 161 | 162 | # Run leave-one-feature-out CV 163 | if args.lofo: 164 | logger.debug('Running leave-one-feature-out 
cross-validation') 165 | lofo = model.leave_one_feature_out_importance( 166 | workflow, nested=args.nested, cv=args.cv, n_jobs=args.nproc 167 | ) 168 | path = os.path.join(args.output, 'lofo_cross_validation.json') 169 | logger.debug( 170 | f'Saving lofo cross-validation results to {path}' 171 | ) 172 | cli.save_json(lofo, f'{path}.json') 173 | 174 | 175 | if __name__ == "__main__": 176 | parser = get_parser() 177 | args = parser.parse_args() 178 | if args.debug: 179 | logging.basicConfig(level=logging.DEBUG) 180 | main(args) 181 | 182 | -------------------------------------------------------------------------------- /bin/mdfit_initiate.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | import subprocess 13 | 14 | #Import Schrodinger modules 15 | from schrodinger import structure 16 | 17 | ###Initiate logger### 18 | logger = logging.getLogger(__name__) 19 | 20 | def run_job(command): 21 | #Run provided command, joining list with space. 
Pipe stdout and sdterror to log file 22 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 23 | stderr=subprocess.STDOUT, shell=True, text=True) 24 | 25 | #Iterate over sdtout and sdterror 26 | for line in process.stdout.split('\n'): 27 | #Ignore blank lines 28 | if line != "": 29 | #Ignore ExitStatus 30 | if "ExitStatus" not in line: 31 | #Write to log file for debugging 32 | logger.debug(line) 33 | 34 | def filecheck(args, master_dir): 35 | #Document current step 36 | logger.info('Checking working directory for provided files: %s'%master_dir) 37 | 38 | #Check file existance, if provided by user 39 | #Generate list with all possible files 40 | filenames=[args.prot, args.liglib, args.precomplex] 41 | 42 | #Iterate over list of files 43 | for file in filenames: 44 | #Check if filename was provided by user 45 | if file != None: 46 | #If filename provided, check it exists 47 | exists = os.path.exists(file) 48 | 49 | #If it does not exist 50 | if exists == False: 51 | #Log error 52 | logger.critical("%s is not accessible or does not exist; "\ 53 | "cannot proceed"%file) 54 | 55 | #Exit 56 | sys.exit() 57 | 58 | #Document current step 59 | logger.info("All provided filenames were successfully located") 60 | 61 | def set_vars(args, master_dir, SCHRODINGER): 62 | #Get file extension for ligand library (e.g., ".sdf") 63 | if args.liglib and not args.precomplex: 64 | ligfiletype = os.path.splitext(args.liglib)[-1].lower() 65 | elif args.precomplex: 66 | ligfiletype = os.path.splitext(args.precomplex)[-1].lower() 67 | 68 | #Get filename, extension removed (e.g., "ligand_library") 69 | if args.liglib: 70 | ligfileprefix = os.path.splitext(args.liglib)[0] 71 | elif args.precomplex: 72 | ligfileprefix = os.path.splitext(args.precomplex)[0] 73 | 74 | #Get file extension for protein (e.g., ".mae") 75 | if args.prot: 76 | protfiletype = os.path.splitext(args.prot)[-1].lower() 77 | else: 78 | protfiletype = ".mae" 79 | 80 | #Check if ligand library extension is 
mae or sdf 81 | if ligfiletype != ".sdf" and ligfiletype != ".mae": 82 | #If not, log error 83 | logger.critical("%s must be in mae or sdf format; cannot proceed"%args.liglib) 84 | 85 | #Exit 86 | sys.exit(1) 87 | 88 | #Check if protein extension is mae 89 | if protfiletype != ".mae" and not args.precomplex: 90 | #If not, log error 91 | logger.critical("%s must be in mae format; cannot proceed"%args.prot) 92 | 93 | #Exit 94 | sys.exit(1) 95 | 96 | #Check if ligand library extension is mae and convert to SDF for downstream compatibility and non-Schrodinger MD engines 97 | if ligfiletype == ".mae" and not args.precomplex: 98 | #Set up Schrodinger run command ($SCHRODINGER/utilities/structconvert) 99 | run_cmd = os.path.join(SCHRODINGER, 'utilities', 'structconvert') 100 | 101 | #Prepare full command to convert mae file to sdf 102 | command = [run_cmd, args.liglib, "%s.sdf"%ligfileprefix] 103 | 104 | #Run the command using subprocess 105 | run_job(command) 106 | 107 | #Change filetype to sdf 108 | ligfiletype = ".sdf" 109 | 110 | #Return ligand library extension, ligand library name, and protein extension 111 | return ligfiletype, ligfileprefix, protfiletype 112 | 113 | def count_ligs(args, ligfiletype, maxliglimit): 114 | #Initiate variable 115 | nlig = 0 116 | 117 | #Use StructureReader to iterate through ligands 118 | if args.liglib and not args.precomplex: 119 | for s in structure.StructureReader(args.liglib): 120 | #Increment nlig for each ligand 121 | nlig+=1 122 | elif args.precomplex: 123 | for s in structure.StructureReader(args.precomplex): 124 | #Increment nlig for each ligand 125 | nlig+=1 126 | 127 | #If ligands are not found 128 | if nlig == 0: 129 | #Log error 130 | logger.critical("No ligands captured. 
Please check input file.") 131 | 132 | #Exit 133 | sys.exit(1) 134 | 135 | #Ligands must have been found 136 | else: 137 | #Document current step 138 | logger.info("Number of ligands in library = %s"%nlig) 139 | 140 | #Check that the number of ligands is less than the max limit for MD 141 | if nlig > maxliglimit and args.skip_md == False: 142 | #If true, log error 143 | logger.critical("Number of ligands in library (%s) exceeds the "\ 144 | "allowed limit (%s); cannot proceed" % (nlig, maxliglimit)) 145 | 146 | #Exit 147 | sys.exit(1) 148 | 149 | #Return number of ligands 150 | return nlig 151 | 152 | def main(args, master_dir, maxliglimit, SCHRODINGER): 153 | #Check if files exist and dependencies are met 154 | filecheck(args, master_dir) 155 | 156 | #Check file extensions 157 | ligfiletype, ligfileprefix, protfiletype = set_vars(args, master_dir, SCHRODINGER) 158 | 159 | #Count number of ligands in ligand library 160 | nlig = count_ligs(args, ligfiletype, maxliglimit) 161 | 162 | #Return ligand library extension, ligand library name, protein extension, and number of ligands 163 | return ligfiletype, ligfileprefix, protfiletype, nlig 164 | 165 | if __name__ == '__main__': 166 | main(args, master_dir, maxliglimit, SCHRODINGER) 167 | -------------------------------------------------------------------------------- /MDFit.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | 13 | #Get MDFit installation path 14 | MDFit_path = os.path.dirname(__file__) 15 | 16 | #Add MDFit bin directory to path 17 | sys.path.insert(0, 
os.path.join(MDFit_path, 'bin')) 18 | 19 | #Import MDFit modules 20 | import mdfit_read_params 21 | import mdfit_parseargs 22 | import mdfit_initiate 23 | import mdfit_ffbuilder 24 | import mdfit_desmond_md 25 | import mdfit_desmond_analysis 26 | 27 | #Generate path to template directory 28 | template_dir = os.path.join(MDFit_path, 'templates') 29 | 30 | #Get path to user directory 31 | master_dir = os.getcwd() 32 | 33 | #Get Schrodinger environmental variable 34 | SCHRODINGER = os.getenv('SCHRODINGER') 35 | 36 | #Get Schrodinger release version. Assumes pathname has version 37 | #E.g., "/schrodinger/2023-2/" 38 | schrodinger_version = os.path.basename(SCHRODINGER) 39 | 40 | #Get home path environmental variable 41 | homepath = os.getenv('HOME') 42 | 43 | #Initiate logger 44 | logger = logging.getLogger() 45 | 46 | #Point logger to file 47 | fh = logging.FileHandler(os.path.join(master_dir, 'MDFit.log')) 48 | 49 | #Create writing format for logging 50 | fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) 51 | 52 | #Add to logger 53 | logger.addHandler(fh) 54 | 55 | #Set logger default to debug 56 | logger.setLevel(logging.DEBUG) 57 | 58 | def read_json(MDFit_path): 59 | #Read in instiutional parameters ("inst params") 60 | #Hostnames, maximum number of ligands, number of processors for FFBuilder 61 | #Calls mdfit_read_params.py 62 | inst_params = mdfit_read_params.main(MDFit_path) 63 | 64 | #Return parameters 65 | return inst_params 66 | 67 | def parseargs(master_dir, homepath): 68 | #Get user flags and options 69 | #Calls mdfit_parseargs.py 70 | args = mdfit_parseargs.main(master_dir, homepath) 71 | 72 | #Set logger level based on user input 73 | logger.setLevel(args.loglevel) 74 | 75 | #Print arguments for future reference 76 | logger.info('Parsed arguments: %s', args) 77 | 78 | #Return arguments 79 | return args 80 | 81 | def initiate_mdfit(SCHRODINGER, args, master_dir, maxliglimit): 82 | #Document current step 83 | 
logger.info("Initiating MDFit...") 84 | 85 | #Calls mdfit_initiate.py 86 | ligfiletype, ligfileprefix, protfiletype, nlig = mdfit_initiate.main(args, \ 87 | master_dir, maxliglimit, SCHRODINGER) 88 | 89 | #Document current step 90 | logger.info("Completed initiation of MDFit") 91 | 92 | #Return arguments 93 | return ligfiletype, ligfileprefix, protfiletype, nlig 94 | 95 | def run_ffbuilder(args, master_dir, SCHRODINGER, ligpath, ligfileprefix, schrodinger_version, inst_params, homepath): 96 | #Check if user wants FFBuilder 97 | if args.skip_ff == False: 98 | #If they do, document current step 99 | logger.info("Initiating FFBuilder...") 100 | 101 | #Calls mdfit_ffbuilder.py 102 | mdfit_ffbuilder.main(args, master_dir, SCHRODINGER, ligpath, \ 103 | ligfileprefix, schrodinger_version, inst_params, homepath) 104 | 105 | #Document current step 106 | logger.info("Completed FFBuilder") 107 | 108 | #User requests to skip FFBuilder 109 | else: 110 | #Document current step 111 | logger.info("Skipping FFBuilder") 112 | 113 | #Useful to check that ligand file is correctly assigned 114 | logger.debug("Current ligand file = %s"%ligpath) 115 | 116 | def run_md(args, master_dir, ligfileprefix, SCHRODINGER, ligpath, template_dir, inst_params): 117 | #Check if user wants Desmond MD 118 | if args.skip_md == False: 119 | #If they do, document current step 120 | logger.info("Initiating Desmond MD...") 121 | 122 | #Calls mdfit_desmond_md.py 123 | mdfit_desmond_md.main(args, master_dir, ligfileprefix, SCHRODINGER, ligpath, template_dir, inst_params) 124 | 125 | #Document current step 126 | logger.info("Completed Desmond MD") 127 | 128 | #User requests to skip Desmond MD 129 | else: 130 | #Document current step 131 | logger.info("Skipping Desmond MD") 132 | 133 | def run_analysis(args, master_dir, SCHRODINGER, inst_params): 134 | #Check if user wants Desmond MD analysis 135 | if args.skip_analysis == False: 136 | #If they do, document current step 137 | logger.info("Initiating MD 
analysis...") 138 | 139 | #Calls mdfit_desmond_analysis.py 140 | mdfit_desmond_analysis.main(args, master_dir, SCHRODINGER, inst_params) 141 | 142 | #Document current step 143 | logger.info("Completed MD analysis") 144 | 145 | #User requests to skip Desmond MD analysis 146 | else: 147 | #Document current step 148 | logger.info("Skipping MD analysis") 149 | 150 | def main(): 151 | #Get institution parameters from json file (hostnames, max number of ligs, etc.) 152 | inst_params = read_json(MDFit_path) 153 | 154 | #Get user flags and options 155 | args = parseargs(master_dir, homepath) 156 | 157 | #Get maximum number of ligands from json file 158 | maxliglimit = inst_params["parameters"]["MAXLIGS"] 159 | 160 | #Check file viability, flag compatibility, etc. 161 | ligfiletype, ligfileprefix, protfiletype, nlig = initiate_mdfit(SCHRODINGER, args, master_dir, maxliglimit) 162 | 163 | #Generate path to ligand library 164 | if args.liglib and not args.precomplex: 165 | ligpath = os.path.join(master_dir, "%s.sdf"%ligfileprefix) 166 | elif args.precomplex: 167 | ligpath = os.path.join(master_dir, args.precomplex) 168 | 169 | #Run FFBuilder, if requested 170 | run_ffbuilder(args, master_dir, SCHRODINGER, ligpath, \ 171 | ligfileprefix, schrodinger_version, inst_params, homepath) 172 | 173 | #Run Desmond MD, if requested 174 | run_md(args, master_dir, ligfileprefix, SCHRODINGER, ligpath, template_dir, inst_params) 175 | 176 | #Analyze Desmond trajectories, if requested 177 | run_analysis(args, master_dir, SCHRODINGER, inst_params) 178 | 179 | if __name__ == '__main__': 180 | main() -------------------------------------------------------------------------------- /mdml/mdml/cli.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Command Line Interface 4 | ---------------------- 5 | A basic interface for MDML scripts. 
6 | 7 | @author: Benjamin Shields 8 | @email: benjamin.shields@bms.com 9 | """ 10 | 11 | ############################################################################## Imports 12 | 13 | import logging 14 | import pandas as pd 15 | import json 16 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 17 | from pandas.core.frame import DataFrame 18 | 19 | ############################################################################## Logger 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | ############################################################################## Interface 24 | 25 | def parser(description:str, add_computation:bool=False, **kwargs) -> (ArgumentParser, dict): 26 | """ 27 | Generate a formatted argparse parser. 28 | 29 | Parameters 30 | ---------- 31 | description : str 32 | CLI description. 33 | add_computation : bool, optional 34 | Add a computation parameters group including nproc. The default is True. 35 | **kwargs 36 | Keyword arguments passed to argparse.ArgumentParser. 37 | 38 | Returns 39 | ------- 40 | parser : ArgumentParser 41 | Argument parser. 42 | groups : dict 43 | Pre-generated argument groups. 
44 | """ 45 | 46 | # Formatted parser 47 | class ArgparseFormatter(ArgumentDefaultsHelpFormatter): 48 | pass 49 | 50 | parser = ArgumentParser( 51 | description=description, 52 | add_help=False, 53 | formatter_class=lambda prog: ArgparseFormatter(prog, width=98), 54 | **kwargs 55 | ) 56 | groups = {} 57 | 58 | # Help 59 | helper = parser.add_argument_group('HELP') 60 | helper.add_argument( 61 | '-h', 62 | action='help', 63 | help="""Show this help message and exit.""" 64 | ) 65 | helper.add_argument( 66 | '-debug', 67 | action='store_true', 68 | dest='debug', 69 | help="""Print debugging messages.""" 70 | ) 71 | groups['help'] = helper 72 | 73 | # Resource Management 74 | if add_computation: 75 | comp = parser.add_argument_group('COMPUTATION') 76 | comp.add_argument( 77 | '-nproc', 78 | action='store', 79 | dest='nproc', 80 | type=int, 81 | default=1, 82 | help="""Number of processors to use for parallel computation.""" 83 | ) 84 | groups['computation'] = comp 85 | 86 | return parser, groups 87 | 88 | ############################################################################## Data 89 | 90 | def aggregate_data(data:DataFrame, id_col:str, method:str) -> DataFrame: 91 | """ 92 | Aggregate SimFPs by computing the mean, min, or max of duplicate MD runs. 93 | 94 | Parameters 95 | ---------- 96 | features : DataFrame 97 | Feature matrix including duplicate SimFPs. 98 | id_col : str 99 | Molecule ID column. This column is used to identify duplicate SimFPs. 100 | method : str 101 | Aggregation method. The options are 'mean', 'min', and 'max'. 102 | 103 | Raises 104 | ------ 105 | ValueError 106 | The specified method is not supported. 107 | 108 | Returns 109 | ------- 110 | data : DataFrame 111 | Aggregated features. 
112 | """ 113 | 114 | if method == 'min': 115 | data = data.groupby(id_col).min() 116 | elif method == 'max': 117 | data = data.groupby(id_col).max() 118 | elif method == 'mean': 119 | data = data.groupby(id_col).mean() 120 | else: 121 | raise ValueError(f'Grouping method {method} not recognized.') 122 | 123 | data.insert(0, 'ID', data.index.values) 124 | 125 | return data.reset_index(drop=True) 126 | 127 | def load_data(path:str, id_col:str, drop:bool=None, aggregation:str=None) -> DataFrame: 128 | """ 129 | Load SimFP and target data and preprocess it by aggregating SimFPs from 130 | duplicate MD runs. 131 | 132 | Note 133 | ---- 134 | Rows containing SimFPs from duplicate MD runs should all of the same ID and 135 | target value. Any columns not corresponding to ID, features, or target 136 | should be removed via the drop argument. 137 | 138 | Parameters 139 | ---------- 140 | path : str 141 | Path to input CSV file. 142 | id_col : str, optional 143 | Column header containing unique molecule IDs. 144 | drop : bool, optional 145 | Remove . The default is None. 146 | aggregation : str, optional 147 | Type of aggregation to use. The options are None, 'mean', 'min', and 148 | 'max'. The default is None. 149 | 150 | Raises 151 | ------ 152 | ValueError 153 | An ID column is required for aggregation. 154 | 155 | Returns 156 | ------- 157 | data : DataFrame 158 | Loaded and preprocessed data. 159 | """ 160 | 161 | data = pd.read_csv(path) 162 | if drop is not None and len(drop) > 0: 163 | data = data.drop(drop, axis=1) 164 | if aggregation is None: 165 | if id_col != 'ID': 166 | data.insert(0, 'ID', data[id_col].values) 167 | data = data.drop(id_col, axis=1) 168 | else: 169 | if id_col is None: 170 | raise ValueError('An ID column is required for aggregation.') 171 | data = aggregate_data(data, id_col, aggregation) 172 | 173 | return data 174 | 175 | def save_json(data:dict, path:str) -> None: 176 | """ 177 | Save a dictionary as a JSON file. 
178 | 179 | Parameters 180 | ---------- 181 | data : dict 182 | Dictionary to be saved. 183 | path : str 184 | Path to JSON file. 185 | 186 | Returns 187 | ------- 188 | None 189 | """ 190 | 191 | with open(path, 'w', encoding='utf-8') as file: 192 | json.dump(data, file, ensure_ascii=False, indent=4) 193 | 194 | def load_json(path:str) -> dict: 195 | """ 196 | Load a JSON file as a dictionary. 197 | 198 | Parameters 199 | ---------- 200 | path : str 201 | Path to JSON file. 202 | 203 | Returns 204 | ------- 205 | dict 206 | Loaded data. 207 | """ 208 | 209 | with open(path) as file: 210 | data = json.load(file) 211 | 212 | return data 213 | 214 | 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /bin/mdfit_event_analysis.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | import subprocess 13 | 14 | #Fixes issue with X11 forwarding 15 | os.environ['QT_QPA_PLATFORM']='offscreen' 16 | 17 | ###Initiate logger### 18 | logger = logging.getLogger(__name__) 19 | 20 | def run_job(command): 21 | #Run provided command, joining list with space. 
Pipe stdout and sdterror to log file 22 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 23 | stderr=subprocess.STDOUT, shell=True, text=True) 24 | 25 | #Iterate over sdtout and sdterror 26 | for line in process.stdout.split('\n'): 27 | #Ignore blank lines 28 | if line != "": 29 | #Ignore ExitStatus 30 | if "ExitStatus" not in line: 31 | #Write to log file for debugging 32 | logger.debug(line) 33 | 34 | def dircheck(master_dir, basename): 35 | #Generate directory name in scratch space for each ligand -repetition<#> 36 | newdir = os.path.join(master_dir, "desmond_md_analysis", "scratch", basename) 37 | 38 | #Check if directory exists 39 | if os.path.isdir(newdir) == False: 40 | #If not, make directory (recursive) 41 | os.makedirs(newdir) 42 | 43 | #Capture current step 44 | logger.info("Created directory: %s"%newdir) 45 | 46 | #Directory exists 47 | else: 48 | #Capture current step 49 | logger.info("Directory already exists: %s"%newdir) 50 | 51 | #Return ligand scratch directory path 52 | return newdir 53 | 54 | def trj_pathnames(md_path, basename): 55 | #Check if slice trajectory exists 56 | if os.path.isfile(os.path.join(md_path, "%s_sliced-out.cms"%basename)) == True: 57 | #If it does, set cms path to sliced trajectory file 58 | cms_path = os.path.join(md_path, "%s_sliced-out.cms"%basename) 59 | 60 | #If it does, set trj path to sliced trajectory directory 61 | trj_path = os.path.join(md_path, "%s_sliced_trj"%basename) 62 | 63 | #Check if unsliced trajectory does and sliced trajectory does not exist 64 | elif os.path.isfile(os.path.join(md_path, "%s_sliced-out.cms"%basename)) == False and os.path.isfile(os.path.join(md_path, "%s-out.cms"%basename)) == True: 65 | #If it does, set cms path to unsliced trajectory file 66 | cms_path = os.path.join(md_path, "%s-out.cms"%basename) 67 | 68 | #If it does, set trj path to unsliced trajectory file 69 | trj_path = os.path.join(md_path, "%s_trj"%basename) 70 | 71 | #Could not locate trajectory 72 | 
else: 73 | #Log error 74 | logger.critical("Trajectory could not be located!") 75 | 76 | #Exit 77 | sys.exit() 78 | 79 | #Return paths to trajectory files 80 | return cms_path, trj_path 81 | 82 | def gen_outname(basename): 83 | #Generate input eaf filename 84 | eaf_in = "%s-in.eaf"%basename 85 | 86 | #Generate output eaf filename 87 | eaf_out = "%s-out.eaf"%basename 88 | 89 | #Generate output pdf filename 90 | eaf_pdf = "%s_analysis.pdf"%basename 91 | 92 | #Return generated filenames 93 | return eaf_in, eaf_out, eaf_pdf 94 | 95 | 96 | def main(SCHRODINGER, rep, master_dir, args, inst_params): 97 | #Prepare analysis hostname 98 | analysis_host = inst_params["hostnames"]["ANALYSIS"] 99 | 100 | #Prepare Schrodinger run command ($SCHRODINGER/run) 101 | run_cmd = os.path.join(SCHRODINGER, 'run') 102 | 103 | #Get repetition name -repetition<#> 104 | basename = os.path.basename(rep) 105 | 106 | #Get ligand name 107 | ligbase = basename.split("_repetition")[0] 108 | 109 | #Generate pathname to trajectory files 110 | md_path = os.path.join(master_dir, "desmond_md", ligbase, basename) 111 | 112 | #Generate paths to trajectory files 113 | cms_path, trj_path = trj_pathnames(md_path, basename) 114 | 115 | #Generate filenames for event analysis 116 | eaf_in, eaf_out, eaf_pdf = gen_outname(basename) 117 | 118 | #Check if output eaf file exists in scratch and permanent space 119 | if os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", "scratch", eaf_out)) == False and os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, eaf_out)) == False: 120 | #If not, prepare directory for text data and plot files 121 | data_dir = dircheck(master_dir, basename) 122 | 123 | #Need to wrap ASL in double quotes for Schrodinger to interpret 124 | prot_ASL = '"%s"'%args.prot_ASL 125 | lig_ASL = '"%s"'%args.lig_ASL 126 | 127 | #Prepare event analysis (analyze) command 128 | event_analysis_command1 = [run_cmd, "event_analysis.py", "analyze", cms_path, "-p", 
prot_ASL, "-l", lig_ASL, "-out", basename] 129 | 130 | #Prepare simulation analysis command 131 | analyze_simulation_command=[run_cmd, "analyze_simulation.py", "-HOST", analysis_host, "-OPLSDIR", args.oplsdir, "-JOBNAME", basename, "-WAIT", cms_path, trj_path, eaf_out, eaf_in] 132 | 133 | #Prepare event analysis (report) command 134 | event_analysis_command2=[run_cmd, "event_analysis.py", "report", "-pdf", eaf_pdf, "-data", "-plots", "-data_dir", data_dir, eaf_out] 135 | 136 | #Capture current step 137 | logger.info("Generating eaf file: %s"%' '.join(event_analysis_command1)) 138 | 139 | #Run event analysis (analyze) command 140 | run_job(event_analysis_command1) 141 | 142 | #Capture current step 143 | logger.info("Running simulation analysis: %s"%' '.join(analyze_simulation_command)) 144 | 145 | #Run simulation analysis command 146 | run_job(analyze_simulation_command) 147 | 148 | #Limitation of Schrodinger's code. Cannot control output filenames and asynchronous calls clash. Forced to run serially. 149 | #Return event analysis (report) command 150 | return event_analysis_command2 151 | 152 | #Output eaf file exists 153 | else: 154 | #Check if PDF was generated 155 | if os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", "scratch", eaf_pdf)) == False and os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, eaf_pdf)) == False: 156 | #If not, prepare directory for text data and plot files 157 | data_dir = dircheck(master_dir, basename) 158 | 159 | #Prepare event analysis (report) command 160 | event_analysis_command2=[run_cmd, "event_analysis.py", "report", "-pdf", eaf_pdf, "-data", "-plots", "-data_dir", data_dir, eaf_out] 161 | 162 | #Limitation of Schrodinger's code. Cannot control output filenames and asynchronous calls clash. Forced to run serially. 
163 | #Return event analysis (report) command 164 | return event_analysis_command2 165 | 166 | #PDF was generated 167 | else: 168 | #Capture current step 169 | logger.info("eaf file found. Skipping event analysis.") 170 | 171 | #Return empty list - event analysis (report) not necessary 172 | return [] 173 | 174 | if __name__ == '__main__': 175 | main(SCHRODINGER, rep, master_dir, args, inst_params) -------------------------------------------------------------------------------- /bin/mdfit_parseargs.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import textwrap 11 | import argparse 12 | import sys 13 | import os 14 | 15 | ###Initiate logger### 16 | logger = logging.getLogger(__name__) 17 | 18 | class ArgumentParser(argparse.ArgumentParser): 19 | def error(self, message): 20 | logger.critical(message) 21 | print("An error has occurred. 
Check log file for information.") 22 | run_command(('kill','0')) 23 | 24 | """Disables prefix matching in ArgumentParser.""" 25 | def _get_option_tuples(self, option_string): 26 | """Prevent argument parsing from looking for prefix matches.""" 27 | return [] 28 | 29 | def parse_arguments(master_dir, homepath): 30 | parser = ArgumentParser(prog='MDFit', formatter_class=argparse.RawDescriptionHelpFormatter, usage='%(prog)s [options]', 31 | description=textwrap.dedent('''\ 32 | ---------------------------------------------- 33 | MDFit workflow: 34 | a) Run FFBuilder to optimize ligand parameters 35 | b) Prepare protein-ligand complexes using user-supplied protein mae file 36 | c) Solvate each protein-ligand complex 37 | d) Run Desmond MD with each of the solvated systems 38 | e) Generate SimFPs and compatibility information 39 | f) Cluster the simulation, reporting representative structures 40 | ''')) 41 | 42 | structure = parser.add_argument_group("STRUCTURE INPUT") 43 | ffbuilder = parser.add_argument_group("FFBUILDER") 44 | desmond = parser.add_argument_group("DESMOND MD") 45 | analysis = parser.add_argument_group("MD ANALYSIS") 46 | clustering = parser.add_argument_group("MD CLUSTERING") 47 | misc = parser.add_argument_group("MISCELLANEOUS") 48 | 49 | structure.add_argument('-p', '--prot', dest='prot', default=None, help='protein mae file; must also provide liglib') 50 | structure.add_argument('-l', '--liglib', dest='liglib', default=None, help='ligand library in mae or sdf format; must also provide prot') 51 | structure.add_argument('--precomplex', dest='precomplex', default=None, help='mae file with protein and ligand already complexed (e.g., crystal structure); skips FFBuilder') 52 | 53 | ffbuilder.add_argument('--skip_ff', dest='skip_ff', action='store_true', help='skip FFBuilder; default = false') 54 | ffbuilder.add_argument('-o', '--oplsdir', dest='oplsdir', default='%s/.schrodinger/opls_dir'%homepath, help='path to custom forcefield; default = 
%s/.schrodinger/opls_dir'%homepath) 55 | 56 | desmond.add_argument('--skip_md', dest='skip_md', action='store_true', help='skip MD simulation; default = false') 57 | desmond.add_argument('--solvent', dest='solvent', default='SPC', help='SPC/TIP3P; default = SPC') 58 | desmond.add_argument('-t', '--md_sim_time', dest='md_sim_time', type=float, default='2000', help='in picoseconds; default = 2000') 59 | desmond.add_argument('--md_traj_write_freq', dest='md_traj_write_freq', type=float, default='100', help='in picoseconds; default = 100') 60 | desmond.add_argument('-r', '--md_repetitions', dest='md_repetitions', type=int, default='1', help='number of MD simulations to run for each ligand, each with a different random seed; default = 1') 61 | 62 | analysis.add_argument('--skip_analysis', dest='skip_analysis', action='store_true', help='skip MD simulation analysis; default = false') 63 | analysis.add_argument('--slice_start', dest='slice_start', type=int, default=0, help='frame to start analysis. default: 0') 64 | analysis.add_argument('--slice_end', dest='slice_end', help='frame to end analysis. 
default: last frame') 65 | analysis.add_argument('--analysis_lig', dest='analysis_lig', default='all', help='name of ligand for MD analysis; default = all') 66 | analysis.add_argument('--prot_ASL', dest='prot_ASL', default='"protein"', help='ASL definition for protein; default = "protein"') 67 | analysis.add_argument('--lig_ASL', dest='lig_ASL', default='"auto"', help='ASL definition for ligands; default = "auto"') 68 | analysis.add_argument('--analysis_cutoff', dest='analysis_cutoff', type=float, default='0.0000', help='interactions above this percentage of the simulation will be recorded; default=0.0000') 69 | 70 | clustering.add_argument('--skip_cluster', dest='skip_cluster', action='store_true', help='skip trajectory clustering; default = false') 71 | clustering.add_argument('--n_clusters', dest='n_clusters', type=int, default='5', help='number of clusters to output; default = 5') 72 | clustering.add_argument('--rmsd_ASL', dest='rmsd_ASL', default='"auto"', help='ASL definition for RMSD clustering (e.g., ligand); default = "auto"') 73 | clustering.add_argument('--centering_ASL', dest='centering_ASL', default='"protein"', help='ASL definition for centering the trajectory (e.g., binding site residues); default = protein') 74 | clustering.add_argument('--parch_align_ASL', dest='parch_align_ASL', default='"protein"', help='ASL definition for alignment during parching (e.g., binding site residues); default = protein') 75 | clustering.add_argument('--parch_solv_ASL', dest='parch_solv_ASL', default='"auto"', help='ASL definition for atoms around which solvent is retained; default = "auto"') 76 | clustering.add_argument('--n_solv', dest='n_solv', type=int, default='100', help='number of solvent molecules to keep during parching; default = 100') 77 | 78 | misc.add_argument('-m', '--max_workers', dest='max_workers', type=int, default=0, help='number of workers for multitasking; default = min(32, os.cpu_count() + 4)') 79 | misc.add_argument('-d', '--debug', 
action='store_const', dest='loglevel', const=logging.DEBUG, default=logging.INFO, help='Print all debugging statements to log file') 80 | 81 | #Get all arguments and check for any unknown variables 82 | args,unknowns = parser.parse_known_args() 83 | 84 | #Check if no options passed to script 85 | if len(sys.argv)==1: 86 | #Capture error 87 | logger.critical("provide '-prot' and '-liglib' for basic functionality") 88 | 89 | #Print help statement 90 | parser.print_help() 91 | 92 | #Exit 93 | sys.exit() 94 | 95 | #Check if unknown variables passed to script 96 | if unknowns: 97 | #Document warning and ignore variables 98 | logger.warning('ignoring unrecognized arguments: %s'%unknowns) 99 | 100 | #Check if protein and ligands are pre-complexed 101 | if not args.precomplex: 102 | #If not, check if protein and ligand library files are provided 103 | if not args.prot or not args.liglib: 104 | #If not, capture error 105 | logger.critical("'-prot' and '-liglib' arguments are dependent on each other. 
Alternatively, provide '-precomplex'") 106 | 107 | #Exit 108 | sys.exit() 109 | 110 | #Protein and ligands are pre-complexed 111 | else: 112 | #Get filetype 113 | complexfiletype = os.path.splitext(args.precomplex)[-1].lower() 114 | 115 | #Check it is mae 116 | if complexfiletype == ".mae": 117 | #Skip FFBuilder 118 | args.skip_ff=True 119 | 120 | #Remove protein filename 121 | args.prot=None 122 | 123 | #Remove ligand filename 124 | args.liglib=None 125 | 126 | #Filetype is not mae 127 | else: 128 | #Capture error 129 | logger.critical("Precomplexed systems must be in MAE format") 130 | 131 | #Exit 132 | sys.exit() 133 | 134 | #Return all arguments 135 | return args 136 | 137 | def main(master_dir, homepath): 138 | #Get arguments from user or script defaults 139 | args = parse_arguments(master_dir, homepath) 140 | 141 | #Return all arguemnts 142 | return args 143 | 144 | if __name__ == '__main__': 145 | main(master_dir, homepath) 146 | -------------------------------------------------------------------------------- /bin/mdfit_cluster_traj.py: -------------------------------------------------------------------------------- 1 | #!/ap/rhel7/bin/python3.6 2 | 3 | #################################################################### 4 | # Corresponding Authors : Alexander Brueckner, Kaushik Lakkaraju ### 5 | # Contact : alexander.brueckner@bms.com, kaushik.lakkaraju@bms.com # 6 | #################################################################### 7 | 8 | #Import Python modules 9 | import logging 10 | import sys 11 | import os 12 | import subprocess 13 | import glob 14 | 15 | #Import Schrodinger modules 16 | from schrodinger import structure 17 | from schrodinger.structutils import analyze 18 | 19 | ###Initiate logger### 20 | logger = logging.getLogger(__name__) 21 | 22 | def run_job(command): 23 | #Run provided command, joining list with space. 
Pipe stdout and sdterror to log file 24 | process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \ 25 | stderr=subprocess.STDOUT, shell=True, text=True) 26 | 27 | #Iterate over sdtout and sdterror 28 | for line in process.stdout.split('\n'): 29 | #Ignore blank lines 30 | if line != "": 31 | #Ignore ExitStatus 32 | if "ExitStatus" not in line: 33 | #Write to log file for debugging 34 | logger.debug(line) 35 | 36 | def center_traj(SCHRODINGER, cms_path, trj_path, run_cmd, basename, args): 37 | #Prepare centering command 38 | command = [run_cmd, "trj_center.py", "-t", trj_path, "-asl", args.centering_ASL, cms_path, "%s_centered"%basename] 39 | 40 | #Capture current step 41 | logger.info("Centering trajectory: %s"%' '.join(command)) 42 | 43 | #Run centering command 44 | run_job(command) 45 | 46 | def lig_identifier(args, ref_path): 47 | #Read in reference structure 48 | struct = structure.StructureReader.read(ref_path) 49 | 50 | #Initiate Schrodinger's ligand searcher utility 51 | ligand_searcher = analyze.AslLigandSearcher() 52 | 53 | #Set minimum atom count 54 | ligand_searcher.min_atom_count = 15 55 | 56 | #Set maximum atom count 57 | ligand_searcher.max_atom_count = 300 58 | 59 | #Excluse ions as ligands 60 | ligand_searcher.exclude_ions = True 61 | 62 | #Include amino acids (peptides) 63 | ligand_searcher.exclude_amino_acids = False 64 | 65 | #Search reference structure given search criteria 66 | ligand_list = ligand_searcher.search(struct) 67 | 68 | #Grab most-likely ligand from all possible ligands 69 | ligand = max(ligand_list, key=lambda lig: len(lig.atom_indexes)) 70 | 71 | #Return ligand ASL 72 | return ligand 73 | 74 | def parch_traj(SCHRODINGER, ligbase, basename, args, run_cmd, center_cms, center_trj, master_dir, ref_path): 75 | #Check if parch ASL is set to default 76 | if args.parch_solv_ASL == '"auto"': 77 | #If it is, identify ligand ASL using Schrodinger's utilities 78 | ligand = lig_identifier(args, ref_path) 79 | 80 | #Set argument to 
identified ligand ASL 81 | args.parch_solv_ASL = "%s"%ligand.ligand_asl 82 | 83 | #Prepare trajectory parching command 84 | command = [run_cmd, "trj_parch.py", "-output-trajectory-format", "auto", "-ref-mae", ref_path, "-align-asl", args.parch_align_ASL, "-dew-asl", '"%s"'%args.parch_solv_ASL, "-n", str(args.n_solv), center_cms, center_trj, "%s_parched"%basename] 85 | 86 | #Capture current step 87 | logger.info("Parching trajectory: %s"%' '.join(command)) 88 | 89 | #Run parching command 90 | run_job(command) 91 | 92 | def cluster_traj(SCHRODINGER, basename, args, run_cmd, ref_path, parch_cms, parch_trj): 93 | #Check if rmsd ASL is set to default 94 | if args.rmsd_ASL == '"auto"': 95 | #If it is, identify ligand ASL using Schrodinger's utilities 96 | ligand = lig_identifier(args, ref_path) 97 | 98 | #Set argument to identified ligand ASL 99 | args.rmsd_ASL = "%s"%ligand.ligand_asl 100 | 101 | #Prepare trajectory clustering command 102 | command = [run_cmd, "trj_cluster.py", parch_cms, parch_trj, "%s_cluster"%basename, "-rmsd-asl", '"%s"'%args.rmsd_ASL, "-n", str(args.n_clusters)] 103 | 104 | #Capture current step 105 | logger.info("Clustering trajectory: %s"%' '.join(command)) 106 | 107 | #Run clustering command 108 | run_job(command) 109 | 110 | def main(SCHRODINGER, rep, master_dir, args): 111 | #Prepare Schrodinger run command ($SCHRODINGER/run) 112 | run_cmd = os.path.join(SCHRODINGER, 'run') 113 | 114 | #Generate basename -repetition<#> 115 | basename = os.path.basename(rep) 116 | 117 | #Get ligand basename 118 | ligbase = basename.split("_repetition")[0] 119 | 120 | #Generate path to MD repetition dir 121 | md_path = os.path.join(master_dir, "desmond_md", ligbase, basename) 122 | 123 | #Generate path to reference (pre-simulation) file 124 | ref_path = "%s_out_complex_min.mae"%os.path.join(master_dir, "desmond_md", ligbase, "md_setup", ligbase) 125 | 126 | #Check if slice trajectory exists 127 | if os.path.isfile(os.path.join(md_path, 
"%s_sliced-out.cms"%basename)) == True: 128 | #If it does, set cms path to slice file 129 | cms_path = os.path.join(md_path, "%s_sliced-out.cms"%basename) 130 | 131 | #If it does, set trj path to slice trajectory 132 | trj_path = os.path.join(md_path, "%s_sliced_trj"%basename) 133 | #Check if unsliced trajectory does and sliced trajectory does not exist 134 | elif os.path.isfile(os.path.join(md_path, "%s_sliced-out.cms"%basename)) == False and os.path.isfile(os.path.join(md_path, "%s-out.cms"%basename)) == True: 135 | #If it does, set cms path to unsliced trajectory 136 | cms_path = os.path.join(md_path, "%s-out.cms"%basename) 137 | 138 | #If it does, set trj path to unsliced trajectory 139 | trj_path = os.path.join(md_path, "%s_trj"%basename) 140 | 141 | #Could not find trajectory 142 | else: 143 | #Log error 144 | logger.critical("Trajectory could not be located!") 145 | 146 | #Exit 147 | sys.exit() 148 | 149 | #Check if centered trajectory exists 150 | if os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_centered-out.cms"%basename)) == False: 151 | #Center trajectory 152 | center_traj(SCHRODINGER, cms_path, trj_path, run_cmd, basename, args) 153 | 154 | #Generate path to centered trajectory file 155 | center_cms = os.path.join(master_dir, "desmond_md_analysis", "scratch", "%s_centered-out.cms"%basename) 156 | 157 | #Generate path to centered trajectory directory 158 | center_trj = os.path.join(master_dir, "desmond_md_analysis", "scratch", "%s_centered_trj"%basename) 159 | 160 | #Centered trajectory exists 161 | else: 162 | #Generate path to centered trajectory file 163 | center_cms = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_centered-out.cms"%basename) 164 | 165 | #Generate path to centered trajectory directory 166 | center_trj = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_centered_trj"%basename) 167 | 168 | #Document current step 169 | logger.info("Centered trajectory 
found: %s"%center_cms) 170 | 171 | #Check if parched trajectory exists 172 | if os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_parched-out.cms"%basename)) == False: 173 | #Parch trajectory (remove excess waters) 174 | parch_traj(SCHRODINGER, ligbase, basename, args, run_cmd, center_cms, center_trj, master_dir, ref_path) 175 | 176 | #Generate path to parched trajectory file 177 | parch_cms = os.path.join(master_dir, "desmond_md_analysis", "scratch", "%s_parched-out.cms"%basename) 178 | 179 | #Generate path to parched trajectory directory 180 | parch_trj = os.path.join(master_dir, "desmond_md_analysis", "scratch", "%s_parched_trj"%basename) 181 | 182 | #Parched trajectory does not exist 183 | else: 184 | #Generate path to parched trajectory file 185 | parch_cms = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_parched-out.cms"%basename) 186 | 187 | #Generate path to parched trajectory directory 188 | parch_trj = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_parched_trj"%basename) 189 | 190 | #Document current step 191 | logger.info("Parched trajectory found: %s"%parch_cms) 192 | 193 | #Generate list of cluster files 194 | cluster_files = glob.glob(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, "%s_cluster_0*.cms"%basename)) 195 | 196 | #Check if cluster files exist 197 | if cluster_files == []: 198 | #Cluster trajectory 199 | cluster_traj(SCHRODINGER, basename, args, run_cmd, ref_path, parch_cms, parch_trj) 200 | 201 | #Cluster files exist 202 | else: 203 | logger.info("Cluster files found for %s. 
def run_job(command):
    """Join *command* into one shell line, run it, and stream output to the log.

    stdout and stderr are merged and written line by line at debug level.
    """
    # NOTE(review): shell=True on a joined string is acceptable for these
    # internally built commands, but never feed it user-controlled input.
    result = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \
                            stderr=subprocess.STDOUT, shell=True, text=True)

    for line in result.stdout.split('\n'):
        #Skip blank lines and Schrodinger's ExitStatus chatter
        if line and "ExitStatus" not in line:
            logger.debug(line)

def prep_hostname(args, inst_params):
    """Build the FFBuilder job-server host string '<host>:<nprocessors>'."""
    #Combine configured hostname and processor count
    host = "%s:%s"%(inst_params["hostnames"]["FFBUILDER"], inst_params["parameters"]["FFPROC"])

    #Document current step
    logger.info("FFBuilder hostname is %s"%host)

    return host
def gen_opls(args, schrodinger_version):
    """Derive the custom OPLS filename and its full path for this release."""
    #FFBuilder expects underscores in the release tag (e.g. 2023_4)
    release_tag = schrodinger_version.replace("-", "_")

    #Name and full path of the release-specific force field file
    forcefieldfile = "custom_%s.opls"%release_tag
    forcefieldfilepath = os.path.join(args.oplsdir, forcefieldfile)

    return forcefieldfile, forcefieldfilepath

def ffbuilder(forcefieldfilepath, SCHRODINGER, ligfileprefix, ligpath, newdir, forcefieldfile, args, host):
    """Run Schrodinger's ffbuilder for the ligand set.

    Creates a new custom OPLS file when none exists, otherwise merges new
    parameters into the existing one. Returns the path where FFBuilder
    deposits its opls output.
    """
    #Prepare Schrodinger run command ($SCHRODINGER/ffbuilder)
    run_cmd = os.path.join(SCHRODINGER, 'ffbuilder')

    #Generate jobname using ligand file name
    jobname = "MDFit_%s"%ligfileprefix

    #Where FFBuilder will deposit its opls output
    outopls = os.path.join(newdir, "%s_oplsdir"%jobname, forcefieldfile)

    if not os.path.isfile(forcefieldfilepath):
        #First run for this release: create the custom force field from scratch
        logger.info("Custom force-field does not exist. Creating one.")

        #Make sure the destination oplsdir exists
        if not os.path.isdir(os.path.join(args.oplsdir)):
            os.makedirs(os.path.dirname(forcefieldfilepath))

        command = [run_cmd, '-HOST', host, '-JOBNAME', jobname, ligpath, '-WAIT']
    else:
        #Existing custom force field: merge the new parameters into it
        logger.info("Custom force-field found. Merging new parameters with custom force-field.")
        command = [run_cmd, '-HOST', host, '-JOBNAME', jobname, '-OPLSDIR', forcefieldfilepath, ligpath, '-WAIT']

    #Capture current step, then run FFBuilder
    logger.info("Running FFBuilder: %s"%' '.join(command))
    run_job(command)

    return outopls
If not, copy generated file to new oplsdir 151 | if os.path.isfile(forcefieldfilepath) == False: 152 | #Generate path to home oplsdir 153 | home_oplsdir = os.path.join(homepath, ".schrodinger", "opls_dir") 154 | 155 | #Get any custom opls files 156 | opls_files = glob.glob(os.path.join(home_oplsdir, "custom_*.opls")) 157 | 158 | #Try copying FFBuilder output file to custom force field path 159 | if os.path.isfile(outopls) == True: 160 | #If file was generated, capture current step 161 | logger.info("Copying custom OPLS to oplsdir: %s"%outopls) 162 | logger.info(" : %s"%forcefieldfilepath) 163 | 164 | #Copy custom force field to user-specified oplsdir 165 | shutil.copy(outopls,forcefieldfilepath) 166 | 167 | #This fails if custom force field file does not exist and FFBuilder did not generate new parameters. Try finding custom opls file in $HOME 168 | 169 | #Check if current release force field file exists in default location 170 | elif os.path.isfile(os.path.join(home_oplsdir, forcefieldfile)) == True: 171 | #Capture current step 172 | logger.info("No new parameters generated. 
Copying opls file from %s"%home_oplsdir) 173 | 174 | #Copy file 175 | shutil.copy(os.path.join(home_oplsdir, forcefieldfile),forcefieldfilepath) 176 | 177 | #See if older version exists in $HOME 178 | elif opls_files != []: 179 | #Prepare Schrodinger custom_params utiltiy 180 | custom_params = os.path.join(SCHRODINGER, "utilities", "custom_params") 181 | 182 | #Prepare command for upgrading custom parameters 183 | command = [custom_params, "upgrade", home_oplsdir] 184 | 185 | #Capture current step 186 | logger.info("Upgrading custom parameters: %s"%' '.join(command)) 187 | 188 | #Run upgrade 189 | run_job(command) 190 | 191 | #Capture current step 192 | logger.info("Copying upgraded custom parameters to desired oplsdir") 193 | 194 | #Copy upgraded opls file 195 | shutil.copy(os.path.join(home_oplsdir, forcefieldfile),forcefieldfilepath) 196 | 197 | else: 198 | #Capture error and provide some work-around 199 | logger.critical("Force field file creation failed and default opls could not be located. 
def main(args, master_dir, SCHRODINGER, ligpath, ligfileprefix, schrodinger_version, inst_params, homepath):
    """Orchestrate the FFBuilder stage: build or merge custom OPLS parameters."""
    #Honor the user's request to skip this stage entirely
    if args.skip_ff:
        logger.info("Skipping FFBuilder (--skip_ff provided by user)")
        return

    #Job-server host string, e.g. "host:nproc"
    host = prep_hostname(args, inst_params)

    #Create (or archive and recreate) the ffbuilder working directory
    newdir = dircheck(master_dir)

    #Name and path of the release-specific custom force-field file
    forcefieldfile, forcefieldfilepath = gen_opls(args, schrodinger_version)

    #Run FFBuilder itself
    outopls = ffbuilder(forcefieldfilepath, SCHRODINGER, ligfileprefix, ligpath, newdir, forcefieldfile, args, host)

    #Install the resulting parameters into the user's oplsdir
    FFcleanup(forcefieldfilepath, forcefieldfile, homepath, outopls, SCHRODINGER)

    #Leave the working directory and return to the project root
    logger.info("Changing directory to %s"%master_dir)
    os.chdir(master_dir)

if __name__ == '__main__':
    main(args, master_dir, SCHRODINGER, ligpath, ligfileprefix, schrodinger_version, inst_params, homepath)
def run_job(command):
    """Execute *command* through the shell and forward its output to the debug log."""
    #stdout and stderr are merged into one stream
    output = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \
                            stderr=subprocess.STDOUT, shell=True, text=True).stdout

    for line in output.split('\n'):
        #Drop blank lines and Schrodinger's ExitStatus chatter
        if line and "ExitStatus" not in line:
            logger.debug(line)

def dircheck(master_dir):
    """Ensure the analysis scratch directory exists, chdir into it, return its path."""
    #Generate scratch directory name
    newdir = os.path.join(master_dir, "desmond_md_analysis", "scratch")

    #Create the scratch directory on first use
    if os.path.isdir(newdir):
        logger.info("Directory already exists: %s"%newdir)
    else:
        os.makedirs(newdir)
        logger.info("Created directory: %s"%newdir)

    #Move into the scratch directory
    logger.info("Changing directory to %s"%newdir)
    os.chdir(newdir)

    return newdir
def prep_workers(args):
    """Resolve the thread-pool size for the parallel analysis phases.

    When args.max_workers is 0 the ThreadPoolExecutor-style default
    min(32, cpu_count + 4) is used; otherwise the user's value wins.
    """
    #Check if user provided a number of workers
    if args.max_workers == 0:
        #os.cpu_count() can return None when the count is undeterminable;
        #fall back to 1 so the arithmetic never raises a TypeError
        workers = min(32, (os.cpu_count() or 1) + 4)

    #User provided number of workers
    else:
        workers = args.max_workers

    #Capture current step
    logger.info("Using %s workers for executing calls asynchronously"%workers)

    #Return number of workers
    return workers

def run_analysis(SCHRODINGER, rep, master_dir, args, inst_params):
    """Run event analysis for one repetition via mdfit_event_analysis.py.

    Returns the dat-extraction command that must later be run serially.
    """
    event_analysis_command2 = mdfit_event_analysis.main(SCHRODINGER, rep, master_dir, args, inst_params)
    return event_analysis_command2

def dat_extract(pdf_commands):
    """Serially run the dat/png extraction commands collected during analysis."""
    #Iterate over all dat extract commands
    for command in pdf_commands:
        #Entries can be empty when previous eaf files were found
        if command != []:
            #Capture current step
            logger.info("Generating data files: %s"%' '.join(command))

            #Run each job serially
            run_job(command)
def combine_csvs(master_dir):
    """Merge every ligand-specific SimFP/compatibility CSV into master files.

    Must run serially; delegates to mdfit_combine_csvs.py.
    """
    mdfit_combine_csvs.main(master_dir)

def cleanup(rep, master_dir, args):
    """Move a repetition's analysis output from the shared scratch area into
    its permanent desmond_md_analysis/<lig>/<lig>_repetition<#> directory."""
    #Ligand name with and without the repetition suffix
    basename = os.path.basename(rep)
    ligbase = basename.split("_repetition")[0]

    #Permanent home for this repetition's files
    repdir = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename)

    #Create it on first use
    if not os.path.isdir(repdir):
        os.makedirs(repdir)
        logger.info("Created directory: %s"%repdir)
    else:
        logger.info("Directory already exists: %s"%repdir)

    #Collect everything in scratch that belongs to this repetition
    scratch = os.path.join(master_dir, "desmond_md_analysis", "scratch")
    move_files = []
    for pattern in ("%s_*"%basename, "%s.*"%basename):
        move_files.extend(glob.glob(os.path.join(scratch, pattern)))

    #Relocate each file into the permanent directory
    for path in move_files:
        shutil.move(path, os.path.join(repdir, os.path.basename(path)))
def cluster_traj(SCHRODINGER, rep, master_dir, args):
    """Cluster one repetition's trajectory, then move its files out of scratch.

    Delegates clustering to mdfit_cluster_traj.py.
    """
    mdfit_cluster_traj.main(SCHRODINGER, rep, master_dir, args)

    #Move files from scratch to the repetition directory
    cleanup(rep, master_dir, args)

def main(args, master_dir, SCHRODINGER, inst_params):
    """Drive the MD analysis stage: event analysis, dat extraction, SimFP
    tabulation, CSV merging and (optionally) trajectory clustering."""
    #Make sure the shared scratch directory exists and move into it
    scratch_dir = dircheck(master_dir)

    #Run the analysis pipeline unless the user opted out
    if not args.skip_analysis:
        #Every repetition directory selected for analysis
        reppaths = ligfile_check(master_dir, args)

        #Thread-pool size for the parallel phases
        workers = prep_workers(args)

        #Commands that must later run serially (dat/png extraction)
        pdf_commands = []

        #Phase 1: event analysis, fanned out across worker threads
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            analysis_jobs = {executor.submit(run_analysis, SCHRODINGER, rep, master_dir, args, inst_params): rep for rep in reppaths}

            for future in concurrent.futures.as_completed(analysis_jobs):
                lig = analysis_jobs[future]
                try:
                    pdf_commands.append(future.result())
                except Exception as exc:
                    #A failure in any repetition aborts the whole run
                    logger.critical("%s generated an exception during event analysis: %s"%(lig, exc))
                    sys.exit()
                else:
                    logger.info("Analysis success: %s"%(lig))

        logger.info("Extracting dat and png files serially")

        #Phase 2: Schrodinger's utility cannot control output filenames,
        #so dat/png extraction is forced to run one job at a time
        dat_extract(pdf_commands)

        #Phase 3: tabulate SimFP / compatibility data in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            tabulate_jobs = {executor.submit(tabulate_simfp, SCHRODINGER, rep, master_dir, args): rep for rep in reppaths}

            for future in concurrent.futures.as_completed(tabulate_jobs):
                lig = tabulate_jobs[future]
                try:
                    data = future.result()
                except Exception as exc:
                    logger.critical("%s generated an exception during tabulation: %s"%(lig, exc))
                    sys.exit()
                else:
                    logger.info("Trj extraction success: %s"%(lig))

        #Phase 4: merge the per-repetition CSVs into master files (serial)
        combine_csvs(master_dir)

        #Phase 5: optional trajectory clustering
        if args.skip_cluster:
            logger.info("Skipping trajectory clustering (--skip_cluster provided by user)")
        else:
            with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
                cluster_jobs = {executor.submit(cluster_traj, SCHRODINGER, rep, master_dir, args): rep for rep in reppaths}

                for future in concurrent.futures.as_completed(cluster_jobs):
                    lig = cluster_jobs[future]
                    try:
                        data = future.result()
                    except Exception as exc:
                        logger.critical("%s generated an exception during clustering: %s"%(lig, exc))
                        sys.exit()
                    else:
                        logger.info("Clustering success: %s"%(lig))

    #Return to the master directory before handing control back
    logger.info("Changing directory to %s"%master_dir)
    os.chdir(master_dir)

if __name__ == '__main__':
    main(args, master_dir, SCHRODINGER, inst_params)
def trj_pathnames(md_path, basename):
    """Resolve the cms/trj paths for a repetition, preferring the sliced files.

    Returns (cms_path, trj_path). Exits the program when neither a sliced
    nor an unsliced trajectory exists in md_path.
    """
    sliced_cms = os.path.join(md_path, "%s_sliced-out.cms"%basename)
    full_cms = os.path.join(md_path, "%s-out.cms"%basename)

    #Prefer the sliced trajectory when it exists
    if os.path.isfile(sliced_cms):
        cms_path = sliced_cms
        trj_path = os.path.join(md_path, "%s_sliced_trj"%basename)

    #Fall back to the unsliced trajectory
    elif os.path.isfile(full_cms):
        cms_path = full_cms
        trj_path = os.path.join(md_path, "%s_trj"%basename)

    #Could not locate a trajectory
    else:
        logger.critical("Trajectory could not be located!")
        sys.exit()

    return cms_path, trj_path

def simfp(dat_files, round_int, basename, master_dir, num_frames, args, compat_prep, ligbase, repnum):
    """Tabulate SimFPs (fraction of frames each protein-ligand interaction
    occurs) from the PL-Contacts dat files and write <basename>_SimFP.csv.

    Also appends per-frame average interaction counts to compat_prep
    (mutated in place).
    """
    #Seed the SimFP table with ligand / repetition bookkeeping
    simfp_prep = pd.DataFrame({'Molecule':['Repetition'], ligbase:[repnum]})

    #Iterate over all dat files
    for file in os.listdir(dat_files):
        #Only protein-ligand contact files contribute to SimFPs
        if file.startswith('PL-Contacts') and file.endswith('.dat'):
            #Interaction type is encoded in the filename
            int_type = file.split('_')[1].split('.dat')[0]

            df = pd.read_csv(os.path.join(dat_files, file), sep=r"\s+")

            #Drop the stray leading "#" column to realign headers
            df = df.shift(axis=1).drop('#', axis=1)

            if file.endswith('Metal.dat'):
                #Keep only ligand-metal rows; protein-metal rows are excluded
                df = df[df["Type"].str.contains("prot") == False]

                #Metal SimFP key has no chain: _:<site>:<type>
                df['Sort'] = '_:' + df['MetalSite'] + ':%s'%int_type
                counts = df['Type'].groupby(df['Sort']).count()
            else:
                #Standard SimFP key: <chain>:<resnum>:<type>
                df['Sort'] = df['Chain'] + ':' + df['Residue#'].astype(str) + ':%s'%int_type
                counts = df['Residue#'].groupby(df['Sort']).count()

            #Average number of interactions per frame -> compatibility table
            avg_int = round((len(df.index)/num_frames), round_int)
            compat_prep.loc[len(compat_prep.index)] = ["Average_%s_per_frame"%int_type, avg_int]

            #Fraction of frames each unique interaction is present
            add_res = pd.DataFrame(counts.index.values, columns=['Molecule'])
            add_res[ligbase] = counts.values/num_frames
            add_res[ligbase] = add_res[ligbase].round(round_int)

            #BUG FIX: the cutoff filter result was previously discarded;
            #assign it so SimFPs below args.analysis_cutoff are actually removed
            add_res = add_res[add_res[ligbase] >= args.analysis_cutoff]

            #DataFrame.append was removed in pandas 2.0; concat is the
            #documented equivalent with ignore_index=True
            simfp_prep = pd.concat([simfp_prep, add_res], ignore_index=True)

    #Transpose SimFP dataframe and write to csv file
    simfp_prep.transpose().to_csv(os.path.join(master_dir, "desmond_md_analysis", "scratch", "%s_SimFP.csv"%basename), header=False)
na_values=['']) 151 | 152 | #Drop first column to fix spacing (stray "#") 153 | df = df.shift(axis=1).drop('#', axis=1) 154 | 155 | #"PDBResName" can be empty. Hack to get around alignment issues in pandas 156 | #Check if last column in dataframe is empty (NaN) 157 | if df['wrt_Ligand'].isnull().values.any(): 158 | #If it is, shift the dataframe over and drop the Atom# column 159 | df = df.shift(axis=1).drop('Atom#', axis=1) 160 | 161 | #Rename PDBResName to Frame 162 | df = df.rename(columns={'PDBResName': 'Frame'}) 163 | 164 | #Iterate over all columns in dataframe 165 | for col in df.columns: 166 | #Check if reading property column 167 | if "wrt" in col: 168 | #Calculate the average value for the given property 169 | avg_val = df.loc[:, '%s'%col].mean().round(round_int) 170 | 171 | #Add average to compatibility dataframe 172 | compat_prep.loc[len(compat_prep.index)] = ["Avg_ligRMSF_%s"%col, avg_val] 173 | 174 | #Check if reading protein and ligand RMSD dat file 175 | elif file == "PL_RMSD.dat": 176 | #Initiate dataframe for the dat file 177 | df = pd.read_csv(os.path.join(dat_files, file), sep="\s+") 178 | 179 | #Drop first column to fix spacing issue (stray "#") 180 | df = df.shift(axis=1).drop('#', axis=1) 181 | 182 | #Iterate over all columns in dataframe 183 | for col in df.columns: 184 | #Check if column is not "frame" 185 | if "frame" not in col: 186 | #Calculate average value for the given property 187 | avg_val = df.loc[:, '%s'%col].mean().round(round_int) 188 | 189 | #Add average to compatibility dataframe 190 | compat_prep.loc[len(compat_prep.index)] = ["Avg_PL_RMSD_%s"%col, avg_val] 191 | 192 | #Check if reading protein RMSF dat file 193 | elif file == "P_RMSF.dat": 194 | #Initiate dataframe for the dat file 195 | df = pd.read_csv(os.path.join(dat_files, file), sep="\s+") 196 | 197 | #Drop first column to fix spacing (stray "#") 198 | df = df.shift(axis=1).drop('#', axis=1) 199 | 200 | #Initiate list with columns of interest 201 | columns = ["CA", 
def main(SCHRODINGER, rep, master_dir, args):
    """Extract per-repetition SimFP and compatibility tables from dat files.

    Locates the dat files (scratch first, then the permanent analysis dir),
    resolves the trajectory paths, and writes <basename>_SimFP.csv and
    <basename>_compatibility.csv via simfp()/compatibility().
    """
    #Generate repetition name <lig>_repetition<#> and its components
    basename = os.path.basename(rep)
    ligbase = basename.split("_repetition")[0]
    repnum = basename.split("_repetition")[-1]

    #Check if dat files are in scratch
    if os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", "scratch", basename, "P_RMSF.dat")):
        dat_files = os.path.join(master_dir, "desmond_md_analysis", "scratch", basename)

    #Check if dat files are in permanent space
    elif os.path.isfile(os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, basename, "P_RMSF.dat")):
        dat_files = os.path.join(master_dir, "desmond_md_analysis", ligbase, basename, basename)

    #Cannot find dat files
    else:
        #BUG FIX: repaired the garbled error message ("directory are re-running")
        logger.critical("Dat files could not be located for %s. Try removing desmond_md_analysis directory and re-running analysis."%basename)
        sys.exit()

    #Generate path to MD trajectories and resolve the cms/trj files
    md_path = os.path.join(master_dir, "desmond_md", ligbase, basename)
    cms_path, trj_path = trj_pathnames(md_path, basename)

    #Initiate compatibility dataframe with ligand and repetition info
    compat_prep = pd.DataFrame({'Molecule':['Repetition'], ligbase:[repnum]})

    #Get number of frames. Calls mdfit_slicetrj.py
    num_frames = mdfit_slicetrj.count_frames(trj_path)

    #Number of decimal places for rounding
    round_int = 4

    #Generate repetition-specific SimFPs and compatibility metrics
    simfp(dat_files, round_int, basename, master_dir, num_frames, args, compat_prep, ligbase, repnum)
    compatibility(dat_files, round_int, basename, master_dir, num_frames, args, compat_prep)

if __name__ == '__main__':
    main(SCHRODINGER, rep, master_dir, args)
import mdfit_slicetrj

###Initiate logger###
logger = logging.getLogger(__name__)

def run_job(command):
    """Run a command (list of tokens), logging its combined stdout/stderr.

    Parameters:
        command: list of command tokens; joined with spaces and run through the shell.
    """
    #NOTE(review): shell=True with a joined string — command tokens are built
    #internally from config/paths here; revisit if any token can become untrusted.
    process = subprocess.run(' '.join(command), stdout=subprocess.PIPE, \
        stderr=subprocess.STDOUT, shell=True, text=True)

    #Iterate over combined stdout/stderr
    for line in process.stdout.split('\n'):
        #Skip blank lines and job-control "ExitStatus" chatter; log the rest for debugging
        if line and "ExitStatus" not in line:
            logger.debug(line)

def random_seed():
    """Return a random integer in [0, 1000000), used to seed each MD repetition."""
    #Re-seed the module RNG from OS entropy so concurrent workers diverge
    random.seed()

    #Return random number
    return int(random.random()*1000000)

def dircheck(master_dir):
    """Ensure <master_dir>/desmond_md/scratch exists and return its path."""
    #Generate scratch directory name
    newdir = os.path.join(master_dir, "desmond_md", "scratch")

    #Create the scratch directory if it does not exist yet
    if not os.path.isdir(newdir):
        os.makedirs(newdir)
        logger.info("Created directory: %s"%newdir)
    else:
        logger.info("Directory already exists: %s"%newdir)

    #Return scratch directory name
    return newdir

def countligs(ligpath, SCHRODINGER):
    """Write lignames.csv (one ligand title per line) if needed and return the ligand count.

    Parameters:
        ligpath: path to the ligand library file.
        SCHRODINGER: path to the Schrodinger installation.
    """
    #Only generate the name file once
    if not os.path.isfile("lignames.csv"):
        #$SCHRODINGER/utilities/proplister extracts the 'title' property of each ligand
        run_cmd = os.path.join(SCHRODINGER, 'utilities', 'proplister')

        #Set up full command
        command = [run_cmd, '-p', 'title', '-noheader', ligpath, '-c', '-o', 'lignames.csv']

        #Document current step
        logger.info("Getting lignames: %s"%' '.join(command))

        #Run job
        run_job(command)

    #Get the number of ligands by counting lines in the name file
    with open("lignames.csv", 'r') as fp:
        numligs = len(fp.readlines())

    #Document current step
    logger.info("Number of ligands: %s"%numligs)

    #Return the number of ligands for MD
    return numligs

def gen_list(num_ligs):
    """Return [0, 1, ..., num_ligs-1]: explicit ligand indices for parallel dispatch."""
    return list(range(num_ligs))

def prep_workers(args):
    """Return the thread-pool size: user override, or ThreadPoolExecutor's min(32, cpu+4) default."""
    #Check if user provided a number of workers
    if args.max_workers == 0:
        #os.cpu_count() may return None (per stdlib docs); fall back to 1 to avoid TypeError
        workers = min(32, (os.cpu_count() or 1) + 4)
    else:
        workers = args.max_workers

    #Capture current step
    logger.info("Using %s workers for executing calls asynchronously"%workers)

    #Return number of workers
    return workers

def lig_extract(master_dir, i):
    """Return the i-th ligand name from lignames.csv, stripped of trailing whitespace."""
    #Read in ligand name file
    with open("lignames.csv", 'r') as infile:
        all_lines = infile.readlines()

    #Return the desired ligand name with trailing spaces/newline removed
    return all_lines[i].strip()

def cleanup_dirs(master_dir, ligname_base, args, all_md_names):
    """Create the md_setup and per-repetition directories for one ligand.

    Appends each generated repetition name to all_md_names (shared list; caller
    holds a lock) and returns (setup_dir, md_names) where md_names holds only
    this ligand's repetition names.
    """
    #Generate MD setup directory (desmond_md/<lig>/md_setup)
    setup_dir = os.path.join(master_dir, "desmond_md", ligname_base, "md_setup")

    #Create the setup directory if it does not exist yet
    if not os.path.isdir(setup_dir):
        os.makedirs(setup_dir)
        logger.info("Created directory: %s"%setup_dir)
    else:
        logger.info("Directory already exists: %s"%setup_dir)

    #Temporary repetition name list for this ligand only
    md_names = []

    #One directory per requested repetition (for/range replaces the manual while-counter)
    for j in range(args.md_repetitions):
        #Repetition name is 1-based
        repname = "%s_repetition%s"%(ligname_base, j+1)

        #Record in the master list (shared) and the per-ligand list
        all_md_names.append(repname)
        md_names.append(repname)

        #Create the repetition directory if it does not exist yet
        repdir = os.path.join(master_dir, "desmond_md", ligname_base, repname)
        if not os.path.isdir(repdir):
            os.makedirs(repdir)
            logger.info("Created directory: %s"%repdir)
        else:
            logger.info("Directory already exists: %s"%repdir)

    #Return md_setup directory path and this ligand's repetition names
    return setup_dir, md_names
def move_copy_files(master_dir, ligname_base, setup_dir, md_names, args, template_dir):
    """Move a ligand's setup files out of scratch and write per-repetition cfg/msj inputs.

    Parameters:
        master_dir: MDFit master working directory.
        ligname_base: ligand base name.
        setup_dir: permanent md_setup directory for this ligand.
        md_names: repetition names for this ligand.
        args: parsed arguments (md_sim_time, md_traj_write_freq).
        template_dir: directory holding the Desmond cfg/msj templates.
    """
    scratch = os.path.join(master_dir, "desmond_md", "scratch")

    #Collect this ligand's setup files in scratch (<lig>_* and <lig>.*)
    searches = ("%s_*"%ligname_base, "%s.*"%ligname_base)
    move_files = []
    for each_search in searches:
        move_files.extend(glob.glob(os.path.join(scratch, each_search)))

    #Relocate only setup files; repetition files stay in scratch for production
    for file in move_files:
        if "repetition" not in file:
            shutil.move(file, os.path.join(setup_dir, os.path.basename(file)))

    #Prepare per-repetition inputs
    for rep in md_names:
        #Copy prepared input geometry back to scratch, renamed for this repetition
        shutil.copy(os.path.join(setup_dir, "%s_md_setup_out.cms"%ligname_base),
                    os.path.join(scratch, "%s_md.cms"%rep))

        #Fresh random seed so repetitions sample different trajectories
        rseed = random_seed()

        #Write the repetition-specific cfg file if it does not exist yet
        cfg_path = os.path.join(scratch, "%s_md.cfg"%rep)
        if not os.path.isfile(cfg_path):
            with open(os.path.join(template_dir, "desmond_md_job_template.cfg"), "r") as template:
                lines = template.readlines()
            with open(cfg_path, "w") as ligoutput:
                for line in lines:
                    #Fill in simulation time, random seed and trajectory write frequency
                    ligoutput.write(line.replace("SIMTIME",str(args.md_sim_time)).replace("RSEED",str(rseed)).replace("WRITEFRQ",str(args.md_traj_write_freq)))

        #Write the repetition-specific msj file if it does not exist yet
        msj_path = os.path.join(scratch, "%s_md.msj"%rep)
        if not os.path.isfile(msj_path):
            with open(os.path.join(template_dir, "desmond_md_job_template.msj"), "r") as template:
                msjlines = template.readlines()
            with open(msj_path, "w") as ligoutput:
                for msjline in msjlines:
                    #Point the msj at the repetition-specific cfg file
                    ligoutput.write(msjline.replace("CONFIG_NAME","%s_md.cfg"%rep))

#Shared lock for the setup critical section. The original used `with threading.Lock():`
#inside rep_one_setup, which creates a NEW lock per call and therefore serializes
#nothing; a single module-level lock actually provides mutual exclusion.
_SETUP_LOCK = threading.Lock()

def rep_one_setup(SCHRODINGER, ligpath, i, master_dir, args, bmin_host, multisim_host, all_md_names, template_dir):
    """Prepare one ligand for MD: complex, minimize, solvate, and stage repetition inputs.

    Returns the ligand base name on success; exceptions propagate to the caller's
    future handling.
    """
    #Extract specific ligand from ligand library and get ligand base name
    ligname_base = lig_extract(master_dir, i)

    #Complex protein and ligand. Calls mdfit_prep_complex.py
    pvcomplex = mdfit_prep_complex.main(SCHRODINGER, ligpath, ligname_base, i, master_dir, args)

    #Minimize prepared complex. Calls mdfit_run_minimization.py
    bmincomplex = mdfit_run_minimization.main(ligname_base, pvcomplex, args, bmin_host, SCHRODINGER, master_dir, template_dir)

    #Calculate the total charge of the system. Calls mdfit_get_charge.py
    charge = mdfit_get_charge.main(SCHRODINGER, ligname_base, master_dir, args)

    #Neutralize and solvate the minimized complex (side effects only). Calls mdfit_build_box.py
    mdfit_build_box.main(master_dir, SCHRODINGER, args, charge, ligname_base, multisim_host, bmincomplex, template_dir)

    #Serialize the file moves/writes across setup threads (see _SETUP_LOCK note)
    with _SETUP_LOCK:
        #Move MD setup files to the permanent directory and get repetition names
        setup_dir, md_names = cleanup_dirs(master_dir, ligname_base, args, all_md_names)

        #Prepare repetition-specific input files for MD (cfg/msj files)
        move_copy_files(master_dir, ligname_base, setup_dir, md_names, args, template_dir)

    #Return ligand base name
    return ligname_base

def move_trj_files(master_dir, lig, lig_basename):
    """Move a repetition's trajectory files from scratch to their permanent location.

    Parameters:
        master_dir: MDFit master working directory.
        lig: repetition name (<lig>_repetition<#>).
        lig_basename: ligand base name.
    """
    #All scratch entries belonging to this repetition
    move_files = glob.glob(os.path.join(master_dir, "desmond_md", "scratch", "%s*"%lig))

    for file in move_files:
        #Only repetition-specific files are archived
        if "repetition" in file:
            dest = os.path.join(master_dir, "desmond_md", lig_basename, lig, os.path.basename(file))

            #Clear any stale copy (file or directory) at the destination
            if os.path.isfile(dest):
                os.remove(dest)
            elif os.path.isdir(dest):
                shutil.rmtree(dest)

            #Move (not copy — the original comment was wrong) to the permanent location
            shutil.move(file, dest)

def md_production(SCHRODINGER, master_dir, args, desmond_host, lig):
    """Run one Desmond MD repetition, slice its trajectory, and archive the results.

    Returns (outcms, outtrj): the output topology and trajectory paths.
    """
    #Run Desmond MD. Calls mdfit_run_md.py
    outcms, outtrj, lig_basename = mdfit_run_md.main(lig, args, desmond_host, SCHRODINGER, master_dir)

    #Slice trajectory (remove frames); return value unused downstream. Calls mdfit_slicetrj.py
    mdfit_slicetrj.main(SCHRODINGER, lig, master_dir, args)

    #Move Desmond MD trajectory files to permanent directory
    move_trj_files(master_dir, lig, lig_basename)

    #Return output trajectory filenames
    return outcms, outtrj

def main(args, master_dir, ligfileprefix, SCHRODINGER, ligpath, template_dir, inst_params):
    """Set up and run Desmond MD for every ligand, parallelized with thread pools.

    Phase 1 prepares each ligand (one thread per ligand); phase 2 runs production
    MD (one thread per repetition). Exits with status 1 on any failure.
    """
    ###TODO: check if lignames are unique

    #User requested to skip Desmond MD entirely
    if args.skip_md:
        logger.info("Skipping Desmond MD (--skip_md provided by user)")
        return

    #Generate scratch directory for running jobs and work from there
    scratch_dir = dircheck(master_dir)
    logger.info("Changing directory to %s"%scratch_dir)
    os.chdir(scratch_dir)

    #Hostnames for each job type, from the installation parameters
    bmin_host = inst_params["hostnames"]["BMIN"]
    logger.debug("Minimization hostname is %s"%bmin_host)
    multisim_host = inst_params["hostnames"]["MULTISIM"]
    logger.debug("Multisim hostname is %s"%multisim_host)
    desmond_host = inst_params["hostnames"]["DESMOND"]
    logger.debug("Desmond hostname is %s"%desmond_host)

    #Ligand indices [0, 1, ...] for parallel dispatch
    num_ligs = countligs(ligpath, SCHRODINGER)
    lignum = gen_list(num_ligs)

    #Thread-pool size (user override or ThreadPoolExecutor default)
    workers = prep_workers(args)

    #Master list of repetition names, filled by the setup workers
    all_md_names = []

    #Phase 1: MD setup, one thread per ligand
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        setup_jobs = {executor.submit(rep_one_setup, SCHRODINGER, ligpath, lig, master_dir, args, bmin_host, multisim_host, all_md_names, template_dir): lig for lig in lignum}

        for future in concurrent.futures.as_completed(setup_jobs):
            lig = setup_jobs[future]
            try:
                ligname_base = future.result()
            except Exception as exc:
                #Capture error and exit with a non-zero status
                logger.critical("An exception occurred during MD setup: %s"%(exc))
                sys.exit(1)
            else:
                logger.info("Setup success: %s"%(ligname_base))

    logger.info("MD setup complete. Launching %s production jobs."%len(all_md_names))

    #Phase 2: production MD, one thread per repetition
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        prod_jobs = {executor.submit(md_production, SCHRODINGER, master_dir, args, desmond_host, lig): lig for lig in all_md_names}

        for future in concurrent.futures.as_completed(prod_jobs):
            lig = prod_jobs[future]
            try:
                outcms, outtrj = future.result()
            except Exception as exc:
                #BUG FIX: the original logged 'outcms', which is unbound when
                #future.result() raises; log the failing repetition name instead.
                logger.critical("%s generated an exception during production MD: %s"%(lig, exc))
                sys.exit(1)
            else:
                logger.info("Production success: %s, %s"%(outcms, outtrj))

    #Return to the master directory
    logger.info("Changing directory to %s"%master_dir)
    os.chdir(master_dir)

if __name__ == '__main__':
    main(args, master_dir, ligfileprefix, SCHRODINGER, ligpath, template_dir, inst_params)