├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── dandelion ├── __init__.py ├── cli.py ├── dandelion_prep.py ├── dandelion_refine.py ├── dandelion_sample.py ├── neb │ ├── __init__.py │ ├── compile_neb.py │ ├── filter_neb.py │ └── run_neb.py ├── prep │ ├── __init__.py │ ├── geom_opt.py │ └── smiles_to_isoconfs.py ├── refine │ ├── __init__.py │ ├── compile_refined.py │ └── refine_forces.py ├── segsm │ ├── __init__.py │ ├── ard_gsm │ │ ├── __init__.py │ │ ├── driving_coords.py │ │ ├── limits.py │ │ └── mol.py │ ├── create_gsm.py │ ├── filter_gsm.py │ └── run_gsm.py └── utils │ └── db_h5_tools │ ├── db_to_h5.py │ ├── h5_to_db.py │ ├── make_db_from_xyzs.py │ ├── merge_db.py │ └── merge_h5.py ├── environment.yml ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python,linux 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux 3 | **/*wb97.py 4 | **/nms 5 | .xtboptok 6 | ### Linux ### 7 | *~ 8 | 9 | # temporary files which can be created if a process still has a handle open of a deleted file 10 | .fuse_hidden* 11 | 12 | # KDE directory preferences 13 | .directory 14 | 15 | # Linux trash folder which might appear on any partition or disk 16 | .Trash-* 17 | 18 | # .nfs files are created when an open file is removed but is still being accessed 19 | .nfs* 20 | 21 | ### Python ### 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | share/python-wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | MANIFEST 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/ 62 | .tox/ 63 | .nox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *.cover 70 | *.py,cover 71 | .hypothesis/ 72 | .pytest_cache/ 73 | cover/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Django stuff: 80 | *.log 81 | local_settings.py 82 | db.sqlite3 83 | db.sqlite3-journal 84 | 85 | # Flask stuff: 86 | instance/ 87 | .webassets-cache 88 | 89 | # Scrapy stuff: 90 | .scrapy 91 | 92 | # Sphinx documentation 93 | docs/_build/ 94 | 95 | # PyBuilder 96 | .pybuilder/ 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # IPython 103 | profile_default/ 104 | ipython_config.py 105 | 106 | # pyenv 107 | # For a library or package, you might want to ignore these files since the code is 108 | # intended to run in multiple environments; otherwise, check them in: 109 | # .python-version 110 | 111 | # pipenv 112 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 113 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 114 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 115 | # install all needed dependencies. 
116 | #Pipfile.lock 117 | 118 | # poetry 119 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 120 | # This is especially recommended for binary packages to ensure reproducibility, and is more 121 | # commonly ignored for libraries. 122 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 123 | #poetry.lock 124 | 125 | # pdm 126 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 127 | #pdm.lock 128 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 129 | # in version control. 130 | # https://pdm.fming.dev/#use-with-ide 131 | .pdm.toml 132 | 133 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 134 | __pypackages__/ 135 | 136 | # Celery stuff 137 | celerybeat-schedule 138 | celerybeat.pid 139 | 140 | # SageMath parsed files 141 | *.sage.py 142 | 143 | # Environments 144 | .env 145 | .venv 146 | env/ 147 | venv/ 148 | ENV/ 149 | env.bak/ 150 | venv.bak/ 151 | 152 | # Spyder project settings 153 | .spyderproject 154 | .spyproject 155 | 156 | # Rope project settings 157 | .ropeproject 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # mypy 163 | .mypy_cache/ 164 | .dmypy.json 165 | dmypy.json 166 | 167 | # Pyre type checker 168 | .pyre/ 169 | 170 | # pytype static type analyzer 171 | .pytype/ 172 | 173 | # Cython debug symbols 174 | cython_debug/ 175 | 176 | # PyCharm 177 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 178 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 179 | # and can be added to the global gitignore or merged into this file. For a more nuclear 180 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 181 | #.idea/ 182 | 183 | ### Python Patch ### 184 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 185 | poetry.toml 186 | 187 | # ruff 188 | .ruff_cache/ 189 | 190 | # LSP config files 191 | pyrightconfig.json 192 | 193 | # End of https://www.toptal.com/developers/gitignore/api/python,linux 194 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [Unreleased] - 2023-10-12 4 | 5 | ### Added 6 | - run_gsm.py: added a test that checks whether the gsm command is available 7 | - filter_gsm.py: added a step that optimizes the product with xTB and filters out the reaction if the RMSD change is large 8 | 9 | 10 | ### Added 11 | ### Changed 12 | ### Removed 13 | ### Fixed 14 | 15 | ## [0.7.4] - 2024-10-08 16 | ### Changed 17 | - README.md updated. 18 | 19 | ## [0.7.3] - 2024-10-08 20 | ### Changed 21 | - segsm/ard_gsm/mol.py: This version works with radicals: no assumption of four connections on carbon atoms. Also works with other heavy elements. 22 | 23 | 24 | ## [0.7.2] - 2024-09-02 25 | ### Added 26 | - Utilities that handle db and h5 files are added to utils/db_h5_tools. 27 | 1. db_to_h5.py 28 | 2. h5_to_db.py 29 | 3. make_db_from_xyzs.py 30 | 4. merge_db.py 31 | 5. merge_h5.py 32 | 33 | ## [0.7.1] - 2024-09-01 34 | 35 | ### Added 36 | - Normal mode sampling codes are added to utils/nms. 37 | 1. normal_mode_sampling.py 38 | 2. 
refine_forces_nms.py 39 | 40 | ### Fixed 41 | - All code now asserts the type of the input_path (dir or file) 42 | 43 | ## [0.7.0] - 2024-08-31 44 | 45 | ### Added 46 | - Sampling iso/conformers is included as a preparatory step in dandelion. 47 | 1. smiles_to_isoconfs.py 48 | 2. geom_opt.py 49 | 3. dandelion_prep.py 50 | 51 | - cli.py: allows invoking dandelion from the CLI, e.g. 'dand prep -i ./a.smi -n 40'. 52 | 53 | ### Changed 54 | - dandelion is shortened to 'dand' in the CLI. 55 | - dandelion_sample.py: default argument '0_mothers' changed to '0_reactants' 56 | - print_separator, merge_args_with_defaults are moved to \__init__.py 57 | ## [0.6.2] - 2024-07-08 58 | 59 | ### Fixed 60 | - dandelion.py: renamed to dandelion_refine.py 61 | 62 | 63 | 64 | ## [0.6.1] - 2024-01-14 65 | 66 | ### Changed 67 | - compile_refined.py: bug fixed when atomrow doesn't have 'energy' and 'forces' 68 | 69 | 70 | ## [0.6.0] - 2023-11-21 71 | 72 | ### Added 73 | - filter_neb.py: added function is_valid_reaction to filter out weird rxns 74 | 75 | ## [0.5.6] - 2023-11-21 76 | 77 | ### Changed 78 | - refine_forces.py: suppress errors in force calculation, save them to orca_error.log 79 | - refine_forces.py: now saves samples in batches 80 | - refine_forces.py: open .db file with a with statement 81 | 82 | ## [0.5.5] - 2023-11-14 83 | 84 | ### Added 85 | - run_neb.py: added argument fmax_threshold (default=0.1 eV/A) 86 | 87 | ### Fixed 88 | - refine_forces.py: added NoTrah to the orca command 89 | 90 | 91 | ## [0.5.4] - 2023-11-7 92 | 93 | ### Fixed 94 | - compile_neb.py: fixed argparser that had no required=True 95 | 96 | 97 | ## [0.5.3] - 2023-11-2 98 | 99 | ### Fixed 100 | - dandelion_refine.py: awesome ascii art 101 | 102 | 103 | ## [0.5.2] - 2023-11-2 104 | 105 | ### Fixed 106 | - compile_refined.py: sorting the rows in the right order 107 | 108 | 109 | ## [0.5.1] - 2023-10-17 110 | 111 | ### Added 112 | - opt_mothers.py: optimize crude structures using xTB 113 | 114 | 115 | ## [0.5.0] - 2023-10-12 116 | 117 | ### Added 118 | - filter_neb.py: xTB normal mode TS validation: is_transition_state 119 | 120 | 121 | ## [0.4.1] - 2023-10-11 122 | 123 | ### Added 124 | - Added \__init__.py with the variable \__version__ 125 | 126 | ### Fixed 127 | - Basis set 6-31g(d) for the Br atom in orca was handled thanks to https://github.com/ZimmermanGroup/ORCA-Basis-Sets 128 | 129 | 130 | ## [0.4.0] - 2023-10-10 131 | 132 | ### Added 133 | - dandelion_refine.py that runs the refine processes 134 | 135 | 136 | ## [0.3.1] - 2023-10-10 137 | 138 | ### Added 139 | - setup.py, README.md, CHANGELOG.md, LICENSE added 140 | 141 | 142 | ## [0.2.0] - 2023-09-30 143 | 144 | ### Added 145 | - dandelion.py that runs through neb, refine 146 | - Codes refactored 147 | 148 | ### Fixed 149 | - Issues with absolute import fixed 150 | 151 | 152 | ## [0.1.0] - 2023-09-10 153 | 154 | ### Added 155 | - Initial release with features neb, refine, segsm 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2023 Minhyeok Lee 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished 
to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Dandelion
 2 | 3 | [![docs](https://img.shields.io/badge/docs-mhyeok1.github.io/dand__docs/-brightgreen.svg)](https://mhyeok1.github.io/dand_docs/) 4 | [![DOI](https://img.shields.io/badge/DOI-10.1002/advs.202409009-blue.svg)](https://doi.org/10.1002/advs.202409009) 5 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14020916.svg)](https://doi.org/10.5281/zenodo.14020916) 6 | 7 | 8 | Codes for automated and efficient sampling of chemical reaction space for MLIP training 9 | 10 | 11 | drawing 12 |
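A typical end-to-end run strings the three stages together through the `dand` entry point (see `dandelion/cli.py` below). This is a minimal sketch: the SMILES file name, worker count, and directory layout are illustrative placeholders that assume the default output locations of each stage, and the ORCA path must point to your own installation; further options live in the individual modules.

```
# 1. prep: enumerate iso/conformers from a file of SMILES strings and optimize them with xTB
dand prep -i ./mols.smi -n 40

# 2. sample: run SE-GSM + NEB starting from the optimized reactants
dand sample -i ./0_reactants -n 40

# 3. refine: recompute energies and forces of the sampled points with ORCA (wB97X/6-31G(d))
dand refine -i . -n 40 --orca /path/to/orca
```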
13 | 14 | Dandelion is a code for generating datasets that contain both equilibrium and reactive regions of potential energy surfaces, using automated and efficient sampling of chemical reaction space. 15 | 16 | **Documentation** : 17 | 18 | 19 | 20 | ## Citation 21 | If you find this work useful for your research, please consider citing: 22 | 23 | - Lee et al. *Adv. Sci.* **12**, 2409009 (2025) [LINK](https://doi.org/10.1002/advs.202409009) 24 | 25 | This work builds upon pioneering works that should also be cited: 26 | - Grambow et al. *Sci. Data* **7**, 137 (2020) [LINK](https://doi.org/10.1038/s41597-020-0460-4) 27 | - Schreiner et al. *Sci. Data* **9**, 779 (2022) [LINK](https://doi.org/10.1038/s41597-022-01870-w) 28 | 29 | ## Supporting Information 30 | The datasets used in the paper are available at zenodo. 31 | 32 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14020916.svg)](https://doi.org/10.5281/zenodo.14020916) 33 | 34 | 35 | -------------------------------------------------------------------------------- /dandelion/__init__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | __version__ = '0.7.4' 4 | 5 | def print_separator(text, width=70): 6 | border = "╔" + "═" * (width-2) + "╗" 7 | 8 | total_symbols_len = width - len(text) - 4 9 | half_len = total_symbols_len // 2 10 | left_symbol = "║" + " " * (half_len - 1) 11 | right_symbol = " " * (total_symbols_len - half_len - 1) + "║" 12 | separator = left_symbol + ' ' + text + ' ' + right_symbol 13 | 14 | end = "╚" + "═" * (width-2) + "╝" 15 | print("\n\n" + border) 16 | print(separator) 17 | print(end + "\n\n") 18 | 19 | def merge_args_with_defaults(module_parser, custom_args): 20 | """ 21 | Merge custom arguments with module defaults. 
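    For illustration (hypothetical values), the dandelion_prep/sample/refine drivers call it as
    merged = merge_args_with_defaults(geom_opt_parser, {'input_path': '0_reactants', 'max_workers': 4})
    and pass the resulting namespace straight to that stage's main function, e.g. geom_opt(merged).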
22 | Args: 23 | - module_parser: the module parser function 24 | - custom_args: dictionary of custom arguments 25 | 26 | Returns: 27 | - argparse.Namespace: merged namespace of arguments 28 | """ 29 | 30 | parser = module_parser() 31 | for action in parser._actions: 32 | if action.required: 33 | action.required = False 34 | 35 | defaults = vars(parser.parse_args([])) 36 | defaults.update(custom_args) 37 | 38 | for action in parser._actions: 39 | if not action.required and action.dest in custom_args: 40 | action.required = True 41 | 42 | return argparse.Namespace(**defaults) -------------------------------------------------------------------------------- /dandelion/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from dandelion import dandelion_prep, dandelion_sample, dandelion_refine 3 | 4 | def main(): 5 | if len(sys.argv) < 2: 6 | print("Usage: dand [prep|sample|refine] [options]") 7 | sys.exit(1) 8 | 9 | command = sys.argv[1] 10 | # Remove the 'dand' and the subcommand from sys.argv 11 | sys.argv = [sys.argv[0]] + sys.argv[2:] 12 | 13 | if command == "prep": 14 | dandelion_prep.main() 15 | elif command == "sample": 16 | dandelion_sample.main() 17 | elif command == "refine": 18 | dandelion_refine.main() 19 | else: 20 | print(f"Unknown command: {command}") 21 | print("Available commands: prep, sample, refine") 22 | sys.exit(1) 23 | 24 | if __name__ == "__main__": 25 | main() -------------------------------------------------------------------------------- /dandelion/dandelion_prep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import time 5 | import argparse 6 | 7 | from dandelion import __version__, print_separator, merge_args_with_defaults 8 | from dandelion.prep.smiles_to_isoconfs import main as smiles_to_isoconfs, get_parser as smiles_to_isoconfs_parser 9 | from dandelion.prep.geom_opt import main as geom_opt, get_parser as geom_opt_parser 10 | 11 | 12 | def print_header(width=70): 13 | 14 | print(f''' 15 | 16 | H H 17 | \\\\ - 18 | \\\\ - 19 | C──────────C\ H 20 | - \\\\ / 21 | - \\\\ / 22 | H────────C O=Cc1ccccc1 C──────────C 23 | \\\\ - \\\\ 24 | \\\\ - \\\\ 25 | \\C─────────C- O 26 | - \\\\ 27 | - \\\\ 28 | H H 29 | 30 | {"Prepare Iso/Conformers from SMILES strings".center(width)} 31 | {("Dandelion " + __version__ + " by mlee").center(width)} 32 | ''') 33 | 34 | 35 | def main(): 36 | args = parse_arguments() 37 | 38 | input_path = args.input_path 39 | if not os.path.isfile(input_path): 40 | sys.exit(f"Error: '{input_path}' is not a file.") 41 | working_path = os.path.dirname(input_path) 42 | max_workers = args.max_workers 43 | 44 | phases = [ 45 | ("1. Sample iso/conformers from SMILES strings", smiles_to_isoconfs, smiles_to_isoconfs_parser, { 46 | 'input_path': input_path, 47 | 'output_path': os.path.join(working_path, '-1_isoconfs'), 48 | }), 49 | ("2. 
Optimize geometries", geom_opt, geom_opt_parser, { 50 | 'input_path': os.path.join(working_path, '-1_isoconfs'), 51 | 'output_path': os.path.join(working_path, '0_reactants'), 52 | 'max_workers': max_workers 53 | }), 54 | ] 55 | 56 | print_header() 57 | 58 | for title, function, parser, custom_args in phases: 59 | time.sleep(3) 60 | print_separator(title) 61 | merged_args = merge_args_with_defaults(parser, custom_args) 62 | function(merged_args) 63 | 64 | 65 | def parse_arguments(): 66 | parser = argparse.ArgumentParser(description='Prepare optimized iso/conformers from SMILES,\ 67 | Other parameters can be set in each modules') 68 | 69 | parser.add_argument('-i', '--input_path', required=True, 70 | help='Input path of a file containing SMILES') 71 | parser.add_argument('-n', '--max_workers', type=int, default=1, 72 | help='Number of processes to use for parallel execution.') 73 | 74 | return parser.parse_args() 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /dandelion/dandelion_refine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import time 5 | import argparse 6 | 7 | from dandelion import __version__, print_separator, merge_args_with_defaults 8 | from dandelion.refine.refine_forces import main as refine_forces, get_parser as refine_forces_parser 9 | from dandelion.refine.compile_refined import main as compile_refined, get_parser as compile_refined_parser 10 | 11 | 12 | 13 | def print_header(width=70): 14 | 15 | print(f''' 16 | 17 | ⢀⣀⣀⣀⣀⣀⡀ ⢀⢀⣀⢀⠞⠖⠁⠡⡂⡆ ⡠⢀⡀ 18 | ⠺⢿⣿⣿⣿⣿⣿⣿⣷⣦⣠⣤⣤⣤⣄⣀⣀ ⡏⢸ ⢀ ⠣⠈ ⡠⡋⡨⡋⡂ 19 | ⠙⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⣦⣄⡀⡎⢀⡰⢀⢎⠌⢀⠔⣐⠠⣄⣀ 20 | ⢀ ⡔⢀⣴⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿⠿⠿⣿⣿⣷⣄⠂ ⢊⠎ ⠠⠂⡀⠕⠌⠌ ⡄⡠⢄ 21 | ⢀⡆⠄⠁⢈⢠⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣀ ⣀⣿⣿⣿⣆⠐ ⡨⠒⠁⡀⢠⣦⠍⠇⡀⢲⠂⡄⠄ 22 | ⠨⡀⠑⡈ ⢠⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡄ ⠈ ⣬⠠⣰⣿ ⢳⢹⡄⡆⠄⢀⢼ 23 | ⡄⠱⠈⠁⠑⢄⠐⣾⣿⣿⡿⠋⠁⣀⣠⣬⣽⣿⣿⣿⣿⣿⣿⠿⠿⠿⠿⠿⠿⠿⠿⠟⠁⡟⣅⡢⠁⠠⠜⡄⡑⢌⢧⡀ ⡀⣰⢁⡐⢁⢄⣡⣧⡤⠄ 24 | ⠠⡐⠓⠂⠌ ⢀⣿⣿⡏⢀⣴⣿⠿⠛⠉⠉⠶⢸⣿⣿⠿⠁⠢⠨⢀⣻⣿⣿⣿⣿⢟⣿⣝⠂ ⠠⡠⢆⠈⡂⠱⡇ ⣅⠫⠂⡠⢂⡪⠋ ⠁⡆ 25 | ⡶⠉ ⢀⡀⠁⡁⢸⣿⣿⢠⣾⡟⠁⣿⣿⡇ ⢀⠈⠉⠁ ⣀⠷⣹⣏⣷⢏⠹⠁ ⠈⢈ ⢇ ⢸⠱⢸⡏⡀⡶⡸⠎ ⠰⠁⡸ 26 | ⢈⡕⡈⠁⠐⠂⢀⢸⣿⣿⣾⠏⣿⣿⡿⣻⣿⢞⡢⠄ ⠈ ⡀⡤⠂⠁⠉⠌ ⢀⢀⠠⠐⢄ ⡀⢆⠎⢹⣶⣷⣧⡈⠈⠉⠤⠂⠉⢀⠱⡀ 27 | ⢠⡊ ⠁⣸⣿⣿⣿⣀⠉⡻⡏⠋⠁ ⠁⠒⠒⡀⣍⠍⠁ ⡀ ⢠⠂ ⢀⠈⠄⢀⠄⡒⠅⠈⢄⢡ ⢿⣿⣷⣿⡄ ⠐⠄⠤ ⠜⢀ 28 | ⠐⠁ ⠤⠒⢠⣾⣿⣿⣿⣿⣿⣷⣄⢄ ⢀ ⡏ ⢰⣃⠊⡐⠐⠁⢀⠈ ⣀ ⠰⠢⢀⠂⡰⠈⠂ ⡱⠂⢂⡇⡈⠻⢿⣿⠇ ⡤⠄⣀⡰⠁ 29 | ⠁⣾⣿⣿⣿⣿⣿⣿⣿⣿⣦ ⠄ ⠉ ⠸⠫⢞⠈⣰⠈ ⡐⢲⣿⡏ ⢠⡾ ⣀⠊⢱ ⠠⡀ ⢈⢀⡐⠤⣕⡄ 30 | ⢰⣿⡿⠛⠉ ⠈⠙⠛ ⠈⠈ ⠻⠔⠁⢸⡍⡇ ⢀⣏ ⢀⠠⠆ ⠣⡀⠈⡠⡀⠉⠢⡤⠢⣈⡡⣢⠦ 31 | ⠈⠁ ⢻⣇ ⢸⡇⡇ ⣼⡿⠉ ⢀⡇ ⠑⡄⠑⣌⢄ ⠙⢄⠠⡪⣅ 32 | ⠈⣾⡆ ⢸⣏⡇ ⢠⣿⠇ ⠸⢌⢢⢄⡠⠣⠈⠢⡁⡈⣎⢢⡬⠃ 33 | 34 | {"Energy refinement on samples using orca".center(width)} 35 | {("Dandelion " + __version__ + " by mlee").center(width)} 36 | ''') 37 | 38 | 39 | def main(): 40 | args = parse_arguments() 41 | 42 | input_path = args.input_path 43 | if not os.path.isdir(input_path): 44 | sys.exit(f"Error: '{input_path}' is not a directory.") 45 | max_workers = args.max_workers 46 | orcabinary = args.orca 47 | 48 | phases = [ 49 | ("1. Refining forces", refine_forces, refine_forces_parser, { 50 | 'input_path': os.path.join(input_path, 'xtb.h5'), 51 | 'output_path': os.path.join(input_path, 'wb97x.db'), 52 | 'orca' : orcabinary, 53 | 'max_workers': max_workers 54 | }), 55 | ("2. 
Compiling final samples", compile_refined, compile_refined_parser, { 56 | 'input_path': os.path.join(input_path, 'wb97x.db'), 57 | 'output_path': os.path.join(input_path, 'wb97x.h5') 58 | }), 59 | ] 60 | 61 | print_header() 62 | 63 | for title, function, parser, custom_args in phases: 64 | time.sleep(3) 65 | print_separator(title) 66 | merged_args = merge_args_with_defaults(parser, custom_args) 67 | function(merged_args) 68 | 69 | 70 | def parse_arguments(): 71 | parser = argparse.ArgumentParser(description='Refine force on obtained samples,\ 72 | Other parameters can be set in each modules') 73 | 74 | parser.add_argument('-i', '--input_path', required=True, 75 | help='Input path of directory containing xtb.h5') 76 | parser.add_argument('-n', '--max_workers', type=int, required=True, 77 | help='Number of worker processes') 78 | parser.add_argument('--orca', required=True, 79 | help="Path of the orca binary file") 80 | 81 | return parser.parse_args() 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /dandelion/dandelion_sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import time 5 | import argparse 6 | 7 | from dandelion import __version__, print_separator, merge_args_with_defaults 8 | from dandelion.segsm.create_gsm import main as create_gsm, get_parser as create_gsm_parser 9 | from dandelion.segsm.run_gsm import main as run_gsm, get_parser as run_gsm_parser 10 | from dandelion.segsm.filter_gsm import main as filter_gsm, get_parser as filter_gsm_parser 11 | from dandelion.neb.run_neb import main as run_neb, get_parser as run_neb_parser 12 | from dandelion.neb.filter_neb import main as filter_neb, get_parser as filter_neb_parser 13 | from dandelion.neb.compile_neb import main as compile_neb, get_parser as compile_neb_parser 14 | 15 | 16 | def print_header(width=70): 17 | 18 | print(f''' 19 | 20 | `;:` BREAK 1 2 21 | .;:; / BREAK 3 4 22 | _____ _ _;::; ` ADD 1 3 23 | | __ \ | | | |';:;' 24 | | | | | __ _ _ __ __| | ___| | _ ___ _ __ 25 | | | | |/ _` | '_ \ / _` |/ _ \ | | |/ _ \| '_ \ 26 | | |__| | (_| | | | | (_| | __/ | | | (_) | | | | 27 | |_____/ \__,_|_| |_|\__,_|\___|_| |_|\___/|_| |_| 28 | 29 | {"Chemical compound space sampling".center(width)} 30 | {"near transition state using xTB, SE-GSM and NEB".center(width)} 31 | {("Dandelion " + __version__ + " by mlee").center(width)} 32 | ''') 33 | 34 | 35 | def main(): 36 | args = parse_arguments() 37 | 38 | input_path = args.input_path 39 | if not os.path.isdir(input_path): 40 | sys.exit(f"Error: '{input_path}' is not a directory.") 41 | output_path = os.path.dirname(os.path.dirname(input_path)) 42 | max_workers = args.max_workers 43 | 44 | if not os.path.exists(output_path): 45 | os.makedirs(output_path) 46 | 47 | phases = [ 48 | ("1. Creating GSM", create_gsm, create_gsm_parser, { 49 | 'input_path': input_path, 50 | 'output_path': os.path.join(output_path, '1_gsm') 51 | }), 52 | ("2. Running GSM", run_gsm, run_gsm_parser, { 53 | 'input_path': os.path.join(output_path, '1_gsm'), 54 | 'max_workers': max_workers 55 | }), 56 | ("3. Filtering GSM", filter_gsm, filter_gsm_parser, { 57 | 'input_path': os.path.join(output_path, '1_gsm'), 58 | 'output_path': os.path.join(output_path, '2_gsm_filtered') 59 | }), 60 | 61 | ("4. 
Running NEB", run_neb, run_neb_parser, { 62 | 'input_path': os.path.join(output_path, '2_gsm_filtered'), 63 | 'output_path': os.path.join(output_path, '3_neb'), 64 | 'max_workers': max_workers 65 | }), 66 | ("5. Filtering NEB", filter_neb, filter_neb_parser, { 67 | 'input_path': os.path.join(output_path, '3_neb'), 68 | 'output_path': os.path.join(output_path, '4_neb_filtered') 69 | }), 70 | ("6. Compiling samples", compile_neb, compile_neb_parser, { 71 | 'input_path': os.path.join(output_path, '4_neb_filtered', 'reactions.json'), 72 | 'output_path': os.path.join(output_path, 'xtb.h5') 73 | }), 74 | ] 75 | 76 | print_header() 77 | 78 | for title, function, parser, custom_args in phases: 79 | time.sleep(3) 80 | print_separator(title) 81 | merged_args = merge_args_with_defaults(parser, custom_args) 82 | function(merged_args) 83 | 84 | 85 | def parse_arguments(): 86 | parser = argparse.ArgumentParser(description='Do SEGSM and NEB from reactant structures,\ 87 | Other parameters can be set in each modules') 88 | 89 | parser.add_argument('-i', '--input_path', required=True, 90 | help='Input path of reactant structures (must be a directory)') 91 | parser.add_argument('-n', '--max_workers', type=int, required=True, 92 | help='Number of worker processes') 93 | return parser.parse_args() 94 | 95 | 96 | if __name__ == "__main__": 97 | main() -------------------------------------------------------------------------------- /dandelion/neb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/neb/__init__.py -------------------------------------------------------------------------------- /dandelion/neb/compile_neb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import hashlib 5 | import argparse 6 | import itertools 7 | 8 | import h5py 9 | import ase.db 10 | import numpy as np 11 | from tqdm import tqdm 12 | from ase.units import Hartree, Bohr 13 | 14 | 15 | def get_hash(row): 16 | s = str(row.positions) + row.formula 17 | return int(hashlib.sha1(s.encode("utf-8")).hexdigest(), 16) % (10 ** 8) 18 | 19 | def write_rxn(h5file, fmaxs_path, db_path, rxn, fmax_threshold): 20 | fmaxs = json.load(open(fmaxs_path)) 21 | 22 | skip_next = False 23 | first = True 24 | cum_fmax = 0 25 | 26 | with ase.db.connect(db_path) as db: 27 | for i, (fmax, path) in enumerate(zip(fmaxs, sliced_it(10, db.select("")))): 28 | cum_fmax += fmax 29 | skip_this = skip_next 30 | skip_next = False 31 | last = i == len(fmaxs) - 1 32 | 33 | if last: 34 | skip_this = False 35 | 36 | if cum_fmax < fmax_threshold: 37 | skip_next = True 38 | 39 | else: 40 | cum_fmax = 0 41 | 42 | if skip_this: 43 | continue 44 | 45 | if not first: 46 | path = path[1:-1] 47 | 48 | # reactant and product is sampled once 49 | # (all points -2) // 8 ==0 50 | 51 | forces_path = np.array([row.forces for row in path]) 52 | positions_path = np.array([row.positions for row in path]) 53 | energy_path = np.array([row.energy for row in path]) 54 | 55 | if first: 56 | forces = forces_path 57 | positions = positions_path 58 | energy = energy_path 59 | reactant = path[0] # pylint: disable=undefined-loop-variable 60 | product = path[-1] # pylint: disable=undefined-loop-variable 61 | 62 | else: 63 | forces = np.concatenate((forces, forces_path), axis=0) 64 | positions = np.concatenate((positions, positions_path), axis=0) 65 | energy = 
np.concatenate((energy, energy_path), axis=0) 66 | 67 | first = False 68 | 69 | transition_state = path[ # pylint: disable=undefined-loop-variable 70 | np.argmax(energy_path) 71 | ] 72 | 73 | formula = reactant.formula 74 | atomic_numbers = reactant.numbers 75 | 76 | if formula in h5file: 77 | grp = h5file[formula] 78 | else: 79 | grp = h5file.create_group(formula) 80 | 81 | subgrp = grp.create_group(rxn) 82 | single_molecule(reactant, subgrp.create_group("reactant")) 83 | single_molecule(transition_state, subgrp.create_group("transition_state")) 84 | single_molecule(product, subgrp.create_group("product")) 85 | 86 | dict_ = { 87 | "forces": forces, 88 | "positions": positions, 89 | "energy": energy, 90 | "atomic_numbers": atomic_numbers, 91 | } 92 | write_group(dict_, subgrp) 93 | 94 | 95 | def single_molecule(molecule, subgrp): 96 | dict_ = { 97 | "forces": np.expand_dims(molecule.forces, 0), 98 | "positions": np.expand_dims(molecule.positions, 0), 99 | "energy": np.expand_dims(molecule.energy, 0), 100 | "atomic_numbers": molecule.numbers, 101 | "hash": get_hash(molecule), 102 | } 103 | write_group(dict_, subgrp) 104 | 105 | 106 | def write_group(dict_, grp): 107 | grp.create_dataset("atomic_numbers", data=dict_["atomic_numbers"]) 108 | grp.create_dataset("GFN2-xTB.forces", data=dict_["forces"]) 109 | grp.create_dataset("GFN2-xTB.energy", data=dict_["energy"]) 110 | grp.create_dataset("positions", data=dict_["positions"]) 111 | 112 | if "hash" in dict_: 113 | grp.create_dataset("hash", data=dict_["hash"]) 114 | 115 | 116 | def sliced_it(n, iterable): 117 | it = iter(iterable) 118 | while True: 119 | chunk = itertools.islice(it, n) 120 | yield list(chunk) 121 | 122 | 123 | def main(args): 124 | 125 | print_args(args) 126 | 127 | input_path = args.input_path 128 | if not os.path.isfile(input_path): 129 | sys.exit(f"Error: '{input_path}' is not a file.") 130 | output_path = args.output_path 131 | fmax_threshold = args.fmax_threshold 132 | 133 | rxns = json.load(open(input_path)) 134 | h5file = h5py.File(output_path, "w") 135 | 136 | data = h5file.create_group("data") 137 | indexfile = open(output_path + ".index.json", "w") 138 | index = {} 139 | 140 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 141 | for i, path in tqdm(enumerate(rxns), total=len(rxns), desc="Compiling reactions", bar_format=bar_format, ncols=70): 142 | 143 | fmaxs_path = os.path.join(path, "fmaxs.json") 144 | db_path = os.path.join(path, "neb.db") 145 | 146 | new_rxn_name = f"rxn{str(i).zfill(4)}" 147 | write_rxn(data, fmaxs_path, db_path, new_rxn_name, fmax_threshold) 148 | index[new_rxn_name] = os.path.basename(path) 149 | 150 | json.dump(index, indexfile, indent=4) 151 | 152 | print('Compiling finished!') 153 | 154 | def print_args(args): 155 | print() 156 | print("Arguments provided:") 157 | arg_dict = vars(args) 158 | for key, value in arg_dict.items(): 159 | print(f" {key}: {value}") 160 | print() 161 | 162 | def get_parser(): 163 | parser = argparse.ArgumentParser(description="Compile filtered neb jobs to xtb h5 file.") 164 | 165 | parser.add_argument('-i', '--input_path', required=True, 166 | help="Path of reactions.json, contains all reactions that should be included in the dataset ") 167 | parser.add_argument('-o', '--output_path', required=True, 168 | help="Path to the h5 file to write to") 169 | parser.add_argument('--fmax_threshold', type=int, default=0.1, 170 | help='Fmax threshold for selecting bands') 171 | return parser 172 | 173 | if __name__ == "__main__": 174 | args = 
get_parser().parse_args() 175 | main(args) 176 | 177 | -------------------------------------------------------------------------------- /dandelion/neb/filter_neb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import shutil 5 | import argparse 6 | from collections import defaultdict 7 | 8 | from tqdm import tqdm 9 | import numpy as np 10 | from ase.io import read 11 | from ase.vibrations import Vibrations 12 | from xtb.ase.calculator import XTB 13 | 14 | def get_energy_from_xyz(file_path): 15 | """Extracts the energy of a structure from an XYZ file.""" 16 | try: 17 | atom = read(file_path) 18 | return atom.get_potential_energy() 19 | except: 20 | return None 21 | 22 | def is_valid_rxn(reactant_path, product_path, ts_path): 23 | """Check if the reaction is valid based on energy.""" 24 | 25 | reactant_energy = get_energy_from_xyz(reactant_path) 26 | product_energy = get_energy_from_xyz(product_path) 27 | ts_energy = get_energy_from_xyz(ts_path) 28 | 29 | if abs(reactant_energy - product_energy) < 5 * 0.0433634: # delta E below 5 kcal/mol 30 | return False 31 | 32 | if abs(ts_energy - reactant_energy) < 5 * 0.0433634: # reverse AE below 5 kcal/mol 33 | return False 34 | 35 | if abs(ts_energy - product_energy) < 5 * 0.0433634: # reverse AE below 5 kcal/mol 36 | return False 37 | 38 | return product_energy != ts_energy 39 | 40 | 41 | def is_transition_state(ts_file_path, threshold=50): #cm-1 42 | struc = read(ts_file_path) 43 | struc.calc = XTB(method="GFN2-xTB") 44 | 45 | try: 46 | vib = Vibrations(struc) 47 | vib.run() 48 | frequencies = vib.get_frequencies() 49 | vib.clean() 50 | 51 | # Filter out imaginary frequencies below the threshold 52 | significant_imaginary_freqs = np.count_nonzero(np.abs(np.imag(frequencies)) > threshold) 53 | 54 | return significant_imaginary_freqs == 1 55 | except: 56 | return False 57 | 58 | def main(args): 59 | 60 | print_args(args) 61 | 62 | input_path = args.input_path 63 | if not os.path.isdir(input_path): 64 | sys.exit(f"Error: '{input_path}' is not a directory.") 65 | output_path = args.output_path 66 | if not os.path.exists(output_path): 67 | os.mkdir(output_path) 68 | 69 | grown_seeds = [dirpath for dirpath, _, filenames in os.walk(input_path) if "converged" in filenames] 70 | grown_seeds_copy = grown_seeds 71 | # Group by mother string 72 | grouped_seeds = defaultdict(list) 73 | for seed in grown_seeds: 74 | mother_string = os.path.basename(seed)[:-8] # gsmGeom-m1-i1-c1-opt-gsm0044 -> gsmGeom-m1-i1-c1-opt 75 | grouped_seeds[mother_string].append(seed) 76 | rxn_list = [] 77 | 78 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 79 | for mother_string, seeds in tqdm(grouped_seeds.items(), desc="Mothers", position=0, bar_format=bar_format, ncols=70): 80 | idx = 0 81 | for f in tqdm(seeds, desc=f"Rxns in {mother_string}", position=1, bar_format=bar_format, ncols=70, leave=False): 82 | 83 | ts_file_path = os.path.join(f, 'transition_state.xyz') 84 | reactant_path = os.path.join(f, 'reactant.xyz') 85 | product_path = os.path.join(f, 'product.xyz') 86 | 87 | if not is_valid_rxn(reactant_path, product_path, ts_file_path): 88 | continue 89 | 90 | if not is_transition_state(ts_file_path): 91 | # print(f"Directory {f} is not a valid reaction. 
Skipping...") 92 | continue 93 | 94 | # If True, copy the directory 95 | new_name = os.path.join(output_path, f'{mother_string}-rxn{idx:04}') 96 | shutil.copytree(f, new_name) 97 | rxn_list.append(new_name) 98 | idx += 1 99 | 100 | with open(os.path.join(output_path, 'reactions.json'), 'w') as f: 101 | json.dump(rxn_list, f, indent=4) 102 | 103 | print(f'\n{len(rxn_list)}/{len(grown_seeds_copy)} rxns were saved to {output_path}/reactions.json') 104 | print('Filtering NEB finished!') 105 | 106 | 107 | def print_args(args): 108 | print() 109 | print("Arguments provided:") 110 | arg_dict = vars(args) 111 | for key, value in arg_dict.items(): 112 | print(f" {key}: {value}") 113 | print() 114 | 115 | def get_parser(): 116 | parser = argparse.ArgumentParser(description='Filter neb jobs and make reactions.json') 117 | 118 | parser.add_argument('-i', '--input_path', required=True, 119 | help='Input path of finished neb jobs') 120 | parser.add_argument('-o', '--output_path', required=True, 121 | help='Output path of filtered neb jobs') 122 | 123 | return parser 124 | 125 | 126 | if __name__ == "__main__": 127 | args = get_parser().parse_args() 128 | main(args) 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /dandelion/neb/run_neb.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import json 5 | import argparse 6 | from functools import partial 7 | from concurrent.futures import ProcessPoolExecutor 8 | 9 | import uuid 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | import matplotlib 14 | import matplotlib.pyplot as plt 15 | import imageio.v2 as imageio 16 | from PIL import Image, ImageOps 17 | 18 | import ase.db 19 | from ase.io import read, write 20 | from xtb.ase.calculator import XTB 21 | from ase.optimize.bfgs import BFGS 22 | from ase.utils.forcecurve import fit_images 23 | from ase.neb import NEB, NEBOptimizer, NEBTools 24 | from ase.calculators.orca import ORCA 25 | 26 | class SuppressStderr: 27 | def __enter__(self): 28 | self._original_stderr = sys.stderr 29 | sys.stderr = open(os.devnull, 'w') 30 | 31 | def __exit__(self, exc_type, exc_val, exc_tb): 32 | sys.stderr.close() 33 | sys.stderr = self._original_stderr 34 | 35 | def plot_mep(fit_list): 36 | fit_list[:,1,:] *= 23.0609 #to kcal/mol 37 | gray_scale = matplotlib.colormaps.get('binary', len(fit_list)) 38 | fig, ax = plt.subplots() 39 | for i in range(len(fit_list)): 40 | 41 | if i+1 == len(fit_list): 42 | ax.plot(fit_list[i,0,:], fit_list[i,1,:], color='red', linewidth=3) 43 | break 44 | 45 | color = gray_scale(max(i / len(fit_list), 0.1)) 46 | ax.plot(fit_list[i,0,:], fit_list[i,1,:], color=color) 47 | 48 | ax.set_title(f'Iter {len(fit_list)}') 49 | ax.set_axisbelow(True) 50 | ax.set_ylabel("Energy [kcal/mol]") 51 | ax.set_xlabel("Reaction Coordinate [AA]") 52 | return fig 53 | 54 | def get_fit(neb_tools): 55 | fit = fit_images(neb_tools.images) 56 | return fit.fit_path, fit.fit_energies 57 | 58 | class CalculationChecker: 59 | def __init__(self, neb): 60 | self.neb = neb 61 | 62 | def check_calculations(self): 63 | missing_calculations = [] 64 | for i, image in enumerate(self.neb.images[1:-1]): 65 | if {"forces", "energy"} - image.calc.results.keys(): 66 | missing_calculations.append(i) 67 | 68 | if missing_calculations: 69 | raise ValueError(f"missing calculation for image(s) {missing_calculations}") 70 | 71 | 72 | class DBWriter: 73 | def __init__(self, db_path, atomss): 74 | 
self.atomss = atomss 75 | self.db_path = db_path 76 | 77 | def write(self): 78 | with ase.db.connect(self.db_path) as db: 79 | for atoms in self.atomss: 80 | if atoms.calc.results: 81 | db.write(atoms, data=atoms.calc.results) 82 | 83 | 84 | def interpolate_band(atom_configs, transition_state=None): 85 | if transition_state: 86 | transition_state = read(transition_state) 87 | ts_positions = transition_state.get_positions() 88 | middle_idx = len(atom_configs) // 2 89 | atom_configs[middle_idx].set_positions(ts_positions) 90 | first_band = NEB(atom_configs[: middle_idx + 1]) 91 | second_band = NEB(atom_configs[middle_idx:]) 92 | first_band.interpolate("idpp") 93 | second_band.interpolate("idpp") 94 | else: 95 | band = NEB(atom_configs) 96 | band.interpolate("idpp") 97 | return atom_configs 98 | 99 | 100 | def max_dimensions(frames): 101 | """Get the maximum width and height among a list of images.""" 102 | max_width = max_height = 0 103 | for frame in frames: 104 | with Image.open(frame) as img: 105 | width, height = img.size 106 | max_width = max(max_width, width) 107 | max_height = max(max_height, height) 108 | return max_width, max_height 109 | 110 | def pad_image(image_path, target_size): 111 | """Pad an image to the target size.""" 112 | with Image.open(image_path) as img: 113 | img = ImageOps.expand(img, border=((target_size[0]-img.size[0])//2, 114 | (target_size[1]-img.size[1])//2, 115 | (target_size[0]-img.size[0]+1)//2, 116 | (target_size[1]-img.size[1]+1)//2), 117 | fill='white') # or another suitable color for your images 118 | return img 119 | 120 | def frames_to_gif(frames, output_gif): 121 | # First, render each Atoms frame to an image 122 | image_paths = [] 123 | for i, frame in enumerate(frames): 124 | img_path = f"tmp_frame_{i}_{uuid.uuid4()}.png" 125 | write(img_path, frame) 126 | image_paths.append(img_path) 127 | 128 | # Determine the max dimensions 129 | max_width, max_height = max_dimensions(image_paths) 130 | 131 | # Create a list to store processed frames 132 | processed_frames = [] 133 | 134 | # Pad each frame, ensuring a non-transparent background 135 | for img_path in image_paths: 136 | with Image.open(img_path) as opened_img: 137 | padded_frame = pad_image(img_path, (max_width, max_height)) 138 | 139 | # Create a white background and paste the frame onto it to ensure non-transparency 140 | background = Image.new('RGB', padded_frame.size, (255, 255, 255)) 141 | background.paste(padded_frame, mask=(padded_frame.split()[3] if len(padded_frame.split()) == 4 else None)) 142 | processed_frames.append(np.array(background)) 143 | 144 | # Extend the list of processed frames with a reversed copy (excluding the last frame) 145 | extended_frames = processed_frames + processed_frames[-2::-1] 146 | 147 | # Save the gif using imageio 148 | with imageio.get_writer(output_gif, mode='I', duration=0.5) as writer: 149 | for processed_frame in extended_frames: 150 | writer.append_data(processed_frame) 151 | 152 | # Cleanup the temporary image files 153 | for img_path in image_paths: 154 | os.remove(img_path) 155 | 156 | 157 | def process_seed(seed, n_images, neb_fmax, cineb_fmax, steps, output_path): 158 | 159 | with SuppressStderr(): # xTB is so noisy when not converged 160 | try: 161 | #print(f"Starting from seed : {seed}") 162 | reactant = os.path.join(seed, 'reactant.xyz') 163 | product = os.path.join(seed, 'product.xyz') 164 | transition_state = os.path.join(seed, 'ts.xyz') 165 | product = read(product) 166 | reactant = read(reactant) 167 | 168 | output = 
os.path.join(output_path, seed.split('/')[-2]+'-'+seed.split('/')[-1]) 169 | os.makedirs(output, exist_ok=True) 170 | atom_configs = [reactant.copy() for i in range(n_images - 1)] + [product] 171 | 172 | for i, atom_config in enumerate(atom_configs): 173 | atom_config.calc = XTB(method='GFN2-xTB') 174 | 175 | #print("Relaxing endpoints ... ") 176 | BFGS(atom_configs[0], logfile=None).run() 177 | BFGS(atom_configs[-1], logfile=None).run() 178 | 179 | #print("Interpolating band ... ") 180 | interpolate_band(atom_configs, transition_state) 181 | 182 | #print("Running NEB ... ") 183 | neb = NEB(atom_configs, climb=True, parallel=False) 184 | calculation_checker = CalculationChecker(neb) 185 | neb_tools = NEBTools(neb.images) 186 | 187 | relax_neb = NEBOptimizer(neb, logfile=None) 188 | db_writer = DBWriter(os.path.join(output, "neb.db"), atom_configs) 189 | fmaxs = [] 190 | fit_list = [] 191 | relax_neb.attach(calculation_checker.check_calculations) 192 | relax_neb.attach(db_writer.write) 193 | relax_neb.attach(lambda: fmaxs.append(neb_tools.get_fmax())) 194 | relax_neb.attach(lambda: fit_list.append(get_fit(neb_tools))) 195 | 196 | converged = relax_neb.run(fmax=neb_fmax, steps=steps) 197 | 198 | if not converged: 199 | raise 200 | 201 | #print("NEB has converged, turn on CI-NEB ...") 202 | neb.climb = True 203 | ci_converged = relax_neb.run(fmax=cineb_fmax, steps=steps) 204 | 205 | if ci_converged: 206 | open(os.path.join(output, "converged"), "w") 207 | #print("Reaction converged ... ") 208 | fit_list = np.array(fit_list) 209 | fig = plot_mep(fit_list) 210 | if ci_converged: 211 | np.save(os.path.join(output, "fitlist.npy"), fit_list) 212 | 213 | fig.savefig(os.path.join(output, "mep.png")) 214 | json.dump(fmaxs, open(os.path.join(output, "fmaxs.json"), "w"), indent=4) 215 | transition_state = max(atom_configs, key=lambda x: x.get_potential_energy()) 216 | write(os.path.join(output, "transition_state.xyz"), transition_state) 217 | write(os.path.join(output, "transition_state.png"), transition_state) 218 | write(os.path.join(output, "reactant.xyz"), atom_configs[0]) 219 | write(os.path.join(output, "reactant.png"), atom_configs[0]) 220 | write(os.path.join(output, "product.xyz"), atom_configs[-1]) 221 | write(os.path.join(output, "product.png"), atom_configs[-1]) 222 | write(os.path.join(output, "mep.xyz"), atom_configs) 223 | frames_to_gif(atom_configs, os.path.join(output, "mep.gif")) 224 | 225 | return seed 226 | 227 | except Exception as e: 228 | #print(f"Error processing seed {seed}: {e}") 229 | return None 230 | 231 | def main(args): 232 | 233 | print_args(args) 234 | 235 | input_path = args.input_path 236 | if not os.path.isdir(input_path): 237 | sys.exit(f"Error: '{input_path}' is not a directory.") 238 | output_path = args.output_path 239 | if not os.path.exists(output_path): 240 | os.mkdir(output_path) 241 | max_workers = args.max_workers 242 | n_images = args.n_images 243 | neb_fmax = args.neb_fmax 244 | cineb_fmax = args.cineb_fmax 245 | steps = args.steps 246 | 247 | 248 | seeds = [dirpath for dirpath, _, filenames in os.walk(input_path) if "ts.png" in filenames] 249 | 250 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 251 | # Use a partial function to pass the extra arguments to process_seed 252 | process_with_args = partial(process_seed, n_images=n_images, neb_fmax=neb_fmax, 253 | cineb_fmax=cineb_fmax, steps=steps, output_path=output_path) 254 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 255 | results = 
list(tqdm(executor.map(process_with_args, seeds), 256 | desc='Seeds', total=len(seeds), smoothing=0, bar_format=bar_format, ncols=70)) 257 | 258 | print('xTB-NEB completed!') 259 | 260 | def print_args(args): 261 | print() 262 | print("Arguments provided:") 263 | arg_dict = vars(args) 264 | for key, value in arg_dict.items(): 265 | print(f" {key}: {value}") 266 | print() 267 | 268 | 269 | def get_parser(): 270 | parser = argparse.ArgumentParser(description="Run NEB calculations on filtered gsm jobs") 271 | 272 | parser.add_argument('-i', '--input_path', type=str, required=True, 273 | help='Path of input directory containing filtered gsm jobs.') 274 | parser.add_argument('-o', '--output_path', type=str, required=True, 275 | help='Path of output directory to store results.') 276 | parser.add_argument('-n', '--max_workers', type=int, default=1, 277 | help='Number of processes to use for parallel execution.') 278 | parser.add_argument('--n_images', type=int, default=10, 279 | help='Number of images for NEB.') 280 | parser.add_argument('--neb_fmax', type=float, default=0.5, 281 | help='Fmax threshold for NEB.') 282 | parser.add_argument('--cineb_fmax', type=float, default=0.05, 283 | help='Fmax threshold for CI-NEB.') 284 | parser.add_argument('--steps', type=int, default=500, 285 | help='Maximum number of optimization steps.') 286 | 287 | return parser 288 | 289 | 290 | 291 | if __name__ == "__main__": 292 | args = get_parser().parse_args() 293 | main(args) 294 | -------------------------------------------------------------------------------- /dandelion/prep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/prep/__init__.py -------------------------------------------------------------------------------- /dandelion/prep/geom_opt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import argparse 5 | import warnings 6 | 7 | from ase.io import read 8 | from ase.optimize import BFGS 9 | from xtb.ase.calculator import XTB 10 | from tqdm import tqdm 11 | from concurrent.futures import ProcessPoolExecutor, as_completed 12 | 13 | def write_xyz(filename, atoms): 14 | with open(filename, 'w') as f: 15 | f.write(f"{len(atoms)}\n\n") 16 | for atom in atoms: 17 | f.write(f"{atom.symbol:<2} {atom.position[0]:15.8f} {atom.position[1]:15.8f} {atom.position[2]:15.8f}\n") 18 | 19 | def generate_eq_struc(atoms): 20 | atoms.calc = XTB(method="GFN2-xTB") 21 | with warnings.catch_warnings(): 22 | warnings.simplefilter("ignore") 23 | opt = BFGS(atoms, logfile=None) 24 | opt.run(fmax=1e-4) 25 | return atoms 26 | 27 | def process_file(input_file, output_dir): 28 | filename = os.path.basename(input_file) 29 | mol_dir = os.path.join(output_dir, os.path.splitext(filename)[0]) 30 | os.makedirs(mol_dir, exist_ok=True) 31 | 32 | # Copy original file 33 | shutil.copy(input_file, mol_dir) 34 | 35 | # Generate and save optimized structure 36 | atoms = read(input_file) 37 | optimized_atoms = generate_eq_struc(atoms) 38 | write_xyz(os.path.join(mol_dir, 'struc.xyz'), optimized_atoms) 39 | 40 | # Remove the original copied file 41 | os.remove(os.path.join(mol_dir, filename)) 42 | 43 | def main(args): 44 | print_args(args) 45 | 46 | input_path = os.path.abspath(args.input_path) 47 | if not os.path.isdir(input_path): 48 | sys.exit(f"Error: '{input_path}' is not a directory.") 49 | output_path = 
os.path.abspath(args.output_path) 50 | max_workers = args.max_workers 51 | 52 | # Get list of all .xyz files 53 | xyz_files = [] 54 | for root, _, files in os.walk(input_path): 55 | xyz_files.extend([os.path.join(root, f) for f in files if f.endswith('.xyz')]) 56 | 57 | # Process files in parallel with progress bar 58 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 59 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 60 | list(tqdm(executor.map(process_file, xyz_files, [output_path]*len(xyz_files)), 61 | total=len(xyz_files), desc="Optimizing structures", smoothing=0, bar_format=bar_format, ncols=70)) 62 | 63 | def print_args(args): 64 | print("\nArguments provided:") 65 | for key, value in vars(args).items(): 66 | print(f" {key}: {value}") 67 | print() 68 | 69 | def get_parser(): 70 | parser = argparse.ArgumentParser(description="Optimize geometries using xTB") 71 | parser.add_argument('-i', '--input_path', required=True, 72 | help="Path of the input reactants directory") 73 | parser.add_argument('-o', '--output_path', required=True, 74 | help='Path of output directory to store optimized geometries') 75 | parser.add_argument('-n', '--max_workers', type=int, default=1, 76 | help='Number of processes to use for parallel execution.') 77 | return parser 78 | 79 | if __name__ == "__main__": 80 | args = get_parser().parse_args() 81 | main(args) -------------------------------------------------------------------------------- /dandelion/prep/smiles_to_isoconfs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import subprocess 5 | 6 | from rdkit import Chem 7 | from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions 8 | 9 | def obabel_command(input_data, input_format, output_str, options=[], output_path=None): 10 | cmd = ['obabel', '-i', input_format] + input_data + ['-O', output_str] + options 11 | full_output_path = os.path.join(output_path, output_str) if output_path else output_str 12 | subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=output_path) 13 | return full_output_path 14 | 15 | def obabel_from_smiles(smiles_str, output_str, options=[], output_path=None): 16 | cmd = ['obabel', '-ismi', '-', '-O', output_str] + options 17 | full_output_path = os.path.join(output_path, output_str) if output_path else output_str 18 | process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=output_path) 19 | process.communicate(input=smiles_str.encode()) 20 | return full_output_path 21 | 22 | def cleanup_files(output_path, files_to_remove): 23 | for file in files_to_remove: 24 | file_path = os.path.join(output_path, file) 25 | if os.path.exists(file_path): 26 | os.remove(file_path) 27 | 28 | def main(args): 29 | print_args(args) 30 | 31 | input_path = os.path.abspath(args.input_path) 32 | if not os.path.isfile(input_path): 33 | sys.exit(f"Error: '{input_path}' is not a file.") 34 | output_path = os.path.abspath(args.output_path) 35 | 36 | if not os.path.exists(output_path): 37 | os.makedirs(output_path) 38 | 39 | with open(input_path, 'r') as f: 40 | lines = f.readlines() 41 | lines = list(map(lambda s: s.strip(), lines)) 42 | 43 | for m, mol_smi in enumerate(lines): 44 | print(f'==={m+1}th molecules : {mol_smi} ') 45 | mol = Chem.MolFromSmiles(mol_smi) 46 | opts = StereoEnumerationOptions(tryEmbedding=True, unique=True) 47 | isomers = 
tuple(EnumerateStereoisomers(mol, options=opts)) 48 | for i, isomer_smi in enumerate(Chem.MolToSmiles(x, isomericSmiles=True) for x in isomers): 49 | print(f'-{i+1}th isomer : {isomer_smi}') 50 | 51 | gen3d_file = obabel_from_smiles(isomer_smi, 'gen3d.xyz', ['--gen3d'], output_path=output_path) 52 | confab_file = obabel_command([os.path.basename(gen3d_file)], 'xyz', 'confab.sdf', ['--confab', '--rcutoff', '1.0'], output_path=output_path) 53 | obabel_command([os.path.basename(confab_file)], 'sdf', f'm{m+1}-i{i+1}-c.xyz', ['-m'], output_path=output_path) 54 | 55 | cleanup_files(output_path, ['confab.sdf', 'gen3d.xyz']) 56 | 57 | 58 | def print_args(args): 59 | print() 60 | print("Arguments provided:") 61 | arg_dict = vars(args) 62 | for key, value in arg_dict.items(): 63 | print(f" {key}: {value}") 64 | print() 65 | 66 | def get_parser(): 67 | parser = argparse.ArgumentParser(description="Generate Iso/Conformers from SMILES using RDkit and Obabel") 68 | 69 | parser.add_argument('-i', '--input_path', required=True, 70 | help="Path of the input SMILES string file") 71 | parser.add_argument('-o', '--output_path', type=str, required=True, 72 | help='Path of output directory to store Iso/Conformers.') 73 | return parser 74 | 75 | if __name__ == "__main__": 76 | args = get_parser().parse_args() 77 | main(args) -------------------------------------------------------------------------------- /dandelion/refine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/refine/__init__.py -------------------------------------------------------------------------------- /dandelion/refine/compile_refined.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | 5 | import h5py 6 | from ase.db import connect 7 | 8 | 9 | def main(args): 10 | 11 | print_args(args) 12 | 13 | input_path = args.input_path 14 | if not os.path.isfile(input_path): 15 | sys.exit(f"Error: '{input_path}' is not a file.") 16 | output_path = args.output_path 17 | 18 | # Data structure to hold the computed results 19 | rxn_data = {} 20 | 21 | rows = [] # List to store all rows 22 | 23 | # Extract data from ASE database 24 | with connect(input_path) as db: 25 | for row in db.select(): 26 | if hasattr(row, 'energy') and hasattr(row, 'forces'): 27 | rows.append(row) 28 | 29 | # Sort rows based on the unique_id number 30 | rows.sort(key=lambda r: int(r.data['unique_id'].split('_')[-1])) 31 | 32 | # Process sorted rows 33 | for row in rows: 34 | # Extract unique_id and other data 35 | unique_id = row.data['unique_id'] 36 | chem_group_name, rxn_group_name, index = unique_id.split('_') 37 | 38 | if chem_group_name not in rxn_data: 39 | rxn_data[chem_group_name] = {} 40 | 41 | if rxn_group_name not in rxn_data[chem_group_name]: 42 | rxn_data[chem_group_name][rxn_group_name] = { 43 | 'atomic_numbers': row.toatoms().numbers, 44 | 'energies': [], 45 | 'forces': [], 46 | 'positions': [] 47 | } 48 | rxn_data[chem_group_name][rxn_group_name]['energies'].append(row.energy) 49 | rxn_data[chem_group_name][rxn_group_name]['forces'].append(row.forces) 50 | rxn_data[chem_group_name][rxn_group_name]['positions'].append(row.toatoms().positions) 51 | 52 | # Save the data to an h5 file 53 | with h5py.File(output_path, 'w') as h5file: 54 | # Ensure the 'data' group exists 55 | if 'data' not in h5file: 56 | data_group = h5file.create_group('data') 57 | 
else: 58 | data_group = h5file['data'] 59 | 60 | # Iterate through the rxn_data dictionary to save datasets 61 | for chem_group_name in rxn_data: 62 | if chem_group_name not in data_group: 63 | chem_group = data_group.create_group(chem_group_name) 64 | else: 65 | chem_group = data_group[chem_group_name] 66 | 67 | for rxn_group_name, rxn_entry in rxn_data[chem_group_name].items(): 68 | if rxn_group_name not in chem_group: 69 | rxn_group = chem_group.create_group(rxn_group_name) 70 | else: 71 | rxn_group = chem_group[rxn_group_name] 72 | 73 | # Add datasets to the reaction group 74 | rxn_group.create_dataset('atomic_numbers', data=rxn_entry['atomic_numbers']) 75 | rxn_group.create_dataset('wB97x_6-31G(d).energy', data=rxn_entry['energies']) 76 | rxn_group.create_dataset('wB97x_6-31G(d).forces', data=rxn_entry['forces']) 77 | rxn_group.create_dataset('positions', data=rxn_entry['positions']) 78 | 79 | print('Compiled successfully!') 80 | 81 | def print_args(args): 82 | print() 83 | print("Arguments provided:") 84 | arg_dict = vars(args) 85 | for key, value in arg_dict.items(): 86 | print(f" {key}: {value}") 87 | print() 88 | 89 | def get_parser(): 90 | parser = argparse.ArgumentParser(description="Translate ase db file into hdf5 file.") 91 | 92 | parser.add_argument('-i', '--input_path', required=True, 93 | help="Path of the input wB97X ASE db file") 94 | parser.add_argument('-o', '--output_path', required=True, 95 | help="Path of the output wB97X hdf5 file") 96 | 97 | return parser 98 | 99 | if __name__ == "__main__": 100 | args = get_parser().parse_args() 101 | main(args) 102 | 103 | 104 | -------------------------------------------------------------------------------- /dandelion/refine/refine_forces.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | import argparse 5 | from itertools import repeat 6 | from concurrent.futures import ProcessPoolExecutor, as_completed 7 | 8 | import h5py 9 | from tqdm import tqdm 10 | from ase import Atoms 11 | from ase.db import connect 12 | from ase.calculators.orca import ORCA 13 | 14 | 15 | # thank you for https://github.com/ZimmermanGroup/ORCA-Basis-Sets 16 | custom_basis = ''' 17 | %basis 18 | newgto Br 19 | S 6 20 | 1 0.1137182000E+06 0.1717696000E-02 21 | 2 0.1707444000E+05 0.1316744000E-01 22 | 3 0.3889576000E+04 0.6504553000E-01 23 | 4 0.1097096000E+04 0.2269505000E+00 24 | 5 0.3520624000E+03 0.4768357000E+00 25 | 6 0.1207002000E+03 0.3583677000E+00 26 | S 6 27 | 1 0.2471138000E+04 0.2243687000E-02 28 | 2 0.5893838000E+03 0.2994853000E-01 29 | 3 0.1918738000E+03 0.1256009000E+00 30 | 4 0.7295339000E+02 -0.9832786000E-03 31 | 5 0.3005839000E+02 -0.6013141000E+00 32 | 6 0.1252927000E+02 -0.4913983000E+00 33 | P 6 34 | 1 0.2471138000E+04 0.3790182000E-02 35 | 2 0.5893838000E+03 0.2995979000E-01 36 | 3 0.1918738000E+03 0.1318228000E+00 37 | 4 0.7295339000E+02 0.3432708000E+00 38 | 5 0.3005839000E+02 0.4642345000E+00 39 | 6 0.1252927000E+02 0.2079387000E+00 40 | S 6 41 | 1 0.1096411000E+03 -0.5975683000E-02 42 | 2 0.3858948000E+02 0.5542122000E-01 43 | 3 0.1637818000E+02 0.2681200000E+00 44 | 4 0.7221836000E+01 -0.1543606000E+00 45 | 5 0.3263697000E+01 -0.7206306000E+00 46 | 6 0.1465499000E+01 -0.3316437000E+00 47 | P 6 48 | 1 0.1096411000E+03 -0.6907483000E-02 49 | 2 0.3858948000E+02 -0.3041432000E-01 50 | 3 0.1637818000E+02 0.4602725000E-01 51 | 4 0.7221836000E+01 0.3650689000E+00 52 | 5 0.3263697000E+01 0.4949232000E+00 53 | 6 0.1465499000E+01 0.2090394000E+00 54 | S 
3 55 | 1 0.2103651000E+01 0.3029029000E+00 56 | 2 0.7547050000E+00 -0.2152659000E+00 57 | 3 0.3005140000E+00 -0.9633941000E+00 58 | P 3 59 | 1 0.2103651000E+01 -0.2826714000E-01 60 | 2 0.7547050000E+00 0.3503065000E+00 61 | 3 0.3005140000E+00 0.7182446000E+00 62 | S 1 63 | 1 0.1090710000E+00 0.1000000000E+01 64 | P 1 65 | 1 0.1090710000E+00 0.1000000000E+01 66 | D 3 67 | 1 0.6225514000E+02 0.7704229000E-01 68 | 2 0.1731284000E+02 0.3707384000E+00 69 | 3 0.5607915000E+01 0.7097628000E+00 70 | D 1 71 | 1 0.1746486000E+01 1.0000000 72 | end 73 | end 74 | ''' 75 | 76 | 77 | 78 | class tqdm_hour(tqdm): 79 | """Provides an `hours per iteration` format parameter.""" 80 | @property 81 | def format_dict(self): 82 | d = super(tqdm_hour, self).format_dict 83 | rate_hr = '{:.1f}'.format(1/d["rate"] / 3600) if d["rate"] else '?' 84 | d.update(rate_hr=(rate_hr + ' hour/' + d['unit'])) 85 | return d 86 | 87 | class tqdm_minute(tqdm): 88 | """Provides a `minutes per iteration` format parameter""" 89 | @property 90 | def format_dict(self): 91 | d = super(tqdm_minute, self).format_dict 92 | rate_min = '{:.0f}'.format(1/d["rate"] / 60) if d["rate"] else '?' 93 | d.update(rate_min=(rate_min + ' min/' + d['unit'])) 94 | return d 95 | 96 | bar_format_hr = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_hr}{postfix}]' 97 | bar_format_min = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_min}{postfix}]' 98 | bar_format_points = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 99 | 100 | def get_unique_ids_from_db(output_path): 101 | """Extract all unique IDs from the ASE database.""" 102 | unique_ids = set() 103 | with connect(output_path) as db: 104 | for row in db.select(): 105 | data = row.data 106 | if "unique_id" in data: 107 | unique_ids.add(data['unique_id']) 108 | return unique_ids 109 | 110 | def already_calculated(unique_id, unique_id_list): 111 | """Check if a unique ID has already been processed.""" 112 | return unique_id in unique_id_list 113 | 114 | def compute_force(coord, atomic_numbers, unique_id, output_path): 115 | """Compute forces using ORCA for a given set of coordinates.""" 116 | atoms = Atoms(positions=coord, numbers=atomic_numbers) 117 | atoms.calc = ORCA( 118 | label=os.path.join(os.path.dirname(output_path), f"orca/{unique_id}/{unique_id}"), 119 | orcasimpleinput="wB97X 6-31G(d) NoTrah", 120 | orcablocks=custom_basis 121 | ) 122 | try: 123 | # Forces and energy will be stored in the calculator of the Atoms object. 
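        # Calling get_forces() below triggers a single-point ORCA run: the ASE
        # ORCA calculator writes an input file from orcasimpleinput/orcablocks,
        # executes the command given by the ASE_ORCA_COMMAND environment
        # variable (set in main()), and parses the resulting output/engrad
        # files. Energy and forces are then cached on atoms.calc, so the Atoms
        # object returned here can later be written to the ASE database and
        # expose row.energy / row.forces without re-running the calculation.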
124 | atoms.get_forces() 125 | return atoms 126 | except Exception as e: 127 | # Log the error 128 | logging.error(f"Error in computing forces for unique_id {unique_id}: {e}") 129 | return None 130 | 131 | def accumulate_files_for_deletion(unique_id, output_path, files_to_delete, file_exts=['gbw', 'engrad', 'densities', 'ase']): 132 | for ext in file_exts: 133 | file_path = os.path.join(os.path.dirname(output_path), f"orca/{unique_id}/{unique_id}.{ext}") 134 | if os.path.exists(file_path): 135 | files_to_delete.add(file_path) 136 | 137 | def main(args): 138 | """Main function to orchestrate the computations and database writing.""" 139 | print_args(args) 140 | 141 | input_path = args.input_path 142 | if not os.path.isfile(input_path): 143 | sys.exit(f"Error: '{input_path}' is not a file.") 144 | output_path = args.output_path 145 | max_workers = args.max_workers 146 | orcabinary = args.orca 147 | 148 | os.environ["ASE_ORCA_COMMAND"] = f"{orcabinary} PREFIX.inp > PREFIX.out 2>&1" 149 | 150 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 151 | 152 | log_file_path = os.path.join(os.path.dirname(output_path), 'orca_errors.log') 153 | logging.basicConfig(filename=log_file_path, level=logging.ERROR, 154 | format='%(asctime)s %(levelname)s: %(message)s', 155 | datefmt='%Y-%m-%d %H:%M:%S') 156 | 157 | if os.path.isfile(output_path): 158 | print(f'Restarting calculation from {output_path}') 159 | is_restart = True 160 | else: 161 | print(f'Created db file at {output_path}\n') 162 | is_restart = False 163 | 164 | 165 | unique_ids_from_db = get_unique_ids_from_db(output_path) 166 | if is_restart: 167 | print(f'{len(unique_ids_from_db)} points are skipped.\n') 168 | 169 | files_to_delete = set() # Set to accumulate files for deletion 170 | 171 | # Read from the input HDF5 file and compute the energies and forces. 172 | with h5py.File(input_path, 'r') as f: 173 | 174 | for chem_group_name, chem_group in tqdm_hour(f['data'].items(), 175 | desc="Formulas", 176 | position=0, 177 | smoothing=1, 178 | bar_format=bar_format_hr, 179 | ncols=70): 180 | 181 | for rxn_group_name, rxn_group in tqdm_minute(chem_group.items(), 182 | desc=f"Rxns in {chem_group_name}", 183 | leave=False, 184 | position=1, 185 | smoothing=1, 186 | bar_format=bar_format_min, 187 | ncols=70): 188 | 189 | positions_dataset = rxn_group['positions'] 190 | coords = [coord for coord in positions_dataset] 191 | atomic_numbers = rxn_group['atomic_numbers'][:] 192 | args_atomic_numbers = repeat(atomic_numbers, len(coords)) 193 | unique_ids = [f"{chem_group_name}_{rxn_group_name}_{index}" for index, _ in enumerate(positions_dataset)] 194 | 195 | # Parallel computation using ProcessPoolExecutor. 196 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 197 | future_to_unique_id = {executor.submit(compute_force, coord, atomic_number, unique_id, output_path): unique_id 198 | for coord, atomic_number, unique_id in zip(coords, args_atomic_numbers, unique_ids) 199 | if not already_calculated(unique_id, unique_ids_from_db)} 200 | 201 | batch_size = max_workers # Batch size set to the number of workers 202 | results_batch = [] 203 | 204 | # Process the completed tasks. 
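                # Futures are consumed in completion order (not submission order).
                # Each finished Atoms object is buffered together with its
                # unique_id and flushed to the ASE database once the buffer
                # reaches batch_size (= max_workers), keeping db transactions
                # infrequent. Storing unique_id in row.data is what lets a
                # restarted run skip finished points via
                # get_unique_ids_from_db() / already_calculated().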
205 | for future in tqdm(as_completed(future_to_unique_id), 206 | total=len(future_to_unique_id), 207 | desc=f"Samples in {rxn_group_name}", 208 | leave=False, 209 | position=2, 210 | smoothing=0, 211 | bar_format=bar_format_points, 212 | ncols=70): 213 | 214 | unique_id = future_to_unique_id[future] 215 | atoms_result = future.result() # Finished ASE Atoms object 216 | if atoms_result is not None: 217 | results_batch.append((atoms_result, {'unique_id': unique_id})) 218 | accumulate_files_for_deletion(unique_id, output_path, files_to_delete) 219 | 220 | # Write to database in batches 221 | if len(results_batch) >= batch_size: 222 | with connect(output_path) as db: 223 | for atoms, data in results_batch: 224 | db.write(atoms, data=data) 225 | results_batch.clear() 226 | 227 | # Write any remaining results in the batch 228 | for atoms, data in results_batch: 229 | with connect(output_path) as db: 230 | db.write(atoms, data=data) 231 | results_batch.clear() 232 | 233 | for file_path in files_to_delete: 234 | os.remove(file_path) 235 | 236 | print('wB97X calculation finished!') 237 | 238 | def print_args(args): 239 | print() 240 | print("Arguments provided:") 241 | arg_dict = vars(args) 242 | for key, value in arg_dict.items(): 243 | print(f" {key}: {value}") 244 | print() 245 | 246 | def get_parser(): 247 | parser = argparse.ArgumentParser(description="Compute energies and forces and store in ASE database") 248 | 249 | parser.add_argument('-i', '--input_path', required=True, 250 | help="Path of the input XTB HDF5 file") 251 | parser.add_argument('-o', '--output_path', required=True, 252 | help="Path of the output wB97X ASE database") 253 | parser.add_argument('-n', '--max_workers', type=int, default=1, 254 | help="Number of worker processes") 255 | parser.add_argument('--orca', required=True, 256 | help="Path of the orca binary file") 257 | 258 | return parser 259 | 260 | if __name__ == "__main__": 261 | args = get_parser().parse_args() 262 | main(args) 263 | 264 | -------------------------------------------------------------------------------- /dandelion/segsm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/segsm/__init__.py -------------------------------------------------------------------------------- /dandelion/segsm/ard_gsm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/segsm/ard_gsm/__init__.py -------------------------------------------------------------------------------- /dandelion/segsm/ard_gsm/driving_coords.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import itertools 5 | 6 | from .mol import Connection 7 | from .limits import connection_limits 8 | 9 | 10 | class ConnectionError(Exception): 11 | """ 12 | For any invalid connection changes that occur in MolGraph. 
13 | """ 14 | pass 15 | 16 | 17 | class DrivingCoords(object): 18 | def __init__(self, break_idxs=None, form_idxs=None): 19 | self._break_idxs = break_idxs or set() 20 | self._form_idxs = form_idxs or set() 21 | 22 | self.remove_duplicates() 23 | 24 | def __str__(self): 25 | s = '' 26 | for idxs in self._break_idxs: 27 | s += 'BREAK {0[0]} {0[1]}\n'.format(idxs) 28 | for idxs in self._form_idxs: 29 | s += 'ADD {0[0]} {0[1]}\n'.format(idxs) 30 | return s 31 | 32 | def __eq__(self, other): 33 | return str(self) == str(other) 34 | 35 | def __ne__(self, other): 36 | return not self == other 37 | 38 | def __hash__(self): 39 | return hash(str(self)) 40 | 41 | def reconstruct_from_str(self, s): 42 | self._break_idxs = set() 43 | self._form_idxs = set() 44 | for line in s.splitlines(): 45 | if 'BREAK' in line: 46 | idxs = [int(idx) for idx in line.split()[1:]] 47 | self.add_break_idxs(idxs) 48 | elif 'ADD' in line: 49 | idxs = [int(idx) for idx in line.split()[1:]] 50 | self.add_form_idxs(idxs) 51 | 52 | def remove_duplicates(self): 53 | self._break_idxs = {tuple(sorted(idxs)) for idxs in self._break_idxs} 54 | self._form_idxs = {tuple(sorted(idxs)) for idxs in self._form_idxs} 55 | 56 | def add_break_idxs(self, idxs): 57 | self._break_idxs.add(tuple(sorted(idxs))) 58 | 59 | def add_form_idxs(self, idxs): 60 | self._form_idxs.add(tuple(sorted(idxs))) 61 | 62 | def is_subset(self, other): 63 | """ 64 | Return True if self is contained in other. 65 | """ 66 | for idxs in self._break_idxs: 67 | if idxs not in other._break_idxs: 68 | return False 69 | for idxs in self._form_idxs: 70 | if idxs not in other._form_idxs: 71 | return False 72 | return True 73 | 74 | def get_connections(self, atoms): 75 | atoms_dict = {} 76 | for atom in atoms: 77 | if atom.idx is None: 78 | raise Exception('Atom {} is missing index'.format(atom.symbol)) 79 | else: 80 | atoms_dict[atom.idx] = atom 81 | 82 | connections_break, connections_form = [], [] 83 | for idxs in self._break_idxs: 84 | connection = Connection(atoms_dict[idxs[0]], atoms_dict[idxs[1]]) 85 | connections_break.append(connection) 86 | for idxs in self._form_idxs: 87 | connection = Connection(atoms_dict[idxs[0]], atoms_dict[idxs[1]]) 88 | connections_form.append(connection) 89 | 90 | return connections_break, connections_form 91 | 92 | 93 | def generate_driving_coords(mol, maxbreak=3, maxform=3, maxchange=5, single_change=True, equiv_Hs=False, 94 | minbreak=0, minform=0, minchange=1): 95 | """ 96 | Generate the set of possible driving coordinates given a molecule. Only 97 | consider breaking a maximum of `maxbreak`, forming a maximum of `maxform`, 98 | and in total changing a maximum of `maxchange` connections (molecular 99 | bonds are considered without regard for the bond order). If `single_change` 100 | is true, consider driving coordinates for (nbreak,nform) in ((0,1),(1,0)) 101 | in addition to the other ones. If `equiv_Hs` is true, generate essentially 102 | equivalent driving coordinates for different but equivalent hydrogens, 103 | i.e., those attached to the same non-cyclic tetrahedral carbon. 104 | 105 | Can also specify minbreak, minform, and minchange. 
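
    Illustrative usage, mirroring how create_gsm.py calls this function
    (the number of seeds depends on the molecule):

        mol = MolGraph(symbols=symbols, coords=coords)
        mol.infer_connections()
        seeds = generate_driving_coords(mol, maxbreak=2, maxform=2, maxchange=3)
        # each seed is a DrivingCoords object; str(seed) gives the
        # "BREAK i j" / "ADD i j" lines written to ISOMERS.txt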
106 | """ 107 | assert all(atom.idx is not None for atom in mol.atoms) 108 | driving_coords_set = set() 109 | 110 | mol = mol.copy(deep=True) 111 | if not equiv_Hs: 112 | mol.label_equivalent_hydrogens() 113 | 114 | # Enumerate all possible connections between atoms 115 | # and remove the ones for atoms that are already connected 116 | atoms = mol.atoms 117 | connections = mol.get_all_connections() 118 | all_possible_connections = [Connection(atom1, atom2) 119 | for i, atom1 in enumerate(atoms) 120 | for atom2 in atoms[(i+1):] 121 | if not atom1.frozen and not atom2.frozen] 122 | all_potential_new_connections = [connection for connection in all_possible_connections 123 | if connection not in connections] 124 | 125 | for nbreak in range(minbreak, maxbreak+1): 126 | for nform in range(minform, maxform+1): 127 | if nbreak + nform < minchange: 128 | continue 129 | elif nbreak + nform > maxchange: 130 | continue 131 | elif not single_change and (nbreak + nform == 1): 132 | continue 133 | 134 | # Generate all possible combinations of connection changes 135 | potential_remove_connections_iter = itertools.combinations(connections, nbreak) 136 | potential_new_connections_iter = itertools.combinations(all_potential_new_connections, nform) 137 | potential_connection_changes = itertools.product(potential_remove_connections_iter, 138 | potential_new_connections_iter) 139 | 140 | for connections_to_break, connections_to_form in potential_connection_changes: 141 | try: 142 | change_connections(mol, connections_to_break, connections_to_form) 143 | except ConnectionError: 144 | continue 145 | else: 146 | break_idxs = [(c.atom1.idx, c.atom2.idx) for c in connections_to_break] 147 | form_idxs = [(c.atom1.idx, c.atom2.idx) for c in connections_to_form] 148 | driving_coords = DrivingCoords(break_idxs=break_idxs, form_idxs=form_idxs) 149 | driving_coords_set.add(driving_coords) 150 | finally: 151 | # Always restore connections for next molecule test 152 | change_connections(mol, connections_to_form, connections_to_break, test_validity=False) 153 | 154 | return driving_coords_set 155 | 156 | 157 | def change_connections(mol, connections_to_break, connections_to_form, test_validity=True): 158 | for connection in connections_to_break: 159 | mol.remove_connection(connection) 160 | for connection in connections_to_form: 161 | mol.add_connection(connection) 162 | 163 | if test_validity: 164 | # Only have to test the atoms involved in the changed connections 165 | for connection in connections_to_break: 166 | if not test_connection_validity(connection): 167 | raise ConnectionError('Breaking {} resulted in violation of connection limits'.format(connection)) 168 | for connection in connections_to_form: 169 | if not test_connection_validity(connection): 170 | raise ConnectionError('Forming {} resulted in violation of connection limits'.format(connection)) 171 | 172 | 173 | def test_connection_validity(connection): 174 | atom1 = connection.atom1 175 | atom2 = connection.atom2 176 | atom1_ll, atom1_ul = connection_limits[atom1.symbol.upper()] 177 | atom2_ll, atom2_ul = connection_limits[atom2.symbol.upper()] 178 | if len(atom1.connections) < atom1_ll or len(atom1.connections) > atom1_ul: 179 | return False 180 | elif len(atom2.connections) < atom2_ll or len(atom2.connections) > atom2_ul: 181 | return False 182 | else: 183 | return True 184 | -------------------------------------------------------------------------------- /dandelion/segsm/ard_gsm/limits.py: 
-------------------------------------------------------------------------------- 1 | # First number is the minimum number of connections; 2 | # second number is the maximum number of connections. 3 | connection_limits = { 4 | 'H': (1, 1), 5 | 'C': (2, 4), 6 | 'N': (1, 3), 7 | 'O': (1, 2), 8 | 'F': (1, 1), 9 | 'S': (1, 4), 10 | 'CL':(1, 1), 11 | 'BR':(1, 1), 12 | 'LI':(0, 1) 13 | } -------------------------------------------------------------------------------- /dandelion/segsm/ard_gsm/mol.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import os 5 | 6 | import numpy as np 7 | from openbabel import pybel 8 | from rdkit import Chem 9 | from rdkit.Chem import AllChem, GetPeriodicTable 10 | import networkx as nx 11 | _rdkit_periodic_table = GetPeriodicTable() 12 | 13 | 14 | class SanitizationError(Exception): 15 | """ 16 | Exception class to handle errors during SMILES perception. 17 | """ 18 | pass 19 | 20 | 21 | class Atom(object): 22 | """ 23 | Represents an atom in a molecular graph. 24 | """ 25 | 26 | def __init__(self, symbol=None, idx=None, coords=np.array([]), frozen=False): 27 | self.symbol = symbol 28 | self.idx = idx 29 | self.coords = coords 30 | self.frozen = frozen 31 | self.connections = {} 32 | 33 | def __str__(self): 34 | return '{}: {}'.format(self.idx, self.symbol) 35 | 36 | def __repr__(self): 37 | return ''.format(str(self)) 38 | 39 | def copy(self): 40 | return Atom( 41 | symbol=self.symbol, 42 | idx=self.idx, 43 | coords=self.coords.copy(), 44 | frozen=self.frozen, 45 | ) 46 | 47 | def get_atomicnum(self): 48 | return _rdkit_periodic_table.GetAtomicNumber(self.symbol) 49 | 50 | def get_cov_rad(self): 51 | return _rdkit_periodic_table.GetRcovalent(self.symbol) 52 | 53 | 54 | class Connection(object): 55 | """ 56 | Represents a connection in a molecular graph. 57 | 58 | Note: Equality and hash are only based on atom symbols and indices. 59 | """ 60 | 61 | def __init__(self, atom1, atom2): 62 | self._atom1 = atom1 63 | self._atom2 = atom2 64 | self._make_order_invariant() 65 | 66 | def __str__(self): 67 | return '({})--({})'.format(str(self.atom1), str(self.atom2)) 68 | 69 | def __repr__(self): 70 | return ''.format(str(self)) 71 | 72 | def __eq__(self, other): 73 | return str(self) == str(other) 74 | 75 | def __ne__(self, other): 76 | return not self == other 77 | 78 | def __hash__(self): 79 | return hash(str(self)) 80 | 81 | def _make_order_invariant(self): 82 | # Ensure that atom ordering is consistent 83 | atoms = [self._atom1, self._atom2] 84 | atoms.sort(key=lambda a: a.symbol) 85 | if self._atom1.idx is not None or self._atom2.idx is not None: 86 | atoms.sort(key=lambda a: a.idx) 87 | self._atom1, self._atom2 = atoms 88 | 89 | @property 90 | def atom1(self): 91 | return self._atom1 92 | 93 | @property 94 | def atom2(self): 95 | return self._atom2 96 | 97 | @atom1.setter 98 | def atom1(self, val): 99 | self._atom1 = val 100 | self._make_order_invariant() 101 | 102 | @atom2.setter 103 | def atom2(self, val): 104 | self._atom2 = val 105 | self._make_order_invariant() 106 | 107 | def copy(self): 108 | return Connection(self.atom1, self.atom2) 109 | 110 | 111 | class MolGraph(object): 112 | """ 113 | Class to convert coordinates to a molecular graph 114 | and to generate driving coordinates. 115 | 116 | Note: Atom indices start at 1. 
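
    Typical construction, as done in create_gsm.py and filter_gsm.py (shown
    here only as an illustration; atoms is an ASE Atoms object):

        mol = MolGraph(symbols=atoms.get_chemical_symbols(),
                       coords=atoms.get_positions())
        mol.infer_connections()          # connectivity from the 3D geometry
        smiles = mol.perceive_smiles()   # atom-mapped SMILES via Open Babel/RDKit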
117 | """ 118 | 119 | def __init__(self, atoms=None, symbols=None, coords=None, energy=None): 120 | self.atoms = atoms or [] 121 | self.energy = energy 122 | 123 | if not self.atoms and symbols is not None: 124 | for idx, symbol in enumerate(symbols): 125 | atom = Atom(symbol=symbol, idx=idx+1) 126 | self.add_atom(atom) 127 | 128 | if coords is not None: 129 | self.set_coords(coords) 130 | 131 | def __iter__(self): 132 | for atom in self.atoms: 133 | yield atom 134 | 135 | def get_formula(self): 136 | """ 137 | Return the molecular formula corresponding to the graph. 138 | """ 139 | # Count the numbers of each element 140 | elements = {} 141 | for atom in self: 142 | symbol = atom.symbol 143 | elements[symbol] = elements.get(symbol, 0) + 1 144 | 145 | # Carbon and hydrogen come first if carbon is present, other 146 | # atoms come in alphabetical order (also hydrogen if there is no 147 | # carbon) 148 | formula = '' 149 | if 'C' in elements.keys(): 150 | count = elements['C'] 151 | formula += 'C{:d}'.format(count) if count > 1 else 'C' 152 | del elements['C'] 153 | if 'H' in elements.keys(): 154 | count = elements['H'] 155 | formula += 'H{:d}'.format(count) if count > 1 else 'H' 156 | del elements['H'] 157 | keys = elements.keys() 158 | keys.sort() 159 | for key in keys: 160 | count = elements[key] 161 | formula += '{}{:d}'.format(key, count) if count > 1 else key 162 | 163 | return formula 164 | 165 | def to_rdkit_mol(self): 166 | """ 167 | Convert the graph to an RDKit molecule with atom map numbers set 168 | by the indices of the atoms. 169 | """ 170 | assert all(atom.idx is not None for atom in self) 171 | 172 | rd_mol = Chem.rdchem.EditableMol(Chem.rdchem.Mol()) 173 | for atom in self: 174 | rd_atom = Chem.rdchem.Atom(atom.symbol) 175 | rd_atom.SetAtomMapNum(atom.idx) 176 | rd_mol.AddAtom(rd_atom) 177 | 178 | for atom1 in self: 179 | for atom2, connection in atom1.connections.items(): 180 | idx1 = self.atoms.index(atom1) # This is the index in the atoms list 181 | idx2 = self.atoms.index(atom2) 182 | if idx1 < idx2: 183 | rd_mol.AddBond(idx1, idx2, Chem.rdchem.BondType.SINGLE) 184 | 185 | rd_mol = rd_mol.GetMol() 186 | return rd_mol 187 | 188 | def to_pybel_mol(self, from_coords=True): 189 | """ 190 | Convert the graph to a Pybel molecule. Currently only supports 191 | creating the molecule from 3D coordinates. 192 | """ 193 | if from_coords: 194 | xyz = self.to_xyz() 195 | mol = pybel.readstring('xyz', xyz) 196 | return mol 197 | else: 198 | raise NotImplementedError('Can only create Pybel molecules from 3D structure') 199 | 200 | def to_xyz(self, comment=''): 201 | """ 202 | Convert the graph to an XYZ-format string. Optionally, add 203 | comment on the second line. 204 | """ 205 | for atom in self: 206 | assert len(atom.coords) != 0 207 | symbols, coords = self.get_geometry() 208 | cblock = ['{0} {1[0]: .10f} {1[1]: .10f} {1[2]: .10f}'.format(s, c) for s, c in zip(symbols, coords)] 209 | return str(len(symbols)) + '\n' + comment + '\n' + '\n'.join(cblock) 210 | 211 | def perceive_smiles(self, atommap=True): 212 | """ 213 | Using the geometry, perceive the corresponding SMILES with bond 214 | orders using Open Babel and RDKit. 
In order to create a sensible 215 | SMILES, first infer the connectivity from the 3D coordinates 216 | using Open Babel, then convert to InChI to saturate unphysical 217 | multi-radical structures, then convert to RDKit and match the 218 | atoms to the ones in self in order to return a SMILES with atom 219 | mapping corresponding to the order given by the values of 220 | atom.idx for all atoms in self. 221 | 222 | This method requires Open Babel version >=2.4.1 223 | """ 224 | 225 | # Get dict of atomic numbers for later comparison. 226 | atoms_in_mol_true = {} 227 | for atom in self: 228 | anum = atom.get_atomicnum() 229 | atoms_in_mol_true[anum] = atoms_in_mol_true.get(anum, 0) + 1 230 | 231 | # There seems to be no particularly simple way in RDKit to read 232 | # in 3D structures, so use Open Babel for this part. RMG doesn't 233 | # recognize some single bonds, so we can't use that. 234 | # We've probably called to_pybel_mol at some previous time to set 235 | # connections, but it shouldn't be too expensive to do it again. 236 | pybel_mol = self.to_pybel_mol() 237 | 238 | # Open Babel will often make single bonds and generate Smiles 239 | # that have multiple radicals, which would probably correspond 240 | # to double bonds. To get around this, convert to InChI (which 241 | # does not consider bond orders) and then convert to Smiles. 242 | inchi = pybel_mol.write('inchi', opt={'F': None}).strip() # Add fixed H layer 243 | 244 | # Use RDKit to convert back to Smiles 245 | mol_sanitized = Chem.MolFromInchi(inchi) 246 | 247 | # RDKit doesn't like some hypervalent atoms 248 | if mol_sanitized is None: 249 | raise SanitizationError( 250 | 'Could not convert \n{}\nto Smiles. Unsanitized Smiles: {}'.format(self.to_xyz(), 251 | pybel_mol.write('smi').strip()) 252 | ) 253 | 254 | # RDKit adds unnecessary hydrogens in some cases. If 255 | # this happens, give up and return an error. 256 | mol_sanitized = Chem.AddHs(mol_sanitized) 257 | atoms_in_mol_sani = {} 258 | for atom in mol_sanitized.GetAtoms(): 259 | atoms_in_mol_sani[atom.GetAtomicNum()] = atoms_in_mol_sani.get(atom.GetAtomicNum(), 0) + 1 260 | if atoms_in_mol_sani != atoms_in_mol_true: 261 | raise SanitizationError( 262 | 'Could not convert \n{}\nto Smiles. Wrong Smiles: {}'.format(self.to_xyz(), 263 | Chem.MolToSmiles(mol_sanitized)) 264 | ) 265 | 266 | if not atommap: 267 | return Chem.MolToSmiles(mol_sanitized) 268 | 269 | # Because we went through InChI, we lost atom mapping 270 | # information. Restore it by matching the original molecule. 271 | # There should only be one unique map. 272 | mol_with_map = self.to_rdkit_mol() # This only has single bonds 273 | mol_sani_sb = Chem.Mol(mol_sanitized) # Make copy with single bonds only 274 | for bond in mol_sani_sb.GetBonds(): 275 | bond.SetBondType(Chem.rdchem.BondType.SINGLE) 276 | match = mol_sani_sb.GetSubstructMatch(mol_with_map) # Isomorphism mapping 277 | assert mol_with_map.GetNumAtoms() == len(match) # Make sure we match all atoms 278 | for atom in mol_with_map.GetAtoms(): 279 | idx = match[atom.GetIdx()] 280 | map_num = atom.GetAtomMapNum() 281 | mol_sanitized.GetAtomWithIdx(idx).SetAtomMapNum(map_num) 282 | 283 | # If everything succeeded up to here, we hopefully have a 284 | # sensible Smiles string with atom mappings for all atoms. 
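        # The atom map numbers embedded in the returned SMILES are the atom.idx
        # values of self, transferred back through the substructure match above.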
285 | return Chem.MolToSmiles(mol_sanitized) 286 | 287 | def add_atom(self, atom): 288 | self.atoms.append(atom) 289 | atom.connections = {} 290 | return atom 291 | 292 | def add_connection(self, connection=None, atom1=None, atom2=None): 293 | """ 294 | Either add a connection directly or first create one from two 295 | atoms and then add it. 296 | """ 297 | if connection is None: 298 | connection = Connection(atom1, atom2) 299 | if connection.atom1 not in self.atoms or connection.atom2 not in self.atoms: 300 | raise Exception('Cannot add connection between atoms not in the graph') 301 | else: 302 | connection.atom1.connections[connection.atom2] = connection 303 | connection.atom2.connections[connection.atom1] = connection 304 | return connection 305 | 306 | def get_all_connections(self): 307 | return {connection for atom in self.atoms for connection in atom.connections.values()} 308 | 309 | def get_connection(self, atom1, atom2): 310 | if atom1 not in self.atoms or atom2 not in self.atoms: 311 | raise Exception('One or both of the specified atoms are not in this graph') 312 | 313 | try: 314 | return atom1.connections[atom2] 315 | except KeyError: 316 | raise Exception('The specified atoms are not connected in this graph') 317 | 318 | def remove_atom(self, atom): 319 | for atom2 in atom.connections: 320 | del atom2.connections[atom] 321 | atom.connections = {} 322 | self.atoms.remove(atom) 323 | 324 | def remove_connection(self, connection): 325 | if connection.atom1 not in self.atoms or connection.atom2 not in self.atoms: 326 | raise Exception('Cannot remove connection between atoms not in the graph') 327 | del connection.atom1.connections[connection.atom2] 328 | del connection.atom2.connections[connection.atom1] 329 | 330 | def copy(self, deep=False): 331 | other = MolGraph(energy=self.energy) 332 | atoms = self.atoms 333 | mapping = {} 334 | for atom in atoms: 335 | if deep: 336 | atom2 = other.add_atom(atom.copy()) 337 | mapping[atom] = atom2 338 | else: 339 | connections = atom.connections 340 | other.add_atom(atom) 341 | atom.connections = connections 342 | if deep: 343 | for atom1 in atoms: 344 | for atom2 in atom1.connections: 345 | connection = atom1.connections[atom2] 346 | connection = connection.copy() 347 | connection.atom1 = mapping[atom1] 348 | connection.atom2 = mapping[atom2] 349 | other.add_connection(connection) 350 | return other 351 | 352 | def merge(self, other): 353 | new = MolGraph() 354 | for atom in self.atoms: 355 | connections = atom.connections 356 | new.add_atom(atom) 357 | atom.connections = connections 358 | for atom in other.atoms: 359 | connections = atom.connections 360 | new.add_atom(atom) 361 | atom.connections = connections 362 | new.energy = self.energy + other.energy 363 | return new 364 | 365 | def split(self): 366 | new1 = self.copy() 367 | new2 = MolGraph() 368 | 369 | if len(self.atoms) == 0: 370 | return [new1] 371 | 372 | atoms_to_move = [self.atoms[-1]] 373 | idx = 0 374 | while idx < len(atoms_to_move): 375 | for atom2 in atoms_to_move[idx].connections: 376 | if atom2 not in atoms_to_move: 377 | atoms_to_move.append(atom2) 378 | idx += 1 379 | 380 | if len(new1.atoms) == len(atoms_to_move): 381 | return [new1] 382 | 383 | for atom in atoms_to_move: 384 | new2.atoms.append(atom) 385 | new1.atoms.remove(atom) 386 | 387 | new = [new2] 388 | new.extend(new1.split()) 389 | new.energy = None 390 | return new 391 | 392 | def sort_atoms(self): 393 | self.atoms.sort(key=lambda a: a.idx) 394 | 395 | def is_radical(self): 396 | """ 397 | Determine 
whether or not the molecule is a radical based on the number 398 | of valence electrons for each atom. If the total number of valence 399 | electrons is odd, then it is a radical. This assumes that molecules 400 | with an even number of electrons are singlets. This method also assumes 401 | that none of the atoms are charged. 402 | """ 403 | valence_electrons = {'H': 1, 'C': 4, 'N': 5, 'O': 6, 'F': 7, 'P': 5, 'S': 6, 'Cl': 7, 'Br': 7, 'I': 7, 'Li':1} 404 | symbols = [atom.symbol for atom in self] 405 | total_valence_electrons = sum(valence_electrons[s] for s in symbols) 406 | return bool(total_valence_electrons % 2) 407 | 408 | # def is_isomorphic(self, other): 409 | # """ 410 | # Test if self is isomorphic with other, ignoring atom indices. 411 | # Requires RMG to do the isomorphism check. 412 | # """ 413 | # self_rmg = self.to_rmg_mol() 414 | # other_rmg = other.to_rmg_mol() 415 | # return self_rmg.isIsomorphic(other_rmg) 416 | 417 | def topology_from_rdkit(self): 418 | rdkit_molecule = self.to_rdkit_mol() 419 | topology = nx.Graph() 420 | for atom in rdkit_molecule.GetAtoms(): 421 | # Add the atoms as nodes 422 | topology.add_node(atom.GetIdx()) 423 | 424 | # Add the bonds as edges 425 | for bonded in atom.GetNeighbors(): 426 | topology.add_edge(atom.GetIdx(), bonded.GetIdx()) 427 | 428 | return topology 429 | 430 | def is_isomorphic(self, other): 431 | topology1 = self.topology_from_rdkit() 432 | topology2 = self.topology_from_rdkit() 433 | return nx.is_isomorphic(topology1, topology2) 434 | 435 | 436 | def set_coords(self, coords): 437 | """ 438 | Set atom coordinates. Assumes coords are in same order as self.atoms. 439 | """ 440 | try: 441 | coords = np.reshape(coords, (-1,3)) 442 | except ValueError: 443 | raise Exception('Coordinates cannot be reshaped into matrix of size Nx3') 444 | assert len(coords) == len(self.atoms) 445 | 446 | for atom, xyz in zip(self.atoms, coords): 447 | atom.coords = xyz 448 | 449 | def get_coords(self): 450 | """ 451 | Get coordinates in the order specified by the atom indices. 452 | """ 453 | assert all(atom.idx is not None for atom in self) 454 | atoms = self.atoms[:] 455 | atoms.sort(key=lambda a: a.idx) 456 | return np.array([atom.coords for atom in atoms]) 457 | 458 | def get_symbols(self): 459 | """ 460 | Get symbols in the order specified by the atom indices. 461 | """ 462 | assert all(atom.idx is not None for atom in self) 463 | atoms = self.atoms[:] 464 | atoms.sort(key=lambda a: a.idx) 465 | return [atom.symbol for atom in atoms] 466 | 467 | def get_geometry(self): 468 | """ 469 | Get symbols and coordinates in the order specified by the atom 470 | indices. 471 | """ 472 | assert all(atom.idx is not None for atom in self) 473 | atoms = self.atoms[:] 474 | atoms.sort(key=lambda a: a.idx) 475 | return [atom.symbol for atom in atoms], np.array([atom.coords for atom in atoms]) 476 | 477 | def infer_connections(self, use_ob=True): 478 | """ 479 | Delete connections and set them again based on coordinates. 480 | 481 | Note: By default this uses Open Babel, which is better than a 482 | simple covalent radii check. 
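        # When use_ob is False, a simple distance criterion is used instead:
        # two atoms are connected if their separation lies between ~0.63 A
        # (sqrt of the 0.4 A^2 lower bound) and the sum of their covalent radii
        # plus 0.45 A; atoms are swept in order of z coordinate so the inner
        # loop can break once the z separation exceeds 4 A.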
483 | """ 484 | atoms = self.atoms 485 | 486 | for atom in atoms: 487 | assert len(atom.coords) != 0 488 | 489 | for atom in atoms: 490 | for connection in atom.connections: 491 | self.remove_connection(connection) 492 | 493 | if use_ob: 494 | pybel_mol = self.to_pybel_mol() # Should be sorted by atom indices 495 | assert all(ap.idx == a.idx for ap, a in zip(pybel_mol, self)) # Check to be sure 496 | mapping = {ap.idx: a for ap, a in zip(pybel_mol, self)} 497 | for bond in pybel.ob.OBMolBondIter(pybel_mol.OBMol): 498 | atom1 = mapping[bond.GetBeginAtomIdx()] 499 | atom2 = mapping[bond.GetEndAtomIdx()] 500 | connection = Connection(atom1, atom2) 501 | self.add_connection(connection) 502 | else: 503 | sorted_atoms = sorted(atoms, key=lambda a: a.coords[2]) 504 | for i, atom1 in enumerate(sorted_atoms): 505 | for atom2 in sorted_atoms[(i+1):]: 506 | crit_dist = (atom1.get_cov_rad() + atom2.get_cov_rad() + 0.45)**2 507 | z_boundary = (atom1.coords[2] - atom2.coords[2])**2 508 | if z_boundary > 16.0: 509 | break 510 | dist_sq = sum((atom1.coords - atom2.coords)**2) 511 | if dist_sq > crit_dist or dist_sq < 0.4: 512 | continue 513 | else: 514 | connection = Connection(atom1, atom2) 515 | self.add_connection(connection) 516 | 517 | def is_atom_in_cycle(self, atom): 518 | return self._is_chain_in_cycle([atom]) 519 | 520 | def _is_chain_in_cycle(self, chain): 521 | atom1 = chain[-1] 522 | for atom2 in atom1.connections: 523 | if atom2 is chain[0] and len(chain) > 2: 524 | return True 525 | elif atom2 not in chain: 526 | chain.append(atom2) 527 | if self._is_chain_in_cycle(chain): 528 | return True 529 | else: 530 | chain.remove(atom2) 531 | return False 532 | 533 | #def label_equivalent_hydrogens(self): 534 | # """ 535 | # Mark all equivalent hydrogens as frozen. For now, this assumes that the 536 | # carbons they are attached to have 4 connections, which means this 537 | # method does not yet work for radicals. 538 | # """ 539 | # if self.is_radical(): 540 | # raise NotImplementedError('Cannot yet label equivalent hydrogens for radicals') 541 | # for atom in self: 542 | # if (atom.symbol.upper() == 'C' 543 | # and len(atom.connections) == 4 544 | # and not self.is_atom_in_cycle(atom)): 545 | # first_hydrogen = True 546 | # for atom2 in atom.connections: 547 | # if atom2.symbol.upper() == 'H': 548 | # if first_hydrogen: 549 | # first_hydrogen = False 550 | # else: 551 | # atom2.frozen = True 552 | 553 | def label_equivalent_hydrogens(self): 554 | """ 555 | This version works with radicals. no assumption of four connections on carbon atoms. 556 | Also works with other heavy elementts. 
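        Equivalent hydrogens are marked by setting atom.frozen = True; frozen
        atoms are excluded when generate_driving_coords() enumerates candidate
        connections to form, so bond formation is only attempted at one
        hydrogen from each equivalent set.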
557 | """ 558 | # Proceed even if the molecule is a radical 559 | for atom in self: 560 | if atom.symbol.upper() != 'H' and not self.is_atom_in_cycle(atom): 561 | hydrogens = [a for a in atom.connections if a.symbol.upper() == 'H'] 562 | if len(hydrogens) > 1: 563 | first_hydrogen = True 564 | for hydrogen in hydrogens: 565 | if first_hydrogen: 566 | first_hydrogen = False 567 | else: 568 | hydrogen.frozen = True 569 | -------------------------------------------------------------------------------- /dandelion/segsm/create_gsm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import argparse 5 | 6 | from ase.io import read 7 | from .ard_gsm.mol import MolGraph 8 | from .ard_gsm.limits import connection_limits 9 | from .ard_gsm.driving_coords import generate_driving_coords 10 | 11 | 12 | def main(args): 13 | 14 | print_args(args) 15 | 16 | input_path = args.input_path 17 | if not os.path.isdir(input_path): 18 | sys.exit(f"Error: '{input_path}' is not a directory.") 19 | output_path = args.output_path 20 | maxbreak = args.maxbreak 21 | maxform = args.maxform 22 | maxchange = args.maxchange 23 | minbreak = args.minbreak 24 | minform = args.minform 25 | minchange = args.minchange 26 | ignore_single_change = args.ignore_single_change 27 | equiv_Hs = args.equiv_Hs 28 | 29 | pdir = output_path 30 | if not os.path.exists(pdir): 31 | os.makedirs(pdir) 32 | 33 | with open(os.path.join(pdir, 'params.log'), 'w') as f: 34 | f.write('Connection limits:\n') 35 | for symbol in connection_limits: 36 | ll = connection_limits[symbol][0] 37 | ul = connection_limits[symbol][1] 38 | f.write(' {}: {}, {}\n'.format(symbol, ll, ul)) 39 | f.write('maxbreak = {}\n'.format(maxbreak)) 40 | f.write('maxform = {}\n'.format(maxform)) 41 | f.write('maxchange = {}\n'.format(maxchange)) 42 | f.write('single_change = {}\n'.format(not ignore_single_change)) 43 | f.write('equiv_Hs = {}\n'.format(equiv_Hs)) 44 | f.write('minbreak = {}\n'.format(minbreak)) 45 | f.write('minform = {}\n'.format(minform)) 46 | f.write('minchange = {}\n'.format(minchange)) 47 | 48 | # Loop over Mothers 49 | for idx, mother in enumerate(glob.iglob(os.path.join(input_path, '**/*.xyz'), recursive=True)): 50 | xyz = read(mother) 51 | symbols, coords = xyz.get_chemical_symbols(), xyz.get_positions() 52 | mol = MolGraph(symbols=symbols, coords=coords) 53 | mol.infer_connections() 54 | name = os.path.basename(os.path.dirname(mother)) 55 | 56 | seeds = generate_driving_coords( 57 | mol, 58 | maxbreak=maxbreak, 59 | maxform=maxform, 60 | maxchange=maxchange, 61 | single_change=not ignore_single_change, 62 | equiv_Hs=equiv_Hs, 63 | minbreak=minbreak, 64 | minform=minform, 65 | minchange=minchange 66 | ) 67 | print(f'{len(seeds)} Seeds were generated from {name}') 68 | 69 | output_path = os.path.join(pdir, '{}'.format(name)) 70 | if not os.path.exists(output_path): 71 | os.mkdir(output_path) 72 | 73 | # Loop over seeds 74 | for idx, seed in enumerate(seeds): 75 | 76 | gsm_dir = os.path.join(output_path, f'gsm{idx:04}') 77 | if not os.path.exists(gsm_dir): 78 | os.mkdir(gsm_dir) 79 | 80 | isomers_file = os.path.join(gsm_dir, 'ISOMERS.txt') 81 | initial_file = os.path.join(gsm_dir, 'initial.xyz') 82 | bash_file = os.path.join(gsm_dir, 'gsm.sh') 83 | 84 | with open(bash_file, 'w') as f: 85 | f.write(''' 86 | gsm -xyzfile initial.xyz \\ 87 | -mode SE_GSM \\ 88 | -num_nodes 30 \\ 89 | -package xTB_lot \\ 90 | -isomers ISOMERS.txt \\ 91 | -xyz_output_format multixyz \\ 92 | 
-coordinate_type DLC > gsm_log 2>&1''') 93 | 94 | with open(isomers_file, 'w') as f: 95 | f.write(str(seed)) 96 | with open(initial_file, 'w') as f: 97 | f.write(str(len(symbols)) + '\n') 98 | f.write('\n') 99 | for symbol, xyz in zip(symbols, coords): 100 | f.write('{0} {1[0]: .10f} {1[1]: .10f} {1[2]: .10f}\n'.format(symbol, xyz)) 101 | 102 | print('\nCreating GSM finished!') 103 | 104 | def print_args(args): 105 | print() 106 | print("Arguments provided:") 107 | arg_dict = vars(args) 108 | for key, value in arg_dict.items(): 109 | print(f" {key}: {value}") 110 | print() 111 | 112 | def get_parser(): 113 | parser = argparse.ArgumentParser(description='Make GSM jobs from mother structures') 114 | 115 | parser.add_argument('-i', '--input_path', required=True, 116 | help='Input path of mother structures') 117 | parser.add_argument('-o', '--output_path', required=True, 118 | help='Output path of gsm jobs') 119 | 120 | parser.add_argument('--maxbreak', type=int, default=2, 121 | help='Maximum number of connections to break') 122 | parser.add_argument('--maxform', type=int, default=2, 123 | help='Maximum number of connections to form') 124 | parser.add_argument('--maxchange', type=int, default=3, 125 | help='Maximum number of connections to change') 126 | 127 | parser.add_argument('--minbreak', type=int, default=0, 128 | help='Minumum number of connections to break') 129 | parser.add_argument('--minform', type=int, default=0, 130 | help='Minumum number of connections to form') 131 | parser.add_argument('--minchange', type=int, default=1, 132 | help='Minumum number of connections to change') 133 | 134 | parser.add_argument('--ignore_single_change', type=bool, default=True, 135 | help='Do not consider single connection changes (e.g., nbreak=1, nform=0)') 136 | parser.add_argument('--equiv_Hs', type=bool, default=False, 137 | help='Create equivalent driving coordinates for the same reaction with different but\ 138 | equivalent hydrogens, i.e., hydrogens attached to non-cyclic tetrahedral carbons') 139 | 140 | return parser 141 | 142 | 143 | if __name__ == "__main__": 144 | args = get_parser().parse_args() 145 | main(args) 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /dandelion/segsm/filter_gsm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import shutil 5 | import argparse 6 | 7 | from rdkit import RDLogger 8 | from ase.io import read, write 9 | from openbabel import openbabel 10 | from .ard_gsm.mol import MolGraph, SanitizationError 11 | #from ard_gsm.mol import MolGraph, SanitizationError 12 | 13 | # Suppress Noisy warning in the filter 14 | RDLogger.logger().setLevel(RDLogger.CRITICAL) 15 | openbabel.obErrorLog.SetOutputLevel(openbabel.obError) 16 | 17 | ''' 18 | Faith of pyGSM run 19 | 20 | 1) png is not made 21 | - xTB not converge 22 | - pyGSM suicide on his criteria 23 | 24 | 2) png is made 25 | - Exiting early -> should filter out 26 | - Ran out of iterations -> also includes potential rxn 27 | - Converged -> very rare 28 | ''' 29 | 30 | 31 | 32 | def parse_gsm_log(keyword, content): 33 | """Find the value associated with a keyword in a text content.""" 34 | # For TS_energy, we're expecting a float, so we use a different pattern 35 | if keyword == "TS energy:": 36 | pattern = f"{keyword} ([+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)" 37 | else: 38 | pattern = f"{keyword} (\d+)" 39 | 40 | import re 41 | matches = re.findall(pattern, content) 42 | 43 
| # Return the matched value; assume there's only one match 44 | if matches: 45 | return matches[0][0] # Due to group structures, we take the first element 46 | else: 47 | return None 48 | 49 | 50 | def get_gsm_data(home, seed, string): 51 | try: 52 | with open(os.path.join(home, seed, string, 'gsm_log'), 'r') as f: 53 | content = f.read() 54 | except FileNotFoundError: 55 | return None 56 | 57 | nodes = [] 58 | try: 59 | with open(os.path.join(home, seed, string, 'opt_converged_000.xyz'), 'r') as f: 60 | for i in range(30): 61 | try: 62 | nodes.append(read(f, i)) 63 | except: 64 | break 65 | except FileNotFoundError: 66 | return None 67 | 68 | return { 69 | "TS_energy" : float(parse_gsm_log("TS energy:", content)), 70 | "reactant_idx" : int(parse_gsm_log("min reactant node:", content)), 71 | "product_idx" : int(parse_gsm_log("min product node", content)), 72 | "TS_idx" : int(parse_gsm_log("TS node is", content)), 73 | "nodes" : nodes, 74 | 'energies' : [float(list(node.info.keys())[0]) for node in nodes] 75 | } 76 | 77 | 78 | 79 | def profile_filter(strings, home, seed, barrier_max, barrier_min, delta_e_min): 80 | ''' 81 | Given gsm success reactions, 82 | Filter strings by TS_index and Barrier height and delta_e. 83 | ''' 84 | filtered = {} 85 | for string in strings: 86 | data = get_gsm_data(home, seed, string) 87 | if not data: 88 | continue 89 | 90 | if data["TS_idx"] >= data["product_idx"]: # wrong ts 91 | continue 92 | if (data["TS_energy"] > barrier_max) or (data["TS_energy"] < barrier_min): # too high or low barrier 93 | continue 94 | if abs(data['energies'][data['product_idx']]) * 627.503 < delta_e_min: # maybe reactant==product 95 | continue 96 | 97 | product_graph = MolGraph(symbols=data["nodes"][data["product_idx"]].get_chemical_symbols(), 98 | coords=data["nodes"][data["product_idx"]].get_positions(), 99 | energy=float(list(data["nodes"][data["product_idx"]].info.keys())[0])) 100 | 101 | filtered[string] = { 102 | 'reactant': data["nodes"][data["reactant_idx"]], 103 | 'product': data["nodes"][data["product_idx"]], 104 | 'ts': data["nodes"][data["TS_idx"]], 105 | 'product_graph': product_graph, 106 | 'ts_energy': data["TS_energy"] 107 | } 108 | 109 | return filtered 110 | 111 | def structure_filter(reactions): 112 | ''' 113 | Chemically absurd products are filtered here. (graph->pybel->inchi->smiles) 114 | SMILES are constructed, and saved to the dictionary for the unique filter. 115 | ''' 116 | 117 | filtered = {} 118 | 119 | for rxn, data in reactions.items(): 120 | try: 121 | smiles = data['product_graph'].perceive_smiles() 122 | filtered[rxn] = data 123 | filtered[rxn]['product_smiles'] = smiles 124 | except SanitizationError: 125 | continue 126 | return filtered 127 | 128 | def unique_filter(reactions): 129 | ''' 130 | Duplicates are filtered based on SMILES. 131 | If there are more than one of same SMILES, pick the lowest barrier reaction. 
132 | ''' 133 | unique = {} 134 | for rxn, data in reactions.items(): 135 | smiles = data['product_smiles'] 136 | ts_energy = data['ts_energy'] 137 | if smiles not in unique or ts_energy < unique[smiles]['ts_energy']: 138 | unique[smiles] = { 139 | 'reaction_key': rxn, 140 | 'ts_energy': ts_energy, 141 | 'reactant': data['reactant'], 142 | 'product': data['product'], 143 | 'ts': data['ts'], 144 | } 145 | return unique 146 | 147 | def save_unique_reactions(home, output_path, seed, reactions): 148 | for smiles, data in reactions.items(): 149 | reaction_dir = os.path.join(output_path, seed, data['reaction_key']) 150 | os.makedirs(reaction_dir, exist_ok=True) 151 | 152 | file_types = ["reactant", "ts", "product"] 153 | for f_type in file_types: 154 | write(os.path.join(reaction_dir, f"{f_type}.xyz"), data[f_type]) 155 | write(os.path.join(reaction_dir, f"{f_type}.png"), data[f_type]) 156 | 157 | shutil.copyfile(os.path.join(home, seed, data['reaction_key'], '0000_string.png'), 158 | os.path.join(reaction_dir, 'string.png')) 159 | 160 | shutil.copyfile(os.path.join(home, seed, data['reaction_key'], 'opt_converged_000.xyz'), 161 | os.path.join(reaction_dir, 'string.xyz')) 162 | 163 | def main(args): 164 | 165 | print_args(args) 166 | 167 | input_path = args.input_path 168 | if not os.path.isdir(input_path): 169 | sys.exit(f"Error: '{input_path}' is not a directory.") 170 | output_path = args.output_path 171 | if not os.path.exists(output_path): 172 | os.mkdir(output_path) 173 | barrier_max = args.barrier_max 174 | barrier_min = args.barrier_min 175 | delta_e_min = args.delta_e_min 176 | 177 | mothers = [d for d in os.listdir(input_path) if os.path.isdir(os.path.join(input_path, d))] 178 | for mother in mothers: 179 | print('\n◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢') 180 | print(f'mother: {mother}'.center(35)) 181 | driving_coordinates = list(glob.iglob(os.path.join(input_path, f'{mother}/*/gsm_log'))) 182 | success_strings = [path.split('/')[-2] for path in glob.iglob(os.path.join(input_path, f'{mother}/*/0000_string.png'))] 183 | 184 | profile_filtered_strings = profile_filter(success_strings, input_path, mother, barrier_max, barrier_min, delta_e_min) 185 | structure_filtered_strings = structure_filter(profile_filtered_strings) 186 | unique_reactions = unique_filter(structure_filtered_strings) 187 | 188 | print(f'Initial seeds: {len(driving_coordinates):>5}') 189 | print(f'GSM success reactions: {len(success_strings):>5}') 190 | print(f'Profile filtered reactions: {len(profile_filtered_strings):>5}') 191 | print(f'Structure filtered reactions: {len(structure_filtered_strings):>5}') 192 | print(f'Unique reactions: {len(unique_reactions):>5}') 193 | 194 | save_unique_reactions(input_path, output_path, mother, unique_reactions) 195 | 196 | print('\nFiltering GSM finished!') 197 | 198 | def print_args(args): 199 | print() 200 | print("Arguments provided:") 201 | arg_dict = vars(args) 202 | for key, value in arg_dict.items(): 203 | print(f" {key}: {value}") 204 | print() 205 | 206 | def get_parser(): 207 | parser = argparse.ArgumentParser(description='Make GSM jobs from mother structures') 208 | 209 | parser.add_argument('-i', '--input_path', required=True, 210 | help='Input path of finished gsm jobs') 211 | parser.add_argument('-o', '--output_path', required=True, 212 | help='Output path of filtered gsm jobs') 213 | 214 | parser.add_argument('--barrier_min', type=int, default=5) 215 | parser.add_argument('--barrier_max', type=int, default=200) 216 | parser.add_argument('--delta_e_min', type=int, 
default=5) 217 | 218 | return parser 219 | 220 | 221 | if __name__ == "__main__": 222 | args = get_parser().parse_args() 223 | main(args) 224 | 225 | 226 | -------------------------------------------------------------------------------- /dandelion/segsm/run_gsm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import subprocess 5 | from concurrent.futures import ProcessPoolExecutor, as_completed 6 | 7 | from tqdm import tqdm 8 | 9 | # conda activate ts 10 | # check whether gsm is killed when you interrupted 11 | # use like "nohup python -u 2_run_gsm_jobs > gsm.out &" 12 | 13 | 14 | def run_gsm_script(script_dir): 15 | #print(f"Executing in directory: {script_dir}") 16 | subprocess.run('bash gsm.sh', cwd=script_dir, capture_output=True, text=True, shell=True) 17 | 18 | def main(args): 19 | 20 | print_args(args) 21 | 22 | input_path = args.input_path 23 | if not os.path.isdir(input_path): 24 | sys.exit(f"Error: '{input_path}' is not a directory.") 25 | max_workers = args.max_workers 26 | 27 | # Find all directories containing gsm.sh scripts 28 | script_dirs = [dirpath for dirpath, _, filenames in os.walk(input_path) if "gsm.sh" in filenames] 29 | 30 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 31 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 32 | futures = [executor.submit(run_gsm_script, script_dir) for script_dir in script_dirs] 33 | 34 | for future in tqdm(as_completed(futures), desc='GSM on seeds', 35 | total=len(script_dirs), smoothing=0, bar_format=bar_format, ncols=70): 36 | pass # just update the tqdm 37 | 38 | print('GSM finished!') 39 | 40 | def print_args(args): 41 | print() 42 | print("Arguments provided:") 43 | arg_dict = vars(args) 44 | for key, value in arg_dict.items(): 45 | print(f" {key}: {value}") 46 | print() 47 | 48 | def get_parser(): 49 | parser = argparse.ArgumentParser(description='Run GSM jobs concurrently') 50 | 51 | parser.add_argument('-i', '--input_path', required=True, 52 | help='Base directory of mothers bearing seeds') 53 | parser.add_argument('-n', '--max_workers', type=int, default=1, 54 | help='Number of worker processes') 55 | 56 | return parser 57 | 58 | 59 | if __name__ == "__main__": 60 | args = get_parser().parse_args() 61 | main(args) 62 | -------------------------------------------------------------------------------- /dandelion/utils/db_h5_tools/db_to_h5.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | 5 | import h5py 6 | from ase.db import connect 7 | 8 | 9 | def main(args): 10 | 11 | print_args(args) 12 | 13 | input_path = args.input_path 14 | if not os.path.isfile(input_path): 15 | sys.exit(f"Error: '{input_path}' is not a file.") 16 | output_path = args.output_path 17 | 18 | # Data structure to hold the computed results 19 | rxn_data = {} 20 | 21 | rows = [] # List to store all rows 22 | 23 | # Extract data from ASE database 24 | with connect(input_path) as db: 25 | for row in db.select(): 26 | if hasattr(row, 'energy') and hasattr(row, 'forces'): 27 | rows.append(row) 28 | 29 | # Sort rows based on the unique_id number 30 | rows.sort(key=lambda r: int(r.data['unique_id'].split('_')[-1])) 31 | 32 | # Process sorted rows 33 | for row in rows: 34 | # Extract unique_id and other data 35 | unique_id = row.data['unique_id'] 36 | chem_group_name, rxn_group_name, index = unique_id.split('_') 37 | 38 | if chem_group_name not in 
rxn_data: 39 | rxn_data[chem_group_name] = {} 40 | 41 | if rxn_group_name not in rxn_data[chem_group_name]: 42 | rxn_data[chem_group_name][rxn_group_name] = { 43 | 'atomic_numbers': row.toatoms().numbers, 44 | 'energies': [], 45 | 'forces': [], 46 | 'positions': [] 47 | } 48 | rxn_data[chem_group_name][rxn_group_name]['energies'].append(row.energy) 49 | rxn_data[chem_group_name][rxn_group_name]['forces'].append(row.forces) 50 | rxn_data[chem_group_name][rxn_group_name]['positions'].append(row.toatoms().positions) 51 | 52 | # Save the data to an h5 file 53 | with h5py.File(output_path, 'w') as h5file: 54 | # Ensure the 'data' group exists 55 | if 'data' not in h5file: 56 | data_group = h5file.create_group('data') 57 | else: 58 | data_group = h5file['data'] 59 | 60 | # Iterate through the rxn_data dictionary to save datasets 61 | for chem_group_name in rxn_data: 62 | if chem_group_name not in data_group: 63 | chem_group = data_group.create_group(chem_group_name) 64 | else: 65 | chem_group = data_group[chem_group_name] 66 | 67 | for rxn_group_name, rxn_entry in rxn_data[chem_group_name].items(): 68 | if rxn_group_name not in chem_group: 69 | rxn_group = chem_group.create_group(rxn_group_name) 70 | else: 71 | rxn_group = chem_group[rxn_group_name] 72 | 73 | # Add datasets to the reaction group 74 | rxn_group.create_dataset('atomic_numbers', data=rxn_entry['atomic_numbers']) 75 | rxn_group.create_dataset('wB97x_6-31G(d).energy', data=rxn_entry['energies']) 76 | rxn_group.create_dataset('wB97x_6-31G(d).forces', data=rxn_entry['forces']) 77 | rxn_group.create_dataset('positions', data=rxn_entry['positions']) 78 | 79 | print('Compiled successfully!') 80 | 81 | def print_args(args): 82 | print() 83 | print("Arguments provided:") 84 | arg_dict = vars(args) 85 | for key, value in arg_dict.items(): 86 | print(f" {key}: {value}") 87 | print() 88 | 89 | def get_parser(): 90 | parser = argparse.ArgumentParser(description="Translate ase db file into hdf5 file.") 91 | 92 | parser.add_argument('-i', '--input_path', required=True, 93 | help="Path of the input wB97X ASE db file") 94 | parser.add_argument('-o', '--output_path', required=True, 95 | help="Path of the output wB97X hdf5 file") 96 | 97 | return parser 98 | 99 | if __name__ == "__main__": 100 | args = get_parser().parse_args() 101 | main(args) 102 | 103 | 104 | -------------------------------------------------------------------------------- /dandelion/utils/db_h5_tools/h5_to_db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import h5py 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | from ase import Atoms 8 | from ase.db import connect 9 | from ase.calculators.singlepoint import SinglePointCalculator 10 | 11 | 12 | def main(args): 13 | 14 | print_args(args) 15 | 16 | input_path = args.input_path 17 | if not os.path.isfile(input_path): 18 | sys.exit(f"Error: '{input_path}' is not a file.") 19 | output_path = args.output_path 20 | 21 | 22 | with h5py.File(input_path, 'r') as h5_file: 23 | data_group = h5_file['data'] 24 | 25 | # Count total number of configurations 26 | total_configs = sum( 27 | rxn_group['wB97x_6-31G(d).energy'].shape[0] 28 | for chem_group in data_group.values() 29 | for rxn_group in chem_group.values() 30 | ) 31 | 32 | with connect(output_path) as db: 33 | with tqdm(total=total_configs, desc="Converting", unit="config") as pbar: 34 | for chem_group_name, chem_group in data_group.items(): 35 | for rxn_group_name, rxn_group in chem_group.items(): 36 | 
atomic_numbers = rxn_group['atomic_numbers'][:] 37 | positions = rxn_group['positions'][:] 38 | energies = rxn_group['wB97x_6-31G(d).energy'][:] 39 | forces = rxn_group['wB97x_6-31G(d).forces'][:] 40 | 41 | for i in range(len(energies)): 42 | atoms = Atoms( 43 | numbers=atomic_numbers, 44 | positions=positions[i], 45 | ) 46 | atoms.set_calculator(SinglePointCalculator( 47 | atoms, 48 | energy=energies[i], 49 | forces=forces[i] 50 | )) 51 | 52 | unique_id = f"{chem_group_name}_{rxn_group_name}_{i}" 53 | db.write(atoms, data={'unique_id': unique_id}) 54 | 55 | pbar.update(1) 56 | 57 | def print_args(args): 58 | print() 59 | print("Arguments provided:") 60 | arg_dict = vars(args) 61 | for key, value in arg_dict.items(): 62 | print(f" {key}: {value}") 63 | print() 64 | 65 | def get_parser(): 66 | parser = argparse.ArgumentParser(description="Translate hdf5 file into ase db file.") 67 | 68 | parser.add_argument('-i', '--input_path', required=True, 69 | help="Path of the input wB97X hdf5 file") 70 | parser.add_argument('-o', '--output_path', required=True, 71 | help="Path of the output wB97X db file") 72 | 73 | return parser 74 | 75 | if __name__ == "__main__": 76 | args = get_parser().parse_args() 77 | main(args) 78 | -------------------------------------------------------------------------------- /dandelion/utils/db_h5_tools/make_db_from_xyzs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import argparse 4 | 5 | from tqdm import tqdm 6 | from ase import io 7 | from ase.db import connect 8 | 9 | 10 | def main(args): 11 | 12 | print_args(args) 13 | 14 | input_path = args.input_path 15 | if not os.path.isdir(input_path): 16 | sys.exit(f"Error: '{input_path}' is not a directory.") 17 | output_path = args.output_path 18 | 19 | with connect(output_path) as db: 20 | for file_path in tqdm(glob.glob(os.path.join(input_path, '**/*.xyz'), recursive=True)): 21 | atoms = io.read(file_path) 22 | db.write(atoms) 23 | 24 | def print_args(args): 25 | print() 26 | print("Arguments provided:") 27 | arg_dict = vars(args) 28 | for key, value in arg_dict.items(): 29 | print(f" {key}: {value}") 30 | print() 31 | 32 | def get_parser(): 33 | parser = argparse.ArgumentParser(description='Merge xyz files in input directory into db file.') 34 | 35 | parser.add_argument('-i', '--input_path', required=True, 36 | help='Input path of directory containing xyz files to merge') 37 | parser.add_argument('-o', '--output_path', required=True, 38 | help='Output path of the merged db file.') 39 | 40 | return parser 41 | 42 | 43 | if __name__ == "__main__": 44 | args = get_parser().parse_args() 45 | main(args) 46 | 47 | 48 | -------------------------------------------------------------------------------- /dandelion/utils/db_h5_tools/merge_db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | from ase.db import connect 8 | 9 | def main(args): 10 | 11 | print_args(args) 12 | 13 | input_path = args.input_path 14 | if not os.path.isdir(input_path): 15 | sys.exit(f"Error: '{input_path}' is not a directory.") 16 | output_path = args.output_path 17 | 18 | with connect(output_path) as db1: 19 | for f in glob.glob(os.path.join(input_path, '**/wb97x.db'), recursive=True): 20 | with connect(f) as db2: 21 | for row in tqdm(db2.select(), total=db2.count(), desc=f"{f}"): 22 | db1.write(row.toatoms()) 23 | 24 | def print_args(args): 25 | print() 
26 |     print("Arguments provided:")
27 |     arg_dict = vars(args)
28 |     for key, value in arg_dict.items():
29 |         print(f" {key}: {value}")
30 |     print()
31 | 
32 | def get_parser():
33 |     parser = argparse.ArgumentParser(description='Merge db files in input directory')
34 | 
35 |     parser.add_argument('-i', '--input_path', required=True,
36 |                         help='Input path of directory containing db files to merge')
37 |     parser.add_argument('-o', '--output_path', required=True,
38 |                         help='Output path of the merged db file.')
39 | 
40 |     return parser
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     args = get_parser().parse_args()
45 |     main(args)
46 | 
47 | 
--------------------------------------------------------------------------------
/dandelion/utils/db_h5_tools/merge_h5.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import glob
5 | 
6 | import h5py
7 | from tqdm import tqdm
8 | 
9 | 
10 | 
11 | def main(args):
12 | 
13 |     print_args(args)
14 | 
15 |     input_path = args.input_path
16 |     if not os.path.isdir(input_path):
17 |         sys.exit(f"Error: '{input_path}' is not a directory.")
18 |     output_path = args.output_path
19 | 
20 |     # Open the output file
21 |     with h5py.File(output_path, 'w') as h5file_out:
22 |         # Ensure the 'data' group exists in the output file
23 |         if 'data' not in h5file_out:
24 |             data_group_out = h5file_out.create_group('data')
25 |         else:
26 |             data_group_out = h5file_out['data']
27 | 
28 |         # Iterate through each input file
29 |         for h5_path in glob.glob(os.path.join(input_path, '**/wb97x.h5'), recursive=True):
30 |             print(h5_path)
31 |             # Use the name of the directory containing the file as a prefix,
32 |             prefix = os.path.basename(os.path.dirname(h5_path))  # e.g. '<run_dir>/wb97x.h5' -> '<run_dir>'
33 | 
34 |             # Open the input file
35 |             with h5py.File(h5_path, 'r') as h5file_in:
36 |                 # Iterate through chemical groups in the input file
37 |                 for chem_group_name, chem_group in tqdm(h5file_in['data'].items(), desc="Formulas"):
38 |                     # Ensure the chemical group exists in the output file
39 |                     if chem_group_name not in data_group_out:
40 |                         chem_group_out = data_group_out.create_group(chem_group_name)
41 |                     else:
42 |                         chem_group_out = data_group_out[chem_group_name]
43 | 
44 |                     # Iterate through reaction groups in the chemical group
45 |                     for rxn_group_name, rxn_group in tqdm(chem_group.items(), desc=f"Rxns in {chem_group_name}", leave=False):
46 |                         # Prefix the reaction group name with the source directory name
47 |                         rxn_group_name_prefixed = f"{prefix}_{rxn_group_name}"
48 | 
49 |                         # Ensure the reaction group exists in the output file
50 |                         if rxn_group_name_prefixed not in chem_group_out:
51 |                             rxn_group_out = chem_group_out.create_group(rxn_group_name_prefixed)
52 |                         else:
53 |                             rxn_group_out = chem_group_out[rxn_group_name_prefixed]
54 | 
55 |                         # Copy datasets from input to output, creating new datasets
56 |                         for dset_name, dset in rxn_group.items():
57 |                             data = dset[:]
58 |                             rxn_group_out.create_dataset(dset_name, data=data)
59 | 
60 | def print_args(args):
61 |     print()
62 |     print("Arguments provided:")
63 |     arg_dict = vars(args)
64 |     for key, value in arg_dict.items():
65 |         print(f" {key}: {value}")
66 |     print()
67 | 
68 | def get_parser():
69 |     parser = argparse.ArgumentParser(description='Merge h5 files in input directory')
70 | 
71 |     parser.add_argument('-i', '--input_path', required=True,
72 |                         help='Input path of directory containing h5 files to merge')
73 |     parser.add_argument('-o', '--output_path', required=True,
74 |                         help='Output path of the merged h5 file.')
75 | 
76 |     return parser
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     args = get_parser().parse_args()
81 |     main(args)
82 | 
83 | 
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: ts
2 | channels:
3 |   - conda-forge
4 |   - defaults
5 | dependencies:
6 |   - python=3.11.5
7 |   - pip=23.2.1
8 |   - ase=3.22.1
9 |   - imageio=2.31.1
10 |   - matplotlib-base=3.7.2
11 |   - numpy=1.25.2
12 |   - openbabel=3.1.1
13 |   - scipy=1.11.2
14 |   - networkx=3.1
15 |   - xtb-python=22.1
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = dandelion
3 | version = attr: dandelion.__version__
4 | author = Minhyeok Lee
5 | author_email = mlee@yonsei.ac.kr
6 | description = Near TS region sampler for machine learning force field
7 | classifiers =
8 |     License :: OSI Approved :: MIT License
9 |     Programming Language :: Python :: 3
10 | long_description = file: README.md
11 | 
12 | [options]
13 | packages = find:
14 | python_requires = >=3.11
15 | install_requires =
16 |     h5py==3.9.0
17 |     rdkit==2023.3.3
18 |     tqdm==4.66.1
19 |     typing-extensions==4.8.0
20 | 
21 | [options.entry_points]
22 | console_scripts =
23 |     dand = dandelion.cli:main
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup()
--------------------------------------------------------------------------------
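
For quick reference, a minimal sketch (an editorial illustration, not a file in this repository) of how the HDF5 files written by db_to_h5.py and consumed by h5_to_db.py / merge_h5.py can be inspected. The group and dataset names ('data', 'atomic_numbers', 'positions', 'wB97x_6-31G(d).energy', 'wB97x_6-31G(d).forces') come from the scripts above; the file name 'wb97x.h5' is only an assumed example matching the pattern merge_h5.py globs for.

```python
# Walk the data/<formula>/<reaction> hierarchy and report the frame count per reaction.
import h5py

with h5py.File('wb97x.h5', 'r') as f:                      # assumed example file name
    for formula, chem_group in f['data'].items():          # one group per chemical formula
        for rxn_name, rxn_group in chem_group.items():     # one group per reaction
            numbers = rxn_group['atomic_numbers'][:]           # shape (n_atoms,)
            positions = rxn_group['positions'][:]              # shape (n_frames, n_atoms, 3)
            energies = rxn_group['wB97x_6-31G(d).energy'][:]   # shape (n_frames,)
            forces = rxn_group['wB97x_6-31G(d).forces'][:]     # shape (n_frames, n_atoms, 3)
            print(formula, rxn_name, len(energies), 'frames')
```

h5_to_db.py walks this same hierarchy to rebuild ASE Atoms objects with a SinglePointCalculator attached, and merge_h5.py copies these groups into a combined file, prefixing each reaction group with its source directory name.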