├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── dandelion
│   ├── __init__.py
│   ├── cli.py
│   ├── dandelion_prep.py
│   ├── dandelion_refine.py
│   ├── dandelion_sample.py
│   ├── neb
│   │   ├── __init__.py
│   │   ├── compile_neb.py
│   │   ├── filter_neb.py
│   │   └── run_neb.py
│   ├── prep
│   │   ├── __init__.py
│   │   ├── geom_opt.py
│   │   └── smiles_to_isoconfs.py
│   ├── refine
│   │   ├── __init__.py
│   │   ├── compile_refined.py
│   │   └── refine_forces.py
│   ├── segsm
│   │   ├── __init__.py
│   │   ├── ard_gsm
│   │   │   ├── __init__.py
│   │   │   ├── driving_coords.py
│   │   │   ├── limits.py
│   │   │   └── mol.py
│   │   ├── create_gsm.py
│   │   ├── filter_gsm.py
│   │   └── run_gsm.py
│   └── utils
│       └── db_h5_tools
│           ├── db_to_h5.py
│           ├── h5_to_db.py
│           ├── make_db_from_xyzs.py
│           ├── merge_db.py
│           └── merge_h5.py
├── environment.yml
├── setup.cfg
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/python,linux
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux
3 | **/*wb97.py
4 | **/nms
5 | .xtboptok
6 | ### Linux ###
7 | *~
8 |
9 | # temporary files which can be created if a process still has a handle open of a deleted file
10 | .fuse_hidden*
11 |
12 | # KDE directory preferences
13 | .directory
14 |
15 | # Linux trash folder which might appear on any partition or disk
16 | .Trash-*
17 |
18 | # .nfs files are created when an open file is removed but is still being accessed
19 | .nfs*
20 |
21 | ### Python ###
22 | # Byte-compiled / optimized / DLL files
23 | __pycache__/
24 | *.py[cod]
25 | *$py.class
26 |
27 | # C extensions
28 | *.so
29 |
30 | # Distribution / packaging
31 | .Python
32 | build/
33 | develop-eggs/
34 | dist/
35 | downloads/
36 | eggs/
37 | .eggs/
38 | lib/
39 | lib64/
40 | parts/
41 | sdist/
42 | var/
43 | wheels/
44 | share/python-wheels/
45 | *.egg-info/
46 | .installed.cfg
47 | *.egg
48 | MANIFEST
49 |
50 | # PyInstaller
51 | # Usually these files are written by a python script from a template
52 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
53 | *.manifest
54 | *.spec
55 |
56 | # Installer logs
57 | pip-log.txt
58 | pip-delete-this-directory.txt
59 |
60 | # Unit test / coverage reports
61 | htmlcov/
62 | .tox/
63 | .nox/
64 | .coverage
65 | .coverage.*
66 | .cache
67 | nosetests.xml
68 | coverage.xml
69 | *.cover
70 | *.py,cover
71 | .hypothesis/
72 | .pytest_cache/
73 | cover/
74 |
75 | # Translations
76 | *.mo
77 | *.pot
78 |
79 | # Django stuff:
80 | *.log
81 | local_settings.py
82 | db.sqlite3
83 | db.sqlite3-journal
84 |
85 | # Flask stuff:
86 | instance/
87 | .webassets-cache
88 |
89 | # Scrapy stuff:
90 | .scrapy
91 |
92 | # Sphinx documentation
93 | docs/_build/
94 |
95 | # PyBuilder
96 | .pybuilder/
97 | target/
98 |
99 | # Jupyter Notebook
100 | .ipynb_checkpoints
101 |
102 | # IPython
103 | profile_default/
104 | ipython_config.py
105 |
106 | # pyenv
107 | # For a library or package, you might want to ignore these files since the code is
108 | # intended to run in multiple environments; otherwise, check them in:
109 | # .python-version
110 |
111 | # pipenv
112 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
113 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
114 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
115 | # install all needed dependencies.
116 | #Pipfile.lock
117 |
118 | # poetry
119 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
120 | # This is especially recommended for binary packages to ensure reproducibility, and is more
121 | # commonly ignored for libraries.
122 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
123 | #poetry.lock
124 |
125 | # pdm
126 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
127 | #pdm.lock
128 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
129 | # in version control.
130 | # https://pdm.fming.dev/#use-with-ide
131 | .pdm.toml
132 |
133 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
134 | __pypackages__/
135 |
136 | # Celery stuff
137 | celerybeat-schedule
138 | celerybeat.pid
139 |
140 | # SageMath parsed files
141 | *.sage.py
142 |
143 | # Environments
144 | .env
145 | .venv
146 | env/
147 | venv/
148 | ENV/
149 | env.bak/
150 | venv.bak/
151 |
152 | # Spyder project settings
153 | .spyderproject
154 | .spyproject
155 |
156 | # Rope project settings
157 | .ropeproject
158 |
159 | # mkdocs documentation
160 | /site
161 |
162 | # mypy
163 | .mypy_cache/
164 | .dmypy.json
165 | dmypy.json
166 |
167 | # Pyre type checker
168 | .pyre/
169 |
170 | # pytype static type analyzer
171 | .pytype/
172 |
173 | # Cython debug symbols
174 | cython_debug/
175 |
176 | # PyCharm
177 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
178 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
179 | # and can be added to the global gitignore or merged into this file. For a more nuclear
180 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
181 | #.idea/
182 |
183 | ### Python Patch ###
184 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
185 | poetry.toml
186 |
187 | # ruff
188 | .ruff_cache/
189 |
190 | # LSP config files
191 | pyrightconfig.json
192 |
193 | # End of https://www.toptal.com/developers/gitignore/api/python,linux
194 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## [Unreleased] - 2023-10-12
4 |
5 | ### Added
6 | - run_gsm.py: added a test for whether the gsm command is available
7 | - filter_gsm.py: added a step that optimizes the product with xTB and filters out the reaction if the RMSD change is large
8 |
9 |
14 |
15 | ## [0.7.4] - 2024-10-08
16 | ### Changed
17 | - README.md updated.
18 |
19 | ## [0.7.3] - 2024-10-08
20 | ### Changed
21 | - segsm/ard_gsm/mol.py: This version works with radicals: no assumption of four connections on carbon atoms. It also works with other heavy elements.
22 |
23 |
24 | ## [0.7.2] - 2024-09-02
25 | ### Added
26 | - Utilities that handle db and h5 files were added to utils/db_h5_tools.
27 | 1. db_to_h5.py
28 | 2. h5_to_db.py
29 | 3. make_db_from_xyzs.py
30 | 4. merge_db.py
31 | 5. merge_h5.py
32 |
33 | ## [0.7.1] - 2024-09-01
34 |
35 | ### Added
36 | - Normal mode sampling codes are added to utils/nms.
37 | 1. normal_mode_sampling.py
38 | 2. refine_forces_nms.py
39 |
40 | ### Fixed
41 | - All code now asserts the type of the input_path (directory or file)
42 |
43 | ## [0.7.0] - 2024-08-31
44 |
45 | ### Added
46 | - Sampling iso/conformers is included as a preparatory step in dandelion.
47 | 1. smiles_to_isoconfs.py
48 | 2. geom_opt.py
49 | 3. dandelion_prep.py
50 |
51 | - cli.py: dandelion can now be invoked from the cli, e.g. 'dand prep -i ./a.smi -n 40'.
52 |
53 | ### Changed
54 | - dandelion is shortened to 'dand' in the cli.
55 | - dandelion_sample.py: default argument '0_mothers' changed to '0_reactants'
56 | - print_separator, merge_args_with_defaults are moved to \__init__.py
57 |
58 | ## [0.6.2] - 2024-07-08
58 |
59 | ### Fixed
60 | - dandelion.py: renamed to dandelion_refine.py
61 |
62 |
63 |
64 | ## [0.6.1] - 2024-01-14
65 |
66 | ### Changed
67 | - compile_refined.py: fixed a bug when an atoms row doesn't have 'energy' and 'forces'
68 |
69 |
70 | ## [0.6.0] - 2023-11-21
71 |
72 | ### Added
73 | - filter_neb.py: added function is_valid_reaction to filter out spurious reactions
74 |
75 | ## [0.5.6] - 2023-11-21
76 |
77 | ### Changed
78 | - refine_forces.py: suppress errors in force calculations and save them to orca_error.log
79 | - refine_forces.py: now saves samples in batches
80 | - refine_forces.py: opens the .db file with a with-statement
81 |
82 | ## [0.5.5] - 2023-11-14
83 |
84 | ### Added
85 | - run_neb.py: added argument fmax_threshold (default = 0.1 eV/Å)
86 |
87 | ### Fixed
88 | - refine_forces.py: added NoTrah to the orca command
89 |
90 |
91 | ## [0.5.4] - 2023-11-07
92 |
93 | ### Fixed
94 | - compile_neb.py: fixed argparser that had no required=True
95 |
96 |
97 | ## [0.5.3] - 2023-11-02
98 |
99 | ### Fixed
100 | - dandelion_refine.py: awesome ascii art
101 |
102 |
103 | ## [0.5.2] - 2023-11-02
104 |
105 | ### Fixed
106 | - compile_refined.py: sorting the rows in the right order
107 |
108 |
109 | ## [0.5.1] - 2023-10-17
110 |
111 | ### Added
112 | - opt_mothers.py: optimize crude structures using xTB
113 |
114 |
115 | ## [0.5.0] - 2023-10-12
116 |
117 | ### Added
118 | - filter_neb.py: xTB normal mode TS validation: is_transition_state
119 |
120 |
121 | ## [0.4.1] - 2023-10-11
122 |
123 | ### Added
124 | - Added the variable \__version__ to \__init__.py
125 |
126 | ### Fixed
127 | - The 6-31G(d) basis set for the Br atom in orca is now handled, thanks to https://github.com/ZimmermanGroup/ORCA-Basis-Sets
128 |
129 |
130 | ## [0.4.0] - 2023-10-10
131 |
132 | ### Added
133 | - dandelion_refine.py, which runs the refine processes
134 |
135 |
136 | ## [0.3.1] - 2023-10-10
137 |
138 | ### Added
139 | - setup.py, README.md, CHANGELOG.md, LICENSE added
140 |
141 |
142 | ## [0.2.0] - 2023-09-30
143 |
144 | ### Added
145 | - dandelion.py, which runs through neb and refine
146 | - Code refactored
147 |
148 | ### Fixed
149 | - Issues with absolute import fixed
150 |
151 |
152 | ## [0.1.0] - 2023-09-10
153 |
154 | ### Added
155 | - Initial release with features neb, refine, segsm
156 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | MIT License
3 |
4 | Copyright (c) 2023 Minhyeok Lee
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dandelion
2 |
3 | [](https://mhyeok1.github.io/dand_docs/)
4 | [](https://doi.org/10.1002/advs.202409009)
5 | [](https://doi.org/10.5281/zenodo.14020916)
6 |
7 | Codes for automated and efficient sampling of chemical reaction space for MLIP training
8 |
9 | 
10 |
11 | Dandelion is a code for generating datasets that contain both equilibrium and reactive regions of potential energy surfaces, using automated and efficient sampling of chemical reaction space.
12 |
13 | **Documentation**: https://mhyeok1.github.io/dand_docs/
14 |
15 | ## Citation
16 | If you find this work useful for your research, please consider citing:
17 |
18 | - Lee et al. *Adv. Sci.* **12**, 2409009 (2025) [LINK](https://doi.org/10.1002/advs.202409009)
19 |
20 | This work builds upon pioneering works that should also be cited:
21 | - Grambow et al. *Sci. Data* **7**, 137 (2020) [LINK](https://doi.org/10.1038/s41597-020-0460-4)
22 | - Schreiner et al. *Sci. Data* **9**, 779 (2022) [LINK](https://doi.org/10.1038/s41597-022-01870-w)
23 |
24 | ## Supporting Information
25 | The datasets used in the paper are available at Zenodo.
26 |
27 | [](https://doi.org/10.5281/zenodo.14020916)
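28 |
29 | ## Usage
30 | Dandelion is driven by the `dand` command (see `dandelion/cli.py`). A typical three-stage run looks like the following; the paths are illustrative, and further options are exposed by each stage's module parsers:
31 |
32 | 1. `dand prep -i ./mols.smi -n 40` samples and xTB-optimizes iso/conformers from SMILES into `0_reactants`.
33 | 2. `dand sample -i ./0_reactants -n 40` runs SE-GSM and NEB to sample reactive regions, compiled into `xtb.h5`.
34 | 3. `dand refine -i . -n 40 --orca /path/to/orca` recomputes energies and forces at wB97X/6-31G(d), compiled into `wb97x.h5`.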
--------------------------------------------------------------------------------
/dandelion/__init__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | __version__ = '0.7.4'
4 |
5 | def print_separator(text, width=70):
6 | border = "╔" + "═" * (width-2) + "╗"
7 |
8 | total_symbols_len = width - len(text) - 4
9 | half_len = total_symbols_len // 2
10 |     left_symbol = "║" + " " * half_len
11 |     right_symbol = " " * (total_symbols_len - half_len) + "║"
12 | separator = left_symbol + ' ' + text + ' ' + right_symbol
13 |
14 | end = "╚" + "═" * (width-2) + "╝"
15 | print("\n\n" + border)
16 | print(separator)
17 | print(end + "\n\n")
18 |
19 | def merge_args_with_defaults(module_parser, custom_args):
20 | """
21 | Merge custom arguments with module defaults.
22 | Args:
23 | - module_parser: the module parser function
24 | - custom_args: dictionary of custom arguments
25 |
26 | Returns:
27 | - argparse.Namespace: merged namespace of arguments
28 | """
29 |
30 | parser = module_parser()
31 | for action in parser._actions:
32 | if action.required:
33 | action.required = False
34 |
35 | defaults = vars(parser.parse_args([]))
36 | defaults.update(custom_args)
37 |
38 | for action in parser._actions:
39 | if not action.required and action.dest in custom_args:
40 | action.required = True
41 |
42 | return argparse.Namespace(**defaults)
--------------------------------------------------------------------------------
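
A minimal sketch of how the pipeline drivers (dandelion_prep.py, dandelion_sample.py, dandelion_refine.py below) use merge_args_with_defaults; the toy get_parser here is a hypothetical stand-in for a module parser such as geom_opt.get_parser:

```python
import argparse
from dandelion import merge_args_with_defaults

def get_parser():  # hypothetical stand-in for a module parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_path', required=True)
    parser.add_argument('-n', '--max_workers', type=int, default=1)
    return parser

# Values supplied by the pipeline override the module defaults; every other
# option keeps the default declared by the module's own parser.
args = merge_args_with_defaults(get_parser, {'input_path': '0_reactants'})
print(args)  # Namespace(input_path='0_reactants', max_workers=1)
```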
/dandelion/cli.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from dandelion import dandelion_prep, dandelion_sample, dandelion_refine
3 |
4 | def main():
5 | if len(sys.argv) < 2:
6 | print("Usage: dand [prep|sample|refine] [options]")
7 | sys.exit(1)
8 |
9 | command = sys.argv[1]
10 | # Remove the 'dand' and the subcommand from sys.argv
11 | sys.argv = [sys.argv[0]] + sys.argv[2:]
12 |
13 | if command == "prep":
14 | dandelion_prep.main()
15 | elif command == "sample":
16 | dandelion_sample.main()
17 | elif command == "refine":
18 | dandelion_refine.main()
19 | else:
20 | print(f"Unknown command: {command}")
21 | print("Available commands: prep, sample, refine")
22 | sys.exit(1)
23 |
24 | if __name__ == "__main__":
25 | main()
--------------------------------------------------------------------------------
/dandelion/dandelion_prep.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | import sys
4 | import time
5 | import argparse
6 |
7 | from dandelion import __version__, print_separator, merge_args_with_defaults
8 | from dandelion.prep.smiles_to_isoconfs import main as smiles_to_isoconfs, get_parser as smiles_to_isoconfs_parser
9 | from dandelion.prep.geom_opt import main as geom_opt, get_parser as geom_opt_parser
10 |
11 |
12 | def print_header(width=70):
13 |
14 | print(f'''
15 |
16 | H H
17 | \\\\ -
18 | \\\\ -
19 | C──────────C\ H
20 | - \\\\ /
21 | - \\\\ /
22 | H────────C O=Cc1ccccc1 C──────────C
23 | \\\\ - \\\\
24 | \\\\ - \\\\
25 | \\C─────────C- O
26 | - \\\\
27 | - \\\\
28 | H H
29 |
30 | {"Prepare Iso/Conformers from SMILES strings".center(width)}
31 | {("Dandelion " + __version__ + " by mlee").center(width)}
32 | ''')
33 |
34 |
35 | def main():
36 | args = parse_arguments()
37 |
38 | input_path = args.input_path
39 | if not os.path.isfile(input_path):
40 | sys.exit(f"Error: '{input_path}' is not a file.")
41 | working_path = os.path.dirname(input_path)
42 | max_workers = args.max_workers
43 |
44 | phases = [
45 |         ("1. Sample iso/conformers from SMILES strings", smiles_to_isoconfs, smiles_to_isoconfs_parser, {
46 | 'input_path': input_path,
47 | 'output_path': os.path.join(working_path, '-1_isoconfs'),
48 | }),
49 | ("2. Optimize geometries", geom_opt, geom_opt_parser, {
50 | 'input_path': os.path.join(working_path, '-1_isoconfs'),
51 | 'output_path': os.path.join(working_path, '0_reactants'),
52 | 'max_workers': max_workers
53 | }),
54 | ]
55 |
56 | print_header()
57 |
58 | for title, function, parser, custom_args in phases:
59 | time.sleep(3)
60 | print_separator(title)
61 | merged_args = merge_args_with_defaults(parser, custom_args)
62 | function(merged_args)
63 |
64 |
65 | def parse_arguments():
66 |     parser = argparse.ArgumentParser(description='Prepare optimized iso/conformers from SMILES. \
67 |                                       Other parameters can be set in each module.')
68 |
69 | parser.add_argument('-i', '--input_path', required=True,
70 | help='Input path of a file containing SMILES')
71 | parser.add_argument('-n', '--max_workers', type=int, default=1,
72 | help='Number of processes to use for parallel execution.')
73 |
74 | return parser.parse_args()
75 |
76 |
77 | if __name__ == "__main__":
78 | main()
79 |
--------------------------------------------------------------------------------
/dandelion/dandelion_refine.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | import sys
4 | import time
5 | import argparse
6 |
7 | from dandelion import __version__, print_separator, merge_args_with_defaults
8 | from dandelion.refine.refine_forces import main as refine_forces, get_parser as refine_forces_parser
9 | from dandelion.refine.compile_refined import main as compile_refined, get_parser as compile_refined_parser
10 |
11 |
12 |
13 | def print_header(width=70):
14 |
15 | print(f'''
16 |
17 | ⢀⣀⣀⣀⣀⣀⡀ ⢀⢀⣀⢀⠞⠖⠁⠡⡂⡆ ⡠⢀⡀
18 | ⠺⢿⣿⣿⣿⣿⣿⣿⣷⣦⣠⣤⣤⣤⣄⣀⣀ ⡏⢸ ⢀ ⠣⠈ ⡠⡋⡨⡋⡂
19 | ⠙⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⣦⣄⡀⡎⢀⡰⢀⢎⠌⢀⠔⣐⠠⣄⣀
20 | ⢀ ⡔⢀⣴⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿⠿⠿⣿⣿⣷⣄⠂ ⢊⠎ ⠠⠂⡀⠕⠌⠌ ⡄⡠⢄
21 | ⢀⡆⠄⠁⢈⢠⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣀ ⣀⣿⣿⣿⣆⠐ ⡨⠒⠁⡀⢠⣦⠍⠇⡀⢲⠂⡄⠄
22 | ⠨⡀⠑⡈ ⢠⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡄ ⠈ ⣬⠠⣰⣿ ⢳⢹⡄⡆⠄⢀⢼
23 | ⡄⠱⠈⠁⠑⢄⠐⣾⣿⣿⡿⠋⠁⣀⣠⣬⣽⣿⣿⣿⣿⣿⣿⠿⠿⠿⠿⠿⠿⠿⠿⠟⠁⡟⣅⡢⠁⠠⠜⡄⡑⢌⢧⡀ ⡀⣰⢁⡐⢁⢄⣡⣧⡤⠄
24 | ⠠⡐⠓⠂⠌ ⢀⣿⣿⡏⢀⣴⣿⠿⠛⠉⠉⠶⢸⣿⣿⠿⠁⠢⠨⢀⣻⣿⣿⣿⣿⢟⣿⣝⠂ ⠠⡠⢆⠈⡂⠱⡇ ⣅⠫⠂⡠⢂⡪⠋ ⠁⡆
25 | ⡶⠉ ⢀⡀⠁⡁⢸⣿⣿⢠⣾⡟⠁⣿⣿⡇ ⢀⠈⠉⠁ ⣀⠷⣹⣏⣷⢏⠹⠁ ⠈⢈ ⢇ ⢸⠱⢸⡏⡀⡶⡸⠎ ⠰⠁⡸
26 | ⢈⡕⡈⠁⠐⠂⢀⢸⣿⣿⣾⠏⣿⣿⡿⣻⣿⢞⡢⠄ ⠈ ⡀⡤⠂⠁⠉⠌ ⢀⢀⠠⠐⢄ ⡀⢆⠎⢹⣶⣷⣧⡈⠈⠉⠤⠂⠉⢀⠱⡀
27 | ⢠⡊ ⠁⣸⣿⣿⣿⣀⠉⡻⡏⠋⠁ ⠁⠒⠒⡀⣍⠍⠁ ⡀ ⢠⠂ ⢀⠈⠄⢀⠄⡒⠅⠈⢄⢡ ⢿⣿⣷⣿⡄ ⠐⠄⠤ ⠜⢀
28 | ⠐⠁ ⠤⠒⢠⣾⣿⣿⣿⣿⣿⣷⣄⢄ ⢀ ⡏ ⢰⣃⠊⡐⠐⠁⢀⠈ ⣀ ⠰⠢⢀⠂⡰⠈⠂ ⡱⠂⢂⡇⡈⠻⢿⣿⠇ ⡤⠄⣀⡰⠁
29 | ⠁⣾⣿⣿⣿⣿⣿⣿⣿⣿⣦ ⠄ ⠉ ⠸⠫⢞⠈⣰⠈ ⡐⢲⣿⡏ ⢠⡾ ⣀⠊⢱ ⠠⡀ ⢈⢀⡐⠤⣕⡄
30 | ⢰⣿⡿⠛⠉ ⠈⠙⠛ ⠈⠈ ⠻⠔⠁⢸⡍⡇ ⢀⣏ ⢀⠠⠆ ⠣⡀⠈⡠⡀⠉⠢⡤⠢⣈⡡⣢⠦
31 | ⠈⠁ ⢻⣇ ⢸⡇⡇ ⣼⡿⠉ ⢀⡇ ⠑⡄⠑⣌⢄ ⠙⢄⠠⡪⣅
32 | ⠈⣾⡆ ⢸⣏⡇ ⢠⣿⠇ ⠸⢌⢢⢄⡠⠣⠈⠢⡁⡈⣎⢢⡬⠃
33 |
34 | {"Energy refinement on samples using orca".center(width)}
35 | {("Dandelion " + __version__ + " by mlee").center(width)}
36 | ''')
37 |
38 |
39 | def main():
40 | args = parse_arguments()
41 |
42 | input_path = args.input_path
43 | if not os.path.isdir(input_path):
44 | sys.exit(f"Error: '{input_path}' is not a directory.")
45 | max_workers = args.max_workers
46 | orcabinary = args.orca
47 |
48 | phases = [
49 | ("1. Refining forces", refine_forces, refine_forces_parser, {
50 | 'input_path': os.path.join(input_path, 'xtb.h5'),
51 | 'output_path': os.path.join(input_path, 'wb97x.db'),
52 | 'orca' : orcabinary,
53 | 'max_workers': max_workers
54 | }),
55 | ("2. Compiling final samples", compile_refined, compile_refined_parser, {
56 | 'input_path': os.path.join(input_path, 'wb97x.db'),
57 | 'output_path': os.path.join(input_path, 'wb97x.h5')
58 | }),
59 | ]
60 |
61 | print_header()
62 |
63 | for title, function, parser, custom_args in phases:
64 | time.sleep(3)
65 | print_separator(title)
66 | merged_args = merge_args_with_defaults(parser, custom_args)
67 | function(merged_args)
68 |
69 |
70 | def parse_arguments():
71 |     parser = argparse.ArgumentParser(description='Refine forces on obtained samples. \
72 |                                      Other parameters can be set in each module.')
73 |
74 | parser.add_argument('-i', '--input_path', required=True,
75 | help='Input path of directory containing xtb.h5')
76 | parser.add_argument('-n', '--max_workers', type=int, required=True,
77 | help='Number of worker processes')
78 | parser.add_argument('--orca', required=True,
79 | help="Path of the orca binary file")
80 |
81 | return parser.parse_args()
82 |
83 |
84 | if __name__ == "__main__":
85 | main()
86 |
--------------------------------------------------------------------------------
/dandelion/dandelion_sample.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | import sys
4 | import time
5 | import argparse
6 |
7 | from dandelion import __version__, print_separator, merge_args_with_defaults
8 | from dandelion.segsm.create_gsm import main as create_gsm, get_parser as create_gsm_parser
9 | from dandelion.segsm.run_gsm import main as run_gsm, get_parser as run_gsm_parser
10 | from dandelion.segsm.filter_gsm import main as filter_gsm, get_parser as filter_gsm_parser
11 | from dandelion.neb.run_neb import main as run_neb, get_parser as run_neb_parser
12 | from dandelion.neb.filter_neb import main as filter_neb, get_parser as filter_neb_parser
13 | from dandelion.neb.compile_neb import main as compile_neb, get_parser as compile_neb_parser
14 |
15 |
16 | def print_header(width=70):
17 |
18 | print(f'''
19 |
20 | `;:` BREAK 1 2
21 | .;:; / BREAK 3 4
22 | _____ _ _;::; ` ADD 1 3
23 | | __ \ | | | |';:;'
24 | | | | | __ _ _ __ __| | ___| | _ ___ _ __
25 | | | | |/ _` | '_ \ / _` |/ _ \ | | |/ _ \| '_ \
26 | | |__| | (_| | | | | (_| | __/ | | | (_) | | | |
27 | |_____/ \__,_|_| |_|\__,_|\___|_| |_|\___/|_| |_|
28 |
29 | {"Chemical compound space sampling".center(width)}
30 | {"near transition state using xTB, SE-GSM and NEB".center(width)}
31 | {("Dandelion " + __version__ + " by mlee").center(width)}
32 | ''')
33 |
34 |
35 | def main():
36 | args = parse_arguments()
37 |
38 | input_path = args.input_path
39 | if not os.path.isdir(input_path):
40 | sys.exit(f"Error: '{input_path}' is not a directory.")
41 | output_path = os.path.dirname(os.path.dirname(input_path))
42 | max_workers = args.max_workers
43 |
44 | if not os.path.exists(output_path):
45 | os.makedirs(output_path)
46 |
47 | phases = [
48 | ("1. Creating GSM", create_gsm, create_gsm_parser, {
49 | 'input_path': input_path,
50 | 'output_path': os.path.join(output_path, '1_gsm')
51 | }),
52 | ("2. Running GSM", run_gsm, run_gsm_parser, {
53 | 'input_path': os.path.join(output_path, '1_gsm'),
54 | 'max_workers': max_workers
55 | }),
56 | ("3. Filtering GSM", filter_gsm, filter_gsm_parser, {
57 | 'input_path': os.path.join(output_path, '1_gsm'),
58 | 'output_path': os.path.join(output_path, '2_gsm_filtered')
59 | }),
60 |
61 | ("4. Running NEB", run_neb, run_neb_parser, {
62 | 'input_path': os.path.join(output_path, '2_gsm_filtered'),
63 | 'output_path': os.path.join(output_path, '3_neb'),
64 | 'max_workers': max_workers
65 | }),
66 | ("5. Filtering NEB", filter_neb, filter_neb_parser, {
67 | 'input_path': os.path.join(output_path, '3_neb'),
68 | 'output_path': os.path.join(output_path, '4_neb_filtered')
69 | }),
70 | ("6. Compiling samples", compile_neb, compile_neb_parser, {
71 | 'input_path': os.path.join(output_path, '4_neb_filtered', 'reactions.json'),
72 | 'output_path': os.path.join(output_path, 'xtb.h5')
73 | }),
74 | ]
75 |
76 | print_header()
77 |
78 | for title, function, parser, custom_args in phases:
79 | time.sleep(3)
80 | print_separator(title)
81 | merged_args = merge_args_with_defaults(parser, custom_args)
82 | function(merged_args)
83 |
84 |
85 | def parse_arguments():
86 |     parser = argparse.ArgumentParser(description='Run SE-GSM and NEB from reactant structures. \
87 |                                       Other parameters can be set in each module.')
88 |
89 | parser.add_argument('-i', '--input_path', required=True,
90 | help='Input path of reactant structures (must be a directory)')
91 | parser.add_argument('-n', '--max_workers', type=int, required=True,
92 | help='Number of worker processes')
93 | return parser.parse_args()
94 |
95 |
96 | if __name__ == "__main__":
97 | main()
--------------------------------------------------------------------------------
/dandelion/neb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/neb/__init__.py
--------------------------------------------------------------------------------
/dandelion/neb/compile_neb.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import hashlib
5 | import argparse
6 | import itertools
7 |
8 | import h5py
9 | import ase.db
10 | import numpy as np
11 | from tqdm import tqdm
12 | from ase.units import Hartree, Bohr
13 |
14 |
15 | def get_hash(row):
16 | s = str(row.positions) + row.formula
17 | return int(hashlib.sha1(s.encode("utf-8")).hexdigest(), 16) % (10 ** 8)
18 |
19 | def write_rxn(h5file, fmaxs_path, db_path, rxn, fmax_threshold):
20 | fmaxs = json.load(open(fmaxs_path))
21 |
22 | skip_next = False
23 | first = True
24 | cum_fmax = 0
25 |
26 | with ase.db.connect(db_path) as db:
27 | for i, (fmax, path) in enumerate(zip(fmaxs, sliced_it(10, db.select("")))):
28 | cum_fmax += fmax
29 | skip_this = skip_next
30 | skip_next = False
31 | last = i == len(fmaxs) - 1
32 |
33 | if last:
34 | skip_this = False
35 |
36 | if cum_fmax < fmax_threshold:
37 | skip_next = True
38 |
39 | else:
40 | cum_fmax = 0
41 |
42 | if skip_this:
43 | continue
44 |
45 | if not first:
46 | path = path[1:-1]
47 |
48 |             # reactant and product are sampled only once
49 |             # (all points - 2) // 8 == 0
50 |
51 | forces_path = np.array([row.forces for row in path])
52 | positions_path = np.array([row.positions for row in path])
53 | energy_path = np.array([row.energy for row in path])
54 |
55 | if first:
56 | forces = forces_path
57 | positions = positions_path
58 | energy = energy_path
59 | reactant = path[0] # pylint: disable=undefined-loop-variable
60 | product = path[-1] # pylint: disable=undefined-loop-variable
61 |
62 | else:
63 | forces = np.concatenate((forces, forces_path), axis=0)
64 | positions = np.concatenate((positions, positions_path), axis=0)
65 | energy = np.concatenate((energy, energy_path), axis=0)
66 |
67 | first = False
68 |
69 | transition_state = path[ # pylint: disable=undefined-loop-variable
70 | np.argmax(energy_path)
71 | ]
72 |
73 | formula = reactant.formula
74 | atomic_numbers = reactant.numbers
75 |
76 | if formula in h5file:
77 | grp = h5file[formula]
78 | else:
79 | grp = h5file.create_group(formula)
80 |
81 | subgrp = grp.create_group(rxn)
82 | single_molecule(reactant, subgrp.create_group("reactant"))
83 | single_molecule(transition_state, subgrp.create_group("transition_state"))
84 | single_molecule(product, subgrp.create_group("product"))
85 |
86 | dict_ = {
87 | "forces": forces,
88 | "positions": positions,
89 | "energy": energy,
90 | "atomic_numbers": atomic_numbers,
91 | }
92 | write_group(dict_, subgrp)
93 |
94 |
95 | def single_molecule(molecule, subgrp):
96 | dict_ = {
97 | "forces": np.expand_dims(molecule.forces, 0),
98 | "positions": np.expand_dims(molecule.positions, 0),
99 | "energy": np.expand_dims(molecule.energy, 0),
100 | "atomic_numbers": molecule.numbers,
101 | "hash": get_hash(molecule),
102 | }
103 | write_group(dict_, subgrp)
104 |
105 |
106 | def write_group(dict_, grp):
107 | grp.create_dataset("atomic_numbers", data=dict_["atomic_numbers"])
108 | grp.create_dataset("GFN2-xTB.forces", data=dict_["forces"])
109 | grp.create_dataset("GFN2-xTB.energy", data=dict_["energy"])
110 | grp.create_dataset("positions", data=dict_["positions"])
111 |
112 | if "hash" in dict_:
113 | grp.create_dataset("hash", data=dict_["hash"])
114 |
115 |
116 | def sliced_it(n, iterable):
117 |     """Yield successive n-sized chunks, stopping once the iterable is exhausted."""
118 |     it = iter(iterable)
119 |     while chunk := list(itertools.islice(it, n)):
120 |         yield chunk
121 |
122 |
123 | def main(args):
124 |
125 | print_args(args)
126 |
127 | input_path = args.input_path
128 | if not os.path.isfile(input_path):
129 | sys.exit(f"Error: '{input_path}' is not a file.")
130 | output_path = args.output_path
131 | fmax_threshold = args.fmax_threshold
132 |
133 | rxns = json.load(open(input_path))
134 | h5file = h5py.File(output_path, "w")
135 |
136 | data = h5file.create_group("data")
137 | indexfile = open(output_path + ".index.json", "w")
138 | index = {}
139 |
140 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
141 | for i, path in tqdm(enumerate(rxns), total=len(rxns), desc="Compiling reactions", bar_format=bar_format, ncols=70):
142 |
143 | fmaxs_path = os.path.join(path, "fmaxs.json")
144 | db_path = os.path.join(path, "neb.db")
145 |
146 | new_rxn_name = f"rxn{str(i).zfill(4)}"
147 | write_rxn(data, fmaxs_path, db_path, new_rxn_name, fmax_threshold)
148 | index[new_rxn_name] = os.path.basename(path)
149 |
150 |     json.dump(index, indexfile, indent=4)
151 |     indexfile.close(); h5file.close()  # ensure the index and h5 file are flushed to disk
152 | print('Compiling finished!')
153 |
154 | def print_args(args):
155 | print()
156 | print("Arguments provided:")
157 | arg_dict = vars(args)
158 | for key, value in arg_dict.items():
159 | print(f" {key}: {value}")
160 | print()
161 |
162 | def get_parser():
163 | parser = argparse.ArgumentParser(description="Compile filtered neb jobs to xtb h5 file.")
164 |
165 | parser.add_argument('-i', '--input_path', required=True,
166 | help="Path of reactions.json, contains all reactions that should be included in the dataset ")
167 | parser.add_argument('-o', '--output_path', required=True,
168 | help="Path to the h5 file to write to")
169 |     parser.add_argument('--fmax_threshold', type=float, default=0.1,
170 | help='Fmax threshold for selecting bands')
171 | return parser
172 |
173 | if __name__ == "__main__":
174 | args = get_parser().parse_args()
175 | main(args)
176 |
177 |
--------------------------------------------------------------------------------
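
A toy sketch of the band-selection rule implemented in write_rxn above: while the accumulated fmax change stays below fmax_threshold, the next NEB iteration is skipped, so nearly-converged stretches of the trajectory are subsampled while the first and last bands are always kept. The numbers here are illustrative:

```python
fmaxs = [0.04, 0.03, 0.2, 0.05, 0.02]  # illustrative per-iteration fmax values
fmax_threshold = 0.1
cum_fmax, skip_next, kept = 0.0, False, []
for i, fmax in enumerate(fmaxs):
    cum_fmax += fmax
    skip_this, skip_next = skip_next, False
    if i == len(fmaxs) - 1:
        skip_this = False      # the final (converged) band is always kept
    if cum_fmax < fmax_threshold:
        skip_next = True       # little accumulated change: skip the next band
    else:
        cum_fmax = 0
    if not skip_this:
        kept.append(i)
print(kept)  # [0, 3, 4]
```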
/dandelion/neb/filter_neb.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import shutil
5 | import argparse
6 | from collections import defaultdict
7 |
8 | from tqdm import tqdm
9 | import numpy as np
10 | from ase.io import read
11 | from ase.vibrations import Vibrations
12 | from xtb.ase.calculator import XTB
13 |
14 | def get_energy_from_xyz(file_path):
15 | """Extracts the energy of a structure from an XYZ file."""
16 | try:
17 | atom = read(file_path)
18 | return atom.get_potential_energy()
19 |     except Exception:
20 | return None
21 |
22 | def is_valid_rxn(reactant_path, product_path, ts_path):
23 | """Check if the reaction is valid based on energy."""
24 |
25 | reactant_energy = get_energy_from_xyz(reactant_path)
26 | product_energy = get_energy_from_xyz(product_path)
27 | ts_energy = get_energy_from_xyz(ts_path)
28 |     if None in (reactant_energy, product_energy, ts_energy): return False  # a structure failed to read
29 | if abs(reactant_energy - product_energy) < 5 * 0.0433634: # delta E below 5 kcal/mol
30 | return False
31 |
32 | if abs(ts_energy - reactant_energy) < 5 * 0.0433634: # reverse AE below 5 kcal/mol
33 | return False
34 |
35 | if abs(ts_energy - product_energy) < 5 * 0.0433634: # reverse AE below 5 kcal/mol
36 | return False
37 |
38 | return product_energy != ts_energy
39 |
40 |
41 | def is_transition_state(ts_file_path, threshold=50): #cm-1
42 | struc = read(ts_file_path)
43 | struc.calc = XTB(method="GFN2-xTB")
44 |
45 | try:
46 | vib = Vibrations(struc)
47 | vib.run()
48 | frequencies = vib.get_frequencies()
49 | vib.clean()
50 |
51 |         # Count imaginary frequencies with magnitude above the threshold
52 |         significant_imaginary_freqs = np.count_nonzero(np.abs(np.imag(frequencies)) > threshold)
53 |
54 | return significant_imaginary_freqs == 1
55 |     except Exception:
56 | return False
57 |
58 | def main(args):
59 |
60 | print_args(args)
61 |
62 | input_path = args.input_path
63 | if not os.path.isdir(input_path):
64 | sys.exit(f"Error: '{input_path}' is not a directory.")
65 | output_path = args.output_path
66 | if not os.path.exists(output_path):
67 | os.mkdir(output_path)
68 |
69 | grown_seeds = [dirpath for dirpath, _, filenames in os.walk(input_path) if "converged" in filenames]
70 | grown_seeds_copy = grown_seeds
71 | # Group by mother string
72 | grouped_seeds = defaultdict(list)
73 | for seed in grown_seeds:
74 | mother_string = os.path.basename(seed)[:-8] # gsmGeom-m1-i1-c1-opt-gsm0044 -> gsmGeom-m1-i1-c1-opt
75 | grouped_seeds[mother_string].append(seed)
76 | rxn_list = []
77 |
78 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
79 | for mother_string, seeds in tqdm(grouped_seeds.items(), desc="Mothers", position=0, bar_format=bar_format, ncols=70):
80 | idx = 0
81 | for f in tqdm(seeds, desc=f"Rxns in {mother_string}", position=1, bar_format=bar_format, ncols=70, leave=False):
82 |
83 | ts_file_path = os.path.join(f, 'transition_state.xyz')
84 | reactant_path = os.path.join(f, 'reactant.xyz')
85 | product_path = os.path.join(f, 'product.xyz')
86 |
87 | if not is_valid_rxn(reactant_path, product_path, ts_file_path):
88 | continue
89 |
90 | if not is_transition_state(ts_file_path):
91 | # print(f"Directory {f} is not a valid reaction. Skipping...")
92 | continue
93 |
94 | # If True, copy the directory
95 | new_name = os.path.join(output_path, f'{mother_string}-rxn{idx:04}')
96 | shutil.copytree(f, new_name)
97 | rxn_list.append(new_name)
98 | idx += 1
99 |
100 | with open(os.path.join(output_path, 'reactions.json'), 'w') as f:
101 | json.dump(rxn_list, f, indent=4)
102 |
103 | print(f'\n{len(rxn_list)}/{len(grown_seeds_copy)} rxns were saved to {output_path}/reactions.json')
104 | print('Filtering NEB finished!')
105 |
106 |
107 | def print_args(args):
108 | print()
109 | print("Arguments provided:")
110 | arg_dict = vars(args)
111 | for key, value in arg_dict.items():
112 | print(f" {key}: {value}")
113 | print()
114 |
115 | def get_parser():
116 | parser = argparse.ArgumentParser(description='Filter neb jobs and make reactions.json')
117 |
118 | parser.add_argument('-i', '--input_path', required=True,
119 | help='Input path of finished neb jobs')
120 | parser.add_argument('-o', '--output_path', required=True,
121 | help='Output path of filtered neb jobs')
122 |
123 | return parser
124 |
125 |
126 | if __name__ == "__main__":
127 | args = get_parser().parse_args()
128 | main(args)
129 |
130 |
131 |
132 |
133 |
--------------------------------------------------------------------------------
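
The energy filters in is_valid_rxn above work in eV; the hardcoded 0.0433634 is the kcal/mol-to-eV conversion factor, so each cutoff corresponds to 5 kcal/mol. A quick sanity check with ASE's own unit constants:

```python
from ase.units import kcal, mol

# 1 kcal/mol expressed in eV; agrees with the 0.0433634 constant used above.
print(kcal / mol)  # ~0.0434
```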
/dandelion/neb/run_neb.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import sys
4 | import json
5 | import argparse
6 | from functools import partial
7 | from concurrent.futures import ProcessPoolExecutor
8 |
9 | import uuid
10 | import numpy as np
11 | from tqdm import tqdm
12 |
13 | import matplotlib
14 | import matplotlib.pyplot as plt
15 | import imageio.v2 as imageio
16 | from PIL import Image, ImageOps
17 |
18 | import ase.db
19 | from ase.io import read, write
20 | from xtb.ase.calculator import XTB
21 | from ase.optimize.bfgs import BFGS
22 | from ase.utils.forcecurve import fit_images
23 | from ase.neb import NEB, NEBOptimizer, NEBTools
24 | from ase.calculators.orca import ORCA
25 |
26 | class SuppressStderr:
27 | def __enter__(self):
28 | self._original_stderr = sys.stderr
29 | sys.stderr = open(os.devnull, 'w')
30 |
31 | def __exit__(self, exc_type, exc_val, exc_tb):
32 | sys.stderr.close()
33 | sys.stderr = self._original_stderr
34 |
35 | def plot_mep(fit_list):
36 | fit_list[:,1,:] *= 23.0609 #to kcal/mol
37 |     gray_scale = matplotlib.colormaps['binary']  # .get()'s second argument is a dict-style default, not a LUT size
38 | fig, ax = plt.subplots()
39 | for i in range(len(fit_list)):
40 |
41 | if i+1 == len(fit_list):
42 | ax.plot(fit_list[i,0,:], fit_list[i,1,:], color='red', linewidth=3)
43 | break
44 |
45 | color = gray_scale(max(i / len(fit_list), 0.1))
46 | ax.plot(fit_list[i,0,:], fit_list[i,1,:], color=color)
47 |
48 | ax.set_title(f'Iter {len(fit_list)}')
49 | ax.set_axisbelow(True)
50 | ax.set_ylabel("Energy [kcal/mol]")
51 | ax.set_xlabel("Reaction Coordinate [AA]")
52 | return fig
53 |
54 | def get_fit(neb_tools):
55 | fit = fit_images(neb_tools.images)
56 | return fit.fit_path, fit.fit_energies
57 |
58 | class CalculationChecker:
59 | def __init__(self, neb):
60 | self.neb = neb
61 |
62 | def check_calculations(self):
63 | missing_calculations = []
64 | for i, image in enumerate(self.neb.images[1:-1]):
65 | if {"forces", "energy"} - image.calc.results.keys():
66 | missing_calculations.append(i)
67 |
68 | if missing_calculations:
69 | raise ValueError(f"missing calculation for image(s) {missing_calculations}")
70 |
71 |
72 | class DBWriter:
73 | def __init__(self, db_path, atomss):
74 | self.atomss = atomss
75 | self.db_path = db_path
76 |
77 | def write(self):
78 | with ase.db.connect(self.db_path) as db:
79 | for atoms in self.atomss:
80 | if atoms.calc.results:
81 | db.write(atoms, data=atoms.calc.results)
82 |
83 |
84 | def interpolate_band(atom_configs, transition_state=None):
85 | if transition_state:
86 | transition_state = read(transition_state)
87 | ts_positions = transition_state.get_positions()
88 | middle_idx = len(atom_configs) // 2
89 | atom_configs[middle_idx].set_positions(ts_positions)
90 | first_band = NEB(atom_configs[: middle_idx + 1])
91 | second_band = NEB(atom_configs[middle_idx:])
92 | first_band.interpolate("idpp")
93 | second_band.interpolate("idpp")
94 | else:
95 | band = NEB(atom_configs)
96 | band.interpolate("idpp")
97 | return atom_configs
98 |
99 |
100 | def max_dimensions(frames):
101 | """Get the maximum width and height among a list of images."""
102 | max_width = max_height = 0
103 | for frame in frames:
104 | with Image.open(frame) as img:
105 | width, height = img.size
106 | max_width = max(max_width, width)
107 | max_height = max(max_height, height)
108 | return max_width, max_height
109 |
110 | def pad_image(image_path, target_size):
111 | """Pad an image to the target size."""
112 | with Image.open(image_path) as img:
113 | img = ImageOps.expand(img, border=((target_size[0]-img.size[0])//2,
114 | (target_size[1]-img.size[1])//2,
115 | (target_size[0]-img.size[0]+1)//2,
116 | (target_size[1]-img.size[1]+1)//2),
117 | fill='white') # or another suitable color for your images
118 | return img
119 |
120 | def frames_to_gif(frames, output_gif):
121 | # First, render each Atoms frame to an image
122 | image_paths = []
123 | for i, frame in enumerate(frames):
124 | img_path = f"tmp_frame_{i}_{uuid.uuid4()}.png"
125 | write(img_path, frame)
126 | image_paths.append(img_path)
127 |
128 | # Determine the max dimensions
129 | max_width, max_height = max_dimensions(image_paths)
130 |
131 | # Create a list to store processed frames
132 | processed_frames = []
133 |
134 | # Pad each frame, ensuring a non-transparent background
135 | for img_path in image_paths:
136 | with Image.open(img_path) as opened_img:
137 | padded_frame = pad_image(img_path, (max_width, max_height))
138 |
139 | # Create a white background and paste the frame onto it to ensure non-transparency
140 | background = Image.new('RGB', padded_frame.size, (255, 255, 255))
141 | background.paste(padded_frame, mask=(padded_frame.split()[3] if len(padded_frame.split()) == 4 else None))
142 | processed_frames.append(np.array(background))
143 |
144 | # Extend the list of processed frames with a reversed copy (excluding the last frame)
145 | extended_frames = processed_frames + processed_frames[-2::-1]
146 |
147 | # Save the gif using imageio
148 | with imageio.get_writer(output_gif, mode='I', duration=0.5) as writer:
149 | for processed_frame in extended_frames:
150 | writer.append_data(processed_frame)
151 |
152 | # Cleanup the temporary image files
153 | for img_path in image_paths:
154 | os.remove(img_path)
155 |
156 |
157 | def process_seed(seed, n_images, neb_fmax, cineb_fmax, steps, output_path):
158 |
159 | with SuppressStderr(): # xTB is so noisy when not converged
160 | try:
161 | #print(f"Starting from seed : {seed}")
162 | reactant = os.path.join(seed, 'reactant.xyz')
163 | product = os.path.join(seed, 'product.xyz')
164 | transition_state = os.path.join(seed, 'ts.xyz')
165 | product = read(product)
166 | reactant = read(reactant)
167 |
168 | output = os.path.join(output_path, seed.split('/')[-2]+'-'+seed.split('/')[-1])
169 | os.makedirs(output, exist_ok=True)
170 | atom_configs = [reactant.copy() for i in range(n_images - 1)] + [product]
171 |
172 | for i, atom_config in enumerate(atom_configs):
173 | atom_config.calc = XTB(method='GFN2-xTB')
174 |
175 | #print("Relaxing endpoints ... ")
176 | BFGS(atom_configs[0], logfile=None).run()
177 | BFGS(atom_configs[-1], logfile=None).run()
178 |
179 | #print("Interpolating band ... ")
180 | interpolate_band(atom_configs, transition_state)
181 |
182 | #print("Running NEB ... ")
183 |             neb = NEB(atom_configs, climb=False, parallel=False)  # plain NEB first; CI is enabled after it converges
184 | calculation_checker = CalculationChecker(neb)
185 | neb_tools = NEBTools(neb.images)
186 |
187 | relax_neb = NEBOptimizer(neb, logfile=None)
188 | db_writer = DBWriter(os.path.join(output, "neb.db"), atom_configs)
189 | fmaxs = []
190 | fit_list = []
191 | relax_neb.attach(calculation_checker.check_calculations)
192 | relax_neb.attach(db_writer.write)
193 | relax_neb.attach(lambda: fmaxs.append(neb_tools.get_fmax()))
194 | relax_neb.attach(lambda: fit_list.append(get_fit(neb_tools)))
195 |
196 | converged = relax_neb.run(fmax=neb_fmax, steps=steps)
197 |
198 |             if not converged:
199 |                 raise RuntimeError(f"NEB did not converge for seed {seed}")
200 |
201 | #print("NEB has converged, turn on CI-NEB ...")
202 | neb.climb = True
203 | ci_converged = relax_neb.run(fmax=cineb_fmax, steps=steps)
204 |
205 | if ci_converged:
206 | open(os.path.join(output, "converged"), "w")
207 | #print("Reaction converged ... ")
208 | fit_list = np.array(fit_list)
209 | fig = plot_mep(fit_list)
210 | if ci_converged:
211 | np.save(os.path.join(output, "fitlist.npy"), fit_list)
212 |
213 | fig.savefig(os.path.join(output, "mep.png"))
214 | json.dump(fmaxs, open(os.path.join(output, "fmaxs.json"), "w"), indent=4)
215 | transition_state = max(atom_configs, key=lambda x: x.get_potential_energy())
216 | write(os.path.join(output, "transition_state.xyz"), transition_state)
217 | write(os.path.join(output, "transition_state.png"), transition_state)
218 | write(os.path.join(output, "reactant.xyz"), atom_configs[0])
219 | write(os.path.join(output, "reactant.png"), atom_configs[0])
220 | write(os.path.join(output, "product.xyz"), atom_configs[-1])
221 | write(os.path.join(output, "product.png"), atom_configs[-1])
222 | write(os.path.join(output, "mep.xyz"), atom_configs)
223 | frames_to_gif(atom_configs, os.path.join(output, "mep.gif"))
224 |
225 | return seed
226 |
227 | except Exception as e:
228 | #print(f"Error processing seed {seed}: {e}")
229 | return None
230 |
231 | def main(args):
232 |
233 | print_args(args)
234 |
235 | input_path = args.input_path
236 | if not os.path.isdir(input_path):
237 | sys.exit(f"Error: '{input_path}' is not a directory.")
238 | output_path = args.output_path
239 | if not os.path.exists(output_path):
240 | os.mkdir(output_path)
241 | max_workers = args.max_workers
242 | n_images = args.n_images
243 | neb_fmax = args.neb_fmax
244 | cineb_fmax = args.cineb_fmax
245 | steps = args.steps
246 |
247 |
248 | seeds = [dirpath for dirpath, _, filenames in os.walk(input_path) if "ts.png" in filenames]
249 |
250 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
251 | # Use a partial function to pass the extra arguments to process_seed
252 | process_with_args = partial(process_seed, n_images=n_images, neb_fmax=neb_fmax,
253 | cineb_fmax=cineb_fmax, steps=steps, output_path=output_path)
254 | with ProcessPoolExecutor(max_workers=max_workers) as executor:
255 | results = list(tqdm(executor.map(process_with_args, seeds),
256 | desc='Seeds', total=len(seeds), smoothing=0, bar_format=bar_format, ncols=70))
257 |
258 | print('xTB-NEB completed!')
259 |
260 | def print_args(args):
261 | print()
262 | print("Arguments provided:")
263 | arg_dict = vars(args)
264 | for key, value in arg_dict.items():
265 | print(f" {key}: {value}")
266 | print()
267 |
268 |
269 | def get_parser():
270 | parser = argparse.ArgumentParser(description="Run NEB calculations on filtered gsm jobs")
271 |
272 | parser.add_argument('-i', '--input_path', type=str, required=True,
273 | help='Path of input directory containing filtered gsm jobs.')
274 | parser.add_argument('-o', '--output_path', type=str, required=True,
275 | help='Path of output directory to store results.')
276 | parser.add_argument('-n', '--max_workers', type=int, default=1,
277 | help='Number of processes to use for parallel execution.')
278 | parser.add_argument('--n_images', type=int, default=10,
279 | help='Number of images for NEB.')
280 | parser.add_argument('--neb_fmax', type=float, default=0.5,
281 | help='Fmax threshold for NEB.')
282 | parser.add_argument('--cineb_fmax', type=float, default=0.05,
283 | help='Fmax threshold for CI-NEB.')
284 | parser.add_argument('--steps', type=int, default=500,
285 | help='Maximum number of optimization steps.')
286 |
287 | return parser
288 |
289 |
290 |
291 | if __name__ == "__main__":
292 | args = get_parser().parse_args()
293 | main(args)
294 |
--------------------------------------------------------------------------------
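
A minimal sketch of the observer pattern run_neb.py relies on (assuming xtb-python is installed): ASE optimizers call every attached function after each step, which is how fmaxs, fit_list, and the neb.db snapshots are collected during the NEB run above:

```python
from ase.build import molecule
from ase.optimize import BFGS
from xtb.ase.calculator import XTB

atoms = molecule('H2O')
atoms.calc = XTB(method='GFN2-xTB')

energies = []  # filled by the observer after every optimizer step
opt = BFGS(atoms, logfile=None)
opt.attach(lambda: energies.append(atoms.get_potential_energy()))
opt.run(fmax=0.05)
print(f'{len(energies)} steps recorded')
```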
/dandelion/prep/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/prep/__init__.py
--------------------------------------------------------------------------------
/dandelion/prep/geom_opt.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import shutil
4 | import argparse
5 | import warnings
6 |
7 | from ase.io import read
8 | from ase.optimize import BFGS
9 | from xtb.ase.calculator import XTB
10 | from tqdm import tqdm
11 | from concurrent.futures import ProcessPoolExecutor, as_completed
12 |
13 | def write_xyz(filename, atoms):
14 | with open(filename, 'w') as f:
15 | f.write(f"{len(atoms)}\n\n")
16 | for atom in atoms:
17 | f.write(f"{atom.symbol:<2} {atom.position[0]:15.8f} {atom.position[1]:15.8f} {atom.position[2]:15.8f}\n")
18 |
19 | def generate_eq_struc(atoms):
20 | atoms.calc = XTB(method="GFN2-xTB")
21 | with warnings.catch_warnings():
22 | warnings.simplefilter("ignore")
23 | opt = BFGS(atoms, logfile=None)
24 | opt.run(fmax=1e-4)
25 | return atoms
26 |
27 | def process_file(input_file, output_dir):
28 | filename = os.path.basename(input_file)
29 | mol_dir = os.path.join(output_dir, os.path.splitext(filename)[0])
30 | os.makedirs(mol_dir, exist_ok=True)
31 |
32 | # Copy original file
33 | shutil.copy(input_file, mol_dir)
34 |
35 | # Generate and save optimized structure
36 | atoms = read(input_file)
37 | optimized_atoms = generate_eq_struc(atoms)
38 | write_xyz(os.path.join(mol_dir, 'struc.xyz'), optimized_atoms)
39 |
40 | # Remove the original copied file
41 | os.remove(os.path.join(mol_dir, filename))
42 |
43 | def main(args):
44 | print_args(args)
45 |
46 | input_path = os.path.abspath(args.input_path)
47 | if not os.path.isdir(input_path):
48 | sys.exit(f"Error: '{input_path}' is not a directory.")
49 | output_path = os.path.abspath(args.output_path)
50 | max_workers = args.max_workers
51 |
52 | # Get list of all .xyz files
53 | xyz_files = []
54 | for root, _, files in os.walk(input_path):
55 | xyz_files.extend([os.path.join(root, f) for f in files if f.endswith('.xyz')])
56 |
57 | # Process files in parallel with progress bar
58 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
59 | with ProcessPoolExecutor(max_workers=max_workers) as executor:
60 | list(tqdm(executor.map(process_file, xyz_files, [output_path]*len(xyz_files)),
61 | total=len(xyz_files), desc="Optimizing structures", smoothing=0, bar_format=bar_format, ncols=70))
62 |
63 | def print_args(args):
64 | print("\nArguments provided:")
65 | for key, value in vars(args).items():
66 | print(f" {key}: {value}")
67 | print()
68 |
69 | def get_parser():
70 | parser = argparse.ArgumentParser(description="Optimize geometries using xTB")
71 | parser.add_argument('-i', '--input_path', required=True,
72 | help="Path of the input reactants directory")
73 | parser.add_argument('-o', '--output_path', required=True,
74 | help='Path of output directory to store optimized geometries')
75 | parser.add_argument('-n', '--max_workers', type=int, default=1,
76 | help='Number of processes to use for parallel execution.')
77 | return parser
78 |
79 | if __name__ == "__main__":
80 | args = get_parser().parse_args()
81 | main(args)
--------------------------------------------------------------------------------
/dandelion/prep/smiles_to_isoconfs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import subprocess
5 |
6 | from rdkit import Chem
7 | from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
8 |
9 | def obabel_command(input_data, input_format, output_str, options=[], output_path=None):
10 | cmd = ['obabel', '-i', input_format] + input_data + ['-O', output_str] + options
11 | full_output_path = os.path.join(output_path, output_str) if output_path else output_str
12 | subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=output_path)
13 | return full_output_path
14 |
15 | def obabel_from_smiles(smiles_str, output_str, options=[], output_path=None):
16 | cmd = ['obabel', '-ismi', '-', '-O', output_str] + options
17 | full_output_path = os.path.join(output_path, output_str) if output_path else output_str
18 | process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=output_path)
19 | process.communicate(input=smiles_str.encode())
20 | return full_output_path
21 |
22 | def cleanup_files(output_path, files_to_remove):
23 | for file in files_to_remove:
24 | file_path = os.path.join(output_path, file)
25 | if os.path.exists(file_path):
26 | os.remove(file_path)
27 |
28 | def main(args):
29 | print_args(args)
30 |
31 | input_path = os.path.abspath(args.input_path)
32 | if not os.path.isfile(input_path):
33 | sys.exit(f"Error: '{input_path}' is not a file.")
34 | output_path = os.path.abspath(args.output_path)
35 |
36 | if not os.path.exists(output_path):
37 | os.makedirs(output_path)
38 |
39 | with open(input_path, 'r') as f:
40 | lines = f.readlines()
41 | lines = list(map(lambda s: s.strip(), lines))
42 |
43 | for m, mol_smi in enumerate(lines):
44 |         print(f'==={m+1}th molecule : {mol_smi} ')
45 | mol = Chem.MolFromSmiles(mol_smi)
46 | opts = StereoEnumerationOptions(tryEmbedding=True, unique=True)
47 | isomers = tuple(EnumerateStereoisomers(mol, options=opts))
48 | for i, isomer_smi in enumerate(Chem.MolToSmiles(x, isomericSmiles=True) for x in isomers):
49 | print(f'-{i+1}th isomer : {isomer_smi}')
50 |
51 | gen3d_file = obabel_from_smiles(isomer_smi, 'gen3d.xyz', ['--gen3d'], output_path=output_path)
52 | confab_file = obabel_command([os.path.basename(gen3d_file)], 'xyz', 'confab.sdf', ['--confab', '--rcutoff', '1.0'], output_path=output_path)
53 | obabel_command([os.path.basename(confab_file)], 'sdf', f'm{m+1}-i{i+1}-c.xyz', ['-m'], output_path=output_path)
54 |
55 | cleanup_files(output_path, ['confab.sdf', 'gen3d.xyz'])
56 |
57 |
58 | def print_args(args):
59 | print()
60 | print("Arguments provided:")
61 | arg_dict = vars(args)
62 | for key, value in arg_dict.items():
63 | print(f" {key}: {value}")
64 | print()
65 |
66 | def get_parser():
67 | parser = argparse.ArgumentParser(description="Generate Iso/Conformers from SMILES using RDkit and Obabel")
68 |
69 | parser.add_argument('-i', '--input_path', required=True,
70 | help="Path of the input SMILES string file")
71 | parser.add_argument('-o', '--output_path', type=str, required=True,
72 | help='Path of output directory to store Iso/Conformers.')
73 | return parser
74 |
75 | if __name__ == "__main__":
76 | args = get_parser().parse_args()
77 | main(args)
--------------------------------------------------------------------------------
/dandelion/refine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/refine/__init__.py
--------------------------------------------------------------------------------
/dandelion/refine/compile_refined.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 |
5 | import h5py
6 | from ase.db import connect
7 |
8 |
9 | def main(args):
10 |
11 | print_args(args)
12 |
13 | input_path = args.input_path
14 | if not os.path.isfile(input_path):
15 | sys.exit(f"Error: '{input_path}' is not a file.")
16 | output_path = args.output_path
17 |
18 | # Data structure to hold the computed results
19 | rxn_data = {}
20 |
21 | rows = [] # List to store all rows
22 |
23 | # Extract data from ASE database
24 | with connect(input_path) as db:
25 | for row in db.select():
26 | if hasattr(row, 'energy') and hasattr(row, 'forces'):
27 | rows.append(row)
28 |
29 | # Sort rows based on the unique_id number
30 | rows.sort(key=lambda r: int(r.data['unique_id'].split('_')[-1]))
31 |
32 | # Process sorted rows
33 | for row in rows:
34 | # Extract unique_id and other data
35 | unique_id = row.data['unique_id']
36 | chem_group_name, rxn_group_name, index = unique_id.split('_')
37 |
38 | if chem_group_name not in rxn_data:
39 | rxn_data[chem_group_name] = {}
40 |
41 | if rxn_group_name not in rxn_data[chem_group_name]:
42 | rxn_data[chem_group_name][rxn_group_name] = {
43 | 'atomic_numbers': row.toatoms().numbers,
44 | 'energies': [],
45 | 'forces': [],
46 | 'positions': []
47 | }
48 | rxn_data[chem_group_name][rxn_group_name]['energies'].append(row.energy)
49 | rxn_data[chem_group_name][rxn_group_name]['forces'].append(row.forces)
50 | rxn_data[chem_group_name][rxn_group_name]['positions'].append(row.toatoms().positions)
51 |
52 | # Save the data to an h5 file
53 | with h5py.File(output_path, 'w') as h5file:
54 | # Ensure the 'data' group exists
55 | if 'data' not in h5file:
56 | data_group = h5file.create_group('data')
57 | else:
58 | data_group = h5file['data']
59 |
60 | # Iterate through the rxn_data dictionary to save datasets
61 | for chem_group_name in rxn_data:
62 | if chem_group_name not in data_group:
63 | chem_group = data_group.create_group(chem_group_name)
64 | else:
65 | chem_group = data_group[chem_group_name]
66 |
67 | for rxn_group_name, rxn_entry in rxn_data[chem_group_name].items():
68 | if rxn_group_name not in chem_group:
69 | rxn_group = chem_group.create_group(rxn_group_name)
70 | else:
71 | rxn_group = chem_group[rxn_group_name]
72 |
73 | # Add datasets to the reaction group
74 | rxn_group.create_dataset('atomic_numbers', data=rxn_entry['atomic_numbers'])
75 | rxn_group.create_dataset('wB97x_6-31G(d).energy', data=rxn_entry['energies'])
76 | rxn_group.create_dataset('wB97x_6-31G(d).forces', data=rxn_entry['forces'])
77 | rxn_group.create_dataset('positions', data=rxn_entry['positions'])
78 |
79 | print('Compiled successfully!')
80 |
81 | def print_args(args):
82 | print()
83 | print("Arguments provided:")
84 | arg_dict = vars(args)
85 | for key, value in arg_dict.items():
86 | print(f" {key}: {value}")
87 | print()
88 |
89 | def get_parser():
90 | parser = argparse.ArgumentParser(description="Translate ase db file into hdf5 file.")
91 |
92 | parser.add_argument('-i', '--input_path', required=True,
93 | help="Path of the input wB97X ASE db file")
94 | parser.add_argument('-o', '--output_path', required=True,
95 | help="Path of the output wB97X hdf5 file")
96 |
97 | return parser
98 |
99 | if __name__ == "__main__":
100 | args = get_parser().parse_args()
101 | main(args)
102 |
103 |
104 |
--------------------------------------------------------------------------------
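
compile_refined.py assumes each db row carries a unique_id of the form '<formula>_<rxn>_<index>'; the grouping and sorting above then reduce to plain string operations. A toy illustration (the id value here is hypothetical):

```python
unique_id = 'C2H6O_rxn0003_0042'        # hypothetical example id
chem_group_name, rxn_group_name, index = unique_id.split('_')
print(chem_group_name, rxn_group_name)  # C2H6O rxn0003
print(int(index))                       # 42, the sort key used above
```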
/dandelion/refine/refine_forces.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import logging
4 | import argparse
5 | from itertools import repeat
6 | from concurrent.futures import ProcessPoolExecutor, as_completed
7 |
8 | import h5py
9 | from tqdm import tqdm
10 | from ase import Atoms
11 | from ase.db import connect
12 | from ase.calculators.orca import ORCA
13 |
14 |
15 | # Thanks to https://github.com/ZimmermanGroup/ORCA-Basis-Sets
16 | custom_basis = '''
17 | %basis
18 | newgto Br
19 | S 6
20 | 1 0.1137182000E+06 0.1717696000E-02
21 | 2 0.1707444000E+05 0.1316744000E-01
22 | 3 0.3889576000E+04 0.6504553000E-01
23 | 4 0.1097096000E+04 0.2269505000E+00
24 | 5 0.3520624000E+03 0.4768357000E+00
25 | 6 0.1207002000E+03 0.3583677000E+00
26 | S 6
27 | 1 0.2471138000E+04 0.2243687000E-02
28 | 2 0.5893838000E+03 0.2994853000E-01
29 | 3 0.1918738000E+03 0.1256009000E+00
30 | 4 0.7295339000E+02 -0.9832786000E-03
31 | 5 0.3005839000E+02 -0.6013141000E+00
32 | 6 0.1252927000E+02 -0.4913983000E+00
33 | P 6
34 | 1 0.2471138000E+04 0.3790182000E-02
35 | 2 0.5893838000E+03 0.2995979000E-01
36 | 3 0.1918738000E+03 0.1318228000E+00
37 | 4 0.7295339000E+02 0.3432708000E+00
38 | 5 0.3005839000E+02 0.4642345000E+00
39 | 6 0.1252927000E+02 0.2079387000E+00
40 | S 6
41 | 1 0.1096411000E+03 -0.5975683000E-02
42 | 2 0.3858948000E+02 0.5542122000E-01
43 | 3 0.1637818000E+02 0.2681200000E+00
44 | 4 0.7221836000E+01 -0.1543606000E+00
45 | 5 0.3263697000E+01 -0.7206306000E+00
46 | 6 0.1465499000E+01 -0.3316437000E+00
47 | P 6
48 | 1 0.1096411000E+03 -0.6907483000E-02
49 | 2 0.3858948000E+02 -0.3041432000E-01
50 | 3 0.1637818000E+02 0.4602725000E-01
51 | 4 0.7221836000E+01 0.3650689000E+00
52 | 5 0.3263697000E+01 0.4949232000E+00
53 | 6 0.1465499000E+01 0.2090394000E+00
54 | S 3
55 | 1 0.2103651000E+01 0.3029029000E+00
56 | 2 0.7547050000E+00 -0.2152659000E+00
57 | 3 0.3005140000E+00 -0.9633941000E+00
58 | P 3
59 | 1 0.2103651000E+01 -0.2826714000E-01
60 | 2 0.7547050000E+00 0.3503065000E+00
61 | 3 0.3005140000E+00 0.7182446000E+00
62 | S 1
63 | 1 0.1090710000E+00 0.1000000000E+01
64 | P 1
65 | 1 0.1090710000E+00 0.1000000000E+01
66 | D 3
67 | 1 0.6225514000E+02 0.7704229000E-01
68 | 2 0.1731284000E+02 0.3707384000E+00
69 | 3 0.5607915000E+01 0.7097628000E+00
70 | D 1
71 | 1 0.1746486000E+01 1.0000000
72 | end
73 | end
74 | '''
75 |
76 |
77 |
78 | class tqdm_hour(tqdm):
79 | """Provides an `hours per iteration` format parameter."""
80 | @property
81 | def format_dict(self):
82 | d = super(tqdm_hour, self).format_dict
83 | rate_hr = '{:.1f}'.format(1/d["rate"] / 3600) if d["rate"] else '?'
84 | d.update(rate_hr=(rate_hr + ' hour/' + d['unit']))
85 | return d
86 |
87 | class tqdm_minute(tqdm):
88 | """Provides a `minutes per iteration` format parameter"""
89 | @property
90 | def format_dict(self):
91 | d = super(tqdm_minute, self).format_dict
92 | rate_min = '{:.0f}'.format(1/d["rate"] / 60) if d["rate"] else '?'
93 | d.update(rate_min=(rate_min + ' min/' + d['unit']))
94 | return d
95 |
96 | bar_format_hr = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_hr}{postfix}]'
97 | bar_format_min = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_min}{postfix}]'
98 | bar_format_points = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
99 |
100 | def get_unique_ids_from_db(output_path):
101 | """Extract all unique IDs from the ASE database."""
102 | unique_ids = set()
103 | with connect(output_path) as db:
104 | for row in db.select():
105 | data = row.data
106 | if "unique_id" in data:
107 | unique_ids.add(data['unique_id'])
108 | return unique_ids
109 |
110 | def already_calculated(unique_id, unique_id_list):
111 | """Check if a unique ID has already been processed."""
112 | return unique_id in unique_id_list
113 |
114 | def compute_force(coord, atomic_numbers, unique_id, output_path):
115 | """Compute forces using ORCA for a given set of coordinates."""
116 | atoms = Atoms(positions=coord, numbers=atomic_numbers)
117 | atoms.calc = ORCA(
118 | label=os.path.join(os.path.dirname(output_path), f"orca/{unique_id}/{unique_id}"),
119 | orcasimpleinput="wB97X 6-31G(d) NoTrah",
120 | orcablocks=custom_basis
121 | )
122 | try:
123 | # Forces and energy will be stored in the calculator of the Atoms object.
124 | atoms.get_forces()
125 | return atoms
126 | except Exception as e:
127 | # Log the error
128 | logging.error(f"Error in computing forces for unique_id {unique_id}: {e}")
129 | return None
130 |
131 | def accumulate_files_for_deletion(unique_id, output_path, files_to_delete, file_exts=('gbw', 'engrad', 'densities', 'ase')):
132 | for ext in file_exts:
133 | file_path = os.path.join(os.path.dirname(output_path), f"orca/{unique_id}/{unique_id}.{ext}")
134 | if os.path.exists(file_path):
135 | files_to_delete.add(file_path)
136 |
137 | def main(args):
138 | """Main function to orchestrate the computations and database writing."""
139 | print_args(args)
140 |
141 | input_path = args.input_path
142 | if not os.path.isfile(input_path):
143 | sys.exit(f"Error: '{input_path}' is not a file.")
144 | output_path = args.output_path
145 | max_workers = args.max_workers
146 | orcabinary = args.orca
147 |
148 | os.environ["ASE_ORCA_COMMAND"] = f"{orcabinary} PREFIX.inp > PREFIX.out 2>&1"
149 |
150 | os.makedirs(os.path.dirname(output_path), exist_ok=True)
151 |
152 | log_file_path = os.path.join(os.path.dirname(output_path), 'orca_errors.log')
153 | logging.basicConfig(filename=log_file_path, level=logging.ERROR,
154 | format='%(asctime)s %(levelname)s: %(message)s',
155 | datefmt='%Y-%m-%d %H:%M:%S')
156 |
157 | if os.path.isfile(output_path):
158 | print(f'Restarting calculation from {output_path}')
159 | is_restart = True
160 | else:
161 | print(f'Created db file at {output_path}\n')
162 | is_restart = False
163 |
164 |
165 | unique_ids_from_db = get_unique_ids_from_db(output_path)
166 | if is_restart:
167 | print(f'{len(unique_ids_from_db)} points are skipped.\n')
168 |
169 | files_to_delete = set() # Set to accumulate files for deletion
170 |
171 | # Read from the input HDF5 file and compute the energies and forces.
172 | with h5py.File(input_path, 'r') as f:
173 |
174 | for chem_group_name, chem_group in tqdm_hour(f['data'].items(),
175 | desc="Formulas",
176 | position=0,
177 | smoothing=1,
178 | bar_format=bar_format_hr,
179 | ncols=70):
180 |
181 | for rxn_group_name, rxn_group in tqdm_minute(chem_group.items(),
182 | desc=f"Rxns in {chem_group_name}",
183 | leave=False,
184 | position=1,
185 | smoothing=1,
186 | bar_format=bar_format_min,
187 | ncols=70):
188 |
189 | positions_dataset = rxn_group['positions']
190 | coords = [coord for coord in positions_dataset]
191 | atomic_numbers = rxn_group['atomic_numbers'][:]
192 | args_atomic_numbers = repeat(atomic_numbers, len(coords))
193 | unique_ids = [f"{chem_group_name}_{rxn_group_name}_{index}" for index, _ in enumerate(positions_dataset)]
194 |
195 | # Parallel computation using ProcessPoolExecutor.
196 | with ProcessPoolExecutor(max_workers=max_workers) as executor:
197 | future_to_unique_id = {executor.submit(compute_force, coord, atomic_number, unique_id, output_path): unique_id
198 | for coord, atomic_number, unique_id in zip(coords, args_atomic_numbers, unique_ids)
199 | if not already_calculated(unique_id, unique_ids_from_db)}
200 |
201 | batch_size = max_workers # Batch size set to the number of workers
202 | results_batch = []
203 |
204 | # Process the completed tasks.
205 | for future in tqdm(as_completed(future_to_unique_id),
206 | total=len(future_to_unique_id),
207 | desc=f"Samples in {rxn_group_name}",
208 | leave=False,
209 | position=2,
210 | smoothing=0,
211 | bar_format=bar_format_points,
212 | ncols=70):
213 |
214 | unique_id = future_to_unique_id[future]
215 | atoms_result = future.result() # Finished ASE Atoms object
216 | if atoms_result is not None:
217 | results_batch.append((atoms_result, {'unique_id': unique_id}))
218 | accumulate_files_for_deletion(unique_id, output_path, files_to_delete)
219 |
220 | # Write to database in batches
221 | if len(results_batch) >= batch_size:
222 | with connect(output_path) as db:
223 | for atoms, data in results_batch:
224 | db.write(atoms, data=data)
225 | results_batch.clear()
226 |
227 | # Write any remaining results using a single database connection
228 | with connect(output_path) as db:
229 | for atoms, data in results_batch:
230 | db.write(atoms, data=data)
231 | results_batch.clear()
232 |
233 | for file_path in files_to_delete:
234 | os.remove(file_path)
235 |
236 | print('wB97X calculation finished!')
237 |
238 | def print_args(args):
239 | print()
240 | print("Arguments provided:")
241 | arg_dict = vars(args)
242 | for key, value in arg_dict.items():
243 | print(f" {key}: {value}")
244 | print()
245 |
246 | def get_parser():
247 | parser = argparse.ArgumentParser(description="Compute energies and forces and store in ASE database")
248 |
249 | parser.add_argument('-i', '--input_path', required=True,
250 | help="Path of the input XTB HDF5 file")
251 | parser.add_argument('-o', '--output_path', required=True,
252 | help="Path of the output wB97X ASE database")
253 | parser.add_argument('-n', '--max_workers', type=int, default=1,
254 | help="Number of worker processes")
255 | parser.add_argument('--orca', required=True,
256 | help="Path of the orca binary file")
257 |
258 | return parser
259 |
260 | if __name__ == "__main__":
261 | args = get_parser().parse_args()
262 | main(args)
263 |
264 |
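For reference, a typical invocation might look like the following (paths and worker count are illustrative); thanks to the unique_id bookkeeping above, rerunning with an existing output db resumes the calculation and skips finished points:

python refine_forces.py -i xtb.h5 -o wb97x/wb97x.db -n 8 --orca /path/to/orca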
--------------------------------------------------------------------------------
/dandelion/segsm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/segsm/__init__.py
--------------------------------------------------------------------------------
/dandelion/segsm/ard_gsm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/segsm/ard_gsm/__init__.py
--------------------------------------------------------------------------------
/dandelion/segsm/ard_gsm/driving_coords.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding:utf-8 -*-
3 |
4 | import itertools
5 |
6 | from .mol import Connection
7 | from .limits import connection_limits
8 |
9 |
10 | class ConnectionError(Exception):
11 | """
12 | For any invalid connection changes that occur in MolGraph.
13 | """
14 | pass
15 |
16 |
17 | class DrivingCoords(object):
18 | def __init__(self, break_idxs=None, form_idxs=None):
19 | self._break_idxs = break_idxs or set()
20 | self._form_idxs = form_idxs or set()
21 |
22 | self.remove_duplicates()
23 |
24 | def __str__(self):
25 | s = ''
26 | for idxs in self._break_idxs:
27 | s += 'BREAK {0[0]} {0[1]}\n'.format(idxs)
28 | for idxs in self._form_idxs:
29 | s += 'ADD {0[0]} {0[1]}\n'.format(idxs)
30 | return s
31 |
32 | def __eq__(self, other):
33 | return str(self) == str(other)
34 |
35 | def __ne__(self, other):
36 | return not self == other
37 |
38 | def __hash__(self):
39 | return hash(str(self))
40 |
41 | def reconstruct_from_str(self, s):
42 | self._break_idxs = set()
43 | self._form_idxs = set()
44 | for line in s.splitlines():
45 | if 'BREAK' in line:
46 | idxs = [int(idx) for idx in line.split()[1:]]
47 | self.add_break_idxs(idxs)
48 | elif 'ADD' in line:
49 | idxs = [int(idx) for idx in line.split()[1:]]
50 | self.add_form_idxs(idxs)
51 |
52 | def remove_duplicates(self):
53 | self._break_idxs = {tuple(sorted(idxs)) for idxs in self._break_idxs}
54 | self._form_idxs = {tuple(sorted(idxs)) for idxs in self._form_idxs}
55 |
56 | def add_break_idxs(self, idxs):
57 | self._break_idxs.add(tuple(sorted(idxs)))
58 |
59 | def add_form_idxs(self, idxs):
60 | self._form_idxs.add(tuple(sorted(idxs)))
61 |
62 | def is_subset(self, other):
63 | """
64 | Return True if self is contained in other.
65 | """
66 | for idxs in self._break_idxs:
67 | if idxs not in other._break_idxs:
68 | return False
69 | for idxs in self._form_idxs:
70 | if idxs not in other._form_idxs:
71 | return False
72 | return True
73 |
74 | def get_connections(self, atoms):
75 | atoms_dict = {}
76 | for atom in atoms:
77 | if atom.idx is None:
78 | raise Exception('Atom {} is missing index'.format(atom.symbol))
79 | else:
80 | atoms_dict[atom.idx] = atom
81 |
82 | connections_break, connections_form = [], []
83 | for idxs in self._break_idxs:
84 | connection = Connection(atoms_dict[idxs[0]], atoms_dict[idxs[1]])
85 | connections_break.append(connection)
86 | for idxs in self._form_idxs:
87 | connection = Connection(atoms_dict[idxs[0]], atoms_dict[idxs[1]])
88 | connections_form.append(connection)
89 |
90 | return connections_break, connections_form
91 |
92 |
93 | def generate_driving_coords(mol, maxbreak=3, maxform=3, maxchange=5, single_change=True, equiv_Hs=False,
94 | minbreak=0, minform=0, minchange=1):
95 | """
96 | Generate the set of possible driving coordinates given a molecule. Only
97 | consider breaking a maximum of `maxbreak`, forming a maximum of `maxform`,
98 | and in total changing a maximum of `maxchange` connections (molecular
99 | bonds are considered without regard for the bond order). If `single_change`
100 | is true, consider driving coordinates for (nbreak,nform) in ((0,1),(1,0))
101 | in addition to the other ones. If `equiv_Hs` is true, generate essentially
102 | equivalent driving coordinates for different but equivalent hydrogens,
103 | i.e., those attached to the same non-cyclic tetrahedral carbon.
104 |
105 | Can also specify minbreak, minform, and minchange.
106 | """
107 | assert all(atom.idx is not None for atom in mol.atoms)
108 | driving_coords_set = set()
109 |
110 | mol = mol.copy(deep=True)
111 | if not equiv_Hs:
112 | mol.label_equivalent_hydrogens()
113 |
114 | # Enumerate all possible connections between atoms
115 | # and remove the ones for atoms that are already connected
116 | atoms = mol.atoms
117 | connections = mol.get_all_connections()
118 | all_possible_connections = [Connection(atom1, atom2)
119 | for i, atom1 in enumerate(atoms)
120 | for atom2 in atoms[(i+1):]
121 | if not atom1.frozen and not atom2.frozen]
122 | all_potential_new_connections = [connection for connection in all_possible_connections
123 | if connection not in connections]
124 |
125 | for nbreak in range(minbreak, maxbreak+1):
126 | for nform in range(minform, maxform+1):
127 | if nbreak + nform < minchange:
128 | continue
129 | elif nbreak + nform > maxchange:
130 | continue
131 | elif not single_change and (nbreak + nform == 1):
132 | continue
133 |
134 | # Generate all possible combinations of connection changes
135 | potential_remove_connections_iter = itertools.combinations(connections, nbreak)
136 | potential_new_connections_iter = itertools.combinations(all_potential_new_connections, nform)
137 | potential_connection_changes = itertools.product(potential_remove_connections_iter,
138 | potential_new_connections_iter)
139 |
140 | for connections_to_break, connections_to_form in potential_connection_changes:
141 | try:
142 | change_connections(mol, connections_to_break, connections_to_form)
143 | except ConnectionError:
144 | continue
145 | else:
146 | break_idxs = [(c.atom1.idx, c.atom2.idx) for c in connections_to_break]
147 | form_idxs = [(c.atom1.idx, c.atom2.idx) for c in connections_to_form]
148 | driving_coords = DrivingCoords(break_idxs=break_idxs, form_idxs=form_idxs)
149 | driving_coords_set.add(driving_coords)
150 | finally:
151 | # Always restore connections for next molecule test
152 | change_connections(mol, connections_to_form, connections_to_break, test_validity=False)
153 |
154 | return driving_coords_set
155 |
156 |
157 | def change_connections(mol, connections_to_break, connections_to_form, test_validity=True):
158 | for connection in connections_to_break:
159 | mol.remove_connection(connection)
160 | for connection in connections_to_form:
161 | mol.add_connection(connection)
162 |
163 | if test_validity:
164 | # Only have to test the atoms involved in the changed connections
165 | for connection in connections_to_break:
166 | if not test_connection_validity(connection):
167 | raise ConnectionError('Breaking {} resulted in violation of connection limits'.format(connection))
168 | for connection in connections_to_form:
169 | if not test_connection_validity(connection):
170 | raise ConnectionError('Forming {} resulted in violation of connection limits'.format(connection))
171 |
172 |
173 | def test_connection_validity(connection):
174 | atom1 = connection.atom1
175 | atom2 = connection.atom2
176 | atom1_ll, atom1_ul = connection_limits[atom1.symbol.upper()]
177 | atom2_ll, atom2_ul = connection_limits[atom2.symbol.upper()]
178 | if len(atom1.connections) < atom1_ll or len(atom1.connections) > atom1_ul:
179 | return False
180 | elif len(atom2.connections) < atom2_ll or len(atom2.connections) > atom2_ul:
181 | return False
182 | else:
183 | return True
184 |
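A minimal sketch of how this module is driven (mirroring create_gsm.py further below; the input file name is hypothetical):

from ase.io import read
from dandelion.segsm.ard_gsm.mol import MolGraph
from dandelion.segsm.ard_gsm.driving_coords import generate_driving_coords

xyz = read('mother.xyz')  # hypothetical mother structure
mol = MolGraph(symbols=xyz.get_chemical_symbols(), coords=xyz.get_positions())
mol.infer_connections()   # build the molecular graph from the 3D geometry
seeds = generate_driving_coords(mol, maxbreak=2, maxform=2, maxchange=3)
for seed in seeds:
    print(seed)           # BREAK/ADD lines, i.e. the ISOMERS.txt format for pyGSM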
--------------------------------------------------------------------------------
/dandelion/segsm/ard_gsm/limits.py:
--------------------------------------------------------------------------------
1 | # First number is the minimum number of connections;
2 | # second number is the maximum number of connections.
3 | connection_limits = {
4 | 'H': (1, 1),
5 | 'C': (2, 4),
6 | 'N': (1, 3),
7 | 'O': (1, 2),
8 | 'F': (1, 1),
9 | 'S': (1, 4),
10 | 'CL':(1, 1),
11 | 'BR':(1, 1),
12 | 'LI':(0, 1)
13 | }
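These bounds are what test_connection_validity in driving_coords.py checks against. A minimal sketch of that check (the element and connection count are hypothetical):

from dandelion.segsm.ard_gsm.limits import connection_limits

lo, hi = connection_limits['C']    # carbon: 2 to 4 connections allowed
n_connections = 5                  # hypothetical connection count
print(lo <= n_connections <= hi)   # False: a pentavalent carbon is rejected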
--------------------------------------------------------------------------------
/dandelion/segsm/ard_gsm/mol.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding:utf-8 -*-
3 |
4 | import os
5 |
6 | import numpy as np
7 | from openbabel import pybel
8 | from rdkit import Chem
9 | from rdkit.Chem import AllChem, GetPeriodicTable
10 | import networkx as nx
11 | _rdkit_periodic_table = GetPeriodicTable()
12 |
13 |
14 | class SanitizationError(Exception):
15 | """
16 | Exception class to handle errors during SMILES perception.
17 | """
18 | pass
19 |
20 |
21 | class Atom(object):
22 | """
23 | Represents an atom in a molecular graph.
24 | """
25 |
26 | def __init__(self, symbol=None, idx=None, coords=np.array([]), frozen=False):
27 | self.symbol = symbol
28 | self.idx = idx
29 | self.coords = coords
30 | self.frozen = frozen
31 | self.connections = {}
32 |
33 | def __str__(self):
34 | return '{}: {}'.format(self.idx, self.symbol)
35 |
36 | def __repr__(self):
37 | return '<Atom {}>'.format(str(self))
38 |
39 | def copy(self):
40 | return Atom(
41 | symbol=self.symbol,
42 | idx=self.idx,
43 | coords=self.coords.copy(),
44 | frozen=self.frozen,
45 | )
46 |
47 | def get_atomicnum(self):
48 | return _rdkit_periodic_table.GetAtomicNumber(self.symbol)
49 |
50 | def get_cov_rad(self):
51 | return _rdkit_periodic_table.GetRcovalent(self.symbol)
52 |
53 |
54 | class Connection(object):
55 | """
56 | Represents a connection in a molecular graph.
57 |
58 | Note: Equality and hash are only based on atom symbols and indices.
59 | """
60 |
61 | def __init__(self, atom1, atom2):
62 | self._atom1 = atom1
63 | self._atom2 = atom2
64 | self._make_order_invariant()
65 |
66 | def __str__(self):
67 | return '({})--({})'.format(str(self.atom1), str(self.atom2))
68 |
69 | def __repr__(self):
70 | return '<Connection {}>'.format(str(self))
71 |
72 | def __eq__(self, other):
73 | return str(self) == str(other)
74 |
75 | def __ne__(self, other):
76 | return not self == other
77 |
78 | def __hash__(self):
79 | return hash(str(self))
80 |
81 | def _make_order_invariant(self):
82 | # Ensure that atom ordering is consistent
83 | atoms = [self._atom1, self._atom2]
84 | atoms.sort(key=lambda a: a.symbol)
85 | if self._atom1.idx is not None and self._atom2.idx is not None:
86 | atoms.sort(key=lambda a: a.idx)
87 | self._atom1, self._atom2 = atoms
88 |
89 | @property
90 | def atom1(self):
91 | return self._atom1
92 |
93 | @property
94 | def atom2(self):
95 | return self._atom2
96 |
97 | @atom1.setter
98 | def atom1(self, val):
99 | self._atom1 = val
100 | self._make_order_invariant()
101 |
102 | @atom2.setter
103 | def atom2(self, val):
104 | self._atom2 = val
105 | self._make_order_invariant()
106 |
107 | def copy(self):
108 | return Connection(self.atom1, self.atom2)
109 |
110 |
111 | class MolGraph(object):
112 | """
113 | Class to convert coordinates to a molecular graph
114 | and to generate driving coordinates.
115 |
116 | Note: Atom indices start at 1.
117 | """
118 |
119 | def __init__(self, atoms=None, symbols=None, coords=None, energy=None):
120 | self.atoms = atoms or []
121 | self.energy = energy
122 |
123 | if not self.atoms and symbols is not None:
124 | for idx, symbol in enumerate(symbols):
125 | atom = Atom(symbol=symbol, idx=idx+1)
126 | self.add_atom(atom)
127 |
128 | if coords is not None:
129 | self.set_coords(coords)
130 |
131 | def __iter__(self):
132 | for atom in self.atoms:
133 | yield atom
134 |
135 | def get_formula(self):
136 | """
137 | Return the molecular formula corresponding to the graph.
138 | """
139 | # Count the numbers of each element
140 | elements = {}
141 | for atom in self:
142 | symbol = atom.symbol
143 | elements[symbol] = elements.get(symbol, 0) + 1
144 |
145 | # Carbon and hydrogen come first if carbon is present, other
146 | # atoms come in alphabetical order (also hydrogen if there is no
147 | # carbon)
148 | formula = ''
149 | if 'C' in elements.keys():
150 | count = elements['C']
151 | formula += 'C{:d}'.format(count) if count > 1 else 'C'
152 | del elements['C']
153 | if 'H' in elements.keys():
154 | count = elements['H']
155 | formula += 'H{:d}'.format(count) if count > 1 else 'H'
156 | del elements['H']
157 | # dict views cannot be sorted in place in Python 3
158 | keys = sorted(elements.keys())
159 | for key in keys:
160 | count = elements[key]
161 | formula += '{}{:d}'.format(key, count) if count > 1 else key
162 |
163 | return formula
164 |
165 | def to_rdkit_mol(self):
166 | """
167 | Convert the graph to an RDKit molecule with atom map numbers set
168 | by the indices of the atoms.
169 | """
170 | assert all(atom.idx is not None for atom in self)
171 |
172 | rd_mol = Chem.rdchem.EditableMol(Chem.rdchem.Mol())
173 | for atom in self:
174 | rd_atom = Chem.rdchem.Atom(atom.symbol)
175 | rd_atom.SetAtomMapNum(atom.idx)
176 | rd_mol.AddAtom(rd_atom)
177 |
178 | for atom1 in self:
179 | for atom2, connection in atom1.connections.items():
180 | idx1 = self.atoms.index(atom1) # This is the index in the atoms list
181 | idx2 = self.atoms.index(atom2)
182 | if idx1 < idx2:
183 | rd_mol.AddBond(idx1, idx2, Chem.rdchem.BondType.SINGLE)
184 |
185 | rd_mol = rd_mol.GetMol()
186 | return rd_mol
187 |
188 | def to_pybel_mol(self, from_coords=True):
189 | """
190 | Convert the graph to a Pybel molecule. Currently only supports
191 | creating the molecule from 3D coordinates.
192 | """
193 | if from_coords:
194 | xyz = self.to_xyz()
195 | mol = pybel.readstring('xyz', xyz)
196 | return mol
197 | else:
198 | raise NotImplementedError('Can only create Pybel molecules from 3D structure')
199 |
200 | def to_xyz(self, comment=''):
201 | """
202 | Convert the graph to an XYZ-format string. Optionally, add
203 | comment on the second line.
204 | """
205 | for atom in self:
206 | assert len(atom.coords) != 0
207 | symbols, coords = self.get_geometry()
208 | cblock = ['{0} {1[0]: .10f} {1[1]: .10f} {1[2]: .10f}'.format(s, c) for s, c in zip(symbols, coords)]
209 | return str(len(symbols)) + '\n' + comment + '\n' + '\n'.join(cblock)
210 |
211 | def perceive_smiles(self, atommap=True):
212 | """
213 | Using the geometry, perceive the corresponding SMILES with bond
214 | orders using Open Babel and RDKit. In order to create a sensible
215 | SMILES, first infer the connectivity from the 3D coordinates
216 | using Open Babel, then convert to InChI to saturate unphysical
217 | multi-radical structures, then convert to RDKit and match the
218 | atoms to the ones in self in order to return a SMILES with atom
219 | mapping corresponding to the order given by the values of
220 | atom.idx for all atoms in self.
221 |
222 | This method requires Open Babel version >=2.4.1
223 | """
224 |
225 | # Get dict of atomic numbers for later comparison.
226 | atoms_in_mol_true = {}
227 | for atom in self:
228 | anum = atom.get_atomicnum()
229 | atoms_in_mol_true[anum] = atoms_in_mol_true.get(anum, 0) + 1
230 |
231 | # There seems to be no particularly simple way in RDKit to read
232 | # in 3D structures, so use Open Babel for this part. RMG doesn't
233 | # recognize some single bonds, so we can't use that.
234 | # We've probably called to_pybel_mol at some previous time to set
235 | # connections, but it shouldn't be too expensive to do it again.
236 | pybel_mol = self.to_pybel_mol()
237 |
238 | # Open Babel will often make single bonds and generate Smiles
239 | # that have multiple radicals, which would probably correspond
240 | # to double bonds. To get around this, convert to InChI (which
241 | # does not consider bond orders) and then convert to Smiles.
242 | inchi = pybel_mol.write('inchi', opt={'F': None}).strip() # Add fixed H layer
243 |
244 | # Use RDKit to convert back to Smiles
245 | mol_sanitized = Chem.MolFromInchi(inchi)
246 |
247 | # RDKit doesn't like some hypervalent atoms
248 | if mol_sanitized is None:
249 | raise SanitizationError(
250 | 'Could not convert \n{}\nto Smiles. Unsanitized Smiles: {}'.format(self.to_xyz(),
251 | pybel_mol.write('smi').strip())
252 | )
253 |
254 | # RDKit adds unnecessary hydrogens in some cases. If
255 | # this happens, give up and return an error.
256 | mol_sanitized = Chem.AddHs(mol_sanitized)
257 | atoms_in_mol_sani = {}
258 | for atom in mol_sanitized.GetAtoms():
259 | atoms_in_mol_sani[atom.GetAtomicNum()] = atoms_in_mol_sani.get(atom.GetAtomicNum(), 0) + 1
260 | if atoms_in_mol_sani != atoms_in_mol_true:
261 | raise SanitizationError(
262 | 'Could not convert \n{}\nto Smiles. Wrong Smiles: {}'.format(self.to_xyz(),
263 | Chem.MolToSmiles(mol_sanitized))
264 | )
265 |
266 | if not atommap:
267 | return Chem.MolToSmiles(mol_sanitized)
268 |
269 | # Because we went through InChI, we lost atom mapping
270 | # information. Restore it by matching the original molecule.
271 | # There should only be one unique map.
272 | mol_with_map = self.to_rdkit_mol() # This only has single bonds
273 | mol_sani_sb = Chem.Mol(mol_sanitized) # Make copy with single bonds only
274 | for bond in mol_sani_sb.GetBonds():
275 | bond.SetBondType(Chem.rdchem.BondType.SINGLE)
276 | match = mol_sani_sb.GetSubstructMatch(mol_with_map) # Isomorphism mapping
277 | assert mol_with_map.GetNumAtoms() == len(match) # Make sure we match all atoms
278 | for atom in mol_with_map.GetAtoms():
279 | idx = match[atom.GetIdx()]
280 | map_num = atom.GetAtomMapNum()
281 | mol_sanitized.GetAtomWithIdx(idx).SetAtomMapNum(map_num)
282 |
283 | # If everything succeeded up to here, we hopefully have a
284 | # sensible Smiles string with atom mappings for all atoms.
285 | return Chem.MolToSmiles(mol_sanitized)
286 |
287 | def add_atom(self, atom):
288 | self.atoms.append(atom)
289 | atom.connections = {}
290 | return atom
291 |
292 | def add_connection(self, connection=None, atom1=None, atom2=None):
293 | """
294 | Either add a connection directly or first create one from two
295 | atoms and then add it.
296 | """
297 | if connection is None:
298 | connection = Connection(atom1, atom2)
299 | if connection.atom1 not in self.atoms or connection.atom2 not in self.atoms:
300 | raise Exception('Cannot add connection between atoms not in the graph')
301 | else:
302 | connection.atom1.connections[connection.atom2] = connection
303 | connection.atom2.connections[connection.atom1] = connection
304 | return connection
305 |
306 | def get_all_connections(self):
307 | return {connection for atom in self.atoms for connection in atom.connections.values()}
308 |
309 | def get_connection(self, atom1, atom2):
310 | if atom1 not in self.atoms or atom2 not in self.atoms:
311 | raise Exception('One or both of the specified atoms are not in this graph')
312 |
313 | try:
314 | return atom1.connections[atom2]
315 | except KeyError:
316 | raise Exception('The specified atoms are not connected in this graph')
317 |
318 | def remove_atom(self, atom):
319 | for atom2 in atom.connections:
320 | del atom2.connections[atom]
321 | atom.connections = {}
322 | self.atoms.remove(atom)
323 |
324 | def remove_connection(self, connection):
325 | if connection.atom1 not in self.atoms or connection.atom2 not in self.atoms:
326 | raise Exception('Cannot remove connection between atoms not in the graph')
327 | del connection.atom1.connections[connection.atom2]
328 | del connection.atom2.connections[connection.atom1]
329 |
330 | def copy(self, deep=False):
331 | other = MolGraph(energy=self.energy)
332 | atoms = self.atoms
333 | mapping = {}
334 | for atom in atoms:
335 | if deep:
336 | atom2 = other.add_atom(atom.copy())
337 | mapping[atom] = atom2
338 | else:
339 | connections = atom.connections
340 | other.add_atom(atom)
341 | atom.connections = connections
342 | if deep:
343 | for atom1 in atoms:
344 | for atom2 in atom1.connections:
345 | connection = atom1.connections[atom2]
346 | connection = connection.copy()
347 | connection.atom1 = mapping[atom1]
348 | connection.atom2 = mapping[atom2]
349 | other.add_connection(connection)
350 | return other
351 |
352 | def merge(self, other):
353 | new = MolGraph()
354 | for atom in self.atoms:
355 | connections = atom.connections
356 | new.add_atom(atom)
357 | atom.connections = connections
358 | for atom in other.atoms:
359 | connections = atom.connections
360 | new.add_atom(atom)
361 | atom.connections = connections
362 | new.energy = self.energy + other.energy
363 | return new
364 |
365 | def split(self):
366 | new1 = self.copy()
367 | new2 = MolGraph()
368 |
369 | if len(self.atoms) == 0:
370 | return [new1]
371 |
372 | atoms_to_move = [self.atoms[-1]]
373 | idx = 0
374 | while idx < len(atoms_to_move):
375 | for atom2 in atoms_to_move[idx].connections:
376 | if atom2 not in atoms_to_move:
377 | atoms_to_move.append(atom2)
378 | idx += 1
379 |
380 | if len(new1.atoms) == len(atoms_to_move):
381 | return [new1]
382 |
383 | for atom in atoms_to_move:
384 | new2.atoms.append(atom)
385 | new1.atoms.remove(atom)
386 |
387 | new1.energy = new2.energy = None  # fragment energies are no longer meaningful
388 | new = [new2]
389 | new.extend(new1.split())
390 | return new
391 |
392 | def sort_atoms(self):
393 | self.atoms.sort(key=lambda a: a.idx)
394 |
395 | def is_radical(self):
396 | """
397 | Determine whether or not the molecule is a radical based on the number
398 | of valence electrons for each atom. If the total number of valence
399 | electrons is odd, then it is a radical. This assumes that molecules
400 | with an even number of electrons are singlets. This method also assumes
401 | that none of the atoms are charged.
402 | """
403 | valence_electrons = {'H': 1, 'C': 4, 'N': 5, 'O': 6, 'F': 7, 'P': 5, 'S': 6, 'Cl': 7, 'Br': 7, 'I': 7, 'Li':1}
404 | symbols = [atom.symbol for atom in self]
405 | total_valence_electrons = sum(valence_electrons[s] for s in symbols)
406 | return bool(total_valence_electrons % 2)
407 |
408 | # def is_isomorphic(self, other):
409 | # """
410 | # Test if self is isomorphic with other, ignoring atom indices.
411 | # Requires RMG to do the isomorphism check.
412 | # """
413 | # self_rmg = self.to_rmg_mol()
414 | # other_rmg = other.to_rmg_mol()
415 | # return self_rmg.isIsomorphic(other_rmg)
416 |
417 | def topology_from_rdkit(self):
418 | rdkit_molecule = self.to_rdkit_mol()
419 | topology = nx.Graph()
420 | for atom in rdkit_molecule.GetAtoms():
421 | # Add the atoms as nodes
422 | topology.add_node(atom.GetIdx())
423 |
424 | # Add the bonds as edges
425 | for bonded in atom.GetNeighbors():
426 | topology.add_edge(atom.GetIdx(), bonded.GetIdx())
427 |
428 | return topology
429 |
430 | def is_isomorphic(self, other):
431 | topology1 = self.topology_from_rdkit()
432 | topology2 = other.topology_from_rdkit()  # compare against other, not self
433 | return nx.is_isomorphic(topology1, topology2)
434 |
435 |
436 | def set_coords(self, coords):
437 | """
438 | Set atom coordinates. Assumes coords are in same order as self.atoms.
439 | """
440 | try:
441 | coords = np.reshape(coords, (-1,3))
442 | except ValueError:
443 | raise Exception('Coordinates cannot be reshaped into matrix of size Nx3')
444 | assert len(coords) == len(self.atoms)
445 |
446 | for atom, xyz in zip(self.atoms, coords):
447 | atom.coords = xyz
448 |
449 | def get_coords(self):
450 | """
451 | Get coordinates in the order specified by the atom indices.
452 | """
453 | assert all(atom.idx is not None for atom in self)
454 | atoms = self.atoms[:]
455 | atoms.sort(key=lambda a: a.idx)
456 | return np.array([atom.coords for atom in atoms])
457 |
458 | def get_symbols(self):
459 | """
460 | Get symbols in the order specified by the atom indices.
461 | """
462 | assert all(atom.idx is not None for atom in self)
463 | atoms = self.atoms[:]
464 | atoms.sort(key=lambda a: a.idx)
465 | return [atom.symbol for atom in atoms]
466 |
467 | def get_geometry(self):
468 | """
469 | Get symbols and coordinates in the order specified by the atom
470 | indices.
471 | """
472 | assert all(atom.idx is not None for atom in self)
473 | atoms = self.atoms[:]
474 | atoms.sort(key=lambda a: a.idx)
475 | return [atom.symbol for atom in atoms], np.array([atom.coords for atom in atoms])
476 |
477 | def infer_connections(self, use_ob=True):
478 | """
479 | Delete connections and set them again based on coordinates.
480 |
481 | Note: By default this uses Open Babel, which is better than a
482 | simple covalent radii check.
483 | """
484 | atoms = self.atoms
485 |
486 | for atom in atoms:
487 | assert len(atom.coords) != 0
488 |
489 | for atom in atoms:
490 | # connections maps neighbor -> Connection; reset it to drop all bonds
491 | atom.connections = {}
492 |
493 | if use_ob:
494 | pybel_mol = self.to_pybel_mol() # Should be sorted by atom indices
495 | assert all(ap.idx == a.idx for ap, a in zip(pybel_mol, self)) # Check to be sure
496 | mapping = {ap.idx: a for ap, a in zip(pybel_mol, self)}
497 | for bond in pybel.ob.OBMolBondIter(pybel_mol.OBMol):
498 | atom1 = mapping[bond.GetBeginAtomIdx()]
499 | atom2 = mapping[bond.GetEndAtomIdx()]
500 | connection = Connection(atom1, atom2)
501 | self.add_connection(connection)
502 | else:
503 | sorted_atoms = sorted(atoms, key=lambda a: a.coords[2])
504 | for i, atom1 in enumerate(sorted_atoms):
505 | for atom2 in sorted_atoms[(i+1):]:
506 | crit_dist = (atom1.get_cov_rad() + atom2.get_cov_rad() + 0.45)**2
507 | z_boundary = (atom1.coords[2] - atom2.coords[2])**2
508 | if z_boundary > 16.0:
509 | break
510 | dist_sq = sum((atom1.coords - atom2.coords)**2)
511 | if dist_sq > crit_dist or dist_sq < 0.4:
512 | continue
513 | else:
514 | connection = Connection(atom1, atom2)
515 | self.add_connection(connection)
516 |
517 | def is_atom_in_cycle(self, atom):
518 | return self._is_chain_in_cycle([atom])
519 |
520 | def _is_chain_in_cycle(self, chain):
521 | atom1 = chain[-1]
522 | for atom2 in atom1.connections:
523 | if atom2 is chain[0] and len(chain) > 2:
524 | return True
525 | elif atom2 not in chain:
526 | chain.append(atom2)
527 | if self._is_chain_in_cycle(chain):
528 | return True
529 | else:
530 | chain.remove(atom2)
531 | return False
532 |
533 | #def label_equivalent_hydrogens(self):
534 | # """
535 | # Mark all equivalent hydrogens as frozen. For now, this assumes that the
536 | # carbons they are attached to have 4 connections, which means this
537 | # method does not yet work for radicals.
538 | # """
539 | # if self.is_radical():
540 | # raise NotImplementedError('Cannot yet label equivalent hydrogens for radicals')
541 | # for atom in self:
542 | # if (atom.symbol.upper() == 'C'
543 | # and len(atom.connections) == 4
544 | # and not self.is_atom_in_cycle(atom)):
545 | # first_hydrogen = True
546 | # for atom2 in atom.connections:
547 | # if atom2.symbol.upper() == 'H':
548 | # if first_hydrogen:
549 | # first_hydrogen = False
550 | # else:
551 | # atom2.frozen = True
552 |
553 | def label_equivalent_hydrogens(self):
554 | """
555 | Mark equivalent hydrogens as frozen. This version works for radicals (no
556 | assumption of four connections on carbon) and for other heavy elements.
557 | """
558 | # Proceed even if the molecule is a radical
559 | for atom in self:
560 | if atom.symbol.upper() != 'H' and not self.is_atom_in_cycle(atom):
561 | hydrogens = [a for a in atom.connections if a.symbol.upper() == 'H']
562 | if len(hydrogens) > 1:
563 | first_hydrogen = True
564 | for hydrogen in hydrogens:
565 | if first_hydrogen:
566 | first_hydrogen = False
567 | else:
568 | hydrogen.frozen = True
569 |
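A minimal sketch of perceiving an atom-mapped SMILES from a geometry with this class (the file name is hypothetical; assumes ASE, Open Babel >= 2.4.1, and RDKit are installed):

from ase.io import read
from dandelion.segsm.ard_gsm.mol import MolGraph, SanitizationError

xyz = read('product.xyz')  # hypothetical geometry file
mol = MolGraph(symbols=xyz.get_chemical_symbols(), coords=xyz.get_positions())
try:
    print(mol.perceive_smiles())
except SanitizationError:
    print('Geometry could not be converted to a sensible SMILES')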
--------------------------------------------------------------------------------
/dandelion/segsm/create_gsm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import argparse
5 |
6 | from ase.io import read
7 | from .ard_gsm.mol import MolGraph
8 | from .ard_gsm.limits import connection_limits
9 | from .ard_gsm.driving_coords import generate_driving_coords
10 |
11 |
12 | def main(args):
13 |
14 | print_args(args)
15 |
16 | input_path = args.input_path
17 | if not os.path.isdir(input_path):
18 | sys.exit(f"Error: '{input_path}' is not a directory.")
19 | output_path = args.output_path
20 | maxbreak = args.maxbreak
21 | maxform = args.maxform
22 | maxchange = args.maxchange
23 | minbreak = args.minbreak
24 | minform = args.minform
25 | minchange = args.minchange
26 | ignore_single_change = args.ignore_single_change
27 | equiv_Hs = args.equiv_Hs
28 |
29 | pdir = output_path
30 | if not os.path.exists(pdir):
31 | os.makedirs(pdir)
32 |
33 | with open(os.path.join(pdir, 'params.log'), 'w') as f:
34 | f.write('Connection limits:\n')
35 | for symbol in connection_limits:
36 | ll = connection_limits[symbol][0]
37 | ul = connection_limits[symbol][1]
38 | f.write(' {}: {}, {}\n'.format(symbol, ll, ul))
39 | f.write('maxbreak = {}\n'.format(maxbreak))
40 | f.write('maxform = {}\n'.format(maxform))
41 | f.write('maxchange = {}\n'.format(maxchange))
42 | f.write('single_change = {}\n'.format(not ignore_single_change))
43 | f.write('equiv_Hs = {}\n'.format(equiv_Hs))
44 | f.write('minbreak = {}\n'.format(minbreak))
45 | f.write('minform = {}\n'.format(minform))
46 | f.write('minchange = {}\n'.format(minchange))
47 |
48 | # Loop over Mothers
49 | for mother in glob.iglob(os.path.join(input_path, '**/*.xyz'), recursive=True):
50 | xyz = read(mother)
51 | symbols, coords = xyz.get_chemical_symbols(), xyz.get_positions()
52 | mol = MolGraph(symbols=symbols, coords=coords)
53 | mol.infer_connections()
54 | name = os.path.basename(os.path.dirname(mother))
55 |
56 | seeds = generate_driving_coords(
57 | mol,
58 | maxbreak=maxbreak,
59 | maxform=maxform,
60 | maxchange=maxchange,
61 | single_change=not ignore_single_change,
62 | equiv_Hs=equiv_Hs,
63 | minbreak=minbreak,
64 | minform=minform,
65 | minchange=minchange
66 | )
67 | print(f'{len(seeds)} Seeds were generated from {name}')
68 |
69 | mother_dir = os.path.join(pdir, name)  # per-mother output directory
70 | if not os.path.exists(mother_dir):
71 | os.mkdir(mother_dir)
72 |
73 | # Loop over seeds
74 | for idx, seed in enumerate(seeds):
75 |
76 | gsm_dir = os.path.join(mother_dir, f'gsm{idx:04}')
77 | if not os.path.exists(gsm_dir):
78 | os.mkdir(gsm_dir)
79 |
80 | isomers_file = os.path.join(gsm_dir, 'ISOMERS.txt')
81 | initial_file = os.path.join(gsm_dir, 'initial.xyz')
82 | bash_file = os.path.join(gsm_dir, 'gsm.sh')
83 |
84 | with open(bash_file, 'w') as f:
85 | f.write('''
86 | gsm -xyzfile initial.xyz \\
87 | -mode SE_GSM \\
88 | -num_nodes 30 \\
89 | -package xTB_lot \\
90 | -isomers ISOMERS.txt \\
91 | -xyz_output_format multixyz \\
92 | -coordinate_type DLC > gsm_log 2>&1''')
93 |
94 | with open(isomers_file, 'w') as f:
95 | f.write(str(seed))
96 | with open(initial_file, 'w') as f:
97 | f.write(str(len(symbols)) + '\n')
98 | f.write('\n')
99 | for symbol, xyz in zip(symbols, coords):
100 | f.write('{0} {1[0]: .10f} {1[1]: .10f} {1[2]: .10f}\n'.format(symbol, xyz))
101 |
102 | print('\nCreating GSM finished!')
103 |
104 | def print_args(args):
105 | print()
106 | print("Arguments provided:")
107 | arg_dict = vars(args)
108 | for key, value in arg_dict.items():
109 | print(f" {key}: {value}")
110 | print()
111 |
112 | def get_parser():
113 | parser = argparse.ArgumentParser(description='Make GSM jobs from mother structures')
114 |
115 | parser.add_argument('-i', '--input_path', required=True,
116 | help='Input path of mother structures')
117 | parser.add_argument('-o', '--output_path', required=True,
118 | help='Output path of gsm jobs')
119 |
120 | parser.add_argument('--maxbreak', type=int, default=2,
121 | help='Maximum number of connections to break')
122 | parser.add_argument('--maxform', type=int, default=2,
123 | help='Maximum number of connections to form')
124 | parser.add_argument('--maxchange', type=int, default=3,
125 | help='Maximum number of connections to change')
126 |
127 | parser.add_argument('--minbreak', type=int, default=0,
128 | help='Minimum number of connections to break')
129 | parser.add_argument('--minform', type=int, default=0,
130 | help='Minimum number of connections to form')
131 | parser.add_argument('--minchange', type=int, default=1,
132 | help='Minimum number of connections to change')
133 |
134 | parser.add_argument('--ignore_single_change', action=argparse.BooleanOptionalAction, default=True,  # type=bool parsed any string as True; BooleanOptionalAction needs Python >= 3.9
135 | help='Do not consider single connection changes (e.g., nbreak=1, nform=0)')
136 | parser.add_argument('--equiv_Hs', action=argparse.BooleanOptionalAction, default=False,
137 | help='Create equivalent driving coordinates for the same reaction with different but '
138 | 'equivalent hydrogens, i.e., hydrogens attached to non-cyclic tetrahedral carbons')
139 |
140 | return parser
141 |
142 |
143 | if __name__ == "__main__":
144 | args = get_parser().parse_args()
145 | main(args)
146 |
147 |
148 |
149 |
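Each gsmNNNN directory thus receives an ISOMERS.txt containing the string form of one DrivingCoords seed, i.e. BREAK/ADD lines with 1-based atom indices, for example (indices illustrative):

BREAK 1 5
ADD 2 7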
--------------------------------------------------------------------------------
/dandelion/segsm/filter_gsm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import shutil
5 | import argparse
6 |
7 | from rdkit import RDLogger
8 | from ase.io import read, write
9 | from openbabel import openbabel
10 | from .ard_gsm.mol import MolGraph, SanitizationError
11 | #from ard_gsm.mol import MolGraph, SanitizationError
12 |
13 | # Suppress Noisy warning in the filter
14 | RDLogger.logger().setLevel(RDLogger.CRITICAL)
15 | openbabel.obErrorLog.SetOutputLevel(openbabel.obError)
16 |
17 | '''
18 | Fate of a pyGSM run
19 |
20 | 1) No png is produced
21 | - xTB did not converge
22 | - pyGSM terminated the run on its own criteria
23 |
24 | 2) A png is produced
25 | - Exiting early -> should be filtered out
26 | - Ran out of iterations -> may still contain a potential reaction
27 | - Converged -> very rare
28 | '''
29 |
30 |
31 |
32 | def parse_gsm_log(keyword, content):
33 | """Find the value associated with a keyword in a text content."""
34 | # For TS_energy, we're expecting a float, so we use a different pattern
35 | if keyword == "TS energy:":
36 | pattern = f"{keyword} ([+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)"
37 | else:
38 | pattern = f"{keyword} (\d+)"
39 |
40 | import re
41 | matches = re.findall(pattern, content)
42 |
43 | # Return the matched value; assume there's only one match
44 | if matches:
45 | return matches[0][0] # Due to group structures, we take the first element
46 | else:
47 | return None
48 |
49 |
50 | def get_gsm_data(home, seed, string):
51 | try:
52 | with open(os.path.join(home, seed, string, 'gsm_log'), 'r') as f:
53 | content = f.read()
54 | except FileNotFoundError:
55 | return None
56 |
57 | nodes = []
58 | try:
59 | with open(os.path.join(home, seed, string, 'opt_converged_000.xyz'), 'r') as f:
60 | for i in range(30):
61 | try:
62 | nodes.append(read(f, i))
63 | except Exception:  # stop at the last readable frame
64 | break
65 | except FileNotFoundError:
66 | return None
67 |
68 | return {
69 | "TS_energy" : float(parse_gsm_log("TS energy:", content)),
70 | "reactant_idx" : int(parse_gsm_log("min reactant node:", content)),
71 | "product_idx" : int(parse_gsm_log("min product node", content)),
72 | "TS_idx" : int(parse_gsm_log("TS node is", content)),
73 | "nodes" : nodes,
74 | 'energies' : [float(list(node.info.keys())[0]) for node in nodes]
75 | }
76 |
77 |
78 |
79 | def profile_filter(strings, home, seed, barrier_max, barrier_min, delta_e_min):
80 | '''
81 | Given the GSM-successful reactions, filter the strings by TS index,
82 | barrier height, and reaction energy (delta_e).
83 | '''
84 | filtered = {}
85 | for string in strings:
86 | data = get_gsm_data(home, seed, string)
87 | if not data:
88 | continue
89 |
90 | if data["TS_idx"] >= data["product_idx"]: # wrong ts
91 | continue
92 | if (data["TS_energy"] > barrier_max) or (data["TS_energy"] < barrier_min): # too high or low barrier
93 | continue
94 | if abs(data['energies'][data['product_idx']]) * 627.503 < delta_e_min: # maybe reactant==product
95 | continue
96 |
97 | product_graph = MolGraph(symbols=data["nodes"][data["product_idx"]].get_chemical_symbols(),
98 | coords=data["nodes"][data["product_idx"]].get_positions(),
99 | energy=float(list(data["nodes"][data["product_idx"]].info.keys())[0]))
100 |
101 | filtered[string] = {
102 | 'reactant': data["nodes"][data["reactant_idx"]],
103 | 'product': data["nodes"][data["product_idx"]],
104 | 'ts': data["nodes"][data["TS_idx"]],
105 | 'product_graph': product_graph,
106 | 'ts_energy': data["TS_energy"]
107 | }
108 |
109 | return filtered
110 |
111 | def structure_filter(reactions):
112 | '''
113 | Chemically absurd products are filtered out here (graph -> pybel -> InChI -> SMILES).
114 | The perceived SMILES are saved to the dictionary for the unique filter.
115 | '''
116 |
117 | filtered = {}
118 |
119 | for rxn, data in reactions.items():
120 | try:
121 | smiles = data['product_graph'].perceive_smiles()
122 | filtered[rxn] = data
123 | filtered[rxn]['product_smiles'] = smiles
124 | except SanitizationError:
125 | continue
126 | return filtered
127 |
128 | def unique_filter(reactions):
129 | '''
130 | Duplicates are filtered based on SMILES.
131 | If the same SMILES occurs more than once, keep the lowest-barrier reaction.
132 | '''
133 | unique = {}
134 | for rxn, data in reactions.items():
135 | smiles = data['product_smiles']
136 | ts_energy = data['ts_energy']
137 | if smiles not in unique or ts_energy < unique[smiles]['ts_energy']:
138 | unique[smiles] = {
139 | 'reaction_key': rxn,
140 | 'ts_energy': ts_energy,
141 | 'reactant': data['reactant'],
142 | 'product': data['product'],
143 | 'ts': data['ts'],
144 | }
145 | return unique
146 |
147 | def save_unique_reactions(home, output_path, seed, reactions):
148 | for smiles, data in reactions.items():
149 | reaction_dir = os.path.join(output_path, seed, data['reaction_key'])
150 | os.makedirs(reaction_dir, exist_ok=True)
151 |
152 | file_types = ["reactant", "ts", "product"]
153 | for f_type in file_types:
154 | write(os.path.join(reaction_dir, f"{f_type}.xyz"), data[f_type])
155 | write(os.path.join(reaction_dir, f"{f_type}.png"), data[f_type])
156 |
157 | shutil.copyfile(os.path.join(home, seed, data['reaction_key'], '0000_string.png'),
158 | os.path.join(reaction_dir, 'string.png'))
159 |
160 | shutil.copyfile(os.path.join(home, seed, data['reaction_key'], 'opt_converged_000.xyz'),
161 | os.path.join(reaction_dir, 'string.xyz'))
162 |
163 | def main(args):
164 |
165 | print_args(args)
166 |
167 | input_path = args.input_path
168 | if not os.path.isdir(input_path):
169 | sys.exit(f"Error: '{input_path}' is not a directory.")
170 | output_path = args.output_path
171 | if not os.path.exists(output_path):
172 | os.mkdir(output_path)
173 | barrier_max = args.barrier_max
174 | barrier_min = args.barrier_min
175 | delta_e_min = args.delta_e_min
176 |
177 | mothers = [d for d in os.listdir(input_path) if os.path.isdir(os.path.join(input_path, d))]
178 | for mother in mothers:
179 | print('\n◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢')
180 | print(f'mother: {mother}'.center(35))
181 | driving_coordinates = list(glob.iglob(os.path.join(input_path, f'{mother}/*/gsm_log')))
182 | success_strings = [path.split('/')[-2] for path in glob.iglob(os.path.join(input_path, f'{mother}/*/0000_string.png'))]
183 |
184 | profile_filtered_strings = profile_filter(success_strings, input_path, mother, barrier_max, barrier_min, delta_e_min)
185 | structure_filtered_strings = structure_filter(profile_filtered_strings)
186 | unique_reactions = unique_filter(structure_filtered_strings)
187 |
188 | print(f'Initial seeds: {len(driving_coordinates):>5}')
189 | print(f'GSM success reactions: {len(success_strings):>5}')
190 | print(f'Profile filtered reactions: {len(profile_filtered_strings):>5}')
191 | print(f'Structure filtered reactions: {len(structure_filtered_strings):>5}')
192 | print(f'Unique reactions: {len(unique_reactions):>5}')
193 |
194 | save_unique_reactions(input_path, output_path, mother, unique_reactions)
195 |
196 | print('\nFiltering GSM finished!')
197 |
198 | def print_args(args):
199 | print()
200 | print("Arguments provided:")
201 | arg_dict = vars(args)
202 | for key, value in arg_dict.items():
203 | print(f" {key}: {value}")
204 | print()
205 |
206 | def get_parser():
207 | parser = argparse.ArgumentParser(description='Filter finished GSM jobs')
208 |
209 | parser.add_argument('-i', '--input_path', required=True,
210 | help='Input path of finished gsm jobs')
211 | parser.add_argument('-o', '--output_path', required=True,
212 | help='Output path of filtered gsm jobs')
213 |
214 | parser.add_argument('--barrier_min', type=int, default=5)
215 | parser.add_argument('--barrier_max', type=int, default=200)
216 | parser.add_argument('--delta_e_min', type=int, default=5)
217 |
218 | return parser
219 |
220 |
221 | if __name__ == "__main__":
222 | args = get_parser().parse_args()
223 | main(args)
224 |
225 |
226 |
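parse_gsm_log above matches keyword-prefixed lines in gsm_log; a log satisfying all four queries would contain lines roughly of this shape (values illustrative, and the exact pyGSM wording may differ):

TS energy: 45.3
min reactant node: 0
min product node 17
TS node is 9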
--------------------------------------------------------------------------------
/dandelion/segsm/run_gsm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import subprocess
5 | from concurrent.futures import ProcessPoolExecutor, as_completed
6 |
7 | from tqdm import tqdm
8 |
9 | # conda activate ts
10 | # Check whether the gsm processes are killed when you interrupt this script.
11 | # Run it detached, e.g. "nohup python -u 2_run_gsm_jobs > gsm.out &"
12 |
13 |
14 | def run_gsm_script(script_dir):
15 | #print(f"Executing in directory: {script_dir}")
16 | subprocess.run('bash gsm.sh', cwd=script_dir, capture_output=True, text=True, shell=True)
17 |
18 | def main(args):
19 |
20 | print_args(args)
21 |
22 | input_path = args.input_path
23 | if not os.path.isdir(input_path):
24 | sys.exit(f"Error: '{input_path}' is not a directory.")
25 | max_workers = args.max_workers
26 |
27 | # Find all directories containing gsm.sh scripts
28 | script_dirs = [dirpath for dirpath, _, filenames in os.walk(input_path) if "gsm.sh" in filenames]
29 |
30 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
31 | with ProcessPoolExecutor(max_workers=max_workers) as executor:
32 | futures = [executor.submit(run_gsm_script, script_dir) for script_dir in script_dirs]
33 |
34 | for future in tqdm(as_completed(futures), desc='GSM on seeds',
35 | total=len(script_dirs), smoothing=0, bar_format=bar_format, ncols=70):
36 | pass # just update the tqdm
37 |
38 | print('GSM finished!')
39 |
40 | def print_args(args):
41 | print()
42 | print("Arguments provided:")
43 | arg_dict = vars(args)
44 | for key, value in arg_dict.items():
45 | print(f" {key}: {value}")
46 | print()
47 |
48 | def get_parser():
49 | parser = argparse.ArgumentParser(description='Run GSM jobs concurrently')
50 |
51 | parser.add_argument('-i', '--input_path', required=True,
52 | help='Base directory of mothers bearing seeds')
53 | parser.add_argument('-n', '--max_workers', type=int, default=1,
54 | help='Number of worker processes')
55 |
56 | return parser
57 |
58 |
59 | if __name__ == "__main__":
60 | args = get_parser().parse_args()
61 | main(args)
62 |
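As the header comments suggest, a detached run might look like this (directory name and worker count illustrative):

nohup python -u run_gsm.py -i gsm_jobs -n 16 > gsm.out &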
--------------------------------------------------------------------------------
/dandelion/utils/db_h5_tools/db_to_h5.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 |
5 | import h5py
6 | from ase.db import connect
7 |
8 |
9 | def main(args):
10 |
11 | print_args(args)
12 |
13 | input_path = args.input_path
14 | if not os.path.isfile(input_path):
15 | sys.exit(f"Error: '{input_path}' is not a file.")
16 | output_path = args.output_path
17 |
18 | # Data structure to hold the computed results
19 | rxn_data = {}
20 |
21 | rows = [] # List to store all rows
22 |
23 | # Extract data from ASE database
24 | with connect(input_path) as db:
25 | for row in db.select():
26 | if hasattr(row, 'energy') and hasattr(row, 'forces'):
27 | rows.append(row)
28 |
29 | # Sort rows based on the unique_id number
30 | rows.sort(key=lambda r: int(r.data['unique_id'].split('_')[-1]))
31 |
32 | # Process sorted rows
33 | for row in rows:
34 | # Extract unique_id and other data
35 | unique_id = row.data['unique_id']
36 | chem_group_name, rxn_group_name, index = unique_id.split('_')
37 |
38 | if chem_group_name not in rxn_data:
39 | rxn_data[chem_group_name] = {}
40 |
41 | if rxn_group_name not in rxn_data[chem_group_name]:
42 | rxn_data[chem_group_name][rxn_group_name] = {
43 | 'atomic_numbers': row.toatoms().numbers,
44 | 'energies': [],
45 | 'forces': [],
46 | 'positions': []
47 | }
48 | rxn_data[chem_group_name][rxn_group_name]['energies'].append(row.energy)
49 | rxn_data[chem_group_name][rxn_group_name]['forces'].append(row.forces)
50 | rxn_data[chem_group_name][rxn_group_name]['positions'].append(row.toatoms().positions)
51 |
52 | # Save the data to an h5 file
53 | with h5py.File(output_path, 'w') as h5file:
54 | # Ensure the 'data' group exists
55 | if 'data' not in h5file:
56 | data_group = h5file.create_group('data')
57 | else:
58 | data_group = h5file['data']
59 |
60 | # Iterate through the rxn_data dictionary to save datasets
61 | for chem_group_name in rxn_data:
62 | if chem_group_name not in data_group:
63 | chem_group = data_group.create_group(chem_group_name)
64 | else:
65 | chem_group = data_group[chem_group_name]
66 |
67 | for rxn_group_name, rxn_entry in rxn_data[chem_group_name].items():
68 | if rxn_group_name not in chem_group:
69 | rxn_group = chem_group.create_group(rxn_group_name)
70 | else:
71 | rxn_group = chem_group[rxn_group_name]
72 |
73 | # Add datasets to the reaction group
74 | rxn_group.create_dataset('atomic_numbers', data=rxn_entry['atomic_numbers'])
75 | rxn_group.create_dataset('wB97x_6-31G(d).energy', data=rxn_entry['energies'])
76 | rxn_group.create_dataset('wB97x_6-31G(d).forces', data=rxn_entry['forces'])
77 | rxn_group.create_dataset('positions', data=rxn_entry['positions'])
78 |
79 | print('Compiled successfully!')
80 |
81 | def print_args(args):
82 | print()
83 | print("Arguments provided:")
84 | arg_dict = vars(args)
85 | for key, value in arg_dict.items():
86 | print(f" {key}: {value}")
87 | print()
88 |
89 | def get_parser():
90 | parser = argparse.ArgumentParser(description="Translate ase db file into hdf5 file.")
91 |
92 | parser.add_argument('-i', '--input_path', required=True,
93 | help="Path of the input wB97X ASE db file")
94 | parser.add_argument('-o', '--output_path', required=True,
95 | help="Path of the output wB97X hdf5 file")
96 |
97 | return parser
98 |
99 | if __name__ == "__main__":
100 | args = get_parser().parse_args()
101 | main(args)
102 |
103 |
104 |
--------------------------------------------------------------------------------
/dandelion/utils/db_h5_tools/h5_to_db.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import h5py
4 | import argparse
5 |
6 | from tqdm import tqdm
7 | from ase import Atoms
8 | from ase.db import connect
9 | from ase.calculators.singlepoint import SinglePointCalculator
10 |
11 |
12 | def main(args):
13 |
14 | print_args(args)
15 |
16 | input_path = args.input_path
17 | if not os.path.isfile(input_path):
18 | sys.exit(f"Error: '{input_path}' is not a file.")
19 | output_path = args.output_path
20 |
21 |
22 | with h5py.File(input_path, 'r') as h5_file:
23 | data_group = h5_file['data']
24 |
25 | # Count total number of configurations
26 | total_configs = sum(
27 | rxn_group['wB97x_6-31G(d).energy'].shape[0]
28 | for chem_group in data_group.values()
29 | for rxn_group in chem_group.values()
30 | )
31 |
32 | with connect(output_path) as db:
33 | with tqdm(total=total_configs, desc="Converting", unit="config") as pbar:
34 | for chem_group_name, chem_group in data_group.items():
35 | for rxn_group_name, rxn_group in chem_group.items():
36 | atomic_numbers = rxn_group['atomic_numbers'][:]
37 | positions = rxn_group['positions'][:]
38 | energies = rxn_group['wB97x_6-31G(d).energy'][:]
39 | forces = rxn_group['wB97x_6-31G(d).forces'][:]
40 |
41 | for i in range(len(energies)):
42 | atoms = Atoms(
43 | numbers=atomic_numbers,
44 | positions=positions[i],
45 | )
46 | atoms.calc = SinglePointCalculator(  # assignment replaces the deprecated set_calculator()
47 | atoms,
48 | energy=energies[i],
49 | forces=forces[i]
50 | )
51 |
52 | unique_id = f"{chem_group_name}_{rxn_group_name}_{i}"
53 | db.write(atoms, data={'unique_id': unique_id})
54 |
55 | pbar.update(1)
56 |
57 | def print_args(args):
58 | print()
59 | print("Arguments provided:")
60 | arg_dict = vars(args)
61 | for key, value in arg_dict.items():
62 | print(f" {key}: {value}")
63 | print()
64 |
65 | def get_parser():
66 |     parser = argparse.ArgumentParser(description="Translate an HDF5 file into an ASE db file.")
67 |
68 | parser.add_argument('-i', '--input_path', required=True,
69 | help="Path of the input wB97X hdf5 file")
70 | parser.add_argument('-o', '--output_path', required=True,
71 | help="Path of the output wB97X db file")
72 |
73 | return parser
74 |
75 | if __name__ == "__main__":
76 | args = get_parser().parse_args()
77 | main(args)
78 |
--------------------------------------------------------------------------------
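
A quick sanity check on the converted database is to read a few rows back: `row.toatoms()` rebuilds each `Atoms` object with a `SinglePointCalculator` carrying the stored energy and forces. A minimal sketch, assuming a hypothetical output path `wb97x.db`:

```python
from ase.db import connect

# Read back the first few rows written by h5_to_db.py.
# 'wb97x.db' is a hypothetical path for the script's output.
with connect('wb97x.db') as db:
    for row in db.select(limit=3):
        atoms = row.toatoms()                  # calculator holds stored results
        print(row.data['unique_id'],
              atoms.get_potential_energy(),    # stored single-point energy
              atoms.get_forces().shape)        # (n_atoms, 3)
```
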
/dandelion/utils/db_h5_tools/make_db_from_xyzs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import argparse
5 |
6 | from tqdm import tqdm
7 | from ase import io
8 | from ase.db import connect
9 |
10 |
11 | def main(args):
12 |
13 |     print_args(args)
14 |
15 |     input_path = args.input_path
16 |     if not os.path.isdir(input_path):
17 |         sys.exit(f"Error: '{input_path}' is not a directory.")
18 |     output_path = args.output_path
19 |
20 |     with connect(output_path) as db:
21 |         for file_path in tqdm(glob.glob(os.path.join(input_path, '**/*.xyz'), recursive=True)):
22 |             atoms = io.read(file_path)
23 |             db.write(atoms)
24 |
25 | def print_args(args):
26 |     print()
27 |     print("Arguments provided:")
28 |     arg_dict = vars(args)
29 |     for key, value in arg_dict.items():
30 |         print(f"  {key}: {value}")
31 |     print()
32 |
33 | def get_parser():
34 |     parser = argparse.ArgumentParser(description='Merge the xyz files in an input directory into a db file.')
35 |
36 |     parser.add_argument('-i', '--input_path', required=True,
37 |                         help='Input path of directory containing xyz files to merge')
38 |     parser.add_argument('-o', '--output_path', required=True,
39 |                         help='Output path of the merged db file.')
40 |
41 |     return parser
42 |
43 |
44 | if __name__ == "__main__":
45 |     args = get_parser().parse_args()
46 |     main(args)
47 |
48 |
--------------------------------------------------------------------------------
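
Since `ase.io.read` also parses energies and forces from extended-XYZ files, any results embedded in the inputs survive the trip into the db. An end-to-end sketch on toy molecules — the `xyzs/` directory and `merged.db` path are hypothetical:

```python
import os
import glob

from ase import io
from ase.build import molecule
from ase.db import connect

# Build a toy directory of xyz files, then merge them the same way the script does.
os.makedirs('xyzs', exist_ok=True)
for name in ('H2O', 'CH4'):
    io.write(os.path.join('xyzs', f'{name}.xyz'), molecule(name))

with connect('merged.db') as db:
    for path in glob.glob(os.path.join('xyzs', '**/*.xyz'), recursive=True):
        db.write(io.read(path))

print(connect('merged.db').count())  # -> 2
```
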
/dandelion/utils/db_h5_tools/merge_db.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import argparse
5 |
6 | from tqdm import tqdm
7 | from ase.db import connect
8 |
9 | def main(args):
10 |
11 | print_args(args)
12 |
13 | input_path = args.input_path
14 | if not os.path.isdir(input_path):
15 | sys.exit(f"Error: '{input_path}' is not a directory.")
16 | output_path = args.output_path
17 |
18 | with connect(output_path) as db1:
19 | for f in glob.glob(os.path.join(input_path, '**/wb97x.db'), recursive=True):
20 | with connect(f) as db2:
21 |                 for row in tqdm(db2.select(), total=db2.count(), desc=f):
22 | db1.write(row.toatoms())
23 |
24 | def print_args(args):
25 | print()
26 | print("Arguments provided:")
27 | arg_dict = vars(args)
28 | for key, value in arg_dict.items():
29 | print(f" {key}: {value}")
30 | print()
31 |
32 | def get_parser():
33 | parser = argparse.ArgumentParser(description='Merge db files in input directory')
34 |
35 | parser.add_argument('-i', '--input_path', required=True,
36 | help='Input path of directory containing db files to merge')
37 | parser.add_argument('-o', '--output_path', required=True,
38 | help='Output path of the merged db file.')
39 |
40 | return parser
41 |
42 |
43 | if __name__ == "__main__":
44 | args = get_parser().parse_args()
45 | main(args)
46 |
47 |
--------------------------------------------------------------------------------
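
Note that writing `row.toatoms()` carries over only the atoms and their attached calculator results; per-row key-value pairs and the `data` blob (e.g. the `unique_id` stored by `h5_to_db.py`) are dropped. A sketch of one way to preserve them, with `in.db` and `out.db` as hypothetical paths:

```python
from ase.db import connect

# Merge variant that also preserves each row's key-value pairs and data blob.
# 'in.db' and 'out.db' are hypothetical paths.
with connect('out.db') as dst, connect('in.db') as src:
    for row in src.select():
        dst.write(row.toatoms(),
                  key_value_pairs=row.key_value_pairs,
                  data=row.data)
```
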
/dandelion/utils/db_h5_tools/merge_h5.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import argparse
5 |
6 | import h5py
7 | from tqdm import tqdm
8 |
9 |
10 |
11 | def main(args):
12 |
13 | print_args(args)
14 |
15 | input_path = args.input_path
16 | if not os.path.isdir(input_path):
17 | sys.exit(f"Error: '{input_path}' is not a directory.")
18 | output_path = args.output_path
19 |
20 | # Open the output file
21 | with h5py.File(output_path, 'w') as h5file_out:
22 | # Ensure the 'data' group exists in the output file
23 | if 'data' not in h5file_out:
24 | data_group_out = h5file_out.create_group('data')
25 | else:
26 | data_group_out = h5file_out['data']
27 |
28 |         # Iterate through each input file found under the input directory
29 |         for h5_path in glob.glob(os.path.join(input_path, '**/wb97x.h5'), recursive=True):
30 |             print(h5_path)
31 |             # Prefix reaction groups with the parent directory name so they stay unique
32 |             prefix = os.path.basename(os.path.dirname(h5_path))
33 |
34 |             # Open the input file
35 |             with h5py.File(h5_path, 'r') as h5file_in:
36 | # Iterate through chemical groups in the input file
37 | for chem_group_name, chem_group in tqdm(h5file_in['data'].items(), desc="Formulas"):
38 | # Ensure the chemical group exists in the output file
39 | if chem_group_name not in data_group_out:
40 | chem_group_out = data_group_out.create_group(chem_group_name)
41 | else:
42 | chem_group_out = data_group_out[chem_group_name]
43 |
44 | # Iterate through reaction groups in the chemical group
45 | for rxn_group_name, rxn_group in tqdm(chem_group.items(), desc=f"Rxns in {chem_group_name}", leave=False):
46 | # Prefix the reaction group name with 'a' or 'b'
47 | rxn_group_name_prefixed = f"{prefix}_{rxn_group_name}"
48 |
49 | # Ensure the reaction group exists in the output file
50 | if rxn_group_name_prefixed not in chem_group_out:
51 | rxn_group_out = chem_group_out.create_group(rxn_group_name_prefixed)
52 | else:
53 | rxn_group_out = chem_group_out[rxn_group_name_prefixed]
54 |
55 | # Copy datasets from input to output, creating new datasets
56 | for dset_name, dset in rxn_group.items():
57 | data = dset[:]
58 | rxn_group_out.create_dataset(dset_name, data=data)
59 |
60 | def print_args(args):
61 | print()
62 | print("Arguments provided:")
63 | arg_dict = vars(args)
64 | for key, value in arg_dict.items():
65 | print(f" {key}: {value}")
66 | print()
67 |
68 | def get_parser():
69 | parser = argparse.ArgumentParser(description='Merge h5 files in input directory')
70 |
71 | parser.add_argument('-i', '--input_path', required=True,
72 | help='Input path of directory containing h5 files to merge')
73 | parser.add_argument('-o', '--output_path', required=True,
74 | help='Output path of the merged h5 file.')
75 |
76 | return parser
77 |
78 |
79 | if __name__ == "__main__":
80 | args = get_parser().parse_args()
81 | main(args)
82 |
83 |
--------------------------------------------------------------------------------
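
To confirm that a merge picked up every reaction, the configuration-counting idiom from `h5_to_db.py` works unchanged on the merged file; `merged.h5` is a hypothetical path:

```python
import h5py

# Count configurations across the data/<formula>/<rxn> layout the merge script writes.
# 'merged.h5' is a hypothetical path.
with h5py.File('merged.h5', 'r') as f:
    total = sum(
        rxn_group['wB97x_6-31G(d).energy'].shape[0]
        for chem_group in f['data'].values()
        for rxn_group in chem_group.values()
    )
    print(total, 'configurations in', len(f['data']), 'formula groups')
```
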
/environment.yml:
--------------------------------------------------------------------------------
1 | name: ts
2 | channels:
3 | - conda-forge
4 | - defaults
5 | dependencies:
6 | - python=3.11.5
7 | - pip=23.2.1
8 | - ase=3.22.1
9 | - imageio=2.31.1
10 | - matplotlib-base=3.7.2
11 | - numpy=1.25.2
12 | - openbabel=3.1.1
13 | - scipy=1.11.2
14 | - networkx=3.1
15 | - xtb-python=22.1
--------------------------------------------------------------------------------
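
The environment above is created with the standard conda workflow, e.g. `conda env create -f environment.yml` followed by `conda activate ts`; the pip-pinned dependencies in setup.cfg below are layered on top when the package itself is installed.
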
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = dandelion
3 | version = attr: dandelion.__version__
4 | author = Minhyeok Lee
5 | author_email = mlee@yonsei.ac.kr
6 | description = Near-TS region sampler for machine learning force fields
7 | python_requires = >=3.11
8 | classifiers =
9 |     License :: OSI Approved :: MIT License
10 | Programming Language :: Python :: 3
11 | long_description = file: README.md
12 | long_description_content_type = text/markdown
13 | [options]
14 | packages = find:
15 | install_requires =
16 | h5py==3.9.0
17 | rdkit==2023.3.3
18 | tqdm==4.66.1
19 | typing-extensions==4.8.0
20 |
21 | [options.entry_points]
22 | console_scripts =
23 | dand = dandelion.cli:main
24 |
--------------------------------------------------------------------------------
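
Installing the package (for instance with an editable install, `pip install -e .`, from the repository root) reads the version from `dandelion.__version__` via the `attr:` directive and registers the `dand` console command, which dispatches to `dandelion.cli:main`.
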
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup()
--------------------------------------------------------------------------------