├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── dandelion ├── __init__.py ├── cli.py ├── dandelion_prep.py ├── dandelion_refine.py ├── dandelion_sample.py ├── neb │ ├── __init__.py │ ├── compile_neb.py │ ├── filter_neb.py │ └── run_neb.py ├── prep │ ├── __init__.py │ ├── geom_opt.py │ └── smiles_to_isoconfs.py ├── refine │ ├── __init__.py │ ├── compile_refined.py │ └── refine_forces.py ├── segsm │ ├── __init__.py │ ├── ard_gsm │ │ ├── __init__.py │ │ ├── driving_coords.py │ │ ├── limits.py │ │ └── mol.py │ ├── create_gsm.py │ ├── filter_gsm.py │ └── run_gsm.py └── utils │ └── db_h5_tools │ ├── db_to_h5.py │ ├── h5_to_db.py │ ├── make_db_from_xyzs.py │ ├── merge_db.py │ └── merge_h5.py ├── environment.yml ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python,linux 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux 3 | **/*wb97.py 4 | **/nms 5 | .xtboptok 6 | ### Linux ### 7 | *~ 8 | 9 | # temporary files which can be created if a process still has a handle open of a deleted file 10 | .fuse_hidden* 11 | 12 | # KDE directory preferences 13 | .directory 14 | 15 | # Linux trash folder which might appear on any partition or disk 16 | .Trash-* 17 | 18 | # .nfs files are created when an open file is removed but is still being accessed 19 | .nfs* 20 | 21 | ### Python ### 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | share/python-wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | MANIFEST 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/ 62 | .tox/ 63 | .nox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *.cover 70 | *.py,cover 71 | .hypothesis/ 72 | .pytest_cache/ 73 | cover/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Django stuff: 80 | *.log 81 | local_settings.py 82 | db.sqlite3 83 | db.sqlite3-journal 84 | 85 | # Flask stuff: 86 | instance/ 87 | .webassets-cache 88 | 89 | # Scrapy stuff: 90 | .scrapy 91 | 92 | # Sphinx documentation 93 | docs/_build/ 94 | 95 | # PyBuilder 96 | .pybuilder/ 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # IPython 103 | profile_default/ 104 | ipython_config.py 105 | 106 | # pyenv 107 | # For a library or package, you might want to ignore these files since the code is 108 | # intended to run in multiple environments; otherwise, check them in: 109 | # .python-version 110 | 111 | # pipenv 112 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 113 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 114 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 115 | # install all needed dependencies. 
116 | #Pipfile.lock 117 | 118 | # poetry 119 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 120 | # This is especially recommended for binary packages to ensure reproducibility, and is more 121 | # commonly ignored for libraries. 122 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 123 | #poetry.lock 124 | 125 | # pdm 126 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 127 | #pdm.lock 128 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 129 | # in version control. 130 | # https://pdm.fming.dev/#use-with-ide 131 | .pdm.toml 132 | 133 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 134 | __pypackages__/ 135 | 136 | # Celery stuff 137 | celerybeat-schedule 138 | celerybeat.pid 139 | 140 | # SageMath parsed files 141 | *.sage.py 142 | 143 | # Environments 144 | .env 145 | .venv 146 | env/ 147 | venv/ 148 | ENV/ 149 | env.bak/ 150 | venv.bak/ 151 | 152 | # Spyder project settings 153 | .spyderproject 154 | .spyproject 155 | 156 | # Rope project settings 157 | .ropeproject 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # mypy 163 | .mypy_cache/ 164 | .dmypy.json 165 | dmypy.json 166 | 167 | # Pyre type checker 168 | .pyre/ 169 | 170 | # pytype static type analyzer 171 | .pytype/ 172 | 173 | # Cython debug symbols 174 | cython_debug/ 175 | 176 | # PyCharm 177 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 178 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 179 | # and can be added to the global gitignore or merged into this file. For a more nuclear 180 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 181 | #.idea/ 182 | 183 | ### Python Patch ### 184 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 185 | poetry.toml 186 | 187 | # ruff 188 | .ruff_cache/ 189 | 190 | # LSP config files 191 | pyrightconfig.json 192 | 193 | # End of https://www.toptal.com/developers/gitignore/api/python,linux 194 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [Unreleased] - 2023-10-12 4 | 5 | ### Added 6 | - run_gsm.py: added a test that checks whether the gsm command is available 7 | - filter_gsm.py: added a step that optimizes the product with xTB and filters out the reaction if the RMSD change is large 8 | 9 | 10 | ### Added 11 | ### Changed 12 | ### Removed 13 | ### Fixed 14 | 15 | ## [0.7.4] - 2024-10-08 16 | ### Changed 17 | - README.md updated. 18 | 19 | ## [0.7.3] - 2024-10-08 20 | ### Changed 21 | - segsm/ard_gsm/mol.py: This version works with radicals: no assumption of four connections on carbon atoms. Also works with other heavy elements. 22 | 23 | 24 | ## [0.7.2] - 2024-09-02 25 | ### Added 26 | - Utilities that handle db and h5 files are added to utils/db_h5_tools. 27 | 1. db_to_h5.py 28 | 2. h5_to_db.py 29 | 3. make_db_from_xyzs.py 30 | 4. merge_db.py 31 | 5. merge_h5.py 32 | 33 | ## [0.7.1] - 2024-09-01 34 | 35 | ### Added 36 | - Normal mode sampling codes are added to utils/nms. 37 | 1. normal_mode_sampling.py 38 | 2. 
refine_forces_nms.py 39 | 40 | ### Fixed 41 | - All code now asserts the type of the input_path (dir or file) 42 | 43 | ## [0.7.0] - 2024-08-31 44 | 45 | ### Added 46 | - Sampling iso/conformers is included as a preparatory step in dandelion. 47 | 1. smiles_to_isoconfs.py 48 | 2. geom_opt.py 49 | 3. dandelion_prep.py 50 | 51 | - cli.py: allows invoking dandelion from the CLI, e.g. 'dand prep -i ./a.smi -n 40'. 52 | 53 | ### Changed 54 | - dandelion is shortened to 'dand' in the CLI. 55 | - dandelion_sample.py: default argument '0_mothers' changed to '0_reactants' 56 | - print_separator, merge_args_with_defaults are moved to \__init__.py 57 | ## [0.6.2] - 2024-07-08 58 | 59 | ### Fixed 60 | - dandelion.py: renamed to dandelion_refine.py 61 | 62 | 63 | 64 | ## [0.6.1] - 2024-01-14 65 | 66 | ### Changed 67 | - compile_refined.py: bug fixed when atomrow doesn't have 'energy' and 'forces' 68 | 69 | 70 | ## [0.6.0] - 2023-11-21 71 | 72 | ### Added 73 | - filter_neb.py: added function is_valid_reaction to filter out weird rxns 74 | 75 | ## [0.5.6] - 2023-11-21 76 | 77 | ### Changed 78 | - refine_forces.py: suppress errors in force calculation, save them to orca_error.log 79 | - refine_forces.py: now saves samples in batches 80 | - refine_forces.py: open .db file with a with statement 81 | 82 | ## [0.5.5] - 2023-11-14 83 | 84 | ### Added 85 | - run_neb.py: added argument fmax_threshold (default=0.1 eV/A) 86 | 87 | ### Fixed 88 | - refine_forces.py: added NoTrah to the orca command 89 | 90 | 91 | ## [0.5.4] - 2023-11-7 92 | 93 | ### Fixed 94 | - compile_neb.py: fixed argparser that had no required=True 95 | 96 | 97 | ## [0.5.3] - 2023-11-2 98 | 99 | ### Fixed 100 | - dandelion_refine.py: awesome ascii art 101 | 102 | 103 | ## [0.5.2] - 2023-11-2 104 | 105 | ### Fixed 106 | - compile_refined.py: sorting the rows in the right order 107 | 108 | 109 | ## [0.5.1] - 2023-10-17 110 | 111 | ### Added 112 | - opt_mothers.py: optimize crude structures using xTB 113 | 114 | 115 | ## [0.5.0] - 2023-10-12 116 | 117 | ### Added 118 | - filter_neb.py: xTB normal mode TS validation: is_transition_state 119 | 120 | 121 | ## [0.4.1] - 2023-10-11 122 | 123 | ### Added 124 | - Added \__init__.py with the variable \__version__ 125 | 126 | ### Fixed 127 | - Basis set 6-31g(d) for the Br atom in orca was handled thanks to https://github.com/ZimmermanGroup/ORCA-Basis-Sets 128 | 129 | 130 | ## [0.4.0] - 2023-10-10 131 | 132 | ### Added 133 | - dandelion_refine.py that runs the refine processes 134 | 135 | 136 | ## [0.3.1] - 2023-10-10 137 | 138 | ### Added 139 | - setup.py, README.md, CHANGELOG.md, LICENSE added 140 | 141 | 142 | ## [0.2.0] - 2023-09-30 143 | 144 | ### Added 145 | - dandelion.py that runs through neb, refine 146 | - Codes refactored 147 | 148 | ### Fixed 149 | - Issues with absolute import fixed 150 | 151 | 152 | ## [0.1.0] - 2023-09-10 153 | 154 | ### Added 155 | - Initial release with features neb, refine, segsm 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2023 Minhyeok Lee 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished 
to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Dandelion
 2 | 3 | [![docs](https://img.shields.io/badge/docs-mhyeok1.github.io/dand__docs/-brightgreen.svg)](https://mhyeok1.github.io/dand_docs/) 4 | [![DOI](https://img.shields.io/badge/DOI-10.1002/advs.202409009-blue.svg)](https://doi.org/10.1002/advs.202409009) 5 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14020916.svg)](https://doi.org/10.5281/zenodo.14020916) 6 | 7 | 8 | Codes for automated and efficient sampling of chemical reaction space for MLIP training 9 | 10 | 11 | drawing 12 |
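A typical end-to-end run strings the three stages together through the `dand` entry point (see `dandelion/cli.py` below). This is a minimal sketch: the SMILES file name, worker count, and directory layout are illustrative placeholders that assume the default output locations of each stage, and the ORCA path must point to your own installation; further options live in the individual modules.

```
# 1. prep: enumerate iso/conformers from a file of SMILES strings and optimize them with xTB
dand prep -i ./mols.smi -n 40

# 2. sample: run SE-GSM + NEB starting from the optimized reactants
dand sample -i ./0_reactants -n 40

# 3. refine: recompute energies and forces of the sampled points with ORCA (wB97X/6-31G(d))
dand refine -i . -n 40 --orca /path/to/orca
```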
13 | 14 | Dandelion is a code for generating datasets that contain both equilibrium and reactive regions of potential energy surfaces, using automated and efficient sampling of chemical reaction space. 15 | 16 | **Documentation** : 17 | 18 | 19 | 20 | ## Citation 21 | If you find this work useful for your research, please consider citing: 22 | 23 | - Lee et al. *Adv. Sci.* **12**, 2409009 (2025) [LINK](https://doi.org/10.1002/advs.202409009) 24 | 25 | This work builds upon pioneering works that should also be cited: 26 | - Grambow et al. *Sci. Data* **7**, 137 (2020) [LINK](https://doi.org/10.1038/s41597-020-0460-4) 27 | - Schreiner et al. *Sci. Data* **9**, 779 (2022) [LINK](https://doi.org/10.1038/s41597-022-01870-w) 28 | 29 | ## Supporting Information 30 | The datasets used in the paper are available at zenodo. 31 | 32 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14020916.svg)](https://doi.org/10.5281/zenodo.14020916) 33 | 34 | 35 | -------------------------------------------------------------------------------- /dandelion/__init__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | __version__ = '0.7.4' 4 | 5 | def print_separator(text, width=70): 6 | border = "╔" + "═" * (width-2) + "╗" 7 | 8 | total_symbols_len = width - len(text) - 4 9 | half_len = total_symbols_len // 2 10 | left_symbol = "║" + " " * (half_len - 1) 11 | right_symbol = " " * (total_symbols_len - half_len - 1) + "║" 12 | separator = left_symbol + ' ' + text + ' ' + right_symbol 13 | 14 | end = "╚" + "═" * (width-2) + "╝" 15 | print("\n\n" + border) 16 | print(separator) 17 | print(end + "\n\n") 18 | 19 | def merge_args_with_defaults(module_parser, custom_args): 20 | """ 21 | Merge custom arguments with module defaults. 
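    For illustration (hypothetical values), the dandelion_prep/sample/refine drivers call it as
    merged = merge_args_with_defaults(geom_opt_parser, {'input_path': '0_reactants', 'max_workers': 4})
    and pass the resulting namespace straight to that stage's main function, e.g. geom_opt(merged).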
22 | Args: 23 | - module_parser: the module parser function 24 | - custom_args: dictionary of custom arguments 25 | 26 | Returns: 27 | - argparse.Namespace: merged namespace of arguments 28 | """ 29 | 30 | parser = module_parser() 31 | for action in parser._actions: 32 | if action.required: 33 | action.required = False 34 | 35 | defaults = vars(parser.parse_args([])) 36 | defaults.update(custom_args) 37 | 38 | for action in parser._actions: 39 | if not action.required and action.dest in custom_args: 40 | action.required = True 41 | 42 | return argparse.Namespace(**defaults) -------------------------------------------------------------------------------- /dandelion/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from dandelion import dandelion_prep, dandelion_sample, dandelion_refine 3 | 4 | def main(): 5 | if len(sys.argv) < 2: 6 | print("Usage: dand [prep|sample|refine] [options]") 7 | sys.exit(1) 8 | 9 | command = sys.argv[1] 10 | # Remove the 'dand' and the subcommand from sys.argv 11 | sys.argv = [sys.argv[0]] + sys.argv[2:] 12 | 13 | if command == "prep": 14 | dandelion_prep.main() 15 | elif command == "sample": 16 | dandelion_sample.main() 17 | elif command == "refine": 18 | dandelion_refine.main() 19 | else: 20 | print(f"Unknown command: {command}") 21 | print("Available commands: prep, sample, refine") 22 | sys.exit(1) 23 | 24 | if __name__ == "__main__": 25 | main() -------------------------------------------------------------------------------- /dandelion/dandelion_prep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import time 5 | import argparse 6 | 7 | from dandelion import __version__, print_separator, merge_args_with_defaults 8 | from dandelion.prep.smiles_to_isoconfs import main as smiles_to_isoconfs, get_parser as smiles_to_isoconfs_parser 9 | from dandelion.prep.geom_opt import main as geom_opt, get_parser as geom_opt_parser 10 | 11 | 12 | def print_header(width=70): 13 | 14 | print(f''' 15 | 16 | H H 17 | \\\\ - 18 | \\\\ - 19 | C──────────C\ H 20 | - \\\\ / 21 | - \\\\ / 22 | H────────C O=Cc1ccccc1 C──────────C 23 | \\\\ - \\\\ 24 | \\\\ - \\\\ 25 | \\C─────────C- O 26 | - \\\\ 27 | - \\\\ 28 | H H 29 | 30 | {"Prepare Iso/Conformers from SMILES strings".center(width)} 31 | {("Dandelion " + __version__ + " by mlee").center(width)} 32 | ''') 33 | 34 | 35 | def main(): 36 | args = parse_arguments() 37 | 38 | input_path = args.input_path 39 | if not os.path.isfile(input_path): 40 | sys.exit(f"Error: '{input_path}' is not a file.") 41 | working_path = os.path.dirname(input_path) 42 | max_workers = args.max_workers 43 | 44 | phases = [ 45 | ("1. Sample iso/conformers from SMILES strings", smiles_to_isoconfs, smiles_to_isoconfs_parser, { 46 | 'input_path': input_path, 47 | 'output_path': os.path.join(working_path, '-1_isoconfs'), 48 | }), 49 | ("2. 
Optimize geometries", geom_opt, geom_opt_parser, { 50 | 'input_path': os.path.join(working_path, '-1_isoconfs'), 51 | 'output_path': os.path.join(working_path, '0_reactants'), 52 | 'max_workers': max_workers 53 | }), 54 | ] 55 | 56 | print_header() 57 | 58 | for title, function, parser, custom_args in phases: 59 | time.sleep(3) 60 | print_separator(title) 61 | merged_args = merge_args_with_defaults(parser, custom_args) 62 | function(merged_args) 63 | 64 | 65 | def parse_arguments(): 66 | parser = argparse.ArgumentParser(description='Prepare optimized iso/conformers from SMILES,\ 67 | Other parameters can be set in each modules') 68 | 69 | parser.add_argument('-i', '--input_path', required=True, 70 | help='Input path of a file containing SMILES') 71 | parser.add_argument('-n', '--max_workers', type=int, default=1, 72 | help='Number of processes to use for parallel execution.') 73 | 74 | return parser.parse_args() 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /dandelion/dandelion_refine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import time 5 | import argparse 6 | 7 | from dandelion import __version__, print_separator, merge_args_with_defaults 8 | from dandelion.refine.refine_forces import main as refine_forces, get_parser as refine_forces_parser 9 | from dandelion.refine.compile_refined import main as compile_refined, get_parser as compile_refined_parser 10 | 11 | 12 | 13 | def print_header(width=70): 14 | 15 | print(f''' 16 | 17 | ⢀⣀⣀⣀⣀⣀⡀ ⢀⢀⣀⢀⠞⠖⠁⠡⡂⡆ ⡠⢀⡀ 18 | ⠺⢿⣿⣿⣿⣿⣿⣿⣷⣦⣠⣤⣤⣤⣄⣀⣀ ⡏⢸ ⢀ ⠣⠈ ⡠⡋⡨⡋⡂ 19 | ⠙⢿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⣦⣄⡀⡎⢀⡰⢀⢎⠌⢀⠔⣐⠠⣄⣀ 20 | ⢀ ⡔⢀⣴⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⠿⠿⠿⣿⣿⣷⣄⠂ ⢊⠎ ⠠⠂⡀⠕⠌⠌ ⡄⡠⢄ 21 | ⢀⡆⠄⠁⢈⢠⣾⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣀ ⣀⣿⣿⣿⣆⠐ ⡨⠒⠁⡀⢠⣦⠍⠇⡀⢲⠂⡄⠄ 22 | ⠨⡀⠑⡈ ⢠⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⣿⡄ ⠈ ⣬⠠⣰⣿ ⢳⢹⡄⡆⠄⢀⢼ 23 | ⡄⠱⠈⠁⠑⢄⠐⣾⣿⣿⡿⠋⠁⣀⣠⣬⣽⣿⣿⣿⣿⣿⣿⠿⠿⠿⠿⠿⠿⠿⠿⠟⠁⡟⣅⡢⠁⠠⠜⡄⡑⢌⢧⡀ ⡀⣰⢁⡐⢁⢄⣡⣧⡤⠄ 24 | ⠠⡐⠓⠂⠌ ⢀⣿⣿⡏⢀⣴⣿⠿⠛⠉⠉⠶⢸⣿⣿⠿⠁⠢⠨⢀⣻⣿⣿⣿⣿⢟⣿⣝⠂ ⠠⡠⢆⠈⡂⠱⡇ ⣅⠫⠂⡠⢂⡪⠋ ⠁⡆ 25 | ⡶⠉ ⢀⡀⠁⡁⢸⣿⣿⢠⣾⡟⠁⣿⣿⡇ ⢀⠈⠉⠁ ⣀⠷⣹⣏⣷⢏⠹⠁ ⠈⢈ ⢇ ⢸⠱⢸⡏⡀⡶⡸⠎ ⠰⠁⡸ 26 | ⢈⡕⡈⠁⠐⠂⢀⢸⣿⣿⣾⠏⣿⣿⡿⣻⣿⢞⡢⠄ ⠈ ⡀⡤⠂⠁⠉⠌ ⢀⢀⠠⠐⢄ ⡀⢆⠎⢹⣶⣷⣧⡈⠈⠉⠤⠂⠉⢀⠱⡀ 27 | ⢠⡊ ⠁⣸⣿⣿⣿⣀⠉⡻⡏⠋⠁ ⠁⠒⠒⡀⣍⠍⠁ ⡀ ⢠⠂ ⢀⠈⠄⢀⠄⡒⠅⠈⢄⢡ ⢿⣿⣷⣿⡄ ⠐⠄⠤ ⠜⢀ 28 | ⠐⠁ ⠤⠒⢠⣾⣿⣿⣿⣿⣿⣷⣄⢄ ⢀ ⡏ ⢰⣃⠊⡐⠐⠁⢀⠈ ⣀ ⠰⠢⢀⠂⡰⠈⠂ ⡱⠂⢂⡇⡈⠻⢿⣿⠇ ⡤⠄⣀⡰⠁ 29 | ⠁⣾⣿⣿⣿⣿⣿⣿⣿⣿⣦ ⠄ ⠉ ⠸⠫⢞⠈⣰⠈ ⡐⢲⣿⡏ ⢠⡾ ⣀⠊⢱ ⠠⡀ ⢈⢀⡐⠤⣕⡄ 30 | ⢰⣿⡿⠛⠉ ⠈⠙⠛ ⠈⠈ ⠻⠔⠁⢸⡍⡇ ⢀⣏ ⢀⠠⠆ ⠣⡀⠈⡠⡀⠉⠢⡤⠢⣈⡡⣢⠦ 31 | ⠈⠁ ⢻⣇ ⢸⡇⡇ ⣼⡿⠉ ⢀⡇ ⠑⡄⠑⣌⢄ ⠙⢄⠠⡪⣅ 32 | ⠈⣾⡆ ⢸⣏⡇ ⢠⣿⠇ ⠸⢌⢢⢄⡠⠣⠈⠢⡁⡈⣎⢢⡬⠃ 33 | 34 | {"Energy refinement on samples using orca".center(width)} 35 | {("Dandelion " + __version__ + " by mlee").center(width)} 36 | ''') 37 | 38 | 39 | def main(): 40 | args = parse_arguments() 41 | 42 | input_path = args.input_path 43 | if not os.path.isdir(input_path): 44 | sys.exit(f"Error: '{input_path}' is not a directory.") 45 | max_workers = args.max_workers 46 | orcabinary = args.orca 47 | 48 | phases = [ 49 | ("1. Refining forces", refine_forces, refine_forces_parser, { 50 | 'input_path': os.path.join(input_path, 'xtb.h5'), 51 | 'output_path': os.path.join(input_path, 'wb97x.db'), 52 | 'orca' : orcabinary, 53 | 'max_workers': max_workers 54 | }), 55 | ("2. 
Compiling final samples", compile_refined, compile_refined_parser, { 56 | 'input_path': os.path.join(input_path, 'wb97x.db'), 57 | 'output_path': os.path.join(input_path, 'wb97x.h5') 58 | }), 59 | ] 60 | 61 | print_header() 62 | 63 | for title, function, parser, custom_args in phases: 64 | time.sleep(3) 65 | print_separator(title) 66 | merged_args = merge_args_with_defaults(parser, custom_args) 67 | function(merged_args) 68 | 69 | 70 | def parse_arguments(): 71 | parser = argparse.ArgumentParser(description='Refine force on obtained samples,\ 72 | Other parameters can be set in each modules') 73 | 74 | parser.add_argument('-i', '--input_path', required=True, 75 | help='Input path of directory containing xtb.h5') 76 | parser.add_argument('-n', '--max_workers', type=int, required=True, 77 | help='Number of worker processes') 78 | parser.add_argument('--orca', required=True, 79 | help="Path of the orca binary file") 80 | 81 | return parser.parse_args() 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /dandelion/dandelion_sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import time 5 | import argparse 6 | 7 | from dandelion import __version__, print_separator, merge_args_with_defaults 8 | from dandelion.segsm.create_gsm import main as create_gsm, get_parser as create_gsm_parser 9 | from dandelion.segsm.run_gsm import main as run_gsm, get_parser as run_gsm_parser 10 | from dandelion.segsm.filter_gsm import main as filter_gsm, get_parser as filter_gsm_parser 11 | from dandelion.neb.run_neb import main as run_neb, get_parser as run_neb_parser 12 | from dandelion.neb.filter_neb import main as filter_neb, get_parser as filter_neb_parser 13 | from dandelion.neb.compile_neb import main as compile_neb, get_parser as compile_neb_parser 14 | 15 | 16 | def print_header(width=70): 17 | 18 | print(f''' 19 | 20 | `;:` BREAK 1 2 21 | .;:; / BREAK 3 4 22 | _____ _ _;::; ` ADD 1 3 23 | | __ \ | | | |';:;' 24 | | | | | __ _ _ __ __| | ___| | _ ___ _ __ 25 | | | | |/ _` | '_ \ / _` |/ _ \ | | |/ _ \| '_ \ 26 | | |__| | (_| | | | | (_| | __/ | | | (_) | | | | 27 | |_____/ \__,_|_| |_|\__,_|\___|_| |_|\___/|_| |_| 28 | 29 | {"Chemical compound space sampling".center(width)} 30 | {"near transition state using xTB, SE-GSM and NEB".center(width)} 31 | {("Dandelion " + __version__ + " by mlee").center(width)} 32 | ''') 33 | 34 | 35 | def main(): 36 | args = parse_arguments() 37 | 38 | input_path = args.input_path 39 | if not os.path.isdir(input_path): 40 | sys.exit(f"Error: '{input_path}' is not a directory.") 41 | output_path = os.path.dirname(os.path.dirname(input_path)) 42 | max_workers = args.max_workers 43 | 44 | if not os.path.exists(output_path): 45 | os.makedirs(output_path) 46 | 47 | phases = [ 48 | ("1. Creating GSM", create_gsm, create_gsm_parser, { 49 | 'input_path': input_path, 50 | 'output_path': os.path.join(output_path, '1_gsm') 51 | }), 52 | ("2. Running GSM", run_gsm, run_gsm_parser, { 53 | 'input_path': os.path.join(output_path, '1_gsm'), 54 | 'max_workers': max_workers 55 | }), 56 | ("3. Filtering GSM", filter_gsm, filter_gsm_parser, { 57 | 'input_path': os.path.join(output_path, '1_gsm'), 58 | 'output_path': os.path.join(output_path, '2_gsm_filtered') 59 | }), 60 | 61 | ("4. 
Running NEB", run_neb, run_neb_parser, { 62 | 'input_path': os.path.join(output_path, '2_gsm_filtered'), 63 | 'output_path': os.path.join(output_path, '3_neb'), 64 | 'max_workers': max_workers 65 | }), 66 | ("5. Filtering NEB", filter_neb, filter_neb_parser, { 67 | 'input_path': os.path.join(output_path, '3_neb'), 68 | 'output_path': os.path.join(output_path, '4_neb_filtered') 69 | }), 70 | ("6. Compiling samples", compile_neb, compile_neb_parser, { 71 | 'input_path': os.path.join(output_path, '4_neb_filtered', 'reactions.json'), 72 | 'output_path': os.path.join(output_path, 'xtb.h5') 73 | }), 74 | ] 75 | 76 | print_header() 77 | 78 | for title, function, parser, custom_args in phases: 79 | time.sleep(3) 80 | print_separator(title) 81 | merged_args = merge_args_with_defaults(parser, custom_args) 82 | function(merged_args) 83 | 84 | 85 | def parse_arguments(): 86 | parser = argparse.ArgumentParser(description='Do SEGSM and NEB from reactant structures,\ 87 | Other parameters can be set in each modules') 88 | 89 | parser.add_argument('-i', '--input_path', required=True, 90 | help='Input path of reactant structures (must be a directory)') 91 | parser.add_argument('-n', '--max_workers', type=int, required=True, 92 | help='Number of worker processes') 93 | return parser.parse_args() 94 | 95 | 96 | if __name__ == "__main__": 97 | main() -------------------------------------------------------------------------------- /dandelion/neb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/neb/__init__.py -------------------------------------------------------------------------------- /dandelion/neb/compile_neb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import hashlib 5 | import argparse 6 | import itertools 7 | 8 | import h5py 9 | import ase.db 10 | import numpy as np 11 | from tqdm import tqdm 12 | from ase.units import Hartree, Bohr 13 | 14 | 15 | def get_hash(row): 16 | s = str(row.positions) + row.formula 17 | return int(hashlib.sha1(s.encode("utf-8")).hexdigest(), 16) % (10 ** 8) 18 | 19 | def write_rxn(h5file, fmaxs_path, db_path, rxn, fmax_threshold): 20 | fmaxs = json.load(open(fmaxs_path)) 21 | 22 | skip_next = False 23 | first = True 24 | cum_fmax = 0 25 | 26 | with ase.db.connect(db_path) as db: 27 | for i, (fmax, path) in enumerate(zip(fmaxs, sliced_it(10, db.select("")))): 28 | cum_fmax += fmax 29 | skip_this = skip_next 30 | skip_next = False 31 | last = i == len(fmaxs) - 1 32 | 33 | if last: 34 | skip_this = False 35 | 36 | if cum_fmax < fmax_threshold: 37 | skip_next = True 38 | 39 | else: 40 | cum_fmax = 0 41 | 42 | if skip_this: 43 | continue 44 | 45 | if not first: 46 | path = path[1:-1] 47 | 48 | # reactant and product is sampled once 49 | # (all points -2) // 8 ==0 50 | 51 | forces_path = np.array([row.forces for row in path]) 52 | positions_path = np.array([row.positions for row in path]) 53 | energy_path = np.array([row.energy for row in path]) 54 | 55 | if first: 56 | forces = forces_path 57 | positions = positions_path 58 | energy = energy_path 59 | reactant = path[0] # pylint: disable=undefined-loop-variable 60 | product = path[-1] # pylint: disable=undefined-loop-variable 61 | 62 | else: 63 | forces = np.concatenate((forces, forces_path), axis=0) 64 | positions = np.concatenate((positions, positions_path), axis=0) 65 | energy = 
np.concatenate((energy, energy_path), axis=0) 66 | 67 | first = False 68 | 69 | transition_state = path[ # pylint: disable=undefined-loop-variable 70 | np.argmax(energy_path) 71 | ] 72 | 73 | formula = reactant.formula 74 | atomic_numbers = reactant.numbers 75 | 76 | if formula in h5file: 77 | grp = h5file[formula] 78 | else: 79 | grp = h5file.create_group(formula) 80 | 81 | subgrp = grp.create_group(rxn) 82 | single_molecule(reactant, subgrp.create_group("reactant")) 83 | single_molecule(transition_state, subgrp.create_group("transition_state")) 84 | single_molecule(product, subgrp.create_group("product")) 85 | 86 | dict_ = { 87 | "forces": forces, 88 | "positions": positions, 89 | "energy": energy, 90 | "atomic_numbers": atomic_numbers, 91 | } 92 | write_group(dict_, subgrp) 93 | 94 | 95 | def single_molecule(molecule, subgrp): 96 | dict_ = { 97 | "forces": np.expand_dims(molecule.forces, 0), 98 | "positions": np.expand_dims(molecule.positions, 0), 99 | "energy": np.expand_dims(molecule.energy, 0), 100 | "atomic_numbers": molecule.numbers, 101 | "hash": get_hash(molecule), 102 | } 103 | write_group(dict_, subgrp) 104 | 105 | 106 | def write_group(dict_, grp): 107 | grp.create_dataset("atomic_numbers", data=dict_["atomic_numbers"]) 108 | grp.create_dataset("GFN2-xTB.forces", data=dict_["forces"]) 109 | grp.create_dataset("GFN2-xTB.energy", data=dict_["energy"]) 110 | grp.create_dataset("positions", data=dict_["positions"]) 111 | 112 | if "hash" in dict_: 113 | grp.create_dataset("hash", data=dict_["hash"]) 114 | 115 | 116 | def sliced_it(n, iterable): 117 | it = iter(iterable) 118 | while True: 119 | chunk = itertools.islice(it, n) 120 | yield list(chunk) 121 | 122 | 123 | def main(args): 124 | 125 | print_args(args) 126 | 127 | input_path = args.input_path 128 | if not os.path.isfile(input_path): 129 | sys.exit(f"Error: '{input_path}' is not a file.") 130 | output_path = args.output_path 131 | fmax_threshold = args.fmax_threshold 132 | 133 | rxns = json.load(open(input_path)) 134 | h5file = h5py.File(output_path, "w") 135 | 136 | data = h5file.create_group("data") 137 | indexfile = open(output_path + ".index.json", "w") 138 | index = {} 139 | 140 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 141 | for i, path in tqdm(enumerate(rxns), total=len(rxns), desc="Compiling reactions", bar_format=bar_format, ncols=70): 142 | 143 | fmaxs_path = os.path.join(path, "fmaxs.json") 144 | db_path = os.path.join(path, "neb.db") 145 | 146 | new_rxn_name = f"rxn{str(i).zfill(4)}" 147 | write_rxn(data, fmaxs_path, db_path, new_rxn_name, fmax_threshold) 148 | index[new_rxn_name] = os.path.basename(path) 149 | 150 | json.dump(index, indexfile, indent=4) 151 | 152 | print('Compiling finished!') 153 | 154 | def print_args(args): 155 | print() 156 | print("Arguments provided:") 157 | arg_dict = vars(args) 158 | for key, value in arg_dict.items(): 159 | print(f" {key}: {value}") 160 | print() 161 | 162 | def get_parser(): 163 | parser = argparse.ArgumentParser(description="Compile filtered neb jobs to xtb h5 file.") 164 | 165 | parser.add_argument('-i', '--input_path', required=True, 166 | help="Path of reactions.json, contains all reactions that should be included in the dataset ") 167 | parser.add_argument('-o', '--output_path', required=True, 168 | help="Path to the h5 file to write to") 169 | parser.add_argument('--fmax_threshold', type=int, default=0.1, 170 | help='Fmax threshold for selecting bands') 171 | return parser 172 | 173 | if __name__ == "__main__": 174 | args = 
get_parser().parse_args() 175 | main(args) 176 | 177 | -------------------------------------------------------------------------------- /dandelion/neb/filter_neb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import shutil 5 | import argparse 6 | from collections import defaultdict 7 | 8 | from tqdm import tqdm 9 | import numpy as np 10 | from ase.io import read 11 | from ase.vibrations import Vibrations 12 | from xtb.ase.calculator import XTB 13 | 14 | def get_energy_from_xyz(file_path): 15 | """Extracts the energy of a structure from an XYZ file.""" 16 | try: 17 | atom = read(file_path) 18 | return atom.get_potential_energy() 19 | except: 20 | return None 21 | 22 | def is_valid_rxn(reactant_path, product_path, ts_path): 23 | """Check if the reaction is valid based on energy.""" 24 | 25 | reactant_energy = get_energy_from_xyz(reactant_path) 26 | product_energy = get_energy_from_xyz(product_path) 27 | ts_energy = get_energy_from_xyz(ts_path) 28 | 29 | if abs(reactant_energy - product_energy) < 5 * 0.0433634: # delta E below 5 kcal/mol 30 | return False 31 | 32 | if abs(ts_energy - reactant_energy) < 5 * 0.0433634: # reverse AE below 5 kcal/mol 33 | return False 34 | 35 | if abs(ts_energy - product_energy) < 5 * 0.0433634: # reverse AE below 5 kcal/mol 36 | return False 37 | 38 | return product_energy != ts_energy 39 | 40 | 41 | def is_transition_state(ts_file_path, threshold=50): #cm-1 42 | struc = read(ts_file_path) 43 | struc.calc = XTB(method="GFN2-xTB") 44 | 45 | try: 46 | vib = Vibrations(struc) 47 | vib.run() 48 | frequencies = vib.get_frequencies() 49 | vib.clean() 50 | 51 | # Filter out imaginary frequencies below the threshold 52 | significant_imaginary_freqs = np.count_nonzero(np.abs(np.imag(frequencies)) > threshold) 53 | 54 | return significant_imaginary_freqs == 1 55 | except: 56 | return False 57 | 58 | def main(args): 59 | 60 | print_args(args) 61 | 62 | input_path = args.input_path 63 | if not os.path.isdir(input_path): 64 | sys.exit(f"Error: '{input_path}' is not a directory.") 65 | output_path = args.output_path 66 | if not os.path.exists(output_path): 67 | os.mkdir(output_path) 68 | 69 | grown_seeds = [dirpath for dirpath, _, filenames in os.walk(input_path) if "converged" in filenames] 70 | grown_seeds_copy = grown_seeds 71 | # Group by mother string 72 | grouped_seeds = defaultdict(list) 73 | for seed in grown_seeds: 74 | mother_string = os.path.basename(seed)[:-8] # gsmGeom-m1-i1-c1-opt-gsm0044 -> gsmGeom-m1-i1-c1-opt 75 | grouped_seeds[mother_string].append(seed) 76 | rxn_list = [] 77 | 78 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 79 | for mother_string, seeds in tqdm(grouped_seeds.items(), desc="Mothers", position=0, bar_format=bar_format, ncols=70): 80 | idx = 0 81 | for f in tqdm(seeds, desc=f"Rxns in {mother_string}", position=1, bar_format=bar_format, ncols=70, leave=False): 82 | 83 | ts_file_path = os.path.join(f, 'transition_state.xyz') 84 | reactant_path = os.path.join(f, 'reactant.xyz') 85 | product_path = os.path.join(f, 'product.xyz') 86 | 87 | if not is_valid_rxn(reactant_path, product_path, ts_file_path): 88 | continue 89 | 90 | if not is_transition_state(ts_file_path): 91 | # print(f"Directory {f} is not a valid reaction. 
Skipping...") 92 | continue 93 | 94 | # If True, copy the directory 95 | new_name = os.path.join(output_path, f'{mother_string}-rxn{idx:04}') 96 | shutil.copytree(f, new_name) 97 | rxn_list.append(new_name) 98 | idx += 1 99 | 100 | with open(os.path.join(output_path, 'reactions.json'), 'w') as f: 101 | json.dump(rxn_list, f, indent=4) 102 | 103 | print(f'\n{len(rxn_list)}/{len(grown_seeds_copy)} rxns were saved to {output_path}/reactions.json') 104 | print('Filtering NEB finished!') 105 | 106 | 107 | def print_args(args): 108 | print() 109 | print("Arguments provided:") 110 | arg_dict = vars(args) 111 | for key, value in arg_dict.items(): 112 | print(f" {key}: {value}") 113 | print() 114 | 115 | def get_parser(): 116 | parser = argparse.ArgumentParser(description='Filter neb jobs and make reactions.json') 117 | 118 | parser.add_argument('-i', '--input_path', required=True, 119 | help='Input path of finished neb jobs') 120 | parser.add_argument('-o', '--output_path', required=True, 121 | help='Output path of filtered neb jobs') 122 | 123 | return parser 124 | 125 | 126 | if __name__ == "__main__": 127 | args = get_parser().parse_args() 128 | main(args) 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /dandelion/neb/run_neb.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import json 5 | import argparse 6 | from functools import partial 7 | from concurrent.futures import ProcessPoolExecutor 8 | 9 | import uuid 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | import matplotlib 14 | import matplotlib.pyplot as plt 15 | import imageio.v2 as imageio 16 | from PIL import Image, ImageOps 17 | 18 | import ase.db 19 | from ase.io import read, write 20 | from xtb.ase.calculator import XTB 21 | from ase.optimize.bfgs import BFGS 22 | from ase.utils.forcecurve import fit_images 23 | from ase.neb import NEB, NEBOptimizer, NEBTools 24 | from ase.calculators.orca import ORCA 25 | 26 | class SuppressStderr: 27 | def __enter__(self): 28 | self._original_stderr = sys.stderr 29 | sys.stderr = open(os.devnull, 'w') 30 | 31 | def __exit__(self, exc_type, exc_val, exc_tb): 32 | sys.stderr.close() 33 | sys.stderr = self._original_stderr 34 | 35 | def plot_mep(fit_list): 36 | fit_list[:,1,:] *= 23.0609 #to kcal/mol 37 | gray_scale = matplotlib.colormaps.get('binary', len(fit_list)) 38 | fig, ax = plt.subplots() 39 | for i in range(len(fit_list)): 40 | 41 | if i+1 == len(fit_list): 42 | ax.plot(fit_list[i,0,:], fit_list[i,1,:], color='red', linewidth=3) 43 | break 44 | 45 | color = gray_scale(max(i / len(fit_list), 0.1)) 46 | ax.plot(fit_list[i,0,:], fit_list[i,1,:], color=color) 47 | 48 | ax.set_title(f'Iter {len(fit_list)}') 49 | ax.set_axisbelow(True) 50 | ax.set_ylabel("Energy [kcal/mol]") 51 | ax.set_xlabel("Reaction Coordinate [AA]") 52 | return fig 53 | 54 | def get_fit(neb_tools): 55 | fit = fit_images(neb_tools.images) 56 | return fit.fit_path, fit.fit_energies 57 | 58 | class CalculationChecker: 59 | def __init__(self, neb): 60 | self.neb = neb 61 | 62 | def check_calculations(self): 63 | missing_calculations = [] 64 | for i, image in enumerate(self.neb.images[1:-1]): 65 | if {"forces", "energy"} - image.calc.results.keys(): 66 | missing_calculations.append(i) 67 | 68 | if missing_calculations: 69 | raise ValueError(f"missing calculation for image(s) {missing_calculations}") 70 | 71 | 72 | class DBWriter: 73 | def __init__(self, db_path, atomss): 74 | 
self.atomss = atomss 75 | self.db_path = db_path 76 | 77 | def write(self): 78 | with ase.db.connect(self.db_path) as db: 79 | for atoms in self.atomss: 80 | if atoms.calc.results: 81 | db.write(atoms, data=atoms.calc.results) 82 | 83 | 84 | def interpolate_band(atom_configs, transition_state=None): 85 | if transition_state: 86 | transition_state = read(transition_state) 87 | ts_positions = transition_state.get_positions() 88 | middle_idx = len(atom_configs) // 2 89 | atom_configs[middle_idx].set_positions(ts_positions) 90 | first_band = NEB(atom_configs[: middle_idx + 1]) 91 | second_band = NEB(atom_configs[middle_idx:]) 92 | first_band.interpolate("idpp") 93 | second_band.interpolate("idpp") 94 | else: 95 | band = NEB(atom_configs) 96 | band.interpolate("idpp") 97 | return atom_configs 98 | 99 | 100 | def max_dimensions(frames): 101 | """Get the maximum width and height among a list of images.""" 102 | max_width = max_height = 0 103 | for frame in frames: 104 | with Image.open(frame) as img: 105 | width, height = img.size 106 | max_width = max(max_width, width) 107 | max_height = max(max_height, height) 108 | return max_width, max_height 109 | 110 | def pad_image(image_path, target_size): 111 | """Pad an image to the target size.""" 112 | with Image.open(image_path) as img: 113 | img = ImageOps.expand(img, border=((target_size[0]-img.size[0])//2, 114 | (target_size[1]-img.size[1])//2, 115 | (target_size[0]-img.size[0]+1)//2, 116 | (target_size[1]-img.size[1]+1)//2), 117 | fill='white') # or another suitable color for your images 118 | return img 119 | 120 | def frames_to_gif(frames, output_gif): 121 | # First, render each Atoms frame to an image 122 | image_paths = [] 123 | for i, frame in enumerate(frames): 124 | img_path = f"tmp_frame_{i}_{uuid.uuid4()}.png" 125 | write(img_path, frame) 126 | image_paths.append(img_path) 127 | 128 | # Determine the max dimensions 129 | max_width, max_height = max_dimensions(image_paths) 130 | 131 | # Create a list to store processed frames 132 | processed_frames = [] 133 | 134 | # Pad each frame, ensuring a non-transparent background 135 | for img_path in image_paths: 136 | with Image.open(img_path) as opened_img: 137 | padded_frame = pad_image(img_path, (max_width, max_height)) 138 | 139 | # Create a white background and paste the frame onto it to ensure non-transparency 140 | background = Image.new('RGB', padded_frame.size, (255, 255, 255)) 141 | background.paste(padded_frame, mask=(padded_frame.split()[3] if len(padded_frame.split()) == 4 else None)) 142 | processed_frames.append(np.array(background)) 143 | 144 | # Extend the list of processed frames with a reversed copy (excluding the last frame) 145 | extended_frames = processed_frames + processed_frames[-2::-1] 146 | 147 | # Save the gif using imageio 148 | with imageio.get_writer(output_gif, mode='I', duration=0.5) as writer: 149 | for processed_frame in extended_frames: 150 | writer.append_data(processed_frame) 151 | 152 | # Cleanup the temporary image files 153 | for img_path in image_paths: 154 | os.remove(img_path) 155 | 156 | 157 | def process_seed(seed, n_images, neb_fmax, cineb_fmax, steps, output_path): 158 | 159 | with SuppressStderr(): # xTB is so noisy when not converged 160 | try: 161 | #print(f"Starting from seed : {seed}") 162 | reactant = os.path.join(seed, 'reactant.xyz') 163 | product = os.path.join(seed, 'product.xyz') 164 | transition_state = os.path.join(seed, 'ts.xyz') 165 | product = read(product) 166 | reactant = read(reactant) 167 | 168 | output = 
os.path.join(output_path, seed.split('/')[-2]+'-'+seed.split('/')[-1]) 169 | os.makedirs(output, exist_ok=True) 170 | atom_configs = [reactant.copy() for i in range(n_images - 1)] + [product] 171 | 172 | for i, atom_config in enumerate(atom_configs): 173 | atom_config.calc = XTB(method='GFN2-xTB') 174 | 175 | #print("Relaxing endpoints ... ") 176 | BFGS(atom_configs[0], logfile=None).run() 177 | BFGS(atom_configs[-1], logfile=None).run() 178 | 179 | #print("Interpolating band ... ") 180 | interpolate_band(atom_configs, transition_state) 181 | 182 | #print("Running NEB ... ") 183 | neb = NEB(atom_configs, climb=True, parallel=False) 184 | calculation_checker = CalculationChecker(neb) 185 | neb_tools = NEBTools(neb.images) 186 | 187 | relax_neb = NEBOptimizer(neb, logfile=None) 188 | db_writer = DBWriter(os.path.join(output, "neb.db"), atom_configs) 189 | fmaxs = [] 190 | fit_list = [] 191 | relax_neb.attach(calculation_checker.check_calculations) 192 | relax_neb.attach(db_writer.write) 193 | relax_neb.attach(lambda: fmaxs.append(neb_tools.get_fmax())) 194 | relax_neb.attach(lambda: fit_list.append(get_fit(neb_tools))) 195 | 196 | converged = relax_neb.run(fmax=neb_fmax, steps=steps) 197 | 198 | if not converged: 199 | raise 200 | 201 | #print("NEB has converged, turn on CI-NEB ...") 202 | neb.climb = True 203 | ci_converged = relax_neb.run(fmax=cineb_fmax, steps=steps) 204 | 205 | if ci_converged: 206 | open(os.path.join(output, "converged"), "w") 207 | #print("Reaction converged ... ") 208 | fit_list = np.array(fit_list) 209 | fig = plot_mep(fit_list) 210 | if ci_converged: 211 | np.save(os.path.join(output, "fitlist.npy"), fit_list) 212 | 213 | fig.savefig(os.path.join(output, "mep.png")) 214 | json.dump(fmaxs, open(os.path.join(output, "fmaxs.json"), "w"), indent=4) 215 | transition_state = max(atom_configs, key=lambda x: x.get_potential_energy()) 216 | write(os.path.join(output, "transition_state.xyz"), transition_state) 217 | write(os.path.join(output, "transition_state.png"), transition_state) 218 | write(os.path.join(output, "reactant.xyz"), atom_configs[0]) 219 | write(os.path.join(output, "reactant.png"), atom_configs[0]) 220 | write(os.path.join(output, "product.xyz"), atom_configs[-1]) 221 | write(os.path.join(output, "product.png"), atom_configs[-1]) 222 | write(os.path.join(output, "mep.xyz"), atom_configs) 223 | frames_to_gif(atom_configs, os.path.join(output, "mep.gif")) 224 | 225 | return seed 226 | 227 | except Exception as e: 228 | #print(f"Error processing seed {seed}: {e}") 229 | return None 230 | 231 | def main(args): 232 | 233 | print_args(args) 234 | 235 | input_path = args.input_path 236 | if not os.path.isdir(input_path): 237 | sys.exit(f"Error: '{input_path}' is not a directory.") 238 | output_path = args.output_path 239 | if not os.path.exists(output_path): 240 | os.mkdir(output_path) 241 | max_workers = args.max_workers 242 | n_images = args.n_images 243 | neb_fmax = args.neb_fmax 244 | cineb_fmax = args.cineb_fmax 245 | steps = args.steps 246 | 247 | 248 | seeds = [dirpath for dirpath, _, filenames in os.walk(input_path) if "ts.png" in filenames] 249 | 250 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 251 | # Use a partial function to pass the extra arguments to process_seed 252 | process_with_args = partial(process_seed, n_images=n_images, neb_fmax=neb_fmax, 253 | cineb_fmax=cineb_fmax, steps=steps, output_path=output_path) 254 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 255 | results = 
list(tqdm(executor.map(process_with_args, seeds), 256 | desc='Seeds', total=len(seeds), smoothing=0, bar_format=bar_format, ncols=70)) 257 | 258 | print('xTB-NEB completed!') 259 | 260 | def print_args(args): 261 | print() 262 | print("Arguments provided:") 263 | arg_dict = vars(args) 264 | for key, value in arg_dict.items(): 265 | print(f" {key}: {value}") 266 | print() 267 | 268 | 269 | def get_parser(): 270 | parser = argparse.ArgumentParser(description="Run NEB calculations on filtered gsm jobs") 271 | 272 | parser.add_argument('-i', '--input_path', type=str, required=True, 273 | help='Path of input directory containing filtered gsm jobs.') 274 | parser.add_argument('-o', '--output_path', type=str, required=True, 275 | help='Path of output directory to store results.') 276 | parser.add_argument('-n', '--max_workers', type=int, default=1, 277 | help='Number of processes to use for parallel execution.') 278 | parser.add_argument('--n_images', type=int, default=10, 279 | help='Number of images for NEB.') 280 | parser.add_argument('--neb_fmax', type=float, default=0.5, 281 | help='Fmax threshold for NEB.') 282 | parser.add_argument('--cineb_fmax', type=float, default=0.05, 283 | help='Fmax threshold for CI-NEB.') 284 | parser.add_argument('--steps', type=int, default=500, 285 | help='Maximum number of optimization steps.') 286 | 287 | return parser 288 | 289 | 290 | 291 | if __name__ == "__main__": 292 | args = get_parser().parse_args() 293 | main(args) 294 | -------------------------------------------------------------------------------- /dandelion/prep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/prep/__init__.py -------------------------------------------------------------------------------- /dandelion/prep/geom_opt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import argparse 5 | import warnings 6 | 7 | from ase.io import read 8 | from ase.optimize import BFGS 9 | from xtb.ase.calculator import XTB 10 | from tqdm import tqdm 11 | from concurrent.futures import ProcessPoolExecutor, as_completed 12 | 13 | def write_xyz(filename, atoms): 14 | with open(filename, 'w') as f: 15 | f.write(f"{len(atoms)}\n\n") 16 | for atom in atoms: 17 | f.write(f"{atom.symbol:<2} {atom.position[0]:15.8f} {atom.position[1]:15.8f} {atom.position[2]:15.8f}\n") 18 | 19 | def generate_eq_struc(atoms): 20 | atoms.calc = XTB(method="GFN2-xTB") 21 | with warnings.catch_warnings(): 22 | warnings.simplefilter("ignore") 23 | opt = BFGS(atoms, logfile=None) 24 | opt.run(fmax=1e-4) 25 | return atoms 26 | 27 | def process_file(input_file, output_dir): 28 | filename = os.path.basename(input_file) 29 | mol_dir = os.path.join(output_dir, os.path.splitext(filename)[0]) 30 | os.makedirs(mol_dir, exist_ok=True) 31 | 32 | # Copy original file 33 | shutil.copy(input_file, mol_dir) 34 | 35 | # Generate and save optimized structure 36 | atoms = read(input_file) 37 | optimized_atoms = generate_eq_struc(atoms) 38 | write_xyz(os.path.join(mol_dir, 'struc.xyz'), optimized_atoms) 39 | 40 | # Remove the original copied file 41 | os.remove(os.path.join(mol_dir, filename)) 42 | 43 | def main(args): 44 | print_args(args) 45 | 46 | input_path = os.path.abspath(args.input_path) 47 | if not os.path.isdir(input_path): 48 | sys.exit(f"Error: '{input_path}' is not a directory.") 49 | output_path = 
os.path.abspath(args.output_path) 50 | max_workers = args.max_workers 51 | 52 | # Get list of all .xyz files 53 | xyz_files = [] 54 | for root, _, files in os.walk(input_path): 55 | xyz_files.extend([os.path.join(root, f) for f in files if f.endswith('.xyz')]) 56 | 57 | # Process files in parallel with progress bar 58 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 59 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 60 | list(tqdm(executor.map(process_file, xyz_files, [output_path]*len(xyz_files)), 61 | total=len(xyz_files), desc="Optimizing structures", smoothing=0, bar_format=bar_format, ncols=70)) 62 | 63 | def print_args(args): 64 | print("\nArguments provided:") 65 | for key, value in vars(args).items(): 66 | print(f" {key}: {value}") 67 | print() 68 | 69 | def get_parser(): 70 | parser = argparse.ArgumentParser(description="Optimize geometries using xTB") 71 | parser.add_argument('-i', '--input_path', required=True, 72 | help="Path of the input reactants directory") 73 | parser.add_argument('-o', '--output_path', required=True, 74 | help='Path of output directory to store optimized geometries') 75 | parser.add_argument('-n', '--max_workers', type=int, default=1, 76 | help='Number of processes to use for parallel execution.') 77 | return parser 78 | 79 | if __name__ == "__main__": 80 | args = get_parser().parse_args() 81 | main(args) -------------------------------------------------------------------------------- /dandelion/prep/smiles_to_isoconfs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import subprocess 5 | 6 | from rdkit import Chem 7 | from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions 8 | 9 | def obabel_command(input_data, input_format, output_str, options=[], output_path=None): 10 | cmd = ['obabel', '-i', input_format] + input_data + ['-O', output_str] + options 11 | full_output_path = os.path.join(output_path, output_str) if output_path else output_str 12 | subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=output_path) 13 | return full_output_path 14 | 15 | def obabel_from_smiles(smiles_str, output_str, options=[], output_path=None): 16 | cmd = ['obabel', '-ismi', '-', '-O', output_str] + options 17 | full_output_path = os.path.join(output_path, output_str) if output_path else output_str 18 | process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, cwd=output_path) 19 | process.communicate(input=smiles_str.encode()) 20 | return full_output_path 21 | 22 | def cleanup_files(output_path, files_to_remove): 23 | for file in files_to_remove: 24 | file_path = os.path.join(output_path, file) 25 | if os.path.exists(file_path): 26 | os.remove(file_path) 27 | 28 | def main(args): 29 | print_args(args) 30 | 31 | input_path = os.path.abspath(args.input_path) 32 | if not os.path.isfile(input_path): 33 | sys.exit(f"Error: '{input_path}' is not a file.") 34 | output_path = os.path.abspath(args.output_path) 35 | 36 | if not os.path.exists(output_path): 37 | os.makedirs(output_path) 38 | 39 | with open(input_path, 'r') as f: 40 | lines = f.readlines() 41 | lines = list(map(lambda s: s.strip(), lines)) 42 | 43 | for m, mol_smi in enumerate(lines): 44 | print(f'==={m+1}th molecules : {mol_smi} ') 45 | mol = Chem.MolFromSmiles(mol_smi) 46 | opts = StereoEnumerationOptions(tryEmbedding=True, unique=True) 47 | isomers = 
tuple(EnumerateStereoisomers(mol, options=opts)) 48 | for i, isomer_smi in enumerate(Chem.MolToSmiles(x, isomericSmiles=True) for x in isomers): 49 | print(f'-{i+1}th isomer : {isomer_smi}') 50 | 51 | gen3d_file = obabel_from_smiles(isomer_smi, 'gen3d.xyz', ['--gen3d'], output_path=output_path) 52 | confab_file = obabel_command([os.path.basename(gen3d_file)], 'xyz', 'confab.sdf', ['--confab', '--rcutoff', '1.0'], output_path=output_path) 53 | obabel_command([os.path.basename(confab_file)], 'sdf', f'm{m+1}-i{i+1}-c.xyz', ['-m'], output_path=output_path) 54 | 55 | cleanup_files(output_path, ['confab.sdf', 'gen3d.xyz']) 56 | 57 | 58 | def print_args(args): 59 | print() 60 | print("Arguments provided:") 61 | arg_dict = vars(args) 62 | for key, value in arg_dict.items(): 63 | print(f" {key}: {value}") 64 | print() 65 | 66 | def get_parser(): 67 | parser = argparse.ArgumentParser(description="Generate Iso/Conformers from SMILES using RDkit and Obabel") 68 | 69 | parser.add_argument('-i', '--input_path', required=True, 70 | help="Path of the input SMILES string file") 71 | parser.add_argument('-o', '--output_path', type=str, required=True, 72 | help='Path of output directory to store Iso/Conformers.') 73 | return parser 74 | 75 | if __name__ == "__main__": 76 | args = get_parser().parse_args() 77 | main(args) -------------------------------------------------------------------------------- /dandelion/refine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/refine/__init__.py -------------------------------------------------------------------------------- /dandelion/refine/compile_refined.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | 5 | import h5py 6 | from ase.db import connect 7 | 8 | 9 | def main(args): 10 | 11 | print_args(args) 12 | 13 | input_path = args.input_path 14 | if not os.path.isfile(input_path): 15 | sys.exit(f"Error: '{input_path}' is not a file.") 16 | output_path = args.output_path 17 | 18 | # Data structure to hold the computed results 19 | rxn_data = {} 20 | 21 | rows = [] # List to store all rows 22 | 23 | # Extract data from ASE database 24 | with connect(input_path) as db: 25 | for row in db.select(): 26 | if hasattr(row, 'energy') and hasattr(row, 'forces'): 27 | rows.append(row) 28 | 29 | # Sort rows based on the unique_id number 30 | rows.sort(key=lambda r: int(r.data['unique_id'].split('_')[-1])) 31 | 32 | # Process sorted rows 33 | for row in rows: 34 | # Extract unique_id and other data 35 | unique_id = row.data['unique_id'] 36 | chem_group_name, rxn_group_name, index = unique_id.split('_') 37 | 38 | if chem_group_name not in rxn_data: 39 | rxn_data[chem_group_name] = {} 40 | 41 | if rxn_group_name not in rxn_data[chem_group_name]: 42 | rxn_data[chem_group_name][rxn_group_name] = { 43 | 'atomic_numbers': row.toatoms().numbers, 44 | 'energies': [], 45 | 'forces': [], 46 | 'positions': [] 47 | } 48 | rxn_data[chem_group_name][rxn_group_name]['energies'].append(row.energy) 49 | rxn_data[chem_group_name][rxn_group_name]['forces'].append(row.forces) 50 | rxn_data[chem_group_name][rxn_group_name]['positions'].append(row.toatoms().positions) 51 | 52 | # Save the data to an h5 file 53 | with h5py.File(output_path, 'w') as h5file: 54 | # Ensure the 'data' group exists 55 | if 'data' not in h5file: 56 | data_group = h5file.create_group('data') 57 | 
else: 58 | data_group = h5file['data'] 59 | 60 | # Iterate through the rxn_data dictionary to save datasets 61 | for chem_group_name in rxn_data: 62 | if chem_group_name not in data_group: 63 | chem_group = data_group.create_group(chem_group_name) 64 | else: 65 | chem_group = data_group[chem_group_name] 66 | 67 | for rxn_group_name, rxn_entry in rxn_data[chem_group_name].items(): 68 | if rxn_group_name not in chem_group: 69 | rxn_group = chem_group.create_group(rxn_group_name) 70 | else: 71 | rxn_group = chem_group[rxn_group_name] 72 | 73 | # Add datasets to the reaction group 74 | rxn_group.create_dataset('atomic_numbers', data=rxn_entry['atomic_numbers']) 75 | rxn_group.create_dataset('wB97x_6-31G(d).energy', data=rxn_entry['energies']) 76 | rxn_group.create_dataset('wB97x_6-31G(d).forces', data=rxn_entry['forces']) 77 | rxn_group.create_dataset('positions', data=rxn_entry['positions']) 78 | 79 | print('Compiled successfully!') 80 | 81 | def print_args(args): 82 | print() 83 | print("Arguments provided:") 84 | arg_dict = vars(args) 85 | for key, value in arg_dict.items(): 86 | print(f" {key}: {value}") 87 | print() 88 | 89 | def get_parser(): 90 | parser = argparse.ArgumentParser(description="Translate ase db file into hdf5 file.") 91 | 92 | parser.add_argument('-i', '--input_path', required=True, 93 | help="Path of the input wB97X ASE db file") 94 | parser.add_argument('-o', '--output_path', required=True, 95 | help="Path of the output wB97X hdf5 file") 96 | 97 | return parser 98 | 99 | if __name__ == "__main__": 100 | args = get_parser().parse_args() 101 | main(args) 102 | 103 | 104 | -------------------------------------------------------------------------------- /dandelion/refine/refine_forces.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | import argparse 5 | from itertools import repeat 6 | from concurrent.futures import ProcessPoolExecutor, as_completed 7 | 8 | import h5py 9 | from tqdm import tqdm 10 | from ase import Atoms 11 | from ase.db import connect 12 | from ase.calculators.orca import ORCA 13 | 14 | 15 | # thank you for https://github.com/ZimmermanGroup/ORCA-Basis-Sets 16 | custom_basis = ''' 17 | %basis 18 | newgto Br 19 | S 6 20 | 1 0.1137182000E+06 0.1717696000E-02 21 | 2 0.1707444000E+05 0.1316744000E-01 22 | 3 0.3889576000E+04 0.6504553000E-01 23 | 4 0.1097096000E+04 0.2269505000E+00 24 | 5 0.3520624000E+03 0.4768357000E+00 25 | 6 0.1207002000E+03 0.3583677000E+00 26 | S 6 27 | 1 0.2471138000E+04 0.2243687000E-02 28 | 2 0.5893838000E+03 0.2994853000E-01 29 | 3 0.1918738000E+03 0.1256009000E+00 30 | 4 0.7295339000E+02 -0.9832786000E-03 31 | 5 0.3005839000E+02 -0.6013141000E+00 32 | 6 0.1252927000E+02 -0.4913983000E+00 33 | P 6 34 | 1 0.2471138000E+04 0.3790182000E-02 35 | 2 0.5893838000E+03 0.2995979000E-01 36 | 3 0.1918738000E+03 0.1318228000E+00 37 | 4 0.7295339000E+02 0.3432708000E+00 38 | 5 0.3005839000E+02 0.4642345000E+00 39 | 6 0.1252927000E+02 0.2079387000E+00 40 | S 6 41 | 1 0.1096411000E+03 -0.5975683000E-02 42 | 2 0.3858948000E+02 0.5542122000E-01 43 | 3 0.1637818000E+02 0.2681200000E+00 44 | 4 0.7221836000E+01 -0.1543606000E+00 45 | 5 0.3263697000E+01 -0.7206306000E+00 46 | 6 0.1465499000E+01 -0.3316437000E+00 47 | P 6 48 | 1 0.1096411000E+03 -0.6907483000E-02 49 | 2 0.3858948000E+02 -0.3041432000E-01 50 | 3 0.1637818000E+02 0.4602725000E-01 51 | 4 0.7221836000E+01 0.3650689000E+00 52 | 5 0.3263697000E+01 0.4949232000E+00 53 | 6 0.1465499000E+01 0.2090394000E+00 54 | S 
3 55 | 1 0.2103651000E+01 0.3029029000E+00 56 | 2 0.7547050000E+00 -0.2152659000E+00 57 | 3 0.3005140000E+00 -0.9633941000E+00 58 | P 3 59 | 1 0.2103651000E+01 -0.2826714000E-01 60 | 2 0.7547050000E+00 0.3503065000E+00 61 | 3 0.3005140000E+00 0.7182446000E+00 62 | S 1 63 | 1 0.1090710000E+00 0.1000000000E+01 64 | P 1 65 | 1 0.1090710000E+00 0.1000000000E+01 66 | D 3 67 | 1 0.6225514000E+02 0.7704229000E-01 68 | 2 0.1731284000E+02 0.3707384000E+00 69 | 3 0.5607915000E+01 0.7097628000E+00 70 | D 1 71 | 1 0.1746486000E+01 1.0000000 72 | end 73 | end 74 | ''' 75 | 76 | 77 | 78 | class tqdm_hour(tqdm): 79 | """Provides an `hours per iteration` format parameter.""" 80 | @property 81 | def format_dict(self): 82 | d = super(tqdm_hour, self).format_dict 83 | rate_hr = '{:.1f}'.format(1/d["rate"] / 3600) if d["rate"] else '?' 84 | d.update(rate_hr=(rate_hr + ' hour/' + d['unit'])) 85 | return d 86 | 87 | class tqdm_minute(tqdm): 88 | """Provides a `minutes per iteration` format parameter""" 89 | @property 90 | def format_dict(self): 91 | d = super(tqdm_minute, self).format_dict 92 | rate_min = '{:.0f}'.format(1/d["rate"] / 60) if d["rate"] else '?' 93 | d.update(rate_min=(rate_min + ' min/' + d['unit'])) 94 | return d 95 | 96 | bar_format_hr = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_hr}{postfix}]' 97 | bar_format_min = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_min}{postfix}]' 98 | bar_format_points = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 99 | 100 | def get_unique_ids_from_db(output_path): 101 | """Extract all unique IDs from the ASE database.""" 102 | unique_ids = set() 103 | with connect(output_path) as db: 104 | for row in db.select(): 105 | data = row.data 106 | if "unique_id" in data: 107 | unique_ids.add(data['unique_id']) 108 | return unique_ids 109 | 110 | def already_calculated(unique_id, unique_id_list): 111 | """Check if a unique ID has already been processed.""" 112 | return unique_id in unique_id_list 113 | 114 | def compute_force(coord, atomic_numbers, unique_id, output_path): 115 | """Compute forces using ORCA for a given set of coordinates.""" 116 | atoms = Atoms(positions=coord, numbers=atomic_numbers) 117 | atoms.calc = ORCA( 118 | label=os.path.join(os.path.dirname(output_path), f"orca/{unique_id}/{unique_id}"), 119 | orcasimpleinput="wB97X 6-31G(d) NoTrah", 120 | orcablocks=custom_basis 121 | ) 122 | try: 123 | # Forces and energy will be stored in the calculator of the Atoms object. 
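        # Calling get_forces() below triggers a single-point ORCA run: the ASE
        # ORCA calculator writes an input file from orcasimpleinput/orcablocks,
        # executes the command given by the ASE_ORCA_COMMAND environment
        # variable (set in main()), and parses the resulting output/engrad
        # files. Energy and forces are then cached on atoms.calc, so the Atoms
        # object returned here can later be written to the ASE database and
        # expose row.energy / row.forces without re-running the calculation.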
124 | atoms.get_forces() 125 | return atoms 126 | except Exception as e: 127 | # Log the error 128 | logging.error(f"Error in computing forces for unique_id {unique_id}: {e}") 129 | return None 130 | 131 | def accumulate_files_for_deletion(unique_id, output_path, files_to_delete, file_exts=['gbw', 'engrad', 'densities', 'ase']): 132 | for ext in file_exts: 133 | file_path = os.path.join(os.path.dirname(output_path), f"orca/{unique_id}/{unique_id}.{ext}") 134 | if os.path.exists(file_path): 135 | files_to_delete.add(file_path) 136 | 137 | def main(args): 138 | """Main function to orchestrate the computations and database writing.""" 139 | print_args(args) 140 | 141 | input_path = args.input_path 142 | if not os.path.isfile(input_path): 143 | sys.exit(f"Error: '{input_path}' is not a file.") 144 | output_path = args.output_path 145 | max_workers = args.max_workers 146 | orcabinary = args.orca 147 | 148 | os.environ["ASE_ORCA_COMMAND"] = f"{orcabinary} PREFIX.inp > PREFIX.out 2>&1" 149 | 150 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 151 | 152 | log_file_path = os.path.join(os.path.dirname(output_path), 'orca_errors.log') 153 | logging.basicConfig(filename=log_file_path, level=logging.ERROR, 154 | format='%(asctime)s %(levelname)s: %(message)s', 155 | datefmt='%Y-%m-%d %H:%M:%S') 156 | 157 | if os.path.isfile(output_path): 158 | print(f'Restarting calculation from {output_path}') 159 | is_restart = True 160 | else: 161 | print(f'Created db file at {output_path}\n') 162 | is_restart = False 163 | 164 | 165 | unique_ids_from_db = get_unique_ids_from_db(output_path) 166 | if is_restart: 167 | print(f'{len(unique_ids_from_db)} points are skipped.\n') 168 | 169 | files_to_delete = set() # Set to accumulate files for deletion 170 | 171 | # Read from the input HDF5 file and compute the energies and forces. 172 | with h5py.File(input_path, 'r') as f: 173 | 174 | for chem_group_name, chem_group in tqdm_hour(f['data'].items(), 175 | desc="Formulas", 176 | position=0, 177 | smoothing=1, 178 | bar_format=bar_format_hr, 179 | ncols=70): 180 | 181 | for rxn_group_name, rxn_group in tqdm_minute(chem_group.items(), 182 | desc=f"Rxns in {chem_group_name}", 183 | leave=False, 184 | position=1, 185 | smoothing=1, 186 | bar_format=bar_format_min, 187 | ncols=70): 188 | 189 | positions_dataset = rxn_group['positions'] 190 | coords = [coord for coord in positions_dataset] 191 | atomic_numbers = rxn_group['atomic_numbers'][:] 192 | args_atomic_numbers = repeat(atomic_numbers, len(coords)) 193 | unique_ids = [f"{chem_group_name}_{rxn_group_name}_{index}" for index, _ in enumerate(positions_dataset)] 194 | 195 | # Parallel computation using ProcessPoolExecutor. 196 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 197 | future_to_unique_id = {executor.submit(compute_force, coord, atomic_number, unique_id, output_path): unique_id 198 | for coord, atomic_number, unique_id in zip(coords, args_atomic_numbers, unique_ids) 199 | if not already_calculated(unique_id, unique_ids_from_db)} 200 | 201 | batch_size = max_workers # Batch size set to the number of workers 202 | results_batch = [] 203 | 204 | # Process the completed tasks. 
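                # Futures are consumed in completion order (not submission order).
                # Each finished Atoms object is buffered together with its
                # unique_id and flushed to the ASE database once the buffer
                # reaches batch_size (= max_workers), keeping db transactions
                # infrequent. Storing unique_id in row.data is what lets a
                # restarted run skip finished points via
                # get_unique_ids_from_db() / already_calculated().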
205 | for future in tqdm(as_completed(future_to_unique_id), 206 | total=len(future_to_unique_id), 207 | desc=f"Samples in {rxn_group_name}", 208 | leave=False, 209 | position=2, 210 | smoothing=0, 211 | bar_format=bar_format_points, 212 | ncols=70): 213 | 214 | unique_id = future_to_unique_id[future] 215 | atoms_result = future.result() # Finished ASE Atoms object 216 | if atoms_result is not None: 217 | results_batch.append((atoms_result, {'unique_id': unique_id})) 218 | accumulate_files_for_deletion(unique_id, output_path, files_to_delete) 219 | 220 | # Write to database in batches 221 | if len(results_batch) >= batch_size: 222 | with connect(output_path) as db: 223 | for atoms, data in results_batch: 224 | db.write(atoms, data=data) 225 | results_batch.clear() 226 | 227 | # Write any remaining results in the batch 228 | for atoms, data in results_batch: 229 | with connect(output_path) as db: 230 | db.write(atoms, data=data) 231 | results_batch.clear() 232 | 233 | for file_path in files_to_delete: 234 | os.remove(file_path) 235 | 236 | print('wB97X calculation finished!') 237 | 238 | def print_args(args): 239 | print() 240 | print("Arguments provided:") 241 | arg_dict = vars(args) 242 | for key, value in arg_dict.items(): 243 | print(f" {key}: {value}") 244 | print() 245 | 246 | def get_parser(): 247 | parser = argparse.ArgumentParser(description="Compute energies and forces and store in ASE database") 248 | 249 | parser.add_argument('-i', '--input_path', required=True, 250 | help="Path of the input XTB HDF5 file") 251 | parser.add_argument('-o', '--output_path', required=True, 252 | help="Path of the output wB97X ASE database") 253 | parser.add_argument('-n', '--max_workers', type=int, default=1, 254 | help="Number of worker processes") 255 | parser.add_argument('--orca', required=True, 256 | help="Path of the orca binary file") 257 | 258 | return parser 259 | 260 | if __name__ == "__main__": 261 | args = get_parser().parse_args() 262 | main(args) 263 | 264 | -------------------------------------------------------------------------------- /dandelion/segsm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/segsm/__init__.py -------------------------------------------------------------------------------- /dandelion/segsm/ard_gsm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mhyeok1/dand/6c9ff246047ff37c89ab2b338f236e4798b3e2c2/dandelion/segsm/ard_gsm/__init__.py -------------------------------------------------------------------------------- /dandelion/segsm/ard_gsm/driving_coords.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import itertools 5 | 6 | from .mol import Connection 7 | from .limits import connection_limits 8 | 9 | 10 | class ConnectionError(Exception): 11 | """ 12 | For any invalid connection changes that occur in MolGraph. 
13 | """ 14 | pass 15 | 16 | 17 | class DrivingCoords(object): 18 | def __init__(self, break_idxs=None, form_idxs=None): 19 | self._break_idxs = break_idxs or set() 20 | self._form_idxs = form_idxs or set() 21 | 22 | self.remove_duplicates() 23 | 24 | def __str__(self): 25 | s = '' 26 | for idxs in self._break_idxs: 27 | s += 'BREAK {0[0]} {0[1]}\n'.format(idxs) 28 | for idxs in self._form_idxs: 29 | s += 'ADD {0[0]} {0[1]}\n'.format(idxs) 30 | return s 31 | 32 | def __eq__(self, other): 33 | return str(self) == str(other) 34 | 35 | def __ne__(self, other): 36 | return not self == other 37 | 38 | def __hash__(self): 39 | return hash(str(self)) 40 | 41 | def reconstruct_from_str(self, s): 42 | self._break_idxs = set() 43 | self._form_idxs = set() 44 | for line in s.splitlines(): 45 | if 'BREAK' in line: 46 | idxs = [int(idx) for idx in line.split()[1:]] 47 | self.add_break_idxs(idxs) 48 | elif 'ADD' in line: 49 | idxs = [int(idx) for idx in line.split()[1:]] 50 | self.add_form_idxs(idxs) 51 | 52 | def remove_duplicates(self): 53 | self._break_idxs = {tuple(sorted(idxs)) for idxs in self._break_idxs} 54 | self._form_idxs = {tuple(sorted(idxs)) for idxs in self._form_idxs} 55 | 56 | def add_break_idxs(self, idxs): 57 | self._break_idxs.add(tuple(sorted(idxs))) 58 | 59 | def add_form_idxs(self, idxs): 60 | self._form_idxs.add(tuple(sorted(idxs))) 61 | 62 | def is_subset(self, other): 63 | """ 64 | Return True if self is contained in other. 65 | """ 66 | for idxs in self._break_idxs: 67 | if idxs not in other._break_idxs: 68 | return False 69 | for idxs in self._form_idxs: 70 | if idxs not in other._form_idxs: 71 | return False 72 | return True 73 | 74 | def get_connections(self, atoms): 75 | atoms_dict = {} 76 | for atom in atoms: 77 | if atom.idx is None: 78 | raise Exception('Atom {} is missing index'.format(atom.symbol)) 79 | else: 80 | atoms_dict[atom.idx] = atom 81 | 82 | connections_break, connections_form = [], [] 83 | for idxs in self._break_idxs: 84 | connection = Connection(atoms_dict[idxs[0]], atoms_dict[idxs[1]]) 85 | connections_break.append(connection) 86 | for idxs in self._form_idxs: 87 | connection = Connection(atoms_dict[idxs[0]], atoms_dict[idxs[1]]) 88 | connections_form.append(connection) 89 | 90 | return connections_break, connections_form 91 | 92 | 93 | def generate_driving_coords(mol, maxbreak=3, maxform=3, maxchange=5, single_change=True, equiv_Hs=False, 94 | minbreak=0, minform=0, minchange=1): 95 | """ 96 | Generate the set of possible driving coordinates given a molecule. Only 97 | consider breaking a maximum of `maxbreak`, forming a maximum of `maxform`, 98 | and in total changing a maximum of `maxchange` connections (molecular 99 | bonds are considered without regard for the bond order). If `single_change` 100 | is true, consider driving coordinates for (nbreak,nform) in ((0,1),(1,0)) 101 | in addition to the other ones. If `equiv_Hs` is true, generate essentially 102 | equivalent driving coordinates for different but equivalent hydrogens, 103 | i.e., those attached to the same non-cyclic tetrahedral carbon. 104 | 105 | Can also specify minbreak, minform, and minchange. 
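
    Illustrative usage, mirroring how create_gsm.py calls this function
    (the number of seeds depends on the molecule):

        mol = MolGraph(symbols=symbols, coords=coords)
        mol.infer_connections()
        seeds = generate_driving_coords(mol, maxbreak=2, maxform=2, maxchange=3)
        # each seed is a DrivingCoords object; str(seed) gives the
        # "BREAK i j" / "ADD i j" lines written to ISOMERS.txt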
106 | """ 107 | assert all(atom.idx is not None for atom in mol.atoms) 108 | driving_coords_set = set() 109 | 110 | mol = mol.copy(deep=True) 111 | if not equiv_Hs: 112 | mol.label_equivalent_hydrogens() 113 | 114 | # Enumerate all possible connections between atoms 115 | # and remove the ones for atoms that are already connected 116 | atoms = mol.atoms 117 | connections = mol.get_all_connections() 118 | all_possible_connections = [Connection(atom1, atom2) 119 | for i, atom1 in enumerate(atoms) 120 | for atom2 in atoms[(i+1):] 121 | if not atom1.frozen and not atom2.frozen] 122 | all_potential_new_connections = [connection for connection in all_possible_connections 123 | if connection not in connections] 124 | 125 | for nbreak in range(minbreak, maxbreak+1): 126 | for nform in range(minform, maxform+1): 127 | if nbreak + nform < minchange: 128 | continue 129 | elif nbreak + nform > maxchange: 130 | continue 131 | elif not single_change and (nbreak + nform == 1): 132 | continue 133 | 134 | # Generate all possible combinations of connection changes 135 | potential_remove_connections_iter = itertools.combinations(connections, nbreak) 136 | potential_new_connections_iter = itertools.combinations(all_potential_new_connections, nform) 137 | potential_connection_changes = itertools.product(potential_remove_connections_iter, 138 | potential_new_connections_iter) 139 | 140 | for connections_to_break, connections_to_form in potential_connection_changes: 141 | try: 142 | change_connections(mol, connections_to_break, connections_to_form) 143 | except ConnectionError: 144 | continue 145 | else: 146 | break_idxs = [(c.atom1.idx, c.atom2.idx) for c in connections_to_break] 147 | form_idxs = [(c.atom1.idx, c.atom2.idx) for c in connections_to_form] 148 | driving_coords = DrivingCoords(break_idxs=break_idxs, form_idxs=form_idxs) 149 | driving_coords_set.add(driving_coords) 150 | finally: 151 | # Always restore connections for next molecule test 152 | change_connections(mol, connections_to_form, connections_to_break, test_validity=False) 153 | 154 | return driving_coords_set 155 | 156 | 157 | def change_connections(mol, connections_to_break, connections_to_form, test_validity=True): 158 | for connection in connections_to_break: 159 | mol.remove_connection(connection) 160 | for connection in connections_to_form: 161 | mol.add_connection(connection) 162 | 163 | if test_validity: 164 | # Only have to test the atoms involved in the changed connections 165 | for connection in connections_to_break: 166 | if not test_connection_validity(connection): 167 | raise ConnectionError('Breaking {} resulted in violation of connection limits'.format(connection)) 168 | for connection in connections_to_form: 169 | if not test_connection_validity(connection): 170 | raise ConnectionError('Forming {} resulted in violation of connection limits'.format(connection)) 171 | 172 | 173 | def test_connection_validity(connection): 174 | atom1 = connection.atom1 175 | atom2 = connection.atom2 176 | atom1_ll, atom1_ul = connection_limits[atom1.symbol.upper()] 177 | atom2_ll, atom2_ul = connection_limits[atom2.symbol.upper()] 178 | if len(atom1.connections) < atom1_ll or len(atom1.connections) > atom1_ul: 179 | return False 180 | elif len(atom2.connections) < atom2_ll or len(atom2.connections) > atom2_ul: 181 | return False 182 | else: 183 | return True 184 | -------------------------------------------------------------------------------- /dandelion/segsm/ard_gsm/limits.py: 
-------------------------------------------------------------------------------- 1 | # First number is the minimum number of connections; 2 | # second number is the maximum number of connections. 3 | connection_limits = { 4 | 'H': (1, 1), 5 | 'C': (2, 4), 6 | 'N': (1, 3), 7 | 'O': (1, 2), 8 | 'F': (1, 1), 9 | 'S': (1, 4), 10 | 'CL':(1, 1), 11 | 'BR':(1, 1), 12 | 'LI':(0, 1) 13 | } -------------------------------------------------------------------------------- /dandelion/segsm/ard_gsm/mol.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import os 5 | 6 | import numpy as np 7 | from openbabel import pybel 8 | from rdkit import Chem 9 | from rdkit.Chem import AllChem, GetPeriodicTable 10 | import networkx as nx 11 | _rdkit_periodic_table = GetPeriodicTable() 12 | 13 | 14 | class SanitizationError(Exception): 15 | """ 16 | Exception class to handle errors during SMILES perception. 17 | """ 18 | pass 19 | 20 | 21 | class Atom(object): 22 | """ 23 | Represents an atom in a molecular graph. 24 | """ 25 | 26 | def __init__(self, symbol=None, idx=None, coords=np.array([]), frozen=False): 27 | self.symbol = symbol 28 | self.idx = idx 29 | self.coords = coords 30 | self.frozen = frozen 31 | self.connections = {} 32 | 33 | def __str__(self): 34 | return '{}: {}'.format(self.idx, self.symbol) 35 | 36 | def __repr__(self): 37 | return ''.format(str(self)) 38 | 39 | def copy(self): 40 | return Atom( 41 | symbol=self.symbol, 42 | idx=self.idx, 43 | coords=self.coords.copy(), 44 | frozen=self.frozen, 45 | ) 46 | 47 | def get_atomicnum(self): 48 | return _rdkit_periodic_table.GetAtomicNumber(self.symbol) 49 | 50 | def get_cov_rad(self): 51 | return _rdkit_periodic_table.GetRcovalent(self.symbol) 52 | 53 | 54 | class Connection(object): 55 | """ 56 | Represents a connection in a molecular graph. 57 | 58 | Note: Equality and hash are only based on atom symbols and indices. 59 | """ 60 | 61 | def __init__(self, atom1, atom2): 62 | self._atom1 = atom1 63 | self._atom2 = atom2 64 | self._make_order_invariant() 65 | 66 | def __str__(self): 67 | return '({})--({})'.format(str(self.atom1), str(self.atom2)) 68 | 69 | def __repr__(self): 70 | return ''.format(str(self)) 71 | 72 | def __eq__(self, other): 73 | return str(self) == str(other) 74 | 75 | def __ne__(self, other): 76 | return not self == other 77 | 78 | def __hash__(self): 79 | return hash(str(self)) 80 | 81 | def _make_order_invariant(self): 82 | # Ensure that atom ordering is consistent 83 | atoms = [self._atom1, self._atom2] 84 | atoms.sort(key=lambda a: a.symbol) 85 | if self._atom1.idx is not None or self._atom2.idx is not None: 86 | atoms.sort(key=lambda a: a.idx) 87 | self._atom1, self._atom2 = atoms 88 | 89 | @property 90 | def atom1(self): 91 | return self._atom1 92 | 93 | @property 94 | def atom2(self): 95 | return self._atom2 96 | 97 | @atom1.setter 98 | def atom1(self, val): 99 | self._atom1 = val 100 | self._make_order_invariant() 101 | 102 | @atom2.setter 103 | def atom2(self, val): 104 | self._atom2 = val 105 | self._make_order_invariant() 106 | 107 | def copy(self): 108 | return Connection(self.atom1, self.atom2) 109 | 110 | 111 | class MolGraph(object): 112 | """ 113 | Class to convert coordinates to a molecular graph 114 | and to generate driving coordinates. 115 | 116 | Note: Atom indices start at 1. 
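
    Typical construction, as done in create_gsm.py and filter_gsm.py (shown
    here only as an illustration; atoms is an ASE Atoms object):

        mol = MolGraph(symbols=atoms.get_chemical_symbols(),
                       coords=atoms.get_positions())
        mol.infer_connections()          # connectivity from the 3D geometry
        smiles = mol.perceive_smiles()   # atom-mapped SMILES via Open Babel/RDKit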
117 | """ 118 | 119 | def __init__(self, atoms=None, symbols=None, coords=None, energy=None): 120 | self.atoms = atoms or [] 121 | self.energy = energy 122 | 123 | if not self.atoms and symbols is not None: 124 | for idx, symbol in enumerate(symbols): 125 | atom = Atom(symbol=symbol, idx=idx+1) 126 | self.add_atom(atom) 127 | 128 | if coords is not None: 129 | self.set_coords(coords) 130 | 131 | def __iter__(self): 132 | for atom in self.atoms: 133 | yield atom 134 | 135 | def get_formula(self): 136 | """ 137 | Return the molecular formula corresponding to the graph. 138 | """ 139 | # Count the numbers of each element 140 | elements = {} 141 | for atom in self: 142 | symbol = atom.symbol 143 | elements[symbol] = elements.get(symbol, 0) + 1 144 | 145 | # Carbon and hydrogen come first if carbon is present, other 146 | # atoms come in alphabetical order (also hydrogen if there is no 147 | # carbon) 148 | formula = '' 149 | if 'C' in elements.keys(): 150 | count = elements['C'] 151 | formula += 'C{:d}'.format(count) if count > 1 else 'C' 152 | del elements['C'] 153 | if 'H' in elements.keys(): 154 | count = elements['H'] 155 | formula += 'H{:d}'.format(count) if count > 1 else 'H' 156 | del elements['H'] 157 | keys = elements.keys() 158 | keys.sort() 159 | for key in keys: 160 | count = elements[key] 161 | formula += '{}{:d}'.format(key, count) if count > 1 else key 162 | 163 | return formula 164 | 165 | def to_rdkit_mol(self): 166 | """ 167 | Convert the graph to an RDKit molecule with atom map numbers set 168 | by the indices of the atoms. 169 | """ 170 | assert all(atom.idx is not None for atom in self) 171 | 172 | rd_mol = Chem.rdchem.EditableMol(Chem.rdchem.Mol()) 173 | for atom in self: 174 | rd_atom = Chem.rdchem.Atom(atom.symbol) 175 | rd_atom.SetAtomMapNum(atom.idx) 176 | rd_mol.AddAtom(rd_atom) 177 | 178 | for atom1 in self: 179 | for atom2, connection in atom1.connections.items(): 180 | idx1 = self.atoms.index(atom1) # This is the index in the atoms list 181 | idx2 = self.atoms.index(atom2) 182 | if idx1 < idx2: 183 | rd_mol.AddBond(idx1, idx2, Chem.rdchem.BondType.SINGLE) 184 | 185 | rd_mol = rd_mol.GetMol() 186 | return rd_mol 187 | 188 | def to_pybel_mol(self, from_coords=True): 189 | """ 190 | Convert the graph to a Pybel molecule. Currently only supports 191 | creating the molecule from 3D coordinates. 192 | """ 193 | if from_coords: 194 | xyz = self.to_xyz() 195 | mol = pybel.readstring('xyz', xyz) 196 | return mol 197 | else: 198 | raise NotImplementedError('Can only create Pybel molecules from 3D structure') 199 | 200 | def to_xyz(self, comment=''): 201 | """ 202 | Convert the graph to an XYZ-format string. Optionally, add 203 | comment on the second line. 204 | """ 205 | for atom in self: 206 | assert len(atom.coords) != 0 207 | symbols, coords = self.get_geometry() 208 | cblock = ['{0} {1[0]: .10f} {1[1]: .10f} {1[2]: .10f}'.format(s, c) for s, c in zip(symbols, coords)] 209 | return str(len(symbols)) + '\n' + comment + '\n' + '\n'.join(cblock) 210 | 211 | def perceive_smiles(self, atommap=True): 212 | """ 213 | Using the geometry, perceive the corresponding SMILES with bond 214 | orders using Open Babel and RDKit. 
In order to create a sensible 215 | SMILES, first infer the connectivity from the 3D coordinates 216 | using Open Babel, then convert to InChI to saturate unphysical 217 | multi-radical structures, then convert to RDKit and match the 218 | atoms to the ones in self in order to return a SMILES with atom 219 | mapping corresponding to the order given by the values of 220 | atom.idx for all atoms in self. 221 | 222 | This method requires Open Babel version >=2.4.1 223 | """ 224 | 225 | # Get dict of atomic numbers for later comparison. 226 | atoms_in_mol_true = {} 227 | for atom in self: 228 | anum = atom.get_atomicnum() 229 | atoms_in_mol_true[anum] = atoms_in_mol_true.get(anum, 0) + 1 230 | 231 | # There seems to be no particularly simple way in RDKit to read 232 | # in 3D structures, so use Open Babel for this part. RMG doesn't 233 | # recognize some single bonds, so we can't use that. 234 | # We've probably called to_pybel_mol at some previous time to set 235 | # connections, but it shouldn't be too expensive to do it again. 236 | pybel_mol = self.to_pybel_mol() 237 | 238 | # Open Babel will often make single bonds and generate Smiles 239 | # that have multiple radicals, which would probably correspond 240 | # to double bonds. To get around this, convert to InChI (which 241 | # does not consider bond orders) and then convert to Smiles. 242 | inchi = pybel_mol.write('inchi', opt={'F': None}).strip() # Add fixed H layer 243 | 244 | # Use RDKit to convert back to Smiles 245 | mol_sanitized = Chem.MolFromInchi(inchi) 246 | 247 | # RDKit doesn't like some hypervalent atoms 248 | if mol_sanitized is None: 249 | raise SanitizationError( 250 | 'Could not convert \n{}\nto Smiles. Unsanitized Smiles: {}'.format(self.to_xyz(), 251 | pybel_mol.write('smi').strip()) 252 | ) 253 | 254 | # RDKit adds unnecessary hydrogens in some cases. If 255 | # this happens, give up and return an error. 256 | mol_sanitized = Chem.AddHs(mol_sanitized) 257 | atoms_in_mol_sani = {} 258 | for atom in mol_sanitized.GetAtoms(): 259 | atoms_in_mol_sani[atom.GetAtomicNum()] = atoms_in_mol_sani.get(atom.GetAtomicNum(), 0) + 1 260 | if atoms_in_mol_sani != atoms_in_mol_true: 261 | raise SanitizationError( 262 | 'Could not convert \n{}\nto Smiles. Wrong Smiles: {}'.format(self.to_xyz(), 263 | Chem.MolToSmiles(mol_sanitized)) 264 | ) 265 | 266 | if not atommap: 267 | return Chem.MolToSmiles(mol_sanitized) 268 | 269 | # Because we went through InChI, we lost atom mapping 270 | # information. Restore it by matching the original molecule. 271 | # There should only be one unique map. 272 | mol_with_map = self.to_rdkit_mol() # This only has single bonds 273 | mol_sani_sb = Chem.Mol(mol_sanitized) # Make copy with single bonds only 274 | for bond in mol_sani_sb.GetBonds(): 275 | bond.SetBondType(Chem.rdchem.BondType.SINGLE) 276 | match = mol_sani_sb.GetSubstructMatch(mol_with_map) # Isomorphism mapping 277 | assert mol_with_map.GetNumAtoms() == len(match) # Make sure we match all atoms 278 | for atom in mol_with_map.GetAtoms(): 279 | idx = match[atom.GetIdx()] 280 | map_num = atom.GetAtomMapNum() 281 | mol_sanitized.GetAtomWithIdx(idx).SetAtomMapNum(map_num) 282 | 283 | # If everything succeeded up to here, we hopefully have a 284 | # sensible Smiles string with atom mappings for all atoms. 
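        # The atom map numbers embedded in the returned SMILES are the atom.idx
        # values of self, transferred back through the substructure match above.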
285 | return Chem.MolToSmiles(mol_sanitized) 286 | 287 | def add_atom(self, atom): 288 | self.atoms.append(atom) 289 | atom.connections = {} 290 | return atom 291 | 292 | def add_connection(self, connection=None, atom1=None, atom2=None): 293 | """ 294 | Either add a connection directly or first create one from two 295 | atoms and then add it. 296 | """ 297 | if connection is None: 298 | connection = Connection(atom1, atom2) 299 | if connection.atom1 not in self.atoms or connection.atom2 not in self.atoms: 300 | raise Exception('Cannot add connection between atoms not in the graph') 301 | else: 302 | connection.atom1.connections[connection.atom2] = connection 303 | connection.atom2.connections[connection.atom1] = connection 304 | return connection 305 | 306 | def get_all_connections(self): 307 | return {connection for atom in self.atoms for connection in atom.connections.values()} 308 | 309 | def get_connection(self, atom1, atom2): 310 | if atom1 not in self.atoms or atom2 not in self.atoms: 311 | raise Exception('One or both of the specified atoms are not in this graph') 312 | 313 | try: 314 | return atom1.connections[atom2] 315 | except KeyError: 316 | raise Exception('The specified atoms are not connected in this graph') 317 | 318 | def remove_atom(self, atom): 319 | for atom2 in atom.connections: 320 | del atom2.connections[atom] 321 | atom.connections = {} 322 | self.atoms.remove(atom) 323 | 324 | def remove_connection(self, connection): 325 | if connection.atom1 not in self.atoms or connection.atom2 not in self.atoms: 326 | raise Exception('Cannot remove connection between atoms not in the graph') 327 | del connection.atom1.connections[connection.atom2] 328 | del connection.atom2.connections[connection.atom1] 329 | 330 | def copy(self, deep=False): 331 | other = MolGraph(energy=self.energy) 332 | atoms = self.atoms 333 | mapping = {} 334 | for atom in atoms: 335 | if deep: 336 | atom2 = other.add_atom(atom.copy()) 337 | mapping[atom] = atom2 338 | else: 339 | connections = atom.connections 340 | other.add_atom(atom) 341 | atom.connections = connections 342 | if deep: 343 | for atom1 in atoms: 344 | for atom2 in atom1.connections: 345 | connection = atom1.connections[atom2] 346 | connection = connection.copy() 347 | connection.atom1 = mapping[atom1] 348 | connection.atom2 = mapping[atom2] 349 | other.add_connection(connection) 350 | return other 351 | 352 | def merge(self, other): 353 | new = MolGraph() 354 | for atom in self.atoms: 355 | connections = atom.connections 356 | new.add_atom(atom) 357 | atom.connections = connections 358 | for atom in other.atoms: 359 | connections = atom.connections 360 | new.add_atom(atom) 361 | atom.connections = connections 362 | new.energy = self.energy + other.energy 363 | return new 364 | 365 | def split(self): 366 | new1 = self.copy() 367 | new2 = MolGraph() 368 | 369 | if len(self.atoms) == 0: 370 | return [new1] 371 | 372 | atoms_to_move = [self.atoms[-1]] 373 | idx = 0 374 | while idx < len(atoms_to_move): 375 | for atom2 in atoms_to_move[idx].connections: 376 | if atom2 not in atoms_to_move: 377 | atoms_to_move.append(atom2) 378 | idx += 1 379 | 380 | if len(new1.atoms) == len(atoms_to_move): 381 | return [new1] 382 | 383 | for atom in atoms_to_move: 384 | new2.atoms.append(atom) 385 | new1.atoms.remove(atom) 386 | 387 | new = [new2] 388 | new.extend(new1.split()) 389 | new.energy = None 390 | return new 391 | 392 | def sort_atoms(self): 393 | self.atoms.sort(key=lambda a: a.idx) 394 | 395 | def is_radical(self): 396 | """ 397 | Determine 
whether or not the molecule is a radical based on the number 398 | of valence electrons for each atom. If the total number of valence 399 | electrons is odd, then it is a radical. This assumes that molecules 400 | with an even number of electrons are singlets. This method also assumes 401 | that none of the atoms are charged. 402 | """ 403 | valence_electrons = {'H': 1, 'C': 4, 'N': 5, 'O': 6, 'F': 7, 'P': 5, 'S': 6, 'Cl': 7, 'Br': 7, 'I': 7, 'Li':1} 404 | symbols = [atom.symbol for atom in self] 405 | total_valence_electrons = sum(valence_electrons[s] for s in symbols) 406 | return bool(total_valence_electrons % 2) 407 | 408 | # def is_isomorphic(self, other): 409 | # """ 410 | # Test if self is isomorphic with other, ignoring atom indices. 411 | # Requires RMG to do the isomorphism check. 412 | # """ 413 | # self_rmg = self.to_rmg_mol() 414 | # other_rmg = other.to_rmg_mol() 415 | # return self_rmg.isIsomorphic(other_rmg) 416 | 417 | def topology_from_rdkit(self): 418 | rdkit_molecule = self.to_rdkit_mol() 419 | topology = nx.Graph() 420 | for atom in rdkit_molecule.GetAtoms(): 421 | # Add the atoms as nodes 422 | topology.add_node(atom.GetIdx()) 423 | 424 | # Add the bonds as edges 425 | for bonded in atom.GetNeighbors(): 426 | topology.add_edge(atom.GetIdx(), bonded.GetIdx()) 427 | 428 | return topology 429 | 430 | def is_isomorphic(self, other): 431 | topology1 = self.topology_from_rdkit() 432 | topology2 = self.topology_from_rdkit() 433 | return nx.is_isomorphic(topology1, topology2) 434 | 435 | 436 | def set_coords(self, coords): 437 | """ 438 | Set atom coordinates. Assumes coords are in same order as self.atoms. 439 | """ 440 | try: 441 | coords = np.reshape(coords, (-1,3)) 442 | except ValueError: 443 | raise Exception('Coordinates cannot be reshaped into matrix of size Nx3') 444 | assert len(coords) == len(self.atoms) 445 | 446 | for atom, xyz in zip(self.atoms, coords): 447 | atom.coords = xyz 448 | 449 | def get_coords(self): 450 | """ 451 | Get coordinates in the order specified by the atom indices. 452 | """ 453 | assert all(atom.idx is not None for atom in self) 454 | atoms = self.atoms[:] 455 | atoms.sort(key=lambda a: a.idx) 456 | return np.array([atom.coords for atom in atoms]) 457 | 458 | def get_symbols(self): 459 | """ 460 | Get symbols in the order specified by the atom indices. 461 | """ 462 | assert all(atom.idx is not None for atom in self) 463 | atoms = self.atoms[:] 464 | atoms.sort(key=lambda a: a.idx) 465 | return [atom.symbol for atom in atoms] 466 | 467 | def get_geometry(self): 468 | """ 469 | Get symbols and coordinates in the order specified by the atom 470 | indices. 471 | """ 472 | assert all(atom.idx is not None for atom in self) 473 | atoms = self.atoms[:] 474 | atoms.sort(key=lambda a: a.idx) 475 | return [atom.symbol for atom in atoms], np.array([atom.coords for atom in atoms]) 476 | 477 | def infer_connections(self, use_ob=True): 478 | """ 479 | Delete connections and set them again based on coordinates. 480 | 481 | Note: By default this uses Open Babel, which is better than a 482 | simple covalent radii check. 
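        # When use_ob is False, a simple distance criterion is used instead:
        # two atoms are connected if their separation lies between ~0.63 A
        # (sqrt of the 0.4 A^2 lower bound) and the sum of their covalent radii
        # plus 0.45 A; atoms are swept in order of z coordinate so the inner
        # loop can break once the z separation exceeds 4 A.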
483 | """ 484 | atoms = self.atoms 485 | 486 | for atom in atoms: 487 | assert len(atom.coords) != 0 488 | 489 | for atom in atoms: 490 | for connection in atom.connections: 491 | self.remove_connection(connection) 492 | 493 | if use_ob: 494 | pybel_mol = self.to_pybel_mol() # Should be sorted by atom indices 495 | assert all(ap.idx == a.idx for ap, a in zip(pybel_mol, self)) # Check to be sure 496 | mapping = {ap.idx: a for ap, a in zip(pybel_mol, self)} 497 | for bond in pybel.ob.OBMolBondIter(pybel_mol.OBMol): 498 | atom1 = mapping[bond.GetBeginAtomIdx()] 499 | atom2 = mapping[bond.GetEndAtomIdx()] 500 | connection = Connection(atom1, atom2) 501 | self.add_connection(connection) 502 | else: 503 | sorted_atoms = sorted(atoms, key=lambda a: a.coords[2]) 504 | for i, atom1 in enumerate(sorted_atoms): 505 | for atom2 in sorted_atoms[(i+1):]: 506 | crit_dist = (atom1.get_cov_rad() + atom2.get_cov_rad() + 0.45)**2 507 | z_boundary = (atom1.coords[2] - atom2.coords[2])**2 508 | if z_boundary > 16.0: 509 | break 510 | dist_sq = sum((atom1.coords - atom2.coords)**2) 511 | if dist_sq > crit_dist or dist_sq < 0.4: 512 | continue 513 | else: 514 | connection = Connection(atom1, atom2) 515 | self.add_connection(connection) 516 | 517 | def is_atom_in_cycle(self, atom): 518 | return self._is_chain_in_cycle([atom]) 519 | 520 | def _is_chain_in_cycle(self, chain): 521 | atom1 = chain[-1] 522 | for atom2 in atom1.connections: 523 | if atom2 is chain[0] and len(chain) > 2: 524 | return True 525 | elif atom2 not in chain: 526 | chain.append(atom2) 527 | if self._is_chain_in_cycle(chain): 528 | return True 529 | else: 530 | chain.remove(atom2) 531 | return False 532 | 533 | #def label_equivalent_hydrogens(self): 534 | # """ 535 | # Mark all equivalent hydrogens as frozen. For now, this assumes that the 536 | # carbons they are attached to have 4 connections, which means this 537 | # method does not yet work for radicals. 538 | # """ 539 | # if self.is_radical(): 540 | # raise NotImplementedError('Cannot yet label equivalent hydrogens for radicals') 541 | # for atom in self: 542 | # if (atom.symbol.upper() == 'C' 543 | # and len(atom.connections) == 4 544 | # and not self.is_atom_in_cycle(atom)): 545 | # first_hydrogen = True 546 | # for atom2 in atom.connections: 547 | # if atom2.symbol.upper() == 'H': 548 | # if first_hydrogen: 549 | # first_hydrogen = False 550 | # else: 551 | # atom2.frozen = True 552 | 553 | def label_equivalent_hydrogens(self): 554 | """ 555 | This version works with radicals. no assumption of four connections on carbon atoms. 556 | Also works with other heavy elementts. 
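        Equivalent hydrogens are marked by setting atom.frozen = True; frozen
        atoms are excluded when generate_driving_coords() enumerates candidate
        connections to form, so bond formation is only attempted at one
        hydrogen from each equivalent set.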
557 | """ 558 | # Proceed even if the molecule is a radical 559 | for atom in self: 560 | if atom.symbol.upper() != 'H' and not self.is_atom_in_cycle(atom): 561 | hydrogens = [a for a in atom.connections if a.symbol.upper() == 'H'] 562 | if len(hydrogens) > 1: 563 | first_hydrogen = True 564 | for hydrogen in hydrogens: 565 | if first_hydrogen: 566 | first_hydrogen = False 567 | else: 568 | hydrogen.frozen = True 569 | -------------------------------------------------------------------------------- /dandelion/segsm/create_gsm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import argparse 5 | 6 | from ase.io import read 7 | from .ard_gsm.mol import MolGraph 8 | from .ard_gsm.limits import connection_limits 9 | from .ard_gsm.driving_coords import generate_driving_coords 10 | 11 | 12 | def main(args): 13 | 14 | print_args(args) 15 | 16 | input_path = args.input_path 17 | if not os.path.isdir(input_path): 18 | sys.exit(f"Error: '{input_path}' is not a directory.") 19 | output_path = args.output_path 20 | maxbreak = args.maxbreak 21 | maxform = args.maxform 22 | maxchange = args.maxchange 23 | minbreak = args.minbreak 24 | minform = args.minform 25 | minchange = args.minchange 26 | ignore_single_change = args.ignore_single_change 27 | equiv_Hs = args.equiv_Hs 28 | 29 | pdir = output_path 30 | if not os.path.exists(pdir): 31 | os.makedirs(pdir) 32 | 33 | with open(os.path.join(pdir, 'params.log'), 'w') as f: 34 | f.write('Connection limits:\n') 35 | for symbol in connection_limits: 36 | ll = connection_limits[symbol][0] 37 | ul = connection_limits[symbol][1] 38 | f.write(' {}: {}, {}\n'.format(symbol, ll, ul)) 39 | f.write('maxbreak = {}\n'.format(maxbreak)) 40 | f.write('maxform = {}\n'.format(maxform)) 41 | f.write('maxchange = {}\n'.format(maxchange)) 42 | f.write('single_change = {}\n'.format(not ignore_single_change)) 43 | f.write('equiv_Hs = {}\n'.format(equiv_Hs)) 44 | f.write('minbreak = {}\n'.format(minbreak)) 45 | f.write('minform = {}\n'.format(minform)) 46 | f.write('minchange = {}\n'.format(minchange)) 47 | 48 | # Loop over Mothers 49 | for idx, mother in enumerate(glob.iglob(os.path.join(input_path, '**/*.xyz'), recursive=True)): 50 | xyz = read(mother) 51 | symbols, coords = xyz.get_chemical_symbols(), xyz.get_positions() 52 | mol = MolGraph(symbols=symbols, coords=coords) 53 | mol.infer_connections() 54 | name = os.path.basename(os.path.dirname(mother)) 55 | 56 | seeds = generate_driving_coords( 57 | mol, 58 | maxbreak=maxbreak, 59 | maxform=maxform, 60 | maxchange=maxchange, 61 | single_change=not ignore_single_change, 62 | equiv_Hs=equiv_Hs, 63 | minbreak=minbreak, 64 | minform=minform, 65 | minchange=minchange 66 | ) 67 | print(f'{len(seeds)} Seeds were generated from {name}') 68 | 69 | output_path = os.path.join(pdir, '{}'.format(name)) 70 | if not os.path.exists(output_path): 71 | os.mkdir(output_path) 72 | 73 | # Loop over seeds 74 | for idx, seed in enumerate(seeds): 75 | 76 | gsm_dir = os.path.join(output_path, f'gsm{idx:04}') 77 | if not os.path.exists(gsm_dir): 78 | os.mkdir(gsm_dir) 79 | 80 | isomers_file = os.path.join(gsm_dir, 'ISOMERS.txt') 81 | initial_file = os.path.join(gsm_dir, 'initial.xyz') 82 | bash_file = os.path.join(gsm_dir, 'gsm.sh') 83 | 84 | with open(bash_file, 'w') as f: 85 | f.write(''' 86 | gsm -xyzfile initial.xyz \\ 87 | -mode SE_GSM \\ 88 | -num_nodes 30 \\ 89 | -package xTB_lot \\ 90 | -isomers ISOMERS.txt \\ 91 | -xyz_output_format multixyz \\ 92 | 
-coordinate_type DLC > gsm_log 2>&1''') 93 | 94 | with open(isomers_file, 'w') as f: 95 | f.write(str(seed)) 96 | with open(initial_file, 'w') as f: 97 | f.write(str(len(symbols)) + '\n') 98 | f.write('\n') 99 | for symbol, xyz in zip(symbols, coords): 100 | f.write('{0} {1[0]: .10f} {1[1]: .10f} {1[2]: .10f}\n'.format(symbol, xyz)) 101 | 102 | print('\nCreating GSM finished!') 103 | 104 | def print_args(args): 105 | print() 106 | print("Arguments provided:") 107 | arg_dict = vars(args) 108 | for key, value in arg_dict.items(): 109 | print(f" {key}: {value}") 110 | print() 111 | 112 | def get_parser(): 113 | parser = argparse.ArgumentParser(description='Make GSM jobs from mother structures') 114 | 115 | parser.add_argument('-i', '--input_path', required=True, 116 | help='Input path of mother structures') 117 | parser.add_argument('-o', '--output_path', required=True, 118 | help='Output path of gsm jobs') 119 | 120 | parser.add_argument('--maxbreak', type=int, default=2, 121 | help='Maximum number of connections to break') 122 | parser.add_argument('--maxform', type=int, default=2, 123 | help='Maximum number of connections to form') 124 | parser.add_argument('--maxchange', type=int, default=3, 125 | help='Maximum number of connections to change') 126 | 127 | parser.add_argument('--minbreak', type=int, default=0, 128 | help='Minumum number of connections to break') 129 | parser.add_argument('--minform', type=int, default=0, 130 | help='Minumum number of connections to form') 131 | parser.add_argument('--minchange', type=int, default=1, 132 | help='Minumum number of connections to change') 133 | 134 | parser.add_argument('--ignore_single_change', type=bool, default=True, 135 | help='Do not consider single connection changes (e.g., nbreak=1, nform=0)') 136 | parser.add_argument('--equiv_Hs', type=bool, default=False, 137 | help='Create equivalent driving coordinates for the same reaction with different but\ 138 | equivalent hydrogens, i.e., hydrogens attached to non-cyclic tetrahedral carbons') 139 | 140 | return parser 141 | 142 | 143 | if __name__ == "__main__": 144 | args = get_parser().parse_args() 145 | main(args) 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /dandelion/segsm/filter_gsm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import shutil 5 | import argparse 6 | 7 | from rdkit import RDLogger 8 | from ase.io import read, write 9 | from openbabel import openbabel 10 | from .ard_gsm.mol import MolGraph, SanitizationError 11 | #from ard_gsm.mol import MolGraph, SanitizationError 12 | 13 | # Suppress Noisy warning in the filter 14 | RDLogger.logger().setLevel(RDLogger.CRITICAL) 15 | openbabel.obErrorLog.SetOutputLevel(openbabel.obError) 16 | 17 | ''' 18 | Faith of pyGSM run 19 | 20 | 1) png is not made 21 | - xTB not converge 22 | - pyGSM suicide on his criteria 23 | 24 | 2) png is made 25 | - Exiting early -> should filter out 26 | - Ran out of iterations -> also includes potential rxn 27 | - Converged -> very rare 28 | ''' 29 | 30 | 31 | 32 | def parse_gsm_log(keyword, content): 33 | """Find the value associated with a keyword in a text content.""" 34 | # For TS_energy, we're expecting a float, so we use a different pattern 35 | if keyword == "TS energy:": 36 | pattern = f"{keyword} ([+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)" 37 | else: 38 | pattern = f"{keyword} (\d+)" 39 | 40 | import re 41 | matches = re.findall(pattern, content) 42 | 43 
| # Return the matched value; assume there's only one match 44 | if matches: 45 | return matches[0][0] # Due to group structures, we take the first element 46 | else: 47 | return None 48 | 49 | 50 | def get_gsm_data(home, seed, string): 51 | try: 52 | with open(os.path.join(home, seed, string, 'gsm_log'), 'r') as f: 53 | content = f.read() 54 | except FileNotFoundError: 55 | return None 56 | 57 | nodes = [] 58 | try: 59 | with open(os.path.join(home, seed, string, 'opt_converged_000.xyz'), 'r') as f: 60 | for i in range(30): 61 | try: 62 | nodes.append(read(f, i)) 63 | except: 64 | break 65 | except FileNotFoundError: 66 | return None 67 | 68 | return { 69 | "TS_energy" : float(parse_gsm_log("TS energy:", content)), 70 | "reactant_idx" : int(parse_gsm_log("min reactant node:", content)), 71 | "product_idx" : int(parse_gsm_log("min product node", content)), 72 | "TS_idx" : int(parse_gsm_log("TS node is", content)), 73 | "nodes" : nodes, 74 | 'energies' : [float(list(node.info.keys())[0]) for node in nodes] 75 | } 76 | 77 | 78 | 79 | def profile_filter(strings, home, seed, barrier_max, barrier_min, delta_e_min): 80 | ''' 81 | Given gsm success reactions, 82 | Filter strings by TS_index and Barrier height and delta_e. 83 | ''' 84 | filtered = {} 85 | for string in strings: 86 | data = get_gsm_data(home, seed, string) 87 | if not data: 88 | continue 89 | 90 | if data["TS_idx"] >= data["product_idx"]: # wrong ts 91 | continue 92 | if (data["TS_energy"] > barrier_max) or (data["TS_energy"] < barrier_min): # too high or low barrier 93 | continue 94 | if abs(data['energies'][data['product_idx']]) * 627.503 < delta_e_min: # maybe reactant==product 95 | continue 96 | 97 | product_graph = MolGraph(symbols=data["nodes"][data["product_idx"]].get_chemical_symbols(), 98 | coords=data["nodes"][data["product_idx"]].get_positions(), 99 | energy=float(list(data["nodes"][data["product_idx"]].info.keys())[0])) 100 | 101 | filtered[string] = { 102 | 'reactant': data["nodes"][data["reactant_idx"]], 103 | 'product': data["nodes"][data["product_idx"]], 104 | 'ts': data["nodes"][data["TS_idx"]], 105 | 'product_graph': product_graph, 106 | 'ts_energy': data["TS_energy"] 107 | } 108 | 109 | return filtered 110 | 111 | def structure_filter(reactions): 112 | ''' 113 | Chemically absurd products are filtered here. (graph->pybel->inchi->smiles) 114 | SMILES are constructed, and saved to the dictionary for the unique filter. 115 | ''' 116 | 117 | filtered = {} 118 | 119 | for rxn, data in reactions.items(): 120 | try: 121 | smiles = data['product_graph'].perceive_smiles() 122 | filtered[rxn] = data 123 | filtered[rxn]['product_smiles'] = smiles 124 | except SanitizationError: 125 | continue 126 | return filtered 127 | 128 | def unique_filter(reactions): 129 | ''' 130 | Duplicates are filtered based on SMILES. 131 | If there are more than one of same SMILES, pick the lowest barrier reaction. 
132 | ''' 133 | unique = {} 134 | for rxn, data in reactions.items(): 135 | smiles = data['product_smiles'] 136 | ts_energy = data['ts_energy'] 137 | if smiles not in unique or ts_energy < unique[smiles]['ts_energy']: 138 | unique[smiles] = { 139 | 'reaction_key': rxn, 140 | 'ts_energy': ts_energy, 141 | 'reactant': data['reactant'], 142 | 'product': data['product'], 143 | 'ts': data['ts'], 144 | } 145 | return unique 146 | 147 | def save_unique_reactions(home, output_path, seed, reactions): 148 | for smiles, data in reactions.items(): 149 | reaction_dir = os.path.join(output_path, seed, data['reaction_key']) 150 | os.makedirs(reaction_dir, exist_ok=True) 151 | 152 | file_types = ["reactant", "ts", "product"] 153 | for f_type in file_types: 154 | write(os.path.join(reaction_dir, f"{f_type}.xyz"), data[f_type]) 155 | write(os.path.join(reaction_dir, f"{f_type}.png"), data[f_type]) 156 | 157 | shutil.copyfile(os.path.join(home, seed, data['reaction_key'], '0000_string.png'), 158 | os.path.join(reaction_dir, 'string.png')) 159 | 160 | shutil.copyfile(os.path.join(home, seed, data['reaction_key'], 'opt_converged_000.xyz'), 161 | os.path.join(reaction_dir, 'string.xyz')) 162 | 163 | def main(args): 164 | 165 | print_args(args) 166 | 167 | input_path = args.input_path 168 | if not os.path.isdir(input_path): 169 | sys.exit(f"Error: '{input_path}' is not a directory.") 170 | output_path = args.output_path 171 | if not os.path.exists(output_path): 172 | os.mkdir(output_path) 173 | barrier_max = args.barrier_max 174 | barrier_min = args.barrier_min 175 | delta_e_min = args.delta_e_min 176 | 177 | mothers = [d for d in os.listdir(input_path) if os.path.isdir(os.path.join(input_path, d))] 178 | for mother in mothers: 179 | print('\n◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢◤◢') 180 | print(f'mother: {mother}'.center(35)) 181 | driving_coordinates = list(glob.iglob(os.path.join(input_path, f'{mother}/*/gsm_log'))) 182 | success_strings = [path.split('/')[-2] for path in glob.iglob(os.path.join(input_path, f'{mother}/*/0000_string.png'))] 183 | 184 | profile_filtered_strings = profile_filter(success_strings, input_path, mother, barrier_max, barrier_min, delta_e_min) 185 | structure_filtered_strings = structure_filter(profile_filtered_strings) 186 | unique_reactions = unique_filter(structure_filtered_strings) 187 | 188 | print(f'Initial seeds: {len(driving_coordinates):>5}') 189 | print(f'GSM success reactions: {len(success_strings):>5}') 190 | print(f'Profile filtered reactions: {len(profile_filtered_strings):>5}') 191 | print(f'Structure filtered reactions: {len(structure_filtered_strings):>5}') 192 | print(f'Unique reactions: {len(unique_reactions):>5}') 193 | 194 | save_unique_reactions(input_path, output_path, mother, unique_reactions) 195 | 196 | print('\nFiltering GSM finished!') 197 | 198 | def print_args(args): 199 | print() 200 | print("Arguments provided:") 201 | arg_dict = vars(args) 202 | for key, value in arg_dict.items(): 203 | print(f" {key}: {value}") 204 | print() 205 | 206 | def get_parser(): 207 | parser = argparse.ArgumentParser(description='Make GSM jobs from mother structures') 208 | 209 | parser.add_argument('-i', '--input_path', required=True, 210 | help='Input path of finished gsm jobs') 211 | parser.add_argument('-o', '--output_path', required=True, 212 | help='Output path of filtered gsm jobs') 213 | 214 | parser.add_argument('--barrier_min', type=int, default=5) 215 | parser.add_argument('--barrier_max', type=int, default=200) 216 | parser.add_argument('--delta_e_min', type=int, 
default=5) 217 | 218 | return parser 219 | 220 | 221 | if __name__ == "__main__": 222 | args = get_parser().parse_args() 223 | main(args) 224 | 225 | 226 | -------------------------------------------------------------------------------- /dandelion/segsm/run_gsm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import subprocess 5 | from concurrent.futures import ProcessPoolExecutor, as_completed 6 | 7 | from tqdm import tqdm 8 | 9 | # conda activate ts 10 | # check whether gsm is killed when you interrupted 11 | # use like "nohup python -u 2_run_gsm_jobs > gsm.out &" 12 | 13 | 14 | def run_gsm_script(script_dir): 15 | #print(f"Executing in directory: {script_dir}") 16 | subprocess.run('bash gsm.sh', cwd=script_dir, capture_output=True, text=True, shell=True) 17 | 18 | def main(args): 19 | 20 | print_args(args) 21 | 22 | input_path = args.input_path 23 | if not os.path.isdir(input_path): 24 | sys.exit(f"Error: '{input_path}' is not a directory.") 25 | max_workers = args.max_workers 26 | 27 | # Find all directories containing gsm.sh scripts 28 | script_dirs = [dirpath for dirpath, _, filenames in os.walk(input_path) if "gsm.sh" in filenames] 29 | 30 | bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]' 31 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 32 | futures = [executor.submit(run_gsm_script, script_dir) for script_dir in script_dirs] 33 | 34 | for future in tqdm(as_completed(futures), desc='GSM on seeds', 35 | total=len(script_dirs), smoothing=0, bar_format=bar_format, ncols=70): 36 | pass # just update the tqdm 37 | 38 | print('GSM finished!') 39 | 40 | def print_args(args): 41 | print() 42 | print("Arguments provided:") 43 | arg_dict = vars(args) 44 | for key, value in arg_dict.items(): 45 | print(f" {key}: {value}") 46 | print() 47 | 48 | def get_parser(): 49 | parser = argparse.ArgumentParser(description='Run GSM jobs concurrently') 50 | 51 | parser.add_argument('-i', '--input_path', required=True, 52 | help='Base directory of mothers bearing seeds') 53 | parser.add_argument('-n', '--max_workers', type=int, default=1, 54 | help='Number of worker processes') 55 | 56 | return parser 57 | 58 | 59 | if __name__ == "__main__": 60 | args = get_parser().parse_args() 61 | main(args) 62 | -------------------------------------------------------------------------------- /dandelion/utils/db_h5_tools/db_to_h5.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | 5 | import h5py 6 | from ase.db import connect 7 | 8 | 9 | def main(args): 10 | 11 | print_args(args) 12 | 13 | input_path = args.input_path 14 | if not os.path.isfile(input_path): 15 | sys.exit(f"Error: '{input_path}' is not a file.") 16 | output_path = args.output_path 17 | 18 | # Data structure to hold the computed results 19 | rxn_data = {} 20 | 21 | rows = [] # List to store all rows 22 | 23 | # Extract data from ASE database 24 | with connect(input_path) as db: 25 | for row in db.select(): 26 | if hasattr(row, 'energy') and hasattr(row, 'forces'): 27 | rows.append(row) 28 | 29 | # Sort rows based on the unique_id number 30 | rows.sort(key=lambda r: int(r.data['unique_id'].split('_')[-1])) 31 | 32 | # Process sorted rows 33 | for row in rows: 34 | # Extract unique_id and other data 35 | unique_id = row.data['unique_id'] 36 | chem_group_name, rxn_group_name, index = unique_id.split('_') 37 | 38 | if chem_group_name not in 
rxn_data: 39 | rxn_data[chem_group_name] = {} 40 | 41 | if rxn_group_name not in rxn_data[chem_group_name]: 42 | rxn_data[chem_group_name][rxn_group_name] = { 43 | 'atomic_numbers': row.toatoms().numbers, 44 | 'energies': [], 45 | 'forces': [], 46 | 'positions': [] 47 | } 48 | rxn_data[chem_group_name][rxn_group_name]['energies'].append(row.energy) 49 | rxn_data[chem_group_name][rxn_group_name]['forces'].append(row.forces) 50 | rxn_data[chem_group_name][rxn_group_name]['positions'].append(row.toatoms().positions) 51 | 52 | # Save the data to an h5 file 53 | with h5py.File(output_path, 'w') as h5file: 54 | # Ensure the 'data' group exists 55 | if 'data' not in h5file: 56 | data_group = h5file.create_group('data') 57 | else: 58 | data_group = h5file['data'] 59 | 60 | # Iterate through the rxn_data dictionary to save datasets 61 | for chem_group_name in rxn_data: 62 | if chem_group_name not in data_group: 63 | chem_group = data_group.create_group(chem_group_name) 64 | else: 65 | chem_group = data_group[chem_group_name] 66 | 67 | for rxn_group_name, rxn_entry in rxn_data[chem_group_name].items(): 68 | if rxn_group_name not in chem_group: 69 | rxn_group = chem_group.create_group(rxn_group_name) 70 | else: 71 | rxn_group = chem_group[rxn_group_name] 72 | 73 | # Add datasets to the reaction group 74 | rxn_group.create_dataset('atomic_numbers', data=rxn_entry['atomic_numbers']) 75 | rxn_group.create_dataset('wB97x_6-31G(d).energy', data=rxn_entry['energies']) 76 | rxn_group.create_dataset('wB97x_6-31G(d).forces', data=rxn_entry['forces']) 77 | rxn_group.create_dataset('positions', data=rxn_entry['positions']) 78 | 79 | print('Compiled successfully!') 80 | 81 | def print_args(args): 82 | print() 83 | print("Arguments provided:") 84 | arg_dict = vars(args) 85 | for key, value in arg_dict.items(): 86 | print(f" {key}: {value}") 87 | print() 88 | 89 | def get_parser(): 90 | parser = argparse.ArgumentParser(description="Translate ase db file into hdf5 file.") 91 | 92 | parser.add_argument('-i', '--input_path', required=True, 93 | help="Path of the input wB97X ASE db file") 94 | parser.add_argument('-o', '--output_path', required=True, 95 | help="Path of the output wB97X hdf5 file") 96 | 97 | return parser 98 | 99 | if __name__ == "__main__": 100 | args = get_parser().parse_args() 101 | main(args) 102 | 103 | 104 | -------------------------------------------------------------------------------- /dandelion/utils/db_h5_tools/h5_to_db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import h5py 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | from ase import Atoms 8 | from ase.db import connect 9 | from ase.calculators.singlepoint import SinglePointCalculator 10 | 11 | 12 | def main(args): 13 | 14 | print_args(args) 15 | 16 | input_path = args.input_path 17 | if not os.path.isfile(input_path): 18 | sys.exit(f"Error: '{input_path}' is not a file.") 19 | output_path = args.output_path 20 | 21 | 22 | with h5py.File(input_path, 'r') as h5_file: 23 | data_group = h5_file['data'] 24 | 25 | # Count total number of configurations 26 | total_configs = sum( 27 | rxn_group['wB97x_6-31G(d).energy'].shape[0] 28 | for chem_group in data_group.values() 29 | for rxn_group in chem_group.values() 30 | ) 31 | 32 | with connect(output_path) as db: 33 | with tqdm(total=total_configs, desc="Converting", unit="config") as pbar: 34 | for chem_group_name, chem_group in data_group.items(): 35 | for rxn_group_name, rxn_group in chem_group.items(): 36 | 
atomic_numbers = rxn_group['atomic_numbers'][:] 37 | positions = rxn_group['positions'][:] 38 | energies = rxn_group['wB97x_6-31G(d).energy'][:] 39 | forces = rxn_group['wB97x_6-31G(d).forces'][:] 40 | 41 | for i in range(len(energies)): 42 | atoms = Atoms( 43 | numbers=atomic_numbers, 44 | positions=positions[i], 45 | ) 46 | atoms.set_calculator(SinglePointCalculator( 47 | atoms, 48 | energy=energies[i], 49 | forces=forces[i] 50 | )) 51 | 52 | unique_id = f"{chem_group_name}_{rxn_group_name}_{i}" 53 | db.write(atoms, data={'unique_id': unique_id}) 54 | 55 | pbar.update(1) 56 | 57 | def print_args(args): 58 | print() 59 | print("Arguments provided:") 60 | arg_dict = vars(args) 61 | for key, value in arg_dict.items(): 62 | print(f" {key}: {value}") 63 | print() 64 | 65 | def get_parser(): 66 | parser = argparse.ArgumentParser(description="Translate hdf5 file into ase db file.") 67 | 68 | parser.add_argument('-i', '--input_path', required=True, 69 | help="Path of the input wB97X hdf5 file") 70 | parser.add_argument('-o', '--output_path', required=True, 71 | help="Path of the output wB97X db file") 72 | 73 | return parser 74 | 75 | if __name__ == "__main__": 76 | args = get_parser().parse_args() 77 | main(args) 78 | -------------------------------------------------------------------------------- /dandelion/utils/db_h5_tools/make_db_from_xyzs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import argparse 4 | 5 | from tqdm import tqdm 6 | from ase import io 7 | from ase.db import connect 8 | 9 | 10 | def main(args): 11 | 12 | print_args(args) 13 | 14 | input_path = args.input_path 15 | if not os.path.isdir(input_path): 16 | sys.exit(f"Error: '{input_path}' is not a directory.") 17 | output_path = args.output_path 18 | 19 | with connect(output_path) as db: 20 | for file_path in tqdm(glob.glob(os.path.join(input_path, '**/*.xyz'), recursive=True)): 21 | atoms = io.read(file_path) 22 | db.write(atoms) 23 | 24 | def print_args(args): 25 | print() 26 | print("Arguments provided:") 27 | arg_dict = vars(args) 28 | for key, value in arg_dict.items(): 29 | print(f" {key}: {value}") 30 | print() 31 | 32 | def get_parser(): 33 | parser = argparse.ArgumentParser(description='Merge xyz files in input directory into db file.') 34 | 35 | parser.add_argument('-i', '--input_path', required=True, 36 | help='Input path of directory containing xyz files to merge') 37 | parser.add_argument('-o', '--output_path', required=True, 38 | help='Output path of the merged db file.') 39 | 40 | return parser 41 | 42 | 43 | if __name__ == "__main__": 44 | args = get_parser().parse_args() 45 | main(args) 46 | 47 | 48 | -------------------------------------------------------------------------------- /dandelion/utils/db_h5_tools/merge_db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | from ase.db import connect 8 | 9 | def main(args): 10 | 11 | print_args(args) 12 | 13 | input_path = args.input_path 14 | if not os.path.isdir(input_path): 15 | sys.exit(f"Error: '{input_path}' is not a directory.") 16 | output_path = args.output_path 17 | 18 | with connect(output_path) as db1: 19 | for f in glob.glob(os.path.join(input_path, '**/wb97x.db'), recursive=True): 20 | with connect(f) as db2: 21 | for row in tqdm(db2.select(), total=db2.count(), desc=f"{f}"): 22 | db1.write(row.toatoms()) 23 | 24 | def print_args(args): 25 | print() 
26 |     print("Arguments provided:")
27 |     arg_dict = vars(args)
28 |     for key, value in arg_dict.items():
29 |         print(f" {key}: {value}")
30 |     print()
31 | 
32 | def get_parser():
33 |     parser = argparse.ArgumentParser(description='Merge db files in input directory')
34 | 
35 |     parser.add_argument('-i', '--input_path', required=True,
36 |                         help='Input path of directory containing db files to merge')
37 |     parser.add_argument('-o', '--output_path', required=True,
38 |                         help='Output path of the merged db file.')
39 | 
40 |     return parser
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     args = get_parser().parse_args()
45 |     main(args)
46 | 
47 | 
--------------------------------------------------------------------------------
/dandelion/utils/db_h5_tools/merge_h5.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import glob
5 | 
6 | import h5py
7 | from tqdm import tqdm
8 | 
9 | 
10 | 
11 | def main(args):
12 | 
13 |     print_args(args)
14 | 
15 |     input_path = args.input_path
16 |     if not os.path.isdir(input_path):
17 |         sys.exit(f"Error: '{input_path}' is not a directory.")
18 |     output_path = args.output_path
19 | 
20 |     # Open the output file
21 |     with h5py.File(output_path, 'w') as h5file_out:
22 |         # Ensure the 'data' group exists in the output file
23 |         if 'data' not in h5file_out:
24 |             data_group_out = h5file_out.create_group('data')
25 |         else:
26 |             data_group_out = h5file_out['data']
27 | 
28 |         # Iterate through each input file
29 |         for h5_path in glob.glob(os.path.join(input_path, '**/wb97x.h5'), recursive=True):
30 |             print(h5_path)
31 |             # Use the name of the directory containing the file as a prefix,
32 |             prefix = os.path.basename(os.path.dirname(h5_path))  # e.g. '<run_dir>/wb97x.h5' -> '<run_dir>'
33 | 
34 |             # Open the input file
35 |             with h5py.File(h5_path, 'r') as h5file_in:
36 |                 # Iterate through chemical groups in the input file
37 |                 for chem_group_name, chem_group in tqdm(h5file_in['data'].items(), desc="Formulas"):
38 |                     # Ensure the chemical group exists in the output file
39 |                     if chem_group_name not in data_group_out:
40 |                         chem_group_out = data_group_out.create_group(chem_group_name)
41 |                     else:
42 |                         chem_group_out = data_group_out[chem_group_name]
43 | 
44 |                     # Iterate through reaction groups in the chemical group
45 |                     for rxn_group_name, rxn_group in tqdm(chem_group.items(), desc=f"Rxns in {chem_group_name}", leave=False):
46 |                         # Prefix the reaction group name with the source directory name
47 |                         rxn_group_name_prefixed = f"{prefix}_{rxn_group_name}"
48 | 
49 |                         # Ensure the reaction group exists in the output file
50 |                         if rxn_group_name_prefixed not in chem_group_out:
51 |                             rxn_group_out = chem_group_out.create_group(rxn_group_name_prefixed)
52 |                         else:
53 |                             rxn_group_out = chem_group_out[rxn_group_name_prefixed]
54 | 
55 |                         # Copy datasets from input to output, creating new datasets
56 |                         for dset_name, dset in rxn_group.items():
57 |                             data = dset[:]
58 |                             rxn_group_out.create_dataset(dset_name, data=data)
59 | 
60 | def print_args(args):
61 |     print()
62 |     print("Arguments provided:")
63 |     arg_dict = vars(args)
64 |     for key, value in arg_dict.items():
65 |         print(f" {key}: {value}")
66 |     print()
67 | 
68 | def get_parser():
69 |     parser = argparse.ArgumentParser(description='Merge h5 files in input directory')
70 | 
71 |     parser.add_argument('-i', '--input_path', required=True,
72 |                         help='Input path of directory containing h5 files to merge')
73 |     parser.add_argument('-o', '--output_path', required=True,
74 |                         help='Output path of the merged h5 file.')
75 | 
76 |     return parser
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     args = get_parser().parse_args()
81 |     main(args)
82 | 
83 | 
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: ts
2 | channels:
3 |   - conda-forge
4 |   - defaults
5 | dependencies:
6 |   - python=3.11.5
7 |   - pip=23.2.1
8 |   - ase=3.22.1
9 |   - imageio=2.31.1
10 |   - matplotlib-base=3.7.2
11 |   - numpy=1.25.2
12 |   - openbabel=3.1.1
13 |   - scipy=1.11.2
14 |   - networkx=3.1
15 |   - xtb-python=22.1
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = dandelion
3 | version = attr: dandelion.__version__
4 | author = Minhyeok Lee
5 | author_email = mlee@yonsei.ac.kr
6 | description = Near TS region sampler for machine learning force field
7 | classifiers =
8 |     License :: OSI Approved :: MIT License
9 |     Programming Language :: Python :: 3
10 | long_description = file: README.md
11 | 
12 | [options]
13 | packages = find:
14 | python_requires = >=3.11
15 | install_requires =
16 |     h5py==3.9.0
17 |     rdkit==2023.3.3
18 |     tqdm==4.66.1
19 |     typing-extensions==4.8.0
20 | 
21 | [options.entry_points]
22 | console_scripts =
23 |     dand = dandelion.cli:main
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup()
--------------------------------------------------------------------------------
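
For quick reference, a minimal sketch (an editorial illustration, not a file in this repository) of how the HDF5 files written by db_to_h5.py and consumed by h5_to_db.py / merge_h5.py can be inspected. The group and dataset names ('data', 'atomic_numbers', 'positions', 'wB97x_6-31G(d).energy', 'wB97x_6-31G(d).forces') come from the scripts above; the file name 'wb97x.h5' is only an assumed example matching the pattern merge_h5.py globs for.

```python
# Walk the data/<formula>/<reaction> hierarchy and report the frame count per reaction.
import h5py

with h5py.File('wb97x.h5', 'r') as f:                      # assumed example file name
    for formula, chem_group in f['data'].items():          # one group per chemical formula
        for rxn_name, rxn_group in chem_group.items():     # one group per reaction
            numbers = rxn_group['atomic_numbers'][:]           # shape (n_atoms,)
            positions = rxn_group['positions'][:]              # shape (n_frames, n_atoms, 3)
            energies = rxn_group['wB97x_6-31G(d).energy'][:]   # shape (n_frames,)
            forces = rxn_group['wB97x_6-31G(d).forces'][:]     # shape (n_frames, n_atoms, 3)
            print(formula, rxn_name, len(energies), 'frames')
```

h5_to_db.py walks this same hierarchy to rebuild ASE Atoms objects with a SinglePointCalculator attached, and merge_h5.py copies these groups into a combined file, prefixing each reaction group with its source directory name.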