├── sgdml
    ├── intf
    │   ├── __init__.py
    │   └── ase_calc.py
    ├── utils
    │   ├── __init__.py
    │   ├── ui.py
    │   ├── desc.py
    │   ├── io.py
    │   └── perm.py
    ├── solvers
    │   ├── __init__.py
    │   ├── analytic.py
    │   └── iterative.py
    ├── __init__.py
    └── get.py
├── pyproject.toml
├── .gitignore
├── setup.cfg
├── LICENSE.txt
├── setup.py
├── scripts
    ├── sgdml_dataset_to_extxyz.py
    ├── sgdml_datasets_from_model.py
    ├── sgdml_dataset_from_aims.py
    ├── sgdml_dataset_from_ipi.py
    ├── sgdml_dataset_via_ase.py
    └── sgdml_dataset_from_extxyz.py
└── README.md


/sgdml/intf/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/sgdml/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/sgdml/solvers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | skip-string-normalization = true
3 | skip-numeric-underscore-normalization = true
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | .DS_Store
 3 | 
 4 | # Compiled python modules.
 5 | *.pyc
 6 | 
 7 | # Setuptools distribution folder.
 8 | /dist/
 9 | 
10 | # Python egg metadata, regenerated from source files by setuptools.
11 | /*.egg-info
12 | /*.egg
13 | sgdml/_bmark_cache.npz
14 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | max-complexity = 12
 3 | ignore = E501,W503,E741
 4 | select = C,E,F,W
 5 | 
 6 | [isort]
 7 | multi_line_output = 3
 8 | include_trailing_comma = 1
 9 | line_length = 85
10 | sections = FUTURE,STDLIB,TYPING,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
11 | known_typing = typing, typing_extensions
12 | no_lines_before = TYPING
13 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018-2022 Stefan Chmiela
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | from io import open
 4 | from setuptools import setup, find_packages
 5 | 
 6 | 
 7 | def get_property(property, package):
 8 |     result = re.search(
 9 |         r'{}\s*=\s*[\'"]([^\'"]*)[\'"]'.format(property),
10 |         open(package + '/__init__.py').read(),
11 |     )
12 |     return result.group(1)
13 | 
14 | 
from os import path

# Absolute location of the directory that holds this setup script.
this_dir = path.abspath(path.dirname(__file__))

# The README doubles as the long description of the distribution.
readme_path = path.join(this_dir, 'README.md')
with open(readme_path, encoding='utf8') as readme:
    long_description = readme.read()

# Scripts: every Python file found anywhere below the 'scripts' directory
# (looked up relative to the current working directory).
scripts = [
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('scripts')
    for fname in fnames
    if fname.endswith('.py')
]
27 | 
# Package metadata and installation instructions.
setup(
    name='sgdml',
    # The version is read textually from 'sgdml/__init__.py' (no import needed).
    version=get_property('__version__', 'sgdml'),
    description='Reference implementation of the GDML and sGDML force field models.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Environment :: Console',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Education',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: MacOS :: MacOS X',
        'Operating System :: POSIX :: Linux',
        'Programming Language :: Python :: 3.7',
        'Topic :: Scientific/Engineering :: Chemistry',
        'Topic :: Scientific/Engineering :: Physics',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
    url='http://www.sgdml.org',
    author='Stefan Chmiela',
    author_email='sgdml@chmiela.com',
    # This field takes the name of the license, not the path of the license
    # file; it matches the 'MIT License' classifier above.
    license='MIT',
    packages=find_packages(),
    install_requires=[
        'torch >= 1.8',
        'numpy >= 1.19.0',
        'scipy >= 1.1.0',
        'psutil',
        'future',
    ],
    # Command line entry points installed alongside the package.
    entry_points={
        'console_scripts': ['sgdml=sgdml.cli:main', 'sgdml-get=sgdml.get:main']
    },
    # Optional dependencies, installed via 'pip install sgdml[ase]'.
    extras_require={'ase': ['ase >= 3.16.2']},
    scripts=scripts,
    include_package_data=True,
    zip_safe=False,
)
62 | 


--------------------------------------------------------------------------------
/scripts/sgdml_dataset_to_extxyz.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2018-2019 Stefan Chmiela
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | from __future__ import print_function
26 | 
27 | import argparse
28 | import os
29 | import sys
30 | 
31 | import numpy as np
32 | 
33 | from sgdml.utils import io, ui
34 | 
35 | 
# Command line interface: one positional dataset path plus an overwrite switch.
parser = argparse.ArgumentParser(
    description='Converts a native dataset file to extended XYZ format.'
)
parser.add_argument(
    'dataset',
    type=lambda x: io.is_file_type(x, 'dataset'),
    metavar='<dataset>',
    help='path to dataset file',
)
parser.add_argument(
    '-o',
    '--overwrite',
    action='store_true',
    dest='overwrite',
    help='overwrite existing xyz dataset file',
)
args = parser.parse_args()

# The parsed positional argument is unpacked into the path and the dataset.
dataset_path, dataset = args.dataset

# The output file is named after the input file and written to the working
# directory.
dataset_base_name = os.path.basename(dataset_path)
name = os.path.splitext(dataset_base_name)[0]
dataset_file_name = name + '.xyz'

xyz_exists = os.path.isfile(dataset_file_name)

# Refuse to replace an existing file unless this was requested explicitly.
if xyz_exists and not args.overwrite:
    sys.exit(
        ui.color_str('[FAIL]', fore_color=ui.RED, bold=True)
        + ' Dataset \'{}\' already exists.'.format(dataset_file_name)
    )

if xyz_exists:
    print(ui.color_str('[INFO]', bold=True) + ' Overwriting existing xyz dataset file.')
print(
    ui.color_str('[INFO]', bold=True)
    + ' Writing dataset to \'{}\'...'.format(dataset_file_name)
)
68 | 
# Fetch every array once up front instead of indexing the dataset container
# again in each loop iteration.
R = dataset['R']
z = dataset['z']
F = dataset['F']

# Energies and lattice vectors are optional fields of a dataset.
E = dataset['E'] if 'E' in dataset else None
lattice = dataset['lattice'] if 'lattice' in dataset else None

try:
    with open(dataset_file_name, 'w') as file:

        n = R.shape[0]
        for i, r in enumerate(R):

            e = np.squeeze(E[i]) if E is not None else None
            f = F[i, :, :]
            ext_xyz_str = io.generate_xyz_str(r, z, e=e, f=f, lattice=lattice) + '\n'

            file.write(ext_xyz_str)

            # Report progress as "points written so far" out of n. Counting
            # from one avoids a total of zero when the dataset holds a single
            # point.
            ui.callback(i + 1, n, disp_str='Exporting %d data points...' % n)

except IOError:
    sys.exit("ERROR: Writing xyz file failed.")

print()
94 | 


--------------------------------------------------------------------------------
/scripts/sgdml_datasets_from_model.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2018 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from __future__ import print_function
 26 | 
 27 | import argparse
 28 | import os
 29 | import sys
 30 | 
 31 | import numpy as np
 32 | 
 33 | from sgdml.utils import io, ui
 34 | 
 35 | parser = argparse.ArgumentParser(
 36 |     description='Extracts the training and test data subsets from a dataset that were used to construct a model.'
 37 | )
 38 | parser.add_argument(
 39 |     'model',
 40 |     metavar='<model_file>',
 41 |     type=lambda x: io.is_file_type(x, 'model'),
 42 |     help='path to model file',
 43 | )
 44 | parser.add_argument(
 45 |     'dataset',
 46 |     metavar='<dataset_file>',
 47 |     type=lambda x: io.is_file_type(x, 'dataset'),
 48 |     help='path to dataset file referenced in model',
 49 | )
 50 | parser.add_argument(
 51 |     '-o',
 52 |     '--overwrite',
 53 |     dest='overwrite',
 54 |     action='store_true',
 55 |     help='overwrite existing files',
 56 | )
 57 | args = parser.parse_args()
 58 | 
 59 | model_path, model = args.model
 60 | dataset_path, dataset = args.dataset
 61 | 
 62 | 
 63 | for s in ['train', 'valid']:
 64 | 
 65 |     if dataset['md5'] != model['md5_' + s]:
 66 |         sys.exit(
 67 |             ui.fail_str('[FAIL]')
 68 |             + ' Dataset fingerprint does not match the one referenced in model for \'%s\'.'
 69 |             % s
 70 |         )
 71 | 
 72 |     idxs = model['idxs_' + s]
 73 |     R = dataset['R'][idxs, :, :]
 74 |     E = dataset['E'][idxs]
 75 |     F = dataset['F'][idxs, :, :]
 76 | 
 77 |     base_vars = {
 78 |         'type': 'd',
 79 |         'name': dataset['name'].astype(str),
 80 |         'theory': dataset['theory'].astype(str),
 81 |         'z': dataset['z'],
 82 |         'R': R,
 83 |         'E': E,
 84 |         'F': F,
 85 |     }
 86 |     base_vars['md5'] = io.dataset_md5(base_vars)
 87 | 
 88 |     subset_file_name = '%s_%s.npz' % (
 89 |         os.path.splitext(os.path.basename(dataset_path))[0],
 90 |         s,
 91 |     )
 92 |     file_exists = os.path.isfile(subset_file_name)
 93 |     if file_exists and args.overwrite:
 94 |         print(ui.info_str('[INFO]') + ' Overwriting existing model file.')
 95 |     if not file_exists or args.overwrite:
 96 |         np.savez_compressed(subset_file_name, **base_vars)
 97 |         ui.callback(1, disp_str='Extracted %s dataset saved to \'%s\'' % (s, subset_file_name)) # DONE
 98 |     else:
 99 |         print(
100 |             ui.warn_str('[WARN]')
101 |             + ' %s dataset \'%s\' already exists.' % (s.capitalize(), subset_file_name)
102 |             + '\n       Run \'python %s -o %s %s\' to overwrite.\n'
103 |             % (os.path.basename(__file__), model_path, dataset_path)
104 |         )
105 |         sys.exit()
106 | 


--------------------------------------------------------------------------------
/sgdml/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2019-2025 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
# Package version. 'setup.py' extracts it from this file with a regular
# expression and 'sgdml-get' sends it to the download server, so keep it a
# plain quoted string literal.
__version__ = '1.0.3'

# Presumably the maximum width (in characters) of console output; it is not
# referenced in this file -- TODO confirm against 'utils/ui.py'.
MAX_PRINT_WIDTH = 100
# Indentation applied to continuation lines of log messages. It equals the
# width of a six-character level tag such as '[INFO]' plus the single space
# that separates tag and message in the log format string.
LOG_LEVELNAME_WIDTH = 7  # do not modify

# more descriptive callback status
# (presumably passed as progress value to 'ui.callback' -- verify against callers)
DONE = 1
NOT_DONE = 0
 33 | 
 34 | 
 35 | # Logging
 36 | 
 37 | import copy
 38 | import logging
 39 | import re
 40 | import textwrap
 41 | 
 42 | from .utils import ui
 43 | 
 44 | 
 45 | class ColoredFormatter(logging.Formatter):
 46 | 
 47 |     LEVEL_COLORS = {
 48 |         'DEBUG': (ui.CYAN, ui.BLACK),
 49 |         'INFO': (ui.WHITE, ui.BLACK),
 50 |         'DONE': (ui.GREEN, ui.BLACK),
 51 |         'WARNING': (ui.YELLOW, ui.BLACK),
 52 |         'ERROR': (ui.RED, ui.BLACK),
 53 |         'CRITICAL': (ui.BLACK, ui.RED),
 54 |     }
 55 | 
 56 |     LEVEL_NAMES = {
 57 |         'DEBUG': '[DEBG]',
 58 |         'INFO': '[INFO]',
 59 |         'DONE': '[DONE]',
 60 |         'WARNING': '[WARN]',
 61 |         'ERROR': '[FAIL]',
 62 |         'CRITICAL': '[CRIT]',
 63 |     }
 64 | 
 65 |     def __init__(self, msg, use_color=True):
 66 | 
 67 |         logging.Formatter.__init__(self, msg)
 68 |         self.use_color = use_color
 69 | 
 70 |     def format(self, record):
 71 | 
 72 |         _record = copy.copy(record)
 73 |         levelname = _record.levelname
 74 |         msg = _record.msg
 75 | 
 76 |         levelname = ui.color_str(
 77 |             self.LEVEL_NAMES[levelname],
 78 |             self.LEVEL_COLORS[levelname][0],
 79 |             self.LEVEL_COLORS[levelname][1],
 80 |             bold=True,
 81 |         )
 82 | 
 83 |         if _record.levelname != 'CRITICAL':
 84 |             # wrap long messages (except for critical [i.e. exceptions, since they print a formatted traceback string])
 85 |             msg = ui.wrap_str(msg)
 86 | 
 87 |         # indent multiline strings after the first line
 88 |         msg = ui.indent_str(msg, LOG_LEVELNAME_WIDTH)[LOG_LEVELNAME_WIDTH:]
 89 | 
 90 |         _record.levelname = levelname
 91 |         _record.msg = msg
 92 |         return logging.Formatter.format(self, _record)
 93 | 
 94 | 
 95 | class ColoredLogger(logging.Logger):
 96 |     def __init__(self, name):
 97 | 
 98 |         logging.Logger.__init__(self, name, logging.DEBUG)
 99 | 
100 |         # add 'DONE' logging level
101 |         logging.DONE = logging.INFO + 1
102 |         logging.addLevelName(logging.DONE, 'DONE')
103 | 
104 |         # only display levelname and message
105 |         formatter = ColoredFormatter('%(levelname)s %(message)s')
106 | 
107 |         # this handler will write to sys.stderr by default
108 |         hd = logging.StreamHandler()
109 |         hd.setFormatter(formatter)
110 |         hd.setLevel(
111 |             logging.INFO
112 |         ) # control logging level here
113 | 
114 |         self.addHandler(hd)
115 |         return
116 | 
117 |     def done(self, msg, *args, **kwargs):
118 | 
119 |         if self.isEnabledFor(logging.DONE):
120 |             self._log(logging.DONE, msg, args, **kwargs)
121 | 
122 | 
123 | logging.setLoggerClass(ColoredLogger)
124 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Symmetric Gradient Domain Machine Learning (sGDML)
  2 | 
  3 | For more details visit: [sgdml.org](http://sgdml.org/)  
  4 | Documentation can be found here: [docs.sgdml.org](http://docs.sgdml.org/)
  5 | 
  6 | #### Requirements:
  7 | - Python 3.7+
  8 | - PyTorch (>=1.8)
  9 | - NumPy (>=1.19)
 10 | - SciPy (>=1.1)
 11 | 
 12 | #### Optional:
 13 | - ASE (>=3.16.2) (to run atomistic simulations)
 14 | 
 15 | ## Getting started
 16 | 
 17 | ### Stable release
 18 | 
 19 | Most systems come with the default package manager for Python ``pip`` already preinstalled. Install ``sgdml`` by simply calling:
 20 | 
 21 | ```
 22 | $ pip install sgdml
 23 | ```
 24 | 
 25 | The ``sgdml`` command-line interface and the corresponding Python API can now be used from anywhere on the system.
 26 | 
 27 | ### Development version
 28 | 
 29 | #### (1) Clone the repository
 30 | 
 31 | ```
 32 | $ git clone https://github.com/stefanch/sGDML.git
 33 | $ cd sGDML
 34 | ```
 35 | 
 36 | ...or update your existing local copy with
 37 | 
 38 | ```
 39 | $ git pull origin master
 40 | ```
 41 | 
 42 | #### (2) Install
 43 | 
 44 | ```
 45 | $ pip install -e .
 46 | ```
 47 | 
Using the flag ``--user``, you can tell ``pip`` to install the package to the current user's home directory, instead of system-wide. This option might require you to update your system's ``PATH`` variable accordingly.
 49 | 
 50 | 
 51 | ### Optional dependencies
 52 | 
 53 | Some functionality of this package relies on third-party libraries that are not installed by default. These optional dependencies (or "package extras") are specified during installation using the "square bracket syntax":
 54 | 
 55 | ```
 56 | $ pip install sgdml[<optional1>]
 57 | ```
 58 | 
 59 | #### Atomic Simulation Environment (ASE)
 60 | 
 61 | If you are interested in interfacing with [ASE](https://wiki.fysik.dtu.dk/ase/) to perform atomistic simulations (see [here](http://docs.sgdml.org/applications.html) for examples), use the ``ase`` keyword:
 62 | 
 63 | ```
 64 | $ pip install sgdml[ase]
 65 | ```
 66 | 
 67 | ## Reconstruct your first force field
 68 | 
 69 | Download one of the example datasets:
 70 | 
 71 | ```
 72 | $ sgdml-get dataset ethanol_dft
 73 | ```
 74 | 
 75 | Train a force field model:
 76 | 
 77 | ```
 78 | $ sgdml all ethanol_dft.npz 200 1000 5000
 79 | ```
 80 | 
 81 | ## Query a force field
 82 | 
 83 | ```python
 84 | import numpy as np
 85 | from sgdml.predict import GDMLPredict
 86 | from sgdml.utils import io
 87 | 
 88 | r,_ = io.read_xyz('geometries/ethanol.xyz') # 9 atoms
 89 | print(r.shape) # (1,27)
 90 | 
 91 | model = np.load('models/ethanol.npz')
 92 | gdml = GDMLPredict(model)
 93 | e,f = gdml.predict(r)
 94 | print(e.shape) # (1,)
 95 | print(f.shape) # (1,27)
 96 | ```
 97 | 
 98 | ## Authors
 99 | 
100 | * Stefan Chmiela
101 | * Jan Hermann
102 | 
103 | We appreciate and welcome contributions and would like to thank the following people for participating in this project:
104 | 
105 | * Huziel Sauceda
106 | * Igor Poltavsky
107 | * Luis Gálvez
108 | * Danny Panknin
109 | * Grégory Fonseca
110 | * Anton Charkin-Gorbulin
111 | 
112 | ## References
113 | 
114 | * [1] Chmiela, S., Tkatchenko, A., Sauceda, H. E., Poltavsky, I., Schütt, K. T., Müller, K.-R.,
115 | *Machine Learning of Accurate Energy-conserving Molecular Force Fields.*
116 | Science Advances, 3(5), e1603015 (2017)   
117 | [10.1126/sciadv.1603015](http://dx.doi.org/10.1126/sciadv.1603015)
118 | 
119 | * [2] Chmiela, S., Sauceda, H. E., Müller, K.-R., Tkatchenko, A.,
120 | *Towards Exact Molecular Dynamics Simulations with Machine-Learned Force Fields.*
121 | Nature Communications, 9(1), 3887 (2018)   
122 | [10.1038/s41467-018-06169-2](https://doi.org/10.1038/s41467-018-06169-2)
123 | 
124 | * [3] Chmiela, S., Sauceda, H. E., Poltavsky, I., Müller, K.-R., Tkatchenko, A.,
125 | *sGDML: Constructing Accurate and Data Efficient Molecular Force Fields Using Machine Learning.*
126 | Computer Physics Communications, 240, 38-45 (2019)
127 | [10.1016/j.cpc.2019.02.007](https://doi.org/10.1016/j.cpc.2019.02.007)
128 | 
129 | * [4] Chmiela, S., Vassilev-Galindo, V., Unke, O. T., Kabylda, A., Sauceda, H. E., Tkatchenko, A., Müller, K.-R.,
130 | *Accurate Global Machine Learning Force Fields for Molecules With Hundreds of Atoms.*
Science Advances, 9(2), eadf0873 (2023)
132 | [10.1126/sciadv.adf0873](https://doi.org/10.1126/sciadv.adf0873)


--------------------------------------------------------------------------------
/sgdml/intf/ase_calc.py:
--------------------------------------------------------------------------------
  1 | # MIT License
  2 | #
  3 | # Copyright (c) 2018-2020 Stefan Chmiela
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | #
 12 | # The above copyright notice and this permission notice shall be included in all
 13 | # copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | import logging
 24 | import numpy as np
 25 | 
 26 | try:
 27 |     from ase.calculators.calculator import Calculator
 28 |     from ase.units import kcal, mol
 29 | except ImportError:
 30 |     raise ImportError(
 31 |         'Optional ASE dependency not found! Please run \'pip install sgdml[ase]\' to install it.'
 32 |     )
 33 | 
 34 | from ..predict import GDMLPredict
 35 | 
 36 | 
 37 | class SGDMLCalculator(Calculator):
 38 | 
 39 |     implemented_properties = ['energy', 'forces']
 40 | 
 41 |     def __init__(
 42 |         self,
 43 |         model_path,
 44 |         E_to_eV=kcal / mol,
 45 |         F_to_eV_Ang=kcal / mol,
 46 |         use_torch=False,
 47 |         *args,
 48 |         **kwargs
 49 |     ):
 50 |         """
 51 |         ASE calculator for the sGDML force field.
 52 | 
 53 |         A calculator takes atomic numbers and atomic positions from an Atoms object and calculates the energy and forces.
 54 | 
 55 |         Note
 56 |         ----
 57 |         ASE uses eV and Angstrom as energy and length unit, respectively. Unless the paramerters `E_to_eV` and `F_to_eV_Ang` are specified, the sGDML model is assumed to use kcal/mol and Angstorm and the appropriate conversion factors are set accordingly.
 58 |         Here is how to find them: `ASE units <https://wiki.fysik.dtu.dk/ase/ase/units.html>`_.
 59 | 
 60 |         Parameters
 61 |         ----------
 62 |                 model_path : :obj:`str`
 63 |                         Path to a sGDML model file
 64 |                 E_to_eV : float, optional
 65 |                         Conversion factor from whatever energy unit is used by the model to eV. By default this parameter is set to convert from kcal/mol.
 66 |                 F_to_eV_Ang : float, optional
 67 |                         Conversion factor from whatever length unit is used by the model to Angstrom. By default, the length unit is not converted (assumed to be in Angstrom)
 68 |                 use_torch : boolean, optional
 69 |                         Use PyTorch to calculate predictions
 70 |         """
 71 | 
 72 |         super(SGDMLCalculator, self).__init__(*args, **kwargs)
 73 | 
 74 |         self.log = logging.getLogger(__name__)
 75 | 
 76 |         model = np.load(model_path, allow_pickle=True)
 77 |         self.gdml_predict = GDMLPredict(model, use_torch=use_torch)
 78 |         self.gdml_predict.prepare_parallel(n_bulk=1)
 79 | 
 80 |         self.log.warning(
 81 |             'Please remember to specify the proper conversion factors, if your model does not use \'kcal/mol\' and \'Ang\' as units.'
 82 |         )
 83 | 
 84 |         # Converts energy from the unit used by the sGDML model to eV.
 85 |         self.E_to_eV = E_to_eV
 86 | 
 87 |         # Converts length from eV to unit used in sGDML model.
 88 |         self.Ang_to_R = F_to_eV_Ang / E_to_eV
 89 | 
 90 |         # Converts force from the unit used by the sGDML model to eV/Ang.
 91 |         self.F_to_eV_Ang = F_to_eV_Ang
 92 | 
 93 |     def calculate(self, atoms=None, *args, **kwargs):
 94 | 
 95 |         super(SGDMLCalculator, self).calculate(atoms, *args, **kwargs)
 96 | 
 97 |         # convert model units to ASE default units
 98 |         r = np.array(atoms.get_positions()) * self.Ang_to_R
 99 | 
100 |         e, f = self.gdml_predict.predict(r.ravel())
101 | 
102 |         # convert model units to ASE default units (eV and Ang)
103 |         e *= self.E_to_eV
104 |         f *= self.F_to_eV_Ang
105 | 
106 |         self.results = {'energy': e, 'forces': f.reshape(-1, 3)}
107 | 


--------------------------------------------------------------------------------
/sgdml/get.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2018-2023 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from __future__ import print_function
 26 | 
 27 | import argparse
 28 | import os
 29 | import re
 30 | import sys
 31 | 
 32 | from . import __version__
 33 | from .utils import ui
 34 | 
 35 | if sys.version[0] == '3':
 36 |     raw_input = input
 37 | 
 38 | try:
 39 |     from urllib.request import urlopen
 40 | except ImportError:
 41 |     from urllib2 import urlopen
 42 | 
 43 | 
 44 | def download(command, file_name):
 45 | 
 46 |     base_url = 'http://www.quantum-machine.org/gdml/' + (
 47 |         'data/npz/' if command == 'dataset' else 'models/'
 48 |     )
 49 |     request = urlopen(base_url + file_name)
 50 |     file = open(file_name, 'wb')
 51 |     filesize = int(request.headers['Content-Length'])
 52 | 
 53 |     size = 0
 54 |     block_sz = 1024
 55 |     while True:
 56 |         buffer = request.read(block_sz)
 57 |         if not buffer:
 58 |             break
 59 |         size += len(buffer)
 60 |         file.write(buffer)
 61 | 
 62 |         ui.callback(
 63 |             size,
 64 |             filesize,
 65 |             disp_str='Downloading: {}'.format(file_name),
 66 |             sec_disp_str='{:,} bytes'.format(filesize),
 67 |         )
 68 |     file.close()
 69 | 
 70 | 
 71 | def main():
 72 | 
 73 |     base_url = 'http://www.quantum-machine.org/gdml/'
 74 | 
 75 |     parser = argparse.ArgumentParser()
 76 | 
 77 |     parent_parser = argparse.ArgumentParser(add_help=False)
 78 |     parent_parser.add_argument(
 79 |         '-o',
 80 |         '--overwrite',
 81 |         dest='overwrite',
 82 |         action='store_true',
 83 |         help='overwrite existing files',
 84 |     )
 85 | 
 86 |     subparsers = parser.add_subparsers(title='commands', dest='command')
 87 |     subparsers.required = True
 88 |     parser_dataset = subparsers.add_parser(
 89 |         'dataset', help='download benchmark dataset', parents=[parent_parser]
 90 |     )
 91 |     parser_model = subparsers.add_parser(
 92 |         'model', help='download pre-trained model', parents=[parent_parser]
 93 |     )
 94 | 
 95 |     for subparser in [parser_dataset, parser_model]:
 96 |         subparser.add_argument(
 97 |             'name',
 98 |             metavar='<name>',
 99 |             type=str,
100 |             help='item name',
101 |             nargs='?',
102 |             default=None,
103 |         )
104 | 
105 |     args = parser.parse_args()
106 | 
107 |     print("Contacting server (%s)..." % base_url)
108 | 
109 |     if args.name is not None:
110 | 
111 |         url = '%sget.php?version=%s&%s=%s' % (
112 |             base_url,
113 |             __version__,
114 |             args.command,
115 |             args.name,
116 |         )
117 |         response = urlopen(url)
118 |         match, score = response.read().decode().split(',')
119 |         response.close()
120 | 
121 |         if int(score) == 0 or ui.yes_or_no('Do you mean \'%s\'?' % match):
122 |             download(args.command, match + '.npz')
123 |             return
124 | 
125 |     response = urlopen(
126 |         '%sget.php?version=%s&%s' % (base_url, __version__, args.command)
127 |     )
128 |     line = response.readlines()
129 |     response.close()
130 | 
131 |     print()
132 |     print('Available %ss:' % args.command)
133 | 
134 |     print('{:<2} {:<31}    {:>4}'.format('ID', 'Name', 'Size'))
135 |     print('-' * 42)
136 | 
137 |     items = line[0].split(b';')
138 |     for i, item in enumerate(items):
139 |         name, size = item.split(b',')
140 |         size = int(size) / 1024**2  # Bytes to MBytes
141 | 
142 |         print('{:>2d} {:<30} {:>5.1f} MB'.format(i, name.decode("utf-8"), size))
143 |     print()
144 | 
145 |     down_list = raw_input(
146 |         'Please list which %ss to download (e.g. 0 1 2 6) or type \'all\': '
147 |         % args.command
148 |     )
149 |     down_idxs = []
150 |     if 'all' in down_list.lower():
151 |         down_idxs = list(range(len(items)))
152 |     elif re.match(
153 |         "^ *[0-9][0-9 ]*$", down_list
154 |     ):  # only digits and spaces, at least one digit
155 |         down_idxs = [int(idx) for idx in re.split(r'\s+', down_list.strip())]
156 |         down_idxs = list(set(down_idxs))
157 |     else:
158 |         print(ui.color_str('ABORTED', fore_color=ui.RED, bold=True))
159 | 
160 |     for idx in down_idxs:
161 |         if idx not in range(len(items)):
162 |             print(
163 |                 ui.color_str('[WARN]', fore_color=ui.YELLOW, bold=True)
164 |                 + ' Index '
165 |                 + str(idx)
166 |                 + ' out of range, skipping.'
167 |             )
168 |         else:
169 |             name = items[idx].split(b',')[0].decode("utf-8")
170 |             if os.path.exists(name):
171 |                 print("'%s' exists, skipping." % (name))
172 |                 continue
173 | 
174 |             download(args.command, name + '.npz')
175 | 
176 | 
177 | if __name__ == "__main__":
178 |     main()
179 | 


--------------------------------------------------------------------------------
/sgdml/solvers/analytic.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2020-2022 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import sys
 26 | import logging
 27 | import warnings
 28 | from functools import partial
 29 | 
 30 | import numpy as np
 31 | import scipy as sp
 32 | import timeit
 33 | 
 34 | from .. import DONE, NOT_DONE
 35 | 
 36 | 
 37 | class Analytic(object):
 38 |     def __init__(self, gdml_train, desc, callback=None):
 39 | 
 40 |         self.log = logging.getLogger(__name__)
 41 | 
 42 |         self.gdml_train = gdml_train
 43 |         self.desc = desc
 44 | 
 45 |         self.callback = callback
 46 | 
 47 |     # from memory_profiler import profile
 48 |     # @profile
 49 |     def solve(self, task, R_desc, R_d_desc, tril_perms_lin, y):
 50 | 
 51 |         sig = task['sig']
 52 |         lam = task['lam']
 53 |         use_E_cstr = task['use_E_cstr']
 54 | 
 55 |         n_train, dim_d = R_d_desc.shape[:2]
 56 |         n_atoms = int((1 + np.sqrt(8 * dim_d + 1)) / 2)
 57 |         dim_i = 3 * n_atoms
 58 | 
 59 |         if self.callback is not None:
 60 |             self.callback = partial(
 61 |                 self.callback,
 62 |                 disp_str='Assembling kernel matrix',
 63 |             )
 64 | 
 65 |         K = -self.gdml_train._assemble_kernel_mat(
 66 |             R_desc,
 67 |             R_d_desc,
 68 |             tril_perms_lin,
 69 |             sig,
 70 |             self.desc,
 71 |             use_E_cstr=use_E_cstr,
 72 |             callback=self.callback,
 73 |         )  # Flip sign to make convex
 74 | 
 75 |         start = timeit.default_timer()
 76 | 
 77 |         with warnings.catch_warnings():
 78 |             warnings.simplefilter('ignore')
 79 | 
 80 |             if K.shape[0] == K.shape[1]:
 81 | 
 82 |                 K[np.diag_indices_from(K)] += lam  # Regularize
 83 | 
 84 |                 if self.callback is not None:
 85 |                     self.callback = partial(
 86 |                         self.callback,
 87 |                         disp_str='Solving linear system (Cholesky factorization)',
 88 |                     )
 89 |                     self.callback(NOT_DONE)
 90 | 
 91 |                 try:
 92 | 
 93 |                     # Cholesky (do not overwrite K in case we need to retry)
 94 |                     L, lower = sp.linalg.cho_factor(
 95 |                         K, overwrite_a=False, check_finite=False
 96 |                     )
 97 |                     alphas = -sp.linalg.cho_solve(
 98 |                         (L, lower), y, overwrite_b=False, check_finite=False
 99 |                     )
100 | 
101 |                 except np.linalg.LinAlgError:  # Try a solver that makes less assumptions
102 | 
103 |                     if self.callback is not None:
104 |                         self.callback = partial(
105 |                             self.callback,
106 |                             disp_str='Solving linear system (LU factorization)      ',  # Keep whitespaces!
107 |                         )
108 |                         self.callback(NOT_DONE)
109 | 
110 |                     try:
111 |                         # LU
112 |                         alphas = -sp.linalg.solve(
113 |                             K, y, overwrite_a=True, overwrite_b=True, check_finite=False
114 |                         )
115 |                     except MemoryError:
116 |                         self.log.critical(
117 |                             'Not enough memory to train this system using a closed form solver.'
118 |                         )
119 |                         print()
120 |                         os._exit(1)
121 | 
122 |                 except MemoryError:
123 |                     self.log.critical(
124 |                         'Not enough memory to train this system using a closed form solver.'
125 |                     )
126 |                     print()
127 |                     os._exit(1)
128 |             else:
129 | 
130 |                 if self.callback is not None:
131 |                     self.callback = partial(
132 |                         self.callback,
133 |                         disp_str='Solving over-determined linear system (least squares approximation)',
134 |                     )
135 |                     self.callback(NOT_DONE)
136 | 
137 |                 # Least squares for non-square K
138 |                 alphas = -np.linalg.lstsq(K, y, rcond=-1)[0]
139 | 
140 |         stop = timeit.default_timer()
141 | 
142 |         if self.callback is not None:
143 |             dur_s = stop - start
144 |             sec_disp_str = 'took {:.1f} s'.format(dur_s) if dur_s >= 0.1 else ''
145 |             self.callback(
146 |                 DONE,
147 |                 disp_str='Training on {:,} points'.format(n_train),
148 |                 sec_disp_str=sec_disp_str,
149 |             )
150 | 
151 |         return alphas
152 | 
153 |     @staticmethod
154 |     def est_memory_requirement(n_train, n_atoms):
155 | 
156 |         est_bytes = 3 * (n_train * 3 * n_atoms) ** 2 * 8  # K + factor(s) of K
157 |         est_bytes += (n_train * 3 * n_atoms) * 8  # alpha
158 | 
159 |         return est_bytes
160 | 


--------------------------------------------------------------------------------
/scripts/sgdml_dataset_from_aims.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2018-2022 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from __future__ import print_function
 26 | 
 27 | import argparse
 28 | import os
 29 | import sys
 30 | 
 31 | import numpy as np
 32 | 
 33 | from sgdml.utils import io, ui
 34 | 
 35 | 
 36 | def read_reference_data(f):  # noqa C901
 37 |     eV_to_kcalmol = 0.036749326 / 0.0015946679
 38 | 
 39 |     e_next, f_next, geo_next = False, False, False
 40 |     n_atoms = None
 41 |     R, z, E, F = [], [], [], []
 42 | 
 43 |     geo_idx = 0
 44 |     for line in f:
 45 |         if n_atoms:
 46 |             cols = line.split()
 47 |             if e_next:
 48 |                 E.append(float(cols[5]))
 49 |                 e_next = False
 50 |             elif f_next:
 51 |                 a = int(cols[1]) - 1
 52 |                 F.append(list(map(float, cols[2:5])))
 53 |                 if a == n_atoms - 1:
 54 |                     f_next = False
 55 |             elif geo_next:
 56 |                 if 'atom' in cols:
 57 |                     a_count += 1  # noqa: F821
 58 |                     R.append(list(map(float, cols[1:4])))
 59 | 
 60 |                     if geo_idx == 0:
 61 |                         z.append(io._z_str_to_z_dict[cols[4]])
 62 | 
 63 |                     if a_count == n_atoms:
 64 |                         geo_next = False
 65 |                         geo_idx += 1
 66 |             elif 'Energy and forces in a compact form:' in line:
 67 |                 e_next = True
 68 |             elif 'Total atomic forces (unitary forces cleaned) [eV/Ang]:' in line:
 69 |                 f_next = True
 70 |             elif (
 71 |                 'Atomic structure (and velocities) as used in the preceding time step:'
 72 |                 in line
 73 |             ):
 74 |                 geo_next = True
 75 |                 a_count = 0
 76 |         elif 'The structure contains' in line and 'atoms,  and a total of' in line:
 77 |             n_atoms = int(line.split()[3])
 78 |             print('Number atoms per geometry:      {:>7d}'.format(n_atoms))
 79 |             continue
 80 | 
 81 |         if geo_idx > 0 and geo_idx % 1000 == 0:
 82 |             sys.stdout.write("\rNumber geometries found so far: {:>7d}".format(geo_idx))
 83 |             sys.stdout.flush()
 84 |     sys.stdout.write("\rNumber geometries found so far: {:>7d}".format(geo_idx))
 85 |     sys.stdout.flush()
 86 |     print(
 87 |         '\n'
 88 |         + ui.color_str('[INFO]', bold=True)
 89 |         + ' Energies and forces have been converted from eV to kcal/mol(/Ang)'
 90 |     )
 91 | 
 92 |     R = np.array(R).reshape(-1, n_atoms, 3)
 93 |     z = np.array(z)
 94 |     E = np.array(E) * eV_to_kcalmol
 95 |     F = np.array(F).reshape(-1, n_atoms, 3) * eV_to_kcalmol
 96 | 
 97 |     f.close()
 98 |     return (R, z, E, F)
 99 | 
100 | 
# Command line interface: one input file and an optional overwrite switch.
parser = argparse.ArgumentParser(description='Creates a dataset from FHI-aims format.')
parser.add_argument(
    'dataset',
    metavar='<dataset>',
    type=argparse.FileType('r'),
    help='path to xyz dataset file',
)
parser.add_argument(
    '-o',
    '--overwrite',
    dest='overwrite',
    action='store_true',
    help='overwrite existing dataset file',
)
args = parser.parse_args()
dataset = args.dataset

# The output file is named after the input file and written to the working directory.
name, _ = os.path.splitext(os.path.basename(dataset.name))
dataset_file_name = name + '.npz'

# An existing output file is only replaced on explicit request.
if os.path.isfile(dataset_file_name):
    if not args.overwrite:
        sys.exit(
            ui.color_str('[FAIL]', fore_color=ui.RED, bold=True) + ' Dataset \'%s\' already exists.' % dataset_file_name
        )
    print(ui.color_str('[INFO]', bold=True) + ' Overwriting existing dataset file.')
print('Writing dataset to \'%s\'...' % dataset_file_name)

R, z, E, F = read_reference_data(dataset)

# Prune all arrays to same length.
n_mols = min(R.shape[0], F.shape[0], E.shape[0])
if not (R.shape[0] == F.shape[0] == E.shape[0]):
    print(
        ui.color_str('[WARN]', fore_color=ui.YELLOW, bold=True)
        + ' Incomplete output detected: Final dataset was pruned to %d points.' % n_mols
    )
R = R[:n_mols]
F = F[:n_mols]
E = E[:n_mols]

# Base variables contained in every model file.
base_vars = {
    'type': 'd',
    'R': R,
    'z': z,
    'E': E[:, None],
    'F': F,
    'e_unit': 'kcal/mol',
    'r_unit': 'Ang',
    'name': name,
    'theory': 'unknown',
}

# Summary statistics of the force and energy labels (forces first).
for label, values in (('F', F), ('E', E)):
    base_vars[label + '_min'] = values.min()
    base_vars[label + '_max'] = values.max()
    base_vars[label + '_mean'] = values.mean()
    base_vars[label + '_var'] = values.var()

base_vars['md5'] = io.dataset_md5(base_vars)

np.savez_compressed(dataset_file_name, **base_vars)
print(ui.color_str('DONE', fore_color=ui.GREEN, bold=True))
167 | 


--------------------------------------------------------------------------------
/scripts/sgdml_dataset_from_ipi.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2018 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from __future__ import print_function
 26 | 
 27 | import argparse
 28 | import os
 29 | import sys
 30 | 
 31 | import numpy as np
 32 | 
 33 | from sgdml.utils import io, ui
 34 | 
 35 | 
 36 | def raw_input_float(prompt):
 37 |     while True:
 38 |         try:
 39 |             return float(input(prompt))
 40 |         except ValueError:
 41 |             print(ui.color_str('[FAIL]', fore_color=ui.RED, bold=True) + ' That is not a valid float.')
 42 | 
 43 | 
 44 | # Assumes that the atoms in each molecule are in the same order.
 45 | def read_concat_xyz(f):
 46 |     n_atoms = None
 47 | 
 48 |     R, z = [], []
 49 |     for i, line in enumerate(f):
 50 |         line = line.strip()
 51 |         if not n_atoms:
 52 |             n_atoms = int(line)
 53 |             print('Number atoms per geometry:      {:>7d}'.format(n_atoms))
 54 | 
 55 |         file_i, line_i = divmod(i, n_atoms + 2)
 56 | 
 57 |         cols = line.split()
 58 |         if line_i >= 2:
 59 |             if file_i == 0:  # first molecule
 60 |                 z.append(io._z_str_to_z_dict[cols[0]])
 61 |             R.append(list(map(float, cols[1:4])))
 62 | 
 63 |         if file_i % 1000 == 0:
 64 |             sys.stdout.write("\rNumber geometries found so far: {:>7d}".format(file_i))
 65 |             sys.stdout.flush()
 66 |     sys.stdout.write("\rNumber geometries found so far: {:>7d}\n".format(file_i))
 67 |     sys.stdout.flush()
 68 | 
 69 |     # Only keep complete entries.
 70 |     R = R[: int(n_atoms * np.floor(len(R) / float(n_atoms)))]
 71 | 
 72 |     R = np.array(R).reshape(-1, n_atoms, 3)
 73 |     z = np.array(z)
 74 | 
 75 |     f.close()
 76 |     return (R, z)
 77 | 
 78 | 
 79 | def read_out_file(f, col):
 80 | 
 81 |     E = []
 82 |     for i, line in enumerate(f):
 83 |         line = line.strip()
 84 |         if line[0] != '#':  # Ignore comments.
 85 |             E.append(float(line.split()[col]))
 86 |         if i % 1000 == 0:
 87 |             sys.stdout.write("\rNumber lines processed so far:  {:>7d}".format(len(E)))
 88 |             sys.stdout.flush()
 89 |     sys.stdout.write("\rNumber lines processed so far:  {:>7d}\n".format(len(E)))
 90 |     sys.stdout.flush()
 91 | 
 92 |     return np.array(E)
 93 | 
 94 | 
 95 | parser = argparse.ArgumentParser(
 96 |     description='Creates a dataset from extended [TODO] format.'
 97 | )
 98 | parser.add_argument(
 99 |     'geometries',
100 |     metavar='<geometries>',
101 |     type=argparse.FileType('r'),
102 |     help='path to XYZ geometry file',
103 | )
104 | parser.add_argument(
105 |     'forces',
106 |     metavar='<forces>',
107 |     type=argparse.FileType('r'),
108 |     help='path to XYZ force file',
109 | )
110 | parser.add_argument(
111 |     'energies',
112 |     metavar='<energies>',
113 |     type=argparse.FileType('r'),
114 |     help='path to CSV force file',
115 | )
116 | parser.add_argument(
117 |     'energy_col',
118 |     metavar='<energy_col>',
119 |     type=lambda x: io.is_strict_pos_int(x),
120 |     help='which column to parse from energy file (zero based)',
121 |     nargs='?',
122 |     default=0,
123 | )
124 | parser.add_argument(
125 |     '-o',
126 |     '--overwrite',
127 |     dest='overwrite',
128 |     action='store_true',
129 |     help='overwrite existing dataset file',
130 | )
131 | args = parser.parse_args()
132 | geometries = args.geometries
133 | forces = args.forces
134 | energies = args.energies
135 | energy_col = args.energy_col
136 | 
137 | name = os.path.splitext(os.path.basename(geometries.name))[0]
138 | dataset_file_name = name + '.npz'
139 | 
140 | dataset_exists = os.path.isfile(dataset_file_name)
141 | if dataset_exists and args.overwrite:
142 |     print(ui.color_str('[INFO]', bold=True) + ' Overwriting existing dataset file.')
143 | if not dataset_exists or args.overwrite:
144 |     print('Writing dataset to \'%s\'...' % dataset_file_name)
145 | else:
146 |     sys.exit(
147 |         ui.color_str('[FAIL]', fore_color=ui.RED, bold=True) + ' Dataset \'%s\' already exists.' % dataset_file_name
148 |     )
149 | 
150 | 
151 | print('Reading geometries...')
152 | R, z = read_concat_xyz(geometries)
153 | 
154 | print('Reading forces...')
155 | F, _ = read_concat_xyz(forces)
156 | 
157 | print('Reading energies from column %d...' % energy_col)
158 | E = read_out_file(energies, energy_col)
159 | 
160 | # Prune all arrays to same length.
161 | n_mols = min(min(R.shape[0], F.shape[0]), E.shape[0])
162 | if n_mols != R.shape[0] or n_mols != F.shape[0] or n_mols != E.shape[0]:
163 |     print(
164 |         ui.color_str('[WARN]', fore_color=ui.YELLOW, bold=True)
165 |         + ' Incomplete output detected: Final dataset was pruned to %d points.' % n_mols
166 |     )
167 | R = R[:n_mols, :, :]
168 | F = F[:n_mols, :, :]
169 | E = E[:n_mols]
170 | 
171 | print(
172 |     ui.color_str('[INFO]', bold=True)
173 |     + ' Geometries, forces and energies must have consistent units.'
174 | )
175 | R_conv_fact = raw_input_float('Unit conversion factor for geometries: ')
176 | R = R * R_conv_fact
177 | F_conv_fact = raw_input_float('Unit conversion factor for forces: ')
178 | F = F * F_conv_fact
179 | E_conv_fact = raw_input_float('Unit conversion factor for energies: ')
180 | E = E * E_conv_fact
181 | 
182 | # Base variables contained in every model file.
183 | base_vars = {
184 |     'type': 'd',
185 |     'R': R,
186 |     'z': z,
187 |     'E': E[:, None],
188 |     'F': F,
189 |     'name': name,
190 |     'theory': 'unknown',
191 | }
192 | base_vars['md5'] = io.dataset_md5(base_vars)
193 | 
194 | np.savez_compressed(dataset_file_name, **base_vars)
195 | ui.color_str('[DONE]', fore_color=ui.GREEN, bold=True)
196 | 


--------------------------------------------------------------------------------
/scripts/sgdml_dataset_via_ase.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2018-2022 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from __future__ import print_function
 26 | 
 27 | import argparse
 28 | import os
 29 | import sys
 30 | 
 31 | try:
 32 |     from ase.io import read
 33 | except ImportError:
 34 |     raise ImportError('Optional ASE dependency not found! Please run \'pip install sgdml[ase]\' to install it.')
 35 | 
 36 | import numpy as np
 37 | 
 38 | from sgdml import __version__
 39 | from sgdml.utils import io, ui
 40 | 
 41 | if sys.version[0] == '3':
 42 |     raw_input = input
 43 | 
 44 | 
 45 | parser = argparse.ArgumentParser(
 46 |     description='Creates a dataset from any input format supported by ASE.'
 47 | )
 48 | parser.add_argument(
 49 |     'dataset',
 50 |     metavar='<dataset>',
 51 |     type=argparse.FileType('r'),
 52 |     help='path to input dataset file',
 53 | )
 54 | parser.add_argument(
 55 |     '-o',
 56 |     '--overwrite',
 57 |     dest='overwrite',
 58 |     action='store_true',
 59 |     help='overwrite existing dataset file',
 60 | )
 61 | args = parser.parse_args()
 62 | dataset = args.dataset
 63 | 
 64 | 
 65 | name = os.path.splitext(os.path.basename(dataset.name))[0]
 66 | dataset_file_name = name + '.npz'
 67 | 
 68 | dataset_exists = os.path.isfile(dataset_file_name)
 69 | if dataset_exists and args.overwrite:
 70 |     print(ui.color_str('[INFO]', bold=True) + ' Overwriting existing dataset file.')
 71 | if not dataset_exists or args.overwrite:
 72 |     print('Writing dataset to \'{}\'...'.format(dataset_file_name))
 73 | else:
 74 |     sys.exit(
 75 |         ui.color_str('[FAIL]', fore_color=ui.RED, bold=True)
 76 |         + ' Dataset \'{}\' already exists.'.format(dataset_file_name)
 77 |     )
 78 | 
 79 | mols = read(dataset.name, index=':')
 80 | 
 81 | # filter incomplete outputs from trajectory
 82 | mols = [mol for mol in mols if mol.get_calculator() is not None]
 83 | 
 84 | lattice, R, z, E, F = None, None, None, None, None
 85 | 
 86 | calc = mols[0].get_calculator()
 87 | 
 88 | print("\rNumber geometries: {:,}".format(len(mols)))
 89 | #print("\rAvailable properties: " + ', '.join(calc.results))
 90 | print()
 91 | 
 92 | if 'forces' not in calc.results:
 93 |     sys.exit(
 94 |         ui.color_str('[FAIL]', fore_color=ui.RED, bold=True)
 95 |         + ' Forces are missing in the input file!'
 96 |     )
 97 | 
 98 | lattice = np.array(mols[0].get_cell().T)
 99 | if not np.any(lattice):
100 |     print(
101 |         ui.color_str('[INFO]', bold=True)
102 |         + ' No lattice vectors specified.'
103 |     )
104 |     lattice = None
105 | 
106 | Z = np.array([mol.get_atomic_numbers() for mol in mols])
107 | all_z_the_same = (Z == Z[0]).all()
108 | if not all_z_the_same:
109 |     sys.exit(
110 |         ui.color_str('[FAIL]', fore_color=ui.RED, bold=True)
111 |         + ' Order of atoms changes accross dataset.'
112 |     )
113 | 
114 | R = np.array([mol.get_positions() for mol in mols])
115 | z = Z[0]
116 | 
117 | if 'Energy' in mols[0].info:
118 |     E = np.array([float(mol.info['Energy']) for mol in mols])
119 | else:
120 |     E = np.array([mol.get_potential_energy() for mol in mols])
121 | F = np.array([mol.get_forces() for mol in mols])
122 | 
123 | print('Please provide a name for this dataset. Otherwise the original filename will be reused.')
124 | custom_name = raw_input('> ').strip()
125 | if custom_name != '':
126 |     name = custom_name
127 | 
128 | print('Please provide a descriptor for the level of theory used to create this dataset.')
129 | theory = raw_input('> ').strip()
130 | if theory == '':
131 |     theory = 'unknown'
132 | 
133 | # Base variables contained in every model file.
134 | base_vars = {
135 |     'type': 'd',
136 |     'code_version': __version__,
137 |     'name': name,
138 |     'theory': theory,
139 |     'R': R,
140 |     'z': z,
141 |     'F': F,
142 | }
143 | 
144 | base_vars['F_min'], base_vars['F_max'] = np.min(F.ravel()), np.max(F.ravel())
145 | base_vars['F_mean'], base_vars['F_var'] = np.mean(F.ravel()), np.var(F.ravel())
146 | 
147 | print('If you want to convert your original length unit, please provide a conversion factor (default: 1.0): ')
148 | R_to_new_unit = raw_input('> ').strip()
149 | if R_to_new_unit != '':
150 |     R_to_new_unit = float(R_to_new_unit)
151 | else:
152 |     R_to_new_unit = 1.0
153 | 
154 | print('If you want to convert your original energy unit, please provide a conversion factor (default: 1.0): ')
155 | E_to_new_unit = raw_input('> ').strip()
156 | if E_to_new_unit != '':
157 |     E_to_new_unit = float(E_to_new_unit)
158 | else:
159 |     E_to_new_unit = 1.0
160 | 
161 | print('Please provide a description of the length unit, e.g. \'Ang\' or \'au\': ')
162 | print('Note: This string will be stored in the dataset file and passed on to models files for later reference.')
163 | r_unit = raw_input('> ').strip()
164 | if r_unit != '':
165 |     base_vars['r_unit'] = r_unit
166 | 
167 | print('Please provide a description of the energy unit, e.g. \'kcal/mol\' or \'eV\': ')
168 | print('Note: This string will be stored in the dataset file and passed on to models files for later reference.')
169 | e_unit = raw_input('> ').strip()
170 | if e_unit != '':
171 |     base_vars['e_unit'] = e_unit
172 | 
173 | if E is not None:
174 |     base_vars['E'] = E * E_to_new_unit
175 |     base_vars['E_min'], base_vars['E_max'] = np.min(E), np.max(E)
176 |     base_vars['E_mean'], base_vars['E_var'] = np.mean(E), np.var(E)
177 | else:
178 |     print(ui.color_str('[INFO]', bold=True) + ' No energy labels found in dataset.')
179 | 
180 | base_vars['R'] *= R_to_new_unit
181 | base_vars['F'] *= E_to_new_unit / R_to_new_unit
182 | 
183 | if lattice is not None:
184 |     base_vars['lattice'] = lattice
185 | 
186 | base_vars['md5'] = io.dataset_md5(base_vars)
187 | np.savez_compressed(dataset_file_name, **base_vars)
188 | print(ui.color_str('[DONE]', fore_color=ui.GREEN, bold=True))
189 | 


--------------------------------------------------------------------------------
/scripts/sgdml_dataset_from_extxyz.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2018-2022 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from __future__ import print_function
 26 | 
 27 | import argparse
 28 | import os
 29 | import sys
 30 | 
 31 | try:
 32 |     from ase.io import read
 33 | except ImportError:
 34 |     raise ImportError('Optional ASE dependency not found! Please run \'pip install sgdml[ase]\' to install it.')
 35 | 
 36 | import numpy as np
 37 | 
 38 | from sgdml import __version__
 39 | from sgdml.utils import io, ui
 40 | 
 41 | if sys.version[0] == '3':
 42 |     raw_input = input
 43 | 
 44 | 
 45 | # Note: assumes that the atoms in each molecule are in the same order.
 46 | def read_nonstd_ext_xyz(f):
 47 |     n_atoms = None
 48 | 
 49 |     R, z, E, F = [], [], [], []
 50 |     for i, line in enumerate(f):
 51 |         line = line.strip()
 52 |         if not n_atoms:
 53 |             n_atoms = int(line)
 54 |             print('Number atoms per geometry: {:,}'.format(n_atoms))
 55 | 
 56 |         file_i, line_i = divmod(i, n_atoms + 2)
 57 | 
 58 |         if line_i == 1:
 59 |             try:
 60 |                 e = float(line)
 61 |             except ValueError:
 62 |                 pass
 63 |             else:
 64 |                 E.append(e)
 65 | 
 66 |         cols = line.split()
 67 |         if line_i >= 2:
 68 |             R.append(list(map(float, cols[1:4])))
 69 |             if file_i == 0:  # first molecule
 70 |                 z.append(io._z_str_to_z_dict[cols[0]])
 71 |             F.append(list(map(float, cols[4:7])))
 72 | 
 73 |         if file_i % 1000 == 0:
 74 |             sys.stdout.write('\rNumber geometries found so far: {:,}'.format(file_i))
 75 |             sys.stdout.flush()
 76 |     sys.stdout.write('\rNumber geometries found so far: {:,}'.format(file_i))
 77 |     sys.stdout.flush()
 78 |     print()
 79 | 
 80 |     R = np.array(R).reshape(-1, n_atoms, 3)
 81 |     z = np.array(z)
 82 |     E = None if not E else np.array(E)
 83 |     F = np.array(F).reshape(-1, n_atoms, 3)
 84 | 
 85 |     if F.shape[0] != R.shape[0]:
 86 |         sys.exit(
 87 |             ui.color_str('[FAIL]', fore_color=ui.RED, bold=True)
 88 |             + ' Force labels are missing from dataset or are incomplete!'
 89 |         )
 90 | 
 91 |     f.close()
 92 |     return (R, z, E, F)
 93 | 
 94 | # Extracts info string for each frame.
 95 | def extract_info_from_extxyz(file_path):
 96 |     infos = []
 97 | 
 98 |     with open(file_path) as f:
 99 |         lines = f.readlines()
100 | 
101 |     i = 0
102 |     while i < len(lines):
103 |         try:
104 |             n_atoms = int(lines[i])
105 |         except ValueError:
106 |             raise ValueError(f"Invalid atom count at line {i + 1}")
107 | 
108 |         if i + 1 >= len(lines):
109 |             break
110 | 
111 |         comment_line = lines[i + 1].strip()
112 |         info = {}
113 |         for token in comment_line.split():
114 |             if "=" in token:
115 |                 key, val = token.split("=", 1)
116 |                 val = val.strip('"')
117 |                 try:
118 |                     val = float(val)
119 |                 except ValueError:
120 |                     pass
121 |                 info[key] = val
122 |         infos.append(info)
123 | 
124 |         i += 2 + n_atoms
125 | 
126 |     return infos
127 | 
128 | 
# Command line interface: one positional input file and an overwrite switch.
parser = argparse.ArgumentParser(
    description='Creates a dataset from extended XYZ format.'
)
# NOTE: `argparse.FileType('r')` opens the file while parsing the arguments.
# Only the `.name` of the resulting handle is used in this section.
parser.add_argument(
    'dataset',
    metavar='<dataset>',
    type=argparse.FileType('r'),
    help='path to extended xyz dataset file',
)
parser.add_argument(
    '-o',
    '--overwrite',
    dest='overwrite',
    action='store_true',
    help='overwrite existing dataset file',
)
args = parser.parse_args()
dataset = args.dataset


# The output file is named after the input file (extension replaced by
# '.npz') and is placed in the current working directory.
name = os.path.splitext(os.path.basename(dataset.name))[0]
dataset_file_name = name + '.npz'

# Refuse to replace an existing output file unless '--overwrite' was given.
dataset_exists = os.path.isfile(dataset_file_name)
if dataset_exists and args.overwrite:
    print(ui.color_str('[INFO]', bold=True) + ' Overwriting existing dataset file.')
if not dataset_exists or args.overwrite:
    print('Writing dataset to \'{}\'...'.format(dataset_file_name))
else:
    sys.exit(
        ui.color_str('[FAIL]', fore_color=ui.RED, bold=True)
        + ' Dataset \'{}\' already exists.'.format(dataset_file_name)
    )
162 | 
# Quantities to be extracted from the input file.
lattice, R, z, E, F = None, None, None, None, None

# Read all frames with ASE. The input counts as a proper extended XYZ file
# if ASE attached a calculator object to the first frame; otherwise the
# legacy reader below is used.
mols = read(dataset.name, format='extxyz', index=':')
# calc = mols[0].get_calculator()  # deprecated
calc = mols[0].calc
is_extxyz = calc is not None
if is_extxyz:

    print("\rNumber geometries found: {:,}\n".format(len(mols)))

    # Force labels are mandatory (checked on the first frame only).
    if 'forces' not in calc.results:
        sys.exit(
            ui.color_str('[FAIL]', fore_color=ui.RED, bold=True)
            + ' Forces are missing in the input file!'
        )

    # The transposed cell of the first frame is used as the lattice for the
    # whole dataset.
    lattice = np.array(mols[0].get_cell().T)
    if not np.any(lattice): # all zeros
        print(
            ui.color_str('[INFO]', bold=True)
            + ' No lattice vectors specified in extended XYZ file.'
        )
        lattice = None

    # All frames must list the same atoms in the same order.
    Z = np.array([mol.get_atomic_numbers() for mol in mols])
    all_z_the_same = (Z == Z[0]).all()
    if not all_z_the_same:
        sys.exit(
            ui.color_str('[FAIL]', fore_color=ui.RED, bold=True)
            + ' Order of atoms changes accross dataset.'
        )

    R = np.array([mol.get_positions() for mol in mols])
    z = Z[0]

    # ASE did not parse info string. Try doing it manually.
    # (Triggered when the info dictionary of the first frame is empty.)
    if not mols[0].info:

        print(
            ui.color_str('[INFO]', bold=True)
            + ' ASE did not parse info string completely. Try doing it manually.'
        )

        infos = extract_info_from_extxyz(dataset.name)
        for mol, info in zip(mols, infos):
            mol.info.update(info)

    # Energy labels are optional. If both spellings of the key are present,
    # the lowercase one wins, because it is assigned last.
    if 'Energy' in mols[0].info:
        E = np.array([mol.info['Energy'] for mol in mols])
    if 'energy' in mols[0].info:
        E = np.array([mol.info['energy'] for mol in mols])
    F = np.array([mol.get_forces() for mol in mols])

else:  # legacy non-standard XYZ format

    # The file is re-opened by name; no lattice is read on this path.
    with open(dataset.name) as f:
        R, z, E, F = read_nonstd_ext_xyz(f)
220 | 
# Base variables stored in the dataset file written below.
base_vars = {
    'type': 'd',
    'code_version': __version__,
    'name': name,
    'theory': 'unknown',
    'R': R,
    'z': z,
    'F': F,
}

# Summary statistics over all force components.
base_vars['F_min'], base_vars['F_max'] = np.min(F.ravel()), np.max(F.ravel())
base_vars['F_mean'], base_vars['F_var'] = np.mean(F.ravel()), np.var(F.ravel())

# The unit descriptions are free-form strings entered interactively; they are
# only stored if non-empty and no unit conversion is performed here.
print('Please provide a description of the length unit used in your input file, e.g. \'Ang\' or \'au\': ')
print('Note: This string will be stored in the dataset file and passed on to models files for later reference.')
r_unit = raw_input('> ').strip()
if r_unit != '':
    base_vars['r_unit'] = r_unit

print('Please provide a description of the energy unit used in your input file, e.g. \'kcal/mol\' or \'eV\': ')
print('Note: This string will be stored in the dataset file and passed on to models files for later reference.')
e_unit = raw_input('> ').strip()
if e_unit != '':
    base_vars['e_unit'] = e_unit

# Energy labels and their statistics are optional.
if E is not None:
    base_vars['E'] = E
    base_vars['E_min'], base_vars['E_max'] = np.min(E), np.max(E)
    base_vars['E_mean'], base_vars['E_var'] = np.mean(E), np.var(E)
else:
    print(ui.color_str('[INFO]', bold=True) + ' No energy labels found in dataset.')

if lattice is not None:
    base_vars['lattice'] = lattice

# The checksum is computed last, from the assembled variables, and stored
# alongside them in the compressed output file.
base_vars['md5'] = io.dataset_md5(base_vars)
np.savez_compressed(dataset_file_name, **base_vars)
print(ui.color_str('[DONE]', fore_color=ui.GREEN, bold=True))
260 | 


--------------------------------------------------------------------------------
/sgdml/utils/ui.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2018-2021 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from __future__ import print_function
 26 | from functools import partial
 27 | 
 28 | from .. import __version__, MAX_PRINT_WIDTH, LOG_LEVELNAME_WIDTH
 29 | import textwrap
 30 | import re
 31 | import sys
 32 | 
 33 | if sys.version[0] == '3':
 34 |     raw_input = input
 35 | 
 36 | import numpy as np
 37 | 
 38 | 
 39 | def yes_or_no(question):
 40 |     """
 41 |     Ask for yes/no user input on a question.
 42 | 
 43 |     Any response besides ``y`` yields a negative answer.
 44 | 
 45 |     Parameters
 46 |     ----------
 47 |         question : :obj:`str`
 48 |             User question.
 49 |     """
 50 | 
 51 |     reply = raw_input(question + ' (y/n): ').lower().strip()
 52 |     if not reply or reply[0] != 'y':
 53 |         return False
 54 |     else:
 55 |         return True
 56 | 
 57 | 
# Percentage shown by the most recent progress update. Shared module state
# of `callback` and `sec_callback`, used to suppress redundant redraws.
last_callback_pct = 0
 59 | 
 60 | 
 61 | def callback(
 62 |     current,
 63 |     total=1,
 64 |     disp_str='',
 65 |     sec_disp_str=None,
 66 |     done_with_warning=False,
 67 |     newline_when_done=True,
 68 | ):
 69 |     """
 70 |     Print progress or toggle bar.
 71 | 
 72 |     Example (progress):
 73 |     ``[ 45%] Task description (secondary string)``
 74 | 
 75 |     Example (toggle, not done):
 76 |     ``[ .. ] Task description (secondary string)``
 77 | 
 78 |     Example (toggle, done):
 79 |     ``[DONE] Task description (secondary string)``
 80 | 
 81 |     Parameters
 82 |     ----------
 83 |         current : int
 84 |             How many items already processed?
 85 |         total : int, optional
 86 |             Total number of items? If there is only
 87 |             one item, the toggle style is used.
 88 |         disp_str : :obj:`str`, optional
 89 |             Task description.
 90 |         sec_disp_str : :obj:`str`, optional
 91 |             Additional string shown in gray.
 92 |         done_with_warning : bool, optional
 93 |             Indicate that the process did not
 94 |             finish successfully.
 95 |         newline_when_done : bool, optional
 96 |             Finish with a newline character once
 97 |             current=total (default: True)?
 98 |     """
 99 | 
100 |     global last_callback_pct
101 | 
102 |     is_toggle = total == 1
103 |     is_done = np.isclose(current - total, 0.0)
104 | 
105 |     bold_color_str = partial(color_str, bold=True)
106 | 
107 |     if is_toggle:
108 | 
109 |         if is_done:
110 |             if done_with_warning:
111 |                 flag_str = bold_color_str('[WARN]', fore_color=YELLOW)
112 |             else:
113 |                 flag_str = bold_color_str('[DONE]', fore_color=GREEN)
114 | 
115 |         else:
116 |             flag_str = bold_color_str('[' + blink_str(' .. ') + ']')
117 |     else:
118 | 
119 |         # Only show progress in 10 percent steps when not printing to terminal.
120 |         pct = int(float(current) * 100 / total)
121 |         pct = int(np.ceil(pct / 10.0)) * 10 if not sys.stdout.isatty() else pct
122 | 
123 |         # Do not print, if there is no need to.
124 |         if not is_done and pct == last_callback_pct:
125 |             return
126 |         else:
127 |             last_callback_pct = pct
128 | 
129 |         flag_str = bold_color_str(
130 |             '[{:3d}%]'.format(pct), fore_color=GREEN if is_done else WHITE
131 |         )
132 | 
133 |     sys.stdout.write('\r{} {}'.format(flag_str, disp_str))
134 | 
135 |     if sec_disp_str is not None:
136 |         w = MAX_PRINT_WIDTH - LOG_LEVELNAME_WIDTH - len(disp_str) - 1
137 |         # sys.stdout.write(' \x1b[90m{0: >{width}}\x1b[0m'.format(sec_disp_str, width=w))
138 |         sys.stdout.write(
139 |             color_str(' {:>{width}}'.format(sec_disp_str, width=w), fore_color=GRAY)
140 |         )
141 | 
142 |     if is_done and newline_when_done:
143 |         sys.stdout.write('\n')
144 | 
145 |     sys.stdout.flush()
146 | 
147 | 
148 | # use this to integrate a callback for a subtask with an existing callback function
149 | # 'subtask_callback = partial(ui.sec_callback, main_callback=self.callback)'
def sec_callback(
    current, total=1, disp_str=None, sec_disp_str=None, main_callback=None, **kwargs
):
    """
    Report the progress of a subtask through the secondary string of an
    existing callback.

    The status (``DONE``/`` .. `` for a single item, a percentage
    otherwise) is appended to ``disp_str`` and handed to
    ``main_callback`` as ``sec_disp_str``, with ``0`` as its progress
    argument.

    Parameters
    ----------
        current : int
            Number of items processed so far.
        total : int, optional
            Total number of items. With exactly one item the toggle
            style is used.
        disp_str : :obj:`str`, optional
            Subtask description.
        sec_disp_str : :obj:`str`, optional
            Ignored; the value is always rebuilt from ``disp_str``.
        main_callback : callable
            Callback that displays the result (required).
        **kwargs
            Forwarded to ``main_callback``.
    """

    global last_callback_pct

    assert main_callback is not None

    finished = np.isclose(current - total, 0.0)

    if total == 1:  # toggle style
        status = 'DONE' if finished else ' .. '
        sec_disp_str = '{} | {}'.format(disp_str, status)
    else:  # percentage style
        pct = int(float(current) * 100 / total)

        # Without a terminal, progress is only reported in steps of 10%.
        if not sys.stdout.isatty():
            pct = int(np.ceil(pct / 10.0)) * 10

        # Nothing to do if the displayed value would not change.
        if pct == last_callback_pct:
            return
        last_callback_pct = pct

        sec_disp_str = '{} | {:3d}%'.format(disp_str, pct)

    main_callback(0, sec_disp_str=sec_disp_str, **kwargs)
177 | 
178 | 
179 | # COLORS
180 | 
# Color indices. `color_str` adds 30 to obtain the foreground code and 40 to
# obtain the background code (GRAY is 60, i.e. foreground code 90).
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE, GRAY = list(range(8)) + [60]
# Escape sequence template with three integer fields (filled by `color_str`
# with the bold flag, foreground code and background code) and the sequence
# that resets all styling.
COLOR_SEQ, RESET_SEQ = '\033[{:d};{:d};{:d}m', '\033[0m'

# Evaluated once at import time; styling is disabled if stdout is not a TTY.
ENABLE_COLORED_OUTPUT = (
    sys.stdout.isatty()
)  # Running in a real terminal or piped/redirected?
187 | 
188 | 
def color_str(str, fore_color=WHITE, back_color=BLACK, bold=False):
    """
    Wrap a string in escape sequences that set its colors and weight.

    The string is returned unchanged if colored output is disabled.

    Parameters
    ----------
        str : :obj:`str`
            String to style.
        fore_color : int, optional
            Foreground color index (one of the module's color constants).
        back_color : int, optional
            Background color index (one of the module's color constants).
        bold : bool, optional
            Use bold font weight.

    Returns
    -------
        :obj:`str`
    """

    if not ENABLE_COLORED_OUTPUT:
        return str

    # Foreground codes start at 30, background codes at 40.
    style_seq = COLOR_SEQ.format(1 if bold else 0, 30 + fore_color, 40 + back_color)
    return style_seq + str + RESET_SEQ
201 | 
202 | 
def blink_str(str):
    """
    Wrap a string in the escape sequences for blinking text.

    The string is returned unchanged if colored output is disabled.
    """

    if ENABLE_COLORED_OUTPUT:
        return '\x1b[5m' + str + '\x1b[0m'
    return str
206 | 
207 | 
def unicode_str(s):
    """
    Convert a byte buffer to a string and strip trailing null-characters.

    Under Python 3 the input is decoded as UTF-8 (undecodable bytes are
    ignored); otherwise it is passed through ``str``.
    """

    running_py3 = sys.version[0] == '3'
    text = str(s, 'utf-8', 'ignore') if running_py3 else str(s)

    return text.rstrip('\x00')  # remove null-characters
216 | 
217 | 
def gen_memory_str(bytes):
    """
    Generate a human-readable string for a memory size.

    The size is expressed in the largest binary unit (B, KB, MB, GB, TB;
    steps of 1024) in which it is at least 1. Exact multiples roll over
    to the next unit, e.g. 1024 bytes are shown as ``1 KB``.

    Parameters
    ----------
        bytes : int or float
            Memory size in bytes.

    Returns
    -------
        :obj:`str`
            Formatted size without decimal places up to MB, with one
            for GB and two for TB.
    """

    pwr = 1024
    pwr_strs = {0: '', 1: 'K', 2: 'M', 3: 'G', 4: 'T'}

    size = bytes
    n = 0
    # '>=' (not '>'), so that exact powers of 1024 are promoted as well.
    while size >= pwr and n < 4:
        size = size / float(pwr)  # true division, also under Python 2
        n += 1

    return '{:.{num_dec_pts}f} {}B'.format(
        size, pwr_strs[n], num_dec_pts=max(0, n - 2)
    )  # 1 decimal point for GB, 2 for TB
230 | 
231 | 
def gen_lattice_str(lat):
    """
    Generate a multiline string for a lattice matrix with the column
    headers ``a``, ``b`` and ``c``.

    The header line is not indented; the matrix rows below it are
    indented by 21 characters.

    Parameters
    ----------
        lat : :obj:`numpy.ndarray`
            Lattice matrix (three columns).

    Returns
    -------
        :obj:`str`
    """

    mat_str, col_widths = gen_mat_str(lat)

    # One left-aligned header field per column, as wide as the column.
    header_fmt = ' '.join('{:' + str(w) + '}' for w in col_widths)
    header = header_fmt.format('a', 'b', 'c') + '\n'

    return header + indent_str(mat_str, 21)
242 | 
243 | 
def str_plen(str):
    """
    Returns printable length of string. This function can only account for invisible characters due to string styling with ``color_str``.

    Parameters
    ----------
        str : :obj:`str`
            String.

    Returns
    -------
        int
            Length of the string without the invisible characters.

    """

    # Every styled segment ends with one reset sequence and carries
    # 14 invisible characters in total.
    n_hidden = 14 * str.count(RESET_SEQ)
    return len(str) - n_hidden
263 | 
264 | 
def wrap_str(str, width=MAX_PRINT_WIDTH - LOG_LEVELNAME_WIDTH):
    """
    Wrap each line of a multiline string after a given number of printable characters. The default width already accounts for the indentation due to the logging level label.

    Long words are never broken and existing whitespace is kept.

    Parameters
    ----------
        str : :obj:`str`
            Multiline string.
        width : int, optional
            Max number of characters in a line.

    Returns
    -------
        :obj:`str`

    """

    wrapped_lines = []
    for line in str.splitlines():
        # Invisible styling characters must not count towards the width.
        n_hidden = len(line) - str_plen(line)
        pieces = textwrap.wrap(
            line,
            width + n_hidden,
            break_long_words=False,
            replace_whitespace=False,
        )
        wrapped_lines.append('\n'.join(pieces))

    return '\n'.join(wrapped_lines)
295 | 
296 | 
def indent_str(str, indent):
    """
    Shift every line of a multiline string to the right by inserting
    spaces at each line start.

    Parameters
    ----------
        str : :obj:`str`
            Multiline string.
        indent : int
            Number of characters added in front of each line.

    Returns
    -------
        :obj:`str`

    """

    padding = ' ' * indent
    line_start = re.compile('^', re.MULTILINE)
    return line_start.sub(padding, str)
316 | 
317 | 
def wrap_indent_str(label, str, width=MAX_PRINT_WIDTH - LOG_LEVELNAME_WIDTH):
    """
    Arrange a label and a multiline string in two columns: the string is wrapped to the width left over next to the label and all lines after the first are indented by the printable label length. The default width already accounts for the indentation due to the logging level label.

    Example:
    ``<label><multiline string>``

    Parameters
    ----------
        label : :obj:`str`
            Label
        str : :obj:`str`
            Multiline string.
        width : int, optional
            Max number of characters in a line (label included).

    Returns
    -------
        :obj:`str`

    """

    label_len = str_plen(label)
    body = indent_str(wrap_str(str, width - label_len), label_len)

    # The label takes the place of the indentation of the first line.
    return label + body[label_len:]
344 | 
345 | 
def merge_col_str(col_str1, col_str2):
    """
    Join two multiline strings side by side, as columns of a table:
    lines with the same index are concatenated, separated by one space.

    Note
    ----
        Both strings must have the same number of lines. Surplus lines
        of the longer string are dropped.

    Parameters
    ----------
        col_str1 : :obj:`str`
            First multiline string.
        col_str2 : :obj:`str`
            Second multiline string.

    Returns
    -------
        :obj:`str`

    """

    left_lines = col_str1.split('\n')
    right_lines = col_str2.split('\n')

    return '\n'.join(
        left + ' ' + right for left, right in zip(left_lines, right_lines)
    )
376 | 
377 | 
def gen_mat_str(mat):
    """
    Converts a matrix to a multiline string such that the decimal points
    align in each column. Numbers are formatted with the ``g`` format
    and padded with spaces on the right.

    Parameters
    ----------
        mat : :obj:`numpy.ndarray`
            Two-dimensional array.

    Returns
    -------
        :obj:`str`
            String representation of matrix.
        :obj:`list` of int
            Width of each column in characters.

    """

    def _int_len(x):
        # Number of characters before the decimal point (including sign).
        sign_len = 0 if x >= 0 else 1
        return len(str(int(abs(x)))) + sign_len

    def _dec_len(x):
        # Number of characters after the decimal point.
        _, _, decimals = '{:g}'.format(x).partition('.')
        return len(decimals)

    n_cols = mat.shape[1]
    columns = [mat[:, c] for c in range(n_cols)]

    # The longest integer part in a column belongs to its min. or max. value.
    col_int_widths = [
        max(_int_len(np.min(col)), _int_len(np.max(col))) for col in columns
    ]
    col_dec_widths = [max(_dec_len(cell) for cell in col) for col in columns]
    col_widths = [
        int_w + dec_w + 1 for int_w, dec_w in zip(col_int_widths, col_dec_widths)
    ]

    row_strs = []
    for row in mat:
        cells = []
        for j, x in enumerate(row):
            x_int_len = _int_len(x)

            # Pad on the left to align the decimal point and on the
            # right (via the field width) to fill up the column.
            lead = ' ' * max(col_int_widths[j] - x_int_len, 0)
            field = '{: <' + str(x_int_len + col_dec_widths[j] + 1) + 'g}'
            cells.append(lead + field.format(x))
        row_strs.append(' '.join(cells))

    # Rows of a matrix without columns are empty and yield an empty string.
    mat_str = '\n'.join(row_strs) if n_cols else ''

    return mat_str, col_widths
432 | 
433 | 
def gen_range_str(min, max):
    """
    Generates a string that shows a minimum and maximum value, as well
    as the size of the range between them (all with three decimals).

    Example:
    ``<min> |-- <range> --| <max>``

    Parameters
    ----------
        min : float
            Minimum value.
        max : float
            Maximum value.

    Returns
    -------
        :obj:`str`

    """

    span = max - min
    template = '{:<.3f} |-- {:^8.3f} --| {:<9.3f}'
    return template.format(min, span, max)
455 | 
456 | 
def print_step_title(title_str, sec_title_str='', underscore=True):
    """
    Print a highlighted step title, preceded by an empty line.

    Parameters
    ----------
        title_str : :obj:`str`
            Title, printed in bold black on white.
        sec_title_str : :obj:`str`, optional
            Unstyled text that follows the title.
        underscore : bool, optional
            Draw a horizontal line below the title.
    """

    title = color_str(
        ' ' + title_str + ' ', fore_color=BLACK, back_color=WHITE, bold=True
    )
    suffix = ' ' + sec_title_str if sec_title_str != '' else sec_title_str
    rule = '\n' + '-' * MAX_PRINT_WIDTH if underscore else ''

    print('\n' + title + suffix + rule)
472 | 
473 | 
def print_two_column_str(str, sec_str=''):
    """
    Print a string followed by a gray secondary string that is
    right-aligned to the maximum print width.

    Parameters
    ----------
        str : :obj:`str`
            Main (left) string.
        sec_str : :obj:`str`, optional
            Secondary (right) string.
    """

    # Width left for the right column (minus the separating space).
    right_width = MAX_PRINT_WIDTH - str_plen(str) - 1
    right_col = color_str(
        '{:>{width}}'.format(sec_str, width=right_width), fore_color=GRAY
    )

    print('{} {}'.format(str, right_col))
487 | 
488 | 
def print_lattice(lat=None, inset=False):
    """
    Print a lattice matrix together with the lengths and angles
    obtained from ``io.lattice_vec_to_par``.

    Parameters
    ----------
        lat : :obj:`numpy.ndarray`, optional
            Lattice matrix. ``n/a`` is printed if omitted.
        inset : bool, optional
            Indent the 'Lattice:' label by four instead of two spaces.
    """

    from . import io

    label_fmt = '    {:<16} {}' if inset else '  {:<18} {}'

    if lat is None:
        print(label_fmt.format('Lattice:', 'n/a'))
        return

    lat_str = gen_lattice_str(lat)
    lengths, angles = io.lattice_vec_to_par(lat)

    print(label_fmt.format('Lattice:', lat_str))
    print('    {:<16} a = {:g}, b = {:g}, c = {:g}'.format('Lengths:', *lengths))
    print(
        '    {:<16} alpha = {:g}, beta = {:g}, gamma = {:g}'.format(
            'Angles [deg]:', *angles
        )
    )
509 | 


--------------------------------------------------------------------------------
/sgdml/utils/desc.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2018-2022 Stefan Chmiela, Luis Galvez
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import numpy as np
 26 | import scipy as sp
 27 | from scipy import spatial
 28 | 
 29 | import multiprocessing as mp
 30 | 
# Process pool class bound to the 'fork' start method.
# NOTE(review): `multiprocessing.get_context` raises a ValueError if the
# requested start method is unavailable on the platform (e.g. 'fork' on
# Windows), which would make this module fail at import time there -- confirm
# that this is acceptable.
Pool = mp.get_context('fork').Pool
 32 | 
 33 | from functools import partial
 34 | import timeit
 35 | 
 36 | try:
 37 |     import torch
 38 | except ImportError:
 39 |     _has_torch = False
 40 | else:
 41 |     _has_torch = True
 42 | 
 43 | 
 44 | def _pbc_diff(diffs, lat_and_inv, use_torch=False):
 45 |     """
 46 |     Clamp differences of vectors to super cell.
 47 | 
 48 |     Parameters
 49 |     ----------
 50 |         diffs : :obj:`numpy.ndarray`
 51 |             N x 3 matrix of N pairwise differences between vectors `u - v`
 52 |         lat_and_inv : tuple of :obj:`numpy.ndarray`
 53 |             Tuple of 3 x 3 matrix containing lattice vectors as columns and its inverse.
 54 |         use_torch : boolean, optional
 55 |             Enable, if the inputs are PyTorch objects.
 56 | 
 57 |     Returns
 58 |     -------
 59 |         :obj:`numpy.ndarray`
 60 |             N x 3 matrix clamped differences
 61 |     """
 62 | 
 63 |     lat, lat_inv = lat_and_inv
 64 | 
 65 |     if use_torch and not _has_torch:
 66 |         raise ImportError(
 67 |             'Optional PyTorch dependency not found! Please run \'pip install sgdml[torch]\' to install it or disable the PyTorch option.'
 68 |         )
 69 | 
 70 |     if use_torch:
 71 |         c = lat_inv.mm(diffs.t())
 72 |         diffs -= lat.mm(c.round()).t()
 73 |     else:
 74 |         c = lat_inv.dot(diffs.T)
 75 |         diffs -= lat.dot(np.around(c)).T
 76 | 
 77 |     return diffs
 78 | 
 79 | 
 80 | def _pdist(r, lat_and_inv=None):
 81 |     """
 82 |     Compute pairwise Euclidean distance matrix between all atoms.
 83 | 
 84 |     Parameters
 85 |     ----------
 86 |         r : :obj:`numpy.ndarray`
 87 |             Array of size 3N containing the Cartesian coordinates of
 88 |             each atom.
 89 |         lat_and_inv : tuple of :obj:`numpy.ndarray`, optional
 90 |             Tuple of 3x3 matrix containing lattice vectors as columns and its inverse.
 91 | 
 92 |     Returns
 93 |     -------
 94 |         :obj:`numpy.ndarray`
 95 |             Array of size N(N-1)/2 containing the upper triangle of the pairwise
 96 |             distance matrix between atoms.
 97 |     """
 98 | 
 99 |     r = r.reshape(-1, 3)
100 |     n_atoms = r.shape[0]
101 | 
102 |     if lat_and_inv is None:
103 |         pdist = sp.spatial.distance.pdist(r, 'euclidean')
104 |     else:
105 |         pdist = sp.spatial.distance.pdist(
106 |             r, lambda u, v: np.linalg.norm(_pbc_diff(u - v, lat_and_inv))
107 |         )
108 | 
109 |     tril_idxs = np.tril_indices(n_atoms, k=-1)
110 |     return sp.spatial.distance.squareform(pdist, checks=False)[tril_idxs]
111 | 
112 | 
113 | def _squareform(vec_or_mat):
114 | 
115 |     # vector to matrix representation
116 |     if vec_or_mat.ndim == 1:
117 | 
118 |         n_tril = vec_or_mat.size
119 |         n = int((1 + np.sqrt(8 * n_tril + 1)) / 2)
120 | 
121 |         i, j = np.tril_indices(n, k=-1)
122 | 
123 |         mat = np.zeros((n, n))
124 |         mat[i, j] = vec_or_mat
125 |         mat[j, i] = vec_or_mat
126 | 
127 |         return mat
128 | 
129 |     else:  # matrix to vector
130 | 
131 |         assert vec_or_mat.shape[0] == vec_or_mat.shape[1]  # matrix is square
132 | 
133 |         n = vec_or_mat.shape[0]
134 |         i, j = np.tril_indices(n, k=-1)
135 | 
136 |         return vec_or_mat[i, j]
137 | 
138 | 
139 | def _r_to_desc(r, pdist):
140 |     """
141 |     Generate descriptor for a set of atom positions in Cartesian
142 |     coordinates.
143 | 
144 |     Parameters
145 |     ----------
146 |         r : :obj:`numpy.ndarray`
147 |             Array of size 3N containing the Cartesian coordinates of
148 |             each atom.
149 |         pdist : :obj:`numpy.ndarray`
150 |             Array of size N x N containing the Euclidean distance
151 |             (2-norm) for each pair of atoms.
152 | 
153 |     Returns
154 |     -------
155 |         :obj:`numpy.ndarray`
156 |             Descriptor representation as 1D array of size N(N-1)/2
157 |     """
158 | 
159 |     # Add singleton dimension if input is (,3N).
160 |     if r.ndim == 1:
161 |         r = r[None, :]
162 | 
163 |     return 1.0 / pdist
164 | 
165 | 
166 | def _r_to_d_desc(r, pdist, lat_and_inv=None):
167 |     """
168 |     Generate descriptor Jacobian for a set of atom positions in
169 |     Cartesian coordinates.
170 | 
171 |     This method can apply the minimum-image convention as periodic
172 |     boundary condition for distances between atoms, given the lattice vectors.
173 | 
174 |     Parameters
175 |     ----------
176 |         r : :obj:`numpy.ndarray`
177 |             Array of size 3N containing the Cartesian coordinates of
178 |             each atom.
179 |         pdist : :obj:`numpy.ndarray`
180 |             Array of size N x N containing the Euclidean distance
181 |             (2-norm) for each pair of atoms.
182 |         lat_and_inv : tuple of :obj:`numpy.ndarray`, optional
183 |             Tuple of 3 x 3 matrix containing lattice vectors as columns and its inverse.
184 | 
185 |     Returns
186 |     -------
187 |         :obj:`numpy.ndarray`
188 |             Array of size N(N-1)/2 x 3N containing all partial
189 |             derivatives of the descriptor.
190 |     """
191 | 
192 |     r = r.reshape(-1, 3)
193 |     pdiff = r[:, None] - r[None, :]  # pairwise differences ri - rj
194 | 
195 |     n_atoms = r.shape[0]
196 |     i, j = np.tril_indices(n_atoms, k=-1)
197 | 
198 |     pdiff = pdiff[i, j, :]  # lower triangular
199 | 
200 |     if lat_and_inv is not None:
201 |         pdiff = _pbc_diff(pdiff, lat_and_inv)
202 | 
203 |     d_desc_elem = pdiff / (pdist**3)[:, None]
204 | 
205 |     return d_desc_elem
206 | 
207 | 
def _from_r(r, lat_and_inv=None):
    """
    Build the descriptor and its Jacobian for a single molecular geometry
    in Cartesian coordinates.

    Parameters
    ----------
        r : :obj:`numpy.ndarray`
            Array of size 3N containing the Cartesian coordinates of
            each atom.
        lat_and_inv : tuple of :obj:`numpy.ndarray`, optional
            Tuple of 3 x 3 matrix containing lattice vectors as columns and its inverse.

    Returns
    -------
        :obj:`numpy.ndarray`
            Descriptor as returned by `_r_to_desc`.
        :obj:`numpy.ndarray`
            Descriptor Jacobian as returned by `_r_to_d_desc`.
    """

    # Promote a flat (3N,) input to a single-row 2D array.
    geometry = r[None, :] if r.ndim == 1 else r

    # The pair distances are shared by the descriptor and its Jacobian.
    pair_dists = _pdist(geometry, lat_and_inv)

    desc = _r_to_desc(geometry, pair_dists)
    d_desc = _r_to_d_desc(geometry, pair_dists, lat_and_inv)

    return desc, d_desc
240 | 
241 | 
class Desc(object):
    def __init__(self, n_atoms, max_processes=None):
        """
        Generate descriptors and their Jacobians for molecular geometries,
        including support for periodic boundary conditions.

        Parameters
        ----------
            n_atoms : int
                Number of atoms in the represented system.
            max_processes : int, optional
                Limit the max. number of processes. Otherwise
                all CPU cores are used.
        """

        self.n_atoms = n_atoms
        self.dim_i = 3 * n_atoms

        # Size of the resulting descriptor vector.
        self.dim = (n_atoms * (n_atoms - 1)) // 2

        self.tril_indices = np.tril_indices(n_atoms, k=-1)

        # Precompute indices for nonzero entries in descriptor derivatives:
        # row `a` lists the descriptor entries (atom pairs) that involve atom `a`.
        rows, cols = self.tril_indices
        self.d_desc_mask = np.zeros((n_atoms, n_atoms - 1), dtype=int)
        for a in range(n_atoms):  # for each partial derivative
            self.d_desc_mask[a, :] = np.concatenate(
                [np.where(rows == a)[0], np.where(cols == a)[0]]
            )

        self.dim_range = np.arange(self.dim)  # [0, 1, ..., dim-1]

        # For each atom `a`, the indices of all other atoms (i.e. an N x N
        # matrix indexed row-wise, skipping the diagonal), built with a single
        # concatenation instead of growing the array once per atom.
        self.M = np.concatenate(
            [np.delete(np.arange(n_atoms), a) for a in range(n_atoms)]
        )

        self.A = np.repeat(
            np.arange(n_atoms), n_atoms - 1
        )  # [0, 0, ..., 1, 1, ..., 2, 2, ...]

        self.max_processes = max_processes

    def from_R(self, R, lat_and_inv=None, max_processes=None, callback=None):
        """
        Generate descriptor and its Jacobian for multiple molecular geometries
        in Cartesian coordinates.

        Parameters
        ----------
            R : :obj:`numpy.ndarray`
                Array of size M x 3N containing the Cartesian coordinates of
                each atom.
            lat_and_inv : tuple of :obj:`numpy.ndarray`, optional
                Tuple of 3 x 3 matrix containing lattice vectors as columns and its inverse.
            max_processes : int, optional
                Limit the max. number of processes. Otherwise
                all CPU cores are used. This parameter overwrites the global setting as
                set during initialization.
            callback : callable, optional
                Descriptor and descriptor Jacobian generation status.
                    current : int
                        Current progress (number of completed descriptors).
                    total : int
                        Task size (total number of descriptors to create).
                    sec_disp_str : :obj:`str`, optional
                        Once complete, this string contains the
                        time it took complete this task (seconds).

        Returns
        -------
            :obj:`numpy.ndarray`
                Array of size M x N(N-1)/2 containing the descriptor representation
                for each geometry.
            :obj:`numpy.ndarray`
                Array of size M x N(N-1)/2 x 3 containing the (compressed)
                descriptor Jacobian for each geometry.

        Note
        ----
            For a single geometry (M = 1) the result of `_from_r` is returned
            directly and `callback` is not invoked.
        """

        # Add singleton dimension if input is (,3N).
        if R.ndim == 1:
            R = R[None, :]

        M = R.shape[0]
        if M == 1:
            return _from_r(R, lat_and_inv)

        R_desc = np.empty([M, self.dim])
        R_d_desc = np.empty([M, self.dim, 3])

        # Generate descriptor and their Jacobians
        start = timeit.default_timer()

        pool = None
        map_func = map
        max_processes = max_processes or self.max_processes
        if max_processes != 1 and mp.cpu_count() > 1:
            pool = Pool((max_processes or mp.cpu_count()) - 1)  # exclude main process
            map_func = pool.imap

        try:
            for i, r_desc_r_d_desc in enumerate(
                map_func(partial(_from_r, lat_and_inv=lat_and_inv), R)
            ):
                R_desc[i, :], R_d_desc[i, :, :] = r_desc_r_d_desc

                if callback is not None and i < M - 1:
                    callback(i, M - 1)
        except BaseException:
            # Something went wrong (worker error, callback error, interrupt):
            # abort outstanding work instead of waiting for it.
            if pool is not None:
                pool.terminate()
            raise
        finally:
            # Always release the worker processes, also on failure.
            if pool is not None:
                pool.close()  # no-op if the pool was terminated above
                pool.join()  # Wait for the worker processes to terminate (to measure total runtime correctly).
                pool = None

        stop = timeit.default_timer()

        if callback is not None:
            dur_s = stop - start
            sec_disp_str = 'took {:.1f} s'.format(dur_s) if dur_s >= 0.1 else ''
            callback(M, M, sec_disp_str=sec_disp_str)

        return R_desc, R_d_desc

    def d_desc_dot_vec(self, R_d_desc, vecs, overwrite_vecs=False):
        """
        Multiply (compressed) descriptor Jacobian(s) with 3N-vector(s) from
        the right side.

        Parameters
        ----------
            R_d_desc : :obj:`numpy.ndarray` or :obj:`torch.tensor`
                Array of size M x N(N-1)/2 x 3 (or N(N-1)/2 x 3) containing
                the compressed descriptor Jacobian(s).
            vecs : :obj:`numpy.ndarray` or :obj:`torch.tensor`
                Array of size M x 3N (or 3N) containing the vector(s).
            overwrite_vecs : bool, optional
                Currently unused; kept for interface compatibility.

        Returns
        -------
            :obj:`numpy.ndarray` or :obj:`torch.tensor`
                One row of N(N-1)/2 entries per product.
        """

        if R_d_desc.ndim == 2:
            R_d_desc = R_d_desc[None, ...]

        if vecs.ndim == 1:
            vecs = vecs[None, ...]

        i, j = self.tril_indices

        vecs = vecs.reshape(vecs.shape[0], -1, 3)

        einsum = np.einsum
        if _has_torch and torch.is_tensor(R_d_desc):
            assert torch.is_tensor(vecs)
            einsum = torch.einsum

        # Each descriptor entry only depends on its two atoms (with opposite
        # sign), hence the difference of the two per-atom vector blocks.
        return einsum('...ij,...ij->...i', R_d_desc, vecs[:, j, :] - vecs[:, i, :])

    def vec_dot_d_desc(self, R_d_desc, vecs, out=None):
        """
        Multiply (compressed) descriptor Jacobian(s) with N(N-1)/2-vector(s)
        from the left side.

        Parameters
        ----------
            R_d_desc : :obj:`numpy.ndarray`
                Array of size M x N(N-1)/2 x 3 (or N(N-1)/2 x 3) containing
                the compressed descriptor Jacobian(s).
            vecs : :obj:`numpy.ndarray`
                Array of size M x N(N-1)/2 (or N(N-1)/2) containing the
                vector(s).
            out : :obj:`numpy.ndarray`, optional
                Currently ignored (a new array is always allocated); kept
                for interface compatibility.

        Returns
        -------
            :obj:`numpy.ndarray`
                Array of size M x 3N containing the products.
        """

        if R_d_desc.ndim == 2:
            R_d_desc = R_d_desc[None, ...]

        if vecs.ndim == 1:
            vecs = vecs[None, ...]

        # Either multiple descriptors or multiple vectors at once, not both
        # (or the same number of both, in which case the products are paired).
        assert (
            R_d_desc.shape[0] == 1
            or vecs.shape[0] == 1
            or R_d_desc.shape[0] == vecs.shape[0]
        )

        n = np.max((R_d_desc.shape[0], vecs.shape[0]))
        i, j = self.tril_indices

        # Scatter the scaled pair entries into an antisymmetric N x N layout
        # and sum over the first atom axis.
        out = np.zeros((n, self.n_atoms, self.n_atoms, 3))
        out[:, i, j, :] = R_d_desc * vecs[..., None]
        out[:, j, i, :] = -out[:, i, j, :]
        return out.sum(axis=1).reshape(n, -1)

    def d_desc_from_comp(self, R_d_desc, out=None):
        """
        Convert a compressed representation of a descriptor Jacobian back
        to its full representation.

        The compressed representation omits all zeros: it stores a single
        3-vector per descriptor entry (atom pair).

        Parameters
        ----------
            R_d_desc : :obj:`numpy.ndarray` or :obj:`torch.tensor`
                Array of size M x N(N-1)/2 x 3 (or N(N-1)/2 x 3) containing
                the compressed descriptor Jacobian.
            out : :obj:`numpy.ndarray` or :obj:`torch.tensor`, optional
                Output argument. This must have the exact kind that would
                be returned if it was not used.

        Note
        ----
            If used, the output argument must be initialized with zeros!

        Returns
        -------
            :obj:`numpy.ndarray` or :obj:`torch.tensor`
                Array of size M x N(N-1)/2 x 3N containing the full
                representation.
        """

        if R_d_desc.ndim == 2:
            R_d_desc = R_d_desc[None, ...]

        n = R_d_desc.shape[0]
        i, j = self.tril_indices

        if out is None:
            if _has_torch and torch.is_tensor(R_d_desc):
                out = torch.zeros(
                    (n, self.dim, self.n_atoms, 3),
                    device=R_d_desc.device,
                    dtype=R_d_desc.dtype,
                )
            else:
                out = np.zeros((n, self.dim, self.n_atoms, 3))
        else:
            out = out.reshape(n, self.dim, self.n_atoms, 3)

        # Every descriptor entry has exactly two nonzero 3-blocks, one per
        # atom of the pair, which differ only in sign.
        out[:, self.dim_range, j, :] = R_d_desc
        out[:, self.dim_range, i, :] = -R_d_desc

        return out.reshape(-1, self.dim, self.dim_i)

    def d_desc_to_comp(self, R_d_desc):
        """
        Convert a descriptor Jacobian to a compressed representation.

        The compressed representation omits all zeros: it stores a single
        3-vector per descriptor entry (atom pair).

        Parameters
        ----------
            R_d_desc : :obj:`numpy.ndarray`
                Array of size M x N(N-1)/2 x 3N (or N(N-1)/2 x 3N) containing
                the descriptor Jacobian.

        Returns
        -------
            :obj:`numpy.ndarray`
                Array of size M x N(N-1)/2 x 3 containing the compressed
                representation.
        """

        # Add singleton dimension for single inputs.
        if R_d_desc.ndim == 2:
            R_d_desc = R_d_desc[None, ...]

        n = R_d_desc.shape[0]
        n_atoms = R_d_desc.shape[2] // 3

        R_d_desc = R_d_desc.reshape(n, -1, n_atoms, 3)

        # Gather the nonzero 3-blocks into an N x N layout ...
        ret = np.zeros((n, n_atoms, n_atoms, 3))
        ret[:, self.M, self.A, :] = R_d_desc[:, self.d_desc_mask.ravel(), self.A, :]

        # ... and keep the strictly lower triangle (one block per atom pair).
        i, j = self.tril_indices
        return ret[:, i, j, :]

    @staticmethod
    def perm(perm):
        """
        Convert atom permutation to descriptor permutation.

        A permutation of N atoms is converted to a permutation that acts on
        the corresponding descriptor representation. Applying the converted
        permutation to a descriptor is equivalent to permuting the atoms
        first and then generating the descriptor.

        Parameters
        ----------
            perm : :obj:`numpy.ndarray`
                Array of size N containing the atom permutation.

        Returns
        -------
            :obj:`numpy.ndarray`
                Array of size N(N-1)/2 containing the corresponding
                descriptor permutation.
        """

        n = len(perm)
        tril = np.tril_indices(n, -1)

        # Symmetric N x N lookup table holding the descriptor index of each
        # atom pair.
        pair_idxs = np.zeros((n, n), dtype=int)
        pair_idxs[tril] = np.arange((n**2 - n) // 2)
        pair_idxs = pair_idxs + pair_idxs.T

        # Permute rows and columns, then read the descriptor indices back in
        # descriptor order.
        pair_idxs = pair_idxs[perm, :]
        pair_idxs = pair_idxs[:, perm]

        return pair_idxs[tril].astype(int)
540 | 


--------------------------------------------------------------------------------
/sgdml/utils/io.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2018-2021 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import argparse
 26 | import hashlib
 27 | import os
 28 | import re
 29 | import sys
 30 | 
 31 | import numpy as np
 32 | 
 33 | from . import ui
 34 | 
 35 | _z_str_to_z_dict = {
 36 |     'H': 1,
 37 |     'He': 2,
 38 |     'Li': 3,
 39 |     'Be': 4,
 40 |     'B': 5,
 41 |     'C': 6,
 42 |     'N': 7,
 43 |     'O': 8,
 44 |     'F': 9,
 45 |     'Ne': 10,
 46 |     'Na': 11,
 47 |     'Mg': 12,
 48 |     'Al': 13,
 49 |     'Si': 14,
 50 |     'P': 15,
 51 |     'S': 16,
 52 |     'Cl': 17,
 53 |     'Ar': 18,
 54 |     'K': 19,
 55 |     'Ca': 20,
 56 |     'Sc': 21,
 57 |     'Ti': 22,
 58 |     'V': 23,
 59 |     'Cr': 24,
 60 |     'Mn': 25,
 61 |     'Fe': 26,
 62 |     'Co': 27,
 63 |     'Ni': 28,
 64 |     'Cu': 29,
 65 |     'Zn': 30,
 66 |     'Ga': 31,
 67 |     'Ge': 32,
 68 |     'As': 33,
 69 |     'Se': 34,
 70 |     'Br': 35,
 71 |     'Kr': 36,
 72 |     'Rb': 37,
 73 |     'Sr': 38,
 74 |     'Y': 39,
 75 |     'Zr': 40,
 76 |     'Nb': 41,
 77 |     'Mo': 42,
 78 |     'Tc': 43,
 79 |     'Ru': 44,
 80 |     'Rh': 45,
 81 |     'Pd': 46,
 82 |     'Ag': 47,
 83 |     'Cd': 48,
 84 |     'In': 49,
 85 |     'Sn': 50,
 86 |     'Sb': 51,
 87 |     'Te': 52,
 88 |     'I': 53,
 89 |     'Xe': 54,
 90 |     'Cs': 55,
 91 |     'Ba': 56,
 92 |     'La': 57,
 93 |     'Ce': 58,
 94 |     'Pr': 59,
 95 |     'Nd': 60,
 96 |     'Pm': 61,
 97 |     'Sm': 62,
 98 |     'Eu': 63,
 99 |     'Gd': 64,
100 |     'Tb': 65,
101 |     'Dy': 66,
102 |     'Ho': 67,
103 |     'Er': 68,
104 |     'Tm': 69,
105 |     'Yb': 70,
106 |     'Lu': 71,
107 |     'Hf': 72,
108 |     'Ta': 73,
109 |     'W': 74,
110 |     'Re': 75,
111 |     'Os': 76,
112 |     'Ir': 77,
113 |     'Pt': 78,
114 |     'Au': 79,
115 |     'Hg': 80,
116 |     'Tl': 81,
117 |     'Pb': 82,
118 |     'Bi': 83,
119 |     'Po': 84,
120 |     'At': 85,
121 |     'Rn': 86,
122 |     'Fr': 87,
123 |     'Ra': 88,
124 |     'Ac': 89,
125 |     'Th': 90,
126 |     'Pa': 91,
127 |     'U': 92,
128 |     'Np': 93,
129 |     'Pu': 94,
130 |     'Am': 95,
131 |     'Cm': 96,
132 |     'Bk': 97,
133 |     'Cf': 98,
134 |     'Es': 99,
135 |     'Fm': 100,
136 |     'Md': 101,
137 |     'No': 102,
138 |     'Lr': 103,
139 |     'Rf': 104,
140 |     'Db': 105,
141 |     'Sg': 106,
142 |     'Bh': 107,
143 |     'Hs': 108,
144 |     'Mt': 109,
145 |     'Ds': 110,
146 |     'Rg': 111,
147 |     'Cn': 112,
148 |     'Uuq': 114,
149 |     'Uuh': 116,
150 | }
151 | _z_to_z_str_dict = {v: k for k, v in _z_str_to_z_dict.items()}
152 | 
153 | 
154 | def z_str_to_z(z_str):
155 |     return np.array([_z_str_to_z_dict[x] for x in z_str])
156 | 
157 | 
158 | def z_to_z_str(z):
159 |     return [_z_to_z_str_dict[int(x)] for x in z]
160 | 
161 | 
def train_dir_name(dataset, n_train, use_sym, use_E, use_E_cstr):
    """
    Compose the name of a training directory from the dataset name, its
    level of theory, the training set size and the model options.

    The option flags show up as the suffixes '-sym' (`use_sym`), '-noE'
    (`use_E` is off) and '-Ecstr' (`use_E_cstr`), in this order.
    """

    # Replace everything that is not a word character, '-', '_' or '.' by
    # a dot, then merge pairs of dots (single, non-overlapping pass).
    theory_level_str = re.sub(
        r'\.\.', '.', re.sub(r'[^\w\-_\.]', '.', str(dataset['theory']))
    )

    flag_strs = (
        '-sym' if use_sym else '',
        '' if use_E else '-noE',
        '-Ecstr' if use_E_cstr else '',
    )

    name_parts = (dataset['name'].astype(str), theory_level_str, n_train)
    return 'sgdml_cv_%s-%s-train%d%s%s%s' % (name_parts + flag_strs)
181 | 
182 | 
def task_file_name(task):
    """
    Compose the file name of a task from its number of training points,
    its number of permutations and its (zero-padded) kernel parameter
    'sig'.
    """

    return 'task-train%d-sym%d-sig%04d.npz' % (
        task['idxs_train'].shape[0],
        task['perms'].shape[0],
        np.squeeze(task['sig']),
    )
190 | 
191 | 
def model_file_name(task_or_model, is_extended=False):
    """
    Compose the file name of a model.

    The default name consists of the number of training points, the number
    of permutations and the (zero-padded) kernel parameter 'sig'. With
    `is_extended`, the name is built from the dataset name, its sanitized
    level of theory, the number of training points and the number of
    permutations instead.
    """

    n_train = task_or_model['idxs_train'].shape[0]
    n_perms = task_or_model['perms'].shape[0]
    sig = np.squeeze(task_or_model['sig'])

    if not is_extended:
        return 'model-train%d-sym%d-sig%04d.npz' % (n_train, n_perms, sig)

    dataset = np.squeeze(task_or_model['dataset_name'])

    # Replace everything that is not a word character, '-', '_' or '.' by
    # a dot, then merge pairs of dots (single, non-overlapping pass).
    theory = str(np.squeeze(task_or_model['dataset_theory']))
    theory_level_str = re.sub(r'\.\.', '.', re.sub(r'[^\w\-_\.]', '.', theory))

    return '%s-%s-train%d-sym%d.npz' % (dataset, theory_level_str, n_train, n_perms)
206 | 
207 | 
def dataset_md5(dataset):
    """
    Compute the MD5 fingerprint of a dataset.

    The fingerprint is the MD5 hash over the individual MD5 digests of the
    entries 'z', 'R', 'E' (only if present) and 'F', in this order. It is
    returned as UTF-8 encoded hex string.
    """

    # Note: only these keys enter the fingerprint; any other entries of
    # the dataset are deliberately left out.
    keys = ['z', 'R'] + (['E'] if 'E' in dataset else []) + ['F']

    fingerprint = hashlib.md5()
    for key in keys:
        data = dataset[key]
        if type(data) is np.ndarray:
            data = data.ravel()  # hash a flat view of the array data
        fingerprint.update(hashlib.md5(data).digest())

    return fingerprint.hexdigest().encode('utf-8')
231 | 
232 | 
233 | # ## FILES
234 | 
# Read geometry file (xyz format).
# R: (n_geo,3*n_atoms)
# z: (n_atoms,)
def read_xyz(file_path):
    """
    Read all geometries from a file in xyz format.

    The number of atoms is taken from the first line of the file and every
    geometry is assumed to span exactly `n_atoms + 2` lines (atom count,
    comment, one line per atom). The element symbols are read from the
    first geometry only.

    Parameters
    ----------
        file_path : :obj:`str`
            Path to xyz file.

    Returns
    -------
        :obj:`numpy.ndarray`
            Array of size M x 3N containing the Cartesian coordinates of
            all M geometries.
        :obj:`numpy.ndarray`
            Array of size N containing the atomic numbers.

    Raises
    ------
        ValueError
            If the file is empty.
        KeyError
            If the file contains an unknown element symbol.
    """

    n_atoms = None
    R, z = [], []

    # The context manager closes the file; no explicit close() is needed.
    with open(file_path, 'r') as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not n_atoms:
                n_atoms = int(line)

            cols = line.split()
            file_i, line_i = divmod(i, n_atoms + 2)
            if line_i >= 2:  # skip atom count and comment line
                R.append(list(map(float, cols[1:4])))
                if file_i == 0:  # first molecule
                    z.append(_z_str_to_z_dict[cols[0]])

    if n_atoms is None:
        raise ValueError('{0} is empty'.format(file_path))

    R = np.array(R).reshape(-1, 3 * n_atoms)
    z = np.array(z)

    return R, z
261 | 
262 | 
# Write a single geometry to a file (xyz format).
def write_geometry(filename, r, z, comment_str=''):
    """
    Write one geometry to a file in xyz format.

    The file starts with the number of rows of the squeezed `r` and the
    comment line, followed by one tab-separated line per atom (element
    symbol and coordinates). The process is terminated via `sys.exit` if
    writing fails with an `IOError`.
    """

    coords = np.squeeze(r)
    try:
        with open(filename, 'w') as xyz_file:
            xyz_file.write(str(len(coords)) + '\n' + comment_str)
            for atom_idx, atom_coords in enumerate(coords):
                symbol = _z_to_z_str_dict[z[atom_idx]]
                xyz_file.write('\n' + symbol + '\t')
                xyz_file.write('\t'.join(str(c) for c in atom_coords))
    except IOError:
        sys.exit("ERROR: Writing xyz file failed.")
275 | 
276 | 
# Generate geometry string (xyz format with key=value comment line).
def generate_xyz_str(r, z, e=None, f=None, lattice=None):
    """
    Generate the xyz representation of a single geometry as string.

    The comment line holds the optional 'Lattice' and 'Energy' entries as
    well as a 'Properties' entry that lists the per-atom columns (species
    and positions, plus forces if `f` is given). The per-atom columns are
    formatted and merged by helpers from the `ui` module.
    """

    comment_parts = []
    if lattice is not None:
        lattice_str = ' '.join(['{:.12g}'.format(l) for l in lattice.T.ravel()])
        comment_parts.append('Lattice=\"{}\" '.format(lattice_str))
    if e is not None:
        comment_parts.append('Energy={:.12g} '.format(e))
    comment_parts.append('Properties=species:S:1:pos:R:3')
    if f is not None:
        comment_parts.append(':forces:R:3')
    comment_str = ''.join(comment_parts)

    # One element symbol per line.
    species_str = '\n'.join([_z_to_z_str_dict[z_i] for z_i in z])

    # Position columns, optionally followed by the force columns.
    r_f_str = ui.gen_mat_str(r)[0]
    if f is not None:
        r_f_str = ui.merge_col_str(r_f_str, ui.gen_mat_str(f)[0])

    header_str = str(len(r)) + '\n' + comment_str + '\n'
    return header_str + ui.merge_col_str(species_str, r_f_str)
301 | 
302 | 
def lattice_vec_to_par(lat):
    """
    Convert lattice vectors to lattice parameters.

    Parameters
    ----------
        lat : :obj:`numpy.ndarray`
            Matrix containing the lattice vectors as columns.

    Returns
    -------
        (:obj:`list`, :obj:`list`)
            Lengths of the lattice vectors and the angles (in degrees)
            between the respective other two vectors. An angle is set to
            90 degrees if the product of the two lengths does not exceed
            1e-16.
    """

    vecs = lat.T  # lattice vectors as rows
    lengths = [np.linalg.norm(vec) for vec in vecs]

    angles = []
    for i in range(3):
        # Cyclic indices of the other two vectors (negative indices wrap
        # around).
        j, k = i - 1, i - 2

        len_prod = lengths[j] * lengths[k]
        angles.append(
            180.0 / np.pi * np.arccos(np.dot(vecs[j], vecs[k]) / len_prod)
            if len_prod > 1e-16
            else 90.0
        )

    return lengths, angles
322 | 
323 | 
324 | ### FILE HANDLING
325 | 
326 | 
def is_file_type(arg, type):
    """
    Validate file path and check if the file is of the specified type.

    For datasets, `arg` may also be an MD5 fingerprint (or a directory
    path followed by a fingerprint), which is then resolved to the
    matching dataset file.

    Parameters
    ----------
        arg : :obj:`str`
            File path.
        type : {'dataset', 'task', 'model'}
            Possible file types.

    Returns
    -------
        (:obj:`str`, :obj:`dict`)
            Tuple of file path (as provided) and data stored in the
            file. The returned instance of NpzFile class must be
            closed to avoid leaking file descriptors.

    Raises
    ------
        ArgumentTypeError
            If the provided file path does not lead to a NpzFile.
        ArgumentTypeError
            If the file is not readable.
        ArgumentTypeError
            If the file is of wrong type.
        ArgumentTypeError
            If path/fingerprint is provided, but the path is not valid.
        ArgumentTypeError
            If fingerprint could not be resolved.
        ArgumentTypeError
            If multiple files with the same fingerprint exist.

    """

    md5_pattern = r'^[a-f0-9]{32}$'

    # Replace MD5 dataset fingerprint with file name, if necessary.
    if type == 'dataset' and not arg.endswith('.npz') and not os.path.isdir(arg):
        dir_path = '.'
        if re.search(md5_pattern, arg):  # arg looks similar to MD5 hash string
            md5_str = arg
        else:  # is it a path with a MD5 hash at the end?
            md5_str = os.path.basename(os.path.normpath(arg))
            dir_path = os.path.dirname(os.path.normpath(arg))

            if (
                dir_path == ''
            ):  # it is only a filename after all, hence not the right type
                raise argparse.ArgumentTypeError('{0} is not a .npz file'.format(arg))

            # The directory is listed below, so it has to exist no matter
            # what the last path component looks like (otherwise an OSError
            # would escape instead of an ArgumentTypeError).
            if not os.path.isdir(dir_path):
                raise argparse.ArgumentTypeError(
                    '{0} is not a directory'.format(dir_path)
                )

        file_names = filter_file_type(dir_path, type, md5_match=md5_str)

        if not len(file_names):
            raise argparse.ArgumentTypeError(
                "No {0} files with fingerprint '{1}' found in '{2}'".format(
                    type, md5_str, dir_path
                )
            )
        elif len(file_names) > 1:
            error_str = (
                "Multiple {0} files with fingerprint '{1}' found in '{2}'".format(
                    type, md5_str, dir_path
                )
            )
            for file_name in file_names:
                error_str += '\n       {0}'.format(file_name)

            raise argparse.ArgumentTypeError(error_str)
        else:
            arg = os.path.join(dir_path, file_names[0])

    if not arg.endswith('.npz'):
        raise argparse.ArgumentTypeError('{0} is not a .npz file'.format(arg))

    # NOTE(review): `allow_pickle=True` unpickles object arrays, which can
    # execute arbitrary code. Only load files from trusted sources.
    try:
        file = np.load(arg, allow_pickle=True)
    except Exception:
        raise argparse.ArgumentTypeError('{0} is not readable'.format(arg))

    if 'type' not in file or file['type'].astype(str) != type[0]:
        # Don't leak the file handle of a rejected file.
        if hasattr(file, 'close'):
            file.close()
        raise argparse.ArgumentTypeError('{0} is not a {1} file'.format(arg, type))

    return arg, file
412 | 
413 | 
def filter_file_type(dir, type, md5_match=None):
    """
    Filters all files from a directory that match a given type and (optionally)
    a given fingerprint.

    Parameters
    ----------
        dir : :obj:`str`
            Directory path.
        type : {'dataset', 'task', 'model'}
            Possible file types.
        md5_match : :obj:`str`, optional
            Fingerprint string.

    Returns
    -------
        :obj:`list` of :obj:`str`
            Sorted list of file names that match the specified type and
            fingerprint (if provided).

    Raises
    ------
        ArgumentTypeError
            If the directory contains unreadable .npz files.

    """

    file_names = []
    for file_name in sorted(os.listdir(dir)):
        if not file_name.endswith('.npz'):
            continue

        file_path = os.path.join(dir, file_name)

        # NOTE(review): `allow_pickle=True` unpickles object arrays, which
        # can execute arbitrary code. Only scan trusted directories.
        try:
            file = np.load(file_path, allow_pickle=True)
        except Exception:
            raise argparse.ArgumentTypeError(
                '{0} contains unreadable .npz files'.format(dir)
            )

        try:
            # Files store their type as its first letter ('d', 't' or 'm').
            if 'type' in file and file['type'].astype(str) == type[0]:

                if md5_match is None:
                    file_names.append(file_name)
                elif 'md5' in file and file['md5'] == md5_match:
                    file_names.append(file_name)
        finally:
            # Release the file handle even if inspecting the file fails.
            file.close()

    return file_names
462 | 
463 | 
def is_valid_file_type(arg_in):
    """
    Check if file is either a valid dataset, task or model file.

    The file types are tried in this order; the first match is returned.

    Parameters
    ----------
        arg_in : :obj:`str`
            File path.

    Returns
    -------
        (:obj:`str`, :obj:`dict`)
            Tuple of file path (as provided) and data stored in the
            file. The returned instance of NpzFile class must be
            closed to avoid leaking file descriptors.

    Raises
    ------
        ArgumentTypeError
            If the provided file path does not point to a supported
            file type.

    """

    for file_type in ('dataset', 'task', 'model'):
        try:
            return is_file_type(arg_in, file_type)
        except argparse.ArgumentTypeError:
            continue  # not this type, try the next one

    # Report the path as provided by the caller (nothing was resolved).
    raise argparse.ArgumentTypeError(
        '{0} is neither a dataset, task, nor model file'.format(arg_in)
    )
512 | 
513 | 
def is_dir_with_file_type(arg, type, or_file=False):
    """
    Validate directory path and collect the contained files of the
    specified type.

    Note
    ----
        If `or_file` is set and a file path is provided, this function
        acts like it is a directory with just that one file.

    Parameters
    ----------
        arg : :obj:`str`
            Directory path (or file path, see `or_file`).
        type : {'dataset', 'task', 'model'}
            Possible file types.
        or_file : bool
            If `arg` contains a file path, act like it's a directory
            with just a single file inside.

    Returns
    -------
        (:obj:`str`, :obj:`list` of :obj:`str`)
            Tuple of directory path and a list of contained file names of
            the specified type. The list is empty if the directory
            contains no such files.

    Raises
    ------
        ArgumentTypeError
            If the provided directory path does not lead to a directory.
        ArgumentTypeError
            If directory contains unreadable files.
        ArgumentTypeError
            If a file path is provided, but the file is not valid.
    """

    # Single file: validate it and report it as the only directory entry.
    if or_file and os.path.isfile(arg):
        _, npz_file = is_file_type(
            arg, type
        )  # raises exception if there is a problem with the file
        npz_file.close()
        return os.path.dirname(arg), [os.path.basename(arg)]

    # Anything else has to be a directory.
    if not os.path.isdir(arg):
        raise argparse.ArgumentTypeError('{0} is not a directory'.format(arg))

    return arg, filter_file_type(arg, type)
570 | 
571 | 
def is_task_dir_resumeable(
    train_dir, train_dataset, test_dataset, n_train, n_test, sigs, gdml
):
    r"""
    Check if a directory contains `task` and/or `model` files that
    match the configuration of a training process specified in the
    remaining arguments.

    Check if the training and test datasets in each task match
    `train_dataset` and `test_dataset`, if the number of training and
    test points matches and if the choices for the kernel
    hyper-parameter :math:`\sigma` are contained in the list. Check
    also, if the existing tasks/models contain symmetries and if
    that's consistent with the flag `gdml`. This function is useful
    for determining if a training process can be resumed using the
    existing files or not.

    Only files ending in ``.npz`` that carry a ``type`` entry equal to
    ``'t'`` or ``'m'`` are inspected; everything else in the directory
    is ignored.

    Parameters
    ----------
        train_dir : :obj:`str`
            Path to training directory.
        train_dataset : :obj:`dict`
            Dataset from which training points are sampled (only its
            ``'md5'`` entry is read here).
        test_dataset : :obj:`dict`
            Dataset from which test points are sampled (may be the
            same as `train_dataset`; only its ``'md5'`` entry is read).
        n_train : int
            Number of training points to sample.
        n_test : int
            Number of test points to sample.
        sigs : :obj:`list` of int
            List of :math:`\sigma` kernel hyper-parameter choices
            (usually: the hyper-parameter search grid)
        gdml : bool
            If `True`, don't include any symmetries in model (GDML),
            otherwise do (sGDML).

    Returns
    -------
        bool
            False, if any of the task/model files in the directory do
            not match the training configuration, True otherwise (also
            if there are no such files at all).
    """

    for file_name in sorted(os.listdir(train_dir)):
        if not file_name.endswith('.npz'):
            continue

        file_path = os.path.join(train_dir, file_name)

        # NOTE(review): 'allow_pickle=True' unpickles object arrays, so this
        # should only ever be pointed at trusted training directories.
        #
        # The archive is opened in a 'with' block, because the object returned
        # for '.npz' files keeps the underlying file open until it is closed.
        # This also covers the early 'return False' below.
        with np.load(file_path, allow_pickle=True) as file:

            if 'type' not in file:
                continue

            file_type = file['type']
            if not (file_type == 't' or file_type == 'm'):
                continue

            if (
                file['md5_train'] != train_dataset['md5']
                or file['md5_valid'] != test_dataset['md5']
                or len(file['idxs_train']) != n_train
                or len(file['idxs_valid']) != n_test
                or (gdml and file['perms'].shape[0] > 1)
                or file['sig'] not in sigs
            ):
                return False

    return True
637 | 
638 | 
639 | ### ARGUMENT VALIDATION
640 | 
641 | 
def is_strict_pos_int(arg):
    """
    Parse an integer and make sure it is strictly positive.

    Parameters
    ----------
        arg : :obj:`str`
            Integer as string.

    Returns
    -------
        int
            Parsed integer.

    Raises
    ------
        ArgumentTypeError
            If integer is not > 0.
        ValueError
            If `arg` can not be converted by `int()` at all.
    """
    value = int(arg)
    if value > 0:
        return value
    raise argparse.ArgumentTypeError('must be strictly positive')
665 | 
666 | 
def parse_list_or_range(arg):
    """
    Parses a string that represents either a single (non-negative)
    integer or an integer range in the notation
    ``<start>:[<step>:]<stop>``.

    The range includes the stop element (if the step size lands on it)
    and the step size defaults to 1 when omitted.

    Parameters
    ----------
        arg : :obj:`str`
            Integer or range string.

    Returns
    -------
        int or :obj:`list` of int
            The parsed integer, or the list of all integers in the range.

    Raises
    ------
        ArgumentTypeError
            If input can neither be interpreted as an integer nor a valid
            range, if the range is empty or if its step size is zero.
    """

    if re.match(r'^\d+:\d+:\d+$', arg) or re.match(r'^\d+:\d+$', arg):
        rng_params = [int(param) for param in arg.split(':')]

        if len(rng_params) == 2:  # start, stop
            start, stop = rng_params
            step = 1
        else:  # start, step, stop
            start, step, stop = rng_params

        # 'range()' would raise a bare ValueError for a zero step, which is
        # not the error type this validator is documented to raise.
        if step == 0:
            raise argparse.ArgumentTypeError(
                '{0} has a step size of zero'.format(arg)
            )

        rng = list(range(start, stop + 1, step))  # include last stop-element in range
        if len(rng) == 0:
            raise argparse.ArgumentTypeError('{0} is an empty range'.format(arg))

        return rng

    if re.match(r'^\d+$', arg):
        return int(arg)

    raise argparse.ArgumentTypeError(
        '{0} is neither a integer list, nor valid range in the form <start>:[<step>:]<stop>'.format(
            arg
        )
    )
709 | 


--------------------------------------------------------------------------------
/sgdml/solvers/iterative.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2020-2025 Stefan Chmiela
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import os
 26 | import logging
 27 | from functools import partial
 28 | import inspect
 29 | import multiprocessing as mp
 30 | 
 31 | import numpy as np
 32 | import scipy as sp
 33 | import timeit
 34 | import collections
 35 | 
 36 | from .. import DONE, NOT_DONE
 37 | from ..utils import ui
 38 | from ..predict import GDMLPredict
 39 | 
 40 | try:
 41 |     import torch
 42 | except ImportError:
 43 |     _has_torch = False
 44 | else:
 45 |     _has_torch = True
 46 | 
 47 | 
CG_STEPS_HIST_LEN = (
    100  # number of past steps to consider when calculating solver effectiveness
)
# If the solver effectiveness (in percent) is at or below this threshold once
# 'CG_STEPS_HIST_LEN' steps have been recorded, a solver restart is triggered
# (per the original note: with a stronger preconditioner).
EFF_RESTART_THRESH = 0

# NOTE(review): presumably the maximum number of solver restarts; the code
# that consumes this constant is not part of this section - confirm.
MAX_NUM_RESTARTS = 6
 54 | 
 55 | 
 56 | class CGRestartException(Exception):
 57 |     pass
 58 | 
 59 | 
 60 | class Iterative(object):
 61 |     def __init__(
 62 |         self,
 63 |         gdml_train,
 64 |         desc,
 65 |         max_memory,
 66 |         max_processes,
 67 |         use_torch,
 68 |         callback=None,
 69 |     ):
 70 | 
 71 |         self.log = logging.getLogger(__name__)
 72 | 
 73 |         self.gdml_train = gdml_train
 74 |         self.gdml_predict = None
 75 |         self.desc = desc
 76 | 
 77 |         self.callback = callback
 78 | 
 79 |         self._max_memory = max_memory
 80 |         self._max_processes = max_processes
 81 |         self._use_torch = use_torch
 82 | 
    def _init_precon_operator(
        self, task, R_desc, R_d_desc, tril_perms_lin, inducing_pts_idxs, callback=None
    ):
        """
        Build the preconditioner as a linear operator.

        The operator is based on the factor returned by
        `_nystroem_cholesky_factor` for the columns `inducing_pts_idxs`.
        Apart from its very first call (see below), applying it to a
        vector `v` yields ``(L^T (L v) - v) / lam``, where `L` is that
        factor and `lam` is read from the task.

        Parameters
        ----------
            task : :obj:`dict`
                Task; the entries 'lam', 'sig' and 'use_E_cstr' are read.
            R_desc : :obj:`numpy.ndarray`
                Passed through to `_nystroem_cholesky_factor`.
            R_d_desc : :obj:`numpy.ndarray`
                Passed through to `_nystroem_cholesky_factor`.
            tril_perms_lin : :obj:`numpy.ndarray`
                Passed through to `_nystroem_cholesky_factor`.
            inducing_pts_idxs
                Column selection for the factor (passed as `col_idxs`).
            callback : callable, optional
                Progress callback for the factor computation.

        Returns
        -------
            :obj:`scipy.sparse.linalg.LinearOperator`
                Square operator whose size is the second dimension of
                the factor.
            :obj:`numpy.ndarray`
                Leverage scores, i.e. the squared entries of the factor
                summed over its first axis.
        """

        lam = task['lam']
        lam_inv = 1.0 / lam

        sig = task['sig']

        use_E_cstr = task['use_E_cstr']

        L_inv_K_mn = self._nystroem_cholesky_factor(
            R_desc,
            R_d_desc,
            tril_perms_lin,
            sig,
            lam,
            use_E_cstr=use_E_cstr,
            col_idxs=inducing_pts_idxs,
            callback=callback,
        )

        L_inv_K_mn = np.ascontiguousarray(L_inv_K_mn)

        lev_scores = np.einsum(
            'i...,i...->...', L_inv_K_mn, L_inv_K_mn
        )  # compute leverage scores because it is basically free once we got the factor

        m, n = L_inv_K_mn.shape

        # Disabled PyTorch code path: the 'and False' makes this branch (and
        # its counterpart in '_P_vec') unreachable.
        if self._use_torch and False:  # TURNED OFF!
            _torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
            L_inv_K_mn_torch = torch.from_numpy(L_inv_K_mn).to(_torch_device)

        # Module-level flag (the same global name is also used by
        # '_init_kernel_operator'). It is reset here, right before the
        # operator is constructed.
        global is_primed
        is_primed = False

        def _P_vec(v):

            # The first call after the reset returns its input unchanged.
            # NOTE(review): presumably this absorbs the probing call that
            # 'LinearOperator' issues at construction time - confirm against
            # the scipy version in use.
            global is_primed
            if not is_primed:
                is_primed = True
                return v

            if self._use_torch and False:  # TURNED OFF!

                v_torch = torch.from_numpy(v).to(_torch_device)[:, None]
                return (
                    L_inv_K_mn_torch.t().mm(L_inv_K_mn_torch.mm(v_torch)) - v_torch
                ).cpu().numpy() * lam_inv

            else:

                # (L^T (L v) - v) / lam, using in-place updates on the
                # intermediate result.
                ret = L_inv_K_mn.T.dot(L_inv_K_mn.dot(v))
                ret -= v
                ret *= lam_inv

                return ret

        return sp.sparse.linalg.LinearOperator((n, n), matvec=_P_vec), lev_scores
143 | 
    def _init_kernel_operator(
        self, task, R_desc, R_d_desc, tril_perms_lin, lam, n, callback=None
    ):
        """
        Build the kernel system as a linear operator that is evaluated
        through a predictor instead of an explicit matrix.

        A model with all-zero coefficients is created and wrapped in a
        `GDMLPredict` instance (stored in `self.gdml_predict`). Apart
        from its very first call (see below), applying the operator to a
        vector `v` sets `v` as the model coefficients, evaluates the
        predictor and returns the flattened prediction minus ``lam * v``.

        Parameters
        ----------
            task : :obj:`dict`
                Task; the entry 'use_E_cstr' is read here.
            R_desc : :obj:`numpy.ndarray`
                Descriptors; the first dimension is used as the number
                of training points.
            R_d_desc : :obj:`numpy.ndarray`
                Passed on to the model and the predictor.
            tril_perms_lin : :obj:`numpy.ndarray`
                Passed on to the model.
            lam : float
                Factor of the ``lam * v`` term that is subtracted from
                the prediction.
            n : int
                Size of the (square) operator.
            callback : callable, optional
                Progress callback; only used when PyTorch is not in use.

        Returns
        -------
            :obj:`scipy.sparse.linalg.LinearOperator`
                Operator of shape (n, n).
        """

        n_train = R_desc.shape[0]

        # dummy alphas
        v_F = np.zeros((n - n_train, 1)) if task['use_E_cstr'] else np.zeros((n, 1))
        v_E = np.zeros((n_train, 1)) if task['use_E_cstr'] else None

        # Note: The standard deviation is set to 1.0, because we are predicting normalized labels here.
        model = self.gdml_train.create_model(
            task, 'cg', R_desc, R_d_desc, tril_perms_lin, 1.0, v_F, alphas_E=v_E
        )

        self.gdml_predict = GDMLPredict(
            model,
            max_memory=self._max_memory,
            max_processes=self._max_processes,
            use_torch=self._use_torch,
        )

        self.gdml_predict.set_R_desc(R_desc)  # only needed on CPU
        self.gdml_predict.set_R_d_desc(R_d_desc)

        if not self._use_torch:

            # Report the parallelization setup as its own progress step.
            if callback is not None:
                callback = partial(callback, disp_str='Optimizing CPU parallelization')
                callback(NOT_DONE)

            self.gdml_predict.prepare_parallel(n_bulk=n_train)

            if callback is not None:
                callback(DONE)

        # Module-level flag (the same global name is also used by
        # '_init_precon_operator'). It is reset here, right before the
        # operator is constructed.
        global is_primed
        is_primed = False

        def _K_vec(v):

            # The first call after the reset returns its input unchanged.
            # NOTE(review): presumably this absorbs the probing call that
            # 'LinearOperator' issues at construction time - confirm against
            # the scipy version in use.
            global is_primed
            if not is_primed:
                is_primed = True
                return v

            # With energy constraints, the last 'n_train' entries of 'v' are
            # the energy coefficients, everything before that belongs to the
            # forces.
            v_F, v_E = v, None
            if task['use_E_cstr']:
                v_F, v_E = v[:-n_train], v[-n_train:]

            self.gdml_predict.set_alphas(v_F, alphas_E=v_E)

            pred = self.gdml_predict.predict(return_E=task['use_E_cstr'])
            if task['use_E_cstr']:
                # Same layout as 'v': forces first, then (negated) energies.
                e_pred, f_pred = pred
                pred = np.hstack((f_pred.ravel(), -e_pred))
            else:
                pred = pred[0].ravel()

            pred -= lam * v
            return pred

        return sp.sparse.linalg.LinearOperator((n, n), matvec=_K_vec)
207 | 
    def _nystroem_cholesky_factor(
        self,
        R_desc,
        R_d_desc,
        tril_perms_lin,
        sig,
        lam,
        use_E_cstr,
        col_idxs,
        callback_task_name='',
        callback=None,
    ):
        """
        Compute the factor on which the preconditioner and the leverage
        score estimates in this class are based.

        Only the kernel columns selected by `col_idxs` are assembled.
        The result is then obtained via two Cholesky factorizations,
        each followed by block-wise triangular solves. All of this
        happens in place, inside the one buffer returned by the kernel
        assembly, which is why the statement order in this method
        matters.

        Parameters
        ----------
            R_desc : :obj:`numpy.ndarray`
                Descriptors; its shape determines the problem size.
            R_d_desc : :obj:`numpy.ndarray`
                Passed through to the kernel assembly.
            tril_perms_lin : :obj:`numpy.ndarray`
                Passed through to the kernel assembly.
            sig
                Kernel hyper-parameter, passed through to the kernel
                assembly.
            lam : float
                Value that is added to the diagonal before the second
                factorization.
            use_E_cstr : bool
                Whether energy constraints are part of the system.
            col_idxs : :obj:`slice` or sequence of int
                Selected kernel columns.
            callback_task_name : :obj:`str`, optional
                Label that is appended (in parentheses) to all progress
                messages.
            callback : callable, optional
                Progress callback.

        Returns
        -------
            :obj:`numpy.ndarray`
                Transpose of the (overwritten) upper part `K_nm` of the
                kernel buffer.
        """

        if callback_task_name != '':
            callback_task_name = ' ({})'.format(callback_task_name)

        if callback is not None:
            callback = partial(
                callback,
                disp_str='Assembling kernel [m x k]{}'.format(callback_task_name),
            )

        # Recover the number of atoms from the descriptor dimension by
        # inverting dim_d = n_atoms * (n_atoms - 1) / 2.
        dim_d = R_desc.shape[1]
        n_atoms = int((1 + np.sqrt(8 * dim_d + 1)) / 2)
        # Total system size: three entries per atom and training point, plus
        # one more per training point if energy constraints are used.
        n = R_desc.shape[0] * n_atoms * 3 + (R_desc.shape[0] if use_E_cstr else 0)
        # Number of selected columns.
        m = len(
            range(*col_idxs.indices(n)) if isinstance(col_idxs, slice) else col_idxs
        )

        # The assembly is asked for 'm' extra rows, which serve as scratch
        # space for the small [m x m] matrices below.
        K_nmm = self.gdml_train._assemble_kernel_mat(
            R_desc,
            R_d_desc,
            tril_perms_lin,
            sig,
            self.desc,
            use_E_cstr=use_E_cstr,
            col_idxs=col_idxs,
            alloc_extra_rows=m,
            callback=callback,
        )

        # Store (psd) copy of K_mm in lower part of this oversized K_(n+m)m matrix.
        K_nmm[-m:, :] = -K_nmm[col_idxs, :]

        # Slices of the same buffer: everything but the last 'm' rows, and the
        # last 'm' rows.
        K_nm = K_nmm[:-m, :]
        K_mm = K_nmm[-m:, :]

        if callback is not None:
            callback = partial(
                callback,
                disp_str='Cholesky fact. (1/2) [k x k]{}'.format(callback_task_name),
            )
            callback(NOT_DONE)

        # Additional regularization is almost always necessary here (hence pre_reg=True).
        K_mm, lower = self._cho_factor_stable(K_mm, pre_reg=True)  # overwrites input!
        L_mm = K_mm
        # del K_mm

        if callback is not None:
            callback(DONE)
            callback = partial(
                callback,
                disp_str='m tri. solves (1/2) [k x k]{}'.format(callback_task_name),
            )
            callback(0, n)

        # Overwrite 'K_nm' block by block with the solutions of the triangular
        # systems; the blocks only exist to allow for progress updates.
        # NOTE(review): a block size of zero (n < 4) would make 'range' raise.
        b_start, b_size = 0, int(n / 4)  # update in percentage steps of 25
        for b_stop in list(range(b_size, n, b_size)) + [n]:

            K_nm[b_start:b_stop, :] = sp.linalg.solve_triangular(
                L_mm,
                K_nm[b_start:b_stop, :].T,
                lower=lower,
                trans='T',
                overwrite_b=True,
                check_finite=False,
            ).T
            b_start = b_stop

            if callback is not None:
                callback(b_stop, n)

        del L_mm

        # Reuse the scratch rows for K_nm^T K_nm with 'lam' added to its
        # diagonal.
        K_nmm[-m:, :] = K_nm.T.dot(K_nm)
        K_nmm[-m:, :][np.diag_indices_from(K_nmm[-m:, :])] += lam
        inner = K_nmm[-m:, :]

        if callback is not None:
            callback = partial(
                callback,
                disp_str='Cholesky fact. (2/2) [k x k]{}'.format(callback_task_name),
            )
            callback(NOT_DONE)

        L_lower = self._cho_factor_stable(
            inner, eps_mag_max=-14
        )  # Do not regularize more than 1e-14.
        if L_lower is not None:
            K_nmm[-m:, :], lower = L_lower
            L = K_nmm[-m:, :]
            del inner
        else:

            # NOTE(review): '_cho_factor_stable' in this class either returns
            # a factor or terminates the process, so this QR fallback looks
            # unreachable as it stands. Also note that 'callback' is used
            # here without the 'is not None' check applied everywhere else.
            callback = partial(
                callback,
                disp_str='QR fact. (alt.) [k x k]{}'.format(callback_task_name),
            )
            callback(NOT_DONE)

            K_nmm[-m:, :] = 0
            K_nmm[-m:, :][np.diag_indices(m)] = np.sqrt(lam)

            K_nmm[-m:, :] = np.linalg.qr(K_nmm, mode='r')
            L = K_nmm[-m:, :]
            lower = False

        if callback is not None:
            callback(DONE)
            callback = partial(
                callback,
                disp_str='m tri. solves (2/2) [k x k]{}'.format(callback_task_name),
            )
            callback(0, n)

        # Second round of block-wise triangular solves, now with the factor of
        # the inner matrix.
        b_start, b_size = 0, int(n / 4)  # update in percentage steps of 25
        for b_stop in list(range(b_size, n, b_size)) + [n]:

            K_nm[b_start:b_stop, :] = sp.linalg.solve_triangular(
                L,
                K_nm[b_start:b_stop, :].T,
                lower=lower,
                trans='T',
                overwrite_b=True,
                check_finite=False,
            ).T  # Note: Overwrites K_nm to save memory
            b_start = b_stop

            if callback is not None:
                callback(b_stop, n)
        del L

        return K_nm.T
352 | 
353 |     def _lev_scores(
354 |         self,
355 |         R_desc,
356 |         R_d_desc,
357 |         tril_perms_lin,
358 |         sig,
359 |         lam,
360 |         use_E_cstr,
361 |         n_inducing_pts,
362 |         callback=None,
363 |     ):
364 | 
365 |         n_train, dim_d = R_d_desc.shape[:2]
366 |         dim_i = 3 * int((1 + np.sqrt(8 * dim_d + 1)) / 2)
367 | 
368 |         # Convert from training points to actual columns.
369 |         # dim_m = (
370 |         #    np.maximum(1, n_inducing_pts // 4) * dim_i
371 |         # )  # only use 1/4 of inducing points for leverage score estimate
372 |         dim_m = dim_i * min(n_inducing_pts, 10)
373 | 
374 |         # Which columns to use for leverage score approximation?
375 |         lev_approx_idxs = np.sort(
376 |             np.random.choice(
377 |                 n_train * dim_i + (n_train if use_E_cstr else 0), dim_m, replace=False
378 |             )
379 |         )  # random subset of columns
380 |         # lev_approx_idxs = np.sort(np.random.choice(n_train*dim_i, dim_m, replace=False)) # random subset of columns
381 | 
382 |         # lev_approx_idxs = np.s_[
383 |         #    :dim_m
384 |         # ]  # first 'dim_m' columns (faster kernel construction)
385 | 
386 |         L_inv_K_mn = self._nystroem_cholesky_factor(
387 |             R_desc,
388 |             R_d_desc,
389 |             tril_perms_lin,
390 |             sig,
391 |             lam,
392 |             use_E_cstr=use_E_cstr,
393 |             col_idxs=lev_approx_idxs,
394 |             callback_task_name='lev. scores',
395 |             callback=callback,
396 |         )
397 | 
398 |         lev_scores = np.einsum('i...,i...->...', L_inv_K_mn, L_inv_K_mn)
399 |         return lev_scores
400 | 
401 |     def inducing_pts_from_lev_scores(self, lev_scores, N):
402 | 
403 |         # Sample 'N' columns with probabilities proportional to the leverage scores.
404 |         inducing_pts_idxs = np.random.choice(
405 |             np.arange(lev_scores.size),
406 |             N,
407 |             replace=False,
408 |             p=lev_scores / lev_scores.sum(),
409 |         )
410 | 
411 |         return np.sort(inducing_pts_idxs)
412 | 
413 |     # performs a cholesky decompostion of a matrix, but regularizes the matrix (if neeeded) until its positive definite
414 |     def _cho_factor_stable(self, M, pre_reg=False, eps_mag_max=1):
415 |         """
416 |         Performs a Cholesky decompostion of a matrix, but regularizes
417 |         as needed until its positive definite.
418 | 
419 |         Parameters
420 |         ----------
421 |             M : :obj:`numpy.ndarray`
422 |                 Matrix to factorize.
423 |             pre_reg : boolean, optional
424 |                 Regularize M right away (machine precision), before
425 |                 trying to factorize it (default: False).
426 | 
427 |         Returns
428 |         -------
429 |             :obj:`numpy.ndarray`
430 |                 Matrix whose upper or lower triangle contains the Cholesky factor of a. Other parts of the matrix contain random data.
431 |             boolean
432 |                 Flag indicating whether the factor is in the lower or upper triangle
433 |         """
434 | 
435 |         eps = np.finfo(float).eps
436 |         eps_mag = int(np.floor(np.log10(eps)))
437 | 
438 |         if pre_reg:
439 |             M[np.diag_indices_from(M)] += eps
440 |             eps_mag += 1  # if additional regularization is necessary, start from the next order of magnitude
441 | 
442 |         for reg in 10.0 ** np.arange(
443 |             eps_mag, eps_mag_max + 1
444 |         ):  # regularize more and more aggressively (strongest regularization: 1)
445 |             try:
446 | 
447 |                 L, lower = sp.linalg.cho_factor(
448 |                     M, overwrite_a=False, check_finite=False
449 |                 )
450 | 
451 |             except np.linalg.LinAlgError as e:
452 | 
453 |                 if 'not positive definite' in str(e):
454 |                     self.log.debug(
455 |                         'Cholesky solver needs more aggressive regularization (adding {} to diagonal)'.format(
456 |                             reg
457 |                         )
458 |                     )
459 |                     M[np.diag_indices_from(M)] += reg
460 |                 else:
461 |                     raise e
462 |             else:
463 |                 return L, lower
464 | 
465 |         self.log.critical(
466 |             'Failed to factorize despite strong regularization (max: {})!\nYou could try a larger sigma.'.format(
467 |                 10.0**eps_mag_max
468 |             )
469 |         )
470 |         print()
471 |         os._exit(1)
472 | 
473 |     def solve(
474 |         self,
475 |         task,
476 |         R_desc,
477 |         R_d_desc,
478 |         tril_perms_lin,
479 |         y,
480 |         y_std,
481 |         tol=1e-4,
482 |         save_progr_callback=None,
483 |     ):
484 | 
485 |         global num_iters, start, resid, avg_tt, m  # , P_t
486 | 
487 |         n_train, n_atoms = task['R_train'].shape[:2]
488 |         dim_i = 3 * n_atoms
489 | 
490 |         sig = task['sig']
491 |         lam = task['lam']
492 | 
493 |         # these keys are only present if the task was created from an existing model
494 |         alphas0_F = task['alphas0_F'] if 'alphas0_F' in task else None
495 |         alphas0_E = task['alphas0_E'] if 'alphas0_E' in task else None
496 |         num_iters0 = task['solver_iters'] if 'solver_iters' in task else 0
497 | 
498 |         # Number of inducing points to use for Nystrom approximation.
499 |         max_memory_bytes = self._max_memory * 1024**3
500 |         max_n_inducing_pts = Iterative.max_n_inducing_pts(
501 |             n_train, n_atoms, max_memory_bytes
502 |         )
503 |         n_inducing_pts = min(n_train, max_n_inducing_pts)
504 |         n_inducing_pts_init = (
505 |             len(task['inducing_pts_idxs']) // (3 * n_atoms)
506 |             if 'inducing_pts_idxs' in task
507 |             else None
508 |         )
509 | 
510 |         if self.callback is not None:
511 |             self.callback = partial(
512 |                 self.callback,
513 |                 disp_str='Building preconditioner (k={} ind. point{})'.format(
514 |                     n_inducing_pts, 's' if n_inducing_pts > 1 else ''
515 |                 ),
516 |             )
517 |         subtask_callback = (
518 |             partial(ui.sec_callback, main_callback=self.callback)
519 |             if self.callback is not None
520 |             else None
521 |         )
522 | 
523 |         lev_scores = None
524 |         if n_inducing_pts_init is not None and n_inducing_pts_init == n_inducing_pts:
525 |             inducing_pts_idxs = task['inducing_pts_idxs']  # reuse old inducing points
526 |         else:
527 |             # Determine good inducing points.
528 |             lev_scores = self._lev_scores(
529 |                 R_desc,
530 |                 R_d_desc,
531 |                 tril_perms_lin,
532 |                 sig,
533 |                 lam,
534 |                 task['use_E_cstr'],
535 |                 n_inducing_pts,
536 |                 callback=subtask_callback,
537 |             )
538 | 
539 |             dim_m = n_inducing_pts * dim_i
540 |             inducing_pts_idxs = self.inducing_pts_from_lev_scores(lev_scores, dim_m)
541 | 
542 |         start = timeit.default_timer()
543 |         P_op, lev_scores = self._init_precon_operator(
544 |             task,
545 |             R_desc,
546 |             R_d_desc,
547 |             tril_perms_lin,
548 |             inducing_pts_idxs,
549 |             callback=subtask_callback,
550 |         )
551 |         stop = timeit.default_timer()
552 | 
553 |         if self.callback is not None:
554 |             dur_s = stop - start
555 |             sec_disp_str = 'took {:.1f} s'.format(dur_s) if dur_s >= 0.1 else ''
556 |             self.callback(DONE, sec_disp_str=sec_disp_str)
557 | 
558 |             self.callback = partial(
559 |                 self.callback,
560 |                 disp_str='Initializing solver',
561 |             )
562 |         subtask_callback = (
563 |             partial(ui.sec_callback, main_callback=self.callback)
564 |             if self.callback is not None
565 |             else None
566 |         )
567 | 
568 |         n = P_op.shape[0]
569 |         K_op = self._init_kernel_operator(
570 |             task, R_desc, R_d_desc, tril_perms_lin, lam, n, callback=subtask_callback
571 |         )
572 | 
573 |         num_iters = int(num_iters0)
574 | 
575 |         if self.callback is not None:
576 | 
577 |             num_devices = (
578 |                 mp.cpu_count() if self._max_processes is None else self._max_processes
579 |             )
580 |             if self._use_torch:
581 |                 num_devices = (
582 |                     torch.cuda.device_count()
583 |                     if torch.cuda.is_available()
584 |                     else torch.get_num_threads()
585 |                 )
586 |             hardware_str = '{:d} {}{}{}'.format(
587 |                 num_devices,
588 |                 'GPU' if self._use_torch and torch.cuda.is_available() else 'CPU',
589 |                 's' if num_devices > 1 else '',
590 |                 '[PyTorch]' if self._use_torch else '',
591 |             )
592 | 
593 |             self.callback(NOT_DONE, sec_disp_str=None)
594 | 
595 |         start = 0
596 |         resid = 0
597 |         avg_tt = 0
598 | 
599 |         global alpha_t, eff, steps_hist, callback_disp_str
600 | 
601 |         alpha_t = None
602 |         if alphas0_F is not None:  # TODO: improve me: this will not workt with E_cstr
603 |             alpha_t = -alphas0_F
604 | 
605 |         if alphas0_E is not None:
606 |             alpha_t = np.hstack((alpha_t, -alphas0_E))
607 | 
608 |         steps_hist = collections.deque(
609 |             maxlen=CG_STEPS_HIST_LEN
610 |         )  # moving average window for step history
611 | 
612 |         callback_disp_str = 'Initializing solver'
613 | 
614 |         def _cg_status(xk):
615 | 
616 |             global num_iters, start, resid, alpha_t, avg_tt, eff, steps_hist, callback_disp_str, P_t
617 | 
618 |             stop = timeit.default_timer()
619 |             tt = 0.0 if start == 0 else (stop - start)
620 |             avg_tt += tt
621 |             start = timeit.default_timer()
622 | 
623 |             old_resid = resid
624 |             try:
625 | 
626 |                 # Can we extract the residual from the solver?
627 |                 f_locals = inspect.currentframe().f_back.f_locals
628 |                 if 'resid' in f_locals:
629 |                     resid = f_locals['resid']
630 |                 elif 'r' in f_locals:
631 |                     resid = np.linalg.norm(f_locals['r'])
632 |                 else:
633 |                     raise KeyError
634 | 
635 |             except KeyError:
636 | 
637 |                 # Fallback: compute residual from scratch (slower)
638 |                 rk = y + K_op @ xk
639 |                 resid = np.linalg.norm(rk)
640 | 
641 |             step = 0 if num_iters == num_iters0 else resid - old_resid
642 |             steps_hist.append(step)
643 | 
644 |             steps_hist_arr = np.array(steps_hist)
645 |             steps_hist_all = np.abs(steps_hist_arr).sum()
646 |             steps_hist_ratio = (
647 |                 (-steps_hist_arr.clip(max=0).sum() / steps_hist_all)
648 |                 if steps_hist_all > 0
649 |                 else 1
650 |             )
651 |             eff = (
652 |                 0 if num_iters == num_iters0 else (int(100 * steps_hist_ratio) - 50) * 2
653 |             )
654 | 
655 |             if tt > 0.0 and num_iters % int(np.ceil(1.0 / tt)) == 0:  # once per second
656 | 
657 |                 train_rmse = resid / np.sqrt(len(y))
658 |                 if self.callback is not None:
659 |                     callback_disp_str = 'Training error (RMSE): forces {:.4f}'.format(
660 |                         train_rmse
661 |                     )
662 |                     self.callback(
663 |                         NOT_DONE,
664 |                         disp_str=callback_disp_str,
665 |                         sec_disp_str=(
666 |                             '{:d} iter @ {} iter/s [eff: {:d}%], k={:d}'.format(
667 |                                 num_iters,
668 |                                 '{:.2f}'.format(1.0 / tt),
669 |                                 eff,
670 |                                 n_inducing_pts,
671 |                             )
672 |                         ),
673 |                     )
674 | 
675 |             # Write out current solution as a model file once every 2 minutes (give or take).
676 |             if (
677 |                 tt > 0.0
678 |                 and num_iters % int(np.ceil(2 * 60.0 / tt)) == 0
679 |                 and num_iters % 10 == 0
680 |             ):
681 | 
682 |                 self.log.debug('Saving model checkpoint.')
683 | 
684 |                 # TODO: support for +E constraints (done?)
685 |                 alphas_F, alphas_E = -xk, None
686 |                 if task['use_E_cstr']:
687 |                     n_train = task['R_train'].shape[0]
688 |                     alphas_F, alphas_E = -xk[:-n_train], -xk[-n_train:]
689 | 
690 |                 unconv_model = self.gdml_train.create_model(
691 |                     task,
692 |                     'cg',
693 |                     R_desc,
694 |                     R_d_desc,
695 |                     tril_perms_lin,
696 |                     y_std,
697 |                     alphas_F,
698 |                     alphas_E=alphas_E,
699 |                 )
700 | 
701 |                 solver_keys = {
702 |                     'solver_tol': tol,
703 |                     'solver_iters': num_iters
704 |                     + 1,  # number of iterations performed (cg solver)
705 |                     'solver_resid': resid,  # residual of solution
706 |                     'norm_y_train': np.linalg.norm(y),
707 |                     'inducing_pts_idxs': inducing_pts_idxs,
708 |                 }
709 | 
710 |                 unconv_model.update(solver_keys)
711 | 
712 |                 # recover integration constant
713 |                 self.gdml_predict.set_alphas(alphas_F, alphas_E=alphas_E)
714 |                 E_pred, _ = self.gdml_predict.predict()
715 | 
716 |                 E_pred *= y_std
717 | 
718 |                 unconv_model['c'] = 0
719 |                 if 'E_train' in task:
720 |                     E_ref = np.squeeze(task['E_train'])
721 |                     unconv_model['c'] = np.mean(E_ref - E_pred)
722 | 
723 |                 if save_progr_callback is not None:
724 |                     save_progr_callback(unconv_model)
725 | 
726 |             num_iters += 1
727 | 
728 |             n_train = task['idxs_train'].shape[0]
729 |             if (
730 |                 len(steps_hist) == CG_STEPS_HIST_LEN
731 |                 and eff <= EFF_RESTART_THRESH
732 |                 and n_inducing_pts < n_train
733 |             ):
734 |                 alpha_t = xk
735 |                 raise CGRestartException
736 | 
737 |         num_restarts = 0
738 |         while True:
739 |             try:
740 |                 alphas, info = sp.sparse.linalg.cg(
741 |                     -K_op,
742 |                     y,
743 |                     x0=alpha_t,
744 |                     M=P_op,
745 |                     rtol=tol,  # norm(residual) <= max(rtol*norm(b), atol)
746 |                     atol=0,
747 |                     maxiter=3
748 |                     * n_atoms
749 |                     * n_train
750 |                     * 10,  # allow 10x as many iterations as theoretically needed (at perfect precision)
751 |                     callback=_cg_status,
752 |                 )
753 |                 alphas = -alphas
754 | 
755 |             except CGRestartException:
756 | 
757 |                 num_restarts += 1
758 |                 steps_hist.clear()
759 | 
760 |                 if num_restarts == MAX_NUM_RESTARTS:
761 |                     info = 1  # convergence to tolerance not achieved
762 |                     alphas = alpha_t
763 |                     break
764 |                 else:
765 |                     num_restarts_left = MAX_NUM_RESTARTS - num_restarts - 1
766 |                     self.log.debug(
767 |                         'Restarts left before giving up: {}{}.'.format(
768 |                             num_restarts_left,
769 |                             ' (final trial)' if num_restarts_left == 0 else '',
770 |                         )
771 |                     )
772 | 
773 |                 # TODO: keep using same number of points
774 | 
775 |                 n_inducing_pts = min(
776 |                     int(np.ceil(1.2 * n_inducing_pts)), n_train
777 |                 )  # increase in increments (ignoring memory limits...)
778 | 
779 |                 subtask_callback = (
780 |                     partial(
781 |                         ui.sec_callback,
782 |                         main_callback=partial(
783 |                             self.callback, disp_str=callback_disp_str
784 |                         ),
785 |                     )
786 |                     if self.callback is not None
787 |                     else None
788 |                 )
789 | 
790 |                 dim_m = n_inducing_pts * dim_i
791 |                 inducing_pts_idxs = self.inducing_pts_from_lev_scores(lev_scores, dim_m)
792 | 
793 |                 del P_op
794 |                 P_op, lev_scores = self._init_precon_operator(
795 |                     task,
796 |                     R_desc,
797 |                     R_d_desc,
798 |                     tril_perms_lin,
799 |                     inducing_pts_idxs,
800 |                     callback=subtask_callback,
801 |                 )
802 | 
803 |             else:
804 |                 break
805 | 
806 |         is_conv = info == 0
807 | 
808 |         if self.callback is not None:
809 | 
810 |             is_conv_warn_str = '' if is_conv else ' (NOT CONVERGED)'
811 |             self.callback(
812 |                 DONE,
813 |                 disp_str='Training on {:,} points{}'.format(n_train, is_conv_warn_str),
814 |                 sec_disp_str=(
815 |                     '{:d} iter @ {} iter/s'.format(
816 |                         num_iters,
817 |                         '{:.2f}'.format(num_iters / avg_tt) if avg_tt > 0 else '--',
818 |                     )
819 |                 ),
820 |                 done_with_warning=not is_conv,
821 |             )
822 | 
823 |         train_rmse = resid / np.sqrt(len(y))
824 | 
825 |         return alphas, tol, num_iters, resid, train_rmse, inducing_pts_idxs, is_conv
826 | 
827 |     @staticmethod
828 |     def max_n_inducing_pts(n_train, n_atoms, max_memory_bytes):
829 | 
830 |         SQUARE_FACT = 5
831 |         LINEAR_FACT = 4
832 | 
833 |         to_bytes = 8
834 |         to_dof = (3 * n_atoms) ** 2 * to_bytes
835 | 
836 |         sq_factor = LINEAR_FACT * n_train * to_dof
837 |         ny_factor = SQUARE_FACT * to_dof
838 | 
839 |         n_inducing_pts = (
840 |             np.sqrt(sq_factor**2 + 4.0 * ny_factor * max_memory_bytes) - sq_factor
841 |         ) / (2 * ny_factor)
842 |         n_inducing_pts = int(n_inducing_pts)
843 | 
844 |         return min(n_inducing_pts, n_train)
845 | 
846 |     @staticmethod
847 |     def est_memory_requirement(n_train, n_inducing_pts, n_atoms):
848 | 
849 |         SQUARE_FACT = 5
850 |         LINEAR_FACT = 4
851 | 
852 |         # est_bytes = n_train * n_inducing_pts * (3 * n_atoms) ** 2 * 8  # P_op
853 |         # est_bytes += 2 * (n_inducing_pts * 3 * n_atoms) ** 2 * 8  # P_op [cho_factor]
854 |         # est_bytes += (n_train * 3 * n_atoms) * 8  # lev_scores
855 |         # est_bytes += (n_train * 3 * n_atoms) * 8  # alpha
856 | 
857 |         est_bytes = LINEAR_FACT * n_train * n_inducing_pts * (3 * n_atoms) ** 2 * 8
858 | 
859 |         est_bytes += (
860 |             SQUARE_FACT * n_inducing_pts * n_inducing_pts * (3 * n_atoms) ** 2 * 8
861 |         )
862 | 
863 |         # est_bytes += (n_train * 3 * n_atoms) * 8  # lev_scores
864 |         # est_bytes += (n_train * 3 * n_atoms) * 8  # alpha
865 | 
866 |         return est_bytes
867 | 


--------------------------------------------------------------------------------
/sgdml/utils/perm.py:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/python
   2 | 
   3 | # MIT License
   4 | #
   5 | # Copyright (c) 2018-2021 Stefan Chmiela
   6 | #
   7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 | # of this software and associated documentation files (the "Software"), to deal
   9 | # in the Software without restriction, including without limitation the rights
  10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 | # copies of the Software, and to permit persons to whom the Software is
  12 | # furnished to do so, subject to the following conditions:
  13 | #
  14 | # The above copyright notice and this permission notice shall be included in all
  15 | # copies or substantial portions of the Software.
  16 | #
  17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 | # SOFTWARE.
  24 | 
  25 | from __future__ import print_function
  26 | 
  27 | import multiprocessing as mp
  28 | 
# Process pool class bound to the 'fork' start method (presumably so that worker
# processes inherit the module-level `glob` arrays -- TODO confirm).
Pool = mp.get_context('fork').Pool
  30 | 
  31 | import sys
  32 | import timeit
  33 | from functools import partial
  34 | 
  35 | import numpy as np
  36 | import scipy.optimize
  37 | import scipy.spatial.distance
  38 | from scipy.sparse import csr_matrix
  39 | from scipy.sparse.csgraph import minimum_spanning_tree
  40 | 
  41 | from .. import DONE, NOT_DONE
  42 | from .desc import Desc
  43 | from . import ui
  44 | 
# Module-level store for the shared arrays (and their shapes) that
# `bipartite_match` publishes and `_bipartite_match_wkr` reads back.
glob = {}
  46 | 
  47 | 
  48 | def share_array(arr_np, typecode):
  49 |     arr = mp.RawArray(typecode, arr_np.ravel())
  50 |     return arr, arr_np.shape
  51 | 
  52 | 
  53 | def _bipartite_match_wkr(i, n_train, same_z_cost):
  54 | 
  55 |     global glob
  56 | 
  57 |     adj_set = np.frombuffer(glob['adj_set']).reshape(glob['adj_set_shape'])
  58 |     v_set = np.frombuffer(glob['v_set']).reshape(glob['v_set_shape'])
  59 |     match_cost = np.frombuffer(glob['match_cost']).reshape(glob['match_cost_shape'])
  60 | 
  61 |     adj_i = scipy.spatial.distance.squareform(adj_set[i, :])
  62 |     v_i = v_set[i, :, :]
  63 | 
  64 |     match_perms = {}
  65 |     for j in range(i + 1, n_train):
  66 | 
  67 |         adj_j = scipy.spatial.distance.squareform(adj_set[j, :])
  68 |         v_j = v_set[j, :, :]
  69 | 
  70 |         cost = -np.fabs(v_i).dot(np.fabs(v_j).T)
  71 |         cost += same_z_cost * np.max(np.abs(cost))
  72 | 
  73 |         _, perm = scipy.optimize.linear_sum_assignment(cost)
  74 | 
  75 |         adj_i_perm = adj_i[:, perm]
  76 |         adj_i_perm = adj_i_perm[perm, :]
  77 | 
  78 |         score_before = np.linalg.norm(adj_i - adj_j)
  79 |         score = np.linalg.norm(adj_i_perm - adj_j)
  80 | 
  81 |         match_cost[i, j] = score
  82 |         if score >= score_before:
  83 |             match_cost[i, j] = score_before
  84 |         elif not np.isclose(score_before, score):  # otherwise perm is identity
  85 |             match_perms[i, j] = perm
  86 | 
  87 |     return match_perms
  88 | 
  89 | 
  90 | def bipartite_match(R, z, lat_and_inv=None, max_processes=None, callback=None):
  91 | 
  92 |     global glob
  93 | 
  94 |     n_train, n_atoms, _ = R.shape
  95 | 
  96 |     # penalty matrix for mixing atom species
  97 |     same_z_cost = np.repeat(z[:, None], len(z), axis=1) - z
  98 |     same_z_cost[same_z_cost != 0] = 1
  99 | 
 100 |     # NEW
 101 | 
 102 |     # penalty matrix for mixing differently bonded atoms
 103 |     # NOTE: needs ASE, expects R to be in angstrom, does not support bond breaking
 104 | 
 105 |     # from ase import Atoms
 106 |     # from ase.geometry.analysis import Analysis
 107 | 
 108 |     # atoms = Atoms(
 109 |     #     z, positions=R[0]
 110 |     # )  # only use first molecule in dataset to find connected components (fix me later, maybe) # *0.529177249
 111 | 
 112 |     # bonds = Analysis(atoms).all_bonds[0]
 113 |     # #n_bonds = np.array([len(bonds_i) for bonds_i in bonds])
 114 | 
 115 |     # same_bonding_cost = np.zeros((n_atoms, n_atoms))
 116 |     # for i in range(n_atoms):
 117 |     #     bi = bonds[i]
 118 |     #     z_bi = z[bi]
 119 |     #     for j in range(i+1,n_atoms):
 120 |     #         bj = bonds[j]
 121 |     #         z_bj = z[bj]
 122 | 
 123 |     #         if set(z_bi) == set(z_bj):
 124 |     #             same_bonding_cost[i,j] = 1
 125 | 
 126 |     # same_bonding_cost += same_bonding_cost.T
 127 | 
 128 |     # same_bonding_cost[np.diag_indices(n_atoms)] = 1
 129 |     # same_bonding_cost = 1-same_bonding_cost
 130 | 
 131 |     # set(a) & set(b)
 132 | 
 133 |     # same_bonding_cost = np.repeat(n_bonds[:, None], len(n_bonds), axis=1) - n_bonds
 134 |     # same_bonding_cost[same_bonding_cost != 0] = 1
 135 | 
 136 |     # NEW
 137 | 
 138 |     match_cost = np.zeros((n_train, n_train))
 139 | 
 140 |     desc = Desc(n_atoms, max_processes=max_processes)
 141 | 
 142 |     adj_set = np.empty((n_train, desc.dim))
 143 |     v_set = np.empty((n_train, n_atoms, n_atoms))
 144 |     for i in range(n_train):
 145 |         r = np.squeeze(R[i, :, :])
 146 | 
 147 |         if lat_and_inv is None:
 148 |             adj = scipy.spatial.distance.pdist(r, 'euclidean')
 149 | 
 150 |             # from ase import Atoms
 151 |             # from ase.geometry.analysis import Analysis
 152 | 
 153 |             # atoms = Atoms(
 154 |             #     z, positions=r
 155 |             # )  # only use first molecule in dataset to find connected components (fix me later, maybe) # *0.529177249
 156 | 
 157 |             # bonds = Analysis(atoms).all_bonds[0]
 158 | 
 159 |             # adj = scipy.spatial.distance.squareform(adj)
 160 | 
 161 |             # bonded = np.zeros((z.size, z.size))
 162 | 
 163 |             # for j, bonded_to in enumerate(bonds):
 164 |             # inv_bonded_to = np.arange(n_atoms)
 165 |             # inv_bonded_to[bonded_to] = 0
 166 | 
 167 |             # adj[j, inv_bonded_to] = 0
 168 | 
 169 |             #    bonded[j, bonded_to] = 1
 170 | 
 171 |             # bonded = bonded + bonded.T
 172 | 
 173 |             # print(bonded)
 174 | 
 175 |         else:
 176 | 
 177 |             from .desc import _pdist, _squareform
 178 | 
 179 |             adj_tri = _pdist(r, lat_and_inv)
 180 |             adj = _squareform(adj_tri)  # our vectorized format to full matrix
 181 |             adj = scipy.spatial.distance.squareform(
 182 |                 adj
 183 |             )  # full matrix to numpy vectorized format
 184 | 
 185 |         w, v = np.linalg.eig(scipy.spatial.distance.squareform(adj))
 186 |         v = v[:, w.argsort()[::-1]]
 187 | 
 188 |         adj_set[i, :] = adj
 189 |         v_set[i, :, :] = v
 190 | 
 191 |     glob['adj_set'], glob['adj_set_shape'] = share_array(adj_set, 'd')
 192 |     glob['v_set'], glob['v_set_shape'] = share_array(v_set, 'd')
 193 |     glob['match_cost'], glob['match_cost_shape'] = share_array(match_cost, 'd')
 194 | 
 195 |     if callback is not None:
 196 |         callback = partial(callback, disp_str='Bi-partite matching')
 197 | 
 198 |     start = timeit.default_timer()
 199 | 
 200 |     pool = None
 201 |     map_func = map
 202 |     if max_processes != 1 and mp.cpu_count() > 1:
 203 |         pool = Pool((max_processes or mp.cpu_count()) - 1)  # exclude main process
 204 |         map_func = pool.imap_unordered
 205 | 
 206 |     match_perms_all = {}
 207 |     for i, match_perms in enumerate(
 208 |         map_func(
 209 |             partial(_bipartite_match_wkr, n_train=n_train, same_z_cost=same_z_cost),
 210 |             list(range(n_train)),
 211 |         )
 212 |     ):
 213 |         match_perms_all.update(match_perms)
 214 | 
 215 |         if callback is not None:
 216 |             callback(i, n_train)
 217 | 
 218 |     if pool is not None:
 219 |         pool.close()
 220 |         pool.join()  # Wait for the worker processes to terminate (to measure total runtime correctly).
 221 |         pool = None
 222 | 
 223 |     stop = timeit.default_timer()
 224 | 
 225 |     dur_s = stop - start
 226 |     sec_disp_str = 'took {:.1f} s'.format(dur_s) if dur_s >= 0.1 else ''
 227 |     if callback is not None:
 228 |         callback(n_train, n_train, sec_disp_str=sec_disp_str)
 229 | 
 230 |     match_cost = np.frombuffer(glob['match_cost']).reshape(glob['match_cost_shape'])
 231 |     match_cost = match_cost + match_cost.T
 232 |     match_cost[np.diag_indices_from(match_cost)] = np.inf
 233 |     match_cost = csr_matrix(match_cost)
 234 | 
 235 |     return match_perms_all, match_cost
 236 | 
 237 | 
 238 | def sync_perm_mat(match_perms_all, match_cost, n_atoms, callback=None):
 239 | 
 240 |     if callback is not None:
 241 |         callback = partial(
 242 |             callback, disp_str='Multi-partite matching (permutation synchronization)'
 243 |         )
 244 |         callback(NOT_DONE)
 245 | 
 246 |     tree = minimum_spanning_tree(match_cost, overwrite=True)
 247 | 
 248 |     perms = np.arange(n_atoms, dtype=int)[None, :]
 249 |     rows, cols = tree.nonzero()
 250 |     for com in zip(rows, cols):
 251 |         perm = match_perms_all.get(com)
 252 |         if perm is not None:
 253 |             perms = np.vstack((perms, perm))
 254 |     perms = np.unique(perms, axis=0)
 255 | 
 256 |     if callback is not None:
 257 |         callback(DONE)
 258 | 
 259 |     return perms
 260 | 
 261 | 
 262 | # convert permutation to dijoined cycles
 263 | def to_cycles(perm):
 264 |     pi = {i: perm[i] for i in range(len(perm))}
 265 |     cycles = []
 266 | 
 267 |     while pi:
 268 |         elem0 = next(iter(pi))  # arbitrary starting element
 269 |         this_elem = pi[elem0]
 270 |         next_item = pi[this_elem]
 271 | 
 272 |         cycle = []
 273 |         while True:
 274 |             cycle.append(this_elem)
 275 |             del pi[this_elem]
 276 |             this_elem = next_item
 277 |             if next_item in pi:
 278 |                 next_item = pi[next_item]
 279 |             else:
 280 |                 break
 281 | 
 282 |         cycles.append(cycle)
 283 | 
 284 |     return cycles
 285 | 
 286 | 
 287 | # find permutation group with larges cardinality
 288 | # note: this is used if transitive closure fails (to salvage at least some permutations)
 289 | def salvage_subgroup(perms):
 290 | 
 291 |     n_perms, n_atoms = perms.shape
 292 | 
 293 |     all_long_cycles = []
 294 |     for i in range(n_perms):
 295 |         long_cycles = [cy for cy in to_cycles(list(perms[i, :])) if len(cy) > 1]
 296 |         all_long_cycles += long_cycles
 297 | 
 298 |     # print(all_long_cycles)
 299 |     # print('--------------')
 300 | 
 301 |     def _cycle_intersects_with_larger_one(cy):
 302 | 
 303 |         for ac in all_long_cycles:
 304 |             if len(cy) < len(ac):
 305 |                 if not set(cy).isdisjoint(ac):
 306 |                     return True
 307 | 
 308 |         return False
 309 | 
 310 |     lcms = []
 311 |     keep_idx_many = []
 312 |     for i in range(n_perms):
 313 | 
 314 |         # print(to_cycles(list(perms[i, :])))
 315 | 
 316 |         # is this permutation valid?
 317 |         # remove permutations that contain cycles that share elements with larger cycles in other perms
 318 |         long_cycles = [cy for cy in to_cycles(list(perms[i, :])) if len(cy) > 1]
 319 | 
 320 |         # print('long cycles:')
 321 |         # print(long_cycles)
 322 | 
 323 |         ignore_perm = any(list(map(_cycle_intersects_with_larger_one, long_cycles)))
 324 | 
 325 |         if not ignore_perm:
 326 |             keep_idx_many.append(i)
 327 | 
 328 |         # print(ignore_perm)
 329 | 
 330 |         # print()
 331 | 
 332 |         # cy_lens = [len(cy) for cy in to_cycles(list(perms[i, :]))]
 333 |         # lcm = np.lcm.reduce(cy_lens)
 334 |         # lcms.append(lcm)
 335 |     # keep_idx = np.argmax(lcms)
 336 |     # perms = np.vstack((np.arange(n_atoms), perms[keep_idx,:]))
 337 |     perms = perms[keep_idx_many, :]
 338 | 
 339 |     # print(perms)
 340 | 
 341 |     return perms
 342 | 
 343 | 
 344 | def complete_sym_group(
 345 |     perms, n_perms_max=None, disp_str='Permutation group completion', callback=None
 346 | ):
 347 | 
 348 |     if callback is not None:
 349 |         callback = partial(callback, disp_str=disp_str)
 350 |         callback(NOT_DONE)
 351 | 
 352 |     perm_added = True
 353 |     while perm_added:
 354 |         perm_added = False
 355 |         n_perms = perms.shape[0]
 356 |         for i in range(n_perms):
 357 |             for j in range(n_perms):
 358 | 
 359 |                 new_perm = perms[i, perms[j, :]]
 360 |                 if not (new_perm == perms).all(axis=1).any():
 361 |                     perm_added = True
 362 |                     perms = np.vstack((perms, new_perm))
 363 | 
 364 |                     # Transitive closure is not converging! Give up and return identity permutation.
 365 |                     if n_perms_max is not None and perms.shape[0] == n_perms_max:
 366 | 
 367 |                         if callback is not None:
 368 |                             callback(
 369 |                                 DONE,
 370 |                                 sec_disp_str='transitive closure has failed',
 371 |                                 done_with_warning=True,
 372 |                             )
 373 |                         return None
 374 | 
 375 |     if callback is not None:
 376 |         callback(
 377 |             DONE,
 378 |             sec_disp_str='found {:d} symmetries'.format(perms.shape[0]),
 379 |         )
 380 | 
 381 |     return perms
 382 | 
 383 | 
 384 | def find_perms(R, z, lat_and_inv=None, callback=None, max_processes=None):
 385 | 
 386 |     m, n_atoms = R.shape[:2]
 387 | 
 388 |     # Find matching for all pairs.
 389 |     match_perms_all, match_cost = bipartite_match(
 390 |         R, z, lat_and_inv, max_processes, callback=callback
 391 |     )
 392 | 
 393 |     # Remove inconsistencies.
 394 |     match_perms = sync_perm_mat(match_perms_all, match_cost, n_atoms, callback=callback)
 395 | 
 396 |     # Commplete symmetric group.
 397 |     # Give up, if transitive closure yields more than 100 unique permutations.
 398 |     sym_group_perms = complete_sym_group(
 399 |         match_perms, n_perms_max=100, callback=callback
 400 |     )
 401 | 
 402 |     # Limit closure to largest cardinality permutation in the set to get at least some symmetries.
 403 |     if sym_group_perms is None:
 404 |         match_perms_subset = salvage_subgroup(match_perms)
 405 |         sym_group_perms = complete_sym_group(
 406 |             match_perms_subset,
 407 |             n_perms_max=100,
 408 |             disp_str='Closure disaster recovery',
 409 |             callback=callback,
 410 |         )
 411 | 
 412 |     return sym_group_perms
 413 | 
 414 | 
def find_extra_perms(R, z, lat_and_inv=None, callback=None, max_processes=None):
    """
    Build a permutation group from manually specified symmetry elements.

    NOTE(review): this is experimental code. The active part (labelled
    'nanotube') uses hard-coded atom indices, only looks at the first geometry
    `R[0]`, prints debug output and ignores `max_processes`. Everything after
    the first `return` (labelled 'buckycatcher') is unreachable.

    Parameters
    ----------
    R : numpy.ndarray
        Geometries; only `R[0]` is used.
    z : numpy.ndarray
        One entry per atom, forwarded to the helper functions.
    lat_and_inv : optional
        Forwarded to `find_frags`.
    callback : callable, optional
        Progress callback, forwarded to `complete_sym_group`.
    max_processes : int, optional
        Not used by the reachable part of this function.

    Returns
    -------
    numpy.ndarray
        Result of `complete_sym_group` (called without a size limit) on the
        identity plus the permutations from `find_perms_via_reflection`.
    """

    m, n_atoms = R.shape[:2]

    # NEW

    # catcher
    # p = np.arange(n_atoms)
    # plane_3idxs = [19,17,47] # left to right
    # perm = find_perms_via_reflection(R[0], z, np.arange(n_atoms), plane_3idxs, lat_and_inv=None, max_processes=None)
    # perms = np.vstack((p[None,:], perm))
    # plane_3idxs = [(4,5),(2,1),(34,33)]  # top to bottom
    # perm = find_perms_via_reflection(R[0], z, np.arange(n_atoms), plane_3idxs, lat_and_inv=None, max_processes=None)
    # perms = np.vstack((perm[None,:], perms))
    # sym_group_perms = complete_sym_group(perms, n_perms_max=100, callback=callback)

    # nanotube
    R = R.copy()
    # NOTE(review): `find_frags` returns None if no fragments are found, in
    # which case `frags[1]` below fails.
    frags = find_frags(R[0], z, lat_and_inv=lat_and_inv)
    print(frags)

    perms = np.arange(n_atoms)[None, :]

    plane_3idxs = [280, 281, 273]  # half outer
    add_perms = find_perms_via_reflection(
        R[0], z, frags[1], plane_3idxs, lat_and_inv=None, max_processes=None
    )
    perms = np.vstack((perms, add_perms))

    # rotate inner
    # add_perms = find_perms_via_alignment(R[0], frags[0], [214, 215, 210, 211], [209, 208, 212, 213], z, lat_and_inv=lat_and_inv, max_processes=max_processes)
    # perms = np.vstack((perms, add_perms))
    # sym_group_perms = complete_sym_group(perms, callback=callback)

    # rotate outer
    # add_perms = find_perms_via_alignment(R[0], frags[1], [361, 360, 368, 369], [363, 362, 356, 357], z, lat_and_inv=lat_and_inv, max_processes=max_processes)
    # perms = np.vstack((perms, add_perms))
    # sym_group_perms = complete_sym_group(perms, callback=callback)

    perms = np.unique(perms, axis=0)
    sym_group_perms = complete_sym_group(perms, callback=callback)
    print(sym_group_perms.shape)

    return sym_group_perms

    # NOTE(review): unreachable from here on (the function has already returned).

    # buckycatcher
    R = R.copy()  # *0.529177
    frags = find_frags(R[0], z, lat_and_inv=lat_and_inv)

    perms = np.arange(n_atoms)[None, :]

    # syms of catcher
    plane_3idxs = [54, 47, 17]  # left to right
    add_perms = find_perms_via_reflection(
        R[0], z, frags[0], plane_3idxs, lat_and_inv=None, max_processes=None
    )
    perms = np.vstack((perms, add_perms))

    plane_3idxs = [(33, 34), (31, 30), (5, 4)]  # top to bottom
    add_perms = find_perms_via_reflection(
        R[0], z, frags[0], plane_3idxs, lat_and_inv=None, max_processes=None
    )
    perms = np.vstack((perms, add_perms))

    # move cells
    # add_perms = find_perms_via_alignment(R[0], frags[1], [128, 129, 127], [133, 132, 134], z, lat_and_inv=lat_and_inv, max_processes=max_processes)
    # perms = np.vstack((perms, add_perms))
    # sym_group_perms = complete_sym_group(perms, callback=callback)

    # print(sym_group_perms.shape)

    # rotate cells
    add_perms = find_perms_via_alignment(
        R[0],
        frags[1],
        [129, 128, 127],
        [128, 127, 135],
        z,
        lat_and_inv=lat_and_inv,
        max_processes=max_processes,
    )
    perms = np.vstack((perms, add_perms))
    # print(add_perms.shape)
    # sym_group_perms = complete_sym_group(perms, callback=callback)

    # rotate cells (triangle)
    # add_perms = find_perms_via_alignment(R[0], frags[1], [132, 129, 134], [129, 134, 132], z, lat_and_inv=lat_and_inv, max_processes=max_processes)
    # perms = np.vstack((perms, add_perms))
    sym_group_perms = complete_sym_group(perms, callback=callback)

    # print(perms.shape)
    print(sym_group_perms.shape)

    # frag 1: bucky ball
    # perms = find_perms_in_frag(R, z, frags[1], lat_and_inv=lat_and_inv, max_processes=max_processes)
    # perms = np.vstack((p[None,:], perms))

    # print('perms')
    # print(perms.shape)

    # perms = np.unique(perms, axis=0)
    # perms = complete_sym_group(perms, callback=callback)

    # print('perms')
    # print(perms.shape)
    # print(sym_group_perms.shape)

    return sym_group_perms

    # NEW
 525 | 
 526 | 
 527 | def find_frags(r, z, lat_and_inv=None):
 528 | 
 529 |     from ase import Atoms
 530 |     from ase.geometry.analysis import Analysis
 531 |     from scipy.sparse.csgraph import connected_components
 532 | 
 533 |     print('Finding permutable non-bonded fragments... (assumes Ang!)')
 534 | 
 535 |     lat = None
 536 |     if lat_and_inv:
 537 |         lat = lat_and_inv[0]
 538 | 
 539 |     n_atoms = r.shape[0]
 540 |     atoms = Atoms(
 541 |         z, positions=r, cell=lat, pbc=lat is not None
 542 |     )  # only use first molecule in dataset to find connected components (fix me later, maybe) # *0.529177249
 543 | 
 544 |     adj = Analysis(atoms).adjacency_matrix[0]
 545 |     _, labels = connected_components(csgraph=adj, directed=False, return_labels=True)
 546 | 
 547 |     # frags = []
 548 |     # for label in np.unique(labels):
 549 |     #    frags.append(np.where(labels == label)[0])
 550 |     frags = [np.where(labels == label)[0] for label in np.unique(labels)]
 551 |     n_frags = len(frags)
 552 | 
 553 |     if n_frags == n_atoms:
 554 |         print(
 555 |             'Skipping fragment symmetry search (something went wrong, e.g. length unit not in Angstroms, etc.)'
 556 |         )
 557 |         return None
 558 | 
 559 |     print('| Found ' + str(n_frags) + ' disconnected fragments.')
 560 | 
 561 |     return frags
 562 | 
 563 | 
 564 | def find_frag_perms(R, z, lat_and_inv=None, callback=None, max_processes=None):
 565 | 
 566 |     from ase import Atoms
 567 |     from ase.geometry.analysis import Analysis
 568 |     from scipy.sparse.csgraph import connected_components
 569 | 
 570 |     # TODO: positions must be in Angstrom for this to work!!
 571 | 
 572 |     n_train, n_atoms = R.shape[:2]
 573 |     lat, lat_inv = lat_and_inv
 574 | 
 575 |     atoms = Atoms(
 576 |         z, positions=R[0], cell=lat, pbc=lat is not None
 577 |     )  # only use first molecule in dataset to find connected components (fix me later, maybe) # *0.529177249
 578 | 
 579 |     adj = Analysis(atoms).adjacency_matrix[0]
 580 |     _, labels = connected_components(csgraph=adj, directed=False, return_labels=True)
 581 | 
 582 |     # frags = []
 583 |     # for label in np.unique(labels):
 584 |     #    frags.append(np.where(labels == label)[0])
 585 |     frags = [np.where(labels == label)[0] for label in np.unique(labels)]
 586 |     n_frags = len(frags)
 587 | 
 588 |     if n_frags == n_atoms:
 589 |         print(
 590 |             'Skipping fragment symmetry search (something went wrong, e.g. length unit not in Angstroms, etc.)'
 591 |         )
 592 |         return [range(n_atoms)]
 593 | 
 594 |     # print(labels)
 595 | 
 596 |     # from . import ui, io
 597 |     # xyz_str = io.generate_xyz_str(R[0][np.where(labels == 0)[0], :]*0.529177249, z[np.where(labels == 0)[0]])
 598 |     # xyz_str = ui.indent_str(xyz_str, 2)
 599 |     # sprint(xyz_str)
 600 | 
 601 |     # NEW
 602 | 
 603 |     # uniq_labels = np.unique(labels)
 604 |     # R_cg = np.empty((R.shape[0], len(uniq_labels), R.shape[2]))
 605 |     # z_frags = []
 606 |     # z_cg = []
 607 |     # for label in uniq_labels:
 608 |     #     frag_idxs = np.where(labels == label)[0]
 609 | 
 610 |     #     R_cg[:,label,:] = np.mean(R[:,frag_idxs,:], axis=1)
 611 |     #     z_frag = np.sort(z[frag_idxs])
 612 | 
 613 |     #     z_frag_label = 0
 614 |     #     if len(z_frags) == 0:
 615 |     #         z_frags.append(z_frag)
 616 |     #     else:
 617 |     #         z_frag_label = np.where(np.all(z_frags == z_frag, axis=1))[0]
 618 | 
 619 |     #         if len(z_frag_label) == 0: # not found
 620 |     #             z_frag_label = len(z_frags)
 621 |     #             z_frags.append(z_frag)
 622 |     #         else:
 623 |     #             z_frag_label = z_frag_label[0]
 624 | 
 625 |     #     z_cg.append(z_frag_label)
 626 | 
 627 |     # print(z_cg)
 628 |     # print(R_cg.shape)
 629 | 
 630 |     # perms = find_perms(R_cg, np.array(z_cg), lat_and_inv=lat_and_inv, max_processes=max_processes)
 631 | 
 632 |     # print('cg perms')
 633 |     # print(perms)
 634 | 
 635 |     # NEW
 636 | 
 637 |     # print(n_frags)
 638 | 
 639 |     print('| Found ' + str(n_frags) + ' disconnected fragments.')
 640 | 
 641 |     # ufrags = np.unique([np.sort(z[frag]) for frag in frags])
 642 |     # print(ufrags)
 643 | 
 644 |     # sys.exit()
 645 | 
 646 |     # n_frags_unique = 0 # number of unique fragments
 647 | 
 648 |     # match fragments to find identical ones (allows permutations of fragments)
 649 |     swap_perms = [np.arange(n_atoms)]
 650 |     for f1 in range(n_frags):
 651 |         for f2 in range(f1 + 1, n_frags):
 652 | 
 653 |             sort_idx_f1 = np.argsort(z[frags[f1]])
 654 |             sort_idx_f2 = np.argsort(z[frags[f2]])
 655 |             inv_sort_idx_f2 = inv_perm(sort_idx_f2)
 656 | 
 657 |             z1 = z[frags[f1]][sort_idx_f1]
 658 |             z2 = z[frags[f2]][sort_idx_f2]
 659 | 
 660 |             if np.array_equal(z1, z2):  # fragment have the same composition
 661 | 
 662 |                 for ri in range(
 663 |                     min(10, R.shape[0])
 664 |                 ):  # only use first molecule in dataset for matching (fix me later)
 665 | 
 666 |                     R_match1 = R[ri, frags[f1], :]
 667 |                     R_match2 = R[ri, frags[f2], :]
 668 | 
 669 |                     # if np.array_equal(z1, z2):
 670 | 
 671 |                     R_pair = np.concatenate(
 672 |                         (R_match1[None, sort_idx_f1, :], R_match2[None, sort_idx_f2, :])
 673 |                     )
 674 | 
 675 |                     perms = find_perms(
 676 |                         R_pair, z1, lat_and_inv=lat_and_inv, max_processes=max_processes
 677 |                     )
 678 | 
 679 |                     # embed local permutation into global context
 680 |                     for p in perms:
 681 | 
 682 |                         match_perm = sort_idx_f1[p][inv_sort_idx_f2]
 683 | 
 684 |                         swap_perm = np.arange(n_atoms)
 685 |                         swap_perm[frags[f1]] = frags[f2][match_perm]
 686 |                         swap_perm[frags[f2][match_perm]] = frags[f1]
 687 |                         swap_perms.append(swap_perm)
 688 | 
 689 |             # else:
 690 |             #    n_frags_unique += 1
 691 | 
 692 |     swap_perms = np.unique(np.array(swap_perms), axis=0)
 693 | 
 694 |     # print(swap_perms)
 695 | 
 696 |     # print('| Found ' + str(n_frags_unique) + ' (likely to be) *unique* disconnected fragments.')
 697 | 
 698 |     # commplete symmetric group
 699 |     sym_group_perms = complete_sym_group(swap_perms)
 700 |     print(
 701 |         '| Found '
 702 |         + str(sym_group_perms.shape[0])
 703 |         + ' fragment permutations after closure.'
 704 |     )
 705 | 
 706 |     # return sym_group_perms
 707 | 
 708 |     # match fragments with themselves (to find symmetries in each fragment)
 709 | 
 710 |     def _frag_perm_to_perm(n_atoms, frag_idxs, frag_perms):
 711 | 
 712 |         # frag_idxs - indices of the fragment (one fragment!)
 713 |         # frag_perms - N fragment permutations (Nxn_atoms)
 714 | 
 715 |         perms = np.arange(n_atoms)[None, :]
 716 |         for fp in frag_perms:
 717 | 
 718 |             p = np.arange(n_atoms)
 719 |             p[frag_idxs] = frag_idxs[fp]
 720 |             perms = np.vstack((p[None, :], perms))
 721 | 
 722 |         return perms
 723 | 
 724 |     if n_frags > 1:
 725 |         print('| Finding symmetries in individual fragments.')
 726 |         for f in range(n_frags):
 727 | 
 728 |             R_frag = R[:, frags[f], :]
 729 |             z_frag = z[frags[f]]
 730 | 
 731 |             frag_perms = find_perms(
 732 |                 R_frag, z_frag, lat_and_inv=lat_and_inv, max_processes=max_processes
 733 |             )
 734 | 
 735 |             perms = _frag_perm_to_perm(n_atoms, frags[f], frag_perms)
 736 |             sym_group_perms = np.vstack((perms, sym_group_perms))
 737 | 
 738 |             print('{:d} perms'.format(perms.shape[0]))
 739 | 
 740 |         sym_group_perms = np.unique(sym_group_perms, axis=0)
 741 |     sym_group_perms = complete_sym_group(sym_group_perms, callback=callback)
 742 | 
 743 |     return sym_group_perms
 744 | 
 745 |     # f = 0
 746 |     # perms = find_perms_via_alignment(R[0, :, :], frags[f], [215, 214, 210, 211], [209, 208, 212, 213], z, lat_and_inv=lat_and_inv, max_processes=max_processes)
 747 |     # #perms = find_perms_via_alignment(R[0, :, :], frags[f], [214, 215, 210, 211], [209, 208, 212, 213], z, lat_and_inv=lat_and_inv, max_processes=max_processes)
 748 |     # sym_group_perms = np.vstack((perms[None,:], sym_group_perms))
 749 |     # sym_group_perms = complete_sym_group(sym_group_perms, callback=callback)
 750 | 
 751 |     # #print(sym_group_perms.shape)
 752 | 
 753 |     # #import sys
 754 |     # #sys.exit()
 755 | 
 756 |     # return sym_group_perms
 757 | 
 758 | 
 759 | def _frag_perm_to_perm(n_atoms, frag_idxs, frag_perms):
 760 | 
 761 |     # frag_idxs - indices of the fragment (one fragment!)
 762 |     # frag_perms - N fragment permutations (Nxn_atoms)
 763 | 
 764 |     perms = np.arange(n_atoms)[None, :]
 765 |     for fp in frag_perms:
 766 | 
 767 |         p = np.arange(n_atoms)
 768 |         p[frag_idxs] = frag_idxs[fp]
 769 |         perms = np.vstack((p[None, :], perms))
 770 | 
 771 |     return perms
 772 | 
 773 | 
 774 | def find_perms_in_frag(R, z, frag_idxs, lat_and_inv=None, max_processes=None):
 775 | 
 776 |     n_atoms = R.shape[1]
 777 | 
 778 |     R_frag = R[:, frag_idxs, :]
 779 |     z_frag = z[frag_idxs]
 780 | 
 781 |     frag_perms = find_perms(
 782 |         R_frag, z_frag, lat_and_inv=lat_and_inv, max_processes=max_processes
 783 |     )
 784 | 
 785 |     perms = _frag_perm_to_perm(n_atoms, frag_idxs, frag_perms)
 786 | 
 787 |     return perms
 788 | 
 789 | 
 790 | def find_perms_via_alignment(
 791 |     pts_full,
 792 |     frag_idxs,
 793 |     align_a_idxs,
 794 |     align_b_idxs,
 795 |     z,
 796 |     lat_and_inv=None,
 797 |     max_processes=None,
 798 | ):
 799 | 
 800 |     # 1. find rotatino that aligns points (Nx3 matrix) in 'align_a_idxs' with points in 'align_b_idxs'
 801 |     # 2. rotate the whole thing
 802 |     # find perms by matching those two structures (match atoms that are closest after transformation)
 803 | 
 804 |     # align_a_ctr = np.mean(align_a_pts, axis=0)
 805 |     # align_b_ctr = np.mean(align_b_pts, axis=0)
 806 | 
 807 |     # alignment indices are included in fragment
 808 |     assert np.isin(align_a_idxs, frag_idxs).all()
 809 |     assert np.isin(align_b_idxs, frag_idxs).all()
 810 | 
 811 |     assert len(align_a_idxs) == len(align_b_idxs)
 812 | 
 813 |     # align_a_frag_idxs = np.where(np.in1d(frag_idxs, align_a_idxs))[0]
 814 |     # align_b_frag_idxs = np.where(np.in1d(frag_idxs, align_b_idxs))[0]
 815 | 
 816 |     pts = pts_full[frag_idxs, :]
 817 | 
 818 |     align_a_pts = pts_full[align_a_idxs, :]
 819 |     align_b_pts = pts_full[align_b_idxs, :]
 820 | 
 821 |     ctr = np.mean(pts, axis=0)
 822 |     align_a_pts -= ctr
 823 |     align_b_pts -= ctr
 824 | 
 825 |     ab_cov = align_a_pts.T.dot(align_b_pts)
 826 |     u, s, vh = np.linalg.svd(ab_cov)
 827 |     R = u.dot(vh)
 828 | 
 829 |     if np.linalg.det(R) < 0:
 830 |         vh[2, :] *= -1  # multiply 3rd column of V by -1
 831 |         R = u.dot(vh)
 832 | 
 833 |     pts -= ctr
 834 |     pts_R = pts.copy()
 835 | 
 836 |     pts_R = R.dot(pts_R.T).T
 837 | 
 838 |     pts += ctr
 839 |     pts_R += ctr
 840 | 
 841 |     pts_full_R = pts_full.copy()
 842 |     pts_full_R[frag_idxs, :] = pts_R
 843 | 
 844 |     R_pair = np.vstack((pts_full[None, :, :], pts_full_R[None, :, :]))
 845 | 
 846 |     # from . import io
 847 | 
 848 |     # xyz_str = io.generate_xyz_str(pts_full, z)
 849 |     # print(xyz_str)
 850 | 
 851 |     # xyz_str = io.generate_xyz_str(pts_full_R, z)
 852 |     # print(xyz_str)
 853 | 
 854 |     # z_frag = z[frag_idxs]
 855 | 
 856 |     adj = scipy.spatial.distance.cdist(R_pair[0], R_pair[1], 'euclidean')
 857 |     _, perm = scipy.optimize.linear_sum_assignment(adj)
 858 | 
 859 |     # score_before = np.linalg.norm(adj)
 860 | 
 861 |     # adj_perm = scipy.spatial.distance.cdist(R_pair[0,:], R_pair[0, perm], 'euclidean')
 862 |     # score = np.linalg.norm(adj_perm)
 863 | 
 864 |     # print(score_before)
 865 |     # print(score)
 866 | 
 867 |     # print('---')
 868 | 
 869 |     # print('data \'model example\'', '|', end='')
 870 |     # rint('testing', '|', end='')
 871 |     # n_atoms = pts_full.shape[1]
 872 |     # print(n_atoms)
 873 | 
 874 |     # for p in pts_full[:,:]:
 875 |     #    print('H {:.5f} {:.5f} {:.5f}'.format(*p), '|', end='')
 876 | 
 877 |     # print('end \'model example\';show data')
 878 | 
 879 |     # draw selection
 880 |     if False:
 881 | 
 882 |         print('---')
 883 | 
 884 |         from matplotlib import cm
 885 | 
 886 |         viridis = cm.get_cmap('prism')
 887 |         colors = viridis(np.linspace(0, 1, len(align_a_idxs)))
 888 | 
 889 |         for i, idx in enumerate(align_a_idxs):
 890 |             color_str = (
 891 |                 '['
 892 |                 + str(int(colors[i, 0] * 255))
 893 |                 + ','
 894 |                 + str(int(colors[i, 1] * 255))
 895 |                 + ','
 896 |                 + str(int(colors[i, 2] * 255))
 897 |                 + ']'
 898 |             )
 899 |             print('select atomno=' + str(idx + 1) + '; color ' + color_str)
 900 | 
 901 |         for i, idx in enumerate(align_b_idxs):
 902 |             color_str = (
 903 |                 '['
 904 |                 + str(int(colors[i, 0] * 255))
 905 |                 + ','
 906 |                 + str(int(colors[i, 1] * 255))
 907 |                 + ','
 908 |                 + str(int(colors[i, 2] * 255))
 909 |                 + ']'
 910 |             )
 911 |             print('select atomno=' + str(idx + 1) + '; color ' + color_str)
 912 |         print('---')
 913 | 
 914 |     return perm
 915 | 
 916 | 
 917 | def find_perms_via_reflection(
 918 |     r, z, frag_idxs, plane_3idxs, lat_and_inv=None, max_processes=None
 919 | ):
 920 | 
 921 |     # plane_3idxs can be tuples of atoms (to take their center) or atom indices
 922 | 
 923 |     # pts = pts_full[frag_idxs, :]
 924 |     # pts = r.copy()
 925 | 
 926 |     # compute normal of plane defined by atoms in 'plane_idxs'
 927 | 
 928 |     is_plane_defined_by_bond_centers = type(plane_3idxs[0]) is tuple
 929 |     if is_plane_defined_by_bond_centers:
 930 |         a = (r[plane_3idxs[0][0], :] + r[plane_3idxs[0][1], :]) / 2
 931 |         b = (r[plane_3idxs[1][0], :] + r[plane_3idxs[1][1], :]) / 2
 932 |         c = (r[plane_3idxs[2][0], :] + r[plane_3idxs[2][1], :]) / 2
 933 |     else:
 934 |         a = r[plane_3idxs[0], :]
 935 |         b = r[plane_3idxs[1], :]
 936 |         c = r[plane_3idxs[2], :]
 937 | 
 938 |     ab = b - a
 939 |     ab /= np.linalg.norm(ab)
 940 | 
 941 |     ac = c - a
 942 |     ac /= np.linalg.norm(ac)
 943 | 
 944 |     normal = np.cross(ab, ac)[:, None]
 945 | 
 946 |     # compute reflection matrix
 947 |     reflection = np.eye(3) - 2 * normal.dot(normal.T)
 948 | 
 949 |     r_R = r.copy()
 950 |     r_R[frag_idxs, :] = reflection.dot(r[frag_idxs, :].T).T
 951 | 
 952 |     # R_pair = np.vstack((r[None,:,:], r_R[None,:,:]))
 953 | 
 954 |     adj = scipy.spatial.distance.cdist(r, r_R, 'euclidean')
 955 |     _, perm = scipy.optimize.linear_sum_assignment(adj)
 956 | 
 957 |     print_perm_colors(perm, r, plane_3idxs)
 958 | 
 959 |     # score_before = np.linalg.norm(adj)
 960 | 
 961 |     # adj_perm = scipy.spatial.distance.cdist(R_pair[0,:], R_pair[0, perm], 'euclidean')
 962 |     # score = np.linalg.norm(adj_perm)
 963 | 
 964 |     return perm
 965 | 
 966 | 
 967 | def print_perm_colors(perm, pts, plane_3idxs=None):
 968 | 
 969 |     idx_done = []
 970 |     c = -1
 971 |     for i in range(perm.shape[0]):
 972 |         if i not in idx_done and perm[i] not in idx_done:
 973 |             c += 1
 974 |             idx_done += [i]
 975 |             idx_done += [perm[i]]
 976 | 
 977 |     from matplotlib import cm
 978 | 
 979 |     viridis = cm.get_cmap('prism')
 980 |     colors = viridis(np.linspace(0, 1, c + 1))
 981 | 
 982 |     print('---')
 983 |     print('select all; color [255,255,255]')
 984 | 
 985 |     if plane_3idxs is not None:
 986 | 
 987 |         def pts_str(x):
 988 |             return '{' + str(x[0]) + ', ' + str(x[1]) + ', ' + str(x[2]) + '}'
 989 | 
 990 |         is_plane_defined_by_bond_centers = type(plane_3idxs[0]) is tuple
 991 |         if is_plane_defined_by_bond_centers:
 992 |             a = (pts[plane_3idxs[0][0], :] + pts[plane_3idxs[0][1], :]) / 2
 993 |             b = (pts[plane_3idxs[1][0], :] + pts[plane_3idxs[1][1], :]) / 2
 994 |             c = (pts[plane_3idxs[2][0], :] + pts[plane_3idxs[2][1], :]) / 2
 995 |         else:
 996 |             a = pts[plane_3idxs[0], :]
 997 |             b = pts[plane_3idxs[1], :]
 998 |             c = pts[plane_3idxs[2], :]
 999 | 
1000 |         print(
1001 |             'draw plane1 300 PLANE '
1002 |             + pts_str(a)
1003 |             + ' '
1004 |             + pts_str(b)
1005 |             + ' '
1006 |             + pts_str(c)
1007 |             + ';color $plane1 green'
1008 |         )
1009 | 
1010 |     idx_done = []
1011 |     c = -1
1012 |     for i in range(perm.shape[0]):
1013 |         if i not in idx_done and perm[i] not in idx_done:
1014 | 
1015 |             c += 1
1016 |             color_str = (
1017 |                 '['
1018 |                 + str(int(colors[c, 0] * 255))
1019 |                 + ','
1020 |                 + str(int(colors[c, 1] * 255))
1021 |                 + ','
1022 |                 + str(int(colors[c, 2] * 255))
1023 |                 + ']'
1024 |             )
1025 | 
1026 |             if i != perm[i]:
1027 |                 print('select atomno=' + str(i + 1) + '; color ' + color_str)
1028 |                 print('select atomno=' + str(perm[i] + 1) + '; color ' + color_str)
1029 |             idx_done += [i]
1030 |             idx_done += [perm[i]]
1031 | 
1032 |     print('---')
1033 | 
1034 | 
def inv_perm(perm):
    """
    Invert a permutation.

    Parameters
    ----------
        perm : :obj:`numpy.ndarray`
            Permutation given as an integer index array.

    Returns
    -------
        :obj:`numpy.ndarray`
            Array `inv` of the same dtype as `perm` with
            `inv[perm[i]] == i` for every position `i`.
    """

    n_idxs = perm.size

    # Scatter the positions 0..n-1 to the places the permutation points to.
    inverse = np.empty(n_idxs, dtype=perm.dtype)
    inverse[perm] = np.arange(n_idxs)

    return inverse
1041 | 


--------------------------------------------------------------------------------